openforis-whisp 3.0.0a2__py3-none-any.whl → 3.0.0a3__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the changes between those versions as they appear in the public registry.
@@ -63,10 +63,6 @@ from openforis_whisp.stats import (
 )
 
 from openforis_whisp.advanced_stats import (
-    whisp_stats_geojson_to_df_concurrent,
-    whisp_formatted_stats_geojson_to_df_concurrent,
-    whisp_stats_geojson_to_df_sequential,
-    whisp_formatted_stats_geojson_to_df_sequential,
     whisp_formatted_stats_geojson_to_df_fast,
 )
 
@@ -83,7 +79,6 @@ from openforis_whisp.reformat import (
     create_schema_from_dataframe,
     load_schema_if_any_file_changed,
     format_stats_dataframe,
-    # log_missing_columns,
 )
 
 from openforis_whisp.data_conversion import (
@@ -96,11 +91,16 @@ from openforis_whisp.data_conversion import (
 
 from openforis_whisp.risk import whisp_risk, detect_unit_type
 
-from openforis_whisp.utils import get_example_data_path, generate_test_polygons
+from openforis_whisp.utils import (
+    get_example_data_path,
+    generate_test_polygons,  # to be deprecated
+    generate_random_features,
+    generate_random_points,
+    generate_random_polygons,
+)
 
 from openforis_whisp.data_checks import (
     analyze_geojson,
     validate_geojson_constraints,
-    _check_metric_constraints,
     suggest_method,
 )
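
Editorial note: the import block above drops the concurrent/sequential helpers from the package's public surface, removes dead references (log_missing_columns, _check_metric_constraints), and adds a family of random-feature generators alongside the soon-to-be-deprecated generate_test_polygons. A minimal migration sketch; only the importable names are confirmed by this diff, the no-argument calls are assumptions:

    # Sketch only: names confirmed by the diff above; call signatures are assumptions.
    from openforis_whisp.utils import (
        generate_random_points,
        generate_random_polygons,  # intended successor to generate_test_polygons
    )

    points = generate_random_points()      # hypothetical no-arg call
    polygons = generate_random_polygons()  # hypothetical no-arg call
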
@@ -600,18 +600,22 @@ def validate_ee_endpoint(endpoint_type: str = "high-volume", raise_error: bool =
         If incorrect endpoint and raise_error=True
     """
     if not check_ee_endpoint(endpoint_type):
-        msg = (
-            f"Not using {endpoint_type.upper()} endpoint.\n"
-            f"Current URL: {ee.data._cloud_api_base_url}\n"
-            f"\nTo use {endpoint_type} endpoint, run:\n"
-        )
-        msg += "ee.Reset()\n"
         if endpoint_type == "high-volume":
-            msg += (
-                "ee.Initialize(opt_url='https://earthengine-highvolume.googleapis.com')"
+            msg = (
+                "Concurrent mode requires the HIGH-VOLUME endpoint. To change endpoint run:\n"
+                "ee.Reset()\n"
+                "ee.Initialize(opt_url='https://earthengine-highvolume.googleapis.com')\n"
+                "Or with project specified (e.g. when in Colab):\n"
+                "ee.Initialize(project='your_cloud_project_name', opt_url='https://earthengine-highvolume.googleapis.com')"
+            )
+        else:  # standard endpoint
+            msg = (
+                "Sequential mode requires the STANDARD endpoint. To change endpoint run:\n"
+                "ee.Reset()\n"
+                "ee.Initialize()\n"
+                "Or with project specified (e.g. when in Colab):\n"
+                "ee.Initialize(project='your_cloud_project_name')"
             )
-        else:
-            msg += "ee.Initialize() # Uses standard endpoint by default"
 
         if raise_error:
            raise RuntimeError(msg)
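
Editorial note: the rewritten message pairs each processing mode with the Earth Engine endpoint it requires. A minimal sketch of the switch it prescribes, using the earthengine-api calls quoted in the message itself ('your_cloud_project_name' is a placeholder):

    import ee

    # Concurrent mode: switch to the high-volume endpoint.
    ee.Reset()
    ee.Initialize(
        project="your_cloud_project_name",
        opt_url="https://earthengine-highvolume.googleapis.com",
    )

    # Sequential mode: switch back to the standard endpoint.
    ee.Reset()
    ee.Initialize(project="your_cloud_project_name")
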
@@ -808,8 +812,8 @@ def convert_batch_to_ee(batch_gdf: gpd.GeoDataFrame) -> ee.FeatureCollection:
 
 def clean_geodataframe(
     gdf: gpd.GeoDataFrame,
-    remove_nulls: bool = True,
-    fix_invalid: bool = True,
+    remove_nulls: bool = False,
+    repair_geometries: bool = False,
     logger: logging.Logger = None,
 ) -> gpd.GeoDataFrame:
     """
@@ -820,9 +824,11 @@ def clean_geodataframe(
     gdf : gpd.GeoDataFrame
         Input GeoDataFrame
     remove_nulls : bool
-        Remove null geometries
-    fix_invalid : bool
-        Fix invalid geometries
+        Remove null geometries. Defaults to False to preserve data integrity.
+        Set to True only if you explicitly want to drop rows with null geometries.
+    repair_geometries : bool
+        Repair invalid geometries using Shapely's make_valid(). Defaults to False to preserve
+        original geometries. Set to True only if you want to automatically repair invalid geometries.
     logger : logging.Logger, optional
         Logger for output
 
@@ -839,11 +845,11 @@ def clean_geodataframe(
             logger.warning(f"Removing {null_count} null geometries")
             gdf = gdf[~gdf.geometry.isna()].copy()
 
-    if fix_invalid:
+    if repair_geometries:
         valid_count = gdf.geometry.is_valid.sum()
         invalid_count = len(gdf) - valid_count
         if invalid_count > 0:
-            logger.warning(f"Fixing {invalid_count} invalid geometries")
+            logger.warning(f"Repairing {invalid_count} invalid geometries")
             from shapely.validation import make_valid
 
             gdf = gdf.copy()
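
Editorial note: because cleaning is now opt-in, callers wanting the old 3.0.0a2 behaviour must request it explicitly. A short sketch against the new signature, assuming clean_geodataframe is imported from the module this diff modifies:

    import geopandas as gpd

    gdf = gpd.read_file("plots.geojson")  # illustrative input

    # New defaults (False/False) leave the data untouched; opt back in:
    cleaned = clean_geodataframe(
        gdf,
        remove_nulls=True,       # drop rows whose geometry is null
        repair_geometries=True,  # run shapely's make_valid on invalid rows
    )
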
@@ -855,6 +861,19 @@
     return gdf
 
 
+# ============================================================================
+# BATCH RETRY HELPER
+# ============================================================================
+
+
+# ============================================================================
+# BATCH RETRY HELPER - DEPRECATED (removed due to semaphore deadlock issues)
+# ============================================================================
+# Note: Retry logic via sub-batching has been removed. Instead, use fail-fast
+# approach: when a batch fails, reduce batch_size parameter and retry manually.
+# This avoids semaphore deadlocks and provides clearer error messages.
+
+
 # ============================================================================
 # EE PROCESSING WITH RETRY LOGIC
 # ============================================================================
@@ -1041,7 +1060,9 @@ def whisp_stats_geojson_to_df_concurrent(
     logger.info(f"Loaded {len(gdf):,} features")
 
     if validate_geometries:
-        gdf = clean_geodataframe(gdf, logger=logger)
+        gdf = clean_geodataframe(
+            gdf, remove_nulls=False, repair_geometries=False, logger=logger
+        )
 
     # Add stable plotIds for merging (starting from 1, not 0)
     gdf[plot_id_column] = range(1, len(gdf) + 1)
@@ -1134,7 +1155,12 @@
         for i, batch in enumerate(batches)
     }
 
+    # Track which batches failed for retry
+    batch_map = {i: batch for i, batch in enumerate(batches)}
+    batch_futures = {future: i for future, i in futures.items()}
+
     for future in as_completed(futures):
+        batch_idx = batch_futures[future]
         try:
             batch_idx, df_server, df_client = future.result()
 
@@ -1179,12 +1205,16 @@
             progress.update()
 
         except Exception as e:
+            # Batch failed - fail fast with clear guidance
             error_msg = str(e)
-            logger.error(f"Batch processing error: {error_msg[:100]}")
-            import traceback
+            logger.error(f"Batch {batch_idx} failed: {error_msg[:100]}")
+            logger.debug(f"Full error: {error_msg}")
+
+            # Get original batch for error reporting
+            original_batch = batch_map[batch_idx]
 
-            logger.debug(traceback.format_exc())
-            batch_errors.append(error_msg)
+            # Add to batch errors for final reporting
+            batch_errors.append((batch_idx, original_batch, error_msg))
         finally:
             # Restore logger levels
             fiona_logger.setLevel(old_fiona_level)
@@ -1192,8 +1222,60 @@
 
     progress.finish()
 
-    # Check if we should retry with validation due to band errors
-    if batch_errors and not results:
+    # If we have batch errors after retry attempts, fail the entire process
+    if batch_errors:
+        total_failed_rows = sum(len(batch) for _, batch, _ in batch_errors)
+        failed_batch_indices = [str(idx) for idx, _, _ in batch_errors]
+
+        # Format detailed error information for debugging
+        error_details_list = []
+        for idx, batch, msg in batch_errors:
+            error_details_list.append(f"  Batch {idx} ({len(batch)} features): {msg}")
+        error_details = "\n".join(error_details_list)
+
+        # Analyze error patterns for debugging hints
+        error_patterns = {
+            "memory": any("memory" in msg.lower() for _, _, msg in batch_errors),
+            "request_size": any(
+                keyword in msg.lower()
+                for _, _, msg in batch_errors
+                for keyword in ["too large", "10mb", "payload", "size limit"]
+            ),
+            "quota": any("quota" in msg.lower() for _, _, msg in batch_errors),
+            "timeout": any("timeout" in msg.lower() for _, _, msg in batch_errors),
+        }
+
+        # Build helpful suggestions based on error patterns
+        suggestions = []
+        if error_patterns["memory"]:
+            suggestions.append(
+                f"  • Reduce batch_size parameter (currently: {batch_size}). Try: batch_size=5 or lower"
+            )
+        if error_patterns["request_size"]:
+            suggestions.append(
+                "  • Request payload too large: reduce batch_size or simplify input geometries"
+            )
+        if error_patterns["quota"]:
+            suggestions.append("  • Earth Engine quota exceeded: wait and retry later")
+        if error_patterns["timeout"]:
+            suggestions.append(
+                "  • Processing timeout: reduce batch_size or simplify input geometries"
+            )
+
+        suggestions_text = (
+            "\nDebugging hints:\n" + "\n".join(suggestions) if suggestions else ""
+        )
+
+        raise RuntimeError(
+            f"Failed to process {len(batch_errors)} batch(es):\n"
+            f"\n{error_details}\n"
+            f"\nTotal rows affected: {total_failed_rows}\n"
+            f"{suggestions_text}\n"
+            f"Please reduce batch_size and try again."
+        )
+
+    # Check if we should retry with validation due to band errors (legacy band error handling)
+    if not results:
         # All batches failed - likely a bad band issue
         is_band_error = any(
             keyword in str(batch_errors)
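
Editorial note: with sub-batch retries removed, the RuntimeError raised above is the caller's cue to retry with a smaller batch. A minimal fail-fast retry loop, assuming batch_size is forwarded to the concurrent path by the fast wrapper as its docstring indicates:

    batch_size = 10  # default from the function signature
    df = None
    while batch_size >= 1:
        try:
            df = whisp_formatted_stats_geojson_to_df_fast(
                "data.geojson",  # illustrative path
                mode="concurrent",
                batch_size=batch_size,
            )
            break
        except RuntimeError as exc:
            # The error text above recommends reducing batch_size and retrying.
            print(f"Failed at batch_size={batch_size}: {exc}")
            batch_size //= 2
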
@@ -1564,8 +1646,10 @@ def whisp_stats_geojson_to_df_sequential(
     gdf = _load_geojson_silently(input_geojson_filepath)
     logger.info(f"Loaded {len(gdf):,} features")
 
-    # Clean geometries
-    gdf = clean_geodataframe(gdf, logger=logger)
+    # Clean geometries (preserve both null and invalid geometries by default)
+    gdf = clean_geodataframe(
+        gdf, remove_nulls=False, repair_geometries=False, logger=logger
+    )
 
     # Add stable plotIds for merging (starting from 1, not 0)
     gdf[plot_id_column] = range(1, len(gdf) + 1)
@@ -1748,7 +1832,7 @@ def whisp_formatted_stats_geojson_to_df_concurrent(
     convert_water_flag: bool = True,
     water_flag_threshold: float = 0.5,
     sort_column: str = "plotId",
-    include_geometry_audit_trail: bool = False,
+    geometry_audit_trail: bool = False,
 ) -> pd.DataFrame:
     """
     Process GeoJSON concurrently with automatic formatting and validation.
@@ -1799,14 +1883,10 @@
         Water flag ratio threshold (default 0.5)
     sort_column : str
         Column to sort by (default "plotId", None to skip)
-    include_geometry_audit_trail : bool, default False
-        If True, includes audit trail columns:
-        - geo_original: Original input geometry (before EE processing)
-        - geometry_type_original: Original geometry type
-        - geometry_type: Processed geometry type (from EE)
-        - geometry_type_changed: Boolean flag if geometry changed
-        - geometry_type_transition: Description of how it changed
-        These columns enable full transparency and auditability for compliance tracking.
+    geometry_audit_trail : bool, default False
+        If True, includes original input geometry column:
+        - geo_original: Original input geometry (before EE processing), stored as GeoJSON
+        Enables geometry traceability for compliance and audit purposes.
 
     Returns
     -------
@@ -1826,8 +1906,11 @@
     decimal_places = _extract_decimal_places(stats_area_columns_formatting)
     logger.debug(f"Using decimal_places={decimal_places} from config")
 
-    # Normalize keep_external_columns parameter early (will be used in merge logic later)
-    # Load GeoJSON temporarily to get column names for normalization
+    # Load original geometries once here if needed for audit trail (avoid reloading later)
+    gdf_original_geoms = None
+    if geometry_audit_trail:
+        logger.debug("Pre-loading GeoJSON for geometry audit trail...")
+        gdf_original_geoms = _load_geojson_silently(input_geojson_filepath)
 
     # Step 1: Get raw stats
     logger.debug("Step 1/2: Extracting statistics (concurrent)...")
@@ -1890,95 +1973,39 @@
     )
 
     # Step 2c: Add audit trail columns (AFTER validation to preserve columns)
-    if include_geometry_audit_trail:
+    if geometry_audit_trail:
         logger.debug("Adding audit trail columns...")
         try:
-            # Capture original geometries AFTER we have the raw stats
-            logger.debug("Capturing original geometries for audit trail...")
-            gdf_original = _load_geojson_silently(input_geojson_filepath)
+            # Use pre-loaded original geometries (loaded at wrapper start to avoid reloading)
+            if gdf_original_geoms is None:
+                logger.warning("Original geometries not pre-loaded, loading now...")
+                gdf_original_geoms = _load_geojson_silently(input_geojson_filepath)
 
             # Use plotId from df_validated to maintain mapping
             df_original_geom = pd.DataFrame(
                 {
-                    "plotId": df_validated["plotId"].values[: len(gdf_original)],
-                    "geo_original": gdf_original["geometry"].apply(
+                    "plotId": df_validated["plotId"].values[: len(gdf_original_geoms)],
+                    "geo_original": gdf_original_geoms["geometry"].apply(
                         lambda g: json.dumps(mapping(g)) if g is not None else None
                     ),
-                    "geometry_type_original": gdf_original["geometry"].geom_type.values,
                 }
             )
 
             # Merge original geometries back
             df_validated = df_validated.merge(df_original_geom, on="plotId", how="left")
 
-            # Extract geometry type from processed 'geo' column if it exists
-            # Note: 'geo' column may not exist after validation removes extra columns
-            if "geo" in df_validated.columns:
-                # Use geo column from validated dataframe
-                def extract_geom_type(x):
-                    try:
-                        if isinstance(x, dict):
-                            return x.get("type")
-                        elif isinstance(x, str):
-                            # Handle both JSON strings and Python dict string representations
-                            try:
-                                parsed = json.loads(x)
-                            except:
-                                # Try ast.literal_eval for Python dict representations
-                                import ast
-
-                                parsed = ast.literal_eval(x)
-                            return (
-                                parsed.get("type") if isinstance(parsed, dict) else None
-                            )
-                    except:
-                        pass
-                    return None
-
-                df_validated["geometry_type"] = df_validated["geo"].apply(
-                    extract_geom_type
-                )
-            else:
-                # If geo doesn't exist, just use the original type
-                df_validated["geometry_type"] = df_validated["geometry_type_original"]
-
-            # Flag if geometry changed
-            df_validated["geometry_type_changed"] = (
-                df_validated["geometry_type_original"] != df_validated["geometry_type"]
-            )
-
-            # Classify the geometry type transition
-            def classify_transition(orig, proc):
-                if orig == proc:
-                    return "no_change"
-                elif proc == "LineString":
-                    return f"{orig}_simplified_to_linestring"
-                elif proc == "Point":
-                    return f"{orig}_simplified_to_point"
-                else:
-                    return f"{orig}_to_{proc}"
-
-            df_validated["geometry_type_transition"] = df_validated.apply(
-                lambda row: classify_transition(
-                    row["geometry_type_original"], row["geometry_type"]
-                ),
-                axis=1,
-            )
-
             # Store processing metadata
             df_validated.attrs["processing_metadata"] = {
-                "whisp_version": "2.0",
+                "whisp_version": "3.0.0a1",
                 "processing_date": datetime.now().isoformat(),
                 "processing_mode": "concurrent",
                 "ee_endpoint": "high_volume",
                 "validate_geometries": validate_geometries,
                 "datasets_used": national_codes or [],
-                "include_geometry_audit_trail": True,
+                "geometry_audit_trail": True,
            }
 
-            logger.info(
-                f"Audit trail added: {df_validated['geometry_type_changed'].sum()} geometries with type changes"
-            )
+            logger.info(f"Audit trail added: geo_original column")
 
         except Exception as e:
             logger.warning(f"Error adding audit trail: {e}")
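
Editorial note: the audit trail is now a single geo_original column of GeoJSON strings (written via json.dumps(mapping(g)) above), so geometry-type comparisons move to the caller. A sketch of recovering shapely geometries from the returned DataFrame (df is assumed to be the wrapper's output):

    import json
    from shapely.geometry import shape

    # geo_original holds a GeoJSON string per row, or None
    df["original_geometry"] = df["geo_original"].apply(
        lambda s: shape(json.loads(s)) if s else None
    )
    df["geometry_type_original"] = df["original_geometry"].apply(
        lambda g: g.geom_type if g is not None else None
    )
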
@@ -2016,7 +2043,7 @@ def whisp_formatted_stats_geojson_to_df_sequential(
     convert_water_flag: bool = True,
     water_flag_threshold: float = 0.5,
     sort_column: str = "plotId",
-    include_geometry_audit_trail: bool = False,
+    geometry_audit_trail: bool = False,
 ) -> pd.DataFrame:
     """
     Process GeoJSON sequentially with automatic formatting and validation.
@@ -2059,14 +2086,10 @@
         Water flag ratio threshold (default 0.5)
     sort_column : str
         Column to sort by (default "plotId", None to skip)
-    include_geometry_audit_trail : bool, default True
-        If True, includes audit trail columns:
-        - geo_original: Original input geometry (before EE processing)
-        - geometry_type_original: Original geometry type
-        - geometry_type: Processed geometry type (from EE)
-        - geometry_type_changed: Boolean flag if geometry changed
-        - geometry_type_transition: Description of how it changed
-        These columns enable full transparency and auditability for EUDR compliance.
+    geometry_audit_trail : bool, default True
+        If True, includes original input geometry column:
+        - geo_original: Original input geometry (before EE processing), stored as GeoJSON
+        Enables geometry traceability for compliance and audit purposes.
 
     Returns
     -------
@@ -2086,6 +2109,12 @@
     decimal_places = _extract_decimal_places(stats_area_columns_formatting)
     logger.debug(f"Using decimal_places={decimal_places} from config")
 
+    # Load original geometries once here if needed for audit trail (avoid reloading later)
+    gdf_original_geoms = None
+    if geometry_audit_trail:
+        logger.debug("Pre-loading GeoJSON for geometry audit trail...")
+        gdf_original_geoms = _load_geojson_silently(input_geojson_filepath)
+
     # Step 1: Get raw stats
     logger.debug("Step 1/2: Extracting statistics (sequential)...")
     df_raw = whisp_stats_geojson_to_df_sequential(
@@ -2143,94 +2172,38 @@
     )
 
     # Step 2c: Add audit trail columns (AFTER validation to preserve columns)
-    if include_geometry_audit_trail:
+    if geometry_audit_trail:
         logger.debug("Adding audit trail columns...")
         try:
-            # Capture original geometries AFTER we have the raw stats
-            logger.debug("Capturing original geometries for audit trail...")
-            gdf_original = _load_geojson_silently(input_geojson_filepath)
+            # Use pre-loaded original geometries (loaded at wrapper start to avoid reloading)
+            if gdf_original_geoms is None:
+                logger.warning("Original geometries not pre-loaded, loading now...")
+                gdf_original_geoms = _load_geojson_silently(input_geojson_filepath)
 
             # Use plotId from df_validated to maintain mapping
             df_original_geom = pd.DataFrame(
                 {
-                    "plotId": df_validated["plotId"].values[: len(gdf_original)],
-                    "geo_original": gdf_original["geometry"].apply(
+                    "plotId": df_validated["plotId"].values[: len(gdf_original_geoms)],
+                    "geo_original": gdf_original_geoms["geometry"].apply(
                         lambda g: json.dumps(mapping(g)) if g is not None else None
                     ),
-                    "geometry_type_original": gdf_original["geometry"].geom_type.values,
                 }
             )
 
             # Merge original geometries back
             df_validated = df_validated.merge(df_original_geom, on="plotId", how="left")
 
-            # Extract geometry type from processed 'geo' column if it exists
-            # Note: 'geo' column may not exist after validation removes extra columns
-            if "geo" in df_validated.columns:
-                # Use geo column from validated dataframe
-                def extract_geom_type(x):
-                    try:
-                        if isinstance(x, dict):
-                            return x.get("type")
-                        elif isinstance(x, str):
-                            # Handle both JSON strings and Python dict string representations
-                            try:
-                                parsed = json.loads(x)
-                            except:
-                                # Try ast.literal_eval for Python dict representations
-                                import ast
-
-                                parsed = ast.literal_eval(x)
-                            return (
-                                parsed.get("type") if isinstance(parsed, dict) else None
-                            )
-                    except:
-                        pass
-                    return None
-
-                df_validated["geometry_type"] = df_validated["geo"].apply(
-                    extract_geom_type
-                )
-            else:
-                # If geo doesn't exist, just use the original type
-                df_validated["geometry_type"] = df_validated["geometry_type_original"]
-
-            # Flag if geometry changed
-            df_validated["geometry_type_changed"] = (
-                df_validated["geometry_type_original"] != df_validated["geometry_type"]
-            )
-
-            # Classify the geometry type transition
-            def classify_transition(orig, proc):
-                if orig == proc:
-                    return "no_change"
-                elif proc == "LineString":
-                    return f"{orig}_simplified_to_linestring"
-                elif proc == "Point":
-                    return f"{orig}_simplified_to_point"
-                else:
-                    return f"{orig}_to_{proc}"
-
-            df_validated["geometry_type_transition"] = df_validated.apply(
-                lambda row: classify_transition(
-                    row["geometry_type_original"], row["geometry_type"]
-                ),
-                axis=1,
-            )
-
             # Store processing metadata
             df_validated.attrs["processing_metadata"] = {
-                "whisp_version": "2.0",
+                "whisp_version": "3.0.0a1",
                 "processing_date": datetime.now().isoformat(),
                 "processing_mode": "sequential",
                 "ee_endpoint": "standard",
                 "datasets_used": national_codes or [],
-                "include_geometry_audit_trail": True,
+                "geometry_audit_trail": True,
             }
 
-            logger.info(
-                f"Audit trail added: {df_validated['geometry_type_changed'].sum()} geometries with type changes"
-            )
+            logger.info(f"Audit trail added: geo_original column")
 
         except Exception as e:
             logger.warning(f"Error adding audit trail: {e}")
@@ -2265,7 +2238,7 @@ def whisp_formatted_stats_geojson_to_df_fast(
     unit_type: str = "ha",
     whisp_image: ee.Image = None,
     custom_bands: Dict[str, Any] = None,
-    mode: str = "auto",
+    mode: str = "sequential",
     # Concurrent-specific parameters
     batch_size: int = 10,
     max_concurrent: int = 20,
@@ -2278,15 +2251,15 @@
     convert_water_flag: bool = True,
     water_flag_threshold: float = 0.5,
     sort_column: str = "plotId",
-    include_geometry_audit_trail: bool = False,
+    geometry_audit_trail: bool = False,
 ) -> pd.DataFrame:
     """
     Process GeoJSON to Whisp statistics with optimized fast processing.
 
-    Automatically selects between concurrent (high-volume endpoint) and sequential
-    (standard endpoint) based on file size, or allows explicit mode selection.
+    Routes to concurrent (high-volume endpoint) or sequential (standard endpoint)
+    based on explicit mode selection.
 
-    This is the recommended entry point for most users who want automatic optimization.
+    This is the recommended entry point for most users.
 
     Parameters
     ----------
@@ -2306,12 +2279,8 @@
         Custom band information
     mode : str
         Processing mode:
-        - "auto": Choose based on file size (default)
-          * <1MB: sequential
-          * 1-5MB: sequential
-          * >5MB: concurrent
-        - "concurrent": Force high-volume endpoint (batch processing)
-        - "sequential": Force standard endpoint (single-threaded)
+        - "concurrent": Uses high-volume endpoint with batch processing
+        - "sequential": Uses standard endpoint for sequential processing
     batch_size : int
         Features per batch (only for concurrent mode)
     max_concurrent : int
@@ -2332,6 +2301,8 @@
         Water flag ratio threshold
     sort_column : str
         Column to sort by
+    geometry_audit_trail : bool
+        Include geometry modification audit trail columns
 
     Returns
     -------
@@ -2340,16 +2311,13 @@
 
     Examples
     --------
-    >>> # Auto-detect best method based on file size
-    >>> df = whisp_formatted_stats_geojson_to_df_fast("data.geojson")
-
-    >>> # Force concurrent processing for large datasets
+    >>> # Use concurrent processing (recommended for most datasets)
     >>> df = whisp_formatted_stats_geojson_to_df_fast(
-    ...     "large_data.geojson",
+    ...     "data.geojson",
     ...     mode="concurrent"
     ... )
 
-    >>> # Use sequential for guaranteed completion
+    >>> # Use sequential processing for more stable results
     >>> df = whisp_formatted_stats_geojson_to_df_fast(
     ...     "data.geojson",
     ...     mode="sequential"
@@ -2357,35 +2325,16 @@
     """
     logger = logging.getLogger("whisp")
 
-    # Determine processing mode
-    if mode == "auto":
-        try:
-            file_size = Path(input_geojson_filepath).stat().st_size
-            if file_size > 5_000_000:  # >5MB
-                chosen_mode = "concurrent"
-                logger.info(
-                    f"File size {file_size/1e6:.1f}MB → Using concurrent (high-volume endpoint)"
-                )
-            else:  # <=5MB
-                chosen_mode = "sequential"
-                logger.info(
-                    f"File size {file_size/1e6:.1f}MB → Using sequential (standard endpoint)"
-                )
-        except Exception as e:
-            logger.warning(
-                f"Could not determine file size: {e}. Defaulting to sequential."
-            )
-            chosen_mode = "sequential"
-    elif mode in ("concurrent", "sequential"):
-        chosen_mode = mode
-        logger.info(f"Mode explicitly set to: {mode}")
-    else:
+    # Validate mode parameter
+    if mode not in ("concurrent", "sequential"):
         raise ValueError(
-            f"Invalid mode '{mode}'. Must be 'auto', 'concurrent', or 'sequential'."
+            f"Invalid mode '{mode}'. Must be 'concurrent' or 'sequential'."
         )
 
+    logger.info(f"Mode: {mode}")
+
     # Route to appropriate function
-    if chosen_mode == "concurrent":
+    if mode == "concurrent":
         logger.debug("Routing to concurrent processing...")
         return whisp_formatted_stats_geojson_to_df_concurrent(
             input_geojson_filepath=input_geojson_filepath,
@@ -2406,7 +2355,7 @@ def whisp_formatted_stats_geojson_to_df_fast(
             convert_water_flag=convert_water_flag,
             water_flag_threshold=water_flag_threshold,
             sort_column=sort_column,
-            include_geometry_audit_trail=include_geometry_audit_trail,
+            geometry_audit_trail=geometry_audit_trail,
         )
     else:  # sequential
         logger.debug("Routing to sequential processing...")
2412
2361
  logger.debug("Routing to sequential processing...")
@@ -2424,5 +2373,5 @@ def whisp_formatted_stats_geojson_to_df_fast(
2424
2373
  convert_water_flag=convert_water_flag,
2425
2374
  water_flag_threshold=water_flag_threshold,
2426
2375
  sort_column=sort_column,
2427
- include_geometry_audit_trail=include_geometry_audit_trail,
2376
+ geometry_audit_trail=geometry_audit_trail,
2428
2377
  )
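
Editorial note: taken together, the entry point now defaults to sequential mode, rejects "auto", and uses the shorter audit-trail flag. A usage sketch against the new signature (input path illustrative; the import path matches the one shown at the top of this diff):

    from openforis_whisp.advanced_stats import whisp_formatted_stats_geojson_to_df_fast

    df = whisp_formatted_stats_geojson_to_df_fast(
        "data.geojson",
        mode="sequential",          # explicit; "auto" is no longer accepted
        geometry_audit_trail=True,  # renamed from include_geometry_audit_trail
    )
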