openforis-whisp 3.0.0a2__py3-none-any.whl → 3.0.0a3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- openforis_whisp/__init__.py +7 -7
- openforis_whisp/advanced_stats.py +171 -222
- openforis_whisp/reformat.py +2 -29
- openforis_whisp/stats.py +6 -51
- openforis_whisp/utils.py +449 -80
- {openforis_whisp-3.0.0a2.dist-info → openforis_whisp-3.0.0a3.dist-info}/METADATA +1 -1
- {openforis_whisp-3.0.0a2.dist-info → openforis_whisp-3.0.0a3.dist-info}/RECORD +9 -9
- {openforis_whisp-3.0.0a2.dist-info → openforis_whisp-3.0.0a3.dist-info}/LICENSE +0 -0
- {openforis_whisp-3.0.0a2.dist-info → openforis_whisp-3.0.0a3.dist-info}/WHEEL +0 -0
openforis_whisp/__init__.py
CHANGED
|
@@ -63,10 +63,6 @@ from openforis_whisp.stats import (
|
|
|
63
63
|
)
|
|
64
64
|
|
|
65
65
|
from openforis_whisp.advanced_stats import (
|
|
66
|
-
whisp_stats_geojson_to_df_concurrent,
|
|
67
|
-
whisp_formatted_stats_geojson_to_df_concurrent,
|
|
68
|
-
whisp_stats_geojson_to_df_sequential,
|
|
69
|
-
whisp_formatted_stats_geojson_to_df_sequential,
|
|
70
66
|
whisp_formatted_stats_geojson_to_df_fast,
|
|
71
67
|
)
|
|
72
68
|
|
|
@@ -83,7 +79,6 @@ from openforis_whisp.reformat import (
|
|
|
83
79
|
create_schema_from_dataframe,
|
|
84
80
|
load_schema_if_any_file_changed,
|
|
85
81
|
format_stats_dataframe,
|
|
86
|
-
# log_missing_columns,
|
|
87
82
|
)
|
|
88
83
|
|
|
89
84
|
from openforis_whisp.data_conversion import (
|
|
@@ -96,11 +91,16 @@ from openforis_whisp.data_conversion import (
|
|
|
96
91
|
|
|
97
92
|
from openforis_whisp.risk import whisp_risk, detect_unit_type
|
|
98
93
|
|
|
99
|
-
from openforis_whisp.utils import
|
|
94
|
+
from openforis_whisp.utils import (
|
|
95
|
+
get_example_data_path,
|
|
96
|
+
generate_test_polygons, # to be deprecated
|
|
97
|
+
generate_random_features,
|
|
98
|
+
generate_random_points,
|
|
99
|
+
generate_random_polygons,
|
|
100
|
+
)
|
|
100
101
|
|
|
101
102
|
from openforis_whisp.data_checks import (
|
|
102
103
|
analyze_geojson,
|
|
103
104
|
validate_geojson_constraints,
|
|
104
|
-
_check_metric_constraints,
|
|
105
105
|
suggest_method,
|
|
106
106
|
)
|
|
@@ -600,18 +600,22 @@ def validate_ee_endpoint(endpoint_type: str = "high-volume", raise_error: bool =
|
|
|
600
600
|
If incorrect endpoint and raise_error=True
|
|
601
601
|
"""
|
|
602
602
|
if not check_ee_endpoint(endpoint_type):
|
|
603
|
-
msg = (
|
|
604
|
-
f"Not using {endpoint_type.upper()} endpoint.\n"
|
|
605
|
-
f"Current URL: {ee.data._cloud_api_base_url}\n"
|
|
606
|
-
f"\nTo use {endpoint_type} endpoint, run:\n"
|
|
607
|
-
)
|
|
608
|
-
msg += "ee.Reset()\n"
|
|
609
603
|
if endpoint_type == "high-volume":
|
|
610
|
-
msg
|
|
611
|
-
"
|
|
604
|
+
msg = (
|
|
605
|
+
"Concurrent mode requires the HIGH-VOLUME endpoint. To change endpoint run:\n"
|
|
606
|
+
"ee.Reset()\n"
|
|
607
|
+
"ee.Initialize(opt_url='https://earthengine-highvolume.googleapis.com')\n"
|
|
608
|
+
"Or with project specified (e.g. when in Colab):\n"
|
|
609
|
+
"ee.Initialize(project='your_cloud_project_name', opt_url='https://earthengine-highvolume.googleapis.com')"
|
|
610
|
+
)
|
|
611
|
+
else: # standard endpoint
|
|
612
|
+
msg = (
|
|
613
|
+
"Sequential mode requires the STANDARD endpoint. To change endpoint run:\n"
|
|
614
|
+
"ee.Reset()\n"
|
|
615
|
+
"ee.Initialize()\n"
|
|
616
|
+
"Or with project specified (e.g. when in Colab):\n"
|
|
617
|
+
"ee.Initialize(project='your_cloud_project_name')"
|
|
612
618
|
)
|
|
613
|
-
else:
|
|
614
|
-
msg += "ee.Initialize() # Uses standard endpoint by default"
|
|
615
619
|
|
|
616
620
|
if raise_error:
|
|
617
621
|
raise RuntimeError(msg)
|
|
@@ -808,8 +812,8 @@ def convert_batch_to_ee(batch_gdf: gpd.GeoDataFrame) -> ee.FeatureCollection:
|
|
|
808
812
|
|
|
809
813
|
def clean_geodataframe(
|
|
810
814
|
gdf: gpd.GeoDataFrame,
|
|
811
|
-
remove_nulls: bool =
|
|
812
|
-
|
|
815
|
+
remove_nulls: bool = False,
|
|
816
|
+
repair_geometries: bool = False,
|
|
813
817
|
logger: logging.Logger = None,
|
|
814
818
|
) -> gpd.GeoDataFrame:
|
|
815
819
|
"""
|
|
@@ -820,9 +824,11 @@ def clean_geodataframe(
|
|
|
820
824
|
gdf : gpd.GeoDataFrame
|
|
821
825
|
Input GeoDataFrame
|
|
822
826
|
remove_nulls : bool
|
|
823
|
-
Remove null geometries
|
|
824
|
-
|
|
825
|
-
|
|
827
|
+
Remove null geometries. Defaults to False to preserve data integrity.
|
|
828
|
+
Set to True only if you explicitly want to drop rows with null geometries.
|
|
829
|
+
repair_geometries : bool
|
|
830
|
+
Repair invalid geometries using Shapely's make_valid(). Defaults to False to preserve
|
|
831
|
+
original geometries. Set to True only if you want to automatically repair invalid geometries.
|
|
826
832
|
logger : logging.Logger, optional
|
|
827
833
|
Logger for output
|
|
828
834
|
|
|
@@ -839,11 +845,11 @@ def clean_geodataframe(
|
|
|
839
845
|
logger.warning(f"Removing {null_count} null geometries")
|
|
840
846
|
gdf = gdf[~gdf.geometry.isna()].copy()
|
|
841
847
|
|
|
842
|
-
if
|
|
848
|
+
if repair_geometries:
|
|
843
849
|
valid_count = gdf.geometry.is_valid.sum()
|
|
844
850
|
invalid_count = len(gdf) - valid_count
|
|
845
851
|
if invalid_count > 0:
|
|
846
|
-
logger.warning(f"
|
|
852
|
+
logger.warning(f"Repairing {invalid_count} invalid geometries")
|
|
847
853
|
from shapely.validation import make_valid
|
|
848
854
|
|
|
849
855
|
gdf = gdf.copy()
|
|
@@ -855,6 +861,19 @@ def clean_geodataframe(
|
|
|
855
861
|
return gdf
|
|
856
862
|
|
|
857
863
|
|
|
864
|
+
# ============================================================================
|
|
865
|
+
# BATCH RETRY HELPER
|
|
866
|
+
# ============================================================================
|
|
867
|
+
|
|
868
|
+
|
|
869
|
+
# ============================================================================
|
|
870
|
+
# BATCH RETRY HELPER - DEPRECATED (removed due to semaphore deadlock issues)
|
|
871
|
+
# ============================================================================
|
|
872
|
+
# Note: Retry logic via sub-batching has been removed. Instead, use fail-fast
|
|
873
|
+
# approach: when a batch fails, reduce batch_size parameter and retry manually.
|
|
874
|
+
# This avoids semaphore deadlocks and provides clearer error messages.
|
|
875
|
+
|
|
876
|
+
|
|
858
877
|
# ============================================================================
|
|
859
878
|
# EE PROCESSING WITH RETRY LOGIC
|
|
860
879
|
# ============================================================================
|
|
@@ -1041,7 +1060,9 @@ def whisp_stats_geojson_to_df_concurrent(
|
|
|
1041
1060
|
logger.info(f"Loaded {len(gdf):,} features")
|
|
1042
1061
|
|
|
1043
1062
|
if validate_geometries:
|
|
1044
|
-
gdf = clean_geodataframe(
|
|
1063
|
+
gdf = clean_geodataframe(
|
|
1064
|
+
gdf, remove_nulls=False, repair_geometries=False, logger=logger
|
|
1065
|
+
)
|
|
1045
1066
|
|
|
1046
1067
|
# Add stable plotIds for merging (starting from 1, not 0)
|
|
1047
1068
|
gdf[plot_id_column] = range(1, len(gdf) + 1)
|
|
@@ -1134,7 +1155,12 @@ def whisp_stats_geojson_to_df_concurrent(
|
|
|
1134
1155
|
for i, batch in enumerate(batches)
|
|
1135
1156
|
}
|
|
1136
1157
|
|
|
1158
|
+
# Map batch indices to their batches and futures so failures can be reported per batch
|
|
1159
|
+
batch_map = {i: batch for i, batch in enumerate(batches)}
|
|
1160
|
+
batch_futures = {future: i for future, i in futures.items()}
|
|
1161
|
+
|
|
1137
1162
|
for future in as_completed(futures):
|
|
1163
|
+
batch_idx = batch_futures[future]
|
|
1138
1164
|
try:
|
|
1139
1165
|
batch_idx, df_server, df_client = future.result()
|
|
1140
1166
|
|
|
@@ -1179,12 +1205,16 @@ def whisp_stats_geojson_to_df_concurrent(
|
|
|
1179
1205
|
progress.update()
|
|
1180
1206
|
|
|
1181
1207
|
except Exception as e:
|
|
1208
|
+
# Batch failed - fail fast with clear guidance
|
|
1182
1209
|
error_msg = str(e)
|
|
1183
|
-
logger.error(f"Batch
|
|
1184
|
-
|
|
1210
|
+
logger.error(f"Batch {batch_idx} failed: {error_msg[:100]}")
|
|
1211
|
+
logger.debug(f"Full error: {error_msg}")
|
|
1212
|
+
|
|
1213
|
+
# Get original batch for error reporting
|
|
1214
|
+
original_batch = batch_map[batch_idx]
|
|
1185
1215
|
|
|
1186
|
-
|
|
1187
|
-
batch_errors.append(error_msg)
|
|
1216
|
+
# Add to batch errors for final reporting
|
|
1217
|
+
batch_errors.append((batch_idx, original_batch, error_msg))
|
|
1188
1218
|
finally:
|
|
1189
1219
|
# Restore logger levels
|
|
1190
1220
|
fiona_logger.setLevel(old_fiona_level)
|
|
@@ -1192,8 +1222,60 @@ def whisp_stats_geojson_to_df_concurrent(
|
|
|
1192
1222
|
|
|
1193
1223
|
progress.finish()
|
|
1194
1224
|
|
|
1195
|
-
#
|
|
1196
|
-
if batch_errors
|
|
1225
|
+
# If any batch failed, fail the entire process (fail-fast; there is no automatic retry)
|
|
1226
|
+
if batch_errors:
|
|
1227
|
+
total_failed_rows = sum(len(batch) for _, batch, _ in batch_errors)
|
|
1228
|
+
failed_batch_indices = [str(idx) for idx, _, _ in batch_errors]
|
|
1229
|
+
|
|
1230
|
+
# Format detailed error information for debugging
|
|
1231
|
+
error_details_list = []
|
|
1232
|
+
for idx, batch, msg in batch_errors:
|
|
1233
|
+
error_details_list.append(f" Batch {idx} ({len(batch)} features): {msg}")
|
|
1234
|
+
error_details = "\n".join(error_details_list)
|
|
1235
|
+
|
|
1236
|
+
# Analyze error patterns for debugging hints
|
|
1237
|
+
error_patterns = {
|
|
1238
|
+
"memory": any("memory" in msg.lower() for _, _, msg in batch_errors),
|
|
1239
|
+
"request_size": any(
|
|
1240
|
+
keyword in msg.lower()
|
|
1241
|
+
for _, _, msg in batch_errors
|
|
1242
|
+
for keyword in ["too large", "10mb", "payload", "size limit"]
|
|
1243
|
+
),
|
|
1244
|
+
"quota": any("quota" in msg.lower() for _, _, msg in batch_errors),
|
|
1245
|
+
"timeout": any("timeout" in msg.lower() for _, _, msg in batch_errors),
|
|
1246
|
+
}
|
|
1247
|
+
|
|
1248
|
+
# Build helpful suggestions based on error patterns
|
|
1249
|
+
suggestions = []
|
|
1250
|
+
if error_patterns["memory"]:
|
|
1251
|
+
suggestions.append(
|
|
1252
|
+
f" • Reduce batch_size parameter (currently: {batch_size}). Try: batch_size=5 or lower"
|
|
1253
|
+
)
|
|
1254
|
+
if error_patterns["request_size"]:
|
|
1255
|
+
suggestions.append(
|
|
1256
|
+
" • Request payload too large: reduce batch_size or simplify input geometries"
|
|
1257
|
+
)
|
|
1258
|
+
if error_patterns["quota"]:
|
|
1259
|
+
suggestions.append(" • Earth Engine quota exceeded: wait and retry later")
|
|
1260
|
+
if error_patterns["timeout"]:
|
|
1261
|
+
suggestions.append(
|
|
1262
|
+
" • Processing timeout: reduce batch_size or simplify input geometries"
|
|
1263
|
+
)
|
|
1264
|
+
|
|
1265
|
+
suggestions_text = (
|
|
1266
|
+
"\nDebugging hints:\n" + "\n".join(suggestions) if suggestions else ""
|
|
1267
|
+
)
|
|
1268
|
+
|
|
1269
|
+
raise RuntimeError(
|
|
1270
|
+
f"Failed to process {len(batch_errors)} batch(es):\n"
|
|
1271
|
+
f"\n{error_details}\n"
|
|
1272
|
+
f"\nTotal rows affected: {total_failed_rows}\n"
|
|
1273
|
+
f"{suggestions_text}\n"
|
|
1274
|
+
f"Please reduce batch_size and try again."
|
|
1275
|
+
)
|
|
1276
|
+
|
|
1277
|
+
# Check if we should retry with validation due to band errors (legacy band error handling)
|
|
1278
|
+
if not results:
|
|
1197
1279
|
# All batches failed - likely a bad band issue
|
|
1198
1280
|
is_band_error = any(
|
|
1199
1281
|
keyword in str(batch_errors)
|
|
@@ -1564,8 +1646,10 @@ def whisp_stats_geojson_to_df_sequential(
|
|
|
1564
1646
|
gdf = _load_geojson_silently(input_geojson_filepath)
|
|
1565
1647
|
logger.info(f"Loaded {len(gdf):,} features")
|
|
1566
1648
|
|
|
1567
|
-
# Clean geometries
|
|
1568
|
-
gdf = clean_geodataframe(
|
|
1649
|
+
# Clean geometries (preserve both null and invalid geometries by default)
|
|
1650
|
+
gdf = clean_geodataframe(
|
|
1651
|
+
gdf, remove_nulls=False, repair_geometries=False, logger=logger
|
|
1652
|
+
)
|
|
1569
1653
|
|
|
1570
1654
|
# Add stable plotIds for merging (starting from 1, not 0)
|
|
1571
1655
|
gdf[plot_id_column] = range(1, len(gdf) + 1)
|
|
@@ -1748,7 +1832,7 @@ def whisp_formatted_stats_geojson_to_df_concurrent(
|
|
|
1748
1832
|
convert_water_flag: bool = True,
|
|
1749
1833
|
water_flag_threshold: float = 0.5,
|
|
1750
1834
|
sort_column: str = "plotId",
|
|
1751
|
-
|
|
1835
|
+
geometry_audit_trail: bool = False,
|
|
1752
1836
|
) -> pd.DataFrame:
|
|
1753
1837
|
"""
|
|
1754
1838
|
Process GeoJSON concurrently with automatic formatting and validation.
|
|
@@ -1799,14 +1883,10 @@ def whisp_formatted_stats_geojson_to_df_concurrent(
|
|
|
1799
1883
|
Water flag ratio threshold (default 0.5)
|
|
1800
1884
|
sort_column : str
|
|
1801
1885
|
Column to sort by (default "plotId", None to skip)
|
|
1802
|
-
|
|
1803
|
-
If True, includes
|
|
1804
|
-
- geo_original: Original input geometry (before EE processing)
|
|
1805
|
-
|
|
1806
|
-
- geometry_type: Processed geometry type (from EE)
|
|
1807
|
-
- geometry_type_changed: Boolean flag if geometry changed
|
|
1808
|
-
- geometry_type_transition: Description of how it changed
|
|
1809
|
-
These columns enable full transparency and auditability for compliance tracking.
|
|
1886
|
+
geometry_audit_trail : bool, default False
|
|
1887
|
+
If True, includes original input geometry column:
|
|
1888
|
+
- geo_original: Original input geometry (before EE processing), stored as GeoJSON
|
|
1889
|
+
Enables geometry traceability for compliance and audit purposes.
|
|
1810
1890
|
|
|
1811
1891
|
Returns
|
|
1812
1892
|
-------
|
|
@@ -1826,8 +1906,11 @@ def whisp_formatted_stats_geojson_to_df_concurrent(
|
|
|
1826
1906
|
decimal_places = _extract_decimal_places(stats_area_columns_formatting)
|
|
1827
1907
|
logger.debug(f"Using decimal_places={decimal_places} from config")
|
|
1828
1908
|
|
|
1829
|
-
#
|
|
1830
|
-
|
|
1909
|
+
# Load original geometries once here if needed for audit trail (avoid reloading later)
|
|
1910
|
+
gdf_original_geoms = None
|
|
1911
|
+
if geometry_audit_trail:
|
|
1912
|
+
logger.debug("Pre-loading GeoJSON for geometry audit trail...")
|
|
1913
|
+
gdf_original_geoms = _load_geojson_silently(input_geojson_filepath)
|
|
1831
1914
|
|
|
1832
1915
|
# Step 1: Get raw stats
|
|
1833
1916
|
logger.debug("Step 1/2: Extracting statistics (concurrent)...")
|
|
@@ -1890,95 +1973,39 @@ def whisp_formatted_stats_geojson_to_df_concurrent(
|
|
|
1890
1973
|
)
|
|
1891
1974
|
|
|
1892
1975
|
# Step 2c: Add audit trail columns (AFTER validation to preserve columns)
|
|
1893
|
-
if
|
|
1976
|
+
if geometry_audit_trail:
|
|
1894
1977
|
logger.debug("Adding audit trail columns...")
|
|
1895
1978
|
try:
|
|
1896
|
-
#
|
|
1897
|
-
|
|
1898
|
-
|
|
1979
|
+
# Use pre-loaded original geometries (loaded at wrapper start to avoid reloading)
|
|
1980
|
+
if gdf_original_geoms is None:
|
|
1981
|
+
logger.warning("Original geometries not pre-loaded, loading now...")
|
|
1982
|
+
gdf_original_geoms = _load_geojson_silently(input_geojson_filepath)
|
|
1899
1983
|
|
|
1900
1984
|
# Use plotId from df_validated to maintain mapping
|
|
1901
1985
|
df_original_geom = pd.DataFrame(
|
|
1902
1986
|
{
|
|
1903
|
-
"plotId": df_validated["plotId"].values[: len(
|
|
1904
|
-
"geo_original":
|
|
1987
|
+
"plotId": df_validated["plotId"].values[: len(gdf_original_geoms)],
|
|
1988
|
+
"geo_original": gdf_original_geoms["geometry"].apply(
|
|
1905
1989
|
lambda g: json.dumps(mapping(g)) if g is not None else None
|
|
1906
1990
|
),
|
|
1907
|
-
"geometry_type_original": gdf_original["geometry"].geom_type.values,
|
|
1908
1991
|
}
|
|
1909
1992
|
)
|
|
1910
1993
|
|
|
1911
1994
|
# Merge original geometries back
|
|
1912
1995
|
df_validated = df_validated.merge(df_original_geom, on="plotId", how="left")
|
|
1913
1996
|
|
|
1914
|
-
# Extract geometry type from processed 'geo' column if it exists
|
|
1915
|
-
# Note: 'geo' column may not exist after validation removes extra columns
|
|
1916
|
-
if "geo" in df_validated.columns:
|
|
1917
|
-
# Use geo column from validated dataframe
|
|
1918
|
-
def extract_geom_type(x):
|
|
1919
|
-
try:
|
|
1920
|
-
if isinstance(x, dict):
|
|
1921
|
-
return x.get("type")
|
|
1922
|
-
elif isinstance(x, str):
|
|
1923
|
-
# Handle both JSON strings and Python dict string representations
|
|
1924
|
-
try:
|
|
1925
|
-
parsed = json.loads(x)
|
|
1926
|
-
except:
|
|
1927
|
-
# Try ast.literal_eval for Python dict representations
|
|
1928
|
-
import ast
|
|
1929
|
-
|
|
1930
|
-
parsed = ast.literal_eval(x)
|
|
1931
|
-
return (
|
|
1932
|
-
parsed.get("type") if isinstance(parsed, dict) else None
|
|
1933
|
-
)
|
|
1934
|
-
except:
|
|
1935
|
-
pass
|
|
1936
|
-
return None
|
|
1937
|
-
|
|
1938
|
-
df_validated["geometry_type"] = df_validated["geo"].apply(
|
|
1939
|
-
extract_geom_type
|
|
1940
|
-
)
|
|
1941
|
-
else:
|
|
1942
|
-
# If geo doesn't exist, just use the original type
|
|
1943
|
-
df_validated["geometry_type"] = df_validated["geometry_type_original"]
|
|
1944
|
-
|
|
1945
|
-
# Flag if geometry changed
|
|
1946
|
-
df_validated["geometry_type_changed"] = (
|
|
1947
|
-
df_validated["geometry_type_original"] != df_validated["geometry_type"]
|
|
1948
|
-
)
|
|
1949
|
-
|
|
1950
|
-
# Classify the geometry type transition
|
|
1951
|
-
def classify_transition(orig, proc):
|
|
1952
|
-
if orig == proc:
|
|
1953
|
-
return "no_change"
|
|
1954
|
-
elif proc == "LineString":
|
|
1955
|
-
return f"{orig}_simplified_to_linestring"
|
|
1956
|
-
elif proc == "Point":
|
|
1957
|
-
return f"{orig}_simplified_to_point"
|
|
1958
|
-
else:
|
|
1959
|
-
return f"{orig}_to_{proc}"
|
|
1960
|
-
|
|
1961
|
-
df_validated["geometry_type_transition"] = df_validated.apply(
|
|
1962
|
-
lambda row: classify_transition(
|
|
1963
|
-
row["geometry_type_original"], row["geometry_type"]
|
|
1964
|
-
),
|
|
1965
|
-
axis=1,
|
|
1966
|
-
)
|
|
1967
|
-
|
|
1968
1997
|
# Store processing metadata
|
|
1969
1998
|
df_validated.attrs["processing_metadata"] = {
|
|
1970
|
-
"whisp_version": "
|
|
1999
|
+
"whisp_version": "3.0.0a1",
|
|
1971
2000
|
"processing_date": datetime.now().isoformat(),
|
|
1972
2001
|
"processing_mode": "concurrent",
|
|
1973
2002
|
"ee_endpoint": "high_volume",
|
|
1974
2003
|
"validate_geometries": validate_geometries,
|
|
1975
2004
|
"datasets_used": national_codes or [],
|
|
1976
|
-
"
|
|
2005
|
+
"geometry_audit_trail": True,
|
|
1977
2006
|
}
|
|
1978
2007
|
|
|
1979
|
-
logger.info(
|
|
1980
|
-
f"Audit trail added: {df_validated['geometry_type_changed'].sum()} geometries with type changes"
|
|
1981
|
-
)
|
|
2008
|
+
logger.info(f"Audit trail added: geo_original column")
|
|
1982
2009
|
|
|
1983
2010
|
except Exception as e:
|
|
1984
2011
|
logger.warning(f"Error adding audit trail: {e}")
|
|
@@ -2016,7 +2043,7 @@ def whisp_formatted_stats_geojson_to_df_sequential(
|
|
|
2016
2043
|
convert_water_flag: bool = True,
|
|
2017
2044
|
water_flag_threshold: float = 0.5,
|
|
2018
2045
|
sort_column: str = "plotId",
|
|
2019
|
-
|
|
2046
|
+
geometry_audit_trail: bool = False,
|
|
2020
2047
|
) -> pd.DataFrame:
|
|
2021
2048
|
"""
|
|
2022
2049
|
Process GeoJSON sequentially with automatic formatting and validation.
|
|
@@ -2059,14 +2086,10 @@ def whisp_formatted_stats_geojson_to_df_sequential(
|
|
|
2059
2086
|
Water flag ratio threshold (default 0.5)
|
|
2060
2087
|
sort_column : str
|
|
2061
2088
|
Column to sort by (default "plotId", None to skip)
|
|
2062
|
-
|
|
2063
|
-
If True, includes
|
|
2064
|
-
- geo_original: Original input geometry (before EE processing)
|
|
2065
|
-
|
|
2066
|
-
- geometry_type: Processed geometry type (from EE)
|
|
2067
|
-
- geometry_type_changed: Boolean flag if geometry changed
|
|
2068
|
-
- geometry_type_transition: Description of how it changed
|
|
2069
|
-
These columns enable full transparency and auditability for EUDR compliance.
|
|
2089
|
+
geometry_audit_trail : bool, default False
|
|
2090
|
+
If True, includes original input geometry column:
|
|
2091
|
+
- geo_original: Original input geometry (before EE processing), stored as GeoJSON
|
|
2092
|
+
Enables geometry traceability for compliance and audit purposes.
|
|
2070
2093
|
|
|
2071
2094
|
Returns
|
|
2072
2095
|
-------
|
|
@@ -2086,6 +2109,12 @@ def whisp_formatted_stats_geojson_to_df_sequential(
|
|
|
2086
2109
|
decimal_places = _extract_decimal_places(stats_area_columns_formatting)
|
|
2087
2110
|
logger.debug(f"Using decimal_places={decimal_places} from config")
|
|
2088
2111
|
|
|
2112
|
+
# Load original geometries once here if needed for audit trail (avoid reloading later)
|
|
2113
|
+
gdf_original_geoms = None
|
|
2114
|
+
if geometry_audit_trail:
|
|
2115
|
+
logger.debug("Pre-loading GeoJSON for geometry audit trail...")
|
|
2116
|
+
gdf_original_geoms = _load_geojson_silently(input_geojson_filepath)
|
|
2117
|
+
|
|
2089
2118
|
# Step 1: Get raw stats
|
|
2090
2119
|
logger.debug("Step 1/2: Extracting statistics (sequential)...")
|
|
2091
2120
|
df_raw = whisp_stats_geojson_to_df_sequential(
|
|
@@ -2143,94 +2172,38 @@ def whisp_formatted_stats_geojson_to_df_sequential(
|
|
|
2143
2172
|
)
|
|
2144
2173
|
|
|
2145
2174
|
# Step 2c: Add audit trail columns (AFTER validation to preserve columns)
|
|
2146
|
-
if
|
|
2175
|
+
if geometry_audit_trail:
|
|
2147
2176
|
logger.debug("Adding audit trail columns...")
|
|
2148
2177
|
try:
|
|
2149
|
-
#
|
|
2150
|
-
|
|
2151
|
-
|
|
2178
|
+
# Use pre-loaded original geometries (loaded at wrapper start to avoid reloading)
|
|
2179
|
+
if gdf_original_geoms is None:
|
|
2180
|
+
logger.warning("Original geometries not pre-loaded, loading now...")
|
|
2181
|
+
gdf_original_geoms = _load_geojson_silently(input_geojson_filepath)
|
|
2152
2182
|
|
|
2153
2183
|
# Use plotId from df_validated to maintain mapping
|
|
2154
2184
|
df_original_geom = pd.DataFrame(
|
|
2155
2185
|
{
|
|
2156
|
-
"plotId": df_validated["plotId"].values[: len(
|
|
2157
|
-
"geo_original":
|
|
2186
|
+
"plotId": df_validated["plotId"].values[: len(gdf_original_geoms)],
|
|
2187
|
+
"geo_original": gdf_original_geoms["geometry"].apply(
|
|
2158
2188
|
lambda g: json.dumps(mapping(g)) if g is not None else None
|
|
2159
2189
|
),
|
|
2160
|
-
"geometry_type_original": gdf_original["geometry"].geom_type.values,
|
|
2161
2190
|
}
|
|
2162
2191
|
)
|
|
2163
2192
|
|
|
2164
2193
|
# Merge original geometries back
|
|
2165
2194
|
df_validated = df_validated.merge(df_original_geom, on="plotId", how="left")
|
|
2166
2195
|
|
|
2167
|
-
# Extract geometry type from processed 'geo' column if it exists
|
|
2168
|
-
# Note: 'geo' column may not exist after validation removes extra columns
|
|
2169
|
-
if "geo" in df_validated.columns:
|
|
2170
|
-
# Use geo column from validated dataframe
|
|
2171
|
-
def extract_geom_type(x):
|
|
2172
|
-
try:
|
|
2173
|
-
if isinstance(x, dict):
|
|
2174
|
-
return x.get("type")
|
|
2175
|
-
elif isinstance(x, str):
|
|
2176
|
-
# Handle both JSON strings and Python dict string representations
|
|
2177
|
-
try:
|
|
2178
|
-
parsed = json.loads(x)
|
|
2179
|
-
except:
|
|
2180
|
-
# Try ast.literal_eval for Python dict representations
|
|
2181
|
-
import ast
|
|
2182
|
-
|
|
2183
|
-
parsed = ast.literal_eval(x)
|
|
2184
|
-
return (
|
|
2185
|
-
parsed.get("type") if isinstance(parsed, dict) else None
|
|
2186
|
-
)
|
|
2187
|
-
except:
|
|
2188
|
-
pass
|
|
2189
|
-
return None
|
|
2190
|
-
|
|
2191
|
-
df_validated["geometry_type"] = df_validated["geo"].apply(
|
|
2192
|
-
extract_geom_type
|
|
2193
|
-
)
|
|
2194
|
-
else:
|
|
2195
|
-
# If geo doesn't exist, just use the original type
|
|
2196
|
-
df_validated["geometry_type"] = df_validated["geometry_type_original"]
|
|
2197
|
-
|
|
2198
|
-
# Flag if geometry changed
|
|
2199
|
-
df_validated["geometry_type_changed"] = (
|
|
2200
|
-
df_validated["geometry_type_original"] != df_validated["geometry_type"]
|
|
2201
|
-
)
|
|
2202
|
-
|
|
2203
|
-
# Classify the geometry type transition
|
|
2204
|
-
def classify_transition(orig, proc):
|
|
2205
|
-
if orig == proc:
|
|
2206
|
-
return "no_change"
|
|
2207
|
-
elif proc == "LineString":
|
|
2208
|
-
return f"{orig}_simplified_to_linestring"
|
|
2209
|
-
elif proc == "Point":
|
|
2210
|
-
return f"{orig}_simplified_to_point"
|
|
2211
|
-
else:
|
|
2212
|
-
return f"{orig}_to_{proc}"
|
|
2213
|
-
|
|
2214
|
-
df_validated["geometry_type_transition"] = df_validated.apply(
|
|
2215
|
-
lambda row: classify_transition(
|
|
2216
|
-
row["geometry_type_original"], row["geometry_type"]
|
|
2217
|
-
),
|
|
2218
|
-
axis=1,
|
|
2219
|
-
)
|
|
2220
|
-
|
|
2221
2196
|
# Store processing metadata
|
|
2222
2197
|
df_validated.attrs["processing_metadata"] = {
|
|
2223
|
-
"whisp_version": "
|
|
2198
|
+
"whisp_version": "3.0.0a1",
|
|
2224
2199
|
"processing_date": datetime.now().isoformat(),
|
|
2225
2200
|
"processing_mode": "sequential",
|
|
2226
2201
|
"ee_endpoint": "standard",
|
|
2227
2202
|
"datasets_used": national_codes or [],
|
|
2228
|
-
"
|
|
2203
|
+
"geometry_audit_trail": True,
|
|
2229
2204
|
}
|
|
2230
2205
|
|
|
2231
|
-
logger.info(
|
|
2232
|
-
f"Audit trail added: {df_validated['geometry_type_changed'].sum()} geometries with type changes"
|
|
2233
|
-
)
|
|
2206
|
+
logger.info(f"Audit trail added: geo_original column")
|
|
2234
2207
|
|
|
2235
2208
|
except Exception as e:
|
|
2236
2209
|
logger.warning(f"Error adding audit trail: {e}")
|
|
@@ -2265,7 +2238,7 @@ def whisp_formatted_stats_geojson_to_df_fast(
|
|
|
2265
2238
|
unit_type: str = "ha",
|
|
2266
2239
|
whisp_image: ee.Image = None,
|
|
2267
2240
|
custom_bands: Dict[str, Any] = None,
|
|
2268
|
-
mode: str = "
|
|
2241
|
+
mode: str = "sequential",
|
|
2269
2242
|
# Concurrent-specific parameters
|
|
2270
2243
|
batch_size: int = 10,
|
|
2271
2244
|
max_concurrent: int = 20,
|
|
@@ -2278,15 +2251,15 @@ def whisp_formatted_stats_geojson_to_df_fast(
|
|
|
2278
2251
|
convert_water_flag: bool = True,
|
|
2279
2252
|
water_flag_threshold: float = 0.5,
|
|
2280
2253
|
sort_column: str = "plotId",
|
|
2281
|
-
|
|
2254
|
+
geometry_audit_trail: bool = False,
|
|
2282
2255
|
) -> pd.DataFrame:
|
|
2283
2256
|
"""
|
|
2284
2257
|
Process GeoJSON to Whisp statistics with optimized fast processing.
|
|
2285
2258
|
|
|
2286
|
-
|
|
2287
|
-
|
|
2259
|
+
Routes to concurrent (high-volume endpoint) or sequential (standard endpoint)
|
|
2260
|
+
based on explicit mode selection.
|
|
2288
2261
|
|
|
2289
|
-
This is the recommended entry point for most users
|
|
2262
|
+
This is the recommended entry point for most users.
|
|
2290
2263
|
|
|
2291
2264
|
Parameters
|
|
2292
2265
|
----------
|
|
@@ -2306,12 +2279,8 @@ def whisp_formatted_stats_geojson_to_df_fast(
|
|
|
2306
2279
|
Custom band information
|
|
2307
2280
|
mode : str
|
|
2308
2281
|
Processing mode:
|
|
2309
|
-
- "
|
|
2310
|
-
|
|
2311
|
-
* 1-5MB: sequential
|
|
2312
|
-
* >5MB: concurrent
|
|
2313
|
-
- "concurrent": Force high-volume endpoint (batch processing)
|
|
2314
|
-
- "sequential": Force standard endpoint (single-threaded)
|
|
2282
|
+
- "concurrent": Uses high-volume endpoint with batch processing
|
|
2283
|
+
- "sequential": Uses standard endpoint for sequential processing
|
|
2315
2284
|
batch_size : int
|
|
2316
2285
|
Features per batch (only for concurrent mode)
|
|
2317
2286
|
max_concurrent : int
|
|
@@ -2332,6 +2301,8 @@ def whisp_formatted_stats_geojson_to_df_fast(
|
|
|
2332
2301
|
Water flag ratio threshold
|
|
2333
2302
|
sort_column : str
|
|
2334
2303
|
Column to sort by
|
|
2304
|
+
geometry_audit_trail : bool
|
|
2305
|
+
If True, include the original input geometry as a geo_original column (stored as GeoJSON)
|
|
2335
2306
|
|
|
2336
2307
|
Returns
|
|
2337
2308
|
-------
|
|
@@ -2340,16 +2311,13 @@ def whisp_formatted_stats_geojson_to_df_fast(
|
|
|
2340
2311
|
|
|
2341
2312
|
Examples
|
|
2342
2313
|
--------
|
|
2343
|
-
>>> #
|
|
2344
|
-
>>> df = whisp_formatted_stats_geojson_to_df_fast("data.geojson")
|
|
2345
|
-
|
|
2346
|
-
>>> # Force concurrent processing for large datasets
|
|
2314
|
+
>>> # Use concurrent processing (recommended for most datasets)
|
|
2347
2315
|
>>> df = whisp_formatted_stats_geojson_to_df_fast(
|
|
2348
|
-
... "
|
|
2316
|
+
... "data.geojson",
|
|
2349
2317
|
... mode="concurrent"
|
|
2350
2318
|
... )
|
|
2351
2319
|
|
|
2352
|
-
>>> # Use sequential for
|
|
2320
|
+
>>> # Use sequential processing for more stable results
|
|
2353
2321
|
>>> df = whisp_formatted_stats_geojson_to_df_fast(
|
|
2354
2322
|
... "data.geojson",
|
|
2355
2323
|
... mode="sequential"
|
|
@@ -2357,35 +2325,16 @@ def whisp_formatted_stats_geojson_to_df_fast(
|
|
|
2357
2325
|
"""
|
|
2358
2326
|
logger = logging.getLogger("whisp")
|
|
2359
2327
|
|
|
2360
|
-
#
|
|
2361
|
-
if mode
|
|
2362
|
-
try:
|
|
2363
|
-
file_size = Path(input_geojson_filepath).stat().st_size
|
|
2364
|
-
if file_size > 5_000_000: # >5MB
|
|
2365
|
-
chosen_mode = "concurrent"
|
|
2366
|
-
logger.info(
|
|
2367
|
-
f"File size {file_size/1e6:.1f}MB → Using concurrent (high-volume endpoint)"
|
|
2368
|
-
)
|
|
2369
|
-
else: # <=5MB
|
|
2370
|
-
chosen_mode = "sequential"
|
|
2371
|
-
logger.info(
|
|
2372
|
-
f"File size {file_size/1e6:.1f}MB → Using sequential (standard endpoint)"
|
|
2373
|
-
)
|
|
2374
|
-
except Exception as e:
|
|
2375
|
-
logger.warning(
|
|
2376
|
-
f"Could not determine file size: {e}. Defaulting to sequential."
|
|
2377
|
-
)
|
|
2378
|
-
chosen_mode = "sequential"
|
|
2379
|
-
elif mode in ("concurrent", "sequential"):
|
|
2380
|
-
chosen_mode = mode
|
|
2381
|
-
logger.info(f"Mode explicitly set to: {mode}")
|
|
2382
|
-
else:
|
|
2328
|
+
# Validate mode parameter
|
|
2329
|
+
if mode not in ("concurrent", "sequential"):
|
|
2383
2330
|
raise ValueError(
|
|
2384
|
-
f"Invalid mode '{mode}'. Must be '
|
|
2331
|
+
f"Invalid mode '{mode}'. Must be 'concurrent' or 'sequential'."
|
|
2385
2332
|
)
|
|
2386
2333
|
|
|
2334
|
+
logger.info(f"Mode: {mode}")
|
|
2335
|
+
|
|
2387
2336
|
# Route to appropriate function
|
|
2388
|
-
if
|
|
2337
|
+
if mode == "concurrent":
|
|
2389
2338
|
logger.debug("Routing to concurrent processing...")
|
|
2390
2339
|
return whisp_formatted_stats_geojson_to_df_concurrent(
|
|
2391
2340
|
input_geojson_filepath=input_geojson_filepath,
|
|
@@ -2406,7 +2355,7 @@ def whisp_formatted_stats_geojson_to_df_fast(
|
|
|
2406
2355
|
convert_water_flag=convert_water_flag,
|
|
2407
2356
|
water_flag_threshold=water_flag_threshold,
|
|
2408
2357
|
sort_column=sort_column,
|
|
2409
|
-
|
|
2358
|
+
geometry_audit_trail=geometry_audit_trail,
|
|
2410
2359
|
)
|
|
2411
2360
|
else: # sequential
|
|
2412
2361
|
logger.debug("Routing to sequential processing...")
|
|
@@ -2424,5 +2373,5 @@ def whisp_formatted_stats_geojson_to_df_fast(
|
|
|
2424
2373
|
convert_water_flag=convert_water_flag,
|
|
2425
2374
|
water_flag_threshold=water_flag_threshold,
|
|
2426
2375
|
sort_column=sort_column,
|
|
2427
|
-
|
|
2376
|
+
geometry_audit_trail=geometry_audit_trail,
|
|
2428
2377
|
)
|