openforis-whisp 3.0.0a1__py3-none-any.whl → 3.0.0a3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- openforis_whisp/__init__.py +7 -7
- openforis_whisp/advanced_stats.py +400 -93
- openforis_whisp/data_checks.py +178 -15
- openforis_whisp/data_conversion.py +154 -59
- openforis_whisp/reformat.py +2 -29
- openforis_whisp/stats.py +15 -45
- openforis_whisp/utils.py +449 -80
- {openforis_whisp-3.0.0a1.dist-info → openforis_whisp-3.0.0a3.dist-info}/METADATA +1 -1
- {openforis_whisp-3.0.0a1.dist-info → openforis_whisp-3.0.0a3.dist-info}/RECORD +11 -11
- {openforis_whisp-3.0.0a1.dist-info → openforis_whisp-3.0.0a3.dist-info}/LICENSE +0 -0
- {openforis_whisp-3.0.0a1.dist-info → openforis_whisp-3.0.0a3.dist-info}/WHEEL +0 -0
@@ -32,7 +32,7 @@ import os
 import subprocess
 from contextlib import redirect_stdout, contextmanager
 from pathlib import Path
-from typing import Optional, List, Dict, Any, Tuple
+from typing import Optional, List, Dict, Any, Tuple, Union
 from concurrent.futures import ThreadPoolExecutor, as_completed
 import tempfile
 
@@ -203,6 +203,57 @@ def _extract_decimal_places(format_string: str) -> int:
     return 2  # Default to 2 decimal places
 
 
+def _normalize_keep_external_columns(
+    keep_external_columns: Union[bool, List[str]],
+    all_columns: List[str],
+    plot_id_column: str = "plotId",
+) -> List[str]:
+    """
+    Normalize keep_external_columns parameter to a list of column names.
+
+    Converts flexible user input (bool or list) to a concrete list of columns to keep.
+
+    Parameters
+    ----------
+    keep_external_columns : bool or List[str]
+        - False: keep nothing (return empty list)
+        - True: keep all columns except geometry and plot_id
+        - List[str]: keep specific columns (return as-is)
+    all_columns : List[str]
+        All available columns to choose from
+    plot_id_column : str
+        Name of plot ID column to exclude
+
+    Returns
+    -------
+    List[str]
+        Columns to keep from external (GeoJSON) data
+
+    Examples
+    --------
+    >>> cols = _normalize_keep_external_columns(False, ["id", "Country", "geom"], "id")
+    >>> cols
+    []
+
+    >>> cols = _normalize_keep_external_columns(True, ["id", "Country", "geom"], "id")
+    >>> cols
+    ['Country']
+
+    >>> cols = _normalize_keep_external_columns(["Country"], ["id", "Country", "geom"], "id")
+    >>> cols
+    ['Country']
+    """
+    if keep_external_columns is True:
+        # Keep all columns except geometry and plot_id
+        return [c for c in all_columns if c not in [plot_id_column, "geometry"]]
+    elif keep_external_columns is False:
+        # Keep nothing
+        return []
+    else:
+        # Use provided list (handle None case)
+        return keep_external_columns or []
+
+
 def _add_admin_context(
     df: pd.DataFrame, admin_code_col: str = "admin_code_median", debug: bool = False
 ) -> pd.DataFrame:
@@ -226,7 +277,7 @@ def _add_admin_context(
     pd.DataFrame
         DataFrame with added Country, ProducerCountry, Admin_Level_1 columns
     """
-    logger = logging.getLogger("whisp
+    logger = logging.getLogger("whisp")
 
     # Return early if admin code column doesn't exist
     if admin_code_col not in df.columns:
@@ -347,7 +398,7 @@ def join_admin_codes(
     pd.DataFrame
         DataFrame with added Country, ProducerCountry, Admin_Level_1 columns
     """
-    logger = logging.getLogger("whisp
+    logger = logging.getLogger("whisp")
 
     # Return early if admin code column doesn't exist
     if id_col not in df.columns:
@@ -408,8 +459,9 @@ class ProgressTracker:
     """
     Track batch processing progress with time estimation.
 
-    Shows progress at
-    time remaining based on
+    Shows progress at adaptive milestones (more frequent for small datasets,
+    less frequent for large datasets) with estimated time remaining based on
+    processing speed.
     """
 
     def __init__(self, total: int, logger: logging.Logger = None):
@@ -426,8 +478,19 @@ class ProgressTracker:
         self.total = total
         self.completed = 0
         self.lock = threading.Lock()
-        self.logger = logger or logging.getLogger("whisp
-
+        self.logger = logger or logging.getLogger("whisp")
+
+        # Adaptive milestones based on dataset size
+        # Small datasets (< 50): show every 25% (not too spammy)
+        # Medium (50-500): show every 20%
+        # Large (500+): show every 10% (more frequent feedback on long runs)
+        if total < 50:
+            self.milestones = {25, 50, 75, 100}
+        elif total < 500:
+            self.milestones = {20, 40, 60, 80, 100}
+        else:
+            self.milestones = {10, 20, 30, 40, 50, 60, 70, 80, 90, 100}
+
         self.shown_milestones = set()
         self.start_time = time.time()
         self.last_update_time = self.start_time
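For reference, milestone gating of this kind reduces to checking which thresholds have been newly crossed since the last update. A minimal standalone sketch (the helper below is illustrative, not the package's actual method):

```python
# Illustrative sketch of percent-milestone gating (not the package's code).
def crossed_milestones(completed: int, total: int, milestones: set, shown: set) -> list:
    percent = int(completed / total * 100)
    due = sorted(m for m in milestones if m <= percent and m not in shown)
    shown.update(due)  # mirrors the role of ProgressTracker.shown_milestones
    return due

shown = set()
small_dataset_milestones = {25, 50, 75, 100}  # the "< 50 features" tier above
for done in (3, 5, 8, 10):
    for m in crossed_milestones(done, 10, small_dataset_milestones, shown):
        print(f"{m}% complete ({done}/10 batches)")
```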
@@ -537,16 +600,22 @@ def validate_ee_endpoint(endpoint_type: str = "high-volume", raise_error: bool =
         If incorrect endpoint and raise_error=True
     """
     if not check_ee_endpoint(endpoint_type):
-        msg = (
-            f"Not using {endpoint_type.upper()} endpoint.\n"
-            f"Current URL: {ee.data._cloud_api_base_url}\n"
-            f"\nTo use {endpoint_type} endpoint, run:\n"
-        )
-        msg += "ee.Reset()\n"
         if endpoint_type == "high-volume":
-            msg
-
-
+            msg = (
+                "Concurrent mode requires the HIGH-VOLUME endpoint. To change endpoint run:\n"
+                "ee.Reset()\n"
+                "ee.Initialize(opt_url='https://earthengine-highvolume.googleapis.com')\n"
+                "Or with project specified (e.g. when in Colab):\n"
+                "ee.Initialize(project='your_cloud_project_name', opt_url='https://earthengine-highvolume.googleapis.com')"
+            )
+        else:  # standard endpoint
+            msg = (
+                "Sequential mode requires the STANDARD endpoint. To change endpoint run:\n"
+                "ee.Reset()\n"
+                "ee.Initialize()\n"
+                "Or with project specified (e.g. when in Colab):\n"
+                "ee.Initialize(project='your_cloud_project_name')"
+            )
 
     if raise_error:
         raise RuntimeError(msg)
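The remediation commands embedded in these messages can be run verbatim before choosing a processing mode. For example, for concurrent mode (the project name is a placeholder):

```python
import ee

# Switch to the high-volume endpoint, as the error message instructs.
ee.Reset()
ee.Initialize(
    project="your_cloud_project_name",  # placeholder
    opt_url="https://earthengine-highvolume.googleapis.com",
)
```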
@@ -713,8 +782,8 @@ def convert_batch_to_ee(batch_gdf: gpd.GeoDataFrame) -> ee.FeatureCollection:
     """
     Convert a batch GeoDataFrame to EE FeatureCollection efficiently.
 
-    OPTIMIZATION:
-    This
+    OPTIMIZATION: Passes GeoDataFrame directly to convert_geojson_to_ee to preserve CRS.
+    This ensures proper coordinate system handling and reprojection to WGS84 if needed.
 
     Preserves the __row_id__ column if present so it can be retrieved after processing.
 
@@ -728,10 +797,13 @@ def convert_batch_to_ee(batch_gdf: gpd.GeoDataFrame) -> ee.FeatureCollection:
     ee.FeatureCollection
         EE FeatureCollection with __row_id__ as a feature property
     """
-    #
-    #
-
-
+    # Pass GeoDataFrame directly to preserve CRS metadata
+    # convert_geojson_to_ee will handle:
+    # - CRS detection and conversion to WGS84 if needed
+    # - Data type sanitization (datetime, object columns)
+    # - Geometry validation and Z-coordinate stripping
+
+    fc = convert_geojson_to_ee(batch_gdf, enforce_wgs84=True, strip_z_coords=True)
 
     # If __row_id__ is in the original GeoDataFrame, it will be preserved
     # as a feature property in the GeoJSON and thus in the EE FeatureCollection
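For context, the WGS84 enforcement described above follows the usual geopandas pattern. A sketch of that general idiom (not the package's internal implementation):

```python
import geopandas as gpd

def ensure_wgs84(gdf: gpd.GeoDataFrame) -> gpd.GeoDataFrame:
    # Reproject only when a CRS is set and differs from EPSG:4326;
    # a GeoDataFrame without a CRS is assumed to already be in WGS84 here.
    if gdf.crs is not None and gdf.crs.to_epsg() != 4326:
        return gdf.to_crs(epsg=4326)
    return gdf
```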
@@ -740,8 +812,8 @@ def convert_batch_to_ee(batch_gdf: gpd.GeoDataFrame) -> ee.FeatureCollection:
 
 def clean_geodataframe(
     gdf: gpd.GeoDataFrame,
-    remove_nulls: bool =
-
+    remove_nulls: bool = False,
+    repair_geometries: bool = False,
     logger: logging.Logger = None,
 ) -> gpd.GeoDataFrame:
     """
@@ -752,9 +824,11 @@ def clean_geodataframe(
     gdf : gpd.GeoDataFrame
         Input GeoDataFrame
     remove_nulls : bool
-        Remove null geometries
-
-
+        Remove null geometries. Defaults to False to preserve data integrity.
+        Set to True only if you explicitly want to drop rows with null geometries.
+    repair_geometries : bool
+        Repair invalid geometries using Shapely's make_valid(). Defaults to False to preserve
+        original geometries. Set to True only if you want to automatically repair invalid geometries.
     logger : logging.Logger, optional
         Logger for output
 
@@ -763,7 +837,7 @@ def clean_geodataframe(
     gpd.GeoDataFrame
         Cleaned GeoDataFrame
     """
-    logger = logger or logging.getLogger("whisp
+    logger = logger or logging.getLogger("whisp")
 
     if remove_nulls:
         null_count = gdf.geometry.isna().sum()
@@ -771,11 +845,11 @@ def clean_geodataframe(
             logger.warning(f"Removing {null_count} null geometries")
             gdf = gdf[~gdf.geometry.isna()].copy()
 
-    if
+    if repair_geometries:
         valid_count = gdf.geometry.is_valid.sum()
         invalid_count = len(gdf) - valid_count
         if invalid_count > 0:
-            logger.warning(f"
+            logger.warning(f"Repairing {invalid_count} invalid geometries")
             from shapely.validation import make_valid
 
             gdf = gdf.copy()
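The repair path above boils down to Shapely's make_valid applied to the invalid subset. A simplified standalone sketch:

```python
import geopandas as gpd
from shapely.validation import make_valid

def repair_invalid(gdf: gpd.GeoDataFrame) -> gpd.GeoDataFrame:
    # Replace only invalid geometries; valid ones are left untouched.
    invalid = ~gdf.geometry.is_valid
    if invalid.any():
        gdf = gdf.copy()
        gdf.loc[invalid, "geometry"] = gdf.loc[invalid, "geometry"].apply(make_valid)
    return gdf
```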
@@ -787,6 +861,19 @@ def clean_geodataframe(
     return gdf
 
 
+# ============================================================================
+# BATCH RETRY HELPER
+# ============================================================================
+
+
+# ============================================================================
+# BATCH RETRY HELPER - DEPRECATED (removed due to semaphore deadlock issues)
+# ============================================================================
+# Note: Retry logic via sub-batching has been removed. Instead, use fail-fast
+# approach: when a batch fails, reduce batch_size parameter and retry manually.
+# This avoids semaphore deadlocks and provides clearer error messages.
+
+
 # ============================================================================
 # EE PROCESSING WITH RETRY LOGIC
 # ============================================================================
@@ -828,7 +915,7 @@ def process_ee_batch(
     RuntimeError
         If processing fails after all retries
     """
-    logger = logger or logging.getLogger("whisp
+    logger = logger or logging.getLogger("whisp")
 
     for attempt in range(max_retries):
         try:
@@ -955,7 +1042,7 @@ def whisp_stats_geojson_to_df_concurrent(
     """
     from openforis_whisp.reformat import format_stats_dataframe
 
-    logger = logger or logging.getLogger("whisp
+    logger = logger or logging.getLogger("whisp")
 
     # Suppress verbose output from dependencies (dynamically adjust based on max_concurrent)
     _suppress_verbose_output(max_concurrent=max_concurrent)
@@ -973,11 +1060,23 @@ def whisp_stats_geojson_to_df_concurrent(
     logger.info(f"Loaded {len(gdf):,} features")
 
     if validate_geometries:
-        gdf = clean_geodataframe(
+        gdf = clean_geodataframe(
+            gdf, remove_nulls=False, repair_geometries=False, logger=logger
+        )
 
     # Add stable plotIds for merging (starting from 1, not 0)
     gdf[plot_id_column] = range(1, len(gdf) + 1)
 
+    # Strip unnecessary properties before sending to EE
+    # Keep only: geometry, plot_id_column, and external_id_column
+    # This prevents duplication of GeoJSON properties in EE results
+    keep_cols = ["geometry", plot_id_column]
+    if external_id_column and external_id_column in gdf.columns:
+        keep_cols.append(external_id_column)
+
+    gdf_for_ee = gdf[keep_cols].copy()
+    logger.debug(f"Stripped GeoJSON to essential columns: {keep_cols}")
+
     # Create image if not provided
     if whisp_image is None:
         logger.debug("Creating Whisp image...")
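The effect of this stripping step on a toy table (sketch; the extra columns are invented for illustration):

```python
import geopandas as gpd
from shapely.geometry import Point

gdf = gpd.GeoDataFrame(
    {"plotId": [1, 2], "farmer": ["a", "b"], "notes": ["x", "y"]},
    geometry=[Point(0, 0), Point(1, 1)],
    crs="EPSG:4326",
)
keep_cols = ["geometry", "plotId"]  # no external_id_column in this toy case
gdf_for_ee = gdf[keep_cols].copy()
print(list(gdf_for_ee.columns))  # ['geometry', 'plotId']; 'farmer'/'notes' dropped
```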
@@ -1001,8 +1100,8 @@ def whisp_stats_geojson_to_df_concurrent(
     reducer = ee.Reducer.sum().combine(ee.Reducer.median(), sharedInputs=True)
 
     # Batch the data
-    batches = batch_geodataframe(
-    logger.info(f"Processing {len(
+    batches = batch_geodataframe(gdf_for_ee, batch_size)
+    logger.info(f"Processing {len(gdf_for_ee):,} features in {len(batches)} batches")
 
     # Setup semaphore for EE concurrency control
     ee_semaphore = threading.BoundedSemaphore(max_concurrent)
@@ -1056,7 +1155,12 @@ def whisp_stats_geojson_to_df_concurrent(
             for i, batch in enumerate(batches)
         }
 
+        # Track which batches failed for retry
+        batch_map = {i: batch for i, batch in enumerate(batches)}
+        batch_futures = {future: i for future, i in futures.items()}
+
         for future in as_completed(futures):
+            batch_idx = batch_futures[future]
             try:
                 batch_idx, df_server, df_client = future.result()
@@ -1064,8 +1168,35 @@ def whisp_stats_geojson_to_df_concurrent(
                 if plot_id_column not in df_server.columns:
                     df_server[plot_id_column] = range(len(df_server))
 
-
-
+                # Keep all EE statistics from server (all columns with _sum and _median suffixes)
+                # These are the actual EE processing results
+                df_server_clean = df_server.copy()
+
+                # Keep external metadata: plot_id, external_id, geometry, geometry type, and centroids from client
+                # (formatted wrapper handles keep_external_columns parameter)
+                keep_external_columns = [plot_id_column]
+                if (
+                    external_id_column
+                    and external_id_column in df_client.columns
+                ):
+                    keep_external_columns.append(external_id_column)
+                if "geometry" in df_client.columns:
+                    keep_external_columns.append("geometry")
+                # Keep geometry type column (Geometry_type)
+                if geometry_type_column in df_client.columns:
+                    keep_external_columns.append(geometry_type_column)
+                # Also keep centroid columns (Centroid_lon, Centroid_lat)
+                centroid_cols = [
+                    c for c in df_client.columns if c.startswith("Centroid_")
+                ]
+                keep_external_columns.extend(centroid_cols)
+
+                df_client_clean = df_client[
+                    [c for c in keep_external_columns if c in df_client.columns]
+                ].drop_duplicates()
+
+                merged = df_server_clean.merge(
+                    df_client_clean,
                     on=plot_id_column,
                     how="left",
                     suffixes=("_ee", "_client"),
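A toy illustration of the server/client merge pattern used here (column names are invented for shape; the suffixes only apply when non-key columns collide):

```python
import pandas as pd

df_server_clean = pd.DataFrame({"plotId": [1, 2], "Area_sum": [4.2, 7.9]})
df_client_clean = pd.DataFrame({"plotId": [1, 2], "Centroid_lon": [12.5, 13.1]})

merged = df_server_clean.merge(
    df_client_clean, on="plotId", how="left", suffixes=("_ee", "_client")
)
print(merged.columns.tolist())  # ['plotId', 'Area_sum', 'Centroid_lon']
```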
@@ -1074,12 +1205,16 @@ def whisp_stats_geojson_to_df_concurrent(
                 progress.update()
 
             except Exception as e:
+                # Batch failed - fail fast with clear guidance
                 error_msg = str(e)
-                logger.error(f"Batch
-
+                logger.error(f"Batch {batch_idx} failed: {error_msg[:100]}")
+                logger.debug(f"Full error: {error_msg}")
+
+                # Get original batch for error reporting
+                original_batch = batch_map[batch_idx]
 
-
-                batch_errors.append(error_msg)
+                # Add to batch errors for final reporting
+                batch_errors.append((batch_idx, original_batch, error_msg))
             finally:
                 # Restore logger levels
                 fiona_logger.setLevel(old_fiona_level)
@@ -1087,8 +1222,60 @@ def whisp_stats_geojson_to_df_concurrent(
 
     progress.finish()
 
-    #
-    if batch_errors
+    # If we have batch errors after retry attempts, fail the entire process
+    if batch_errors:
+        total_failed_rows = sum(len(batch) for _, batch, _ in batch_errors)
+        failed_batch_indices = [str(idx) for idx, _, _ in batch_errors]
+
+        # Format detailed error information for debugging
+        error_details_list = []
+        for idx, batch, msg in batch_errors:
+            error_details_list.append(f"  Batch {idx} ({len(batch)} features): {msg}")
+        error_details = "\n".join(error_details_list)
+
+        # Analyze error patterns for debugging hints
+        error_patterns = {
+            "memory": any("memory" in msg.lower() for _, _, msg in batch_errors),
+            "request_size": any(
+                keyword in msg.lower()
+                for _, _, msg in batch_errors
+                for keyword in ["too large", "10mb", "payload", "size limit"]
+            ),
+            "quota": any("quota" in msg.lower() for _, _, msg in batch_errors),
+            "timeout": any("timeout" in msg.lower() for _, _, msg in batch_errors),
+        }
+
+        # Build helpful suggestions based on error patterns
+        suggestions = []
+        if error_patterns["memory"]:
+            suggestions.append(
+                f"  • Reduce batch_size parameter (currently: {batch_size}). Try: batch_size=5 or lower"
+            )
+        if error_patterns["request_size"]:
+            suggestions.append(
+                "  • Request payload too large: reduce batch_size or simplify input geometries"
+            )
+        if error_patterns["quota"]:
+            suggestions.append("  • Earth Engine quota exceeded: wait and retry later")
+        if error_patterns["timeout"]:
+            suggestions.append(
+                "  • Processing timeout: reduce batch_size or simplify input geometries"
+            )
+
+        suggestions_text = (
+            "\nDebugging hints:\n" + "\n".join(suggestions) if suggestions else ""
+        )
+
+        raise RuntimeError(
+            f"Failed to process {len(batch_errors)} batch(es):\n"
+            f"\n{error_details}\n"
+            f"\nTotal rows affected: {total_failed_rows}\n"
+            f"{suggestions_text}\n"
+            f"Please reduce batch_size and try again."
+        )
+
+    # Check if we should retry with validation due to band errors (legacy band error handling)
+    if not results:
         # All batches failed - likely a bad band issue
         is_band_error = any(
             keyword in str(batch_errors)
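Given this fail-fast design, caller-side recovery amounts to retrying with a smaller batch_size, as the error text suggests. A sketch (argument names are assumed to match this module's API):

```python
# Sketch only: retry with progressively smaller batches on failure.
for size in (10, 5, 2):
    try:
        df = whisp_stats_geojson_to_df_concurrent(
            input_geojson_filepath="data.geojson", batch_size=size
        )
        break
    except RuntimeError as err:
        print(f"batch_size={size} failed: {err}")
else:
    raise SystemExit("All batch sizes failed; simplify input geometries.")
```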
@@ -1442,7 +1629,7 @@ def whisp_stats_geojson_to_df_sequential(
     """
     from openforis_whisp.reformat import format_stats_dataframe
 
-    logger = logger or logging.getLogger("whisp
+    logger = logger or logging.getLogger("whisp")
 
     # Suppress verbose output from dependencies (sequential has lower concurrency, use default)
     _suppress_verbose_output(max_concurrent=1)
@@ -1459,8 +1646,10 @@ def whisp_stats_geojson_to_df_sequential(
     gdf = _load_geojson_silently(input_geojson_filepath)
     logger.info(f"Loaded {len(gdf):,} features")
 
-    # Clean geometries
-    gdf = clean_geodataframe(
+    # Clean geometries (preserve both null and invalid geometries by default)
+    gdf = clean_geodataframe(
+        gdf, remove_nulls=False, repair_geometries=False, logger=logger
+    )
 
     # Add stable plotIds for merging (starting from 1, not 0)
     gdf[plot_id_column] = range(1, len(gdf) + 1)
@@ -1469,6 +1658,16 @@ def whisp_stats_geojson_to_df_sequential(
     row_id_col = "__row_id__"
     gdf[row_id_col] = range(len(gdf))
 
+    # Strip unnecessary properties before sending to EE
+    # Keep only: geometry, plot_id_column, and external_id_column
+    # This prevents duplication of GeoJSON properties in EE results
+    keep_cols = ["geometry", plot_id_column, row_id_col]
+    if external_id_column and external_id_column in gdf.columns:
+        keep_cols.append(external_id_column)
+
+    gdf_for_ee = gdf[keep_cols].copy()
+    logger.debug(f"Stripped GeoJSON to essential columns: {keep_cols}")
+
     # Create image if not provided
     if whisp_image is None:
         logger.debug("Creating Whisp image...")
@@ -1491,7 +1690,7 @@ def whisp_stats_geojson_to_df_sequential(
     # Convert to EE (suppress print statements from convert_geojson_to_ee)
     logger.debug("Converting to EE FeatureCollection...")
     with redirect_stdout(io.StringIO()):
-        fc = convert_geojson_to_ee(
+        fc = convert_geojson_to_ee(gdf_for_ee, enforce_wgs84=True, strip_z_coords=True)
 
     # Create reducer
     reducer = ee.Reducer.sum().combine(ee.Reducer.median(), sharedInputs=True)
@@ -1633,6 +1832,7 @@ def whisp_formatted_stats_geojson_to_df_concurrent(
     convert_water_flag: bool = True,
     water_flag_threshold: float = 0.5,
     sort_column: str = "plotId",
+    geometry_audit_trail: bool = False,
 ) -> pd.DataFrame:
     """
     Process GeoJSON concurrently with automatic formatting and validation.
@@ -1683,15 +1883,22 @@ def whisp_formatted_stats_geojson_to_df_concurrent(
         Water flag ratio threshold (default 0.5)
     sort_column : str
         Column to sort by (default "plotId", None to skip)
+    geometry_audit_trail : bool, default False
+        If True, includes original input geometry column:
+        - geo_original: Original input geometry (before EE processing), stored as GeoJSON
+        Enables geometry traceability for compliance and audit purposes.
 
     Returns
     -------
     pd.DataFrame
-        Validated, formatted results DataFrame
+        Validated, formatted results DataFrame with optional audit trail
     """
     from openforis_whisp.reformat import format_stats_dataframe
+    from datetime import datetime, timezone
+    import json
+    from shapely.geometry import mapping
 
-    logger = logger or logging.getLogger("whisp
+    logger = logger or logging.getLogger("whisp")
 
     # Auto-detect decimal places from config if not provided
     if decimal_places is None:
@@ -1699,6 +1906,12 @@ def whisp_formatted_stats_geojson_to_df_concurrent(
         decimal_places = _extract_decimal_places(stats_area_columns_formatting)
         logger.debug(f"Using decimal_places={decimal_places} from config")
 
+    # Load original geometries once here if needed for audit trail (avoid reloading later)
+    gdf_original_geoms = None
+    if geometry_audit_trail:
+        logger.debug("Pre-loading GeoJSON for geometry audit trail...")
+        gdf_original_geoms = _load_geojson_silently(input_geojson_filepath)
+
     # Step 1: Get raw stats
     logger.debug("Step 1/2: Extracting statistics (concurrent)...")
     df_raw = whisp_stats_geojson_to_df_concurrent(
@@ -1759,6 +1972,57 @@ def whisp_formatted_stats_geojson_to_df_concurrent(
         custom_bands=custom_bands,
     )
 
+    # Step 2c: Add audit trail columns (AFTER validation to preserve columns)
+    if geometry_audit_trail:
+        logger.debug("Adding audit trail columns...")
+        try:
+            # Use pre-loaded original geometries (loaded at wrapper start to avoid reloading)
+            if gdf_original_geoms is None:
+                logger.warning("Original geometries not pre-loaded, loading now...")
+                gdf_original_geoms = _load_geojson_silently(input_geojson_filepath)
+
+            # Use plotId from df_validated to maintain mapping
+            df_original_geom = pd.DataFrame(
+                {
+                    "plotId": df_validated["plotId"].values[: len(gdf_original_geoms)],
+                    "geo_original": gdf_original_geoms["geometry"].apply(
+                        lambda g: json.dumps(mapping(g)) if g is not None else None
+                    ),
+                }
+            )
+
+            # Merge original geometries back
+            df_validated = df_validated.merge(df_original_geom, on="plotId", how="left")
+
+            # Store processing metadata
+            df_validated.attrs["processing_metadata"] = {
+                "whisp_version": "3.0.0a1",
+                "processing_date": datetime.now().isoformat(),
+                "processing_mode": "concurrent",
+                "ee_endpoint": "high_volume",
+                "validate_geometries": validate_geometries,
+                "datasets_used": national_codes or [],
+                "geometry_audit_trail": True,
+            }
+
+            logger.info(f"Audit trail added: geo_original column")
+
+        except Exception as e:
+            logger.warning(f"Error adding audit trail: {e}")
+            # Continue without audit trail if something fails
+
+    # Add processing metadata column using pd.concat to avoid fragmentation warning
+    metadata_dict = {
+        "whisp_version": "3.0.0a1",
+        "processing_timestamp_utc": datetime.now(timezone.utc).strftime(
+            "%Y-%m-%d %H:%M:%S UTC"
+        ),
+    }
+    metadata_series = pd.Series(
+        [metadata_dict] * len(df_validated), name="whisp_processing_metadata"
+    )
+    df_validated = pd.concat([df_validated, metadata_series], axis=1)
+
     logger.info("Concurrent processing + formatting + validation complete")
     return df_validated
 
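Because geo_original stores each input geometry as a GeoJSON string, it round-trips back to Shapely objects. A sketch, assuming a df_validated produced with geometry_audit_trail=True:

```python
import json
from shapely.geometry import shape

# Rehydrate the audit-trail column into Shapely geometries.
geoms = df_validated["geo_original"].apply(
    lambda s: shape(json.loads(s)) if s is not None else None
)
print(geoms.iloc[0].geom_type)  # e.g. "Polygon"
```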
@@ -1779,6 +2043,7 @@ def whisp_formatted_stats_geojson_to_df_sequential(
     convert_water_flag: bool = True,
     water_flag_threshold: float = 0.5,
     sort_column: str = "plotId",
+    geometry_audit_trail: bool = False,
 ) -> pd.DataFrame:
     """
     Process GeoJSON sequentially with automatic formatting and validation.
@@ -1821,15 +2086,22 @@ def whisp_formatted_stats_geojson_to_df_sequential(
         Water flag ratio threshold (default 0.5)
     sort_column : str
         Column to sort by (default "plotId", None to skip)
+    geometry_audit_trail : bool, default False
+        If True, includes original input geometry column:
+        - geo_original: Original input geometry (before EE processing), stored as GeoJSON
+        Enables geometry traceability for compliance and audit purposes.
 
     Returns
     -------
     pd.DataFrame
-        Validated, formatted results DataFrame
+        Validated, formatted results DataFrame with optional audit trail
     """
     from openforis_whisp.reformat import format_stats_dataframe
+    from datetime import datetime, timezone
+    import json
+    from shapely.geometry import mapping
 
-    logger = logger or logging.getLogger("whisp
+    logger = logger or logging.getLogger("whisp")
 
     # Auto-detect decimal places from config if not provided
     if decimal_places is None:
@@ -1837,6 +2109,12 @@ def whisp_formatted_stats_geojson_to_df_sequential(
         decimal_places = _extract_decimal_places(stats_area_columns_formatting)
         logger.debug(f"Using decimal_places={decimal_places} from config")
 
+    # Load original geometries once here if needed for audit trail (avoid reloading later)
+    gdf_original_geoms = None
+    if geometry_audit_trail:
+        logger.debug("Pre-loading GeoJSON for geometry audit trail...")
+        gdf_original_geoms = _load_geojson_silently(input_geojson_filepath)
+
     # Step 1: Get raw stats
     logger.debug("Step 1/2: Extracting statistics (sequential)...")
     df_raw = whisp_stats_geojson_to_df_sequential(
@@ -1893,6 +2171,56 @@ def whisp_formatted_stats_geojson_to_df_sequential(
     custom_bands=custom_bands,
     )
 
+    # Step 2c: Add audit trail columns (AFTER validation to preserve columns)
+    if geometry_audit_trail:
+        logger.debug("Adding audit trail columns...")
+        try:
+            # Use pre-loaded original geometries (loaded at wrapper start to avoid reloading)
+            if gdf_original_geoms is None:
+                logger.warning("Original geometries not pre-loaded, loading now...")
+                gdf_original_geoms = _load_geojson_silently(input_geojson_filepath)
+
+            # Use plotId from df_validated to maintain mapping
+            df_original_geom = pd.DataFrame(
+                {
+                    "plotId": df_validated["plotId"].values[: len(gdf_original_geoms)],
+                    "geo_original": gdf_original_geoms["geometry"].apply(
+                        lambda g: json.dumps(mapping(g)) if g is not None else None
+                    ),
+                }
+            )
+
+            # Merge original geometries back
+            df_validated = df_validated.merge(df_original_geom, on="plotId", how="left")
+
+            # Store processing metadata
+            df_validated.attrs["processing_metadata"] = {
+                "whisp_version": "3.0.0a1",
+                "processing_date": datetime.now().isoformat(),
+                "processing_mode": "sequential",
+                "ee_endpoint": "standard",
+                "datasets_used": national_codes or [],
+                "geometry_audit_trail": True,
+            }
+
+            logger.info(f"Audit trail added: geo_original column")
+
+        except Exception as e:
+            logger.warning(f"Error adding audit trail: {e}")
+            # Continue without audit trail if something fails
+
+    # Add processing metadata column using pd.concat to avoid fragmentation warning
+    metadata_dict = {
+        "whisp_version": "3.0.0a1",
+        "processing_timestamp_utc": datetime.now(timezone.utc).strftime(
+            "%Y-%m-%d %H:%M:%S UTC"
+        ),
+    }
+    metadata_series = pd.Series(
+        [metadata_dict] * len(df_validated), name="whisp_processing_metadata"
+    )
+    df_validated = pd.concat([df_validated, metadata_series], axis=1)
+
     logger.info("Sequential processing + formatting + validation complete")
     return df_validated
 
@@ -1910,7 +2238,7 @@ def whisp_formatted_stats_geojson_to_df_fast(
     unit_type: str = "ha",
     whisp_image: ee.Image = None,
     custom_bands: Dict[str, Any] = None,
-    mode: str = "
+    mode: str = "sequential",
     # Concurrent-specific parameters
     batch_size: int = 10,
     max_concurrent: int = 20,
@@ -1923,14 +2251,15 @@ def whisp_formatted_stats_geojson_to_df_fast(
     convert_water_flag: bool = True,
     water_flag_threshold: float = 0.5,
     sort_column: str = "plotId",
+    geometry_audit_trail: bool = False,
 ) -> pd.DataFrame:
     """
     Process GeoJSON to Whisp statistics with optimized fast processing.
 
-
-
+    Routes to concurrent (high-volume endpoint) or sequential (standard endpoint)
+    based on explicit mode selection.
 
-    This is the recommended entry point for most users
+    This is the recommended entry point for most users.
 
     Parameters
     ----------
@@ -1950,12 +2279,8 @@ def whisp_formatted_stats_geojson_to_df_fast(
         Custom band information
     mode : str
         Processing mode:
-        - "
-
-          * 1-5MB: sequential
-          * >5MB: concurrent
-        - "concurrent": Force high-volume endpoint (batch processing)
-        - "sequential": Force standard endpoint (single-threaded)
+        - "concurrent": Uses high-volume endpoint with batch processing
+        - "sequential": Uses standard endpoint for sequential processing
     batch_size : int
         Features per batch (only for concurrent mode)
     max_concurrent : int
@@ -1976,6 +2301,8 @@ def whisp_formatted_stats_geojson_to_df_fast(
         Water flag ratio threshold
     sort_column : str
         Column to sort by
+    geometry_audit_trail : bool
+        Include geometry modification audit trail columns
 
     Returns
     -------
@@ -1984,52 +2311,30 @@ def whisp_formatted_stats_geojson_to_df_fast(
 
     Examples
     --------
-    >>> #
-    >>> df = whisp_formatted_stats_geojson_to_df_fast("data.geojson")
-
-    >>> # Force concurrent processing for large datasets
+    >>> # Use concurrent processing (recommended for most datasets)
     >>> df = whisp_formatted_stats_geojson_to_df_fast(
-    ...     "
+    ...     "data.geojson",
     ...     mode="concurrent"
     ... )
 
-    >>> # Use sequential for
+    >>> # Use sequential processing for more stable results
     >>> df = whisp_formatted_stats_geojson_to_df_fast(
     ...     "data.geojson",
     ...     mode="sequential"
     ... )
     """
-    logger = logging.getLogger("whisp
+    logger = logging.getLogger("whisp")
 
-    #
-    if mode
-        try:
-            file_size = Path(input_geojson_filepath).stat().st_size
-            if file_size > 5_000_000:  # >5MB
-                chosen_mode = "concurrent"
-                logger.info(
-                    f"File size {file_size/1e6:.1f}MB → Using concurrent (high-volume endpoint)"
-                )
-            else:  # <=5MB
-                chosen_mode = "sequential"
-                logger.info(
-                    f"File size {file_size/1e6:.1f}MB → Using sequential (standard endpoint)"
-                )
-        except Exception as e:
-            logger.warning(
-                f"Could not determine file size: {e}. Defaulting to sequential."
-            )
-            chosen_mode = "sequential"
-    elif mode in ("concurrent", "sequential"):
-        chosen_mode = mode
-        logger.info(f"Mode explicitly set to: {mode}")
-    else:
+    # Validate mode parameter
+    if mode not in ("concurrent", "sequential"):
         raise ValueError(
-            f"Invalid mode '{mode}'. Must be '
+            f"Invalid mode '{mode}'. Must be 'concurrent' or 'sequential'."
         )
 
+    logger.info(f"Mode: {mode}")
+
     # Route to appropriate function
-    if
+    if mode == "concurrent":
         logger.debug("Routing to concurrent processing...")
         return whisp_formatted_stats_geojson_to_df_concurrent(
             input_geojson_filepath=input_geojson_filepath,
@@ -2050,6 +2355,7 @@ def whisp_formatted_stats_geojson_to_df_fast(
             convert_water_flag=convert_water_flag,
             water_flag_threshold=water_flag_threshold,
             sort_column=sort_column,
+            geometry_audit_trail=geometry_audit_trail,
         )
     else:  # sequential
         logger.debug("Routing to sequential processing...")
@@ -2067,4 +2373,5 @@ def whisp_formatted_stats_geojson_to_df_fast(
             convert_water_flag=convert_water_flag,
             water_flag_threshold=water_flag_threshold,
             sort_column=sort_column,
+            geometry_audit_trail=geometry_audit_trail,
         )