openforis-whisp 2.0.0b2__py3-none-any.whl → 3.0.0a1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- openforis_whisp/__init__.py +35 -4
- openforis_whisp/advanced_stats.py +2070 -0
- openforis_whisp/data_checks.py +642 -0
- openforis_whisp/data_conversion.py +86 -44
- openforis_whisp/datasets.py +298 -225
- openforis_whisp/logger.py +26 -0
- openforis_whisp/parameters/__init__.py +0 -0
- openforis_whisp/parameters/lookup_gaul1_admin.py +18663 -0
- openforis_whisp/reformat.py +198 -2
- openforis_whisp/stats.py +488 -68
- {openforis_whisp-2.0.0b2.dist-info → openforis_whisp-3.0.0a1.dist-info}/METADATA +1 -1
- openforis_whisp-3.0.0a1.dist-info/RECORD +20 -0
- openforis_whisp-2.0.0b2.dist-info/RECORD +0 -16
- {openforis_whisp-2.0.0b2.dist-info → openforis_whisp-3.0.0a1.dist-info}/LICENSE +0 -0
- {openforis_whisp-2.0.0b2.dist-info → openforis_whisp-3.0.0a1.dist-info}/WHEEL +0 -0
openforis_whisp/stats.py
CHANGED
|
@@ -3,6 +3,7 @@ import pandas as pd
|
|
|
3
3
|
from pathlib import Path
|
|
4
4
|
from .datasets import combine_datasets
|
|
5
5
|
import json
|
|
6
|
+
import logging
|
|
6
7
|
import country_converter as coco
|
|
7
8
|
from openforis_whisp.parameters.config_runtime import (
|
|
8
9
|
plot_id_column,
|
|
@@ -34,8 +35,57 @@ from .reformat import (
|
|
|
34
35
|
|
|
35
36
|
# NB functions that included "formatted" in the name apply a schema for validation and reformatting of the output dataframe. The schema is created from lookup tables.
|
|
36
37
|
|
|
38
|
+
# ============================================================================
|
|
39
|
+
# PERFORMANCE OPTIMIZATION: Cache expensive Earth Engine datasets
|
|
40
|
+
# ============================================================================
|
|
41
|
+
# These images/collections are loaded once and reused across all features
|
|
42
|
+
# to avoid repeated expensive operations. This saves 7-15 seconds per analysis.
|
|
37
43
|
|
|
38
|
-
|
|
44
|
+
_WATER_FLAG_IMAGE = None
|
|
45
|
+
_admin_boundaries_FC = None
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def get_water_flag_image():
|
|
49
|
+
"""
|
|
50
|
+
Get cached water flag image.
|
|
51
|
+
|
|
52
|
+
OPTIMIZATION: Water flag image is created once and reused for all features.
|
|
53
|
+
This avoids recreating ocean/water datasets for every feature (previously
|
|
54
|
+
called in get_type_and_location for each feature).
|
|
55
|
+
|
|
56
|
+
Returns
|
|
57
|
+
-------
|
|
58
|
+
ee.Image
|
|
59
|
+
Cached water flag image
|
|
60
|
+
"""
|
|
61
|
+
global _WATER_FLAG_IMAGE
|
|
62
|
+
if _WATER_FLAG_IMAGE is None:
|
|
63
|
+
_WATER_FLAG_IMAGE = water_flag_all_prep()
|
|
64
|
+
return _WATER_FLAG_IMAGE
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def get_admin_boundaries_fc():
|
|
68
|
+
"""
|
|
69
|
+
Get cached GAUL 2024 L1 administrative boundary feature collection.
|
|
70
|
+
|
|
71
|
+
OPTIMIZATION: GAUL 2024 L1 collection is loaded once and reused for all features.
|
|
72
|
+
This avoids loading the large FeatureCollection for every feature (previously
|
|
73
|
+
called in get_admin_boundaries_info for each feature).
|
|
74
|
+
|
|
75
|
+
Returns
|
|
76
|
+
-------
|
|
77
|
+
ee.FeatureCollection
|
|
78
|
+
Cached GAUL 2024 L1 administrative boundary feature collection
|
|
79
|
+
"""
|
|
80
|
+
global _admin_boundaries_FC
|
|
81
|
+
if _admin_boundaries_FC is None:
|
|
82
|
+
_admin_boundaries_FC = ee.FeatureCollection(
|
|
83
|
+
"projects/sat-io/open-datasets/FAO/GAUL/GAUL_2024_L1"
|
|
84
|
+
)
|
|
85
|
+
return _admin_boundaries_FC
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
def whisp_formatted_stats_geojson_to_df_legacy(
|
|
39
89
|
input_geojson_filepath: Path | str,
|
|
40
90
|
external_id_column=None,
|
|
41
91
|
remove_geom=False,
|
|
@@ -43,9 +93,15 @@ def whisp_formatted_stats_geojson_to_df(
|
|
|
43
93
|
unit_type="ha",
|
|
44
94
|
whisp_image=None,
|
|
45
95
|
custom_bands=None, # New parameter
|
|
96
|
+
validate_geometries: bool = False,
|
|
46
97
|
) -> pd.DataFrame:
|
|
47
98
|
"""
|
|
48
|
-
|
|
99
|
+
Legacy function for basic Whisp stats extraction.
|
|
100
|
+
|
|
101
|
+
DEPRECATED: This is the original implementation maintained for backward compatibility.
|
|
102
|
+
Use whisp_formatted_stats_geojson_to_df() for new code, which provides automatic
|
|
103
|
+
optimization, formatting, and schema validation.
|
|
104
|
+
|
|
49
105
|
Converts a GeoJSON file to a pandas DataFrame containing Whisp stats for the input ROI.
|
|
50
106
|
Output df is validated against a pandera schema (created on the fly from the two lookup CSVs).
|
|
51
107
|
|
|
@@ -79,13 +135,48 @@ def whisp_formatted_stats_geojson_to_df(
|
|
|
79
135
|
- List of band names: ['Aa_test', 'elevation']
|
|
80
136
|
- Dict with types: {'Aa_test': 'float64', 'elevation': 'float32'}
|
|
81
137
|
- None: preserves all extra columns automatically
|
|
138
|
+
validate_geometries : bool, optional
|
|
139
|
+
Whether to validate and fix invalid geometries, by default False.
|
|
140
|
+
Set to True to automatically fix invalid/self-intersecting polygons.
|
|
82
141
|
|
|
83
142
|
Returns
|
|
84
143
|
-------
|
|
85
144
|
df_stats : pd.DataFrame
|
|
86
145
|
The DataFrame containing the Whisp stats for the input ROI.
|
|
87
146
|
"""
|
|
88
|
-
|
|
147
|
+
# Load GeoJSON and validate geometries if requested
|
|
148
|
+
if validate_geometries:
|
|
149
|
+
import json
|
|
150
|
+
import geopandas as gpd
|
|
151
|
+
from shapely.validation import make_valid
|
|
152
|
+
import logging as py_logging
|
|
153
|
+
|
|
154
|
+
logger = py_logging.getLogger("whisp-legacy")
|
|
155
|
+
|
|
156
|
+
# Load GeoJSON file
|
|
157
|
+
with open(input_geojson_filepath, "r") as f:
|
|
158
|
+
geojson_data = json.load(f)
|
|
159
|
+
|
|
160
|
+
# Convert to GeoDataFrame
|
|
161
|
+
gdf = gpd.GeoDataFrame.from_features(geojson_data["features"])
|
|
162
|
+
|
|
163
|
+
# Validate and fix invalid geometries
|
|
164
|
+
valid_count = gdf.geometry.is_valid.sum()
|
|
165
|
+
invalid_count = len(gdf) - valid_count
|
|
166
|
+
if invalid_count > 0:
|
|
167
|
+
logger.warning(f"Fixing {invalid_count} invalid geometries")
|
|
168
|
+
gdf["geometry"] = gdf["geometry"].apply(
|
|
169
|
+
lambda g: make_valid(g) if g and not g.is_valid else g
|
|
170
|
+
)
|
|
171
|
+
|
|
172
|
+
# Convert back to GeoJSON dict (stays in memory - no temp files!)
|
|
173
|
+
geojson_cleaned = json.loads(gdf.to_json())
|
|
174
|
+
|
|
175
|
+
# OPTIMIZATION: Pass GeoJSON dict directly - eliminates file I/O overhead
|
|
176
|
+
feature_collection = convert_geojson_to_ee(geojson_cleaned)
|
|
177
|
+
else:
|
|
178
|
+
# Original path - no validation
|
|
179
|
+
feature_collection = convert_geojson_to_ee(str(input_geojson_filepath))
|
|
89
180
|
|
|
90
181
|
return whisp_formatted_stats_ee_to_df(
|
|
91
182
|
feature_collection,
|
|
@@ -98,6 +189,169 @@ def whisp_formatted_stats_geojson_to_df(
|
|
|
98
189
|
)
|
|
99
190
|
|
|
100
191
|
|
|
192
|
+
def whisp_formatted_stats_geojson_to_df(
|
|
193
|
+
input_geojson_filepath: Path | str,
|
|
194
|
+
external_id_column=None,
|
|
195
|
+
remove_geom=False,
|
|
196
|
+
national_codes=None,
|
|
197
|
+
unit_type="ha",
|
|
198
|
+
whisp_image=None,
|
|
199
|
+
custom_bands=None,
|
|
200
|
+
mode: str = "sequential",
|
|
201
|
+
batch_size: int = 10,
|
|
202
|
+
max_concurrent: int = 20,
|
|
203
|
+
validate_geometries: bool = False,
|
|
204
|
+
) -> pd.DataFrame:
|
|
205
|
+
"""
|
|
206
|
+
Main entry point for converting GeoJSON to Whisp statistics.
|
|
207
|
+
|
|
208
|
+
Routes to the appropriate processing mode with automatic formatting and validation.
|
|
209
|
+
|
|
210
|
+
Converts a GeoJSON file to a pandas DataFrame containing Whisp stats for the input ROI.
|
|
211
|
+
Output DataFrame is validated against a Pandera schema (created from lookup CSVs).
|
|
212
|
+
Results are automatically formatted and unit-converted (ha or percent).
|
|
213
|
+
|
|
214
|
+
If `external_id_column` is provided, it will be used to link external identifiers
|
|
215
|
+
from the input GeoJSON to the output DataFrame.
|
|
216
|
+
|
|
217
|
+
Parameters
|
|
218
|
+
----------
|
|
219
|
+
input_geojson_filepath : Path | str
|
|
220
|
+
The filepath to the GeoJSON of the ROI to analyze.
|
|
221
|
+
external_id_column : str, optional
|
|
222
|
+
The column in the GeoJSON containing external IDs to be preserved in the output DataFrame.
|
|
223
|
+
This column must exist as a property in ALL features of the GeoJSON file.
|
|
224
|
+
Use debug_feature_collection_properties() to inspect available properties if you encounter errors.
|
|
225
|
+
remove_geom : bool, default=False
|
|
226
|
+
If True, the geometry of the GeoJSON is removed from the output DataFrame.
|
|
227
|
+
national_codes : list, optional
|
|
228
|
+
List of ISO2 country codes to include national datasets.
|
|
229
|
+
unit_type: str, optional
|
|
230
|
+
Whether to use hectares ("ha") or percentage ("percent"), by default "ha".
|
|
231
|
+
whisp_image : ee.Image, optional
|
|
232
|
+
Pre-combined multiband Earth Engine Image containing all Whisp datasets.
|
|
233
|
+
If provided, this image will be used instead of combining datasets based on national_codes.
|
|
234
|
+
If None, datasets will be combined automatically using national_codes parameter.
|
|
235
|
+
custom_bands : list or dict, optional
|
|
236
|
+
Custom band information for extra columns. Can be:
|
|
237
|
+
- List of band names: ['Aa_test', 'elevation']
|
|
238
|
+
- Dict with types: {'Aa_test': 'float64', 'elevation': 'float32'}
|
|
239
|
+
- None: preserves all extra columns automatically
|
|
240
|
+
mode : str, optional
|
|
241
|
+
Processing mode, by default "sequential":
|
|
242
|
+
- "concurrent": Uses high-volume endpoint with concurrent batching (recommended for large files)
|
|
243
|
+
- "sequential": Uses standard endpoint for sequential processing (more stable)
|
|
244
|
+
- "legacy": Uses original implementation (basic stats extraction only, no formatting)
|
|
245
|
+
batch_size : int, optional
|
|
246
|
+
Features per batch for concurrent/sequential modes, by default 10.
|
|
247
|
+
Only applicable for "concurrent" and "sequential" modes.
|
|
248
|
+
max_concurrent : int, optional
|
|
249
|
+
Maximum concurrent EE calls for concurrent mode, by default 20.
|
|
250
|
+
Only applicable for "concurrent" mode.
|
|
251
|
+
validate_geometries : bool, optional
|
|
252
|
+
Whether to validate and fix invalid geometries, by default False.
|
|
253
|
+
Set to True to automatically fix invalid/self-intersecting polygons.
|
|
254
|
+
For production workflows, it's recommended to use geometry validation and
|
|
255
|
+
cleaning tools BEFORE processing with this function.
|
|
256
|
+
|
|
257
|
+
Returns
|
|
258
|
+
-------
|
|
259
|
+
df_stats : pd.DataFrame
|
|
260
|
+
The DataFrame containing the Whisp stats for the input ROI,
|
|
261
|
+
automatically formatted and validated.
|
|
262
|
+
|
|
263
|
+
Examples
|
|
264
|
+
--------
|
|
265
|
+
>>> # Use the default processing mode ("sequential")
|
|
266
|
+
>>> df = whisp_formatted_stats_geojson_to_df("data.geojson")
|
|
267
|
+
|
|
268
|
+
>>> # Use sequential processing for more stable/predictable results
|
|
269
|
+
>>> df = whisp_formatted_stats_geojson_to_df(
|
|
270
|
+
... "data.geojson",
|
|
271
|
+
... mode="sequential"
|
|
272
|
+
... )
|
|
273
|
+
|
|
274
|
+
>>> # Adjust concurrency parameters
|
|
275
|
+
>>> df = whisp_formatted_stats_geojson_to_df(
|
|
276
|
+
... "large_data.geojson",
|
|
277
|
+
... mode="concurrent",
|
|
278
|
+
... max_concurrent=30,
|
|
279
|
+
... batch_size=15
|
|
280
|
+
... )
|
|
281
|
+
|
|
282
|
+
>>> # Use legacy mode for backward compatibility (basic extraction only)
|
|
283
|
+
>>> df = whisp_formatted_stats_geojson_to_df(
|
|
284
|
+
... "data.geojson",
|
|
285
|
+
... mode="legacy"
|
|
286
|
+
... )
|
|
287
|
+
"""
|
|
288
|
+
# Import here to avoid circular imports
|
|
289
|
+
try:
|
|
290
|
+
from openforis_whisp.advanced_stats import (
|
|
291
|
+
whisp_formatted_stats_geojson_to_df_fast,
|
|
292
|
+
)
|
|
293
|
+
except ImportError:
|
|
294
|
+
# Fallback to legacy if advanced_stats not available
|
|
295
|
+
mode = "legacy"
|
|
296
|
+
|
|
297
|
+
logger = logging.getLogger("whisp")
|
|
298
|
+
|
|
299
|
+
if mode == "legacy":
|
|
300
|
+
# Log info if batch_size or max_concurrent were passed but won't be used
|
|
301
|
+
if batch_size != 10 or max_concurrent != 20:
|
|
302
|
+
unused = []
|
|
303
|
+
if batch_size != 10:
|
|
304
|
+
unused.append(f"batch_size={batch_size}")
|
|
305
|
+
if max_concurrent != 20:
|
|
306
|
+
unused.append(f"max_concurrent={max_concurrent}")
|
|
307
|
+
logger.info(
|
|
308
|
+
f"Mode is 'legacy': {', '.join(unused)}\n"
|
|
309
|
+
"parameter(s) are not used in legacy mode."
|
|
310
|
+
)
|
|
311
|
+
# Use original implementation (basic stats extraction only)
|
|
312
|
+
return whisp_formatted_stats_geojson_to_df_legacy(
|
|
313
|
+
input_geojson_filepath=input_geojson_filepath,
|
|
314
|
+
external_id_column=external_id_column,
|
|
315
|
+
remove_geom=remove_geom,
|
|
316
|
+
national_codes=national_codes,
|
|
317
|
+
unit_type=unit_type,
|
|
318
|
+
whisp_image=whisp_image,
|
|
319
|
+
custom_bands=custom_bands,
|
|
320
|
+
validate_geometries=validate_geometries,
|
|
321
|
+
)
|
|
322
|
+
elif mode in ("concurrent", "sequential"):
|
|
323
|
+
# Log info if batch_size or max_concurrent are not used in sequential mode
|
|
324
|
+
if mode == "sequential":
|
|
325
|
+
unused = []
|
|
326
|
+
if batch_size != 10:
|
|
327
|
+
unused.append(f"batch_size={batch_size}")
|
|
328
|
+
if max_concurrent != 20:
|
|
329
|
+
unused.append(f"max_concurrent={max_concurrent}")
|
|
330
|
+
if unused:
|
|
331
|
+
logger.info(
|
|
332
|
+
f"Mode is 'sequential': {', '.join(unused)}\n"
|
|
333
|
+
"parameter(s) are not used in sequential (single-threaded) mode."
|
|
334
|
+
)
|
|
335
|
+
# Route to fast function with explicit mode (skip auto-detection)
|
|
336
|
+
return whisp_formatted_stats_geojson_to_df_fast(
|
|
337
|
+
input_geojson_filepath=input_geojson_filepath,
|
|
338
|
+
external_id_column=external_id_column,
|
|
339
|
+
remove_geom=remove_geom,
|
|
340
|
+
national_codes=national_codes,
|
|
341
|
+
unit_type=unit_type,
|
|
342
|
+
whisp_image=whisp_image,
|
|
343
|
+
custom_bands=custom_bands,
|
|
344
|
+
mode=mode, # Pass mode directly (concurrent or sequential)
|
|
345
|
+
batch_size=batch_size,
|
|
346
|
+
max_concurrent=max_concurrent,
|
|
347
|
+
validate_geometries=validate_geometries,
|
|
348
|
+
)
|
|
349
|
+
else:
|
|
350
|
+
raise ValueError(
|
|
351
|
+
f"Invalid mode '{mode}'. Must be 'concurrent', 'sequential', or 'legacy'."
|
|
352
|
+
)
|
|
353
|
+
|
|
354
|
+
|
|
101
355
|
def whisp_formatted_stats_geojson_to_geojson(
|
|
102
356
|
input_geojson_filepath,
|
|
103
357
|
output_geojson_filepath,
|
|
@@ -141,7 +395,8 @@ def whisp_formatted_stats_geojson_to_geojson(
|
|
|
141
395
|
# Convert the df to GeoJSON
|
|
142
396
|
convert_df_to_geojson(df, output_geojson_filepath, geo_column)
|
|
143
397
|
|
|
144
|
-
|
|
398
|
+
# Suppress verbose output
|
|
399
|
+
# print(f"GeoJSON with Whisp stats saved to {output_geojson_filepath}")
|
|
145
400
|
|
|
146
401
|
|
|
147
402
|
def whisp_formatted_stats_ee_to_geojson(
|
|
@@ -425,7 +680,9 @@ def whisp_stats_ee_to_ee(
|
|
|
425
680
|
national_codes=None,
|
|
426
681
|
unit_type="ha",
|
|
427
682
|
keep_properties=None,
|
|
428
|
-
whisp_image=None,
|
|
683
|
+
whisp_image=None,
|
|
684
|
+
validate_external_id=True,
|
|
685
|
+
validate_bands=False, # New parameter
|
|
429
686
|
):
|
|
430
687
|
"""
|
|
431
688
|
Process a feature collection to get statistics for each feature.
|
|
@@ -442,19 +699,25 @@ def whisp_stats_ee_to_ee(
|
|
|
442
699
|
whisp_image (ee.Image, optional): Pre-combined multiband Earth Engine Image containing
|
|
443
700
|
all Whisp datasets. If provided, this image will be used instead of combining
|
|
444
701
|
datasets based on national_codes.
|
|
702
|
+
validate_external_id (bool, optional): If True, validates that external_id_column exists
|
|
703
|
+
in all features (default: True). Set to False to skip validation and save 2-4 seconds.
|
|
704
|
+
Only disable if you're confident the column exists in all features.
|
|
445
705
|
|
|
446
706
|
Returns:
|
|
447
707
|
ee.FeatureCollection: The output feature collection with statistics.
|
|
448
708
|
"""
|
|
449
709
|
if external_id_column is not None:
|
|
450
710
|
try:
|
|
451
|
-
#
|
|
452
|
-
|
|
453
|
-
|
|
454
|
-
|
|
711
|
+
# OPTIMIZATION: Make validation optional to save 2-4 seconds
|
|
712
|
+
# Validation includes multiple .getInfo() calls which are slow
|
|
713
|
+
if validate_external_id:
|
|
714
|
+
# Validate that the external_id_column exists in all features
|
|
715
|
+
validation_result = validate_external_id_column(
|
|
716
|
+
feature_collection, external_id_column
|
|
717
|
+
)
|
|
455
718
|
|
|
456
|
-
|
|
457
|
-
|
|
719
|
+
if not validation_result["is_valid"]:
|
|
720
|
+
raise ValueError(validation_result["error_message"])
|
|
458
721
|
|
|
459
722
|
# First handle property selection, but preserve the external_id_column
|
|
460
723
|
if keep_properties is not None:
|
|
@@ -506,19 +769,27 @@ def whisp_stats_ee_to_ee(
|
|
|
506
769
|
national_codes=national_codes,
|
|
507
770
|
unit_type=unit_type,
|
|
508
771
|
whisp_image=whisp_image, # Pass through
|
|
772
|
+
validate_bands=validate_bands,
|
|
509
773
|
)
|
|
510
774
|
|
|
511
775
|
return add_id_to_feature_collection(dataset=fc, id_name=plot_id_column)
|
|
512
776
|
|
|
513
777
|
|
|
514
778
|
def _keep_fc_properties(feature_collection, keep_properties):
|
|
779
|
+
"""
|
|
780
|
+
Filter feature collection properties based on keep_properties parameter.
|
|
781
|
+
|
|
782
|
+
OPTIMIZATION: When keep_properties is True, we no longer call .getInfo()
|
|
783
|
+
to get property names. Instead, we simply return the collection as-is,
|
|
784
|
+
since True means "keep all properties". This saves 1-2 seconds.
|
|
785
|
+
"""
|
|
515
786
|
# If keep_properties is specified, select only those properties
|
|
516
787
|
if keep_properties is None:
|
|
517
788
|
feature_collection = feature_collection.select([])
|
|
518
789
|
elif keep_properties == True:
|
|
519
|
-
# If keep_properties is true,
|
|
520
|
-
|
|
521
|
-
|
|
790
|
+
# If keep_properties is true, keep all properties
|
|
791
|
+
# No need to call .select() or .getInfo() - just return as-is
|
|
792
|
+
pass
|
|
522
793
|
elif isinstance(keep_properties, list):
|
|
523
794
|
feature_collection = feature_collection.select(keep_properties)
|
|
524
795
|
else:
|
|
@@ -534,7 +805,8 @@ def whisp_stats_ee_to_df(
|
|
|
534
805
|
remove_geom=False,
|
|
535
806
|
national_codes=None,
|
|
536
807
|
unit_type="ha",
|
|
537
|
-
whisp_image=None,
|
|
808
|
+
whisp_image=None,
|
|
809
|
+
validate_bands=False, # New parameter
|
|
538
810
|
) -> pd.DataFrame:
|
|
539
811
|
"""
|
|
540
812
|
Convert a Google Earth Engine FeatureCollection to a pandas DataFrame and convert ISO3 to ISO2 country codes.
|
|
@@ -561,27 +833,52 @@ def whisp_stats_ee_to_df(
|
|
|
561
833
|
"""
|
|
562
834
|
# First, do the whisp processing to get the EE feature collection with stats
|
|
563
835
|
try:
|
|
564
|
-
|
|
565
|
-
|
|
566
|
-
|
|
567
|
-
|
|
568
|
-
|
|
569
|
-
|
|
570
|
-
|
|
571
|
-
|
|
572
|
-
|
|
573
|
-
|
|
836
|
+
try:
|
|
837
|
+
stats_feature_collection = whisp_stats_ee_to_ee(
|
|
838
|
+
feature_collection,
|
|
839
|
+
external_id_column,
|
|
840
|
+
national_codes=national_codes,
|
|
841
|
+
unit_type=unit_type,
|
|
842
|
+
whisp_image=whisp_image, # Pass through
|
|
843
|
+
validate_bands=False, # try without validation first
|
|
844
|
+
)
|
|
845
|
+
except Exception as e:
|
|
846
|
+
print(f"An error occurred during Whisp stats processing: {e}")
|
|
847
|
+
raise e
|
|
574
848
|
|
|
575
|
-
|
|
576
|
-
|
|
577
|
-
|
|
578
|
-
|
|
579
|
-
|
|
580
|
-
|
|
581
|
-
|
|
582
|
-
|
|
583
|
-
|
|
849
|
+
# Then, convert the EE feature collection to DataFrame
|
|
850
|
+
try:
|
|
851
|
+
df_stats = convert_ee_to_df(
|
|
852
|
+
ee_object=stats_feature_collection,
|
|
853
|
+
remove_geom=remove_geom,
|
|
854
|
+
)
|
|
855
|
+
except Exception as e:
|
|
856
|
+
print(f"An error occurred during the conversion from EE to DataFrame: {e}")
|
|
857
|
+
raise e
|
|
858
|
+
|
|
859
|
+
except: # retry with validation of whisp input datasets
|
|
860
|
+
try:
|
|
861
|
+
stats_feature_collection = whisp_stats_ee_to_ee(
|
|
862
|
+
feature_collection,
|
|
863
|
+
external_id_column,
|
|
864
|
+
national_codes=national_codes,
|
|
865
|
+
unit_type=unit_type,
|
|
866
|
+
whisp_image=whisp_image,
|
|
867
|
+
validate_bands=True, # If error, try with validation
|
|
868
|
+
)
|
|
869
|
+
except Exception as e:
|
|
870
|
+
print(f"An error occurred during Whisp stats processing: {e}")
|
|
871
|
+
raise e
|
|
584
872
|
|
|
873
|
+
# Then, convert the EE feature collection to DataFrame
|
|
874
|
+
try:
|
|
875
|
+
df_stats = convert_ee_to_df(
|
|
876
|
+
ee_object=stats_feature_collection,
|
|
877
|
+
remove_geom=remove_geom,
|
|
878
|
+
)
|
|
879
|
+
except Exception as e:
|
|
880
|
+
print(f"An error occurred during the conversion from EE to DataFrame: {e}")
|
|
881
|
+
raise e
|
|
585
882
|
try:
|
|
586
883
|
df_stats = convert_iso3_to_iso2(
|
|
587
884
|
df=df_stats,
|
|
@@ -599,6 +896,13 @@ def whisp_stats_ee_to_df(
|
|
|
599
896
|
print(f"An error occurred during point geometry area adjustment: {e}")
|
|
600
897
|
# Continue without the adjustment rather than failing completely
|
|
601
898
|
|
|
899
|
+
# Reformat geometry types (MultiPolygon -> Polygon)
|
|
900
|
+
try:
|
|
901
|
+
df_stats = reformat_geometry_type(df_stats)
|
|
902
|
+
except Exception as e:
|
|
903
|
+
print(f"An error occurred during geometry type reformatting: {e}")
|
|
904
|
+
# Continue without the adjustment rather than failing completely
|
|
905
|
+
|
|
602
906
|
return df_stats
|
|
603
907
|
|
|
604
908
|
|
|
@@ -623,12 +927,6 @@ def set_point_geometry_area_to_zero(df: pd.DataFrame) -> pd.DataFrame:
|
|
|
623
927
|
)
|
|
624
928
|
return df
|
|
625
929
|
|
|
626
|
-
if geometry_area_column not in df.columns:
|
|
627
|
-
print(
|
|
628
|
-
f"Warning: {geometry_area_column} column not found. Skipping area adjustment for points."
|
|
629
|
-
)
|
|
630
|
-
return df
|
|
631
|
-
|
|
632
930
|
# Create a copy to avoid modifying the original
|
|
633
931
|
df_modified = df.copy()
|
|
634
932
|
|
|
@@ -644,6 +942,43 @@ def set_point_geometry_area_to_zero(df: pd.DataFrame) -> pd.DataFrame:
|
|
|
644
942
|
return df_modified
|
|
645
943
|
|
|
646
944
|
|
|
945
|
+
def reformat_geometry_type(df: pd.DataFrame) -> pd.DataFrame:
|
|
946
|
+
"""
|
|
947
|
+
Reformat geometry type classification in the DataFrame output.
|
|
948
|
+
Standardizes MultiPolygon geometry type to Polygon for consistent output.
|
|
949
|
+
|
|
950
|
+
Parameters
|
|
951
|
+
----------
|
|
952
|
+
df : pd.DataFrame
|
|
953
|
+
DataFrame containing geometry type column
|
|
954
|
+
|
|
955
|
+
Returns
|
|
956
|
+
-------
|
|
957
|
+
pd.DataFrame
|
|
958
|
+
DataFrame with standardized geometry types
|
|
959
|
+
"""
|
|
960
|
+
# Check if required columns exist
|
|
961
|
+
if geometry_type_column not in df.columns:
|
|
962
|
+
print(
|
|
963
|
+
f"Warning: {geometry_type_column} column not found. Skipping geometry type reformatting."
|
|
964
|
+
)
|
|
965
|
+
return df
|
|
966
|
+
|
|
967
|
+
# Create a copy to avoid modifying the original
|
|
968
|
+
df_modified = df.copy()
|
|
969
|
+
|
|
970
|
+
# Reformat MultiPolygon to Polygon
|
|
971
|
+
multipolygon_mask = df_modified[geometry_type_column] == "MultiPolygon"
|
|
972
|
+
df_modified.loc[multipolygon_mask, geometry_type_column] = "Polygon"
|
|
973
|
+
|
|
974
|
+
# Log the changes
|
|
975
|
+
num_reformatted = multipolygon_mask.sum()
|
|
976
|
+
# if num_reformatted > 0:
|
|
977
|
+
# print(f"Reformatted {num_reformatted} MultiPolygon geometries to Polygon")
|
|
978
|
+
|
|
979
|
+
return df_modified
|
|
980
|
+
|
|
981
|
+
|
|
647
982
|
def whisp_stats_ee_to_drive(
|
|
648
983
|
feature_collection: ee.FeatureCollection,
|
|
649
984
|
external_id_column=None,
|
|
@@ -696,7 +1031,11 @@ def whisp_stats_ee_to_drive(
|
|
|
696
1031
|
|
|
697
1032
|
# Get stats for a feature or feature collection
|
|
698
1033
|
def get_stats(
|
|
699
|
-
feature_or_feature_col,
|
|
1034
|
+
feature_or_feature_col,
|
|
1035
|
+
national_codes=None,
|
|
1036
|
+
unit_type="ha",
|
|
1037
|
+
whisp_image=None,
|
|
1038
|
+
validate_bands=False,
|
|
700
1039
|
):
|
|
701
1040
|
"""
|
|
702
1041
|
Get stats for a feature or feature collection with optional pre-combined image.
|
|
@@ -725,16 +1064,27 @@ def get_stats(
|
|
|
725
1064
|
img_combined = whisp_image
|
|
726
1065
|
print("Using provided whisp_image")
|
|
727
1066
|
else:
|
|
728
|
-
img_combined = combine_datasets(
|
|
1067
|
+
img_combined = combine_datasets(
|
|
1068
|
+
national_codes=national_codes,
|
|
1069
|
+
validate_bands=validate_bands,
|
|
1070
|
+
include_context_bands=False,
|
|
1071
|
+
)
|
|
729
1072
|
print(f"Combining datasets with national_codes: {national_codes}")
|
|
730
1073
|
|
|
731
1074
|
# Check if the input is a Feature or a FeatureCollection
|
|
732
1075
|
if isinstance(feature_or_feature_col, ee.Feature):
|
|
733
1076
|
print("Processing single feature")
|
|
1077
|
+
# OPTIMIZATION: Create cached images for single feature processing
|
|
1078
|
+
water_all = get_water_flag_image()
|
|
1079
|
+
bounds_ADM1 = get_admin_boundaries_fc()
|
|
734
1080
|
output = ee.FeatureCollection(
|
|
735
1081
|
[
|
|
736
1082
|
get_stats_feature(
|
|
737
|
-
feature_or_feature_col,
|
|
1083
|
+
feature_or_feature_col,
|
|
1084
|
+
img_combined,
|
|
1085
|
+
unit_type=unit_type,
|
|
1086
|
+
water_all=water_all,
|
|
1087
|
+
bounds_ADM1=bounds_ADM1,
|
|
738
1088
|
)
|
|
739
1089
|
]
|
|
740
1090
|
)
|
|
@@ -756,6 +1106,10 @@ def get_stats_fc(feature_col, national_codes=None, unit_type="ha", img_combined=
|
|
|
756
1106
|
"""
|
|
757
1107
|
Calculate statistics for a feature collection using Whisp datasets.
|
|
758
1108
|
|
|
1109
|
+
OPTIMIZATION: Creates water flag and admin_boundaries images once and reuses
|
|
1110
|
+
them for all features instead of recreating them for each feature.
|
|
1111
|
+
This saves 7-15 seconds per analysis.
|
|
1112
|
+
|
|
759
1113
|
Parameters
|
|
760
1114
|
----------
|
|
761
1115
|
feature_col : ee.FeatureCollection
|
|
@@ -775,15 +1129,19 @@ def get_stats_fc(feature_col, national_codes=None, unit_type="ha", img_combined=
|
|
|
775
1129
|
ee.FeatureCollection
|
|
776
1130
|
Feature collection with calculated statistics
|
|
777
1131
|
"""
|
|
778
|
-
|
|
779
|
-
#
|
|
780
|
-
|
|
781
|
-
|
|
1132
|
+
# OPTIMIZATION: Create cached images once before processing features
|
|
1133
|
+
# These will be reused for all features instead of being recreated each time
|
|
1134
|
+
water_all = get_water_flag_image()
|
|
1135
|
+
bounds_ADM1 = get_admin_boundaries_fc()
|
|
782
1136
|
|
|
783
1137
|
out_feature_col = ee.FeatureCollection(
|
|
784
1138
|
feature_col.map(
|
|
785
1139
|
lambda feature: get_stats_feature(
|
|
786
|
-
feature,
|
|
1140
|
+
feature,
|
|
1141
|
+
img_combined,
|
|
1142
|
+
unit_type=unit_type,
|
|
1143
|
+
water_all=water_all,
|
|
1144
|
+
bounds_ADM1=bounds_ADM1,
|
|
787
1145
|
)
|
|
788
1146
|
)
|
|
789
1147
|
)
|
|
@@ -796,10 +1154,15 @@ def get_stats_fc(feature_col, national_codes=None, unit_type="ha", img_combined=
|
|
|
796
1154
|
# Note: This function doesn't need whisp_image parameter since it already accepts img_combined directly
|
|
797
1155
|
|
|
798
1156
|
|
|
799
|
-
def get_stats_feature(
|
|
1157
|
+
def get_stats_feature(
|
|
1158
|
+
feature, img_combined, unit_type="ha", water_all=None, bounds_ADM1=None
|
|
1159
|
+
):
|
|
800
1160
|
"""
|
|
801
1161
|
Get statistics for a single feature using a pre-combined image.
|
|
802
1162
|
|
|
1163
|
+
OPTIMIZATION: Accepts cached water/admin_boundaries images to avoid recreating
|
|
1164
|
+
them for every feature.
|
|
1165
|
+
|
|
803
1166
|
Parameters
|
|
804
1167
|
----------
|
|
805
1168
|
feature : ee.Feature
|
|
@@ -808,6 +1171,10 @@ def get_stats_feature(feature, img_combined, unit_type="ha"):
|
|
|
808
1171
|
Pre-combined image with all the datasets
|
|
809
1172
|
unit_type : str, optional
|
|
810
1173
|
Whether to use hectares ("ha") or percentage ("percent"), by default "ha".
|
|
1174
|
+
water_all : ee.Image, optional
|
|
1175
|
+
Cached water flag image
|
|
1176
|
+
bounds_ADM1 : ee.FeatureCollection, optional
|
|
1177
|
+
Cached admin_boundaries feature collection
|
|
811
1178
|
|
|
812
1179
|
Returns
|
|
813
1180
|
-------
|
|
@@ -822,8 +1189,8 @@ def get_stats_feature(feature, img_combined, unit_type="ha"):
|
|
|
822
1189
|
tileScale=8,
|
|
823
1190
|
)
|
|
824
1191
|
|
|
825
|
-
# Get basic feature information
|
|
826
|
-
feature_info = get_type_and_location(feature)
|
|
1192
|
+
# Get basic feature information with cached images
|
|
1193
|
+
feature_info = get_type_and_location(feature, water_all, bounds_ADM1)
|
|
827
1194
|
|
|
828
1195
|
# add statistics unit type (e.g., percentage or hectares) to dictionary
|
|
829
1196
|
stats_unit_type = ee.Dictionary({stats_unit_type_column: unit_type})
|
|
@@ -872,22 +1239,51 @@ def get_stats_feature(feature, img_combined, unit_type="ha"):
|
|
|
872
1239
|
|
|
873
1240
|
|
|
874
1241
|
# Get basic feature information - uses admin and water datasets in gee.
|
|
875
|
-
def get_type_and_location(feature):
|
|
876
|
-
"""
|
|
1242
|
+
def get_type_and_location(feature, water_all=None, bounds_ADM1=None):
|
|
1243
|
+
"""
|
|
1244
|
+
Extracts basic feature information including country, admin area, geometry type, coordinates, and water flags.
|
|
1245
|
+
|
|
1246
|
+
OPTIMIZATION: Accepts cached water flag image and admin_boundaries collection
|
|
1247
|
+
to avoid recreating them for every feature (saves 7-15 seconds per analysis).
|
|
1248
|
+
|
|
1249
|
+
Parameters
|
|
1250
|
+
----------
|
|
1251
|
+
feature : ee.Feature
|
|
1252
|
+
The feature to extract information from
|
|
1253
|
+
water_all : ee.Image, optional
|
|
1254
|
+
Cached water flag image. If None, creates it.
|
|
1255
|
+
bounds_ADM1 : ee.FeatureCollection, optional
|
|
1256
|
+
Cached admin_boundaries feature collection. If None, loads it.
|
|
877
1257
|
|
|
1258
|
+
Returns
|
|
1259
|
+
-------
|
|
1260
|
+
ee.Dictionary
|
|
1261
|
+
Dictionary with feature information
|
|
1262
|
+
"""
|
|
878
1263
|
# Get centroid of the feature's geometry
|
|
879
|
-
centroid = feature.geometry().centroid(1)
|
|
1264
|
+
centroid = feature.geometry().centroid(0.1)
|
|
1265
|
+
|
|
1266
|
+
# OPTIMIZATION: Use cached admin_boundaries
|
|
1267
|
+
if bounds_ADM1 is None:
|
|
1268
|
+
bounds_ADM1 = get_admin_boundaries_fc()
|
|
880
1269
|
|
|
881
|
-
# Fetch location info from
|
|
882
|
-
location = ee.Dictionary(
|
|
883
|
-
country = ee.Dictionary({iso3_country_column: location.get("
|
|
1270
|
+
# Fetch location info from GAUL 2024 L1 (country, admin)
|
|
1271
|
+
location = ee.Dictionary(get_admin_boundaries_info(centroid, bounds_ADM1))
|
|
1272
|
+
country = ee.Dictionary({iso3_country_column: location.get("iso3_code")})
|
|
884
1273
|
|
|
885
1274
|
admin_1 = ee.Dictionary(
|
|
886
|
-
{admin_1_column: location.get("
|
|
887
|
-
) # Administrative level 1 (
|
|
1275
|
+
{admin_1_column: location.get("gaul1_name")}
|
|
1276
|
+
) # Administrative level 1 (from GAUL 2024 L1)
|
|
1277
|
+
|
|
1278
|
+
# OPTIMIZATION: Use cached water flag image
|
|
1279
|
+
if water_all is None:
|
|
1280
|
+
water_all = get_water_flag_image()
|
|
1281
|
+
|
|
1282
|
+
# OPTIMIZATION: Use cached water flag image
|
|
1283
|
+
if water_all is None:
|
|
1284
|
+
water_all = get_water_flag_image()
|
|
888
1285
|
|
|
889
1286
|
# Prepare the water flag information
|
|
890
|
-
water_all = water_flag_all_prep()
|
|
891
1287
|
water_flag_dict = value_at_point_flag(
|
|
892
1288
|
point=centroid, image=water_all, band_name=water_flag, output_name=water_flag
|
|
893
1289
|
)
|
|
@@ -899,8 +1295,12 @@ def get_type_and_location(feature):
|
|
|
899
1295
|
coords_list = centroid.coordinates()
|
|
900
1296
|
coords_dict = ee.Dictionary(
|
|
901
1297
|
{
|
|
902
|
-
centroid_x_coord_column: coords_list.get(0)
|
|
903
|
-
|
|
1298
|
+
centroid_x_coord_column: ee.Number(coords_list.get(0)).format(
|
|
1299
|
+
"%.6f"
|
|
1300
|
+
), # Longitude (6 dp)
|
|
1301
|
+
centroid_y_coord_column: ee.Number(coords_list.get(1)).format(
|
|
1302
|
+
"%.6f"
|
|
1303
|
+
), # Latitude (6 dp)
|
|
904
1304
|
}
|
|
905
1305
|
)
|
|
906
1306
|
|
|
@@ -938,16 +1338,36 @@ def percent_and_format(val, area_ha):
|
|
|
938
1338
|
return ee.Number(formatted_value)
|
|
939
1339
|
|
|
940
1340
|
|
|
941
|
-
#
|
|
942
|
-
def
|
|
943
|
-
|
|
944
|
-
|
|
945
|
-
|
|
1341
|
+
# GAUL 2024 L1 - admin units from FAO, allows commercial use
|
|
1342
|
+
def get_admin_boundaries_info(geometry, bounds_ADM1=None):
|
|
1343
|
+
"""
|
|
1344
|
+
Get GAUL 2024 L1 info for a geometry (country ISO3 code and admin boundary name).
|
|
1345
|
+
|
|
1346
|
+
OPTIMIZATION: Accepts cached GAUL 2024 L1 FeatureCollection to avoid
|
|
1347
|
+
reloading it for every feature (saves 2-5 seconds per analysis).
|
|
1348
|
+
|
|
1349
|
+
Parameters
|
|
1350
|
+
----------
|
|
1351
|
+
geometry : ee.Geometry
|
|
1352
|
+
The geometry to query
|
|
1353
|
+
bounds_ADM1 : ee.FeatureCollection, optional
|
|
1354
|
+
Cached GAUL 2024 L1 feature collection. If None, loads it.
|
|
1355
|
+
|
|
1356
|
+
Returns
|
|
1357
|
+
-------
|
|
1358
|
+
ee.Dictionary
|
|
1359
|
+
Dictionary with iso3_code (country) and gaul1_name (admin boundary name)
|
|
1360
|
+
"""
|
|
1361
|
+
if bounds_ADM1 is None:
|
|
1362
|
+
bounds_ADM1 = get_admin_boundaries_fc()
|
|
1363
|
+
|
|
1364
|
+
polygonsIntersectPoint = bounds_ADM1.filterBounds(geometry)
|
|
1365
|
+
backup_dict = ee.Dictionary({"iso3_code": "Unknown", "gaul1_name": "Unknown"})
|
|
946
1366
|
return ee.Algorithms.If(
|
|
947
1367
|
polygonsIntersectPoint.size().gt(0),
|
|
948
1368
|
polygonsIntersectPoint.first()
|
|
949
1369
|
.toDictionary()
|
|
950
|
-
.select(["
|
|
1370
|
+
.select(["iso3_code", "gaul1_name"]),
|
|
951
1371
|
backup_dict,
|
|
952
1372
|
)
|
|
953
1373
|
|