openforis-whisp 2.0.0a6__py3-none-any.whl → 2.0.0b1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
openforis_whisp/stats.py CHANGED
@@ -1,1134 +1,1228 @@
1
- import ee
2
- import pandas as pd
3
- from pathlib import Path
4
- from .datasets import combine_datasets
5
- import json
6
- import country_converter as coco
7
- from openforis_whisp.parameters.config_runtime import (
8
- plot_id_column,
9
- external_id_column,
10
- geometry_type_column,
11
- geometry_area_column,
12
- geometry_area_column_formatting,
13
- centroid_x_coord_column,
14
- centroid_y_coord_column,
15
- iso3_country_column,
16
- iso2_country_column,
17
- admin_1_column,
18
- stats_unit_type_column,
19
- stats_area_columns_formatting,
20
- stats_percent_columns_formatting,
21
- water_flag,
22
- )
23
- from .data_conversion import (
24
- convert_ee_to_df,
25
- convert_geojson_to_ee,
26
- convert_ee_to_geojson,
27
- # convert_csv_to_geojson,
28
- convert_df_to_geojson,
29
- ) # copied functions from whisp-api and geemap (accessed 2024) to avoid dependency
30
- from .reformat import validate_dataframe_using_lookups
31
-
32
- # NB functions that included "formatted" in the name apply a schema for validation and reformatting of the output dataframe. The schema is created from lookup tables.
33
-
34
-
35
def whisp_formatted_stats_geojson_to_df(
    input_geojson_filepath: Path | str,
    external_id_column=None,
    remove_geom=False,
    national_codes=None,
    unit_type="ha",
) -> pd.DataFrame:
    """
    Main entry point for most users.

    Converts a GeoJSON file into a pandas DataFrame of Whisp stats for the
    input ROI, validated against a pandera schema built on the fly from the
    two lookup CSVs.

    The GeoJSON is first converted to an Earth Engine FeatureCollection,
    which is then processed to extract the Whisp statistics. If
    `external_id_column` is given, that property is used to carry external
    identifiers from the input GeoJSON through to the output DataFrame.

    Parameters
    ----------
    input_geojson_filepath : Path | str
        Filepath to the GeoJSON of the ROI to analyze.
    external_id_column : str, optional
        Property holding external IDs to preserve in the output DataFrame.
        Must exist on ALL features of the GeoJSON; use
        debug_feature_collection_properties() to inspect available
        properties if you hit errors.
    remove_geom : bool, default=False
        If True, drop the geometry from the output DataFrame.
    national_codes : list, optional
        ISO2 country codes whose national datasets should be included.
    unit_type : str, optional
        "ha" for hectares or "percent" for percentages, by default "ha".

    Returns
    -------
    pd.DataFrame
        DataFrame containing the Whisp stats for the input ROI.
    """
    # Delegate the heavy lifting to the EE-based variant.
    roi_fc = convert_geojson_to_ee(str(input_geojson_filepath))

    return whisp_formatted_stats_ee_to_df(
        roi_fc,
        external_id_column,
        remove_geom,
        national_codes=national_codes,
        unit_type=unit_type,
    )
83
-
84
-
85
def whisp_formatted_stats_geojson_to_geojson(
    input_geojson_filepath,
    output_geojson_filepath,
    external_id_column=None,
    geo_column: str = "geo",
    national_codes=None,
    unit_type="ha",
):
    """
    Run the formatted Whisp analysis on a GeoJSON file and write the result
    back out as a GeoJSON file (stats DataFrame re-joined to its geometries).

    Parameters
    ----------
    input_geojson_filepath : str
        Filepath of the input GeoJSON.
    output_geojson_filepath : str
        Filepath where the output GeoJSON is written.
    external_id_column : str, optional
        Column containing external IDs, by default None.
    geo_column : str, optional
        Column containing GeoJSON geometries, by default "geo".
    national_codes : list, optional
        ISO2 country codes whose national datasets should be included.
    unit_type : str, optional
        "ha" for hectares or "percent" for percentages, by default "ha".

    Returns
    -------
    None
    """
    # Compute the validated stats table first...
    stats_df = whisp_formatted_stats_geojson_to_df(
        input_geojson_filepath=input_geojson_filepath,
        external_id_column=external_id_column,
        national_codes=national_codes,
        unit_type=unit_type,
    )

    # ...then serialize it (with geometries) to disk.
    convert_df_to_geojson(stats_df, output_geojson_filepath, geo_column)

    print(f"GeoJSON with Whisp stats saved to {output_geojson_filepath}")
125
-
126
-
127
def whisp_formatted_stats_ee_to_geojson(
    feature_collection: ee.FeatureCollection,
    output_geojson_filepath: str,
    external_id_column=None,
    geo_column: str = "geo",
    national_codes=None,
    unit_type="ha",
):
    """
    Run the formatted Whisp analysis on an Earth Engine FeatureCollection
    and save the result as a GeoJSON file.

    Parameters
    ----------
    feature_collection : ee.FeatureCollection
        Feature collection of the ROI to analyze.
    output_geojson_filepath : str
        Filepath where the output GeoJSON is written.
    external_id_column : str, optional
        Column containing external IDs, by default None.
    geo_column : str, optional
        Column containing GeoJSON geometries, by default "geo".
    national_codes : list, optional
        ISO2 country codes whose national datasets should be included.
    unit_type : str, optional
        "ha" for hectares or "percent" for percentages, by default "ha".

    Returns
    -------
    None
    """
    # Validated stats as a pandas DataFrame (geometry retained by default).
    stats_df = whisp_formatted_stats_ee_to_df(
        feature_collection,
        external_id_column,
        national_codes=national_codes,
        unit_type=unit_type,
    )

    # Write the DataFrame out as GeoJSON.
    convert_df_to_geojson(stats_df, output_geojson_filepath, geo_column)

    print(f"GeoJSON with Whisp stats saved to {output_geojson_filepath}")
168
-
169
-
170
def whisp_formatted_stats_ee_to_df(
    feature_collection: ee.FeatureCollection,
    external_id_column=None,
    remove_geom=False,
    national_codes=None,
    unit_type="ha",
) -> pd.DataFrame:
    """
    Convert a feature collection to a schema-validated DataFrame of Whisp
    statistics.

    Parameters
    ----------
    feature_collection : ee.FeatureCollection
        Feature collection of the ROI to analyze.
    external_id_column : str, optional
        Name of the external ID column, by default None.
    remove_geom : bool, optional
        Whether to drop the geometry column, by default False.
    national_codes : list, optional
        ISO2 country codes whose national datasets should be included.
    unit_type : str, optional
        "ha" for hectares or "percent" for percentages, by default "ha".

    Returns
    -------
    pd.DataFrame
        Validated DataFrame containing the Whisp stats for the input ROI.
    """
    # Raw (unvalidated) stats from the GEE processing chain.
    raw_df = whisp_stats_ee_to_df(
        feature_collection,
        external_id_column,
        remove_geom,
        national_codes=national_codes,
        unit_type=unit_type,
    )

    # national_codes is forwarded so the generated schema is filtered to
    # the same set of (global + national) dataset columns.
    return validate_dataframe_using_lookups(raw_df, national_codes=national_codes)
212
-
213
-
214
- ### functions without additional formatting below (i.e., raw output from GEE processing without schema validation step)
215
-
216
-
217
def whisp_stats_geojson_to_df(
    input_geojson_filepath: Path | str,
    external_id_column=None,
    remove_geom=False,
    national_codes=None,
    unit_type="ha",
) -> pd.DataFrame:
    """
    Convert a GeoJSON file to a pandas DataFrame of Whisp statistics
    (raw GEE output; no schema validation step).

    Parameters
    ----------
    input_geojson_filepath : Path | str
        Filepath to the GeoJSON of the ROI to analyze.
    external_id_column : str, optional
        Name of the external ID column, by default None.
    remove_geom : bool, optional
        Whether to drop the geometry column, by default False.
    national_codes : list, optional
        ISO2 country codes whose national datasets should be included.
    unit_type : str, optional
        "ha" for hectares or "percent" for percentages, by default "ha".

    Returns
    -------
    pd.DataFrame
        DataFrame containing the Whisp stats for the input ROI.
    """
    roi_fc = convert_geojson_to_ee(str(input_geojson_filepath))

    return whisp_stats_ee_to_df(
        roi_fc,
        external_id_column,
        remove_geom,
        national_codes=national_codes,
        unit_type=unit_type,
    )
254
-
255
-
256
def whisp_stats_geojson_to_ee(
    input_geojson_filepath: Path | str,
    external_id_column=None,
    national_codes=None,
) -> ee.FeatureCollection:
    """
    Convert a GeoJSON file to an Earth Engine FeatureCollection carrying
    Whisp statistics (computation stays server-side).

    Parameters
    ----------
    input_geojson_filepath : Path | str
        Filepath to the GeoJSON of the ROI to analyze.
    external_id_column : str, optional
        Name of the external ID column, by default None.
    national_codes : list, optional
        ISO2 country codes whose national datasets should be included.

    Returns
    -------
    ee.FeatureCollection
        Feature collection containing the Whisp stats for the input ROI.
    """
    roi_fc = convert_geojson_to_ee(str(input_geojson_filepath))
    return whisp_stats_ee_to_ee(
        roi_fc, external_id_column, national_codes=national_codes
    )
283
-
284
-
285
def whisp_stats_geojson_to_geojson(
    input_geojson_filepath,
    output_geojson_filepath,
    external_id_column=None,
    national_codes=None,
    unit_type="ha",
):
    """
    Run the (unvalidated) Whisp analysis on a GeoJSON file and save the
    resulting feature collection back out as GeoJSON.

    Parameters
    ----------
    input_geojson_filepath : str
        Filepath of the input GeoJSON.
    output_geojson_filepath : str
        Filepath where the output GeoJSON is written.
    external_id_column : str, optional
        Column containing external IDs, by default None.
    national_codes : list, optional
        ISO2 country codes whose national datasets should be included.
    unit_type : str, optional
        "ha" for hectares or "percent" for percentages, by default "ha".

    Returns
    -------
    None
    """
    # GeoJSON -> EE feature collection.
    roi_fc = convert_geojson_to_ee(input_geojson_filepath)

    # Compute stats server-side.
    stats_fc = whisp_stats_ee_to_ee(
        roi_fc,
        external_id_column,
        national_codes=national_codes,
        unit_type=unit_type,
    )

    # Pull the result back as GeoJSON and write it to disk.
    stats_geojson = convert_ee_to_geojson(stats_fc)
    with open(output_geojson_filepath, "w") as out_file:
        json.dump(stats_geojson, out_file, indent=2)
329
-
330
-
331
def whisp_stats_geojson_to_drive(
    input_geojson_filepath: Path | str,
    external_id_column=None,
    national_codes=None,
    unit_type="ha",
):
    """
    Export Whisp statistics for a GeoJSON file to Google Drive.

    Parameters
    ----------
    input_geojson_filepath : Path | str
        Filepath to the GeoJSON of the ROI to analyze.
    external_id_column : str, optional
        Name of the external ID column, by default None.
    national_codes : list, optional
        ISO2 country codes whose national datasets should be included.
    unit_type : str, optional
        "ha" for hectares or "percent" for percentages, by default "ha".

    Returns
    -------
    Message showing location of file in Google Drive (None; the export
    progress message is printed by whisp_stats_ee_to_drive).

    Raises
    ------
    FileNotFoundError
        If `input_geojson_filepath` does not exist.
    """
    input_geojson_filepath = Path(input_geojson_filepath)

    # Bug fix: this check used to live inside the try block below, so the
    # FileNotFoundError it raised was immediately caught by this function's
    # own broad `except` and merely printed — a missing input file was
    # silently swallowed. A bad path is a caller error and must propagate.
    if not input_geojson_filepath.exists():
        raise FileNotFoundError(f"File {input_geojson_filepath} does not exist.")

    try:
        feature_collection = convert_geojson_to_ee(str(input_geojson_filepath))

        return whisp_stats_ee_to_drive(
            feature_collection,
            external_id_column,
            national_codes=national_codes,
            unit_type=unit_type,
        )
    except Exception as e:
        # Best-effort reporting for unexpected conversion/export failures,
        # matching the original print-don't-raise behavior for this stage.
        print(f"An error occurred: {e}")
371
-
372
-
373
def whisp_stats_ee_to_ee(
    feature_collection,
    external_id_column,
    national_codes=None,
    unit_type="ha",
    keep_properties=None,
):
    """
    Process a feature collection to get statistics for each feature.

    Parameters:
        feature_collection (ee.FeatureCollection): The input feature collection.
        external_id_column (str): The name of the external ID column to check.
        national_codes (list, optional): List of ISO2 country codes to include national datasets.
        unit_type (str): Whether to use hectares ("ha") or percentage ("percent"), default "ha".
        keep_properties (None, bool, or list, optional): Properties to keep from the input features.
            - None: Remove all properties (default behavior)
            - True: Keep all properties
            - list: Keep only the specified properties

    Returns:
        ee.FeatureCollection: The output feature collection with statistics,
        with an incremental plot id added to each feature.

    Raises:
        ValueError: If `external_id_column` fails validation or
            `keep_properties` is of an unsupported type.
    """
    if external_id_column is not None:
        try:
            # Validate that the external_id_column exists in all features.
            validation_result = validate_external_id_column(
                feature_collection, external_id_column
            )
            if not validation_result["is_valid"]:
                raise ValueError(validation_result["error_message"])

            # Handle property selection first, always preserving the
            # external_id_column itself.
            if keep_properties is not None:
                # Fixed E712-style comparison: `== True` would also accept 1;
                # identity check matches the documented True sentinel exactly.
                if keep_properties is True:
                    pass  # keep all properties, nothing to select
                elif isinstance(keep_properties, list):
                    if external_id_column not in keep_properties:
                        keep_properties = keep_properties + [external_id_column]
                    feature_collection = feature_collection.select(keep_properties)
                else:
                    raise ValueError(
                        "keep_properties must be None, True, or a list of property names."
                    )

            # Copy the external id into a standardized "external_id" property,
            # replacing server-side nulls with "unknown".
            def set_external_id_safely_and_clean(feature):
                external_id_value = feature.get(external_id_column)
                external_id_value = ee.Algorithms.If(
                    ee.Algorithms.IsEqual(external_id_value, None),
                    "unknown",
                    ee.String(external_id_value),
                )
                # NOTE: the standardized name "external_id" is used, not the
                # original external_id_column name.
                return ee.Feature(feature.set("external_id", external_id_value))

            feature_collection = feature_collection.map(
                set_external_id_safely_and_clean
            )

            # With no keep_properties requested, strip everything except the
            # standardized id (geometry is always retained by select()).
            if keep_properties is None:
                feature_collection = feature_collection.select(["external_id"])

        except Exception as e:
            print(
                f"An error occurred when trying to set the external_id_column: {external_id_column}. Error: {e}"
            )
            raise  # bare raise preserves the original traceback
    else:
        feature_collection = _keep_fc_properties(feature_collection, keep_properties)

    fc = get_stats(
        feature_collection, national_codes=national_codes, unit_type=unit_type
    )

    return add_id_to_feature_collection(dataset=fc, id_name=plot_id_column)
456
-
457
-
458
def _keep_fc_properties(feature_collection, keep_properties):
    """
    Filter the properties of a feature collection.

    Parameters:
        feature_collection (ee.FeatureCollection): The collection to filter.
        keep_properties (None, bool, or list):
            - None: drop all properties (select an empty list)
            - True: keep all properties (as found on the first feature)
            - list: keep only the named properties

    Returns:
        ee.FeatureCollection: The collection with the requested properties.

    Raises:
        ValueError: If keep_properties is of an unsupported type.
    """
    if keep_properties is None:
        feature_collection = feature_collection.select([])
    # Fixed E712-style comparison: `== True` also matched 1; identity check
    # matches the documented True sentinel exactly (consistent with
    # whisp_stats_ee_to_ee).
    elif keep_properties is True:
        # Property names are taken from the first feature; this triggers a
        # client-side getInfo() round trip.
        first_feature_props = feature_collection.first().propertyNames().getInfo()
        feature_collection = feature_collection.select(first_feature_props)
    elif isinstance(keep_properties, list):
        feature_collection = feature_collection.select(keep_properties)
    else:
        raise ValueError(
            "keep_properties must be None, True, or a list of property names."
        )
    return feature_collection
473
-
474
-
475
def whisp_stats_ee_to_df(
    feature_collection: ee.FeatureCollection,
    external_id_column=None,
    remove_geom=False,
    national_codes=None,
    unit_type="ha",
) -> pd.DataFrame:
    """
    Convert a Google Earth Engine FeatureCollection to a pandas DataFrame
    and derive ISO2 country codes from the ISO3 column.

    Parameters
    ----------
    feature_collection : ee.FeatureCollection
        The input FeatureCollection to analyze.
    external_id_column : str, optional
        Name of the external ID column, by default None.
    remove_geom : bool, optional
        Whether to drop the geometry column, by default False.
    national_codes : list, optional
        ISO2 country codes whose national datasets should be included.
    unit_type : str, optional
        "ha" for hectares or "percent" for percentages, by default "ha".

    Returns
    -------
    pd.DataFrame
        DataFrame of Whisp stats for the input ROI; an EMPTY DataFrame if
        either the EE download or the ISO3->ISO2 conversion fails (errors
        are printed, not raised).
    """
    # Stage 1: run the server-side analysis and pull the table client-side.
    try:
        stats_fc = whisp_stats_ee_to_ee(
            feature_collection,
            external_id_column,
            national_codes=national_codes,
            unit_type=unit_type,
        )
        df_stats = convert_ee_to_df(ee_object=stats_fc, remove_geom=remove_geom)
    except Exception as e:
        print(f"An error occurred during the conversion from EE to DataFrame: {e}")
        return pd.DataFrame()  # Return an empty DataFrame in case of error

    # Stage 2: add the ISO2 country column derived from the ISO3 one.
    try:
        df_stats = convert_iso3_to_iso2(
            df=df_stats,
            iso3_column=iso3_country_column,
            iso2_column=iso2_country_column,
        )
    except Exception as e:
        print(f"An error occurred during the ISO3 to ISO2 conversion: {e}")
        return pd.DataFrame()  # Return an empty DataFrame in case of error

    return df_stats
528
-
529
-
530
def whisp_stats_ee_to_drive(
    feature_collection: ee.FeatureCollection,
    external_id_column=None,
    national_codes=None,
    unit_type="ha",
):
    """
    Export Whisp statistics for a feature collection to Google Drive as CSV.

    Parameters
    ----------
    feature_collection : ee.FeatureCollection
        The feature collection to analyze.
    external_id_column : str, optional
        Name of the external ID column, by default None.
    national_codes : list, optional
        ISO2 country codes whose national datasets should be included.
    unit_type : str, optional
        "ha" for hectares or "percent" for percentages, by default "ha".

    Returns
    -------
    None
        Errors are printed rather than raised.
    """
    try:
        # Build the stats collection server-side, then hand it to an
        # asynchronous EE export task (progress visible in the EE task list).
        stats_fc = whisp_stats_ee_to_ee(
            feature_collection,
            external_id_column,
            national_codes=national_codes,
            unit_type=unit_type,
        )
        export_task = ee.batch.Export.table.toDrive(
            collection=stats_fc,
            description="whisp_output_table",
            fileFormat="CSV",
        )
        export_task.start()
        print(
            "Exporting to Google Drive: 'whisp_results/whisp_output_table.csv'. To track progress: https://code.earthengine.google.com/tasks"
        )
    except Exception as e:
        print(f"An error occurred during the export: {e}")
571
-
572
-
573
- #### main stats functions
574
-
575
-
576
- # Get stats for a feature or feature collection
577
# Get stats for a feature or feature collection
def get_stats(feature_or_feature_col, national_codes=None, unit_type="ha"):
    """
    Get stats for a feature or feature collection with optional filtering by
    national codes.

    Parameters
    ----------
    feature_or_feature_col : ee.Feature or ee.FeatureCollection
        The input feature or feature collection to analyze.
    national_codes : list, optional
        List of ISO2 country codes to include national datasets.
    unit_type : str, optional
        Whether to use hectares ("ha") or percentage ("percent"), by default "ha".

    Returns
    -------
    ee.FeatureCollection
        Feature collection with calculated statistics.

    Raises
    ------
    TypeError
        If the input is neither an ee.Feature nor an ee.FeatureCollection.
    """
    if isinstance(feature_or_feature_col, ee.Feature):
        # Single feature: combine datasets (honoring the national_codes
        # filter) and wrap the per-feature result in a FeatureCollection.
        img_combined = combine_datasets(national_codes=national_codes)
        return ee.FeatureCollection(
            [
                get_stats_feature(
                    feature_or_feature_col, img_combined, unit_type=unit_type
                )
            ]
        )

    if isinstance(feature_or_feature_col, ee.FeatureCollection):
        return get_stats_fc(
            feature_or_feature_col, national_codes=national_codes, unit_type=unit_type
        )

    # Bug fix: this branch previously RETURNED an error string, which callers
    # (e.g. whisp_stats_ee_to_ee) passed straight into FeatureCollection
    # post-processing, producing a confusing downstream crash. Fail fast
    # with a clear exception instead.
    raise TypeError("Check inputs: not an ee.Feature or ee.FeatureCollection")
615
-
616
-
617
- # Get statistics for a feature collection
618
# Get statistics for a feature collection
def get_stats_fc(feature_col, national_codes=None, unit_type="ha"):
    """
    Calculate statistics for a feature collection using Whisp datasets.

    Parameters
    ----------
    feature_col : ee.FeatureCollection
        The input feature collection to analyze.
    national_codes : list, optional
        ISO2 country codes (e.g., ["BR", "US"]) whose national datasets
        should be included alongside the global ones. If None (default),
        only global datasets are used.
    unit_type : str, optional
        "ha" for hectares or "percent" for percentages, by default "ha".

    Returns
    -------
    ee.FeatureCollection
        Feature collection with calculated statistics.
    """
    # Build the combined analysis image once, outside the per-feature map.
    img_combined = combine_datasets(national_codes=national_codes)

    def _per_feature(feature):
        return get_stats_feature(feature, img_combined, unit_type=unit_type)

    return ee.FeatureCollection(feature_col.map(_per_feature))
651
-
652
-
653
- # Get statistics for a single feature
654
-
655
-
656
def get_stats_feature(feature, img_combined, unit_type="ha"):
    """
    Get statistics for a single feature using a pre-combined image.

    Sums each band of `img_combined` over the feature's geometry at 10 m
    scale, converts the sums to hectares (and optionally to percent of the
    plot area), and attaches them — together with location/water-flag info —
    as properties on the returned feature.

    Parameters
    ----------
    feature : ee.Feature
        The feature to analyze
    img_combined : ee.Image
        Pre-combined image with all the datasets
    unit_type : str, optional
        Whether to use hectares ("ha") or percentage ("percent"), by default "ha".

    Returns
    -------
    ee.Feature
        Feature with calculated statistics
    """
    # Per-band pixel sums (in m², assuming pixel-area-weighted bands) over
    # the plot geometry. NOTE: `reduce` shadows the builtin of the same name.
    reduce = img_combined.reduceRegion(
        reducer=ee.Reducer.sum(),
        geometry=feature.geometry(),
        scale=10,
        maxPixels=1e10,
        tileScale=8,
    )

    # Get basic feature information (country, admin-1, geometry type,
    # centroid coordinates, water flag).
    feature_info = get_type_and_location(feature)

    # add statistics unit type (e.g., percentage or hectares) to dictionary
    stats_unit_type = ee.Dictionary({stats_unit_type_column: unit_type})

    # Convert every summed value from m² to hectares (divide by 10,000)
    # and round via the configured area format string.
    reduce_ha = reduce.map(
        lambda key, val: divide_and_format(ee.Number(val), ee.Number(10000))
    )

    # Total plot area in hectares, used as the denominator for percentages.
    area_ha = ee.Number(ee.Dictionary(reduce_ha).get(geometry_area_column))

    # Express each hectare value as a percentage of the plot area.
    reduce_percent = reduce_ha.map(
        lambda key, val: percent_and_format(ee.Number(val), area_ha)
    )

    # Reformat the hectare statistics: the area column itself is re-set with
    # the dedicated area formatting.
    reducer_stats_ha = reduce_ha.set(
        geometry_area_column, area_ha.format(geometry_area_column_formatting)
    )  # area ha (formatted)

    # Same for the percentage statistics (area stays in hectares).
    reducer_stats_percent = reduce_percent.set(
        geometry_area_column, area_ha.format(geometry_area_column_formatting)
    )  # area ha (formatted)

    # Add country/location info onto hectare analysis results.
    properties_ha = feature_info.combine(ee.Dictionary(reducer_stats_ha)).combine(
        stats_unit_type
    )

    # Add country/location info onto percentage analysis results.
    properties_percent = feature_info.combine(
        ee.Dictionary(reducer_stats_percent)
    ).combine(stats_unit_type)

    # unit_type is a client-side Python string, so `unit_type == "ha"` is
    # evaluated client-side before being handed to ee.Algorithms.If.
    out_feature = ee.Algorithms.If(
        unit_type == "ha",
        feature.set(properties_ha),  # .setGeometry(None),
        feature.set(properties_percent),  # .setGeometry(None),
    )

    return out_feature
729
-
730
-
731
- # Get basic feature information - uses admin and water datasets in gee.
732
# Get basic feature information - uses admin and water datasets in gee.
def get_type_and_location(feature):
    """
    Extract basic feature information: country (ISO3), admin-1 name,
    geometry type, centroid coordinates, and a water flag.

    Parameters
    ----------
    feature : ee.Feature
        Feature whose centroid is used for all point lookups.

    Returns
    -------
    ee.Dictionary
        Combined dictionary of location/type/water-flag properties.
    """
    # All lookups are done at the feature's centroid (1 m max error).
    centroid = feature.geometry().centroid(1)

    # Country (shapeGroup) and admin level 1 (shapeName) from geoBoundaries.
    location = ee.Dictionary(get_geoboundaries_info(centroid))
    country = ee.Dictionary({iso3_country_column: location.get("shapeGroup")})
    admin_1 = ee.Dictionary({admin_1_column: location.get("shapeName")})

    # Water flag sampled at the centroid (ocean + permanent inland water).
    water_flag_dict = value_at_point_flag(
        point=centroid,
        image=water_flag_all_prep(),
        band_name=water_flag,
        output_name=water_flag,
    )

    # Geometry type of the input feature (e.g. Polygon, Point).
    geom_type = ee.Dictionary({geometry_type_column: feature.geometry().type()})

    # Centroid longitude (x) / latitude (y).
    centroid_coords = centroid.coordinates()
    coords_dict = ee.Dictionary(
        {
            centroid_x_coord_column: centroid_coords.get(0),  # Longitude
            centroid_y_coord_column: centroid_coords.get(1),  # Latitude
        }
    )

    # Merge everything into one dictionary.
    return (
        country.combine(admin_1)
        .combine(geom_type)
        .combine(coords_dict)
        .combine(water_flag_dict)
    )
773
-
774
-
775
- # Define a function to divide each value by 10,000 and format it with one decimal place
776
# Define a function to divide each value by 10,000 and format it with one decimal place
def divide_and_format(val, unit):
    """
    Divide `val` by `unit` and round it via the configured area format
    string (round-trips through formatting, then parses back to a number).
    """
    quotient = ee.Number(val).divide(ee.Number(unit))
    as_text = ee.Number(quotient).format(stats_area_columns_formatting)
    return ee.Number(ee.Number.parse(as_text))
785
-
786
-
787
- # Define a function to divide by total area of geometry and multiply by 100
788
# Define a function to divide by total area of geometry and multiply by 100
def percent_and_format(val, area_ha):
    """
    Express `val` as a percentage of `area_ha`, rounded via the configured
    percentage format string (format then parse back to a number).
    """
    fraction = ee.Number(val).divide(area_ha).multiply(ee.Number(100))
    as_text = ee.Number(fraction).format(stats_percent_columns_formatting)
    return ee.Number(ee.Number.parse(as_text))
796
-
797
-
798
- # geoboundaries - admin units from a freqently updated database, allows commercial use (CC BY 4.0 DEED) (disputed territories may need checking)
799
# geoboundaries - admin units from a freqently updated database, allows commercial use (CC BY 4.0 DEED) (disputed territories may need checking)
def get_geoboundaries_info(geometry):
    """
    Look up country (shapeGroup) and admin-1 name (shapeName) for a geometry
    from the geoBoundaries ADM1 dataset; falls back to "Unknown"/"Unknown"
    when no polygon intersects.
    """
    adm1_boundaries = ee.FeatureCollection("WM/geoLab/geoBoundaries/600/ADM1")
    intersecting = adm1_boundaries.filterBounds(geometry)
    fallback = ee.Dictionary({"shapeGroup": "Unknown", "shapeName": "Unknown"})
    return ee.Algorithms.If(
        intersecting.size().gt(0),
        intersecting.first().toDictionary().select(["shapeGroup", "shapeName"]),
        fallback,
    )
810
-
811
-
812
- #####
813
- # water flag - to flag plots that may be erroneous (i.e., where errors may have occured in their creation / translation and so fall in either the ocean or inland water -
814
def usgs_gsv_ocean_prep():  # TO DO: for speed export image as an asset at samne res as JRC
    """
    Build a self-masked image named "ocean" that is 1 everywhere EXCEPT the
    painted shoreline-vector land areas (mainlands, big and small islands).
    """
    # Load the three shoreline vector datasets.
    land_parts = [
        ee.FeatureCollection("projects/sat-io/open-datasets/shoreline/mainlands"),
        ee.FeatureCollection("projects/sat-io/open-datasets/shoreline/big_islands"),
        ee.FeatureCollection("projects/sat-io/open-datasets/shoreline/small_islands"),
    ]

    # Merge them into a single flat FeatureCollection.
    gsv = ee.FeatureCollection(land_parts).flatten()

    # Paint land over a constant-1 image so only non-land (ocean) keeps
    # value 1 after self-masking, then name the band.
    return ee.Image(1).paint(gsv).selfMask().rename("ocean")
835
-
836
-
837
def jrc_water_surface_prep():
    """
    Build a 0/1 image named "water_inland" marking permanent inland water
    from the JRC Global Surface Water "transition" band.
    """
    # Transition classes 1 ("Permanent"), 2 ("New Permanent") and
    # 7 ("Seasonal to Permanent") are mapped to 1; everything else to 0.
    transition = ee.Image("JRC/GSW1_4/GlobalSurfaceWater").select("transition")
    permanent_inland_water = transition.remap([1, 2, 7], [1, 1, 1], 0).unmask()

    # optional - clip to within coast line (not needed currently and extra processing)
    # permanent_inland_water = permanent_inland_water.where(usgs_gsv_ocean_prep(),0)

    return permanent_inland_water.rename("water_inland")
853
-
854
-
855
def water_flag_all_prep():
    """
    Combine ocean and permanent inland water into one flag image: pixels
    that are ocean OR permanent inland water are 1, named by `water_flag`.
    """
    ocean = usgs_gsv_ocean_prep().unmask()
    inland = jrc_water_surface_prep()
    return ocean.where(inland, 1).rename(water_flag)
863
-
864
-
865
def value_at_point_flag(point, image, band_name, output_name):
    """
    Sample `image` at `point` (30 m scale) and return an ee.Dictionary
    mapping `output_name` to the sampled `band_name` value.
    """
    # One-pixel sample at the point of interest.
    sampled = image.sample(region=point, scale=30, numPixels=1).first()

    # Raw band value; no thresholding applied here
    # (an earlier version mapped 1 -> "True" / else "False").
    flag_value = sampled.get(band_name)

    return ee.Dictionary({output_name: flag_value})
877
-
878
-
879
- def add_id_to_feature_collection(dataset, id_name):
880
- """
881
- Adds an incremental (1,2,3 etc) 'id' property to each feature in the given FeatureCollection.
882
-
883
- Args:
884
- - dataset: ee.FeatureCollection, the FeatureCollection to operate on.
885
-
886
- Returns:
887
- - dataset_with_id: ee.FeatureCollection, the FeatureCollection with 'id' property added to each feature.
888
- """
889
- # Get the list of system:index values
890
- indexes = dataset.aggregate_array("system:index")
891
-
892
- # Create a sequence of numbers starting from 1 to the size of indexes
893
- ids = ee.List.sequence(1, indexes.size())
894
-
895
- # Create a dictionary mapping system:index to id
896
- id_by_index = ee.Dictionary.fromLists(indexes, ids)
897
-
898
- # Function to add 'id' property to each feature
899
- def add_id(feature):
900
- # Get the system:index of the feature
901
- system_index = feature.get("system:index")
902
-
903
- # Get the id corresponding to the system:index
904
- feature_id = id_by_index.get(system_index)
905
-
906
- # Set the 'id' property of the feature
907
- return feature.set(id_name, feature_id)
908
-
909
- # Map the add_id function over the dataset
910
- dataset_with_id = dataset.map(add_id)
911
-
912
- return dataset_with_id
913
-
914
-
915
- # Function to add ID to features
916
- def add_id_to_feature(feature, id_name):
917
- index = feature.get("system:index")
918
- return feature.set(id_name, index)
919
-
920
-
921
- # Function to flag positive values
922
- def flag_positive_values(feature, flag_positive):
923
- for prop_name in flag_positive:
924
- flag_value = ee.Algorithms.If(
925
- ee.Number(feature.get(prop_name)).gt(0), "True", "-"
926
- )
927
- feature = feature.set(prop_name, flag_value)
928
- return feature
929
-
930
-
931
- # Function to exclude properties
932
- def copy_properties_and_exclude(feature, exclude_properties_from_output):
933
- return ee.Feature(feature.geometry()).copyProperties(
934
- source=feature, exclude=exclude_properties_from_output
935
- )
936
-
937
-
938
- def ee_image_checker(image):
939
- """
940
- Tests if the input is a valid ee.Image.
941
-
942
- Args:
943
- image: An ee.Image object.
944
-
945
- Returns:
946
- bool: True if the input is a valid ee.Image, False otherwise.
947
- """
948
- try:
949
- if ee.Algorithms.ObjectType(image).getInfo() == "Image":
950
- # Trigger some action on the image to ensure it's a valid image
951
- image.getInfo() # This will raise an exception if the image is invalid
952
- return True
953
- except ee.EEException as e:
954
- print(f"Image validation failed with EEException: {e}")
955
- except Exception as e:
956
- print(f"Image validation failed with exception: {e}")
957
- return False
958
-
959
-
960
- def keep_valid_images(image_list):
961
- """
962
- Filters a list to return only valid ee.Images.
963
-
964
- Args:
965
- image_list: List of ee.Image objects.
966
-
967
- Returns:
968
- list: List of valid ee.Image objects.
969
- """
970
- valid_imgs = []
971
- for image in image_list:
972
- if ee_image_checker(image):
973
- valid_imgs.append(image)
974
- return valid_imgs
975
-
976
-
977
- def convert_iso3_to_iso2(df, iso3_column, iso2_column):
978
- """
979
- Converts ISO3 country codes to ISO2 codes and adds a new column to the DataFrame.
980
-
981
- Args:
982
- df (pd.DataFrame): Input DataFrame containing ISO3 country codes.
983
- iso3_column (str): The column name in the DataFrame with ISO3 country codes.
984
- iso2_column (str): The new column name to store ISO2 country codes.
985
-
986
- Returns:
987
- pd.DataFrame: Updated DataFrame with the new ISO2 column.
988
- """
989
- import country_converter as coco
990
-
991
- # Apply conversion from ISO3 to ISO2
992
- df[iso2_column] = df[iso3_column].apply(
993
- lambda x: (
994
- coco.convert(names=x, to="ISO2") if x else "not found (disputed territory)"
995
- )
996
- )
997
-
998
- return df
999
-
1000
-
1001
- def validate_external_id_column(feature_collection, external_id_column):
1002
- """
1003
- Validates that the external_id_column exists in all features of the collection.
1004
-
1005
- Parameters
1006
- ----------
1007
- feature_collection : ee.FeatureCollection
1008
- The feature collection to validate
1009
- external_id_column : str
1010
- The name of the external ID column to check
1011
-
1012
- Returns
1013
- -------
1014
- dict
1015
- Dictionary with validation results including:
1016
- - 'is_valid': bool indicating if column exists in all features
1017
- - 'total_features': int total number of features
1018
- - 'features_with_column': int number of features that have the column
1019
- - 'available_properties': list of properties available in first feature
1020
- - 'error_message': str error message if validation fails
1021
- """
1022
- try:
1023
- # Get total number of features
1024
- total_features = feature_collection.size().getInfo()
1025
-
1026
- if total_features == 0:
1027
- return {
1028
- "is_valid": False,
1029
- "total_features": 0,
1030
- "features_with_column": 0,
1031
- "available_properties": [],
1032
- "error_message": "Feature collection is empty",
1033
- }
1034
-
1035
- # Get available properties from first feature
1036
- first_feature_props = feature_collection.first().propertyNames().getInfo()
1037
-
1038
- # Check if external_id_column exists in all features
1039
- def check_column_exists(feature):
1040
- has_column = feature.propertyNames().contains(external_id_column)
1041
- return feature.set("_has_external_id", has_column)
1042
-
1043
- features_with_check = feature_collection.map(check_column_exists)
1044
- features_with_column = (
1045
- features_with_check.filter(ee.Filter.eq("_has_external_id", True))
1046
- .size()
1047
- .getInfo()
1048
- )
1049
-
1050
- is_valid = features_with_column == total_features
1051
-
1052
- error_message = None
1053
- if not is_valid:
1054
- missing_count = total_features - features_with_column
1055
- error_message = (
1056
- f"The column '{external_id_column}' is missing from {missing_count} "
1057
- f"out of {total_features} features in the collection. "
1058
- f"Available properties in first feature: {first_feature_props}"
1059
- )
1060
-
1061
- return {
1062
- "is_valid": is_valid,
1063
- "total_features": total_features,
1064
- "features_with_column": features_with_column,
1065
- "available_properties": first_feature_props,
1066
- "error_message": error_message,
1067
- }
1068
-
1069
- except Exception as e:
1070
- return {
1071
- "is_valid": False,
1072
- "total_features": 0,
1073
- "features_with_column": 0,
1074
- "available_properties": [],
1075
- "error_message": f"Error during validation: {str(e)}",
1076
- }
1077
-
1078
-
1079
- def debug_feature_collection_properties(feature_collection, max_features=5):
1080
- """
1081
- Debug helper function to inspect the properties of features in a collection.
1082
-
1083
- Parameters
1084
- ----------
1085
- feature_collection : ee.FeatureCollection
1086
- The feature collection to inspect
1087
- max_features : int, optional
1088
- Maximum number of features to inspect, by default 5
1089
-
1090
- Returns
1091
- -------
1092
- dict
1093
- Dictionary with debugging information about the feature collection
1094
- """
1095
- try:
1096
- total_features = feature_collection.size().getInfo()
1097
-
1098
- if total_features == 0:
1099
- return {"total_features": 0, "error": "Feature collection is empty"}
1100
-
1101
- # Limit the number of features to inspect
1102
- features_to_check = min(max_features, total_features)
1103
- limited_fc = feature_collection.limit(features_to_check)
1104
-
1105
- # Get properties for each feature
1106
- def get_feature_properties(feature):
1107
- return ee.Dictionary(
1108
- {
1109
- "properties": feature.propertyNames(),
1110
- "geometry_type": feature.geometry().type(),
1111
- }
1112
- )
1113
-
1114
- feature_info = limited_fc.map(get_feature_properties).getInfo()
1115
-
1116
- return {
1117
- "total_features": total_features,
1118
- "inspected_features": features_to_check,
1119
- "feature_details": [
1120
- {
1121
- "feature_index": i,
1122
- "properties": feature_info["features"][i]["properties"][
1123
- "properties"
1124
- ],
1125
- "geometry_type": feature_info["features"][i]["properties"][
1126
- "geometry_type"
1127
- ],
1128
- }
1129
- for i in range(len(feature_info["features"]))
1130
- ],
1131
- }
1132
-
1133
- except Exception as e:
1134
- return {"error": f"Error during debugging: {str(e)}"}
1
+ import ee
2
+ import pandas as pd
3
+ from pathlib import Path
4
+ from .datasets import combine_datasets
5
+ import json
6
+ import country_converter as coco
7
+ from openforis_whisp.parameters.config_runtime import (
8
+ plot_id_column,
9
+ external_id_column,
10
+ geometry_type_column,
11
+ geometry_area_column,
12
+ geometry_area_column_formatting,
13
+ centroid_x_coord_column,
14
+ centroid_y_coord_column,
15
+ iso3_country_column,
16
+ iso2_country_column,
17
+ admin_1_column,
18
+ stats_unit_type_column,
19
+ stats_area_columns_formatting,
20
+ stats_percent_columns_formatting,
21
+ water_flag,
22
+ )
23
+ from .data_conversion import (
24
+ convert_ee_to_df,
25
+ convert_geojson_to_ee,
26
+ convert_ee_to_geojson,
27
+ # convert_csv_to_geojson,
28
+ convert_df_to_geojson,
29
+ ) # copied functions from whisp-api and geemap (accessed 2024) to avoid dependency
30
+ from .reformat import (
31
+ validate_dataframe_using_lookups,
32
+ validate_dataframe_using_lookups_flexible,
33
+ )
34
+
35
+ # NB functions that included "formatted" in the name apply a schema for validation and reformatting of the output dataframe. The schema is created from lookup tables.
36
+
37
+
38
+ def whisp_formatted_stats_geojson_to_df(
39
+ input_geojson_filepath: Path | str,
40
+ external_id_column=None,
41
+ remove_geom=False,
42
+ national_codes=None,
43
+ unit_type="ha",
44
+ whisp_image=None,
45
+ custom_bands=None, # New parameter
46
+ ) -> pd.DataFrame:
47
+ """
48
+ Main function for most users.
49
+ Converts a GeoJSON file to a pandas DataFrame containing Whisp stats for the input ROI.
50
+ Output df is validated against a panderas schema (created on the fly from the two lookup CSVs).
51
+
52
+ This function first converts the provided GeoJSON file into an Earth Engine FeatureCollection.
53
+ It then processes the FeatureCollection to extract relevant Whisp statistics,
54
+ returning a structured DataFrame that aligns with the expected schema.
55
+
56
+ If `external_id_column` is provided, it will be used to link external identifiers
57
+ from the input GeoJSON to the output DataFrame.
58
+
59
+ Parameters
60
+ ----------
61
+ input_geojson_filepath : Path | str
62
+ The filepath to the GeoJSON of the ROI to analyze.
63
+ external_id_column : str, optional
64
+ The column in the GeoJSON containing external IDs to be preserved in the output DataFrame.
65
+ This column must exist as a property in ALL features of the GeoJSON file.
66
+ Use debug_feature_collection_properties() to inspect available properties if you encounter errors.
67
+ remove_geom : bool, default=False
68
+ If True, the geometry of the GeoJSON is removed from the output DataFrame.
69
+ national_codes : list, optional
70
+ List of ISO2 country codes to include national datasets.
71
+ unit_type: str, optional
72
+ Whether to use hectares ("ha") or percentage ("percent"), by default "ha".
73
+ whisp_image : ee.Image, optional
74
+ Pre-combined multiband Earth Engine Image containing all Whisp datasets.
75
+ If provided, this image will be used instead of combining datasets based on national_codes.
76
+ If None, datasets will be combined automatically using national_codes parameter.
77
+ custom_bands : list or dict, optional
78
+ Custom band information for extra columns. Can be:
79
+ - List of band names: ['Aa_test', 'elevation']
80
+ - Dict with types: {'Aa_test': 'float64', 'elevation': 'float32'}
81
+ - None: preserves all extra columns automatically
82
+
83
+ Returns
84
+ -------
85
+ df_stats : pd.DataFrame
86
+ The DataFrame containing the Whisp stats for the input ROI.
87
+ """
88
+ feature_collection = convert_geojson_to_ee(str(input_geojson_filepath))
89
+
90
+ return whisp_formatted_stats_ee_to_df(
91
+ feature_collection,
92
+ external_id_column,
93
+ remove_geom,
94
+ national_codes=national_codes,
95
+ unit_type=unit_type,
96
+ whisp_image=whisp_image,
97
+ custom_bands=custom_bands, # Pass through
98
+ )
99
+
100
+
101
+ def whisp_formatted_stats_geojson_to_geojson(
102
+ input_geojson_filepath,
103
+ output_geojson_filepath,
104
+ external_id_column=None,
105
+ geo_column: str = "geo",
106
+ national_codes=None,
107
+ unit_type="ha",
108
+ whisp_image=None, # New parameter
109
+ ):
110
+ """
111
+ Convert a formatted GeoJSON file with a geo column into a GeoJSON file containing Whisp stats.
112
+
113
+ Parameters
114
+ ----------
115
+ input_geojson_filepath : str
116
+ The filepath to the input GeoJSON file.
117
+ output_geojson_filepath : str
118
+ The filepath to save the output GeoJSON file.
119
+ external_id_column : str, optional
120
+ The name of the column containing external IDs, by default None.
121
+ geo_column : str, optional
122
+ The name of the column containing GeoJSON geometries, by default "geo".
123
+ national_codes : list, optional
124
+ List of ISO2 country codes to include national datasets.
125
+ unit_type : str, optional
126
+ Whether to use hectares ("ha") or percentage ("percent"), by default "ha".
127
+ whisp_image : ee.Image, optional
128
+ Pre-combined multiband Earth Engine Image containing all Whisp datasets.
129
+
130
+ Returns
131
+ -------
132
+ None
133
+ """
134
+ df = whisp_formatted_stats_geojson_to_df(
135
+ input_geojson_filepath=input_geojson_filepath,
136
+ external_id_column=external_id_column,
137
+ national_codes=national_codes,
138
+ unit_type=unit_type,
139
+ whisp_image=whisp_image, # Pass through
140
+ )
141
+ # Convert the df to GeoJSON
142
+ convert_df_to_geojson(df, output_geojson_filepath, geo_column)
143
+
144
+ print(f"GeoJSON with Whisp stats saved to {output_geojson_filepath}")
145
+
146
+
147
+ def whisp_formatted_stats_ee_to_geojson(
148
+ feature_collection: ee.FeatureCollection,
149
+ output_geojson_filepath: str,
150
+ external_id_column=None,
151
+ geo_column: str = "geo",
152
+ national_codes=None,
153
+ unit_type="ha",
154
+ whisp_image=None, # New parameter
155
+ ):
156
+ """
157
+ Convert an Earth Engine FeatureCollection to a GeoJSON file containing Whisp stats.
158
+
159
+ Parameters
160
+ ----------
161
+ feature_collection : ee.FeatureCollection
162
+ The feature collection of the ROI to analyze.
163
+ output_geojson_filepath : str
164
+ The filepath to save the output GeoJSON file.
165
+ external_id_column : str, optional
166
+ The name of the column containing external IDs, by default None.
167
+ geo_column : str, optional
168
+ The name of the column containing GeoJSON geometries, by default "geo".
169
+ national_codes : list, optional
170
+ List of ISO2 country codes to include national datasets.
171
+ unit_type : str, optional
172
+ Whether to use hectares ("ha") or percentage ("percent"), by default "ha".
173
+ whisp_image : ee.Image, optional
174
+ Pre-combined multiband Earth Engine Image containing all Whisp datasets.
175
+ Returns
176
+ -------
177
+ None
178
+ """
179
+ # Convert ee feature collection to a pandas dataframe
180
+ df_stats = whisp_formatted_stats_ee_to_df(
181
+ feature_collection,
182
+ external_id_column,
183
+ national_codes=national_codes,
184
+ unit_type=unit_type,
185
+ whisp_image=whisp_image, # Pass through
186
+ )
187
+
188
+ # Convert the df to GeoJSON
189
+ convert_df_to_geojson(df_stats, output_geojson_filepath, geo_column)
190
+
191
+ print(f"GeoJSON with Whisp stats saved to {output_geojson_filepath}")
192
+
193
+
194
+ def whisp_formatted_stats_ee_to_df(
195
+ feature_collection: ee.FeatureCollection,
196
+ external_id_column=None,
197
+ remove_geom=False,
198
+ national_codes=None,
199
+ unit_type="ha",
200
+ whisp_image=None,
201
+ custom_bands=None, # New parameter
202
+ ) -> pd.DataFrame:
203
+ """
204
+ Convert a feature collection to a validated DataFrame with Whisp statistics.
205
+
206
+ Parameters
207
+ ----------
208
+ feature_collection : ee.FeatureCollection
209
+ The feature collection of the ROI to analyze.
210
+ external_id_column : str, optional
211
+ The name of the external ID column, by default None.
212
+ remove_geom : bool, optional
213
+ Whether to remove the geometry column, by default False.
214
+ national_codes : list, optional
215
+ List of ISO2 country codes to include national datasets.
216
+ unit_type : str, optional
217
+ Whether to use hectares ("ha") or percentage ("percent"), by default "ha".
218
+ whisp_image : ee.Image, optional
219
+ Pre-combined multiband Earth Engine Image containing all Whisp datasets.
220
+ custom_bands : list or dict, optional
221
+ Custom band information for extra columns.
222
+
223
+ Returns
224
+ -------
225
+ validated_df : pd.DataFrame
226
+ The validated dataframe containing the Whisp stats for the input ROI.
227
+ """
228
+ # Convert ee feature collection to a pandas dataframe
229
+ df_stats = whisp_stats_ee_to_df(
230
+ feature_collection,
231
+ external_id_column,
232
+ remove_geom,
233
+ national_codes=national_codes,
234
+ unit_type=unit_type,
235
+ whisp_image=whisp_image,
236
+ )
237
+
238
+ # Use flexible validation that handles custom bands
239
+ validated_df = validate_dataframe_using_lookups_flexible(
240
+ df_stats, national_codes=national_codes, custom_bands=custom_bands
241
+ )
242
+ return validated_df
243
+
244
+
245
+ ### functions without additional formatting below (i.e., raw output from GEE processing without schema validation step)
246
+
247
+
248
+ def whisp_stats_geojson_to_df(
249
+ input_geojson_filepath: Path | str,
250
+ external_id_column=None,
251
+ remove_geom=False,
252
+ national_codes=None,
253
+ unit_type="ha",
254
+ whisp_image=None, # New parameter
255
+ ) -> pd.DataFrame:
256
+ """
257
+ Convert a GeoJSON file to a pandas DataFrame with Whisp statistics.
258
+
259
+ Parameters
260
+ ----------
261
+ input_geojson_filepath : Path | str
262
+ The filepath to the GeoJSON of the ROI to analyze.
263
+ external_id_column : str, optional
264
+ The name of the external ID column, by default None.
265
+ remove_geom : bool, optional
266
+ Whether to remove the geometry column, by default False.
267
+ national_codes : list, optional
268
+ List of ISO2 country codes to include national datasets.
269
+ unit_type : str, optional
270
+ Whether to use hectares ("ha") or percentage ("percent"), by default "ha".
271
+ whisp_image : ee.Image, optional
272
+ Pre-combined multiband Earth Engine Image containing all Whisp datasets.
273
+
274
+ Returns
275
+ -------
276
+ df_stats : pd.DataFrame
277
+ The dataframe containing the Whisp stats for the input ROI.
278
+ """
279
+ feature_collection = convert_geojson_to_ee(str(input_geojson_filepath))
280
+
281
+ return whisp_stats_ee_to_df(
282
+ feature_collection,
283
+ external_id_column,
284
+ remove_geom,
285
+ national_codes=national_codes,
286
+ unit_type=unit_type,
287
+ whisp_image=whisp_image, # Pass through
288
+ )
289
+
290
+
291
+ def whisp_stats_geojson_to_ee(
292
+ input_geojson_filepath: Path | str,
293
+ external_id_column=None,
294
+ national_codes=None,
295
+ whisp_image=None, # New parameter
296
+ ) -> ee.FeatureCollection:
297
+ """
298
+ Convert a GeoJSON file to an Earth Engine FeatureCollection with Whisp statistics.
299
+
300
+ Parameters
301
+ ----------
302
+ input_geojson_filepath : Path | str
303
+ The filepath to the GeoJSON of the ROI to analyze.
304
+ external_id_column : str, optional
305
+ The name of the external ID column, by default None.
306
+ national_codes : list, optional
307
+ List of ISO2 country codes to include national datasets.
308
+ whisp_image : ee.Image, optional
309
+ Pre-combined multiband Earth Engine Image containing all Whisp datasets.
310
+
311
+ Returns
312
+ -------
313
+ ee.FeatureCollection
314
+ The feature collection containing the Whisp stats for the input ROI.
315
+ """
316
+ feature_collection = convert_geojson_to_ee(str(input_geojson_filepath))
317
+
318
+ return whisp_stats_ee_to_ee(
319
+ feature_collection,
320
+ external_id_column,
321
+ national_codes=national_codes,
322
+ whisp_image=whisp_image, # Pass through
323
+ )
324
+
325
+
326
+ def whisp_stats_geojson_to_geojson(
327
+ input_geojson_filepath,
328
+ output_geojson_filepath,
329
+ external_id_column=None,
330
+ national_codes=None,
331
+ unit_type="ha",
332
+ whisp_image=None, # New parameter
333
+ ):
334
+ """
335
+ Convert a GeoJSON file to a GeoJSON object containing Whisp stats for the input ROI.
336
+
337
+ Parameters
338
+ ----------
339
+ input_geojson_filepath : str
340
+ The filepath to the input GeoJSON file.
341
+ output_geojson_filepath : str
342
+ The filepath to save the output GeoJSON file.
343
+ external_id_column : str, optional
344
+ The name of the column containing external IDs, by default None.
345
+ national_codes : list, optional
346
+ List of ISO2 country codes to include national datasets.
347
+ unit_type : str, optional
348
+ Whether to use hectares ("ha") or percentage ("percent"), by default "ha".
349
+ whisp_image : ee.Image, optional
350
+ Pre-combined multiband Earth Engine Image containing all Whisp datasets.
351
+
352
+ Returns
353
+ -------
354
+ None
355
+ """
356
+ # Convert GeoJSON to Earth Engine FeatureCollection
357
+ feature_collection = convert_geojson_to_ee(input_geojson_filepath)
358
+
359
+ # Get stats as a FeatureCollection
360
+ stats_feature_collection = whisp_stats_ee_to_ee(
361
+ feature_collection,
362
+ external_id_column,
363
+ national_codes=national_codes,
364
+ unit_type=unit_type,
365
+ whisp_image=whisp_image, # Pass through
366
+ )
367
+
368
+ # Convert the stats FeatureCollection to GeoJSON
369
+ stats_geojson = convert_ee_to_geojson(stats_feature_collection)
370
+
371
+ # Save the GeoJSON to a file
372
+ with open(output_geojson_filepath, "w") as f:
373
+ json.dump(stats_geojson, f, indent=2)
374
+
375
+
376
+ def whisp_stats_geojson_to_drive(
377
+ input_geojson_filepath: Path | str,
378
+ external_id_column=None,
379
+ national_codes=None,
380
+ unit_type="ha",
381
+ whisp_image=None, # New parameter
382
+ ):
383
+ """
384
+ Export Whisp statistics for a GeoJSON file to Google Drive.
385
+
386
+ Parameters
387
+ ----------
388
+ input_geojson_filepath : Path | str
389
+ The filepath to the GeoJSON of the ROI to analyze.
390
+ external_id_column : str, optional
391
+ The name of the external ID column, by default None.
392
+ national_codes : list, optional
393
+ List of ISO2 country codes to include national datasets.
394
+ unit_type : str, optional
395
+ Whether to use hectares ("ha") or percentage ("percent"), by default "ha".
396
+ whisp_image : ee.Image, optional
397
+ Pre-combined multiband Earth Engine Image containing all Whisp datasets.
398
+
399
+ Returns
400
+ -------
401
+ Message showing location of file in Google Drive
402
+ """
403
+ try:
404
+ input_geojson_filepath = Path(input_geojson_filepath)
405
+ if not input_geojson_filepath.exists():
406
+ raise FileNotFoundError(f"File {input_geojson_filepath} does not exist.")
407
+
408
+ feature_collection = convert_geojson_to_ee(str(input_geojson_filepath))
409
+
410
+ return whisp_stats_ee_to_drive(
411
+ feature_collection,
412
+ external_id_column,
413
+ national_codes=national_codes,
414
+ unit_type=unit_type,
415
+ whisp_image=whisp_image, # Pass through
416
+ )
417
+
418
+ except Exception as e:
419
+ print(f"An error occurred: {e}")
420
+
421
+
422
+ def whisp_stats_ee_to_ee(
423
+ feature_collection,
424
+ external_id_column,
425
+ national_codes=None,
426
+ unit_type="ha",
427
+ keep_properties=None,
428
+ whisp_image=None, # New parameter
429
+ ):
430
+ """
431
+ Process a feature collection to get statistics for each feature.
432
+
433
+ Parameters:
434
+ feature_collection (ee.FeatureCollection): The input feature collection.
435
+ external_id_column (str): The name of the external ID column to check.
436
+ national_codes (list, optional): List of ISO2 country codes to include national datasets.
437
+ unit_type (str): Whether to use hectares ("ha") or percentage ("percent"), default "ha".
438
+ keep_properties (None, bool, or list, optional): Properties to keep from the input features.
439
+ - None: Remove all properties (default behavior)
440
+ - True: Keep all properties
441
+ - list: Keep only the specified properties
442
+ whisp_image (ee.Image, optional): Pre-combined multiband Earth Engine Image containing
443
+ all Whisp datasets. If provided, this image will be used instead of combining
444
+ datasets based on national_codes.
445
+
446
+ Returns:
447
+ ee.FeatureCollection: The output feature collection with statistics.
448
+ """
449
+ if external_id_column is not None:
450
+ try:
451
+ # Validate that the external_id_column exists in all features
452
+ validation_result = validate_external_id_column(
453
+ feature_collection, external_id_column
454
+ )
455
+
456
+ if not validation_result["is_valid"]:
457
+ raise ValueError(validation_result["error_message"])
458
+
459
+ # First handle property selection, but preserve the external_id_column
460
+ if keep_properties is not None:
461
+ if keep_properties == True:
462
+ # Keep all properties including external_id_column
463
+ pass # No need to modify feature_collection
464
+ elif isinstance(keep_properties, list):
465
+ # Ensure external_id_column is included in the list
466
+ if external_id_column not in keep_properties:
467
+ keep_properties = keep_properties + [external_id_column]
468
+ feature_collection = feature_collection.select(keep_properties)
469
+ else:
470
+ raise ValueError(
471
+ "keep_properties must be None, True, or a list of property names."
472
+ )
473
+
474
+ # Set the external_id with robust null handling
475
+ def set_external_id_safely_and_clean(feature):
476
+ external_id_value = feature.get(external_id_column)
477
+ # Use server-side null checking and string conversion
478
+ external_id_value = ee.Algorithms.If(
479
+ ee.Algorithms.IsEqual(external_id_value, None),
480
+ "unknown",
481
+ ee.String(external_id_value),
482
+ )
483
+ # Create a new feature with the standardized external_id column
484
+ # Note: we use "external_id" as the standardized column name, not the original external_id_column name
485
+ return ee.Feature(feature.set("external_id", external_id_value))
486
+
487
+ feature_collection = feature_collection.map(
488
+ set_external_id_safely_and_clean
489
+ )
490
+
491
+ # Finally, clean up to keep only geometry and external_id if keep_properties is None
492
+ if keep_properties is None:
493
+ feature_collection = feature_collection.select(["external_id"])
494
+
495
+ except Exception as e:
496
+ # Handle the exception and provide a helpful error message
497
+ print(
498
+ f"An error occurred when trying to set the external_id_column: {external_id_column}. Error: {e}"
499
+ )
500
+ raise e # Re-raise the exception to stop execution
501
+ else:
502
+ feature_collection = _keep_fc_properties(feature_collection, keep_properties)
503
+
504
+ fc = get_stats(
505
+ feature_collection,
506
+ national_codes=national_codes,
507
+ unit_type=unit_type,
508
+ whisp_image=whisp_image, # Pass through
509
+ )
510
+
511
+ return add_id_to_feature_collection(dataset=fc, id_name=plot_id_column)
512
+
513
+
514
+ def _keep_fc_properties(feature_collection, keep_properties):
515
+ # If keep_properties is specified, select only those properties
516
+ if keep_properties is None:
517
+ feature_collection = feature_collection.select([])
518
+ elif keep_properties == True:
519
+ # If keep_properties is true, select all properties
520
+ first_feature_props = feature_collection.first().propertyNames().getInfo()
521
+ feature_collection = feature_collection.select(first_feature_props)
522
+ elif isinstance(keep_properties, list):
523
+ feature_collection = feature_collection.select(keep_properties)
524
+ else:
525
+ raise ValueError(
526
+ "keep_properties must be None, True, or a list of property names."
527
+ )
528
+ return feature_collection
529
+
530
+
531
+ def whisp_stats_ee_to_df(
532
+ feature_collection: ee.FeatureCollection,
533
+ external_id_column=None,
534
+ remove_geom=False,
535
+ national_codes=None,
536
+ unit_type="ha",
537
+ whisp_image=None, # New parameter
538
+ ) -> pd.DataFrame:
539
+ """
540
+ Convert a Google Earth Engine FeatureCollection to a pandas DataFrame and convert ISO3 to ISO2 country codes.
541
+
542
+ Parameters
543
+ ----------
544
+ feature_collection : ee.FeatureCollection
545
+ The input FeatureCollection to analyze.
546
+ external_id_column : str, optional
547
+ The name of the external ID column, by default None.
548
+ remove_geom : bool, optional
549
+ Whether to remove the geometry column, by default True.
550
+ national_codes : list, optional
551
+ List of ISO2 country codes to include national datasets.
552
+ unit_type : str, optional
553
+ Whether to use hectares ("ha") or percentage ("percent"), by default "ha".
554
+ whisp_image : ee.Image, optional
555
+ Pre-combined multiband Earth Engine Image containing all Whisp datasets.
556
+
557
+ Returns
558
+ -------
559
+ df_stats : pd.DataFrame
560
+ The dataframe containing the Whisp stats for the input ROI.
561
+ """
562
+ # First, do the whisp processing to get the EE feature collection with stats
563
+ try:
564
+ stats_feature_collection = whisp_stats_ee_to_ee(
565
+ feature_collection,
566
+ external_id_column,
567
+ national_codes=national_codes,
568
+ unit_type=unit_type,
569
+ whisp_image=whisp_image, # Pass through
570
+ )
571
+ except Exception as e:
572
+ print(f"An error occurred during Whisp stats processing: {e}")
573
+ raise e
574
+
575
+ # Then, convert the EE feature collection to DataFrame
576
+ try:
577
+ df_stats = convert_ee_to_df(
578
+ ee_object=stats_feature_collection,
579
+ remove_geom=remove_geom,
580
+ )
581
+ except Exception as e:
582
+ print(f"An error occurred during the conversion from EE to DataFrame: {e}")
583
+ raise e
584
+
585
+ try:
586
+ df_stats = convert_iso3_to_iso2(
587
+ df=df_stats,
588
+ iso3_column=iso3_country_column,
589
+ iso2_column=iso2_country_column,
590
+ )
591
+ except Exception as e:
592
+ print(f"An error occurred during the ISO3 to ISO2 conversion: {e}")
593
+ return pd.DataFrame() # Return an empty DataFrame in case of error
594
+
595
+ return df_stats
596
+
597
+
598
def whisp_stats_ee_to_drive(
    feature_collection: ee.FeatureCollection,
    external_id_column=None,
    national_codes=None,
    unit_type="ha",
    whisp_image=None,
):
    """
    Export Whisp statistics for a feature collection to Google Drive.

    Parameters
    ----------
    feature_collection : ee.FeatureCollection
        The feature collection to analyze.
    external_id_column : str, optional
        The name of the external ID column, by default None.
    national_codes : list, optional
        List of ISO2 country codes to include national datasets.
    unit_type : str, optional
        Whether to use hectares ("ha") or percentage ("percent"), by default "ha".
    whisp_image : ee.Image, optional
        Pre-combined multiband Earth Engine Image containing all Whisp datasets.

    Returns
    -------
    ee.batch.Task or None
        The started export task, or None if starting the export failed.
        Returning the task is backward-compatible (callers previously got
        None) and allows polling progress via ``task.status()``.
    """
    try:
        task = ee.batch.Export.table.toDrive(
            collection=whisp_stats_ee_to_ee(
                feature_collection,
                external_id_column,
                national_codes=national_codes,
                unit_type=unit_type,
                whisp_image=whisp_image,  # Pass through
            ),
            description="whisp_output_table",
            # folder="whisp_results",
            fileFormat="CSV",
        )
        task.start()
        print(
            "Exporting to Google Drive: 'whisp_results/whisp_output_table.csv'. To track progress: https://code.earthengine.google.com/tasks"
        )
        # Hand the task back so callers can monitor or cancel it.
        return task
    except Exception as e:
        print(f"An error occurred during the export: {e}")
        return None
643
+
644
+
645
+ #### main stats functions
646
+
647
+
648
+ # Get stats for a feature or feature collection
649
def get_stats(
    feature_or_feature_col, national_codes=None, unit_type="ha", whisp_image=None
):
    """
    Get stats for a feature or feature collection with optional pre-combined image.

    Parameters
    ----------
    feature_or_feature_col : ee.Feature or ee.FeatureCollection
        The input feature or feature collection to analyze
    national_codes : list, optional
        List of ISO2 country codes to include national datasets.
        Only used if whisp_image is None.
    unit_type : str, optional
        Whether to use hectares ("ha") or percentage ("percent"), by default "ha".
    whisp_image : ee.Image, optional
        Pre-combined multiband Earth Engine Image containing all Whisp datasets.
        If provided, this will be used instead of combining datasets based on national_codes.
        If None, datasets will be combined automatically using national_codes parameter.

    Returns
    -------
    ee.FeatureCollection
        Feature collection with calculated statistics

    Raises
    ------
    TypeError
        If the input is neither an ee.Feature nor an ee.FeatureCollection.
        (Previously an error string was returned, which callers could
        silently mistake for a valid result.)
    """
    # Use provided image or combine datasets
    if whisp_image is not None:
        img_combined = whisp_image
        print("Using provided whisp_image")
    else:
        img_combined = combine_datasets(national_codes=national_codes)
        print(f"Combining datasets with national_codes: {national_codes}")

    # Dispatch on the client-side input type.
    if isinstance(feature_or_feature_col, ee.Feature):
        print("Processing single feature")
        return ee.FeatureCollection(
            [
                get_stats_feature(
                    feature_or_feature_col, img_combined, unit_type=unit_type
                )
            ]
        )
    if isinstance(feature_or_feature_col, ee.FeatureCollection):
        print("Processing feature collection")
        return get_stats_fc(
            feature_or_feature_col,
            national_codes=national_codes,
            unit_type=unit_type,
            img_combined=img_combined,  # Pass the image directly
        )
    raise TypeError(
        "Check inputs: not an ee.Feature or ee.FeatureCollection"
    )
703
+
704
+
705
+ # Get statistics for a feature collection
706
def get_stats_fc(feature_col, national_codes=None, unit_type="ha", img_combined=None):
    """
    Calculate statistics for a feature collection using Whisp datasets.

    Parameters
    ----------
    feature_col : ee.FeatureCollection
        The input feature collection to analyze
    national_codes : list, optional
        List of ISO2 country codes (e.g., ["BR", "US"]) to include national datasets.
        If provided, only national datasets for these countries and global datasets will be used.
        If None (default), only global datasets will be used.
        Only used if img_combined is None.
    unit_type : str, optional
        Whether to use hectares ("ha") or percentage ("percent"), by default "ha".
    img_combined : ee.Image, optional
        Pre-combined multiband image containing all Whisp datasets.
        If provided, this will be used instead of combining datasets based on national_codes.

    Returns
    -------
    ee.FeatureCollection
        Feature collection with calculated statistics
    """
    # Fall back to combining datasets when no pre-combined image is supplied.
    # Without this, calling get_stats_fc directly with the default
    # img_combined=None would fail inside get_stats_feature
    # (None has no reduceRegion method).
    if img_combined is None:
        img_combined = combine_datasets(national_codes=national_codes)

    out_feature_col = ee.FeatureCollection(
        feature_col.map(
            lambda feature: get_stats_feature(
                feature, img_combined, unit_type=unit_type
            )
        )
    )
    # print(out_feature_col.first().getInfo())  # for testing

    return out_feature_col
744
+
745
+
746
+ # Get statistics for a single feature
747
+ # Note: This function doesn't need whisp_image parameter since it already accepts img_combined directly
748
+
749
+
750
def get_stats_feature(feature, img_combined, unit_type="ha"):
    """
    Get statistics for a single feature using a pre-combined image.

    Parameters
    ----------
    feature : ee.Feature
        The feature to analyze
    img_combined : ee.Image
        Pre-combined image with all the datasets
    unit_type : str, optional
        Whether to use hectares ("ha") or percentage ("percent"), by default "ha".

    Returns
    -------
    ee.Feature
        Feature with calculated statistics
    """
    # Sum every band of the combined image over the feature's footprint.
    # NOTE(review): band sums are later divided by 10,000 (m^2 -> ha), so
    # bands are presumably per-pixel areas in m^2 — confirm in combine_datasets.
    reduce = img_combined.reduceRegion(
        reducer=ee.Reducer.sum(),
        geometry=feature.geometry(),
        scale=10,
        maxPixels=1e10,
        tileScale=8,
    )

    # Get basic feature information (country, admin-1, geometry type,
    # centroid coordinates, water flag)
    feature_info = get_type_and_location(feature)

    # add statistics unit type (e.g., percentage or hectares) to dictionary
    stats_unit_type = ee.Dictionary({stats_unit_type_column: unit_type})

    # Convert each summed band value to hectares (divide by 10,000)
    # and round using the configured area formatting.
    reduce_ha = reduce.map(
        lambda key, val: divide_and_format(ee.Number(val), ee.Number(10000))
    )

    # Get value for hectares (total plot area used as the percentage denominator)
    area_ha = ee.Number(ee.Dictionary(reduce_ha).get(geometry_area_column))

    # Apply the function to each value in the dictionary using map()
    reduce_percent = reduce_ha.map(
        lambda key, val: percent_and_format(ee.Number(val), area_ha)
    )

    # Reformat the hectare statistics
    reducer_stats_ha = reduce_ha.set(
        geometry_area_column, area_ha.format(geometry_area_column_formatting)
    )  # area ha (formatted)

    # Reformat the percentage statistics; the plot area itself stays in ha.
    reducer_stats_percent = reduce_percent.set(
        geometry_area_column, area_ha.format(geometry_area_column_formatting)
    )  # area ha (formatted)

    # Add country info onto hectare analysis results
    properties_ha = feature_info.combine(ee.Dictionary(reducer_stats_ha)).combine(
        stats_unit_type
    )

    # Add country info onto percentage analysis results
    properties_percent = feature_info.combine(
        ee.Dictionary(reducer_stats_percent)
    ).combine(stats_unit_type)

    # Choose whether to use hectares or percentage based on the parameter.
    # unit_type is a client-side Python string, so the comparison is
    # evaluated client-side before being passed to ee.Algorithms.If.
    out_feature = ee.Algorithms.If(
        unit_type == "ha",
        feature.set(properties_ha),  # .setGeometry(None),
        feature.set(properties_percent),  # .setGeometry(None),
    )

    return out_feature
823
+
824
+
825
+ # Get basic feature information - uses admin and water datasets in gee.
826
def get_type_and_location(feature):
    """Build an ee.Dictionary of basic feature info: ISO3 country, admin-1
    name, geometry type, centroid coordinates, and a water flag."""

    geom = feature.geometry()
    point = geom.centroid(1)

    # Country / admin-1 unit from the geoBoundaries lookup at the centroid
    boundary_info = ee.Dictionary(get_geoboundaries_info(point))
    country_dict = ee.Dictionary({iso3_country_column: boundary_info.get("shapeGroup")})
    admin_dict = ee.Dictionary({admin_1_column: boundary_info.get("shapeName")})

    # Flag centroids falling in ocean or permanent inland water
    water_image = water_flag_all_prep()
    water_dict = value_at_point_flag(
        point=point, image=water_image, band_name=water_flag, output_name=water_flag
    )

    type_dict = ee.Dictionary({geometry_type_column: geom.type()})

    # Centroid longitude / latitude
    xy = point.coordinates()
    coord_dict = ee.Dictionary(
        {
            centroid_x_coord_column: xy.get(0),  # Longitude
            centroid_y_coord_column: xy.get(1),  # Latitude
        }
    )

    # Merge everything into a single dictionary
    return (
        country_dict.combine(admin_dict)
        .combine(type_dict)
        .combine(coord_dict)
        .combine(water_dict)
    )
867
+
868
+
869
+ # Define a function to divide each value by 10,000 and format it with one decimal place
870
def divide_and_format(val, unit):
    """Divide ``val`` by ``unit`` and round via the configured area format
    string, returning the result as an ee.Number."""
    scaled = ee.Number(val).divide(ee.Number(unit))
    rounded = ee.Number.parse(ee.Number(scaled).format(stats_area_columns_formatting))
    return ee.Number(rounded)
879
+
880
+
881
+ # Define a function to divide by total area of geometry and multiply by 100
882
def percent_and_format(val, area_ha):
    """Express ``val`` as a percentage of ``area_ha`` and round via the
    configured percentage format string, returning an ee.Number."""
    fraction = ee.Number(val).divide(area_ha).multiply(ee.Number(100))
    rounded = ee.Number.parse(
        ee.Number(fraction).format(stats_percent_columns_formatting)
    )
    return ee.Number(rounded)
890
+
891
+
892
+ # geoboundaries - admin units from a freqently updated database, allows commercial use (CC BY 4.0 DEED) (disputed territories may need checking)
893
def get_geoboundaries_info(geometry):
    """Look up shapeGroup (country) and shapeName (admin unit) for a geometry
    using the geoBoundaries ADM1 collection.

    geoBoundaries is frequently updated and allows commercial use
    (CC BY 4.0 DEED); disputed territories may need checking.
    """
    adm1 = ee.FeatureCollection("WM/geoLab/geoBoundaries/600/ADM1")
    matches = adm1.filterBounds(geometry)
    fallback = ee.Dictionary({"shapeGroup": "Unknown", "shapeName": "Unknown"})
    # first() is only evaluated server-side when the branch is taken
    first_match = matches.first().toDictionary().select(["shapeGroup", "shapeName"])
    return ee.Algorithms.If(matches.size().gt(0), first_match, fallback)
904
+
905
+
906
+ #####
907
+ # water flag - to flag plots that may be erroneous (i.e., where errors may have occured in their creation / translation and so fall in either the ocean or inland water -
908
def usgs_gsv_ocean_prep():
    """Rasterize the sat-io global shoreline vectors into an 'ocean' band.

    Land polygons are painted (value 0) over a constant-1 image; after
    selfMask() only the remaining value-1 (outside-coast, i.e. ocean)
    pixels are kept.
    TODO: for speed, export the image as an asset at the same resolution as JRC.
    """
    # Land datasets making up the global shoreline vector
    land_parts = [
        ee.FeatureCollection("projects/sat-io/open-datasets/shoreline/mainlands"),
        ee.FeatureCollection("projects/sat-io/open-datasets/shoreline/big_islands"),
        ee.FeatureCollection("projects/sat-io/open-datasets/shoreline/small_islands"),
    ]
    land = ee.FeatureCollection(land_parts).flatten()

    # Paint land as 0, keep ocean as 1, and name the band
    return ee.Image(1).paint(land).selfMask().rename("ocean")
929
+
930
+
931
def jrc_water_surface_prep():
    """Build a permanent-inland-water mask ('water_inland' band) from the
    JRC Global Surface Water 'transition' band."""
    transition = ee.Image("JRC/GSW1_4/GlobalSurfaceWater").select("transition")

    # Remap classes 1 ("Permanent"), 2 ("New Permanent") and
    # 7 ("Seasonal to Permanent") to 1; everything else to 0.
    permanent = transition.remap([1, 2, 7], [1, 1, 1], 0).unmask()

    # optional - clip to within coast line (not needed currently and extra processing)
    # permanent = permanent.where(usgs_gsv_ocean_prep(), 0)

    return permanent.rename("water_inland")
947
+
948
+
949
def water_flag_all_prep():
    """Combine the ocean mask with permanent inland water into a single
    flag band named by the ``water_flag`` config constant."""
    ocean = usgs_gsv_ocean_prep().unmask()
    # Where inland water is present, force 1; otherwise keep the ocean value
    return ocean.where(jrc_water_surface_prep(), 1).rename(water_flag)
957
+
958
+
959
def value_at_point_flag(point, image, band_name, output_name):
    """Sample ``image`` at ``point`` and return an ee.Dictionary mapping
    ``output_name`` to the sampled ``band_name`` value."""
    sampled = image.sample(region=point, scale=30, numPixels=1).first()
    value = sampled.get(band_name)
    return ee.Dictionary({output_name: value})
971
+
972
+
973
def add_id_to_feature_collection(dataset, id_name):
    """
    Add an incremental (1, 2, 3, ...) id property to every feature.

    Args:
        dataset: ee.FeatureCollection, the FeatureCollection to operate on.
        id_name: str, name of the property that will hold the id.

    Returns:
        ee.FeatureCollection with the id property set on each feature.
    """
    # Build a lookup from each feature's system:index to a 1-based sequence number
    index_list = dataset.aggregate_array("system:index")
    sequence = ee.List.sequence(1, index_list.size())
    lookup = ee.Dictionary.fromLists(index_list, sequence)

    def _set_id(feature):
        # Resolve this feature's id via its system:index and attach it
        return feature.set(id_name, lookup.get(feature.get("system:index")))

    return dataset.map(_set_id)
1007
+
1008
+
1009
+ # Function to add ID to features
1010
def add_id_to_feature(feature, id_name):
    """Copy a feature's system:index into a property named ``id_name``."""
    return feature.set(id_name, feature.get("system:index"))
1013
+
1014
+
1015
+ # Function to flag positive values
1016
def flag_positive_values(feature, flag_positive):
    """Replace each property listed in ``flag_positive`` with "True" when its
    numeric value is > 0, otherwise with "-"."""
    for name in flag_positive:
        is_positive = ee.Number(feature.get(name)).gt(0)
        feature = feature.set(name, ee.Algorithms.If(is_positive, "True", "-"))
    return feature
1023
+
1024
+
1025
+ # Function to exclude properties
1026
def copy_properties_and_exclude(feature, exclude_properties_from_output):
    """Return a geometry-only copy of ``feature`` carrying all of its
    properties except those listed in ``exclude_properties_from_output``."""
    bare = ee.Feature(feature.geometry())
    return bare.copyProperties(
        source=feature, exclude=exclude_properties_from_output
    )
1030
+
1031
+
1032
def ee_image_checker(image):
    """
    Test whether the input is a valid ee.Image.

    Args:
        image: candidate ee.Image object.

    Returns:
        bool: True when the object's server-side type is "Image" and its
        metadata can be retrieved, False otherwise.
    """
    try:
        if ee.Algorithms.ObjectType(image).getInfo() == "Image":
            # Force a server round-trip; an invalid image raises here.
            image.getInfo()
            return True
    except ee.EEException as e:
        print(f"Image validation failed with EEException: {e}")
    except Exception as e:
        print(f"Image validation failed with exception: {e}")
    return False
1052
+
1053
+
1054
def keep_valid_images(image_list):
    """
    Filters a list to return only valid ee.Images.

    Args:
        image_list: List of ee.Image objects.

    Returns:
        list: List of valid ee.Image objects, in the original order.
    """
    # Comprehension replaces the manual append loop (same behavior, idiomatic).
    return [image for image in image_list if ee_image_checker(image)]
1069
+
1070
+
1071
def convert_iso3_to_iso2(df, iso3_column, iso2_column):
    """
    Converts ISO3 country codes to ISO2 codes and adds a new column to the DataFrame.

    Args:
        df (pd.DataFrame): Input DataFrame containing ISO3 country codes.
        iso3_column (str): The column name in the DataFrame with ISO3 country codes.
        iso2_column (str): The new column name to store ISO2 country codes.

    Returns:
        pd.DataFrame: Updated DataFrame with the new ISO2 column
        (modified in place and returned for convenience).
    """
    # NOTE: the module already imports country_converter as coco at the top,
    # so no local import is needed.

    def _to_iso2(code):
        # NaN is truthy, so an `if code` check alone would pass missing
        # pandas values straight into country_converter. Treat NaN/None and
        # empty strings as unresolvable instead.
        if pd.isna(code) or not code:
            return "not found (disputed territory)"
        return coco.convert(names=code, to="ISO2")

    df[iso2_column] = df[iso3_column].apply(_to_iso2)

    return df
1093
+
1094
+
1095
def validate_external_id_column(feature_collection, external_id_column):
    """
    Validates that the external_id_column exists in all features of the collection.

    Parameters
    ----------
    feature_collection : ee.FeatureCollection
        The feature collection to validate
    external_id_column : str
        The name of the external ID column to check

    Returns
    -------
    dict
        Dictionary with validation results including:
        - 'is_valid': bool indicating if column exists in all features
        - 'total_features': int total number of features
        - 'features_with_column': int number of features that have the column
        - 'available_properties': list of properties available in first feature
        - 'error_message': str error message if validation fails
    """
    try:
        # Get total number of features.
        # NOTE: each getInfo() below is a synchronous server round-trip,
        # so this function can be slow on large collections.
        total_features = feature_collection.size().getInfo()

        if total_features == 0:
            return {
                "is_valid": False,
                "total_features": 0,
                "features_with_column": 0,
                "available_properties": [],
                "error_message": "Feature collection is empty",
            }

        # Get available properties from first feature (used for diagnostics)
        first_feature_props = feature_collection.first().propertyNames().getInfo()

        # Check if external_id_column exists in all features:
        # tag every feature server-side with a boolean, then count the Trues.
        def check_column_exists(feature):
            has_column = feature.propertyNames().contains(external_id_column)
            return feature.set("_has_external_id", has_column)

        features_with_check = feature_collection.map(check_column_exists)
        features_with_column = (
            features_with_check.filter(ee.Filter.eq("_has_external_id", True))
            .size()
            .getInfo()
        )

        # Valid only when every feature carries the column
        is_valid = features_with_column == total_features

        error_message = None
        if not is_valid:
            missing_count = total_features - features_with_column
            error_message = (
                f"The column '{external_id_column}' is missing from {missing_count} "
                f"out of {total_features} features in the collection. "
                f"Available properties in first feature: {first_feature_props}"
            )

        return {
            "is_valid": is_valid,
            "total_features": total_features,
            "features_with_column": features_with_column,
            "available_properties": first_feature_props,
            "error_message": error_message,
        }

    except Exception as e:
        # Any EE/network failure is reported in the same dict shape rather
        # than raised, so callers only need to inspect the result.
        return {
            "is_valid": False,
            "total_features": 0,
            "features_with_column": 0,
            "available_properties": [],
            "error_message": f"Error during validation: {str(e)}",
        }
1171
+
1172
+
1173
def debug_feature_collection_properties(feature_collection, max_features=5):
    """
    Debug helper function to inspect the properties of features in a collection.

    Parameters
    ----------
    feature_collection : ee.FeatureCollection
        The feature collection to inspect
    max_features : int, optional
        Maximum number of features to inspect, by default 5

    Returns
    -------
    dict
        Dictionary with debugging information about the feature collection
    """
    try:
        total_features = feature_collection.size().getInfo()

        if total_features == 0:
            return {"total_features": 0, "error": "Feature collection is empty"}

        # Limit the number of features to inspect
        features_to_check = min(max_features, total_features)
        limited_fc = feature_collection.limit(features_to_check)

        # Get properties for each feature
        def get_feature_properties(feature):
            # FeatureCollection.map must return a Feature (or Image), not a
            # Dictionary — returning ee.Dictionary here made the previous
            # version error out on every call. Wrap the diagnostics in a
            # geometry-less feature instead.
            return ee.Feature(
                None,
                {
                    "properties": feature.propertyNames(),
                    "geometry_type": feature.geometry().type(),
                },
            )

        feature_info = limited_fc.map(get_feature_properties).getInfo()

        return {
            "total_features": total_features,
            "inspected_features": features_to_check,
            "feature_details": [
                {
                    "feature_index": i,
                    # Each mapped feature carries the diagnostics in its
                    # "properties" payload.
                    "properties": feat["properties"]["properties"],
                    "geometry_type": feat["properties"]["geometry_type"],
                }
                for i, feat in enumerate(feature_info["features"])
            ],
        }

    except Exception as e:
        return {"error": f"Error during debugging: {str(e)}"}