openforis-whisp 2.0.0a4__py3-none-any.whl → 2.0.0a6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
openforis_whisp/stats.py CHANGED
@@ -1,953 +1,1134 @@
1
- import ee
2
- import pandas as pd
3
- from pathlib import Path
4
- from .datasets import combine_datasets
5
- import json
6
- import country_converter as coco
7
- from openforis_whisp.parameters.config_runtime import (
8
- plot_id_column,
9
- geo_id_column,
10
- geometry_type_column,
11
- geometry_area_column,
12
- geometry_area_column_formatting,
13
- centroid_x_coord_column,
14
- centroid_y_coord_column,
15
- iso3_country_column,
16
- iso2_country_column,
17
- admin_1_column,
18
- stats_unit_type_column,
19
- stats_area_columns_formatting,
20
- stats_percent_columns_formatting,
21
- water_flag,
22
- )
23
- from .data_conversion import (
24
- convert_ee_to_df,
25
- convert_geojson_to_ee,
26
- convert_ee_to_geojson,
27
- # convert_csv_to_geojson,
28
- convert_df_to_geojson,
29
- ) # copied functions from whisp-api and geemap (accessed 2024) to avoid dependency
30
- from .reformat import validate_dataframe_using_lookups
31
-
32
- # NB: functions that include "formatted" in their name apply a schema for validation and reformatting of the output dataframe. The schema is created from lookup tables.
33
-
34
-
35
def whisp_formatted_stats_geojson_to_df(
    input_geojson_filepath: Path | str,
    external_id_column=None,
    remove_geom=False,
    national_codes=None,
    unit_type="ha",
) -> pd.DataFrame:
    """
    Compute formatted (schema-validated) Whisp statistics for a GeoJSON file.

    Main entry point for most users. The GeoJSON is loaded as an Earth Engine
    FeatureCollection with ``convert_geojson_to_ee`` and then handed to
    ``whisp_formatted_stats_ee_to_df``, which computes the statistics and
    validates the resulting DataFrame.

    Parameters
    ----------
    input_geojson_filepath : Path | str
        The filepath to the GeoJSON of the ROI to analyze.
    external_id_column : str, optional
        The column in the GeoJSON containing external IDs to be preserved in
        the output DataFrame.
    remove_geom : bool, default=False
        If True, the geometry is removed from the output DataFrame.
    national_codes : list, optional
        List of ISO2 country codes to include national datasets.
    unit_type : str, optional
        Whether to use hectares ("ha") or percentage ("percent"), by default "ha".

    Returns
    -------
    pd.DataFrame
        The DataFrame containing the Whisp stats for the input ROI.
    """
    # The converter is always given a plain string path.
    roi_collection = convert_geojson_to_ee(str(input_geojson_filepath))

    return whisp_formatted_stats_ee_to_df(
        feature_collection=roi_collection,
        external_id_column=external_id_column,
        remove_geom=remove_geom,
        national_codes=national_codes,
        unit_type=unit_type,
    )
81
-
82
-
83
def whisp_formatted_stats_geojson_to_geojson(
    input_geojson_filepath,
    output_geojson_filepath,
    external_id_column=None,
    geo_column: str = "geo",
    national_codes=None,
    unit_type="ha",
):
    """
    Read a GeoJSON file, compute formatted Whisp stats and write them to a GeoJSON file.

    Parameters
    ----------
    input_geojson_filepath : str
        The filepath to the input GeoJSON file.
    output_geojson_filepath : str
        The filepath to save the output GeoJSON file.
    external_id_column : str, optional
        The name of the column containing external IDs, by default None.
    geo_column : str, optional
        The name of the column containing GeoJSON geometries, by default "geo".
    national_codes : list, optional
        List of ISO2 country codes to include national datasets.
    unit_type : str, optional
        Whether to use hectares ("ha") or percentage ("percent"), by default "ha".

    Returns
    -------
    None
    """
    # remove_geom is deliberately not passed, so it stays at its default (False).
    formatted_stats = whisp_formatted_stats_geojson_to_df(
        input_geojson_filepath=input_geojson_filepath,
        external_id_column=external_id_column,
        national_codes=national_codes,
        unit_type=unit_type,
    )

    # Write the stats table out as GeoJSON.
    convert_df_to_geojson(formatted_stats, output_geojson_filepath, geo_column)

    print(f"GeoJSON with Whisp stats saved to {output_geojson_filepath}")
123
-
124
-
125
def whisp_formatted_stats_ee_to_geojson(
    feature_collection: ee.FeatureCollection,
    output_geojson_filepath: str,
    external_id_column=None,
    geo_column: str = "geo",
    national_codes=None,
    unit_type="ha",
):
    """
    Compute formatted Whisp stats for a FeatureCollection and save them as GeoJSON.

    Parameters
    ----------
    feature_collection : ee.FeatureCollection
        The feature collection of the ROI to analyze.
    output_geojson_filepath : str
        The filepath to save the output GeoJSON file.
    external_id_column : str, optional
        The name of the column containing external IDs, by default None.
    geo_column : str, optional
        The name of the column containing GeoJSON geometries, by default "geo".
    national_codes : list, optional
        List of ISO2 country codes to include national datasets.
    unit_type : str, optional
        Whether to use hectares ("ha") or percentage ("percent"), by default "ha".

    Returns
    -------
    None
    """
    # Validated stats table; remove_geom stays at its default (False).
    formatted_stats = whisp_formatted_stats_ee_to_df(
        feature_collection,
        external_id_column,
        national_codes=national_codes,
        unit_type=unit_type,
    )

    # Write the stats table out as GeoJSON.
    convert_df_to_geojson(formatted_stats, output_geojson_filepath, geo_column)

    print(f"GeoJSON with Whisp stats saved to {output_geojson_filepath}")
166
-
167
-
168
def whisp_formatted_stats_ee_to_df(
    feature_collection: ee.FeatureCollection,
    external_id_column=None,
    remove_geom=False,
    national_codes=None,
    unit_type="ha",
) -> pd.DataFrame:
    """
    Compute Whisp statistics for a feature collection and validate the result.

    The raw statistics come from ``whisp_stats_ee_to_df``; the resulting
    DataFrame is then passed through ``validate_dataframe_using_lookups``.

    Parameters
    ----------
    feature_collection : ee.FeatureCollection
        The feature collection of the ROI to analyze.
    external_id_column : str, optional
        The name of the external ID column, by default None.
    remove_geom : bool, optional
        Whether to remove the geometry column, by default False.
    national_codes : list, optional
        List of ISO2 country codes to include national datasets.
    unit_type : str, optional
        Whether to use hectares ("ha") or percentage ("percent"), by default "ha".

    Returns
    -------
    pd.DataFrame
        The validated dataframe containing the Whisp stats for the input ROI.
    """
    raw_stats = whisp_stats_ee_to_df(
        feature_collection,
        external_id_column,
        remove_geom,
        national_codes=national_codes,
        unit_type=unit_type,
    )

    # national_codes is forwarded so the validation step can filter its schema.
    return validate_dataframe_using_lookups(raw_stats, national_codes=national_codes)
210
-
211
-
212
- ### functions without additional formatting below (i.e., raw output from GEE processing without schema validation step)
213
-
214
-
215
def whisp_stats_geojson_to_df(
    input_geojson_filepath: Path | str,
    external_id_column=None,
    remove_geom=False,
    national_codes=None,
    unit_type="ha",
) -> pd.DataFrame:
    """
    Compute unformatted Whisp statistics for a GeoJSON file as a DataFrame.

    No schema validation is applied here; see the ``whisp_formatted_*``
    functions for the validated variant.

    Parameters
    ----------
    input_geojson_filepath : Path | str
        The filepath to the GeoJSON of the ROI to analyze.
    external_id_column : str, optional
        The name of the external ID column, by default None.
    remove_geom : bool, optional
        Whether to remove the geometry column, by default False.
    national_codes : list, optional
        List of ISO2 country codes to include national datasets.
    unit_type : str, optional
        Whether to use hectares ("ha") or percentage ("percent"), by default "ha".

    Returns
    -------
    pd.DataFrame
        The dataframe containing the Whisp stats for the input ROI.
    """
    roi_collection = convert_geojson_to_ee(str(input_geojson_filepath))

    return whisp_stats_ee_to_df(
        feature_collection=roi_collection,
        external_id_column=external_id_column,
        remove_geom=remove_geom,
        national_codes=national_codes,
        unit_type=unit_type,
    )
252
-
253
-
254
def whisp_stats_geojson_to_ee(
    input_geojson_filepath: Path | str,
    external_id_column=None,
    national_codes=None,
    unit_type="ha",
) -> ee.FeatureCollection:
    """
    Convert a GeoJSON file to an Earth Engine FeatureCollection with Whisp statistics.

    Parameters
    ----------
    input_geojson_filepath : Path | str
        The filepath to the GeoJSON of the ROI to analyze.
    external_id_column : str, optional
        The name of the external ID column, by default None.
    national_codes : list, optional
        List of ISO2 country codes to include national datasets.
    unit_type : str, optional
        Whether to use hectares ("ha") or percentage ("percent"), by default "ha".
        Added as a trailing keyword with the same default the other wrappers
        use, so existing calls behave exactly as before.

    Returns
    -------
    ee.FeatureCollection
        The feature collection containing the Whisp stats for the input ROI.
    """
    feature_collection = convert_geojson_to_ee(str(input_geojson_filepath))

    return whisp_stats_ee_to_ee(
        feature_collection,
        external_id_column,
        national_codes=national_codes,
        unit_type=unit_type,
    )
281
-
282
-
283
def whisp_stats_geojson_to_geojson(
    input_geojson_filepath,
    output_geojson_filepath,
    external_id_column=None,
    national_codes=None,
    unit_type="ha",
):
    """
    Compute Whisp stats for a GeoJSON file and write them to another GeoJSON file.

    Parameters
    ----------
    input_geojson_filepath : str
        The filepath to the input GeoJSON file.
    output_geojson_filepath : str
        The filepath to save the output GeoJSON file.
    external_id_column : str, optional
        The name of the column containing external IDs, by default None.
    national_codes : list, optional
        List of ISO2 country codes to include national datasets.
    unit_type : str, optional
        Whether to use hectares ("ha") or percentage ("percent"), by default "ha".

    Returns
    -------
    None
    """
    # Every other GeoJSON wrapper in this module hands the converter a str;
    # do the same here so a pathlib.Path input is treated identically.
    feature_collection = convert_geojson_to_ee(str(input_geojson_filepath))

    # Get stats as a FeatureCollection
    stats_feature_collection = whisp_stats_ee_to_ee(
        feature_collection,
        external_id_column,
        national_codes=national_codes,
        unit_type=unit_type,
    )

    # Convert the stats FeatureCollection to GeoJSON
    stats_geojson = convert_ee_to_geojson(stats_feature_collection)

    # json.dump escapes non-ASCII by default, so the explicit encoding only
    # removes the dependence on the platform default.
    with open(output_geojson_filepath, "w", encoding="utf-8") as f:
        json.dump(stats_geojson, f, indent=2)
327
-
328
-
329
def whisp_stats_geojson_to_drive(
    input_geojson_filepath: Path | str,
    external_id_column=None,
    national_codes=None,
    unit_type="ha",
):
    """
    Export Whisp statistics for a GeoJSON file to Google Drive.

    Any exception (including a missing input file) is caught and reported
    with ``print`` rather than propagated.

    Parameters
    ----------
    input_geojson_filepath : Path | str
        The filepath to the GeoJSON of the ROI to analyze.
    external_id_column : str, optional
        The name of the external ID column, by default None.
    national_codes : list, optional
        List of ISO2 country codes to include national datasets.
    unit_type : str, optional
        Whether to use hectares ("ha") or percentage ("percent"), by default "ha".

    Returns
    -------
    The return value of ``whisp_stats_ee_to_drive`` on success; ``None`` when
    an error was caught. Progress and error messages are printed.
    """
    try:
        geojson_path = Path(input_geojson_filepath)
        if not geojson_path.exists():
            raise FileNotFoundError(f"File {geojson_path} does not exist.")

        roi_collection = convert_geojson_to_ee(str(geojson_path))
        export_options = {"national_codes": national_codes, "unit_type": unit_type}

        return whisp_stats_ee_to_drive(
            roi_collection, external_id_column, **export_options
        )

    except Exception as e:
        print(f"An error occurred: {e}")
369
-
370
-
371
def whisp_stats_ee_to_ee(
    feature_collection, external_id_column, national_codes=None, unit_type="ha"
):
    """
    Process a feature collection to get statistics for each feature.

    When ``external_id_column`` is given, its value is copied (as a string)
    into the ``geo_id_column`` property of every feature before the
    statistics are computed.

    Parameters:
        feature_collection (ee.FeatureCollection): The input feature collection.
        external_id_column (str): The name of the external ID column to check.
        national_codes (list, optional): List of ISO2 country codes to include national datasets.
        unit_type (str): Whether to use hectares ("ha") or percentage ("percent"), default "ha".

    Returns:
        ee.FeatureCollection: The output feature collection with statistics.
    """
    if external_id_column is not None:
        try:

            def _feature_or_blank(feature):
                # Server-side check: features lacking the column are replaced
                # by an empty feature so the notNull filter below drops them.
                has_column = feature.propertyNames().contains(external_id_column)
                return ee.Algorithms.If(has_column, feature, ee.Feature(None))

            checked_collection = feature_collection.map(_feature_or_blank)
            total_count = feature_collection.size()
            has_value = ee.Filter.notNull([external_id_column])
            valid_count = checked_collection.filter(has_value).size()

            # NOTE: this ValueError is caught by the ``except Exception``
            # below, so a missing column is only printed and the statistics
            # are still computed without the external ID.
            counts_differ = valid_count.neq(total_count).getInfo()
            if counts_differ:
                raise ValueError(
                    f"The column '{external_id_column}' is not a property throughout the feature collection."
                )

            def _copy_external_id(feature):
                external_id = ee.String(feature.get(external_id_column))
                return feature.set(geo_id_column, external_id)

            feature_collection = feature_collection.map(_copy_external_id)

        except Exception as e:
            print(
                f"An error occurred when trying to set the external_id_column: {external_id_column}. Error: {e}"
            )

    stats_collection = get_stats(
        feature_collection, national_codes=national_codes, unit_type=unit_type
    )

    return add_id_to_feature_collection(
        dataset=stats_collection, id_name=plot_id_column
    )
428
-
429
-
430
def whisp_stats_ee_to_df(
    feature_collection: ee.FeatureCollection,
    external_id_column=None,
    remove_geom=False,
    national_codes=None,
    unit_type="ha",
) -> pd.DataFrame:
    """
    Compute Whisp stats for a FeatureCollection and return them as a DataFrame.

    After conversion to pandas, an ISO2 country-code column is derived from
    the ISO3 column. If either step raises, the error is printed and an empty
    DataFrame is returned.

    Parameters
    ----------
    feature_collection : ee.FeatureCollection
        The input FeatureCollection to analyze.
    external_id_column : str, optional
        The name of the external ID column, by default None.
    remove_geom : bool, optional
        Whether to remove the geometry column, by default False.
    national_codes : list, optional
        List of ISO2 country codes to include national datasets.
    unit_type : str, optional
        Whether to use hectares ("ha") or percentage ("percent"), by default "ha".

    Returns
    -------
    pd.DataFrame
        The dataframe containing the Whisp stats for the input ROI, or an
        empty DataFrame if an error occurred.
    """
    # Step 1: Earth Engine stats -> pandas DataFrame.
    try:
        stats_collection = whisp_stats_ee_to_ee(
            feature_collection,
            external_id_column,
            national_codes=national_codes,
            unit_type=unit_type,
        )
        stats_table = convert_ee_to_df(
            ee_object=stats_collection, remove_geom=remove_geom
        )
    except Exception as e:
        print(f"An error occurred during the conversion from EE to DataFrame: {e}")
        return pd.DataFrame()

    # Step 2: add the ISO2 country code column.
    try:
        stats_table = convert_iso3_to_iso2(
            df=stats_table,
            iso3_column=iso3_country_column,
            iso2_column=iso2_country_column,
        )
    except Exception as e:
        print(f"An error occurred during the ISO3 to ISO2 conversion: {e}")
        return pd.DataFrame()

    return stats_table
483
-
484
-
485
def whisp_stats_ee_to_drive(
    feature_collection: ee.FeatureCollection,
    external_id_column=None,
    national_codes=None,
    unit_type="ha",
):
    """
    Export Whisp statistics for a feature collection to Google Drive.

    Starts an Earth Engine batch export task named "whisp_output_table" that
    writes a CSV. Errors while building or starting the task are caught and
    printed rather than raised.

    Parameters
    ----------
    feature_collection : ee.FeatureCollection
        The feature collection to analyze.
    external_id_column : str, optional
        The name of the external ID column, by default None.
    national_codes : list, optional
        List of ISO2 country codes to include national datasets.
    unit_type : str, optional
        Whether to use hectares ("ha") or percentage ("percent"), by default "ha".

    Returns
    -------
    None
    """
    try:
        task = ee.batch.Export.table.toDrive(
            collection=whisp_stats_ee_to_ee(
                feature_collection,
                external_id_column,
                national_codes=national_codes,
                unit_type=unit_type,
            ),
            description="whisp_output_table",
            # folder="whisp_results",
            fileFormat="CSV",
        )
        task.start()
        # No Drive folder is passed (see the commented-out argument above),
        # so the message must not claim the file lands in 'whisp_results/'.
        print(
            "Exporting to Google Drive: 'whisp_output_table.csv'. To track progress: https://code.earthengine.google.com/tasks"
        )
    except Exception as e:
        print(f"An error occurred during the export: {e}")
526
-
527
-
528
- #### main stats functions
529
-
530
-
531
- # Get stats for a feature or feature collection
532
def get_stats(feature_or_feature_col, national_codes=None, unit_type="ha"):
    """
    Get stats for a feature or feature collection with optional filtering by national codes.

    Parameters
    ----------
    feature_or_feature_col : ee.Feature or ee.FeatureCollection
        The input feature or feature collection to analyze
    national_codes : list, optional
        List of ISO2 country codes to include national datasets
    unit_type : str, optional
        Whether to use hectares ("ha") or percentage ("percent"), by default "ha".

    Returns
    -------
    ee.FeatureCollection
        Feature collection with calculated statistics

    Raises
    ------
    TypeError
        If the input is neither an ee.Feature nor an ee.FeatureCollection.
        (Previously an error string was returned, which only failed later
        when callers treated it as a FeatureCollection.)
    """
    if isinstance(feature_or_feature_col, ee.Feature):
        # A single feature: combine the datasets here (applying the
        # national_codes filter) and wrap the result in a collection.
        img_combined = combine_datasets(national_codes=national_codes)
        return ee.FeatureCollection(
            [
                get_stats_feature(
                    feature_or_feature_col, img_combined, unit_type=unit_type
                )
            ]
        )

    if isinstance(feature_or_feature_col, ee.FeatureCollection):
        return get_stats_fc(
            feature_or_feature_col, national_codes=national_codes, unit_type=unit_type
        )

    raise TypeError("Check inputs: not an ee.Feature or ee.FeatureCollection")
570
-
571
-
572
- # Get statistics for a feature collection
573
def get_stats_fc(feature_col, national_codes=None, unit_type="ha"):
    """
    Calculate statistics for a feature collection using Whisp datasets.

    The datasets are combined once and the same combined image is used for
    every feature in the collection.

    Parameters
    ----------
    feature_col : ee.FeatureCollection
        The input feature collection to analyze
    national_codes : list, optional
        List of ISO2 country codes (e.g., ["BR", "US"]) to include national datasets.
        If provided, only national datasets for these countries and global datasets will be used.
        If None (default), only global datasets will be used.
    unit_type : str, optional
        Whether to use hectares ("ha") or percentage ("percent"), by default "ha".

    Returns
    -------
    ee.FeatureCollection
        Feature collection with calculated statistics
    """
    combined_image = combine_datasets(national_codes=national_codes)

    def _stats_for(feature):
        return get_stats_feature(feature, combined_image, unit_type=unit_type)

    return ee.FeatureCollection(feature_col.map(_stats_for))
606
-
607
-
608
- # Get statistics for a single feature
609
-
610
-
611
def get_stats_feature(feature, img_combined, unit_type="ha"):
    """
    Get statistics for a single feature using a pre-combined image.

    Parameters
    ----------
    feature : ee.Feature
        The feature to analyze
    img_combined : ee.Image
        Pre-combined image with all the datasets
    unit_type : str, optional
        Whether to use hectares ("ha") or percentage ("percent"), by default "ha".
        Any value other than "ha" selects the percentage statistics.

    Returns
    -------
    ee.Feature
        Feature with calculated statistics
    """
    # Sum every band of the combined image over the feature's geometry.
    reduce = img_combined.reduceRegion(
        reducer=ee.Reducer.sum(),
        geometry=feature.geometry(),
        scale=10,
        maxPixels=1e10,
        tileScale=8,
    )

    # Get basic feature information
    feature_info = get_type_and_location(feature)

    # Record which unit the statistics are expressed in
    stats_unit_type = ee.Dictionary({stats_unit_type_column: unit_type})

    # Divide every summed value by 10,000 to express it in hectares
    # (presumably the band sums are in square metres - TODO confirm).
    reduce_ha = reduce.map(
        lambda key, val: divide_and_format(ee.Number(val), ee.Number(10000))
    )

    # Area of the geometry in hectares
    area_ha = ee.Number(ee.Dictionary(reduce_ha).get(geometry_area_column))

    # unit_type is an ordinary Python value, so the choice is made here on
    # the client; only the property set that is actually needed is built.
    if unit_type == "ha":
        stats = reduce_ha
    else:
        stats = reduce_ha.map(
            lambda key, val: percent_and_format(ee.Number(val), area_ha)
        )

    # The area column is always reported as the formatted hectare value,
    # including in percent mode.
    stats = ee.Dictionary(stats).set(
        geometry_area_column, area_ha.format(geometry_area_column_formatting)
    )

    # Location/type info first, then the statistics, then the unit label.
    properties = feature_info.combine(stats).combine(stats_unit_type)

    return feature.set(properties)
684
-
685
-
686
- # Get basic feature information - uses admin and water datasets in gee.
687
def get_type_and_location(feature):
    """Build a dictionary of basic feature info: country, admin area, geometry type, centroid coordinates and water flag."""

    # Centroid of the feature's geometry (1 is passed as the centroid argument).
    centroid = feature.geometry().centroid(1)

    # Country / admin lookup from geoBoundaries at the centroid.
    location = ee.Dictionary(get_geoboundaries_info(centroid))
    country = ee.Dictionary({iso3_country_column: location.get("shapeGroup")})
    admin_1 = ee.Dictionary({admin_1_column: location.get("shapeName")})

    # Water flag sampled at the centroid.
    water_flag_dict = value_at_point_flag(
        point=centroid,
        image=water_flag_all_prep(),
        band_name=water_flag,
        output_name=water_flag,
    )

    # Geometry type of the feature.
    geom_type = ee.Dictionary({geometry_type_column: feature.geometry().type()})

    # Centroid coordinates: index 0 is longitude, index 1 is latitude.
    centroid_coords = centroid.coordinates()
    coords_dict = ee.Dictionary(
        {
            centroid_x_coord_column: centroid_coords.get(0),
            centroid_y_coord_column: centroid_coords.get(1),
        }
    )

    # Merge the parts in a fixed order into one dictionary.
    feature_info = country
    for part in (admin_1, geom_type, coords_dict, water_flag_dict):
        feature_info = feature_info.combine(part)

    return feature_info
728
-
729
-
730
- # Define a function to divide each value by 10,000 and format it with one decimal place
731
def divide_and_format(val, unit):
    """Divide ``val`` by ``unit`` and round-trip it through the area format string.

    The quotient is formatted with ``stats_area_columns_formatting`` and then
    parsed back, so the returned ee.Number carries the formatted precision.
    """
    quotient = ee.Number(val).divide(ee.Number(unit))
    as_text = ee.Number(quotient).format(stats_area_columns_formatting)
    return ee.Number(ee.Number.parse(as_text))
740
-
741
-
742
- # Define a function to divide by total area of geometry and multiply by 100
743
def percent_and_format(val, area_ha):
    """Express ``val`` as a percentage of ``area_ha``, formatted as a number.

    The percentage is formatted with ``stats_percent_columns_formatting`` and
    parsed back into an ee.Number.
    """
    percentage = ee.Number(val).divide(area_ha).multiply(ee.Number(100))
    as_text = ee.Number(percentage).format(stats_percent_columns_formatting)
    return ee.Number(ee.Number.parse(as_text))
751
-
752
-
753
- # geoboundaries - admin units from a frequently updated database, allows commercial use (CC BY 4.0 DEED) (disputed territories may need checking)
754
def get_geoboundaries_info(geometry):
    """Look up the geoBoundaries ADM1 unit intersecting ``geometry``.

    Returns a server-side value holding the "shapeGroup" and "shapeName"
    properties of the first intersecting unit, or a dictionary with both set
    to "Unknown" when nothing intersects.
    """
    adm1_units = ee.FeatureCollection("WM/geoLab/geoBoundaries/600/ADM1")
    intersecting_units = adm1_units.filterBounds(geometry)

    unknown_location = ee.Dictionary({"shapeGroup": "Unknown", "shapeName": "Unknown"})
    first_unit_info = (
        intersecting_units.first().toDictionary().select(["shapeGroup", "shapeName"])
    )

    return ee.Algorithms.If(
        intersecting_units.size().gt(0), first_unit_info, unknown_location
    )
765
-
766
-
767
- #####
768
- # water flag - to flag plots that may be erroneous (i.e., where errors may have occurred in their creation / translation, so that they fall in either the ocean or inland water)
769
def usgs_gsv_ocean_prep():
    """Build a single-band "ocean" image with value 1 outside the shoreline datasets.

    TODO: for speed, export the image as an asset at the same resolution as JRC.
    """
    shoreline_assets = (
        "projects/sat-io/open-datasets/shoreline/mainlands",
        "projects/sat-io/open-datasets/shoreline/big_islands",
        "projects/sat-io/open-datasets/shoreline/small_islands",
    )

    # Merge mainlands, big islands and small islands into one collection.
    land_polygons = ee.FeatureCollection(
        [ee.FeatureCollection(asset_id) for asset_id in shoreline_assets]
    ).flatten()

    # Start from a constant image of 1, paint the land polygons onto it, then
    # self-mask so that only areas outside the coast (ocean) keep the value 1.
    ocean = ee.Image(1).paint(land_polygons).selfMask()
    return ocean.rename("ocean")
790
-
791
-
792
def jrc_water_surface_prep():
    """Build a "water_inland" image flagging permanent inland water from JRC Global Surface Water.

    Transition classes 1, 2 and 7 ("Permanent", "New Permanent" and
    "Seasonal to Permanent") become 1; all other classes become 0, and masked
    pixels are unmasked.
    """
    transition = ee.Image("JRC/GSW1_4/GlobalSurfaceWater").select("transition")

    permanent_classes = [1, 2, 7]
    flagged_as = [1, 1, 1]
    permanent_inland_water = transition.remap(permanent_classes, flagged_as, 0).unmask()

    # Clipping to within the coastline is possible but not currently needed:
    # permanent_inland_water = permanent_inland_water.where(usgs_gsv_ocean_prep(), 0)

    return permanent_inland_water.rename("water_inland")
808
-
809
-
810
def water_flag_all_prep():
    """Combine the ocean and inland-water layers into one image named ``water_flag``.

    Starts from the unmasked ocean layer and sets pixels to 1 wherever the
    inland-water layer is non-zero.
    """
    ocean = usgs_gsv_ocean_prep().unmask()
    inland_water = jrc_water_surface_prep()
    return ocean.where(inland_water, 1).rename(water_flag)
818
-
819
-
820
def value_at_point_flag(point, image, band_name, output_name):
    """Sample ``image`` at ``point`` and return ``{output_name: value of band_name}`` as an ee.Dictionary."""
    # One sample at 30 m scale within the point region.
    sampled_feature = image.sample(region=point, scale=30, numPixels=1).first()

    # The raw band value is returned as-is (not converted to "True"/"False").
    band_value = sampled_feature.get(band_name)

    return ee.Dictionary({output_name: band_value})
832
-
833
-
834
def add_id_to_feature_collection(dataset, id_name):
    """
    Adds an incremental (1, 2, 3, ...) id property to each feature in the given FeatureCollection.

    Args:
    - dataset: ee.FeatureCollection, the FeatureCollection to operate on.
    - id_name: name of the property that receives the id.

    Returns:
    - ee.FeatureCollection, the FeatureCollection with the id property added to each feature.
    """
    # Map every system:index to its 1-based position in the collection.
    system_indexes = dataset.aggregate_array("system:index")
    sequential_ids = ee.List.sequence(1, system_indexes.size())
    id_lookup = ee.Dictionary.fromLists(system_indexes, sequential_ids)

    def _with_id(feature):
        # Look the feature's own system:index up in the mapping.
        return feature.set(id_name, id_lookup.get(feature.get("system:index")))

    return dataset.map(_with_id)
868
-
869
-
870
- # Function to add ID to features
871
def add_id_to_feature(feature, id_name):
    """Copy the feature's "system:index" into the property named ``id_name``."""
    return feature.set(id_name, feature.get("system:index"))
874
-
875
-
876
- # Function to flag positive values
877
def flag_positive_values(feature, flag_positive):
    """Replace each property listed in ``flag_positive`` with "True" if its value is > 0, else "-"."""
    flagged = feature
    for property_name in flag_positive:
        is_positive = ee.Number(flagged.get(property_name)).gt(0)
        flagged = flagged.set(
            property_name, ee.Algorithms.If(is_positive, "True", "-")
        )
    return flagged
884
-
885
-
886
- # Function to exclude properties
887
def copy_properties_and_exclude(feature, exclude_properties_from_output):
    """Return a new feature with the same geometry and all properties except the excluded ones."""
    geometry_only = ee.Feature(feature.geometry())
    return geometry_only.copyProperties(
        source=feature, exclude=exclude_properties_from_output
    )
891
-
892
-
893
def ee_image_checker(image):
    """
    Report whether the input is a usable ee.Image.

    Args:
        image: Object expected to be an ee.Image.

    Returns:
        bool: True when Earth Engine reports the object type as "Image" and
        fetching its info succeeds; False otherwise (failures are printed).
    """
    try:
        object_type = ee.Algorithms.ObjectType(image).getInfo()
        if object_type != "Image":
            return False
        # Fetching the info forces evaluation; an invalid image raises here.
        image.getInfo()
    except ee.EEException as e:
        print(f"Image validation failed with EEException: {e}")
        return False
    except Exception as e:
        print(f"Image validation failed with exception: {e}")
        return False
    return True
913
-
914
-
915
def keep_valid_images(image_list):
    """
    Keep only the entries that pass `ee_image_checker`.

    Args:
        image_list: List of candidate ee.Image objects.

    Returns:
        list: The entries of `image_list` judged valid, in their original order.
    """
    return [image for image in image_list if ee_image_checker(image)]
930
-
931
-
932
def convert_iso3_to_iso2(df, iso3_column, iso2_column):
    """
    Add an ISO2 country-code column derived from an ISO3 column.

    Args:
        df (pd.DataFrame): DataFrame holding ISO3 country codes; it is modified in place.
        iso3_column (str): Name of the column with the ISO3 codes.
        iso2_column (str): Name of the column that receives the ISO2 codes.

    Returns:
        pd.DataFrame: The same DataFrame, now including `iso2_column`.
    """
    import country_converter as coco

    def to_iso2(code):
        # Falsy codes (e.g. empty or None) get a fixed placeholder instead of a lookup.
        if code:
            return coco.convert(names=code, to="ISO2")
        return "not found (disputed territory)"

    df[iso2_column] = df[iso3_column].apply(to_iso2)
    return df
1
+ import ee
2
+ import pandas as pd
3
+ from pathlib import Path
4
+ from .datasets import combine_datasets
5
+ import json
6
+ import country_converter as coco
7
+ from openforis_whisp.parameters.config_runtime import (
8
+ plot_id_column,
9
+ external_id_column,
10
+ geometry_type_column,
11
+ geometry_area_column,
12
+ geometry_area_column_formatting,
13
+ centroid_x_coord_column,
14
+ centroid_y_coord_column,
15
+ iso3_country_column,
16
+ iso2_country_column,
17
+ admin_1_column,
18
+ stats_unit_type_column,
19
+ stats_area_columns_formatting,
20
+ stats_percent_columns_formatting,
21
+ water_flag,
22
+ )
23
+ from .data_conversion import (
24
+ convert_ee_to_df,
25
+ convert_geojson_to_ee,
26
+ convert_ee_to_geojson,
27
+ # convert_csv_to_geojson,
28
+ convert_df_to_geojson,
29
+ ) # copied functions from whisp-api and geemap (accessed 2024) to avoid dependency
30
+ from .reformat import validate_dataframe_using_lookups
31
+
32
+ # NB functions that include "formatted" in the name apply a schema for validation and reformatting of the output dataframe. The schema is created from lookup tables.
33
+
34
+
35
def whisp_formatted_stats_geojson_to_df(
    input_geojson_filepath: Path | str,
    external_id_column=None,
    remove_geom=False,
    national_codes=None,
    unit_type="ha",
) -> pd.DataFrame:
    """
    Compute formatted Whisp statistics for the plots in a GeoJSON file.

    The file is converted to an Earth Engine FeatureCollection and handed to
    `whisp_formatted_stats_ee_to_df`, which returns the schema-validated table.

    Parameters
    ----------
    input_geojson_filepath : Path | str
        Path of the GeoJSON file describing the ROI to analyze.
    external_id_column : str, optional
        Property of the GeoJSON features holding external IDs to carry into
        the output. It should be present on all features.
    remove_geom : bool, default=False
        If True, the geometry is left out of the output DataFrame.
    national_codes : list, optional
        List of ISO2 country codes to include national datasets.
    unit_type : str, optional
        Whether to use hectares ("ha") or percentage ("percent"), by default "ha".

    Returns
    -------
    df_stats : pd.DataFrame
        The DataFrame containing the Whisp stats for the input ROI.
    """
    roi = convert_geojson_to_ee(str(input_geojson_filepath))

    df_stats = whisp_formatted_stats_ee_to_df(
        feature_collection=roi,
        external_id_column=external_id_column,
        remove_geom=remove_geom,
        national_codes=national_codes,
        unit_type=unit_type,
    )
    return df_stats
83
+
84
+
85
def whisp_formatted_stats_geojson_to_geojson(
    input_geojson_filepath,
    output_geojson_filepath,
    external_id_column=None,
    geo_column: str = "geo",
    national_codes=None,
    unit_type="ha",
):
    """
    Compute formatted Whisp statistics for a GeoJSON file and save them as GeoJSON.

    Parameters
    ----------
    input_geojson_filepath : str
        The filepath to the input GeoJSON file.
    output_geojson_filepath : str
        The filepath where the output GeoJSON file is written.
    external_id_column : str, optional
        The name of the column containing external IDs, by default None.
    geo_column : str, optional
        The name of the DataFrame column containing the geometries, by default "geo".
    national_codes : list, optional
        List of ISO2 country codes to include national datasets.
    unit_type : str, optional
        Whether to use hectares ("ha") or percentage ("percent"), by default "ha".

    Returns
    -------
    None
    """
    stats_df = whisp_formatted_stats_geojson_to_df(
        input_geojson_filepath,
        external_id_column,
        national_codes=national_codes,
        unit_type=unit_type,
    )

    # Write the table back out as GeoJSON, then confirm the location.
    convert_df_to_geojson(stats_df, output_geojson_filepath, geo_column)
    print(f"GeoJSON with Whisp stats saved to {output_geojson_filepath}")
125
+
126
+
127
def whisp_formatted_stats_ee_to_geojson(
    feature_collection: ee.FeatureCollection,
    output_geojson_filepath: str,
    external_id_column=None,
    geo_column: str = "geo",
    national_codes=None,
    unit_type="ha",
):
    """
    Compute formatted Whisp statistics for a FeatureCollection and save them as GeoJSON.

    Parameters
    ----------
    feature_collection : ee.FeatureCollection
        The feature collection of the ROI to analyze.
    output_geojson_filepath : str
        The filepath where the output GeoJSON file is written.
    external_id_column : str, optional
        The name of the column containing external IDs, by default None.
    geo_column : str, optional
        The name of the DataFrame column containing the geometries, by default "geo".
    national_codes : list, optional
        List of ISO2 country codes to include national datasets.
    unit_type : str, optional
        Whether to use hectares ("ha") or percentage ("percent"), by default "ha".

    Returns
    -------
    None
    """
    stats_df = whisp_formatted_stats_ee_to_df(
        feature_collection=feature_collection,
        external_id_column=external_id_column,
        national_codes=national_codes,
        unit_type=unit_type,
    )

    # Write the table out as GeoJSON, then confirm the location.
    convert_df_to_geojson(stats_df, output_geojson_filepath, geo_column)
    print(f"GeoJSON with Whisp stats saved to {output_geojson_filepath}")
168
+
169
+
170
def whisp_formatted_stats_ee_to_df(
    feature_collection: ee.FeatureCollection,
    external_id_column=None,
    remove_geom=False,
    national_codes=None,
    unit_type="ha",
) -> pd.DataFrame:
    """
    Compute Whisp statistics for a FeatureCollection and validate the resulting table.

    Parameters
    ----------
    feature_collection : ee.FeatureCollection
        The feature collection of the ROI to analyze.
    external_id_column : str, optional
        The name of the external ID column, by default None.
    remove_geom : bool, optional
        Whether to remove the geometry column, by default False.
    national_codes : list, optional
        List of ISO2 country codes to include national datasets.
    unit_type : str, optional
        Whether to use hectares ("ha") or percentage ("percent"), by default "ha".

    Returns
    -------
    validated_df : pd.DataFrame
        The validated dataframe containing the Whisp stats for the input ROI.
    """
    raw_stats = whisp_stats_ee_to_df(
        feature_collection=feature_collection,
        external_id_column=external_id_column,
        remove_geom=remove_geom,
        national_codes=national_codes,
        unit_type=unit_type,
    )

    # national_codes is forwarded so the validation step can filter its schema.
    return validate_dataframe_using_lookups(raw_stats, national_codes=national_codes)
212
+
213
+
214
+ ### functions without additional formatting below (i.e., raw output from GEE processing without schema validation step)
215
+
216
+
217
def whisp_stats_geojson_to_df(
    input_geojson_filepath: Path | str,
    external_id_column=None,
    remove_geom=False,
    national_codes=None,
    unit_type="ha",
) -> pd.DataFrame:
    """
    Compute unformatted Whisp statistics for the plots in a GeoJSON file.

    Parameters
    ----------
    input_geojson_filepath : Path | str
        The filepath to the GeoJSON of the ROI to analyze.
    external_id_column : str, optional
        The name of the external ID column, by default None.
    remove_geom : bool, optional
        Whether to remove the geometry column, by default False.
    national_codes : list, optional
        List of ISO2 country codes to include national datasets.
    unit_type : str, optional
        Whether to use hectares ("ha") or percentage ("percent"), by default "ha".

    Returns
    -------
    df_stats : pd.DataFrame
        The dataframe containing the Whisp stats for the input ROI.
    """
    roi = convert_geojson_to_ee(str(input_geojson_filepath))

    df_stats = whisp_stats_ee_to_df(
        feature_collection=roi,
        external_id_column=external_id_column,
        remove_geom=remove_geom,
        national_codes=national_codes,
        unit_type=unit_type,
    )
    return df_stats
254
+
255
+
256
def whisp_stats_geojson_to_ee(
    input_geojson_filepath: Path | str,
    external_id_column=None,
    national_codes=None,
    unit_type="ha",
) -> ee.FeatureCollection:
    """
    Convert a GeoJSON file to an Earth Engine FeatureCollection with Whisp statistics.

    Parameters
    ----------
    input_geojson_filepath : Path | str
        The filepath to the GeoJSON of the ROI to analyze.
    external_id_column : str, optional
        The name of the external ID column, by default None.
    national_codes : list, optional
        List of ISO2 country codes to include national datasets.
    unit_type : str, optional
        Whether to use hectares ("ha") or percentage ("percent"), by default "ha".
        Added for parity with the other GeoJSON entry points; the default
        matches what `whisp_stats_ee_to_ee` used when it was not passed.

    Returns
    -------
    ee.FeatureCollection
        The feature collection containing the Whisp stats for the input ROI.
    """
    feature_collection = convert_geojson_to_ee(str(input_geojson_filepath))

    return whisp_stats_ee_to_ee(
        feature_collection,
        external_id_column,
        national_codes=national_codes,
        unit_type=unit_type,
    )
283
+
284
+
285
def whisp_stats_geojson_to_geojson(
    input_geojson_filepath,
    output_geojson_filepath,
    external_id_column=None,
    national_codes=None,
    unit_type="ha",
):
    """
    Compute unformatted Whisp statistics for a GeoJSON file and save them as GeoJSON.

    Parameters
    ----------
    input_geojson_filepath : Path | str
        The filepath to the input GeoJSON file.
    output_geojson_filepath : Path | str
        The filepath where the output GeoJSON file is written.
    external_id_column : str, optional
        The name of the column containing external IDs, by default None.
    national_codes : list, optional
        List of ISO2 country codes to include national datasets.
    unit_type : str, optional
        Whether to use hectares ("ha") or percentage ("percent"), by default "ha".

    Returns
    -------
    None
    """
    # str() mirrors the other GeoJSON entry points so Path inputs are handled
    # the same way everywhere.
    feature_collection = convert_geojson_to_ee(str(input_geojson_filepath))

    # Get stats as a FeatureCollection
    stats_feature_collection = whisp_stats_ee_to_ee(
        feature_collection,
        external_id_column,
        national_codes=national_codes,
        unit_type=unit_type,
    )

    # Convert the stats FeatureCollection to GeoJSON
    stats_geojson = convert_ee_to_geojson(stats_feature_collection)

    # Explicit encoding keeps the output independent of the platform default.
    with open(output_geojson_filepath, "w", encoding="utf-8") as f:
        json.dump(stats_geojson, f, indent=2)
329
+
330
+
331
+ def whisp_stats_geojson_to_drive(
332
+ input_geojson_filepath: Path | str,
333
+ external_id_column=None,
334
+ national_codes=None,
335
+ unit_type="ha",
336
+ ):
337
+ """
338
+ Export Whisp statistics for a GeoJSON file to Google Drive.
339
+
340
+ Parameters
341
+ ----------
342
+ input_geojson_filepath : Path | str
343
+ The filepath to the GeoJSON of the ROI to analyze.
344
+ external_id_column : str, optional
345
+ The name of the external ID column, by default None.
346
+ national_codes : list, optional
347
+ List of ISO2 country codes to include national datasets.
348
+ unit_type : str, optional
349
+ Whether to use hectares ("ha") or percentage ("percent"), by default "ha".
350
+
351
+ Returns
352
+ -------
353
+ Message showing location of file in Google Drive
354
+ """
355
+ try:
356
+ input_geojson_filepath = Path(input_geojson_filepath)
357
+ if not input_geojson_filepath.exists():
358
+ raise FileNotFoundError(f"File {input_geojson_filepath} does not exist.")
359
+
360
+ feature_collection = convert_geojson_to_ee(str(input_geojson_filepath))
361
+
362
+ return whisp_stats_ee_to_drive(
363
+ feature_collection,
364
+ external_id_column,
365
+ national_codes=national_codes,
366
+ unit_type=unit_type,
367
+ )
368
+
369
+ except Exception as e:
370
+ print(f"An error occurred: {e}")
371
+
372
+
373
def whisp_stats_ee_to_ee(
    feature_collection,
    external_id_column,
    national_codes=None,
    unit_type="ha",
    keep_properties=None,
):
    """
    Compute Whisp statistics for every feature of a collection.

    Parameters:
        feature_collection (ee.FeatureCollection): The input feature collection.
        external_id_column (str or None): Property holding external IDs. When given,
            it is validated and copied to the standardized "external_id" property.
        national_codes (list, optional): List of ISO2 country codes to include national datasets.
        unit_type (str): Whether to use hectares ("ha") or percentage ("percent"), default "ha".
        keep_properties (None, bool, or list, optional): Input properties to keep.
            - None: Remove all properties (only "external_id" survives when an
              external ID column is given)
            - True: Keep all properties
            - list: Keep only the specified properties

    Returns:
        ee.FeatureCollection: The collection with statistics and a plot id added.

    Raises:
        ValueError: If the external ID column fails validation or
            `keep_properties` has an unsupported type.
    """
    if external_id_column is None:
        prepared = _keep_fc_properties(feature_collection, keep_properties)
    else:
        prepared = feature_collection
        try:
            # The external ID property has to pass validation before it is used.
            check = validate_external_id_column(prepared, external_id_column)
            if not check["is_valid"]:
                raise ValueError(check["error_message"])

            # Narrow the properties first, always retaining the external ID column.
            if keep_properties is None or keep_properties == True:
                # None is handled after the id is set; True keeps everything.
                pass
            elif isinstance(keep_properties, list):
                if external_id_column not in keep_properties:
                    keep_properties = keep_properties + [external_id_column]
                prepared = prepared.select(keep_properties)
            else:
                raise ValueError(
                    "keep_properties must be None, True, or a list of property names."
                )

            def _with_standard_external_id(feature):
                # Server-side null check: missing ids become "unknown", others strings.
                raw_id = feature.get(external_id_column)
                safe_id = ee.Algorithms.If(
                    ee.Algorithms.IsEqual(raw_id, None),
                    "unknown",
                    ee.String(raw_id),
                )
                # The output always uses "external_id", not the caller's column name.
                return ee.Feature(feature.set("external_id", safe_id))

            prepared = prepared.map(_with_standard_external_id)

            # With no properties requested, keep only the standardized id.
            if keep_properties is None:
                prepared = prepared.select(["external_id"])

        except Exception as e:
            print(
                f"An error occurred when trying to set the external_id_column: {external_id_column}. Error: {e}"
            )
            raise e  # Re-raise the exception to stop execution

    stats_fc = get_stats(
        prepared, national_codes=national_codes, unit_type=unit_type
    )

    return add_id_to_feature_collection(dataset=stats_fc, id_name=plot_id_column)
456
+
457
+
458
+ def _keep_fc_properties(feature_collection, keep_properties):
459
+ # If keep_properties is specified, select only those properties
460
+ if keep_properties is None:
461
+ feature_collection = feature_collection.select([])
462
+ elif keep_properties == True:
463
+ # If keep_properties is true, select all properties
464
+ first_feature_props = feature_collection.first().propertyNames().getInfo()
465
+ feature_collection = feature_collection.select(first_feature_props)
466
+ elif isinstance(keep_properties, list):
467
+ feature_collection = feature_collection.select(keep_properties)
468
+ else:
469
+ raise ValueError(
470
+ "keep_properties must be None, True, or a list of property names."
471
+ )
472
+ return feature_collection
473
+
474
+
475
def whisp_stats_ee_to_df(
    feature_collection: ee.FeatureCollection,
    external_id_column=None,
    remove_geom=False,
    national_codes=None,
    unit_type="ha",
) -> pd.DataFrame:
    """
    Compute Whisp statistics for a FeatureCollection and return them as a DataFrame
    with an added ISO2 country-code column.

    Errors in either step are printed and an empty DataFrame is returned.

    Parameters
    ----------
    feature_collection : ee.FeatureCollection
        The input FeatureCollection to analyze.
    external_id_column : str, optional
        The name of the external ID column, by default None.
    remove_geom : bool, optional
        Whether to remove the geometry column, by default False.
    national_codes : list, optional
        List of ISO2 country codes to include national datasets.
    unit_type : str, optional
        Whether to use hectares ("ha") or percentage ("percent"), by default "ha".

    Returns
    -------
    df_stats : pd.DataFrame
        The dataframe containing the Whisp stats for the input ROI, or an
        empty DataFrame if a step failed.
    """
    # Step 1: run the statistics and pull the result into pandas.
    try:
        stats_fc = whisp_stats_ee_to_ee(
            feature_collection,
            external_id_column,
            national_codes=national_codes,
            unit_type=unit_type,
        )
        df_stats = convert_ee_to_df(ee_object=stats_fc, remove_geom=remove_geom)
    except Exception as e:
        print(f"An error occurred during the conversion from EE to DataFrame: {e}")
        return pd.DataFrame()  # Return an empty DataFrame in case of error

    # Step 2: derive the ISO2 column from the ISO3 one.
    try:
        df_stats = convert_iso3_to_iso2(
            df=df_stats,
            iso3_column=iso3_country_column,
            iso2_column=iso2_country_column,
        )
    except Exception as e:
        print(f"An error occurred during the ISO3 to ISO2 conversion: {e}")
        return pd.DataFrame()  # Return an empty DataFrame in case of error

    return df_stats
528
+
529
+
530
def whisp_stats_ee_to_drive(
    feature_collection: ee.FeatureCollection,
    external_id_column=None,
    national_codes=None,
    unit_type="ha",
):
    """
    Export Whisp statistics for a feature collection to Google Drive as CSV.

    The export runs as an Earth Engine batch task described as
    "whisp_output_table". Failures are caught and printed rather than raised.

    Parameters
    ----------
    feature_collection : ee.FeatureCollection
        The feature collection to analyze.
    external_id_column : str, optional
        The name of the external ID column, by default None.
    national_codes : list, optional
        List of ISO2 country codes to include national datasets.
    unit_type : str, optional
        Whether to use hectares ("ha") or percentage ("percent"), by default "ha".

    Returns
    -------
    None
    """
    try:
        task = ee.batch.Export.table.toDrive(
            collection=whisp_stats_ee_to_ee(
                feature_collection,
                external_id_column,
                national_codes=national_codes,
                unit_type=unit_type,
            ),
            description="whisp_output_table",
            # folder="whisp_results",
            fileFormat="CSV",
        )
        task.start()
        # No Drive folder is passed above, so the message must not claim the
        # file lands in 'whisp_results/'.
        print(
            "Exporting to Google Drive: 'whisp_output_table.csv' (no Drive folder set). To track progress: https://code.earthengine.google.com/tasks"
        )
    except Exception as e:
        print(f"An error occurred during the export: {e}")
571
+
572
+
573
+ #### main stats functions
574
+
575
+
576
+ # Get stats for a feature or feature collection
577
def get_stats(feature_or_feature_col, national_codes=None, unit_type="ha"):
    """
    Compute Whisp statistics for a single feature or a whole feature collection.

    Parameters
    ----------
    feature_or_feature_col : ee.Feature or ee.FeatureCollection
        The input feature or feature collection to analyze
    national_codes : list, optional
        List of ISO2 country codes to include national datasets
    unit_type : str, optional
        Whether to use hectares ("ha") or percentage ("percent"), by default "ha".

    Returns
    -------
    ee.FeatureCollection
        Feature collection with calculated statistics. For any other input
        type a message string is returned instead.
    """
    if isinstance(feature_or_feature_col, ee.Feature):
        print("feature")
        # A lone feature is analyzed against the combined image directly and
        # wrapped so callers always receive a collection.
        combined_image = combine_datasets(national_codes=national_codes)
        single_result = get_stats_feature(
            feature_or_feature_col, combined_image, unit_type=unit_type
        )
        return ee.FeatureCollection([single_result])

    if isinstance(feature_or_feature_col, ee.FeatureCollection):
        return get_stats_fc(
            feature_or_feature_col, national_codes=national_codes, unit_type=unit_type
        )

    return "Check inputs: not an ee.Feature or ee.FeatureCollection"
615
+
616
+
617
+ # Get statistics for a feature collection
618
def get_stats_fc(feature_col, national_codes=None, unit_type="ha"):
    """
    Compute Whisp statistics for each feature of a collection.

    Parameters
    ----------
    feature_col : ee.FeatureCollection
        The input feature collection to analyze
    national_codes : list, optional
        List of ISO2 country codes (e.g., ["BR", "US"]) forwarded to
        `combine_datasets` to include national datasets.
    unit_type : str, optional
        Whether to use hectares ("ha") or percentage ("percent"), by default "ha".

    Returns
    -------
    ee.FeatureCollection
        Feature collection with calculated statistics
    """
    # The combined image is built once and shared by every feature.
    combined_image = combine_datasets(national_codes=national_codes)

    def _stats_for(feature):
        return get_stats_feature(feature, combined_image, unit_type=unit_type)

    return ee.FeatureCollection(feature_col.map(_stats_for))
651
+
652
+
653
+ # Get statistics for a single feature
654
+
655
+
656
def get_stats_feature(feature, img_combined, unit_type="ha"):
    """
    Get statistics for a single feature using a pre-combined image.

    Parameters
    ----------
    feature : ee.Feature
        The feature to analyze
    img_combined : ee.Image
        Pre-combined image with all the datasets
    unit_type : str, optional
        Whether to use hectares ("ha") or percentage ("percent"), by default "ha".
        Any value other than "ha" yields percentages, as before.

    Returns
    -------
    ee.Feature
        Feature with calculated statistics
    """
    # Sum every band of the combined image over the feature's geometry.
    reduce = img_combined.reduceRegion(
        reducer=ee.Reducer.sum(),
        geometry=feature.geometry(),
        scale=10,
        maxPixels=1e10,
        tileScale=8,
    )

    # Get basic feature information
    feature_info = get_type_and_location(feature)

    # add statistics unit type (e.g., percentage or hectares) to dictionary
    stats_unit_type = ee.Dictionary({stats_unit_type_column: unit_type})

    # Divide every summed value by 10,000 and apply the area formatting
    # (presumably square metres to hectares; confirm against combine_datasets).
    reduce_ha = reduce.map(
        lambda key, val: divide_and_format(ee.Number(val), ee.Number(10000))
    )

    # Get value for hectares
    area_ha = ee.Number(ee.Dictionary(reduce_ha).get(geometry_area_column))

    # unit_type is a plain Python string, so the branch is decided client-side
    # and only the requested set of statistics is added to the request graph.
    if unit_type == "ha":
        stats = reduce_ha
    else:
        stats = reduce_ha.map(
            lambda key, val: percent_and_format(ee.Number(val), area_ha)
        )

    # The plot area itself is always reported in hectares (formatted).
    stats = stats.set(
        geometry_area_column, area_ha.format(geometry_area_column_formatting)
    )

    # Add location info and the unit type onto the analysis results
    properties = feature_info.combine(ee.Dictionary(stats)).combine(stats_unit_type)

    return feature.set(properties)
729
+
730
+
731
+ # Get basic feature information - uses admin and water datasets in gee.
732
def get_type_and_location(feature):
    """
    Collect basic information about a feature as an ee.Dictionary: country and
    admin name (from geoBoundaries), geometry type, centroid coordinates and
    the water flag sampled at the centroid.
    """
    # All lookups are made at the centroid (computed with a maxError of 1).
    centroid = feature.geometry().centroid(1)

    # Country / admin attributes of the boundary polygon at the centroid.
    location = ee.Dictionary(get_geoboundaries_info(centroid))

    centroid_coords = centroid.coordinates()
    base_info = ee.Dictionary(
        {
            iso3_country_column: location.get("shapeGroup"),
            admin_1_column: location.get("shapeName"),
            geometry_type_column: feature.geometry().type(),
            centroid_x_coord_column: centroid_coords.get(0),  # Longitude
            centroid_y_coord_column: centroid_coords.get(1),  # Latitude
        }
    )

    # Water flag value of the combined ocean / inland-water image at the centroid.
    water_info = value_at_point_flag(
        point=centroid,
        image=water_flag_all_prep(),
        band_name=water_flag,
        output_name=water_flag,
    )

    return base_info.combine(water_info)
773
+
774
+
775
+ # Define a function to divide each value by 10,000 and format it with one decimal place
776
def divide_and_format(val, unit):
    """
    Divide `val` by `unit` and round-trip the result through the
    `stats_area_columns_formatting` format string, returning an ee.Number.
    """
    quotient = ee.Number(ee.Number(val).divide(ee.Number(unit)))
    # Formatting to text and parsing back applies the configured precision.
    as_text = quotient.format(stats_area_columns_formatting)
    return ee.Number(ee.Number.parse(as_text))
785
+
786
+
787
+ # Define a function to divide by total area of geometry and multiply by 100
788
def percent_and_format(val, area_ha):
    """
    Express `val` as a percentage of `area_ha` and round-trip the result
    through the `stats_percent_columns_formatting` format string.
    """
    percentage = ee.Number(ee.Number(val).divide(area_ha).multiply(ee.Number(100)))
    # Formatting to text and parsing back applies the configured precision.
    as_text = percentage.format(stats_percent_columns_formatting)
    return ee.Number(ee.Number.parse(as_text))
796
+
797
+
798
+ # geoboundaries - admin units from a frequently updated database, allows commercial use (CC BY 4.0 DEED) (disputed territories may need checking)
799
def get_geoboundaries_info(geometry):
    """
    Look up the geoBoundaries ADM1 polygon intersecting `geometry` and return
    its "shapeGroup" and "shapeName" values; both are "Unknown" when no
    polygon intersects.
    """
    adm1_boundaries = ee.FeatureCollection("WM/geoLab/geoBoundaries/600/ADM1")
    matches = adm1_boundaries.filterBounds(geometry)

    fallback = ee.Dictionary({"shapeGroup": "Unknown", "shapeName": "Unknown"})
    first_match_info = (
        matches.first().toDictionary().select(["shapeGroup", "shapeName"])
    )

    # Server-side choice: use the first match when one exists, else the fallback.
    return ee.Algorithms.If(matches.size().gt(0), first_match_info, fallback)
810
+
811
+
812
+ #####
813
+ # water flag - to flag plots that may be erroneous (i.e., where errors may have occurred in their creation / translation and so fall in either the ocean or inland water)
814
def usgs_gsv_ocean_prep():  # TO DO: for speed export image as an asset at same res as JRC
    """
    Build a single-band "ocean" image from the shoreline vector datasets
    (mainlands, big islands, small islands).

    The land polygons are painted onto a constant image of 1 and the result is
    self-masked. Presumably paint() writes its default value of 0 over land,
    leaving 1 only outside the coastline (confirm against the
    ee.Image.paint documentation).
    """
    shoreline_assets = [
        "projects/sat-io/open-datasets/shoreline/mainlands",
        "projects/sat-io/open-datasets/shoreline/big_islands",
        "projects/sat-io/open-datasets/shoreline/small_islands",
    ]

    # Merge the three land datasets into one FeatureCollection.
    land = ee.FeatureCollection(
        [ee.FeatureCollection(asset_id) for asset_id in shoreline_assets]
    ).flatten()

    # Rasterize the land polygons, mask zeros and rename the band.
    return ee.Image(1).paint(land).selfMask().rename("ocean")
835
+
836
+
837
def jrc_water_surface_prep():
    """
    Build a "water_inland" image from the JRC Global Surface Water transition band.

    Transition classes 1, 2 and 7 ("Permanent", "New Permanent" and
    "Seasonal to Permanent") become 1; every other class becomes 0, and
    masked pixels are unmasked.
    """
    transition = ee.Image("JRC/GSW1_4/GlobalSurfaceWater").select("transition")

    permanent_classes = [1, 2, 7]
    permanent_water = transition.remap(permanent_classes, [1, 1, 1], 0).unmask()

    # Clipping to the coastline is possible but not needed currently.
    return permanent_water.rename("water_inland")
853
+
854
+
855
def water_flag_all_prep():
    """
    Combine the ocean and inland-water layers into one water-flag image.

    Where the inland water image is non-zero the output is set to 1; elsewhere
    the unmasked ocean image is used. The band is renamed to the module-level
    ``water_flag`` name.

    Returns:
        ee.Image with one band named by ``water_flag``.
    """
    ocean = usgs_gsv_ocean_prep().unmask()
    inland_water = jrc_water_surface_prep()
    combined = ocean.where(inland_water, 1)
    return combined.rename(water_flag)
863
+
864
+
865
def value_at_point_flag(point, image, band_name, output_name):
    """
    Sample an image at a point and wrap the band value in a dictionary.

    Args:
        point: Region passed to ``image.sample`` (one pixel at 30 m scale).
        image: Image to sample.
        band_name: Name of the band whose sampled value is read.
        output_name: Key under which the value is stored in the output.

    Returns:
        ee.Dictionary mapping ``output_name`` to the sampled band value
        (the raw value, not converted to a "True"/"False" string).
    """
    sampled_feature = image.sample(region=point, scale=30, numPixels=1).first()
    return ee.Dictionary({output_name: sampled_feature.get(band_name)})
877
+
878
+
879
def add_id_to_feature_collection(dataset, id_name):
    """
    Add an incremental (1, 2, 3, ...) id property to every feature.

    Args:
        dataset: ee.FeatureCollection to operate on.
        id_name: Name of the property that receives the incremental id.

    Returns:
        ee.FeatureCollection in which each feature has ``id_name`` set to the
        sequence number paired with its "system:index".
    """
    # Pair each system:index with a number from 1..N in a lookup dictionary.
    system_indexes = dataset.aggregate_array("system:index")
    sequence = ee.List.sequence(1, system_indexes.size())
    id_lookup = ee.Dictionary.fromLists(system_indexes, sequence)

    # Tag each feature with the number registered for its system:index.
    return dataset.map(
        lambda feature: feature.set(
            id_name, id_lookup.get(feature.get("system:index"))
        )
    )
913
+
914
+
915
+ # Function to add ID to features
916
def add_id_to_feature(feature, id_name):
    """Return the feature with its "system:index" copied into the ``id_name`` property."""
    return feature.set(id_name, feature.get("system:index"))
919
+
920
+
921
+ # Function to flag positive values
922
def flag_positive_values(feature, flag_positive):
    """
    Replace selected numeric properties with a presence flag.

    Args:
        feature: Feature whose properties are rewritten.
        flag_positive: Iterable of property names to convert.

    Returns:
        The feature with each listed property set to "True" when its value is
        greater than 0 and to "-" otherwise. With no names listed the input
        feature is returned untouched.
    """
    flagged = feature
    for name in flag_positive:
        is_positive = ee.Number(flagged.get(name)).gt(0)
        flagged = flagged.set(name, ee.Algorithms.If(is_positive, "True", "-"))
    return flagged
929
+
930
+
931
+ # Function to exclude properties
932
def copy_properties_and_exclude(feature, exclude_properties_from_output):
    """
    Rebuild a feature from its geometry, copying all but the excluded properties.

    Args:
        feature: Source feature providing geometry and properties.
        exclude_properties_from_output: Property names passed as ``exclude``
            to ``copyProperties``.

    Returns:
        The result of ``copyProperties`` on a new feature built from the
        source geometry.
    """
    geometry_only = ee.Feature(feature.geometry())
    return geometry_only.copyProperties(
        source=feature, exclude=exclude_properties_from_output
    )
936
+
937
+
938
def ee_image_checker(image):
    """
    Test whether the input is a valid ee.Image.

    Args:
        image: Object expected to be an ee.Image.

    Returns:
        bool: True when Earth Engine reports the object type as "Image" and
        fetching its info succeeds; False otherwise. Failures are printed,
        not raised.
    """
    is_valid = False
    try:
        object_type = ee.Algorithms.ObjectType(image).getInfo()
        if object_type == "Image":
            # Force evaluation; an invalid image raises here.
            image.getInfo()
            is_valid = True
    except ee.EEException as e:
        print(f"Image validation failed with EEException: {e}")
    except Exception as e:
        print(f"Image validation failed with exception: {e}")
    return is_valid
958
+
959
+
960
def keep_valid_images(image_list):
    """
    Filter a list down to the entries that pass ``ee_image_checker``.

    Args:
        image_list: List of candidate ee.Image objects.

    Returns:
        list: The valid images, in their original order.
    """
    return [candidate for candidate in image_list if ee_image_checker(candidate)]
975
+
976
+
977
def convert_iso3_to_iso2(df, iso3_column, iso2_column):
    """
    Add a column of ISO2 country codes derived from an ISO3 column.

    Args:
        df (pd.DataFrame): Input DataFrame containing ISO3 country codes.
            It is modified in place.
        iso3_column (str): Name of the column holding ISO3 country codes.
        iso2_column (str): Name of the column that receives the ISO2 codes.

    Returns:
        pd.DataFrame: The same DataFrame with ``iso2_column`` filled in.
        Falsy ISO3 values map to "not found (disputed territory)".
    """
    import country_converter as coco

    def to_iso2(iso3_code):
        # Empty / missing codes are not sent to the converter.
        if not iso3_code:
            return "not found (disputed territory)"
        return coco.convert(names=iso3_code, to="ISO2")

    df[iso2_column] = df[iso3_column].apply(to_iso2)
    return df
999
+
1000
+
1001
def validate_external_id_column(feature_collection, external_id_column):
    """
    Check that ``external_id_column`` is present on every feature.

    Parameters
    ----------
    feature_collection : ee.FeatureCollection
        The feature collection to validate
    external_id_column : str
        The name of the external ID column to check

    Returns
    -------
    dict
        Validation report with the keys:
        - 'is_valid': bool, True only if every feature has the column
        - 'total_features': int, number of features in the collection
        - 'features_with_column': int, number of features having the column
        - 'available_properties': list, property names of the first feature
        - 'error_message': str or None, explanation when validation fails

        Any exception raised while querying is caught and reported through
        'error_message' with the counts set to 0.
    """

    def build_report(is_valid, total, with_column, properties, message):
        # Single place that defines the shape of the returned dictionary.
        return {
            "is_valid": is_valid,
            "total_features": total,
            "features_with_column": with_column,
            "available_properties": properties,
            "error_message": message,
        }

    try:
        total_features = feature_collection.size().getInfo()
        if total_features == 0:
            return build_report(False, 0, 0, [], "Feature collection is empty")

        # Property names of the first feature, used for the report/message.
        first_feature_props = feature_collection.first().propertyNames().getInfo()

        # Tag each feature with whether it carries the column, then count.
        tagged = feature_collection.map(
            lambda feature: feature.set(
                "_has_external_id",
                feature.propertyNames().contains(external_id_column),
            )
        )
        has_column = ee.Filter.eq("_has_external_id", True)
        features_with_column = tagged.filter(has_column).size().getInfo()

        if features_with_column == total_features:
            return build_report(
                True, total_features, features_with_column, first_feature_props, None
            )

        missing_count = total_features - features_with_column
        error_message = (
            f"The column '{external_id_column}' is missing from {missing_count} "
            f"out of {total_features} features in the collection. "
            f"Available properties in first feature: {first_feature_props}"
        )
        return build_report(
            False,
            total_features,
            features_with_column,
            first_feature_props,
            error_message,
        )

    except Exception as e:
        return build_report(False, 0, 0, [], f"Error during validation: {str(e)}")
1077
+
1078
+
1079
def debug_feature_collection_properties(feature_collection, max_features=5):
    """
    Debug helper to inspect the properties of features in a collection.

    Parameters
    ----------
    feature_collection : ee.FeatureCollection
        The feature collection to inspect
    max_features : int, optional
        Maximum number of features to inspect, by default 5

    Returns
    -------
    dict
        On success: 'total_features', 'inspected_features' and
        'feature_details' (one entry per inspected feature with
        'feature_index', 'properties' and 'geometry_type').
        For an empty collection: 'total_features' (0) and 'error'.
        On any exception: only 'error', holding the exception text.
    """
    try:
        total_features = feature_collection.size().getInfo()

        if total_features == 0:
            return {"total_features": 0, "error": "Feature collection is empty"}

        # Limit the number of features to inspect
        features_to_check = min(max_features, total_features)
        limited_fc = feature_collection.limit(features_to_check)

        def get_feature_properties(feature):
            # A function mapped over a FeatureCollection must return a Feature
            # (or Image); returning a bare ee.Dictionary makes the request
            # fail. Carry the summary as properties of a geometry-less
            # feature so it comes back under ["features"][i]["properties"].
            return ee.Feature(
                None,
                {
                    "properties": feature.propertyNames(),
                    "geometry_type": feature.geometry().type(),
                },
            )

        feature_info = limited_fc.map(get_feature_properties).getInfo()

        return {
            "total_features": total_features,
            "inspected_features": features_to_check,
            "feature_details": [
                {
                    "feature_index": index,
                    "properties": summary["properties"]["properties"],
                    "geometry_type": summary["properties"]["geometry_type"],
                }
                for index, summary in enumerate(feature_info["features"])
            ],
        }

    except Exception as e:
        # Best-effort debug helper: report the failure instead of raising.
        return {"error": f"Error during debugging: {str(e)}"}