openforis-whisp 2.0.0a6__py3-none-any.whl → 2.0.0b1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
openforis_whisp/risk.py CHANGED
@@ -1,771 +1,848 @@
1
- import pandas as pd
2
-
3
- from .pd_schemas import data_lookup_type
4
-
5
-
6
- from openforis_whisp.parameters.config_runtime import (
7
- geometry_area_column,
8
- DEFAULT_GEE_DATASETS_LOOKUP_TABLE_PATH,
9
- stats_unit_type_column, # Add this import
10
- )
11
-
12
- from openforis_whisp.reformat import filter_lookup_by_country_codes
13
-
14
- # could embed this in each function below that uses lookup_gee_datasets_df.
15
- lookup_gee_datasets_df: data_lookup_type = pd.read_csv(
16
- DEFAULT_GEE_DATASETS_LOOKUP_TABLE_PATH
17
- )
18
-
19
-
20
- # requires lookup_gee_datasets_df
21
-
22
-
23
- # Add function to detect unit type from dataframe
24
- def detect_unit_type(df, explicit_unit_type=None):
25
- """
26
- Determine the unit type from the dataframe or use the override value.
27
-
28
- Args:
29
- df (DataFrame): Input DataFrame.
30
- explicit_unit_type (str, optional): Override unit type ('ha' or 'percent').
31
-
32
- Returns:
33
- str: The unit type to use for calculations.
34
-
35
- Raises:
36
- ValueError: If the unit type can't be determined and no override is provided,
37
- or if there are mixed unit types in the dataframe.
38
- """
39
- # If override is provided, use it
40
- if explicit_unit_type is not None:
41
- if explicit_unit_type not in ["ha", "percent"]:
42
- raise ValueError(
43
- f"Invalid unit type: {explicit_unit_type}. Must be 'ha' or 'percent'."
44
- )
45
- return explicit_unit_type
46
-
47
- # Check if unit type column exists in the dataframe
48
- if stats_unit_type_column not in df.columns:
49
- raise ValueError(
50
- f"Column '{stats_unit_type_column}' not found in dataframe. "
51
- "Please provide 'explicit_unit_type' parameter to specify the unit type."
52
- )
53
-
54
- # Get unique values from the column
55
- unit_types = df[stats_unit_type_column].unique()
56
-
57
- # Check for mixed unit types
58
- if len(unit_types) > 1:
59
- raise ValueError(
60
- f"Mixed unit types in dataframe: {unit_types}. All rows must use the same unit type."
61
- )
62
-
63
- # Get the single unit type
64
- unit_type = unit_types[0]
65
-
66
- # Validate that the unit type is recognized
67
- if unit_type not in ["ha", "percent"]:
68
- raise ValueError(
69
- f"Unrecognized unit type: {unit_type}. Must be 'ha' or 'percent'."
70
- )
71
-
72
- return unit_type
73
-
74
-
75
- # Update whisp_risk to accept and pass the unit_type parameter
76
- def whisp_risk(
77
- df: data_lookup_type, # CHECK THIS
78
- ind_1_pcent_threshold: float = 10, # default values (draft decision tree and parameters)
79
- ind_2_pcent_threshold: float = 10, # default values (draft decision tree and parameters)
80
- ind_3_pcent_threshold: float = 10, # default values (draft decision tree and parameters)
81
- ind_4_pcent_threshold: float = 10, # default values (draft decision tree and parameters)
82
- ind_5_pcent_threshold: float = 10, # default values (draft decision tree and parameters)
83
- ind_6_pcent_threshold: float = 10, # default values (draft decision tree and parameters)
84
- ind_7_pcent_threshold: float = 10, # default values (draft decision tree and parameters)
85
- ind_8_pcent_threshold: float = 10, # default values (draft decision tree and parameters)
86
- ind_9_pcent_threshold: float = 10, # default values (draft decision tree and parameters)
87
- ind_10_pcent_threshold: float = 10, # default values (draft decision tree and parameters)
88
- ind_11_pcent_threshold: float = 10, # default values (draft decision tree and parameters)
89
- ind_1_input_columns: pd.Series = None, # see lookup_gee_datasets for details
90
- ind_2_input_columns: pd.Series = None, # see lookup_gee_datasets for details
91
- ind_3_input_columns: pd.Series = None, # see lookup_gee_datasets for details
92
- ind_4_input_columns: pd.Series = None, # see lookup_gee_datasets for details
93
- ind_5_input_columns: pd.Series = None, # see lookup_gee_datasets for details
94
- ind_6_input_columns: pd.Series = None, # see lookup_gee_datasets for details
95
- ind_7_input_columns: pd.Series = None, # see lookup_gee_datasets for details
96
- ind_8_input_columns: pd.Series = None, # see lookup_gee_datasets for details
97
- ind_9_input_columns: pd.Series = None, # see lookup_gee_datasets for details
98
- ind_10_input_columns: pd.Series = None, # see lookup_gee_datasets for details
99
- ind_11_input_columns: pd.Series = None, # see lookup_gee_datasets for details
100
- ind_1_name: str = "Ind_01_treecover",
101
- ind_2_name: str = "Ind_02_commodities",
102
- ind_3_name: str = "Ind_03_disturbance_before_2020",
103
- ind_4_name: str = "Ind_04_disturbance_after_2020",
104
- ind_5_name: str = "Ind_05_primary_2020",
105
- ind_6_name: str = "Ind_06_nat_reg_forest_2020",
106
- ind_7_name: str = "Ind_07_planted_plantations_2020",
107
- ind_8_name: str = "Ind_08_planted_plantations_after_2020",
108
- ind_9_name: str = "Ind_09_treecover_after_2020",
109
- ind_10_name: str = "Ind_10_agri_after_2020",
110
- ind_11_name: str = "Ind_11_logging_concession_before_2020",
111
- low_name: str = "no",
112
- high_name: str = "yes",
113
- explicit_unit_type: str = None,
114
- national_codes: list[str] = None, # List of ISO2 country codes to filter by
115
- ) -> data_lookup_type:
116
- """
117
- Adds the EUDR (European Union Deforestation Risk) column to the DataFrame based on indicator values.
118
-
119
- Args:
120
- df (DataFrame): Input DataFrame.
121
- ind_1_pcent_threshold (int, optional): Percentage threshold for the first indicator. Defaults to 10.
122
- ind_2_pcent_threshold (int, optional): Percentage threshold for the second indicator. Defaults to 10.
123
- ind_3_pcent_threshold (int, optional): Percentage threshold for the third indicator. Defaults to 10.
124
- ind_4_pcent_threshold (int, optional): Percentage threshold for the fourth indicator. Defaults to 10.
125
- ind_1_input_columns (list, optional): List of input columns for the first indicator. Defaults to columns for the treecover theme.
126
- ind_2_input_columns (list, optional): List of input columns for the second indicator. Defaults to columns for the commodities theme.
127
- ind_3_input_columns (list, optional): List of input columns for the third indicator. Defaults to columns for disturbance before 2020.
128
- ind_4_input_columns (list, optional): List of input columns for the fourth indicator. Defaults to columns for disturbance after 2020.
129
- ind_1_name (str, optional): Name of the first indicator column. Defaults to "Ind_01_treecover".
131
- ind_2_name (str, optional): Name of the second indicator column. Defaults to "Ind_02_commodities".
132
- ind_3_name (str, optional): Name of the third indicator column. Defaults to "Ind_03_disturbance_before_2020".
133
- ind_4_name (str, optional): Name of the fourth indicator column. Defaults to "Ind_04_disturbance_after_2020".
133
- low_name (str, optional): Value shown in table if less than or equal to the threshold. Defaults to "no".
134
- high_name (str, optional): Value shown in table if more than the threshold. Defaults to "yes".
135
- explicit_unit_type (str, optional): Override the autodetected unit type ('ha' or 'percent').
136
- If not provided, will detect from dataframe 'unit' column.
137
-
138
- Returns:
139
- data_lookup_type: DataFrame with added 'EUDR_risk' column.
140
- """
141
- # Determine the unit type to use based on input data and override
142
- unit_type = detect_unit_type(df, explicit_unit_type)
143
-
144
- print(f"Using unit type: {unit_type}")
145
-
146
- lookup_df_copy = lookup_gee_datasets_df.copy()
147
-
148
- # filter by national codes (even if None - this removes all country columns unless specified)
149
- filtered_lookup_gee_datasets_df = filter_lookup_by_country_codes(
150
- lookup_df=lookup_df_copy,
151
- filter_col="ISO2_code",
152
- national_codes=national_codes,
153
- )
154
-
155
- # Rest of the function remains the same, but pass unit_type to add_indicators
156
- if ind_1_input_columns is None:
157
- ind_1_input_columns = get_cols_ind_01_treecover(filtered_lookup_gee_datasets_df)
158
- if ind_2_input_columns is None:
159
- ind_2_input_columns = get_cols_ind_02_commodities(
160
- filtered_lookup_gee_datasets_df
161
- )
162
- if ind_3_input_columns is None:
163
- ind_3_input_columns = get_cols_ind_03_dist_before_2020(
164
- filtered_lookup_gee_datasets_df
165
- )
166
- if ind_4_input_columns is None:
167
- ind_4_input_columns = get_cols_ind_04_dist_after_2020(
168
- filtered_lookup_gee_datasets_df
169
- )
170
- if ind_5_input_columns is None:
171
- ind_5_input_columns = get_cols_ind_05_primary_2020(
172
- filtered_lookup_gee_datasets_df
173
- )
174
- if ind_6_input_columns is None:
175
- ind_6_input_columns = get_cols_ind_06_nat_reg_2020(
176
- filtered_lookup_gee_datasets_df
177
- )
178
- if ind_7_input_columns is None:
179
- ind_7_input_columns = get_cols_ind_07_planted_2020(
180
- filtered_lookup_gee_datasets_df
181
- )
182
- if ind_8_input_columns is None:
183
- ind_8_input_columns = get_cols_ind_08_planted_after_2020(
184
- filtered_lookup_gee_datasets_df
185
- )
186
- if ind_9_input_columns is None:
187
- ind_9_input_columns = get_cols_ind_09_treecover_after_2020(
188
- filtered_lookup_gee_datasets_df
189
- )
190
- if ind_10_input_columns is None:
191
- ind_10_input_columns = get_cols_ind_10_agri_after_2020(
192
- filtered_lookup_gee_datasets_df
193
- )
194
- if ind_11_input_columns is None:
195
- ind_11_input_columns = get_cols_ind_11_logging_before_2020(
196
- filtered_lookup_gee_datasets_df
197
- )
198
-
199
- # Check range of values
200
- check_range(ind_1_pcent_threshold)
201
- check_range(ind_2_pcent_threshold)
202
- check_range(ind_3_pcent_threshold)
203
- check_range(ind_4_pcent_threshold)
204
- check_range(ind_5_pcent_threshold)
205
- check_range(ind_6_pcent_threshold)
206
- check_range(ind_7_pcent_threshold)
207
- check_range(ind_8_pcent_threshold)
208
- check_range(ind_9_pcent_threshold)
209
- check_range(ind_10_pcent_threshold)
210
- check_range(ind_11_pcent_threshold)
211
-
212
- input_cols = [
213
- ind_1_input_columns,
214
- ind_2_input_columns,
215
- ind_3_input_columns,
216
- ind_4_input_columns,
217
- ind_5_input_columns,
218
- ind_6_input_columns,
219
- ind_7_input_columns,
220
- ind_8_input_columns,
221
- ind_9_input_columns,
222
- ind_10_input_columns,
223
- ind_11_input_columns,
224
- ]
225
- thresholds = [
226
- ind_1_pcent_threshold,
227
- ind_2_pcent_threshold,
228
- ind_3_pcent_threshold,
229
- ind_4_pcent_threshold,
230
- ind_5_pcent_threshold,
231
- ind_6_pcent_threshold,
232
- ind_7_pcent_threshold,
233
- ind_8_pcent_threshold,
234
- ind_9_pcent_threshold,
235
- ind_10_pcent_threshold,
236
- ind_11_pcent_threshold,
237
- ]
238
- names = [
239
- ind_1_name,
240
- ind_2_name,
241
- ind_3_name,
242
- ind_4_name,
243
- ind_5_name,
244
- ind_6_name,
245
- ind_7_name,
246
- ind_8_name,
247
- ind_9_name,
248
- ind_10_name,
249
- ind_11_name,
250
- ]
251
- [check_range(threshold) for threshold in thresholds]
252
-
253
- df_w_indicators = add_indicators(
254
- df,
255
- input_cols,
256
- thresholds,
257
- names,
258
- low_name,
259
- high_name,
260
- unit_type, # Pass the unit type
261
- )
262
-
263
- df_w_indicators_and_risk_pcrop = add_eudr_risk_pcrop_col(
264
- df=df_w_indicators,
265
- ind_1_name=ind_1_name,
266
- ind_2_name=ind_2_name,
267
- ind_3_name=ind_3_name,
268
- ind_4_name=ind_4_name,
269
- )
270
-
271
- df_w_indicators_and_risk_acrop = add_eudr_risk_acrop_col(
272
- df=df_w_indicators,
273
- ind_1_name=ind_1_name,
274
- ind_2_name=ind_2_name,
275
- ind_4_name=ind_4_name,
276
- )
277
-
278
- df_w_indicators_and_risk_timber = add_eudr_risk_timber_col(
279
- df=df_w_indicators,
280
- ind_2_name=ind_2_name,
281
- ind_5_name=ind_5_name,
282
- ind_6_name=ind_6_name,
283
- ind_7_name=ind_7_name,
284
- ind_8_name=ind_8_name,
285
- ind_9_name=ind_9_name,
286
- ind_10_name=ind_10_name,
287
- ind_11_name=ind_11_name,
288
- )
289
-
290
- return df_w_indicators_and_risk_timber
291
-
292
-
293
- def add_eudr_risk_pcrop_col(
294
- df: data_lookup_type,
295
- ind_1_name: str,
296
- ind_2_name: str,
297
- ind_3_name: str,
298
- ind_4_name: str,
299
- ) -> data_lookup_type:
300
- """
301
- Adds the EUDR (European Union Deforestation Risk) column to the DataFrame based on indicator values.
302
-
303
- Args:
304
- df (DataFrame): Input DataFrame.
305
- ind_1_name (str, optional): Name of first indicator column. Defaults to "Ind_01_treecover".
306
- ind_2_name (str, optional): Name of second indicator column. Defaults to "Ind_02_commodities".
307
- ind_3_name (str, optional): Name of third indicator column. Defaults to "Ind_03_disturbance_before_2020".
308
- ind_4_name (str, optional): Name of fourth indicator column. Defaults to "Ind_04_disturbance_after_2020".
309
-
310
- Returns:
311
- DataFrame: DataFrame with added 'EUDR_risk' column.
312
- """
313
-
314
- for index, row in df.iterrows():
315
- # If any of the first three indicators suggest low risk, set EUDR_risk to "low"
316
- if (
317
- row[ind_1_name] == "no"
318
- or row[ind_2_name] == "yes"
319
- or row[ind_3_name] == "yes"
320
- ):
321
- df.at[index, "risk_pcrop"] = "low"
322
- # If none of the first three indicators suggest low risk and Indicator 4 suggests no risk, set EUDR_risk to "more_info_needed"
323
- elif row[ind_4_name] == "no":
324
- df.at[index, "risk_pcrop"] = "more_info_needed"
325
- # If none of the above conditions are met, set EUDR_risk to "high"
326
- else:
327
- df.at[index, "risk_pcrop"] = "high"
328
-
329
- return df
330
-
331
-
332
- def add_eudr_risk_acrop_col(
333
- df: data_lookup_type,
334
- ind_1_name: str,
335
- ind_2_name: str,
336
- ind_4_name: str,
337
- ) -> data_lookup_type:
338
- """
339
- Adds the EUDR (European Union Deforestation Risk) column to the DataFrame based on indicator values.
340
-
341
- Args:
342
- df (DataFrame): Input DataFrame.
343
- ind_1_name (str, optional): Name of first indicator column. Defaults to "Ind_01_treecover".
344
- ind_2_name (str, optional): Name of second indicator column. Defaults to "Ind_02_commodities".
345
- ind_4_name (str, optional): Name of fourth indicator column. Defaults to "Ind_04_disturbance_after_2020".
346
-
347
- Returns:
348
- DataFrame: DataFrame with added 'EUDR_risk' column.
349
- """
350
-
351
- # soy risk
352
- for index, row in df.iterrows():
353
- # If there is no tree cover in 2020, set EUDR_risk_soy to "low"
354
- if row[ind_1_name] == "no" or row[ind_2_name] == "yes":
355
- df.at[index, "risk_acrop"] = "low"
356
- # If there is tree cover in 2020 and disturbances post 2020, set EUDR_risk_soy to "high"
357
- elif row[ind_1_name] == "yes" and row[ind_4_name] == "yes":
358
- df.at[index, "risk_acrop"] = "high"
359
- # If tree cover and no disturbances post 2020, set EUDR_risk to "more_info_needed"
360
- else:
361
- df.at[index, "risk_acrop"] = "more_info_needed"
362
-
363
- return df
364
-
365
-
366
- def add_eudr_risk_timber_col(
367
- df: data_lookup_type,
368
- ind_2_name: str,
369
- ind_5_name: str,
370
- ind_6_name: str,
371
- ind_7_name: str,
372
- ind_8_name: str,
373
- ind_9_name: str,
374
- ind_10_name: str,
375
- ind_11_name: str,
376
- ) -> data_lookup_type:
377
- """
378
- Adds the EUDR (European Union Deforestation Risk) column to the DataFrame based on indicator values.
379
-
380
- Args:
381
- df (DataFrame): Input DataFrame.
382
- ind_2_name (str, optional): Name of second indicator column. Defaults to "Ind_02_commodities".
383
- ind_5_name (str, optional): Name of fifth indicator column. Defaults to "Ind_05_primary_2020".
384
- ind_6_name (str, optional): Name of sixth indicator column. Defaults to "Ind_06_nat_reg_forest_2020".
385
- ind_7_name (str, optional): Name of seventh indicator column. Defaults to "Ind_07_planted_plantations_2020".
386
- ind_8_name (str, optional): Name of eighth indicator column. Defaults to "Ind_08_planted_plantations_after_2020".
387
- ind_9_name (str, optional): Name of ninth indicator column. Defaults to "Ind_09_treecover_after_2020".
388
- ind_10_name (str, optional): Name of tenth indicator column. Defaults to "Ind_10_agri_after_2020".
389
- ind_11_name (str, optional): Name of eleventh indicator column. Defaults to "Ind_11_logging_concession_before_2020".
390
-
391
- Returns:
392
- DataFrame: DataFrame with added 'EUDR_risk' column.
393
- """
394
-
395
- for index, row in df.iterrows():
396
- # If there is a commodity in 2020 (ind_2_name)
397
- # OR if there is planted-plantation in 2020 (ind_7_name) AND no agriculture in 2023 (ind_10_name), set EUDR_risk_timber to "low"
398
- if row[ind_2_name] == "yes" or (
399
- row[ind_7_name] == "yes" and row[ind_10_name] == "no"
400
- ):
401
- df.at[index, "risk_timber"] = "low"
402
- # If there is a natural forest primary (ind_5_name) or naturally regenerating (ind_6_name) or planted forest (ind_7_name) in 2020 AND agricultural after 2020 (ind_10_name), set EUDR_timber to high
403
- elif (
404
- row[ind_5_name] == "yes"
405
- or row[ind_6_name] == "yes"
406
- or row[ind_7_name] == "yes"
407
- ) and row[ind_10_name] == "yes":
408
- df.at[index, "risk_timber"] = "high"
409
- # If there is a natural forest primary (ind_5_name) or naturally regenerating (ind_6_name) AND planted after 2020 (ind_8_name), set EUDR_risk to "high"
410
- elif (row[ind_5_name] == "yes" or row[ind_6_name] == "yes") and row[
411
- ind_8_name
412
- ] == "yes":
413
- df.at[index, "risk_timber"] = "high"
414
- # No data yet on OWL conversion
415
- # If primary or naturally regenerating or planted forest in 2020 and OWL in 2023, set EUDR_risk to high
416
- # elif (row[ind_5_name] == "yes" or row[ind_6_name] == "yes" or row[ind_7_name] == "yes") and row[ind_10_name] == "yes":
417
- # df.at[index, 'EUDR_risk_timber'] = "high"
418
-
419
- # If there is a natural primary forest (ind_5_name) OR naturally regenerating in 2020 (ind_6_name) AND an information on management practice any time (ind_11_name) OR tree cover or regrowth post 2020 (ind_9_name), set EUDR_risk_timber to "low"
420
- elif (row[ind_5_name] == "yes" or row[ind_6_name] == "yes") and (
421
- row[ind_9_name] == "yes" or row[ind_11_name] == "yes"
422
- ):
423
- df.at[index, "risk_timber"] = "low"
424
- # If primary (ind_5_name) OR naturally regenerating in 2020 (ind_6_name) and no other info, set EUDR_risk to "more_info_needed"
425
- elif row[ind_5_name] == "yes" or row[ind_6_name] == "yes":
426
- df.at[index, "risk_timber"] = "more_info_needed"
427
- # If none of the above conditions are met, set EUDR_risk to "low"
428
- else:
429
- df.at[index, "risk_timber"] = "low"
430
-
431
- return df
432
-
433
-
434
- def add_indicators(
435
- df: data_lookup_type,
436
- input_cols: list[str],
437
- thresholds: list[float],
438
- names: list[str],
439
- low_name: str = "no",
440
- high_name: str = "yes",
441
- unit_type: str = None,
442
- ) -> data_lookup_type:
443
- for input_col, threshold, name in zip(input_cols, thresholds, names):
444
- df = add_indicator_column(
445
- df=df,
446
- input_columns=input_col,
447
- threshold=threshold,
448
- new_column_name=name,
449
- low_name=low_name,
450
- high_name=high_name,
451
- sum_comparison=False,
452
- unit_type=unit_type, # Pass the unit type
453
- )
454
- return df
455
-
456
-
457
- # Update add_indicator_column to use the unit_type parameter
458
- def add_indicator_column(
459
- df: data_lookup_type,
460
- input_columns: list[str],
461
- threshold: float,
462
- new_column_name: str,
463
- low_name: str = "no",
464
- high_name: str = "yes",
465
- sum_comparison: bool = False,
466
- unit_type: str = None, # unit_type parameter
467
- ) -> data_lookup_type:
468
- """
469
- Add a new column to the DataFrame based on the specified columns, threshold, and comparison sign.
470
-
471
- Parameters:
472
- df (data_lookup_type): The pandas DataFrame to which the column will be added.
473
- input_columns (list): List of column names to check for threshold.
474
- threshold (float): The threshold value to compare against.
475
- new_column_name (str): The name of the new column to be added.
476
- The '>' sign is used for comparisons.
477
- When 'sum_comparison' is True, the threshold is compared to the sum of all columns listed in 'input_columns'; when False, each column in the list is compared to the threshold individually.
478
- low_name (str): The name for the value when below or equal to threshold (default is 'no').
479
- high_name (str): The name for the value when above threshold (default is 'yes').
480
- sum_comparison (bool): If True, sum all values in input_columns and compare to threshold (default is False).
481
- unit_type (str): Whether values are in "ha" or "percent".
482
-
483
- Returns:
484
- data_lookup_type: The DataFrame with the new column added.
485
- """
486
- # Create a new column and initialize with low_name
487
- new_column = pd.Series(low_name, index=df.index, name=new_column_name)
488
-
489
- # Default behavior: use '>' for single column comparison
490
- if sum_comparison:
491
- # Sum all values in specified columns and compare to threshold
492
- sum_values = df[input_columns].sum(axis=1)
493
- new_column[sum_values > threshold] = high_name
494
- else:
495
- # Check if any values in specified columns are above the threshold and update the new column accordingly
496
- for col in input_columns:
497
- # So that the threshold is always in percent: if outputs are in ha, the code converts values to percent (by dividing by the geometry_area_column).
498
- # Clamping is needed due to differences in decimal places (meaning input values may go just over 100)
499
- if unit_type == "ha":
500
- df[geometry_area_column] = pd.to_numeric(
501
- df[geometry_area_column], errors="coerce"
502
- )
503
- val_to_check = clamp(
504
- ((df[col] / df[geometry_area_column]) * 100), 0, 100
505
- )
506
- else:
507
- val_to_check = df[col]
508
- new_column[val_to_check > threshold] = high_name
509
-
510
- # Concatenate the new column to the DataFrame
511
- df = pd.concat([df, new_column], axis=1)
512
- return df
513
-
514
-
515
- def get_cols_ind_01_treecover(lookup_gee_datasets_df):
516
- """
517
- Generate a list of dataset names for the treecover theme, excluding those marked for exclusion.
518
-
519
- Args:
520
- lookup_gee_datasets_df (pd.DataFrame): DataFrame containing dataset information.
521
-
522
- Returns:
523
- list: List of dataset names set to be used in the risk calculations for the treecover theme, excluding those marked for exclusion.
524
- """
525
- lookup_gee_datasets_df = lookup_gee_datasets_df[
526
- lookup_gee_datasets_df["exclude_from_output"] != 1
527
- ]
528
- return list(
529
- lookup_gee_datasets_df["name"][
530
- (lookup_gee_datasets_df["use_for_risk"] == 1)
531
- & (lookup_gee_datasets_df["theme"] == "treecover")
532
- ]
533
- )
534
-
535
-
536
- def get_cols_ind_02_commodities(lookup_gee_datasets_df):
537
- """
538
- Generate a list of dataset names for the commodities theme, excluding those marked for exclusion.
539
-
540
- Args:
541
- lookup_gee_datasets_df (pd.DataFrame): DataFrame containing dataset information.
542
-
543
- Returns:
544
- list: List of dataset names set to be used in the risk calculations for the commodities theme, excluding those marked for exclusion.
545
- """
546
- lookup_gee_datasets_df = lookup_gee_datasets_df[
547
- lookup_gee_datasets_df["exclude_from_output"] != 1
548
- ]
549
- return list(
550
- lookup_gee_datasets_df["name"][
551
- (lookup_gee_datasets_df["use_for_risk"] == 1)
552
- & (lookup_gee_datasets_df["theme"] == "commodities")
553
- ]
554
- )
555
-
556
-
557
- def get_cols_ind_03_dist_before_2020(lookup_gee_datasets_df):
558
- """
559
- Generate a list of dataset names for the disturbance before 2020 theme, excluding those marked for exclusion.
560
-
561
- Args:
562
- lookup_gee_datasets_df (pd.DataFrame): DataFrame containing dataset information.
563
-
564
- Returns:
565
- list: List of dataset names set to be used in the risk calculations for the disturbance before 2020 theme, excluding those marked for exclusion.
566
- """
567
- lookup_gee_datasets_df = lookup_gee_datasets_df[
568
- lookup_gee_datasets_df["exclude_from_output"] != 1
569
- ]
570
- return list(
571
- lookup_gee_datasets_df["name"][
572
- (lookup_gee_datasets_df["use_for_risk"] == 1)
573
- & (lookup_gee_datasets_df["theme"] == "disturbance_before")
574
- ]
575
- )
576
-
577
-
578
- def get_cols_ind_04_dist_after_2020(lookup_gee_datasets_df):
579
- """
580
- Generate a list of dataset names for the disturbance after 2020 theme, excluding those marked for exclusion.
581
-
582
- Args:
583
- lookup_gee_datasets_df (pd.DataFrame): DataFrame containing dataset information.
584
-
585
- Returns:
586
- list: List of dataset names set to be used in the risk calculations for the disturbance after 2020 theme, excluding those marked for exclusion.
587
- """
588
- lookup_gee_datasets_df = lookup_gee_datasets_df[
589
- lookup_gee_datasets_df["exclude_from_output"] != 1
590
- ]
591
- return list(
592
- lookup_gee_datasets_df["name"][
593
- (lookup_gee_datasets_df["use_for_risk"] == 1)
594
- & (lookup_gee_datasets_df["theme"] == "disturbance_after")
595
- ]
596
- )
597
-
598
-
599
- def get_cols_ind_05_primary_2020(lookup_gee_datasets_df):
600
- """
601
- Generate a list of dataset names for primary forests in 2020
602
-
603
- Args:
604
- lookup_gee_datasets_df (pd.DataFrame): DataFrame containing dataset information.
605
-
606
- Returns:
607
- list: List of dataset names set to be used in the risk calculations for the degradation - primary forest in 2020, excluding those marked for exclusion.
608
- """
609
- lookup_gee_datasets_df = lookup_gee_datasets_df[
610
- lookup_gee_datasets_df["exclude_from_output"] != 1
611
- ]
612
- return list(
613
- lookup_gee_datasets_df["name"][
614
- (lookup_gee_datasets_df["use_for_risk_timber"] == 1)
615
- & (lookup_gee_datasets_df["theme_timber"] == "primary")
616
- ]
617
- )
618
-
619
-
620
- def get_cols_ind_06_nat_reg_2020(lookup_gee_datasets_df):
621
- """
622
- Generate a list of dataset names for naturally_reg_2020 forests in 2020
623
-
624
- Args:
625
- lookup_gee_datasets_df (pd.DataFrame): DataFrame containing dataset information.
626
-
627
- Returns:
628
- list: List of dataset names set to be used in the risk calculations for the degradation - naturally_reg_2020 in 2020, excluding those marked for exclusion.
629
- """
630
- lookup_gee_datasets_df = lookup_gee_datasets_df[
631
- lookup_gee_datasets_df["exclude_from_output"] != 1
632
- ]
633
- return list(
634
- lookup_gee_datasets_df["name"][
635
- (lookup_gee_datasets_df["use_for_risk_timber"] == 1)
636
- & (lookup_gee_datasets_df["theme_timber"] == "naturally_reg_2020")
637
- ]
638
- )
639
-
640
-
641
- def get_cols_ind_07_planted_2020(lookup_gee_datasets_df):
642
- """
643
- Generate a list of dataset names for planted and plantation forests in 2020
644
-
645
- Args:
646
- lookup_gee_datasets_df (pd.DataFrame): DataFrame containing dataset information.
647
-
648
- Returns:
649
- list: List of dataset names set to be used in the risk calculations for the degradation - planted and plantation forests in 2020, excluding those marked for exclusion.
650
- """
651
- lookup_gee_datasets_df = lookup_gee_datasets_df[
652
- lookup_gee_datasets_df["exclude_from_output"] != 1
653
- ]
654
- return list(
655
- lookup_gee_datasets_df["name"][
656
- (lookup_gee_datasets_df["use_for_risk_timber"] == 1)
657
- & (lookup_gee_datasets_df["theme_timber"] == "planted_plantation_2020")
658
- ]
659
- )
660
-
661
-
662
- def get_cols_ind_08_planted_after_2020(lookup_gee_datasets_df):
663
- """
664
- Generate a list of dataset names for planted and plantation forests post 2020
665
-
666
- Args:
667
- lookup_gee_datasets_df (pd.DataFrame): DataFrame containing dataset information.
668
-
669
- Returns:
670
- list: List of dataset names set to be used in the risk calculations for the degradation - planted and plantation forests post 2020, excluding those marked for exclusion.
671
- """
672
- lookup_gee_datasets_df = lookup_gee_datasets_df[
673
- lookup_gee_datasets_df["exclude_from_output"] != 1
674
- ]
675
- return list(
676
- lookup_gee_datasets_df["name"][
677
- (lookup_gee_datasets_df["use_for_risk_timber"] == 1)
678
- & (
679
- lookup_gee_datasets_df["theme_timber"]
680
- == "planted_plantation_after_2020"
681
- )
682
- ]
683
- )
684
-
685
-
686
- def get_cols_ind_09_treecover_after_2020(lookup_gee_datasets_df):
687
- """
688
- Generate a list of dataset names for treecover post 2020
689
-
690
- Args:
691
- lookup_gee_datasets_df (pd.DataFrame): DataFrame containing dataset information.
692
-
693
- Returns:
694
- list: List of dataset names set to be used in the risk calculations for the degradation - treecover post 2020, excluding those marked for exclusion.
695
- """
696
- lookup_gee_datasets_df = lookup_gee_datasets_df[
697
- lookup_gee_datasets_df["exclude_from_output"] != 1
698
- ]
699
- return list(
700
- lookup_gee_datasets_df["name"][
701
- (lookup_gee_datasets_df["use_for_risk_timber"] == 1)
702
- & (lookup_gee_datasets_df["theme_timber"] == "treecover_after_2020")
703
- ]
704
- )
705
-
706
-
707
- def get_cols_ind_10_agri_after_2020(lookup_gee_datasets_df):
708
- """
709
- Generate a list of dataset names for croplands post 2020
710
-
711
- Args:
712
- lookup_gee_datasets_df (pd.DataFrame): DataFrame containing dataset information.
713
-
714
- Returns:
715
- list: List of dataset names set to be used in the risk calculations for the degradation - croplands post 2020, excluding those marked for exclusion.
716
- """
717
- lookup_gee_datasets_df = lookup_gee_datasets_df[
718
- lookup_gee_datasets_df["exclude_from_output"] != 1
719
- ]
720
- return list(
721
- lookup_gee_datasets_df["name"][
722
- (lookup_gee_datasets_df["use_for_risk_timber"] == 1)
723
- & (lookup_gee_datasets_df["theme_timber"] == "agri_after_2020")
724
- ]
725
- )
726
-
727
-
728
- def get_cols_ind_11_logging_before_2020(lookup_gee_datasets_df):
729
- """
730
- Generate a list of dataset names for logging concessions (2020 if available)
731
-
732
- Args:
733
- lookup_gee_datasets_df (pd.DataFrame): DataFrame containing dataset information.
734
-
735
- Returns:
736
- list: List of dataset names set to be used in the risk calculations for the degradation - logging concessions, excluding those marked for exclusion.
737
- """
738
- lookup_gee_datasets_df = lookup_gee_datasets_df[
739
- lookup_gee_datasets_df["exclude_from_output"] != 1
740
- ]
741
- return list(
742
- lookup_gee_datasets_df["name"][
743
- (lookup_gee_datasets_df["use_for_risk_timber"] == 1)
744
- & (lookup_gee_datasets_df["theme_timber"] == "logging_concession")
745
- ]
746
- )
747
-
748
-
749
- def clamp(
750
- value: float | pd.Series, min_val: float, max_val: float
751
- ) -> float | pd.Series:
752
- """
753
- Clamp a value or a Pandas Series within a specified range.
754
-
755
- Args:
756
- value (float | pd.Series): The value or series to be clamped.
757
- min_val (float): The minimum value of the range.
758
- max_val (float): The maximum value of the range.
759
-
760
- Returns:
761
- float | pd.Series: The clamped value or series within the range.
762
- """
763
- if isinstance(value, pd.Series):
764
- return value.clip(lower=min_val, upper=max_val)
765
- else:
766
- return max(min_val, min(value, max_val))
767
-
768
-
769
- def check_range(value: float) -> None:
770
- if not (0 <= value <= 100):
771
- raise ValueError("Value must be between 0 and 100.")
1
+ import pandas as pd
2
+
3
+ from .pd_schemas import data_lookup_type
4
+
5
+
6
+ from openforis_whisp.parameters.config_runtime import (
7
+ geometry_area_column,
8
+ DEFAULT_GEE_DATASETS_LOOKUP_TABLE_PATH,
9
+ stats_unit_type_column, # Add this import
10
+ )
11
+
12
+ from openforis_whisp.reformat import filter_lookup_by_country_codes
13
+
14
+ # could embed this in each function below that uses lookup_gee_datasets_df.
15
+ lookup_gee_datasets_df: data_lookup_type = pd.read_csv(
16
+ DEFAULT_GEE_DATASETS_LOOKUP_TABLE_PATH
17
+ )
18
+
19
+
20
+ # requires lookup_gee_datasets_df
21
+
22
+
23
+ # Add function to detect unit type from dataframe
24
+ def detect_unit_type(df, explicit_unit_type=None):
25
+ """
26
+ Determine the unit type from the dataframe or use the override value.
27
+
28
+ Args:
29
+ df (DataFrame): Input DataFrame.
30
+ explicit_unit_type (str, optional): Override unit type ('ha' or 'percent').
31
+
32
+ Returns:
33
+ str: The unit type to use for calculations.
34
+
35
+ Raises:
36
+ ValueError: If the unit type can't be determined and no override is provided,
37
+ or if there are mixed unit types in the dataframe.
38
+ """
39
+ # If override is provided, use it
40
+ if explicit_unit_type is not None:
41
+ if explicit_unit_type not in ["ha", "percent"]:
42
+ raise ValueError(
43
+ f"Invalid unit type: {explicit_unit_type}. Must be 'ha' or 'percent'."
44
+ )
45
+ return explicit_unit_type
46
+
47
+ # Check if unit type column exists in the dataframe
48
+ if stats_unit_type_column not in df.columns:
49
+ raise ValueError(
50
+ f"Column '{stats_unit_type_column}' not found in dataframe. "
51
+ "Please provide 'explicit_unit_type' parameter to specify the unit type."
52
+ )
53
+
54
+ # Get unique values from the column
55
+ unit_types = df[stats_unit_type_column].unique()
56
+
57
+ # Check for mixed unit types
58
+ if len(unit_types) > 1:
59
+ raise ValueError(
60
+ f"Mixed unit types in dataframe: {unit_types}. All rows must use the same unit type."
61
+ )
62
+
63
+ # Get the single unit type
64
+ unit_type = unit_types[0]
65
+
66
+ # Validate that the unit type is recognized
67
+ if unit_type not in ["ha", "percent"]:
68
+ raise ValueError(
69
+ f"Unrecognized unit type: {unit_type}. Must be 'ha' or 'percent'."
70
+ )
71
+
72
+ return unit_type
73
+
74
+
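For illustration, a minimal sketch of how detect_unit_type behaves. It assumes openforis-whisp is installed; the unit column name is taken from the same config constant the function uses, and the 'ha'/'percent' values below are invented:

import pandas as pd
from openforis_whisp.parameters.config_runtime import stats_unit_type_column
from openforis_whisp.risk import detect_unit_type

# Toy stats table where every row reports values in hectares.
toy = pd.DataFrame({stats_unit_type_column: ["ha", "ha", "ha"]})
print(detect_unit_type(toy))  # -> "ha"

# An explicit override wins and skips the column check entirely.
print(detect_unit_type(toy, explicit_unit_type="percent"))  # -> "percent"

# Mixed units raise a ValueError rather than guessing.
mixed = pd.DataFrame({stats_unit_type_column: ["ha", "percent"]})
# detect_unit_type(mixed)  # ValueError: Mixed unit types in dataframe ...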
75
+ # Update whisp_risk to accept and pass the unit_type parameter
76
+ def whisp_risk(
77
+ df: data_lookup_type, # CHECK THIS
78
+ ind_1_pcent_threshold: float = 10, # default values (draft decision tree and parameters)
79
+ ind_2_pcent_threshold: float = 10, # default values (draft decision tree and parameters)
80
+ ind_3_pcent_threshold: float = 10, # default values (draft decision tree and parameters)
81
+ ind_4_pcent_threshold: float = 10, # default values (draft decision tree and parameters)
82
+ ind_5_pcent_threshold: float = 10, # default values (draft decision tree and parameters)
83
+ ind_6_pcent_threshold: float = 10, # default values (draft decision tree and parameters)
84
+ ind_7_pcent_threshold: float = 10, # default values (draft decision tree and parameters)
85
+ ind_8_pcent_threshold: float = 10, # default values (draft decision tree and parameters)
86
+ ind_9_pcent_threshold: float = 10, # default values (draft decision tree and parameters)
87
+ ind_10_pcent_threshold: float = 10, # default values (draft decision tree and parameters)
88
+ ind_11_pcent_threshold: float = 10, # default values (draft decision tree and parameters)
89
+ ind_1_input_columns: pd.Series = None, # see lookup_gee_datasets for details
90
+ ind_2_input_columns: pd.Series = None, # see lookup_gee_datasets for details
91
+ ind_3_input_columns: pd.Series = None, # see lookup_gee_datasets for details
92
+ ind_4_input_columns: pd.Series = None, # see lookup_gee_datasets for details
93
+ ind_5_input_columns: pd.Series = None, # see lookup_gee_datasets for details
94
+ ind_6_input_columns: pd.Series = None, # see lookup_gee_datasets for details
95
+ ind_7_input_columns: pd.Series = None, # see lookup_gee_datasets for details
96
+ ind_8_input_columns: pd.Series = None, # see lookup_gee_datasets for details
97
+ ind_9_input_columns: pd.Series = None, # see lookup_gee_datasets for details
98
+ ind_10_input_columns: pd.Series = None, # see lookup_gee_datasets for details
99
+ ind_11_input_columns: pd.Series = None, # see lookup_gee_datasets for details
100
+ ind_1_name: str = "Ind_01_treecover",
101
+ ind_2_name: str = "Ind_02_commodities",
102
+ ind_3_name: str = "Ind_03_disturbance_before_2020",
103
+ ind_4_name: str = "Ind_04_disturbance_after_2020",
104
+ ind_5_name: str = "Ind_05_primary_2020",
105
+ ind_6_name: str = "Ind_06_nat_reg_forest_2020",
106
+ ind_7_name: str = "Ind_07_planted_plantations_2020",
107
+ ind_8_name: str = "Ind_08_planted_plantations_after_2020",
108
+ ind_9_name: str = "Ind_09_treecover_after_2020",
109
+ ind_10_name: str = "Ind_10_agri_after_2020",
110
+ ind_11_name: str = "Ind_11_logging_concession_before_2020",
111
+ low_name: str = "no",
112
+ high_name: str = "yes",
113
+ explicit_unit_type: str = None,
114
+ national_codes: list[str] = None, # List of ISO2 country codes to filter by
115
+ custom_bands_info: dict = None, # New parameter for custom band risk info
116
+ ) -> data_lookup_type:
117
+ """
118
+ Adds EUDR (EU Deforestation Regulation) risk columns ('risk_pcrop', 'risk_acrop', 'risk_timber') to the DataFrame based on indicator values.
119
+
120
+ Args:
121
+ df (DataFrame): Input DataFrame.
122
+ ind_1_pcent_threshold (int, optional): Percentage threshold for the first indicator. Defaults to 10.
123
+ ind_2_pcent_threshold (int, optional): Percentage threshold for the second indicator. Defaults to 10.
124
+ ind_3_pcent_threshold (int, optional): Percentage threshold for the third indicator. Defaults to 10.
125
+ ind_4_pcent_threshold (int, optional): Percentage threshold for the fourth indicator. Defaults to 10.
126
+ ind_1_input_columns (list, optional): List of input columns for the first indicator. Defaults to columns for the treecover theme.
127
+ ind_2_input_columns (list, optional): List of input columns for the second indicator. Defaults to columns for the commodities theme.
128
+ ind_3_input_columns (list, optional): List of input columns for the third indicator. Defaults to columns for disturbance before 2020.
129
+ ind_4_input_columns (list, optional): List of input columns for the fourth indicator. Defaults to columns for disturbance after 2020.
130
+ ind_1_name (str, optional): Name of the first indicator column. Defaults to "Ind_01_treecover".
131
+ ind_2_name (str, optional): Name of the second indicator column. Defaults to "Ind_02_commodities".
132
+ ind_3_name (str, optional): Name of the third indicator column. Defaults to "Ind_03_disturbance_before_2020".
133
+ ind_4_name (str, optional): Name of the fourth indicator column. Defaults to "Ind_04_disturbance_after_2020".
134
+ low_name (str, optional): Value shown in table if less than or equal to the threshold. Defaults to "no".
135
+ high_name (str, optional): Value shown in table if more than the threshold. Defaults to "yes".
136
+ explicit_unit_type (str, optional): Override the autodetected unit type ('ha' or 'percent').
137
+ If not provided, will detect from dataframe 'unit' column.
138
+ custom_bands_info (dict, optional): Custom band risk information. Dict format:
139
+ {
140
+ 'band_name': {
141
+ 'theme': 'treecover', # or 'commodities', 'disturbance_before', 'disturbance_after'
142
+ 'theme_timber': 'primary', # or 'naturally_reg_2020', 'planted_plantation_2020', etc.
143
+ 'use_for_risk': 1, # 0 or 1
144
+ 'use_for_risk_timber': 1, # 0 or 1
145
+ }
146
+ }
147
+ If None, custom bands won't be included in risk calculations.
148
+
149
+ Returns:
150
+ data_lookup_type: DataFrame with added risk columns.
151
+ """
152
+ # Determine the unit type
153
+ unit_type = detect_unit_type(df, explicit_unit_type)
154
+ print(f"Using unit type: {unit_type}")
155
+
156
+ lookup_df_copy = lookup_gee_datasets_df.copy()
157
+
158
+ # Add custom bands to lookup if provided
159
+ if custom_bands_info:
160
+ lookup_df_copy = add_custom_bands_info_to_lookup(
161
+ lookup_df_copy, custom_bands_info, df.columns
162
+ )
163
+ print(f"Including custom bands: {list(custom_bands_info.keys())}")
164
+ # print(f"appended custom bands info to lookup table")
165
+ if national_codes:
166
+ print(f"Filtering by national codes: {national_codes}")
167
+ # Filter by national codes
168
+ filtered_lookup_gee_datasets_df = filter_lookup_by_country_codes(
169
+ lookup_df=lookup_df_copy,
170
+ filter_col="ISO2_code",
171
+ national_codes=national_codes,
172
+ )
173
+
174
+ # Get indicator columns (now includes custom bands)
175
+ if ind_1_input_columns is None:
176
+ ind_1_input_columns = get_cols_ind_01_treecover(filtered_lookup_gee_datasets_df)
177
+ if ind_2_input_columns is None:
178
+ ind_2_input_columns = get_cols_ind_02_commodities(
179
+ filtered_lookup_gee_datasets_df
180
+ )
181
+ if ind_3_input_columns is None:
182
+ ind_3_input_columns = get_cols_ind_03_dist_before_2020(
183
+ filtered_lookup_gee_datasets_df
184
+ )
185
+ if ind_4_input_columns is None:
186
+ ind_4_input_columns = get_cols_ind_04_dist_after_2020(
187
+ filtered_lookup_gee_datasets_df
188
+ )
189
+ if ind_5_input_columns is None:
190
+ ind_5_input_columns = get_cols_ind_05_primary_2020(
191
+ filtered_lookup_gee_datasets_df
192
+ )
193
+ if ind_6_input_columns is None:
194
+ ind_6_input_columns = get_cols_ind_06_nat_reg_2020(
195
+ filtered_lookup_gee_datasets_df
196
+ )
197
+ if ind_7_input_columns is None:
198
+ ind_7_input_columns = get_cols_ind_07_planted_2020(
199
+ filtered_lookup_gee_datasets_df
200
+ )
201
+ if ind_8_input_columns is None:
202
+ ind_8_input_columns = get_cols_ind_08_planted_after_2020(
203
+ filtered_lookup_gee_datasets_df
204
+ )
205
+ if ind_9_input_columns is None:
206
+ ind_9_input_columns = get_cols_ind_09_treecover_after_2020(
207
+ filtered_lookup_gee_datasets_df
208
+ )
209
+ if ind_10_input_columns is None:
210
+ ind_10_input_columns = get_cols_ind_10_agri_after_2020(
211
+ filtered_lookup_gee_datasets_df
212
+ )
213
+ if ind_11_input_columns is None:
214
+ ind_11_input_columns = get_cols_ind_11_logging_before_2020(
215
+ filtered_lookup_gee_datasets_df
216
+ )
217
+
218
+ # Check range of values
219
+ check_range(ind_1_pcent_threshold)
220
+ check_range(ind_2_pcent_threshold)
221
+ check_range(ind_3_pcent_threshold)
222
+ check_range(ind_4_pcent_threshold)
223
+ check_range(ind_5_pcent_threshold)
224
+ check_range(ind_6_pcent_threshold)
225
+ check_range(ind_7_pcent_threshold)
226
+ check_range(ind_8_pcent_threshold)
227
+ check_range(ind_9_pcent_threshold)
228
+ check_range(ind_10_pcent_threshold)
229
+ check_range(ind_11_pcent_threshold)
230
+
231
+ input_cols = [
232
+ ind_1_input_columns,
233
+ ind_2_input_columns,
234
+ ind_3_input_columns,
235
+ ind_4_input_columns,
236
+ ind_5_input_columns,
237
+ ind_6_input_columns,
238
+ ind_7_input_columns,
239
+ ind_8_input_columns,
240
+ ind_9_input_columns,
241
+ ind_10_input_columns,
242
+ ind_11_input_columns,
243
+ ]
244
+ thresholds = [
245
+ ind_1_pcent_threshold,
246
+ ind_2_pcent_threshold,
247
+ ind_3_pcent_threshold,
248
+ ind_4_pcent_threshold,
249
+ ind_5_pcent_threshold,
250
+ ind_6_pcent_threshold,
251
+ ind_7_pcent_threshold,
252
+ ind_8_pcent_threshold,
253
+ ind_9_pcent_threshold,
254
+ ind_10_pcent_threshold,
255
+ ind_11_pcent_threshold,
256
+ ]
257
+ names = [
258
+ ind_1_name,
259
+ ind_2_name,
260
+ ind_3_name,
261
+ ind_4_name,
262
+ ind_5_name,
263
+ ind_6_name,
264
+ ind_7_name,
265
+ ind_8_name,
266
+ ind_9_name,
267
+ ind_10_name,
268
+ ind_11_name,
269
+ ]
270
+ [check_range(threshold) for threshold in thresholds]
271
+
272
+ df_w_indicators = add_indicators(
273
+ df,
274
+ input_cols,
275
+ thresholds,
276
+ names,
277
+ low_name,
278
+ high_name,
279
+ unit_type, # Pass the unit type
280
+ )
281
+
282
+ df_w_indicators_and_risk_pcrop = add_eudr_risk_pcrop_col(
283
+ df=df_w_indicators,
284
+ ind_1_name=ind_1_name,
285
+ ind_2_name=ind_2_name,
286
+ ind_3_name=ind_3_name,
287
+ ind_4_name=ind_4_name,
288
+ )
289
+
290
+ df_w_indicators_and_risk_acrop = add_eudr_risk_acrop_col(
291
+ df=df_w_indicators,
292
+ ind_1_name=ind_1_name,
293
+ ind_2_name=ind_2_name,
294
+ ind_4_name=ind_4_name,
295
+ )
296
+
297
+ df_w_indicators_and_risk_timber = add_eudr_risk_timber_col(
298
+ df=df_w_indicators,
299
+ ind_2_name=ind_2_name,
300
+ ind_5_name=ind_5_name,
301
+ ind_6_name=ind_6_name,
302
+ ind_7_name=ind_7_name,
303
+ ind_8_name=ind_8_name,
304
+ ind_9_name=ind_9_name,
305
+ ind_10_name=ind_10_name,
306
+ ind_11_name=ind_11_name,
307
+ )
308
+
309
+ return df_w_indicators_and_risk_timber
310
+
311
+
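A hedged sketch of a whisp_risk call. Here stats_df stands in for a Whisp statistics table produced upstream (one row per plot, one column per dataset in the GEE lookup, plus the area and unit columns); the custom band name 'my_national_treecover' and its settings are purely illustrative, not part of the package:

from openforis_whisp.risk import whisp_risk

risk_df = whisp_risk(
    df=stats_df,                     # assumed Whisp stats DataFrame (percent units)
    ind_4_pcent_threshold=5,         # thresholds are percentages in the 0-100 range
    explicit_unit_type="percent",    # skip auto-detection of the unit column
    national_codes=["CI", "GH"],     # keep country-specific layers for these ISO2 codes only
    custom_bands_info={              # optional extra band, format as documented above
        "my_national_treecover": {
            "theme": "treecover",
            "theme_timber": "naturally_reg_2020",
            "use_for_risk": 1,
            "use_for_risk_timber": 1,
        }
    },
)
# risk_df now carries the Ind_01..Ind_11 columns plus risk_pcrop, risk_acrop and risk_timber.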
312
+ def add_eudr_risk_pcrop_col(
313
+ df: data_lookup_type,
314
+ ind_1_name: str,
315
+ ind_2_name: str,
316
+ ind_3_name: str,
317
+ ind_4_name: str,
318
+ ) -> data_lookup_type:
319
+ """
320
+ Adds the EUDR (EU Deforestation Regulation) 'risk_pcrop' column to the DataFrame based on indicator values.
321
+
322
+ Args:
323
+ df (DataFrame): Input DataFrame.
324
+ ind_1_name (str, optional): Name of first indicator column. Defaults to "Ind_01_treecover".
325
+ ind_2_name (str, optional): Name of second indicator column. Defaults to "Ind_02_commodities".
326
+ ind_3_name (str, optional): Name of third indicator column. Defaults to "Ind_03_disturbance_before_2020".
327
+ ind_4_name (str, optional): Name of fourth indicator column. Defaults to "Ind_04_disturbance_after_2020".
328
+
329
+ Returns:
330
+ DataFrame: DataFrame with added 'risk_pcrop' column.
331
+ """
332
+
333
+ for index, row in df.iterrows():
334
+ # If any of the first three indicators suggest low risk, set EUDR_risk to "low"
335
+ if (
336
+ row[ind_1_name] == "no"
337
+ or row[ind_2_name] == "yes"
338
+ or row[ind_3_name] == "yes"
339
+ ):
340
+ df.at[index, "risk_pcrop"] = "low"
341
+ # If none of the first three indicators suggest low risk and Indicator 4 suggests no risk, set EUDR_risk to "more_info_needed"
342
+ elif row[ind_4_name] == "no":
343
+ df.at[index, "risk_pcrop"] = "more_info_needed"
344
+ # If none of the above conditions are met, set EUDR_risk to "high"
345
+ else:
346
+ df.at[index, "risk_pcrop"] = "high"
347
+
348
+ return df
349
+
350
+
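To make the perennial-crop decision tree above concrete, a self-contained toy example (indicator values invented; column names are the whisp_risk defaults):

import pandas as pd
from openforis_whisp.risk import add_eudr_risk_pcrop_col

toy = pd.DataFrame(
    {
        "Ind_01_treecover": ["no", "yes", "yes"],
        "Ind_02_commodities": ["no", "no", "no"],
        "Ind_03_disturbance_before_2020": ["no", "no", "no"],
        "Ind_04_disturbance_after_2020": ["no", "no", "yes"],
    }
)
out = add_eudr_risk_pcrop_col(
    toy,
    ind_1_name="Ind_01_treecover",
    ind_2_name="Ind_02_commodities",
    ind_3_name="Ind_03_disturbance_before_2020",
    ind_4_name="Ind_04_disturbance_after_2020",
)
# No tree cover -> low; tree cover but no post-2020 disturbance -> more info; both -> high.
print(out["risk_pcrop"].tolist())  # ['low', 'more_info_needed', 'high']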
351
+ def add_eudr_risk_acrop_col(
352
+ df: data_lookup_type,
353
+ ind_1_name: str,
354
+ ind_2_name: str,
355
+ ind_4_name: str,
356
+ ) -> data_lookup_type:
357
+ """
358
+ Adds the EUDR (EU Deforestation Regulation) 'risk_acrop' column to the DataFrame based on indicator values.
359
+
360
+ Args:
361
+ df (DataFrame): Input DataFrame.
362
+ ind_1_name (str, optional): Name of first indicator column. Defaults to "Ind_01_treecover".
363
+ ind_2_name (str, optional): Name of second indicator column. Defaults to "Ind_02_commodities".
364
+ ind_4_name (str, optional): Name of fourth indicator column. Defaults to "Ind_04_disturbance_after_2020".
365
+
366
+ Returns:
367
+ DataFrame: DataFrame with added 'risk_acrop' column.
368
+ """
369
+
370
+ # soy risk
371
+ for index, row in df.iterrows():
372
+ # If there is no tree cover in 2020, set EUDR_risk_soy to "low"
373
+ if row[ind_1_name] == "no" or row[ind_2_name] == "yes":
374
+ df.at[index, "risk_acrop"] = "low"
375
+ # If there is tree cover in 2020 and disturbances post 2020, set EUDR_risk_soy to "high"
376
+ elif row[ind_1_name] == "yes" and row[ind_4_name] == "yes":
377
+ df.at[index, "risk_acrop"] = "high"
378
+ # If tree cover and no disturbances post 2020, set EUDR_risk to "more_info_needed"
379
+ else:
380
+ df.at[index, "risk_acrop"] = "more_info_needed"
381
+
382
+ return df
383
+
384
+
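The annual-crop variant ignores pre-2020 disturbance (indicator 3), so the two risk columns can disagree on the same plot. A single invented row shows the difference:

import pandas as pd
from openforis_whisp.risk import add_eudr_risk_acrop_col, add_eudr_risk_pcrop_col

# One plot: tree cover in 2020, disturbance both before and after 2020, no commodities.
toy = pd.DataFrame(
    {
        "Ind_01_treecover": ["yes"],
        "Ind_02_commodities": ["no"],
        "Ind_03_disturbance_before_2020": ["yes"],
        "Ind_04_disturbance_after_2020": ["yes"],
    }
)
toy = add_eudr_risk_pcrop_col(
    toy, "Ind_01_treecover", "Ind_02_commodities",
    "Ind_03_disturbance_before_2020", "Ind_04_disturbance_after_2020",
)
toy = add_eudr_risk_acrop_col(
    toy, "Ind_01_treecover", "Ind_02_commodities", "Ind_04_disturbance_after_2020"
)
print(toy[["risk_pcrop", "risk_acrop"]].iloc[0].tolist())  # ['low', 'high']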
385
+ def add_eudr_risk_timber_col(
386
+ df: data_lookup_type,
387
+ ind_2_name: str,
388
+ ind_5_name: str,
389
+ ind_6_name: str,
390
+ ind_7_name: str,
391
+ ind_8_name: str,
392
+ ind_9_name: str,
393
+ ind_10_name: str,
394
+ ind_11_name: str,
395
+ ) -> data_lookup_type:
396
+ """
397
+ Adds the EUDR (EU Deforestation Regulation) 'risk_timber' column to the DataFrame based on indicator values.
398
+
399
+ Args:
400
+ df (DataFrame): Input DataFrame.
401
+ ind_2_name (str, optional): Name of second indicator column. Defaults to "Ind_02_commodities".
402
+ ind_5_name (str, optional): Name of fifth indicator column. Defaults to "Ind_05_primary_2020".
403
+ ind_6_name (str, optional): Name of sixth indicator column. Defaults to "Ind_06_nat_reg_forest_2020".
404
+ ind_7_name (str, optional): Name of seventh indicator column. Defaults to "Ind_07_planted_plantations_2020".
405
+ ind_8_name (str, optional): Name of eighth indicator column. Defaults to "Ind_08_planted_plantations_after_2020".
406
+ ind_9_name (str, optional): Name of ninth indicator column. Defaults to "Ind_09_treecover_after_2020".
407
+ ind_10_name (str, optional): Name of tenth indicator column. Defaults to "Ind_10_agri_after_2020".
408
+ ind_11_name (str, optional): Name of eleventh indicator column. Defaults to "Ind_11_logging_concession_before_2020".
409
+
410
+ Returns:
411
+ DataFrame: DataFrame with added 'risk_timber' column.
412
+ """
413
+
414
+ for index, row in df.iterrows():
415
+ # If there is a commodity in 2020 (ind_2_name)
416
+ # OR if there is planted-plantation in 2020 (ind_7_name) AND no agriculture in 2023 (ind_10_name), set EUDR_risk_timber to "low"
417
+ if row[ind_2_name] == "yes" or (
418
+ row[ind_7_name] == "yes" and row[ind_10_name] == "no"
419
+ ):
420
+ df.at[index, "risk_timber"] = "low"
421
+ # If there is a natural primary forest (ind_5_name) or naturally regenerating forest (ind_6_name) or planted forest (ind_7_name) in 2020 AND agriculture after 2020 (ind_10_name), set risk_timber to "high"
422
+ elif (
423
+ row[ind_5_name] == "yes"
424
+ or row[ind_6_name] == "yes"
425
+ or row[ind_7_name] == "yes"
426
+ ) and row[ind_10_name] == "yes":
427
+ df.at[index, "risk_timber"] = "high"
428
+ # If there is a natural primary forest (ind_5_name) or naturally regenerating forest (ind_6_name) AND planted after 2020 (ind_8_name), set risk_timber to "high"
429
+ elif (row[ind_5_name] == "yes" or row[ind_6_name] == "yes") and row[
430
+ ind_8_name
431
+ ] == "yes":
432
+ df.at[index, "risk_timber"] = "high"
433
+ # No data yet on OWL conversion
434
+ # If primary or naturally regenerating or planted forest in 2020 and OWL in 2023, set EUDR_risk to high
435
+ # elif (row[ind_5_name] == "yes" or row[ind_6_name] == "yes" or row[ind_7_name] == "yes") and row[ind_10_name] == "yes":
436
+ # df.at[index, 'EUDR_risk_timber'] = "high"
437
+
438
+ # If there is a natural primary forest (ind_5_name) OR naturally regenerating in 2020 (ind_6_name) AND an information on management practice any time (ind_11_name) OR tree cover or regrowth post 2020 (ind_9_name), set EUDR_risk_timber to "low"
439
+ elif (row[ind_5_name] == "yes" or row[ind_6_name] == "yes") and (
440
+ row[ind_9_name] == "yes" or row[ind_11_name] == "yes"
441
+ ):
442
+ df.at[index, "risk_timber"] = "low"
443
+ # If primary (ind_5_name) OR naturally regenerating in 2020 (ind_6_name) and no other info, set EUDR_risk to "more_info_needed"
444
+ elif row[ind_5_name] == "yes" or row[ind_6_name] == "yes":
445
+ df.at[index, "risk_timber"] = "more_info_needed"
446
+ # If none of the above conditions are met, set EUDR_risk to "low"
447
+ else:
448
+ df.at[index, "risk_timber"] = "low"
449
+
450
+ return df
451
+
452
+
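A small invented example of the timber decision tree; all eight indicator columns must exist, and the values are illustrative only:

import pandas as pd
from openforis_whisp.risk import add_eudr_risk_timber_col

toy = pd.DataFrame(
    {
        "Ind_02_commodities": ["no", "no", "no"],
        "Ind_05_primary_2020": ["yes", "yes", "no"],
        "Ind_06_nat_reg_forest_2020": ["no", "no", "no"],
        "Ind_07_planted_plantations_2020": ["no", "no", "no"],
        "Ind_08_planted_plantations_after_2020": ["no", "no", "no"],
        "Ind_09_treecover_after_2020": ["no", "yes", "no"],
        "Ind_10_agri_after_2020": ["yes", "no", "no"],
        "Ind_11_logging_concession_before_2020": ["no", "no", "no"],
    }
)
out = add_eudr_risk_timber_col(
    toy,
    ind_2_name="Ind_02_commodities",
    ind_5_name="Ind_05_primary_2020",
    ind_6_name="Ind_06_nat_reg_forest_2020",
    ind_7_name="Ind_07_planted_plantations_2020",
    ind_8_name="Ind_08_planted_plantations_after_2020",
    ind_9_name="Ind_09_treecover_after_2020",
    ind_10_name="Ind_10_agri_after_2020",
    ind_11_name="Ind_11_logging_concession_before_2020",
)
# Primary forest converted to agriculture -> high; primary forest with post-2020
# tree cover -> low; no forest signal at all -> low.
print(out["risk_timber"].tolist())  # ['high', 'low', 'low']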
453
+ def add_indicators(
454
+ df: data_lookup_type,
455
+ input_cols: list[str],
456
+ thresholds: list[float],
457
+ names: list[str],
458
+ low_name: str = "no",
459
+ high_name: str = "yes",
460
+ unit_type: str = None,
461
+ ) -> data_lookup_type:
462
+ for input_col, threshold, name in zip(input_cols, thresholds, names):
463
+ df = add_indicator_column(
464
+ df=df,
465
+ input_columns=input_col,
466
+ threshold=threshold,
467
+ new_column_name=name,
468
+ low_name=low_name,
469
+ high_name=high_name,
470
+ sum_comparison=False,
471
+ unit_type=unit_type, # Pass the unit type
472
+ )
473
+ return df
474
+
475
+
476
+ # Update add_indicator_column to use the unit_type parameter
477
+ def add_indicator_column(
478
+ df: data_lookup_type,
479
+ input_columns: list[str],
480
+ threshold: float,
481
+ new_column_name: str,
482
+ low_name: str = "no",
483
+ high_name: str = "yes",
484
+ sum_comparison: bool = False,
485
+ unit_type: str = None, # unit_type parameter
486
+ ) -> data_lookup_type:
487
+ """
488
+ Add a new column to the DataFrame based on the specified columns, threshold, and comparison sign.
489
+
490
+ Parameters:
491
+ df (data_lookup_type): The pandas DataFrame to which the column will be added.
492
+ input_columns (list): List of column names to check for threshold.
493
+ threshold (float): The threshold value to compare against.
494
+ new_column_name (str): The name of the new column to be added.
495
+ The '>' sign is used for comparisons.
496
+ When 'sum_comparison' is True, the threshold is compared to the sum of all columns listed in 'input_columns'; when False, each column in the list is compared to the threshold individually.
497
+ low_name (str): The name for the value when below or equal to threshold (default is 'no').
498
+ high_name (str): The name for the value when above threshold (default is 'yes').
499
+ sum_comparison (bool): If True, sum all values in input_columns and compare to threshold (default is False).
500
+ unit_type (str): Whether values are in "ha" or "percent".
501
+
502
+ Returns:
503
+ data_lookup_type: The DataFrame with the new column added.
504
+ """
505
+ # Create a new column and initialize with low_name
506
+ new_column = pd.Series(low_name, index=df.index, name=new_column_name)
507
+
508
+ # Default behavior: use '>' for single column comparison
509
+ if sum_comparison:
510
+ # Sum all values in specified columns and compare to threshold
511
+ sum_values = df[input_columns].sum(axis=1)
512
+ new_column[sum_values > threshold] = high_name
513
+ else:
514
+ # Check if any values in specified columns are above the threshold and update the new column accordingly
515
+ for col in input_columns:
516
+ # So that the threshold is always in percent: if outputs are in ha, the code converts values to percent (by dividing by the geometry_area_column).
517
+ # Clamping is needed due to differences in decimal places (meaning input values may go just over 100)
518
+ if unit_type == "ha":
519
+ df[geometry_area_column] = pd.to_numeric(
520
+ df[geometry_area_column], errors="coerce"
521
+ )
522
+ val_to_check = clamp(
523
+ ((df[col] / df[geometry_area_column]) * 100), 0, 100
524
+ )
525
+ else:
526
+ val_to_check = df[col]
527
+ new_column[val_to_check > threshold] = high_name
528
+
529
+ # Concatenate the new column to the DataFrame
530
+ df = pd.concat([df, new_column], axis=1)
531
+ return df
532
+
533
+
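A minimal, self-contained illustration of add_indicator_column in 'percent' mode (column names and values are invented). In 'ha' mode the same '>' test is applied after each value is divided by the configured geometry-area column and the result is clamped to 0-100:

import pandas as pd
from openforis_whisp.risk import add_indicator_column

toy = pd.DataFrame({"dataset_a_pct": [0.0, 5.0, 42.5], "dataset_b_pct": [12.0, 0.0, 0.0]})
out = add_indicator_column(
    toy,
    input_columns=["dataset_a_pct", "dataset_b_pct"],
    threshold=10,                  # a row is flagged if ANY input column exceeds this percentage
    new_column_name="Ind_example",
    unit_type="percent",
)
print(out["Ind_example"].tolist())  # ['yes', 'no', 'yes']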
534
+ def get_cols_ind_01_treecover(lookup_gee_datasets_df):
535
+ """
536
+ Generate a list of dataset names for the treecover theme, excluding those marked for exclusion.
537
+
538
+ Args:
539
+ lookup_gee_datasets_df (pd.DataFrame): DataFrame containing dataset information.
540
+
541
+ Returns:
542
+ list: List of dataset names set to be used in the risk calculations for the treecover theme, excluding those marked for exclusion.
543
+ """
544
+ lookup_gee_datasets_df = lookup_gee_datasets_df[
545
+ lookup_gee_datasets_df["exclude_from_output"] != 1
546
+ ]
547
+ return list(
548
+ lookup_gee_datasets_df["name"][
549
+ (lookup_gee_datasets_df["use_for_risk"] == 1)
550
+ & (lookup_gee_datasets_df["theme"] == "treecover")
551
+ ]
552
+ )
553
+
554
+
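The get_cols_ind_* helpers below all follow the pattern above: drop rows flagged with exclude_from_output, then keep dataset names whose use flag and theme match. A toy lookup table (invented rows, real column names) makes the filtering explicit:

import pandas as pd
from openforis_whisp.risk import get_cols_ind_01_treecover

lookup = pd.DataFrame(
    {
        "name": ["layer_a_treecover", "layer_b_treecover", "layer_c_old", "layer_d_soy"],
        "theme": ["treecover", "treecover", "treecover", "commodities"],
        "use_for_risk": [1, 1, 1, 1],
        "exclude_from_output": [0, 0, 1, 0],
    }
)
# layer_c_old is excluded from output and layer_d_soy belongs to the commodities theme.
print(get_cols_ind_01_treecover(lookup))  # ['layer_a_treecover', 'layer_b_treecover']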
555
+ def get_cols_ind_02_commodities(lookup_gee_datasets_df):
+     """
+     Generate a list of dataset names for the commodities theme, excluding those marked for exclusion.
+
+     Args:
+         lookup_gee_datasets_df (pd.DataFrame): DataFrame containing dataset information.
+
+     Returns:
+         list: List of dataset names set to be used in the risk calculations for the commodities theme, excluding those marked for exclusion.
+     """
+     lookup_gee_datasets_df = lookup_gee_datasets_df[
+         lookup_gee_datasets_df["exclude_from_output"] != 1
+     ]
+     return list(
+         lookup_gee_datasets_df["name"][
+             (lookup_gee_datasets_df["use_for_risk"] == 1)
+             & (lookup_gee_datasets_df["theme"] == "commodities")
+         ]
+     )
+
+
+ def get_cols_ind_03_dist_before_2020(lookup_gee_datasets_df):
+     """
+     Generate a list of dataset names for the disturbance before 2020 theme, excluding those marked for exclusion.
+
+     Args:
+         lookup_gee_datasets_df (pd.DataFrame): DataFrame containing dataset information.
+
+     Returns:
+         list: List of dataset names set to be used in the risk calculations for the disturbance before 2020 theme, excluding those marked for exclusion.
+     """
+     lookup_gee_datasets_df = lookup_gee_datasets_df[
+         lookup_gee_datasets_df["exclude_from_output"] != 1
+     ]
+     return list(
+         lookup_gee_datasets_df["name"][
+             (lookup_gee_datasets_df["use_for_risk"] == 1)
+             & (lookup_gee_datasets_df["theme"] == "disturbance_before")
+         ]
+     )
+
+
+ def get_cols_ind_04_dist_after_2020(lookup_gee_datasets_df):
+     """
+     Generate a list of dataset names for the disturbance after 2020 theme, excluding those marked for exclusion.
+
+     Args:
+         lookup_gee_datasets_df (pd.DataFrame): DataFrame containing dataset information.
+
+     Returns:
+         list: List of dataset names set to be used in the risk calculations for the disturbance after 2020 theme, excluding those marked for exclusion.
+     """
+     lookup_gee_datasets_df = lookup_gee_datasets_df[
+         lookup_gee_datasets_df["exclude_from_output"] != 1
+     ]
+     return list(
+         lookup_gee_datasets_df["name"][
+             (lookup_gee_datasets_df["use_for_risk"] == 1)
+             & (lookup_gee_datasets_df["theme"] == "disturbance_after")
+         ]
+     )
+
+
+ def get_cols_ind_05_primary_2020(lookup_gee_datasets_df):
+     """
+     Generate a list of dataset names for primary forests in 2020.
+
+     Args:
+         lookup_gee_datasets_df (pd.DataFrame): DataFrame containing dataset information.
+
+     Returns:
+         list: List of dataset names set to be used in the risk calculations for the degradation - primary forest in 2020, excluding those marked for exclusion.
+     """
+     lookup_gee_datasets_df = lookup_gee_datasets_df[
+         lookup_gee_datasets_df["exclude_from_output"] != 1
+     ]
+     return list(
+         lookup_gee_datasets_df["name"][
+             (lookup_gee_datasets_df["use_for_risk_timber"] == 1)
+             & (lookup_gee_datasets_df["theme_timber"] == "primary")
+         ]
+     )
+
+
+ def get_cols_ind_06_nat_reg_2020(lookup_gee_datasets_df):
+     """
+     Generate a list of dataset names for naturally regenerating forests in 2020.
+
+     Args:
+         lookup_gee_datasets_df (pd.DataFrame): DataFrame containing dataset information.
+
+     Returns:
+         list: List of dataset names set to be used in the risk calculations for the degradation - naturally regenerating forests in 2020, excluding those marked for exclusion.
+     """
+     lookup_gee_datasets_df = lookup_gee_datasets_df[
+         lookup_gee_datasets_df["exclude_from_output"] != 1
+     ]
+     return list(
+         lookup_gee_datasets_df["name"][
+             (lookup_gee_datasets_df["use_for_risk_timber"] == 1)
+             & (lookup_gee_datasets_df["theme_timber"] == "naturally_reg_2020")
+         ]
+     )
+
+
+ def get_cols_ind_07_planted_2020(lookup_gee_datasets_df):
+     """
+     Generate a list of dataset names for planted and plantation forests in 2020.
+
+     Args:
+         lookup_gee_datasets_df (pd.DataFrame): DataFrame containing dataset information.
+
+     Returns:
+         list: List of dataset names set to be used in the risk calculations for the degradation - planted and plantation forests in 2020, excluding those marked for exclusion.
+     """
+     lookup_gee_datasets_df = lookup_gee_datasets_df[
+         lookup_gee_datasets_df["exclude_from_output"] != 1
+     ]
+     return list(
+         lookup_gee_datasets_df["name"][
+             (lookup_gee_datasets_df["use_for_risk_timber"] == 1)
+             & (lookup_gee_datasets_df["theme_timber"] == "planted_plantation_2020")
+         ]
+     )
+
+
+ def get_cols_ind_08_planted_after_2020(lookup_gee_datasets_df):
+     """
+     Generate a list of dataset names for planted and plantation forests post 2020.
+
+     Args:
+         lookup_gee_datasets_df (pd.DataFrame): DataFrame containing dataset information.
+
+     Returns:
+         list: List of dataset names set to be used in the risk calculations for the degradation - planted and plantation forests post 2020, excluding those marked for exclusion.
+     """
+     lookup_gee_datasets_df = lookup_gee_datasets_df[
+         lookup_gee_datasets_df["exclude_from_output"] != 1
+     ]
+     return list(
+         lookup_gee_datasets_df["name"][
+             (lookup_gee_datasets_df["use_for_risk_timber"] == 1)
+             & (
+                 lookup_gee_datasets_df["theme_timber"]
+                 == "planted_plantation_after_2020"
+             )
+         ]
+     )
+
+
+ def get_cols_ind_09_treecover_after_2020(lookup_gee_datasets_df):
+     """
+     Generate a list of dataset names for treecover post 2020.
+
+     Args:
+         lookup_gee_datasets_df (pd.DataFrame): DataFrame containing dataset information.
+
+     Returns:
+         list: List of dataset names set to be used in the risk calculations for the degradation - treecover post 2020, excluding those marked for exclusion.
+     """
+     lookup_gee_datasets_df = lookup_gee_datasets_df[
+         lookup_gee_datasets_df["exclude_from_output"] != 1
+     ]
+     return list(
+         lookup_gee_datasets_df["name"][
+             (lookup_gee_datasets_df["use_for_risk_timber"] == 1)
+             & (lookup_gee_datasets_df["theme_timber"] == "treecover_after_2020")
+         ]
+     )
+
+
+ def get_cols_ind_10_agri_after_2020(lookup_gee_datasets_df):
+     """
+     Generate a list of dataset names for croplands post 2020.
+
+     Args:
+         lookup_gee_datasets_df (pd.DataFrame): DataFrame containing dataset information.
+
+     Returns:
+         list: List of dataset names set to be used in the risk calculations for the degradation - croplands post 2020, excluding those marked for exclusion.
+     """
+     lookup_gee_datasets_df = lookup_gee_datasets_df[
+         lookup_gee_datasets_df["exclude_from_output"] != 1
+     ]
+     return list(
+         lookup_gee_datasets_df["name"][
+             (lookup_gee_datasets_df["use_for_risk_timber"] == 1)
+             & (lookup_gee_datasets_df["theme_timber"] == "agri_after_2020")
+         ]
+     )
+
+
+ def get_cols_ind_11_logging_before_2020(lookup_gee_datasets_df):
+     """
+     Generate a list of dataset names for logging concessions (2020 if available).
+
+     Args:
+         lookup_gee_datasets_df (pd.DataFrame): DataFrame containing dataset information.
+
+     Returns:
+         list: List of dataset names set to be used in the risk calculations for the degradation - logging concessions, excluding those marked for exclusion.
+     """
+     lookup_gee_datasets_df = lookup_gee_datasets_df[
+         lookup_gee_datasets_df["exclude_from_output"] != 1
+     ]
+     return list(
+         lookup_gee_datasets_df["name"][
+             (lookup_gee_datasets_df["use_for_risk_timber"] == 1)
+             & (lookup_gee_datasets_df["theme_timber"] == "logging_concession")
+         ]
+     )
+
+
+ def clamp(
+     value: float | pd.Series, min_val: float, max_val: float
+ ) -> float | pd.Series:
+     """
+     Clamp a value or a Pandas Series within a specified range.
+
+     Args:
+         value (float | pd.Series): The value or series to be clamped.
+         min_val (float): The minimum value of the range.
+         max_val (float): The maximum value of the range.
+
+     Returns:
+         float | pd.Series: The clamped value or series within the range.
+     """
+     if isinstance(value, pd.Series):
+         return value.clip(lower=min_val, upper=max_val)
+     else:
+         return max(min_val, min(value, max_val))
+
+
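clamp behaves the same way for scalars and for Series; a quick illustrative check:

import pandas as pd

print(clamp(104.3, 0, 100))  # 100
print(clamp(pd.Series([-2.0, 55.0, 104.3]), 0, 100).tolist())  # [0.0, 55.0, 100.0]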
+ def check_range(value: float) -> None:
+     """Raise a ValueError if the value is outside the 0-100 range."""
+     if not (0 <= value <= 100):
+         raise ValueError("Value must be between 0 and 100.")
+
+
+ def add_custom_bands_info_to_lookup(
+     lookup_df: pd.DataFrame, custom_bands_info: dict, df_columns: list
+ ) -> pd.DataFrame:
+     """
+     Add custom bands to the lookup DataFrame for risk calculations.
+
+     Parameters
+     ----------
+     lookup_df : pd.DataFrame
+         Original lookup DataFrame
+     custom_bands_info : dict
+         Custom band definitions with risk info
+     df_columns : list
+         List of columns in the actual data DataFrame
+
+     Returns
+     -------
+     pd.DataFrame
+         Lookup DataFrame with custom bands added
+     """
+     custom_rows = []
+
+     for band_name, band_info in custom_bands_info.items():
+         # Only add bands that actually exist in the DataFrame
+         if band_name in df_columns:
+             custom_row = {
+                 "name": band_name,  # Use the band name as provided
+                 "theme": band_info.get(
+                     "theme", pd.NA
+                 ),  # default to NA if not provided
+                 "theme_timber": band_info.get(
+                     "theme_timber", pd.NA
+                 ),  # default to NA if not provided
+                 "use_for_risk": band_info.get(
+                     "use_for_risk", 0
+                 ),  # default to 0 if not provided
+                 "use_for_risk_timber": band_info.get(
+                     "use_for_risk_timber", 0
+                 ),  # default to 0 if not provided
+                 "exclude_from_output": 0,  # 0 so custom bands are not excluded from the output
+                 "ISO2_code": pd.NA,  # global (no country restriction) by default
+                 # Add other required columns with defaults
+                 "col_type": "float64",  # custom bands are treated as float64
+                 "is_nullable": 1,
+                 "is_required": 0,
+                 "order": 9999,  # put custom bands at the end of the output
+                 "corresponding_variable": pd.NA,  # not necessary for custom bands
+             }
+             custom_rows.append(custom_row)
+
+     if custom_rows:
+         custom_df = pd.DataFrame(custom_rows)
+         # Combine with original lookup
+         lookup_df = pd.concat([lookup_df, custom_df], ignore_index=True)
+
+     return lookup_df
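A short, hypothetical usage sketch (band names, themes, and the stats columns are invented for illustration; the lookup shown is a stub with only a few of the real columns):

import pandas as pd

lookup = pd.DataFrame(
    {"name": ["treecover_dataset_a"], "theme": ["treecover"], "use_for_risk": [1], "exclude_from_output": [0]}
)
custom_bands_info = {
    "my_custom_treecover": {"theme": "treecover", "use_for_risk": 1},
    "band_not_in_stats": {"theme": "commodities"},  # skipped: not present in df_columns
}
df_columns = ["plot_id", "my_custom_treecover"]  # columns present in the stats DataFrame
new_lookup = add_custom_bands_info_to_lookup(lookup, custom_bands_info, df_columns)
print(new_lookup["name"].tolist())  # ['treecover_dataset_a', 'my_custom_treecover']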