openforis-whisp 0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,346 @@
+ # !pip install pandera[io]  # special version used
+ import pandera as pa
+ import pandas as pd
+ import os
+ import logging
+
+
+ from openforis_whisp.logger import StdoutLogger, FileLogger
+
+ from openforis_whisp.pd_schemas import data_lookup_type
+
+
+ from openforis_whisp.parameters.config_runtime import (
+     DEFAULT_GEE_DATASETS_LOOKUP_TABLE_PATH,
+     DEFAULT_CONTEXT_LOOKUP_TABLE_PATH,
+ )
+
+ logger = StdoutLogger(__name__)
+
+
+ # Dictionary to cache schema and modification times for multiple files
+ cached_schema = None
+ cached_file_mtimes = {}
+
+
+ def validate_dataframe_using_lookups(
+     df_stats: pd.DataFrame, file_paths: list = None
+ ) -> pd.DataFrame:
+     """
+     Load the schema if any file in the list has changed and validate the DataFrame against the loaded schema.
+
+     Args:
+         df_stats (pd.DataFrame): The DataFrame to validate.
+         file_paths (list): List of paths to schema files.
+
+     Returns:
+         pd.DataFrame: The validated DataFrame.
+     """
+     # Load the schema
+     schema = load_schema_if_any_file_changed(file_paths)
+
+     # Validate the DataFrame
+     validated_df = validate_dataframe(df_stats, schema)
+
+     return validated_df
+
+
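A minimal usage sketch (illustrative only, not part of the packaged code), assuming validate_dataframe_using_lookups is importable from the installed package. The column names and values below are invented; whether validation passes depends on the real lookup tables.

import pandas as pd

# Invented results table; real column names come from the packaged lookup CSVs
df_stats = pd.DataFrame({"plotId": [1, 2], "Area": [12.5, 3.2]})

# Builds (and caches) a schema from the two default lookup tables, coerces
# df_stats to the declared types, and returns None if a SchemaError is raised
validated_df = validate_dataframe_using_lookups(df_stats)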
+ # NB uses default inputs; custom lookup files can be passed as arguments
+ def load_schema_if_any_file_changed(file_paths):
+     """Load schema only if any file in the list has changed."""
+     global cached_schema, cached_file_mtimes
+
+     if file_paths is None:
+         file_paths = [
+             DEFAULT_GEE_DATASETS_LOOKUP_TABLE_PATH,
+             DEFAULT_CONTEXT_LOOKUP_TABLE_PATH,
+         ]
+
+     # Flag to indicate if any file has changed
+     schema_needs_update = False
+
+     # Check each file's modification time
+     for file_path in file_paths:
+         current_mtime = os.path.getmtime(file_path)
+
+         # If the file is new or has been modified, mark schema for update
+         if (
+             file_path not in cached_file_mtimes
+             or current_mtime != cached_file_mtimes[file_path]
+         ):
+             print(f"File {file_path} changed, updating schema...")
+             schema_needs_update = True
+             cached_file_mtimes[
+                 file_path
+             ] = current_mtime  # Update the modification time
+
+     # If any file has changed, update the schema
+     if schema_needs_update or cached_schema is None:
+         print("Creating or updating schema based on changed files...")
+         # You can combine the files as needed; here we assume one schema file
+         # If you want to handle multiple schema files differently, adjust this
+
+         # add checks on lookup inputs (i.e. a dataframe in type format: data_lookup_type)
+         combined_lookup_df: data_lookup_type = append_csvs_to_dataframe(
+             file_paths
+         )  # concatenates input lookup files
+
+         cached_schema = create_schema_from_dataframe(
+             combined_lookup_df
+         )  # create cached schema
+
+     else:
+         print("Using cached schema.")
+
+     return cached_schema
+
+
+ def validate_dataframe(
+     df_stats: pd.DataFrame, schema: pa.DataFrameSchema
+ ) -> pd.DataFrame:
+     """Validate the DataFrame against the given schema, reorder columns to match schema order, and list missing columns.
+
+     Args:
+         df_stats (pd.DataFrame): The DataFrame to validate.
+         schema (pa.DataFrameSchema): The schema to validate against.
+
+     Returns:
+         pd.DataFrame: The validated DataFrame with columns ordered according to the schema, or None if validation fails.
+     """
+     log_missing_columns(df_stats, schema)
+
+     # df_stats = df_stats.reindex(schema.columns.keys(), axis=1)
+
+     # Try to automatically coerce the DataFrame to match the schema types
+     try:
+         validated_df = schema(df_stats)
+     except pa.errors.SchemaError as e:
+         print("Error during validation:", e)
+         # Return None or raise the error if validation fails
+         return None  # or raise e
+
+     # Reorder the validated DataFrame to match the schema's column order
+     validated_df = validated_df.reindex(schema.columns.keys(), axis=1)
+
+     return validated_df
+
+
+ # example code to convert the schema to YAML format if you want to export it (note: pandera[io] required)
+ # cached_schema.to_yaml(output_file_path)
+
+ # from pandera import io
+ # loaded_schema = io.from_yaml(output_file_path)
+
+
+ def append_csvs_to_dataframe(csv_paths):
+     """
+     Appends multiple CSV files into a single Pandas DataFrame.
+
+     Args:
+     - csv_paths (list of str): List of paths to CSV files to append.
+
+     Returns:
+     - pd.DataFrame: Combined DataFrame containing data from all provided CSV files.
+
+     Raises:
+     - ValueError: If any CSV file cannot be read.
+     """
+
+     combined_df = pd.DataFrame()  # Initialize an empty DataFrame
+
+     for path in csv_paths:
+         try:
+             # Read the CSV file into a DataFrame
+             df = pd.read_csv(path)
+             # Append to the combined DataFrame
+             combined_df = pd.concat([combined_df, df], ignore_index=True)
+         except Exception as e:
+             raise ValueError(f"Error reading {path}: {e}")
+
+     return combined_df
+
+
+ def create_schema_from_dataframe(schema_df: pd.DataFrame) -> pa.DataFrameSchema:
+     """Create a Pandera schema from a DataFrame containing schema information."""
+
+     if schema_df.empty:
+         raise ValueError("The input DataFrame is empty.")
+
+     required_columns = ["name", "col_type", "is_nullable", "is_required"]
+     missing_columns = [col for col in required_columns if col not in schema_df.columns]
+     if missing_columns:
+         raise ValueError(f"Missing columns in schema DataFrame: {missing_columns}")
+
+     # print("Schema DataFrame columns:", schema_df.columns)
+
+     # Sort DataFrame by 'order' if it exists
+     if "order" in schema_df.columns:
+         schema_df = schema_df.sort_values(by="order")
+
+     # Remove rows where 'exclude_from_output' equals 1, if that column exists
+     if "exclude_from_output" in schema_df.columns:
+         schema_df = schema_df[schema_df["exclude_from_output"] != 1]
+
+     # Create a dictionary to hold the column schema
+     schema_dict = {}
+     for _, row in schema_df.iterrows():
+         col_name = row["name"]
+         col_type = row["col_type"]
+         is_nullable = row["is_nullable"] in (1, "1", True, "True")
+         is_required = row["is_required"] in (1, "1", True, "True")
+
+         # print(
+         #     f"Processing column: {col_name}, Type: {col_type}, Nullable: {is_nullable}, Required: {is_required}"
+         # )
+
+         # Map DataFrame types to Pandera types
+         if col_type == "int64":
+             schema_dict[col_name] = pa.Column(
+                 pa.Int64, nullable=is_nullable, required=is_required
+             )
+         elif col_type == "int":
+             schema_dict[col_name] = pa.Column(
+                 pa.Int, nullable=is_nullable, required=is_required
+             )
+         elif col_type == "string":
+             schema_dict[col_name] = pa.Column(
+                 pa.String, nullable=is_nullable, required=is_required
+             )
+         elif col_type == "float32":
+             schema_dict[col_name] = pa.Column(
+                 pa.Float32, nullable=is_nullable, required=is_required
+             )
+         elif col_type == "float64":
+             schema_dict[col_name] = pa.Column(
+                 pa.Float64, nullable=is_nullable, required=is_required
+             )
+         elif col_type == "bool":
+             schema_dict[col_name] = pa.Column(
+                 pa.Bool, nullable=is_nullable, required=is_required
+             )
+         else:
+             raise ValueError(f"Unsupported type: {col_type}")
+
+     # Create and return the DataFrame schema with coercion enabled
+     schema = pa.DataFrameSchema(
+         schema_dict,
+         strict=False,
+         unique_column_names=True,
+         add_missing_columns=True,
+         coerce=True,
+     )
+
+     return schema
+
+
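For orientation, a small sketch (not from the package) of the kind of lookup table create_schema_from_dataframe expects. The column headers match those the function checks and reads; the rows themselves are invented.

import pandas as pd

# Hypothetical two-row lookup table; real rows come from the packaged CSV lookups
lookup_df = pd.DataFrame(
    {
        "name": ["plotId", "Area"],
        "col_type": ["int64", "float64"],
        "is_nullable": [0, 0],
        "is_required": [1, 1],
        "order": [1, 2],
        "exclude_from_output": [0, 0],
    }
)

schema = create_schema_from_dataframe(lookup_df)
# coerce=True means e.g. an "Area" column of strings like "12.5" is cast to float64
validated = schema(pd.DataFrame({"plotId": [1], "Area": ["12.5"]}))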
+ def log_missing_columns(df_stats: pd.DataFrame, template_schema: pa.DataFrameSchema):
+     """Log columns that are missing from either the results DataFrame or the schema."""
+     # Initialize the logger
+     logger = setup_logger(__name__)
+
+     # Extract the expected columns from the DataFrameSchema
+     template_columns = template_schema.columns.keys()
+     df_stats_columns = df_stats.columns
+
+     # Find missing columns
+     missing_in_template = [
+         col for col in df_stats_columns if col not in template_columns
+     ]
+     missing_in_stats = [col for col in template_columns if col not in df_stats_columns]
+
+     # Log results for missing columns in df_stats
+     if missing_in_template:
+         logger.warning(
+             f"The following columns from the results dataframe did not match any columns in the schema: \n{', '.join(missing_in_template)}"
+         )
+     else:
+         logger.info("All columns from the dataframe were found in the schema.")
+
+     # Log results for missing columns in the schema
+     if missing_in_stats:
+         logger.warning(
+             f"The following columns in the schema did not match any columns from the results dataframe: \n{', '.join(missing_in_stats)}"
+         )
+     else:
+         logger.info("All columns from the schema were found in the results dataframe.")
+
+
+ def setup_logger(name):
+     """
+     Set up a logger with a specific name to avoid duplicate logs.
+     """
+     logger = logging.getLogger(name)
+     if not logger.hasHandlers():
+         # Create handlers only if there are none
+         stdout_handler = logging.StreamHandler()
+         file_handler = logging.FileHandler("missing_columns.log")
+
+         # Set levels
+         stdout_handler.setLevel(logging.WARNING)
+         file_handler.setLevel(logging.WARNING)
+
+         # Create formatter and add it to the handlers
+         formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
+         stdout_handler.setFormatter(formatter)
+         file_handler.setFormatter(formatter)
+
+         # Add handlers to the logger
+         logger.addHandler(stdout_handler)
+         logger.addHandler(file_handler)
+
+     return logger
@@ -0,0 +1,329 @@
+ import pandas as pd
+
+ from .pd_schemas import data_lookup_type
+
+ from openforis_whisp.parameters.config_runtime import (
+     percent_or_ha,
+     geometry_area_column,
+     DEFAULT_GEE_DATASETS_LOOKUP_TABLE_PATH,
+ )
+
+ # could embed this in each function below that uses lookup_gee_datasets_df.
+ lookup_gee_datasets_df: data_lookup_type = pd.read_csv(
+     DEFAULT_GEE_DATASETS_LOOKUP_TABLE_PATH
+ )
+
+
+ # requires lookup_gee_datasets_df
+
+
+ def whisp_risk(
+     df: data_lookup_type,  # CHECK THIS
+     ind_1_pcent_threshold: float = 10,  # default values (draft decision tree and parameters)
+     ind_2_pcent_threshold: float = 10,  # default values (draft decision tree and parameters)
+     ind_3_pcent_threshold: float = 0,  # default values (draft decision tree and parameters)
+     ind_4_pcent_threshold: float = 0,  # default values (draft decision tree and parameters)
+     ind_1_input_columns: list = None,  # see lookup_gee_datasets for details
+     ind_2_input_columns: list = None,  # see lookup_gee_datasets for details
+     ind_3_input_columns: list = None,  # see lookup_gee_datasets for details
+     ind_4_input_columns: list = None,  # see lookup_gee_datasets for details
+     ind_1_name: str = "Indicator_1_treecover",
+     ind_2_name: str = "Indicator_2_commodities",
+     ind_3_name: str = "Indicator_3_disturbance_before_2020",
+     ind_4_name: str = "Indicator_4_disturbance_after_2020",
+     low_name: str = "no",
+     high_name: str = "yes",
+ ) -> data_lookup_type:
+     """
+     Adds the four indicator columns and an 'EUDR_risk' column (risk under the EU Deforestation Regulation) to the DataFrame.
+
+     Args:
+         df (DataFrame): Input DataFrame.
+         ind_1_pcent_threshold (float, optional): Percentage threshold for the first indicator. Defaults to 10.
+         ind_2_pcent_threshold (float, optional): Percentage threshold for the second indicator. Defaults to 10.
+         ind_3_pcent_threshold (float, optional): Percentage threshold for the third indicator. Defaults to 0.
+         ind_4_pcent_threshold (float, optional): Percentage threshold for the fourth indicator. Defaults to 0.
+         ind_1_input_columns (list, optional): List of input columns for the first indicator. Defaults to columns for the treecover theme.
+         ind_2_input_columns (list, optional): List of input columns for the second indicator. Defaults to columns for the commodities theme.
+         ind_3_input_columns (list, optional): List of input columns for the third indicator. Defaults to columns for disturbance before 2020.
+         ind_4_input_columns (list, optional): List of input columns for the fourth indicator. Defaults to columns for disturbance after 2020.
+         ind_1_name (str, optional): Name of the first indicator column. Defaults to "Indicator_1_treecover".
+         ind_2_name (str, optional): Name of the second indicator column. Defaults to "Indicator_2_commodities".
+         ind_3_name (str, optional): Name of the third indicator column. Defaults to "Indicator_3_disturbance_before_2020".
+         ind_4_name (str, optional): Name of the fourth indicator column. Defaults to "Indicator_4_disturbance_after_2020".
+         low_name (str, optional): Value shown in the table if less than or equal to the threshold. Defaults to "no".
+         high_name (str, optional): Value shown in the table if more than the threshold. Defaults to "yes".
+
+     Returns:
+         data_lookup_type: DataFrame with added indicator columns and 'EUDR_risk' column.
+     """
+
+     if ind_1_input_columns is None:
+         ind_1_input_columns = get_cols_ind_1_treecover(lookup_gee_datasets_df)
+     if ind_2_input_columns is None:
+         ind_2_input_columns = get_cols_ind_2_commodities(lookup_gee_datasets_df)
+     if ind_3_input_columns is None:
+         ind_3_input_columns = get_cols_ind_3_dist_before_2020(lookup_gee_datasets_df)
+     if ind_4_input_columns is None:
+         ind_4_input_columns = get_cols_ind_4_dist_after_2020(lookup_gee_datasets_df)
+
+     # Check that thresholds are within the 0-100 percent range
+     check_range(ind_1_pcent_threshold)
+     check_range(ind_2_pcent_threshold)
+     check_range(ind_3_pcent_threshold)
+     check_range(ind_4_pcent_threshold)
+
+     input_cols = [
+         ind_1_input_columns,
+         ind_2_input_columns,
+         ind_3_input_columns,
+         ind_4_input_columns,
+     ]
+     thresholds = [
+         ind_1_pcent_threshold,
+         ind_2_pcent_threshold,
+         ind_3_pcent_threshold,
+         ind_4_pcent_threshold,
+     ]
+     names = [ind_1_name, ind_2_name, ind_3_name, ind_4_name]
+
+     df_w_indicators = add_indicators(
+         df,
+         input_cols,
+         thresholds,
+         names,
+         low_name,
+         high_name,
+     )
+
+     df_w_indicators_and_risk = add_eudr_risk_col(
+         df=df_w_indicators,
+         ind_1_name=ind_1_name,
+         ind_2_name=ind_2_name,
+         ind_3_name=ind_3_name,
+         ind_4_name=ind_4_name,
+     )
+
+     return df_w_indicators_and_risk
+
+
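An illustrative call (not taken from the package documentation), assuming df_stats is a Whisp statistics table whose column names match the "name" entries of the GEE datasets lookup table.

# df_stats: a validated Whisp statistics table (column names must match the
# "name" entries in the GEE datasets lookup table)
df_with_risk = whisp_risk(df_stats)

# Same call, but only flag tree cover when more than 25% of the plot is covered
df_with_risk_strict = whisp_risk(df_stats, ind_1_pcent_threshold=25)

print(df_with_risk[["Indicator_1_treecover", "EUDR_risk"]].head())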
+ def add_eudr_risk_col(
+     df: data_lookup_type,
+     ind_1_name: str,
+     ind_2_name: str,
+     ind_3_name: str,
+     ind_4_name: str,
+ ) -> data_lookup_type:
+     """
+     Adds the 'EUDR_risk' column (risk under the EU Deforestation Regulation) to the DataFrame based on the indicator values.
+
+     Args:
+         df (DataFrame): Input DataFrame.
+         ind_1_name (str): Name of first indicator column.
+         ind_2_name (str): Name of second indicator column.
+         ind_3_name (str): Name of third indicator column.
+         ind_4_name (str): Name of fourth indicator column.
+
+     Returns:
+         DataFrame: DataFrame with added 'EUDR_risk' column.
+     """
+
+     for index, row in df.iterrows():
+         # If any of the first three indicators suggest low risk, set EUDR_risk to "low"
+         if (
+             row[ind_1_name] == "no"
+             or row[ind_2_name] == "yes"
+             or row[ind_3_name] == "yes"
+         ):
+             df.at[index, "EUDR_risk"] = "low"
+         # If none of the first three indicators suggest low risk and Indicator 4 shows no post-2020 disturbance, set EUDR_risk to "more_info_needed"
+         elif row[ind_4_name] == "no":
+             df.at[index, "EUDR_risk"] = "more_info_needed"
+         # If none of the above conditions are met, set EUDR_risk to "high"
+         else:
+             df.at[index, "EUDR_risk"] = "high"
+
+     return df
+
+
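A toy illustration (values invented) of how the decision tree above resolves: the first row has tree cover but also a commodity signal, the second has tree cover and no disturbance at all, the third has tree cover and post-2020 disturbance.

import pandas as pd

toy = pd.DataFrame(
    {
        "Indicator_1_treecover": ["yes", "yes", "yes"],
        "Indicator_2_commodities": ["yes", "no", "no"],
        "Indicator_3_disturbance_before_2020": ["no", "no", "no"],
        "Indicator_4_disturbance_after_2020": ["no", "no", "yes"],
    }
)

add_eudr_risk_col(
    toy,
    ind_1_name="Indicator_1_treecover",
    ind_2_name="Indicator_2_commodities",
    ind_3_name="Indicator_3_disturbance_before_2020",
    ind_4_name="Indicator_4_disturbance_after_2020",
)
# Expected EUDR_risk: ["low", "more_info_needed", "high"]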
+ def add_indicators(
+     df: data_lookup_type,
+     input_cols: list[list[str]],
+     thresholds: list[float],
+     names: list[str],
+     low_name: str = "no",
+     high_name: str = "yes",
+ ) -> data_lookup_type:
+     """Add one indicator column per (input columns, threshold, name) triple."""
+     for input_col, threshold, name in zip(input_cols, thresholds, names):
+         df = add_indicator_column(
+             df=df,
+             input_columns=input_col,
+             threshold=threshold,
+             new_column_name=name,
+             low_name=low_name,
+             high_name=high_name,
+         )
+
+     return df
+
+
+ def add_indicator_column(
+     df: data_lookup_type,
+     input_columns: list[str],
+     threshold: float,
+     new_column_name: str,
+     low_name: str = "no",
+     high_name: str = "yes",
+     sum_comparison: bool = False,
+ ) -> data_lookup_type:
+     """
+     Add a new column to the DataFrame based on the specified columns, threshold, and comparison sign.
+
+     Parameters:
+         df (data_lookup_type): The pandas DataFrame to which the column will be added.
+         input_columns (list): List of column names to check against the threshold.
+         threshold (float): The threshold value to compare against (the '>' sign is used for comparisons).
+         new_column_name (str): The name of the new column to be added.
+         low_name (str): The value written when below or equal to the threshold (default is 'no').
+         high_name (str): The value written when above the threshold (default is 'yes').
+         sum_comparison (bool): If True, the threshold is compared to the sum of all values in 'input_columns'; if False, each column in the list is compared to the threshold individually (default is False).
+
+     Returns:
+         data_lookup_type: The DataFrame with the new column added.
+     """
+     # Create a new column and initialize with low_name
+     new_column = pd.Series(low_name, index=df.index, name=new_column_name)
+
+     # Default behavior: use '>' for single column comparison
+     if sum_comparison:
+         # Sum all values in specified columns and compare to threshold
+         sum_values = df[input_columns].sum(axis=1)
+         new_column[sum_values > threshold] = high_name
+     else:
+         # Check if any values in specified columns are above the threshold and update the new column accordingly
+         for col in input_columns:
+             # Thresholds are always in percent, so if outputs are in hectares the values are
+             # converted to percent by dividing by the geometry_area_column.
+             # Clamping is needed because decimal-place differences can push input values just over 100.
+             if percent_or_ha == "ha":
+                 val_to_check = clamp(
+                     ((df[col] / df[geometry_area_column]) * 100), 0, 100
+                 )
+             else:
+                 val_to_check = df[col]
+             new_column[val_to_check > threshold] = high_name
+
+     # Concatenate the new column to the DataFrame
+     df = pd.concat([df, new_column], axis=1)
+     return df
+
+
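A sketch of the two comparison modes, assuming percent_or_ha is set to percent so values are compared to the threshold directly; the column names here are invented for illustration.

import pandas as pd

df = pd.DataFrame({"EUFO_2020": [55.0, 20.0], "GFC_TC_2020": [10.0, 80.0]})

# Per-column check: flag "yes" if ANY listed column exceeds 10
df = add_indicator_column(
    df, ["EUFO_2020", "GFC_TC_2020"], threshold=10, new_column_name="any_treecover"
)

# Sum check: flag "yes" only if the listed columns together exceed 90
df = add_indicator_column(
    df,
    ["EUFO_2020", "GFC_TC_2020"],
    threshold=90,
    new_column_name="combined_gt_90",
    sum_comparison=True,
)
# any_treecover -> ["yes", "yes"]; combined_gt_90 -> ["no", "yes"]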
+ def get_cols_ind_1_treecover(lookup_gee_datasets_df):
+     """
+     Generate a list of dataset names for the treecover theme, excluding those marked for exclusion.
+
+     Args:
+         lookup_gee_datasets_df (pd.DataFrame): DataFrame containing dataset information.
+
+     Returns:
+         list: List of dataset names set to be used in the risk calculations for the treecover theme, excluding those marked for exclusion.
+     """
+     lookup_gee_datasets_df = lookup_gee_datasets_df[
+         lookup_gee_datasets_df["exclude_from_output"] != 1
+     ]
+     return list(
+         lookup_gee_datasets_df["name"][
+             (lookup_gee_datasets_df["use_for_risk"] == 1)
+             & (lookup_gee_datasets_df["theme"] == "treecover")
+         ]
+     )
+
+
+ def get_cols_ind_2_commodities(lookup_gee_datasets_df):
+     """
+     Generate a list of dataset names for the commodities theme, excluding those marked for exclusion.
+
+     Args:
+         lookup_gee_datasets_df (pd.DataFrame): DataFrame containing dataset information.
+
+     Returns:
+         list: List of dataset names set to be used in the risk calculations for the commodities theme, excluding those marked for exclusion.
+     """
+     lookup_gee_datasets_df = lookup_gee_datasets_df[
+         lookup_gee_datasets_df["exclude_from_output"] != 1
+     ]
+     return list(
+         lookup_gee_datasets_df["name"][
+             (lookup_gee_datasets_df["use_for_risk"] == 1)
+             & (lookup_gee_datasets_df["theme"] == "commodities")
+         ]
+     )
+
+
+ def get_cols_ind_3_dist_before_2020(lookup_gee_datasets_df):
+     """
+     Generate a list of dataset names for the disturbance before 2020 theme, excluding those marked for exclusion.
+
+     Args:
+         lookup_gee_datasets_df (pd.DataFrame): DataFrame containing dataset information.
+
+     Returns:
+         list: List of dataset names set to be used in the risk calculations for the disturbance before 2020 theme, excluding those marked for exclusion.
+     """
+     lookup_gee_datasets_df = lookup_gee_datasets_df[
+         lookup_gee_datasets_df["exclude_from_output"] != 1
+     ]
+     return list(
+         lookup_gee_datasets_df["name"][
+             (lookup_gee_datasets_df["use_for_risk"] == 1)
+             & (lookup_gee_datasets_df["theme"] == "disturbance_before")
+         ]
+     )
+
+
+ def get_cols_ind_4_dist_after_2020(lookup_gee_datasets_df):
+     """
+     Generate a list of dataset names for the disturbance after 2020 theme, excluding those marked for exclusion.
+
+     Args:
+         lookup_gee_datasets_df (pd.DataFrame): DataFrame containing dataset information.
+
+     Returns:
+         list: List of dataset names set to be used in the risk calculations for the disturbance after 2020 theme, excluding those marked for exclusion.
+     """
+     lookup_gee_datasets_df = lookup_gee_datasets_df[
+         lookup_gee_datasets_df["exclude_from_output"] != 1
+     ]
+     return list(
+         lookup_gee_datasets_df["name"][
+             (lookup_gee_datasets_df["use_for_risk"] == 1)
+             & (lookup_gee_datasets_df["theme"] == "disturbance_after")
+         ]
+     )
+
+
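The four selectors above all read the same lookup-table columns ("name", "theme", "use_for_risk", "exclude_from_output"). A sketch with invented rows; only the column names are taken from the code.

import pandas as pd

lookup_example = pd.DataFrame(
    {
        "name": ["EUFO_2020", "Cocoa_ETH", "RADD_before_2020", "RADD_after_2020"],
        "theme": ["treecover", "commodities", "disturbance_before", "disturbance_after"],
        "use_for_risk": [1, 1, 1, 1],
        "exclude_from_output": [0, 0, 0, 0],
    }
)

get_cols_ind_1_treecover(lookup_example)        # ['EUFO_2020']
get_cols_ind_2_commodities(lookup_example)      # ['Cocoa_ETH']
get_cols_ind_4_dist_after_2020(lookup_example)  # ['RADD_after_2020']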
+ def clamp(
+     value: float | pd.Series, min_val: float, max_val: float
+ ) -> float | pd.Series:
+     """
+     Clamp a value or a Pandas Series within a specified range.
+
+     Args:
+         value (float | pd.Series): The value or series to be clamped.
+         min_val (float): The minimum value of the range.
+         max_val (float): The maximum value of the range.
+
+     Returns:
+         float | pd.Series: The clamped value or series within the range.
+     """
+     if isinstance(value, pd.Series):
+         return value.clip(lower=min_val, upper=max_val)
+     else:
+         return max(min_val, min(value, max_val))
+
+
+ def check_range(value: float) -> None:
+     """Raise a ValueError if a percentage threshold is outside the 0-100 range."""
+     if not (0 <= value <= 100):
+         raise ValueError("Value must be between 0 and 100.")