openforis-whisp 0.0.1 (openforis_whisp-0.0.1-py3-none-any.whl)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- openforis_whisp/__init__.py +68 -0
- openforis_whisp/data_conversion.py +348 -0
- openforis_whisp/datasets.py +695 -0
- openforis_whisp/logger.py +39 -0
- openforis_whisp/parameters/__init__.py +15 -0
- openforis_whisp/parameters/config_runtime.py +47 -0
- openforis_whisp/parameters/lookup_context_and_metadata.csv +13 -0
- openforis_whisp/parameters/lookup_gee_datasets.csv +155 -0
- openforis_whisp/pd_schemas.py +77 -0
- openforis_whisp/reformat.py +346 -0
- openforis_whisp/risk.py +329 -0
- openforis_whisp/stats.py +752 -0
- openforis_whisp/utils.py +154 -0
- openforis_whisp-0.0.1.dist-info/LICENSE +21 -0
- openforis_whisp-0.0.1.dist-info/METADATA +296 -0
- openforis_whisp-0.0.1.dist-info/RECORD +17 -0
- openforis_whisp-0.0.1.dist-info/WHEEL +4 -0
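
For orientation, a minimal, hypothetical install-and-import sketch for the modules listed above (it assumes the distribution is published under the name openforis-whisp; the version pin simply matches the wheel shown here):

```python
# !pip install openforis-whisp==0.0.1  # hypothetical install command for this wheel
import openforis_whisp  # package root (openforis_whisp/__init__.py above)
from openforis_whisp.risk import whisp_risk  # risk scoring, shown in the diff below
```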
openforis_whisp/reformat.py
ADDED
@@ -0,0 +1,346 @@

# !pip install pandera[io] # special version used
import pandera as pa
import pandas as pd
import os
import logging


from openforis_whisp.logger import StdoutLogger, FileLogger

from openforis_whisp.pd_schemas import data_lookup_type


from openforis_whisp.parameters.config_runtime import (
    DEFAULT_GEE_DATASETS_LOOKUP_TABLE_PATH,
    DEFAULT_CONTEXT_LOOKUP_TABLE_PATH,
)

logger = StdoutLogger(__name__)


# Dictionary to cache schema and modification times for multiple files
cached_schema = None
cached_file_mtimes = {}


def validate_dataframe_using_lookups(
    df_stats: pd.DataFrame, file_paths: list = None
) -> pd.DataFrame:
    """
    Load the schema if any file in the list has changed and validate the DataFrame against the loaded schema.

    Args:
        df_stats (pd.DataFrame): The DataFrame to validate.
        file_paths (list): List of paths to schema files.

    Returns:
        pd.DataFrame: The validated DataFrame.
    """
    # Load the schema
    schema = load_schema_if_any_file_changed(file_paths)

    # Validate the DataFrame
    validated_df = validate_dataframe(df_stats, schema)

    return validated_df


# NB uses default inputs. If you want to use custom inputs, you can pass them as arguments
def load_schema_if_any_file_changed(file_paths):
    """Load schema only if any file in the list has changed."""
    global cached_schema, cached_file_mtimes

    if file_paths is None:
        file_paths = [
            DEFAULT_GEE_DATASETS_LOOKUP_TABLE_PATH,
            DEFAULT_CONTEXT_LOOKUP_TABLE_PATH,
        ]

    # Flag to indicate if any file has changed
    schema_needs_update = False

    # Check each file's modification time
    for file_path in file_paths:
        current_mtime = os.path.getmtime(file_path)

        # If the file is new or has been modified, mark schema for update
        if (
            file_path not in cached_file_mtimes
            or current_mtime != cached_file_mtimes[file_path]
        ):
            print(f"File {file_path} changed, updating schema...")
            schema_needs_update = True
            cached_file_mtimes[
                file_path
            ] = current_mtime  # Update the modification time

    # If any file has changed, update the schema
    if schema_needs_update or cached_schema is None:
        print("Creating or updating schema based on changed files...")
        # You can combine the files as needed; here we assume one schema file
        # If you want to handle multiple schema files differently, adjust this

        # add checks on lookup inputs (i.e. a dataframe in type format: data_lookup_type)
        combined_lookup_df: data_lookup_type = append_csvs_to_dataframe(
            file_paths
        )  # concatenates input lookup files

        cached_schema = create_schema_from_dataframe(
            combined_lookup_df
        )  # create cached schema

    else:
        print("Using cached schema.")

    return cached_schema


def validate_dataframe(
    df_stats: pd.DataFrame, schema: pa.DataFrameSchema
) -> pd.DataFrame:
    """Validate the DataFrame against the given schema, reorder columns to match schema order, and list missing columns.

    Args:
        df_stats (pd.DataFrame): The DataFrame to validate.
        schema (pa.DataFrameSchema): The schema to validate against.

    Returns:
        pd.DataFrame: The validated DataFrame with columns ordered according to the schema, or None if validation fails.
    """
    log_missing_columns(df_stats, schema)

    # df_stats = df_stats.reindex(schema.columns.keys(), axis=1)

    # Try to automatically coerce the DataFrame to match the schema types
    try:
        validated_df = schema(df_stats)
    except pa.errors.SchemaError as e:
        print("Error during validation:", e)
        # Return None or raise the error if validation fails
        return None  # or raise e

    # Reorder the validated DataFrame to match the schema's column order
    validated_df = validated_df.reindex(schema.columns.keys(), axis=1)

    return validated_df


def load_schema_if_any_file_changed(file_paths):
    """Load schema only if any file in the list has changed."""
    global cached_schema, cached_file_mtimes

    if file_paths is None:
        file_paths = [
            DEFAULT_GEE_DATASETS_LOOKUP_TABLE_PATH,
            DEFAULT_CONTEXT_LOOKUP_TABLE_PATH,
        ]

    # Flag to indicate if any file has changed
    schema_needs_update = False

    # Check each file's modification time
    for file_path in file_paths:
        current_mtime = os.path.getmtime(file_path)

        # If the file is new or has been modified, mark schema for update
        if (
            file_path not in cached_file_mtimes
            or current_mtime != cached_file_mtimes[file_path]
        ):
            print(f"File {file_path} changed, updating schema...")
            schema_needs_update = True
            cached_file_mtimes[
                file_path
            ] = current_mtime  # Update the modification time

    # If any file has changed, update the schema
    if schema_needs_update or cached_schema is None:
        print("Creating or updating schema based on changed files...")
        # You can combine the files as needed; here we assume one schema file
        # If you want to handle multiple schema files differently, adjust this

        # add checks on lookup inputs (i.e. a dataframe in type format: data_lookup_type)
        combined_lookup_df: data_lookup_type = append_csvs_to_dataframe(
            file_paths
        )  # concatenates input lookup files

        cached_schema = create_schema_from_dataframe(
            combined_lookup_df
        )  # create cached schema

    else:
        print("Using cached schema.")

    return cached_schema


# example code to convert the schema to YAML format if you want to export it (note: pandera[io] required)
# cached_schema.to_yaml(output_file_path)

# loaded_schema = io.from_yaml(output_file_path)


def append_csvs_to_dataframe(csv_paths):
    """
    Appends multiple CSV files into a single Pandas DataFrame.

    Args:
    - csv_paths (list of str): List of paths to CSV files to append.

    Returns:
    - pd.DataFrame: Combined DataFrame containing data from all provided CSV files.

    Raises:
    - ValueError: If any CSV file cannot be read.
    """

    combined_df = pd.DataFrame()  # Initialize an empty DataFrame

    for path in csv_paths:
        try:
            # Read the CSV file into a DataFrame
            df = pd.read_csv(path)
            # Append to the combined DataFrame
            combined_df = pd.concat([combined_df, df], ignore_index=True)
        except Exception as e:
            raise ValueError(f"Error reading {path}: {e}")

    return combined_df


def create_schema_from_dataframe(schema_df: pd.DataFrame) -> pa.DataFrameSchema:
    """Create a Pandera schema from a DataFrame containing schema information."""

    if schema_df.empty:
        raise ValueError("The input DataFrame is empty.")

    required_columns = ["name", "col_type", "is_nullable", "is_required"]
    missing_columns = [col for col in required_columns if col not in schema_df.columns]
    if missing_columns:
        raise ValueError(f"Missing columns in schema DataFrame: {missing_columns}")

    # print("Schema DataFrame columns:", schema_df.columns)

    # Sort DataFrame by 'order' if it exists
    if "order" in schema_df.columns:
        schema_df = schema_df.sort_values(by="order")

    # Remove rows where 'exclude_from_output' equals 1, if that column exists
    if "exclude_from_output" in schema_df.columns:
        schema_df = schema_df[schema_df["exclude_from_output"] != 1]

    # Create a dictionary to hold the column schema
    schema_dict = {}
    for _, row in schema_df.iterrows():
        col_name = row["name"]
        col_type = row["col_type"]
        is_nullable = row["is_nullable"] in (1, "1", True, "True")
        is_required = row["is_required"] in (1, "1", True, "True")

        # print(
        #     f"Processing column: {col_name}, Type: {col_type}, Nullable: {is_nullable}, Required: {is_required}"
        # )

        # Map DataFrame types to Pandera types
        if col_type == "int64":
            schema_dict[col_name] = pa.Column(
                pa.Int64, nullable=is_nullable, required=is_required
            )
        elif col_type == "int":
            schema_dict[col_name] = pa.Column(
                pa.Int, nullable=is_nullable, required=is_required
            )
        elif col_type == "string":
            schema_dict[col_name] = pa.Column(
                pa.String, nullable=is_nullable, required=is_required
            )
        elif col_type == "float32":
            schema_dict[col_name] = pa.Column(
                pa.Float32, nullable=is_nullable, required=is_required
            )
        elif col_type == "float64":
            schema_dict[col_name] = pa.Column(
                pa.Float64, nullable=is_nullable, required=is_required
            )
        elif col_type == "bool":
            schema_dict[col_name] = pa.Column(
                pa.Bool, nullable=is_nullable, required=is_required
            )
        else:
            raise ValueError(f"Unsupported type: {col_type}")

    # Create and return the DataFrame schema with coercion enabled
    schema = pa.DataFrameSchema(
        schema_dict,
        strict=False,
        unique_column_names=True,
        add_missing_columns=True,
        coerce=True,
    )

    return schema


def setup_logger(name):
    # Create and configure logger
    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(name)
    return logger


def log_missing_columns(df_stats: pd.DataFrame, template_schema: pa.DataFrameSchema):
    # Initialize the logger
    logger = setup_logger(__name__)

    # Extract the expected columns from the DataFrameSchema
    template_columns = template_schema.columns.keys()
    df_stats_columns = df_stats.columns

    # Find missing columns
    missing_in_template = [
        col for col in df_stats_columns if col not in template_columns
    ]
    missing_in_stats = [col for col in template_columns if col not in df_stats_columns]

    # Log results for missing columns in df_stats
    if missing_in_template:
        logger.warning(
            f"The following columns from the results dataframe did not match any columns in the schema: \n{', '.join(missing_in_template)}"
        )
    else:
        logger.info("All columns from dataframe found in the schema.")

    # Log results for missing columns in template_df
    if missing_in_stats:
        logger.warning(
            f"The following columns in the schema did not match any columns from the results dataframe: \n{', '.join(missing_in_stats)}"
        )
    else:
        logger.info("All columns from the schema found in the results dataframe.")


def setup_logger(name):
    """
    Set up a logger with a specific name to avoid duplicate logs.
    """
    logger = logging.getLogger(name)
    if not logger.hasHandlers():
        # Create handlers only if there are none
        stdout_handler = logging.StreamHandler()
        file_handler = logging.FileHandler("missing_columns.log")

        # Set levels
        stdout_handler.setLevel(logging.WARNING)
        file_handler.setLevel(logging.WARNING)

        # Create formatter and add it to the handlers
        formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
        stdout_handler.setFormatter(formatter)
        file_handler.setFormatter(formatter)

        # Add handlers to the logger
        logger.addHandler(stdout_handler)
        logger.addHandler(file_handler)

    return logger
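
For context, a minimal sketch of how the schema utilities above fit together, assuming this hunk is openforis_whisp/reformat.py (it matches the +346 count in the file list) and that the package is installed. The lookup rows and the column names plot_id and Area_ha are hypothetical, not taken from the bundled CSVs:

```python
import pandas as pd
from openforis_whisp.reformat import create_schema_from_dataframe, validate_dataframe

# Hypothetical lookup rows; the real lookups ship as CSVs under openforis_whisp/parameters/
lookup_df = pd.DataFrame(
    {
        "name": ["plot_id", "Area_ha"],      # output column names (illustrative)
        "col_type": ["string", "float64"],   # must be one of the types handled above
        "is_nullable": [0, 0],
        "is_required": [1, 1],
        "order": [1, 2],                     # optional: controls column order
        "exclude_from_output": [0, 0],       # optional: 1 drops the column
    }
)

schema = create_schema_from_dataframe(lookup_df)  # pandera schema with coerce=True

stats_df = pd.DataFrame({"Area_ha": [10, 20], "plot_id": ["a1", "a2"]})
validated = validate_dataframe(stats_df, schema)  # coerces types, reorders to schema order
print(validated.dtypes)
```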
openforis_whisp/risk.py
ADDED
@@ -0,0 +1,329 @@

import pandas as pd

from .pd_schemas import data_lookup_type

from openforis_whisp.parameters.config_runtime import (
    percent_or_ha,
    geometry_area_column,
    DEFAULT_GEE_DATASETS_LOOKUP_TABLE_PATH,
)

# could embed this in each function below that uses lookup_gee_datasets_df.
lookup_gee_datasets_df: data_lookup_type = pd.read_csv(
    DEFAULT_GEE_DATASETS_LOOKUP_TABLE_PATH
)


# requires lookup_gee_datasets_df


def whisp_risk(
    df: data_lookup_type,  # CHECK THIS
    ind_1_pcent_threshold: float = 10,  # default values (draft decision tree and parameters)
    ind_2_pcent_threshold: float = 10,  # default values (draft decision tree and parameters)
    ind_3_pcent_threshold: float = 0,  # default values (draft decision tree and parameters)
    ind_4_pcent_threshold: float = 0,  # default values (draft decision tree and parameters)
    ind_1_input_columns: pd.Series = None,  # see lookup_gee_datasets for details
    ind_2_input_columns: pd.Series = None,  # see lookup_gee_datasets for details
    ind_3_input_columns: pd.Series = None,  # see lookup_gee_datasets for details
    ind_4_input_columns: pd.Series = None,  # see lookup_gee_datasets for details
    ind_1_name: str = "Indicator_1_treecover",
    ind_2_name: str = "Indicator_2_commodities",
    ind_3_name: str = "Indicator_3_disturbance_before_2020",
    ind_4_name: str = "Indicator_4_disturbance_after_2020",
    low_name: str = "no",
    high_name: str = "yes",
) -> data_lookup_type:
    """
    Adds indicator columns and the EUDR (European Union Deforestation Regulation) risk column to the DataFrame based on indicator values.

    Args:
        df (DataFrame): Input DataFrame.
        ind_1_pcent_threshold (int, optional): Percentage threshold for the first indicator. Defaults to 10.
        ind_2_pcent_threshold (int, optional): Percentage threshold for the second indicator. Defaults to 10.
        ind_3_pcent_threshold (int, optional): Percentage threshold for the third indicator. Defaults to 0.
        ind_4_pcent_threshold (int, optional): Percentage threshold for the fourth indicator. Defaults to 0.
        ind_1_input_columns (list, optional): List of input columns for the first indicator. Defaults to columns for the treecover theme.
        ind_2_input_columns (list, optional): List of input columns for the second indicator. Defaults to columns for the commodities theme.
        ind_3_input_columns (list, optional): List of input columns for the third indicator. Defaults to columns for disturbance before 2020.
        ind_4_input_columns (list, optional): List of input columns for the fourth indicator. Defaults to columns for disturbance after 2020.
        ind_1_name (str, optional): Name of the first indicator column. Defaults to "Indicator_1_treecover".
        ind_2_name (str, optional): Name of the second indicator column. Defaults to "Indicator_2_commodities".
        ind_3_name (str, optional): Name of the third indicator column. Defaults to "Indicator_3_disturbance_before_2020".
        ind_4_name (str, optional): Name of the fourth indicator column. Defaults to "Indicator_4_disturbance_after_2020".
        low_name (str, optional): Value shown in the table if less than or equal to the threshold. Defaults to "no".
        high_name (str, optional): Value shown in the table if greater than the threshold. Defaults to "yes".

    Returns:
        data_lookup_type: DataFrame with added 'EUDR_risk' column.
    """

    if ind_1_input_columns is None:
        ind_1_input_columns = get_cols_ind_1_treecover(lookup_gee_datasets_df)
    if ind_2_input_columns is None:
        ind_2_input_columns = get_cols_ind_2_commodities(lookup_gee_datasets_df)
    if ind_3_input_columns is None:
        ind_3_input_columns = get_cols_ind_3_dist_before_2020(lookup_gee_datasets_df)
    if ind_4_input_columns is None:
        ind_4_input_columns = get_cols_ind_4_dist_after_2020(lookup_gee_datasets_df)

    # Check range of values
    check_range(ind_1_pcent_threshold)
    check_range(ind_2_pcent_threshold)
    check_range(ind_3_pcent_threshold)
    check_range(ind_4_pcent_threshold)

    input_cols = [
        ind_1_input_columns,
        ind_2_input_columns,
        ind_3_input_columns,
        ind_4_input_columns,
    ]
    thresholds = [
        ind_1_pcent_threshold,
        ind_2_pcent_threshold,
        ind_3_pcent_threshold,
        ind_4_pcent_threshold,
    ]
    names = [ind_1_name, ind_2_name, ind_3_name, ind_4_name]
    [check_range(threshold) for threshold in thresholds]

    df_w_indicators = add_indicators(
        df,
        input_cols,
        thresholds,
        names,
        low_name,
        high_name,
    )

    df_w_indicators_and_risk = add_eudr_risk_col(
        df=df_w_indicators,
        ind_1_name=ind_1_name,
        ind_2_name=ind_2_name,
        ind_3_name=ind_3_name,
        ind_4_name=ind_4_name,
    )

    return df_w_indicators_and_risk


def add_eudr_risk_col(
    df: data_lookup_type,
    ind_1_name: str,
    ind_2_name: str,
    ind_3_name: str,
    ind_4_name: str,
) -> data_lookup_type:
    """
    Adds the EUDR (European Union Deforestation Regulation) risk column to the DataFrame based on indicator values.

    Args:
        df (DataFrame): Input DataFrame.
        ind_1_name (str): Name of first indicator column.
        ind_2_name (str): Name of second indicator column.
        ind_3_name (str): Name of third indicator column.
        ind_4_name (str): Name of fourth indicator column.

    Returns:
        DataFrame: DataFrame with added 'EUDR_risk' column.
    """

    for index, row in df.iterrows():
        # If any of the first three indicators suggest low risk, set EUDR_risk to "low"
        if (
            row[ind_1_name] == "no"
            or row[ind_2_name] == "yes"
            or row[ind_3_name] == "yes"
        ):
            df.at[index, "EUDR_risk"] = "low"
        # If none of the first three indicators suggest low risk and Indicator 4 suggests no risk, set EUDR_risk to "more_info_needed"
        elif row[ind_4_name] == "no":
            df.at[index, "EUDR_risk"] = "more_info_needed"
        # If none of the above conditions are met, set EUDR_risk to "high"
        else:
            df.at[index, "EUDR_risk"] = "high"

    return df


def add_indicators(
    df: data_lookup_type,
    input_cols: list[str],
    thresholds: list[float],
    names: list[str],
    low_name: str = "no",
    high_name: str = "yes",
) -> data_lookup_type:
    for input_col, threshold, name in zip(input_cols, thresholds, names):
        df = add_indicator_column(
            df=df,
            input_columns=input_col,
            threshold=threshold,
            new_column_name=name,
            low_name=low_name,
            high_name=high_name,
        )

    return df


def add_indicator_column(
    df: data_lookup_type,
    input_columns: list[str],
    threshold: float,
    new_column_name: str,
    low_name: str = "yes",
    high_name: str = "no",
    sum_comparison: bool = False,
) -> data_lookup_type:
    """
    Add a new column to the DataFrame based on the specified columns, threshold, and comparison sign.

    Parameters:
    df (data_lookup_type): The pandas DataFrame to which the column will be added.
    input_columns (list): List of column names to check against the threshold.
    threshold (float): The threshold value to compare against. The '>' sign is used for comparisons.
        When 'sum_comparison' is True, the threshold is compared to the sum of all columns listed in 'input_columns'; when False, each column in the list is compared to the threshold individually.
    new_column_name (str): The name of the new column to be added.
    low_name (str): The value used when below or equal to the threshold (default is 'yes').
    high_name (str): The value used when above the threshold (default is 'no').
    sum_comparison (bool): If True, sum all values in input_columns and compare to threshold (default is False).

    Returns:
    data_lookup_type: The DataFrame with the new column added.
    """
    # Create a new column and initialize with low_name
    new_column = pd.Series(low_name, index=df.index, name=new_column_name)

    # Default behavior: use '>' for single column comparison
    if sum_comparison:
        # Sum all values in specified columns and compare to threshold
        sum_values = df[input_columns].sum(axis=1)
        new_column[sum_values > threshold] = high_name
    else:
        # Check if any values in specified columns are above the threshold and update the new column accordingly
        for col in input_columns:
            # So that the threshold is always in percent: if outputs are in ha, convert to percent (by dividing by the geometry_area_column column).
            # Clamping is needed due to differences in decimal places (meaning input values may go just over 100).
            if percent_or_ha == "ha":
                val_to_check = clamp(
                    ((df[col] / df[geometry_area_column]) * 100), 0, 100
                )
            else:
                val_to_check = df[col]
            new_column[val_to_check > threshold] = high_name

    # Concatenate the new column to the DataFrame
    df = pd.concat([df, new_column], axis=1)
    return df


def get_cols_ind_1_treecover(lookup_gee_datasets_df):
    """
    Generate a list of dataset names for the treecover theme, excluding those marked for exclusion.

    Args:
        lookup_gee_datasets_df (pd.DataFrame): DataFrame containing dataset information.

    Returns:
        list: List of dataset names flagged for use in the risk calculations for the treecover theme, excluding those marked for exclusion.
    """
    lookup_gee_datasets_df = lookup_gee_datasets_df[
        lookup_gee_datasets_df["exclude_from_output"] != 1
    ]
    return list(
        lookup_gee_datasets_df["name"][
            (lookup_gee_datasets_df["use_for_risk"] == 1)
            & (lookup_gee_datasets_df["theme"] == "treecover")
        ]
    )


def get_cols_ind_2_commodities(lookup_gee_datasets_df):
    """
    Generate a list of dataset names for the commodities theme, excluding those marked for exclusion.

    Args:
        lookup_gee_datasets_df (pd.DataFrame): DataFrame containing dataset information.

    Returns:
        list: List of dataset names flagged for use in the risk calculations for the commodities theme, excluding those marked for exclusion.
    """
    lookup_gee_datasets_df = lookup_gee_datasets_df[
        lookup_gee_datasets_df["exclude_from_output"] != 1
    ]
    return list(
        lookup_gee_datasets_df["name"][
            (lookup_gee_datasets_df["use_for_risk"] == 1)
            & (lookup_gee_datasets_df["theme"] == "commodities")
        ]
    )


def get_cols_ind_3_dist_before_2020(lookup_gee_datasets_df):
    """
    Generate a list of dataset names for the disturbance before 2020 theme, excluding those marked for exclusion.

    Args:
        lookup_gee_datasets_df (pd.DataFrame): DataFrame containing dataset information.

    Returns:
        list: List of dataset names flagged for use in the risk calculations for the disturbance before 2020 theme, excluding those marked for exclusion.
    """
    lookup_gee_datasets_df = lookup_gee_datasets_df[
        lookup_gee_datasets_df["exclude_from_output"] != 1
    ]
    return list(
        lookup_gee_datasets_df["name"][
            (lookup_gee_datasets_df["use_for_risk"] == 1)
            & (lookup_gee_datasets_df["theme"] == "disturbance_before")
        ]
    )


def get_cols_ind_4_dist_after_2020(lookup_gee_datasets_df):
    """
    Generate a list of dataset names for the disturbance after 2020 theme, excluding those marked for exclusion.

    Args:
        lookup_gee_datasets_df (pd.DataFrame): DataFrame containing dataset information.

    Returns:
        list: List of dataset names flagged for use in the risk calculations for the disturbance after 2020 theme, excluding those marked for exclusion.
    """
    lookup_gee_datasets_df = lookup_gee_datasets_df[
        lookup_gee_datasets_df["exclude_from_output"] != 1
    ]
    return list(
        lookup_gee_datasets_df["name"][
            (lookup_gee_datasets_df["use_for_risk"] == 1)
            & (lookup_gee_datasets_df["theme"] == "disturbance_after")
        ]
    )


def clamp(
    value: float | pd.Series, min_val: float, max_val: float
) -> float | pd.Series:
    """
    Clamp a value or a Pandas Series within a specified range.

    Args:
        value (float | pd.Series): The value or series to be clamped.
        min_val (float): The minimum value of the range.
        max_val (float): The maximum value of the range.

    Returns:
        float | pd.Series: The clamped value or series within the range.
    """
    if isinstance(value, pd.Series):
        return value.clip(lower=min_val, upper=max_val)
    else:
        return max(min_val, min(value, max_val))


def check_range(value: float) -> None:
    if not (0 <= value <= 100):
        raise ValueError("Value must be between 0 and 100.")
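
For context, a minimal sketch of the decision tree implemented by add_eudr_risk_col above, using hypothetical plot rows whose indicator columns are already set to "yes"/"no". In normal use whisp_risk derives these columns from the Whisp statistics output and the lookup table thresholds:

```python
import pandas as pd
from openforis_whisp.risk import add_eudr_risk_col  # importing the module reads the bundled lookup CSV

# Hypothetical plots with indicator columns already populated
plots = pd.DataFrame(
    {
        "Indicator_1_treecover": ["yes", "yes", "no"],
        "Indicator_2_commodities": ["no", "no", "no"],
        "Indicator_3_disturbance_before_2020": ["no", "no", "no"],
        "Indicator_4_disturbance_after_2020": ["no", "yes", "no"],
    }
)

result = add_eudr_risk_col(
    df=plots,
    ind_1_name="Indicator_1_treecover",
    ind_2_name="Indicator_2_commodities",
    ind_3_name="Indicator_3_disturbance_before_2020",
    ind_4_name="Indicator_4_disturbance_after_2020",
)
print(result["EUDR_risk"].tolist())  # expected: ['more_info_needed', 'high', 'low']
```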