openforis-whisp 2.0.0a5__py3-none-any.whl → 2.0.0b1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- openforis_whisp/__init__.py +2 -3
- openforis_whisp/datasets.py +29 -36
- openforis_whisp/parameters/lookup_gee_datasets.csv +2 -1
- openforis_whisp/reformat.py +362 -161
- openforis_whisp/risk.py +85 -8
- openforis_whisp/stats.py +145 -51
- openforis_whisp/utils.py +40 -0
- {openforis_whisp-2.0.0a5.dist-info → openforis_whisp-2.0.0b1.dist-info}/METADATA +2 -2
- openforis_whisp-2.0.0b1.dist-info/RECORD +17 -0
- openforis_whisp-2.0.0a5.dist-info/RECORD +0 -17
- {openforis_whisp-2.0.0a5.dist-info → openforis_whisp-2.0.0b1.dist-info}/LICENSE +0 -0
- {openforis_whisp-2.0.0a5.dist-info → openforis_whisp-2.0.0b1.dist-info}/WHEEL +0 -0
openforis_whisp/__init__.py
CHANGED
@@ -29,9 +29,7 @@ try:
 except Exception as e:
     print("Error in default EE initialization:", e)
 
-from openforis_whisp.datasets import (
-    combine_datasets,
-)
+from openforis_whisp.datasets import combine_datasets, combine_custom_bands
 
 from openforis_whisp.stats import (
     whisp_stats_ee_to_ee,
@@ -56,6 +54,7 @@ from openforis_whisp.parameters.config_runtime import (
 
 from openforis_whisp.reformat import (
     validate_dataframe_using_lookups,
+    validate_dataframe_using_lookups_flexible,
    validate_dataframe,
     create_schema_from_dataframe,
     load_schema_if_any_file_changed,
openforis_whisp/datasets.py
CHANGED
@@ -1215,40 +1215,6 @@ def nci_ocs2020_prep():
 
 ###Combining datasets
 
-###Combining datasets
-
-# def combine_datasets():
-#     """Combines datasets into a single multiband image, with fallback if assets are missing."""
-#     img_combined = ee.Image(1).rename(geometry_area_column)
-
-#     # Combine images directly
-#     for img in [func() for func in list_functions()]:
-#         try:
-#             img_combined = img_combined.addBands(img)
-#         except ee.EEException as e:
-#             # logger.error(f"Error adding image: {e}")
-#             print(f"Error adding image: {e}")
-
-#     try:
-#         # Attempt to print band names to check for errors
-#         print(img_combined.bandNames().getInfo())
-#     except ee.EEException as e:
-#         # logger.error(f"Error printing band names: {e}")
-#         # logger.info("Running code for filtering to only valid datasets due to error in input")
-#         print("using valid datasets filter due to error in input")
-#         # Validate images
-#         images_to_test = [func() for func in list_functions()]
-#         valid_imgs = keep_valid_images(images_to_test)  # Validate images
-
-#         # Retry combining images after validation
-#         img_combined = ee.Image(1).rename(geometry_area_column)
-#         for img in valid_imgs:
-#             img_combined = img_combined.addBands(img)
-
-#         img_combined = img_combined.multiply(ee.Image.pixelArea())
-
-#     return img_combined
-
 
 def combine_datasets(national_codes=None):
     """Combines datasets into a single multiband image, with fallback if assets are missing."""
@@ -1380,5 +1346,32 @@ def ee_image_checker(image):
         return False
 
 
-#
-#
+# preparation steps for multiband image with area per pixel values
+# function for notebook environment
+# user provides custom_images dict and custom_bands_info dict
+def combine_custom_bands(custom_images, custom_bands_info):
+    """
+    Combine custom Earth Engine images into a single multiband image with area conversion.
+
+    Returns
+    -------
+    ee.Image
+        Combined bands converted to area values
+    """
+    # ... existing validation code ...
+
+    # Step 3: Rename and combine images
+    band_names = list(custom_bands_info.keys())
+
+    # Start with first image
+    custom_ee_image = custom_images[band_names[0]].rename(band_names[0])
+
+    # Add remaining images if any
+    for name in band_names[1:]:
+        next_image = custom_images[name].rename(name)
+        custom_ee_image = custom_ee_image.addBands(next_image)
+
+    # Convert to area values
+    custom_ee_image = custom_ee_image.multiply(ee.Image.pixelArea())
+
+    return custom_ee_image  # Only return the image
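
The new combine_custom_bands mirrors the tail of combine_datasets for user-supplied layers: rename each image to its dict key, stack the bands, then scale by ee.Image.pixelArea(). A hedged sketch of the expected inputs (the asset IDs are placeholders, not real assets):

import ee
from openforis_whisp.datasets import combine_custom_bands

ee.Initialize()

custom_images = {
    "my_forest_mask": ee.Image("users/example/forest_mask"),  # placeholder asset id
    "my_peat_mask": ee.Image("users/example/peat_mask"),      # placeholder asset id
}
custom_bands_info = {"my_forest_mask": "float32", "my_peat_mask": "float32"}

custom_img = combine_custom_bands(custom_images, custom_bands_info)
# Each band carries area-per-pixel values after the pixelArea() multiplication.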
openforis_whisp/parameters/lookup_gee_datasets.csv
CHANGED

@@ -2,7 +2,7 @@ name,order,ISO2_code,theme,theme_timber,use_for_risk,use_for_risk_timber,exclude
 EUFO_2020,10,,treecover,naturally_reg_2020,1,1,0,float32,1,0,g_jrc_gfc_2020_prep
 GLAD_Primary,20,,treecover,primary,1,1,0,float32,1,0,g_glad_pht_prep
 TMF_undist,30,,treecover,primary,1,1,0,float32,1,0,g_jrc_tmf_undisturbed_prep
-GFC_TC_2020,50,,treecover,naturally_reg_2020,1,1,0,float32,1,0,
+GFC_TC_2020,50,,treecover,naturally_reg_2020,1,1,0,float32,1,0,g_gfc_tc_2020_prep
 Forest_FDaP,60,,treecover,naturally_reg_2020,1,1,0,float32,1,0,g_glad_gfc_10pc_prep
 ESA_TC_2020,70,,treecover,naturally_reg_2020,1,1,0,float32,1,0,g_esa_worldcover_trees_prep
 TMF_plant,80,,commodities,NA,1,1,0,float32,1,0,g_jrc_tmf_plantation_prep
@@ -199,3 +199,4 @@ nBR_INPE_TCamz_pasture_2020,2422,BR,commodities,NA,1,1,0,float32,1,0,nbr_terracl
 nBR_INPE_TCcer_pasture_2020,2423,BR,commodities,NA,1,1,0,float32,1,0,nbr_terraclass_cer20_ac_prep
 nBR_MapBiomas_col9_pasture_2020,2424,BR,commodities,NA,1,1,0,float32,1,0,nbr_mapbiomasc9_pasture_prep
 nCI_Cocoa_bnetd,3000,CI,commodities,NA,1,1,0,float32,1,0,nci_ocs2020_prep
+
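
The only substantive CSV change fills the previously empty prep-function column for GFC_TC_2020 with g_gfc_tc_2020_prep, so the Global Forest Change tree-cover layer is now actually wired into combine_datasets. The function body is not part of this diff; the sketch below is only a plausible guess at what such a prep step does (asset ID, threshold, and loss handling are all assumptions):

import ee

def g_gfc_tc_2020_prep():
    # Assumed: Hansen GFC, >=10% tree cover in 2000, minus loss through 2020.
    gfc = ee.Image("UMD/hansen/global_forest_change_2023_v1_11")  # assumed asset
    treecover = gfc.select("treecover2000").gte(10)               # assumed threshold
    lost_by_2020 = gfc.select("loss").eq(1).And(gfc.select("lossyear").lte(20))
    return treecover.And(lost_by_2020.Not()).rename("GFC_TC_2020")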
openforis_whisp/reformat.py
CHANGED
@@ -38,6 +38,7 @@ def validate_dataframe_using_lookups(
     Returns:
         pd.DataFrame: The validated DataFrame.
     """
+
     # Load the schema
     schema = load_schema_if_any_file_changed(file_paths, national_codes=national_codes)
 
@@ -237,11 +238,11 @@ def create_schema_from_dataframe(schema_df: pd.DataFrame) -> pa.DataFrameSchema:
     return schema
 
 
-def setup_logger(name):
-    # Create and configure logger
-    logging.basicConfig(level=logging.INFO)
-    logger = logging.getLogger(name)
-    return logger
+# def setup_logger(name):
+#     # Create and configure logger
+#     logging.basicConfig(level=logging.INFO)
+#     logger = logging.getLogger(name)
+#     return logger
 
 
 def log_missing_columns(df_stats: pd.DataFrame, template_schema: pa.DataFrameSchema):
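
Retiring setup_logger removes per-function logger creation; additions later in this diff point callers at a module-level logger instead (a comment below cites line 18: logger = StdoutLogger(__name__)). A minimal stdlib sketch of that module-level pattern, assuming StdoutLogger behaves like a stdout-bound logging.Logger:

import logging
import sys

# Created once at import time; every function in the module reuses it.
logging.basicConfig(stream=sys.stdout, level=logging.INFO)
logger = logging.getLogger(__name__)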
@@ -301,164 +302,166 @@ def setup_logger(name):
     return logger
 
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-#
-
-
-
-
-#
-
-
-#
-
-
-
-#
-
-
-
-#
-
-
-
-#
-
-
-
-#
-
-
-
-
-
-
-
-
-
-
-
-#
-
-
-
-#
-
-
-
-# # return lookup_df
-# def filter_lookup_by_country_codes(
-#     lookup_df: pd.DataFrame, national_codes: list = None
-# ) -> pd.DataFrame:
-#     """
-#     Filter lookup DataFrame to include only:
-#     1. Global columns (prefixed with 'g_')
-#     2. General columns (not country-specific)
-#     3. Country-specific columns matching the provided ISO2 codes (if national_codes provided)
-
-#     If no national_codes are provided, ALL country-specific columns are filtered out.
-
-#     Args:
-#         lookup_df (pd.DataFrame): The lookup DataFrame used to create the schema
-#         national_codes (list, optional): List of ISO2 country codes to include.
-#             If None, all country-specific columns are removed.
-
-#     Returns:
-#         pd.DataFrame: Filtered lookup DataFrame
-#     """
-
-#     # Normalize national_codes to lowercase for case-insensitive comparison
-#     if national_codes:
-#         normalized_codes = [
-#             code.lower() for code in national_codes if isinstance(code, str)
-#         ]
-#     else:
-#         normalized_codes = []
-
-#     # Keep track of rows to remove
-#     rows_to_remove = []
-
-#     # Process each row in the lookup DataFrame
-#     for idx, row in lookup_df.iterrows():
-#         col_name = row["name"]
-
-#         # Skip if not a column name entry
-#         if pd.isna(col_name):
-#             continue
-
-#         # Always keep global columns (g_) and general columns
-#         if col_name.startswith("g_"):
-#             continue
-
-#         # Check if this is a country-specific column (nXX_)
-#         is_country_column = False
-#         matched_country = False
-
-#         # Look for pattern nXX_ which indicates a country-specific column
-#         for i in range(len(col_name) - 3):
-#             if (
-#                 col_name[i : i + 1].lower() == "n"
-#                 and len(col_name) > i + 3
-#                 and col_name[i + 3 : i + 4] == "_"
-#             ):
-#                 country_code = col_name[i + 1 : i + 3].lower()
-#                 is_country_column = True
-
-#                 # Only match if we have national_codes AND this country is in the list
-#                 if national_codes and country_code in normalized_codes:
-#                     matched_country = True
-#                 break
-
-#         # Remove country-specific columns that don't match our criteria:
-#         # - If no national_codes provided: remove ALL country columns
-#         # - If national_codes provided: remove country columns NOT in the list
-#         if is_country_column and not matched_country:
-#             rows_to_remove.append(idx)
-
-#     # Filter out flagged rows
-#     if rows_to_remove:
-#         print(f"Filtering out {(rows_to_remove)} country-specific row(s) not matching criteria")
-#         filtered_df = lookup_df.drop(rows_to_remove)
-
-#     # Filter out flagged rows
-#     if rows_to_remove:
-#         # Create detailed debug info
-#         removed_rows_info = []
-#         for idx in rows_to_remove:
-#             row_name = lookup_df.loc[idx, "name"]
-#             removed_rows_info.append({
-#                 'index': idx,
-#                 'name': row_name
-#             })
-
-#         # Extract just the column names for easy viewing
-#         removed_column_names = [info['name'] for info in removed_rows_info]
-
-
-#         print(f"Filtered out {len(rows_to_remove)} country-specific row(s) not matching criteria")
-#         print(f"Removed column names: {removed_column_names}")
-#         return filtered_df
+def filter_lookup_by_country_codes(
+    lookup_df: pd.DataFrame, national_codes: list
+) -> pd.DataFrame:
+    """
+    Filter lookup DataFrame to include only:
+    1. Global columns (prefixed with 'g_')
+    2. General columns (not country-specific)
+    3. Country-specific columns matching the provided ISO2 codes
+
+    Args:
+        lookup_df (pd.DataFrame): The lookup DataFrame used to create the schema
+        national_codes (list): List of ISO2 country codes to include
+
+    Returns:
+        pd.DataFrame: Filtered lookup DataFrame
+    """
+    if not national_codes:
+        return lookup_df
+
+    # Normalize national_codes to lowercase for case-insensitive comparison
+    normalized_codes = [
+        code.lower() for code in national_codes if isinstance(code, str)
+    ]
+
+    # Keep track of rows to filter out
+    rows_to_remove = []
+
+    # Process each row in the lookup DataFrame
+    for idx, row in lookup_df.iterrows():
+        col_name = row["name"]
+
+        # Skip if not a column name entry
+        if pd.isna(col_name):
+            continue
+
+        # Always keep global columns (g_) and columns that aren't country-specific
+        if col_name.startswith("g_"):
+            continue
+
+        # Check if this is a country-specific column (nXX_)
+        is_country_column = False
+        matched_country = False
+
+        # Look for pattern nXX_ which would indicate a country-specific column
+        for i in range(len(col_name) - 3):
+            if (
+                col_name[i : i + 1].lower() == "n"
+                and len(col_name) > i + 3
+                and col_name[i + 3 : i + 4] == "_"
+            ):
+                country_code = col_name[i + 1 : i + 3].lower()
+                is_country_column = True
+                if country_code in normalized_codes:
+                    matched_country = True
+                break
+
+        # If it's a country column but doesn't match our list, flag for removal
+        if is_country_column and not matched_country:
+            rows_to_remove.append(idx)
+
+    # Filter out rows for countries not in our list
+    if rows_to_remove:
+        return lookup_df.drop(rows_to_remove)
+
 
     # return lookup_df
+def filter_lookup_by_country_codes(
+    lookup_df: pd.DataFrame, national_codes: list = None
+) -> pd.DataFrame:
+    """
+    Filter lookup DataFrame to include only:
+    1. Global columns (prefixed with 'g_')
+    2. General columns (not country-specific)
+    3. Country-specific columns matching the provided ISO2 codes (if national_codes provided)
+
+    If no national_codes are provided, ALL country-specific columns are filtered out.
+
+    Args:
+        lookup_df (pd.DataFrame): The lookup DataFrame used to create the schema
+        national_codes (list, optional): List of ISO2 country codes to include.
+            If None, all country-specific columns are removed.
+
+    Returns:
+        pd.DataFrame: Filtered lookup DataFrame
+    """
+
+    # Normalize national_codes to lowercase for case-insensitive comparison
+    if national_codes:
+        normalized_codes = [
+            code.lower() for code in national_codes if isinstance(code, str)
+        ]
+    else:
+        normalized_codes = []
+
+    # Keep track of rows to remove
+    rows_to_remove = []
+
+    # Process each row in the lookup DataFrame
+    for idx, row in lookup_df.iterrows():
+        col_name = row["name"]
+
+        # Skip if not a column name entry
+        if pd.isna(col_name):
+            continue
+
+        # Always keep global columns (g_) and general columns
+        if col_name.startswith("g_"):
+            continue
+
+        # Check if this is a country-specific column (nXX_)
+        is_country_column = False
+        matched_country = False
+
+        # Look for pattern nXX_ which indicates a country-specific column
+        for i in range(len(col_name) - 3):
+            if (
+                col_name[i : i + 1].lower() == "n"
+                and len(col_name) > i + 3
+                and col_name[i + 3 : i + 4] == "_"
+            ):
+                country_code = col_name[i + 1 : i + 3].lower()
+                is_country_column = True
+
+                # Only match if we have national_codes AND this country is in the list
+                if national_codes and country_code in normalized_codes:
+                    matched_country = True
+                break
+
+        # Remove country-specific columns that don't match our criteria:
+        # - If no national_codes provided: remove ALL country columns
+        # - If national_codes provided: remove country columns NOT in the list
+        if is_country_column and not matched_country:
+            rows_to_remove.append(idx)
+
+    # Filter out flagged rows
+    if rows_to_remove:
+        print(
+            f"Filtering out {(rows_to_remove)} country-specific row(s) not matching criteria"
+        )
+        filtered_df = lookup_df.drop(rows_to_remove)
+
+    # Filter out flagged rows
+    # if rows_to_remove:
+    #     # Create detailed debug info
+    #     removed_rows_info = []
+    #     for idx in rows_to_remove:
+    #         row_name = lookup_df.loc[idx, "name"]
+    #         removed_rows_info.append({
+    #             'index': idx,
+    #             'name': row_name
+    #         })
+
+    #     # Extract just the column names for easy viewing
+    #     removed_column_names = [info['name'] for info in removed_rows_info]
+
+    #     print(f"Filtered out {len(rows_to_remove)} country-specific row(s) not matching criteria")
+    #     print(f"Removed column names: {removed_column_names}")
+    #     return filtered_df
+
+    return lookup_df
 
 
 def filter_lookup_by_country_codes(
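
To see what the nXX_ test accepts, here is a self-contained rerun of the matching loop on names from the lookup CSV (the logic is copied from the diff rather than imported, since the module defines this function more than once and only the last definition survives at import time):

def iso2_from_column(col_name):
    # Mirrors the loop above: find n<XX>_ and return the two-letter code.
    for i in range(len(col_name) - 3):
        if col_name[i].lower() == "n" and len(col_name) > i + 3 and col_name[i + 3] == "_":
            return col_name[i + 1 : i + 3].lower()
    return None

for name in ["g_jrc_gfc_2020_prep", "nci_ocs2020_prep", "nbr_mapbiomasc9_pasture_prep"]:
    print(name, "->", iso2_from_column(name))
# g_... -> None (kept unconditionally by the g_ check), nci_... -> 'ci', nbr_... -> 'br'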
@@ -493,3 +496,201 @@ def filter_lookup_by_country_codes(
     )
 
     return lookup_df[mask]
+
+
+def validate_dataframe_using_lookups_flexible(
+    df_stats: pd.DataFrame,
+    file_paths: list = None,
+    national_codes: list = None,
+    custom_bands=None,
+) -> pd.DataFrame:
+    """
+    Load schema and validate DataFrame while handling custom bands properly.
+
+    Parameters
+    ----------
+    df_stats : pd.DataFrame
+        DataFrame to validate
+    file_paths : list, optional
+        Schema file paths
+    national_codes : list, optional
+        Country codes for filtering
+    custom_bands : list or dict or None, optional
+        Custom band information:
+        - List: ['band1', 'band2'] - only preserves these specific bands
+        - Dict: {'band1': 'float64', 'band2': 'int64'} - validates these specific bands with types
+        - None: excludes ALL custom bands (strict mode)
+
+    Returns
+    -------
+    pd.DataFrame
+        Validated DataFrame with custom bands handled according to specification
+    """
+    # Load default schema
+    schema = load_schema_if_any_file_changed(file_paths, national_codes=national_codes)
+    schema_columns = list(schema.columns.keys())
+
+    # Identify extra columns
+    df_columns = df_stats.columns.tolist()
+    extra_columns = [col for col in df_columns if col not in schema_columns]
+    schema_only_columns = [col for col in df_columns if col in schema_columns]
+
+    if extra_columns:
+        logger.info(f"Found {len(extra_columns)} extra columns: {extra_columns}")
+
+        # Split DataFrame
+        df_schema_part = (
+            df_stats[schema_only_columns].copy()
+            if schema_only_columns
+            else pd.DataFrame()
+        )
+        df_extra_part = df_stats[extra_columns].copy()
+
+        # Validate schema columns if any exist
+        if not df_schema_part.empty:
+            try:
+                validated_schema_part = validate_dataframe(df_schema_part, schema)
+            except Exception as e:
+                logger.error(f"Schema validation failed: {e}")
+                validated_schema_part = (
+                    df_schema_part  # Keep original if validation fails
+                )
+        else:
+            validated_schema_part = pd.DataFrame()
+
+        # ========== KEY FIX: Handle custom_bands=None properly ==========
+        if custom_bands is None:
+            # STRICT MODE: Exclude all custom bands when None
+            logger.info("custom_bands=None: Excluding all custom bands (strict mode)")
+            # Return only the schema columns, no extra columns
+            return (
+                validated_schema_part
+                if not validated_schema_part.empty
+                else pd.DataFrame()
+            )
+
+        elif custom_bands is not None:
+            # Process custom bands as specified
+            df_extra_part = _process_custom_bands(df_extra_part, custom_bands)
+
+        # Combine results
+        if not validated_schema_part.empty and not df_extra_part.empty:
+            result = pd.concat([validated_schema_part, df_extra_part], axis=1)
+        elif not validated_schema_part.empty:
+            result = validated_schema_part
+        else:
+            result = df_extra_part
+
+        # Reorder: schema columns first, then extra columns
+        if not validated_schema_part.empty:
+            ordered_columns = [
+                col for col in schema_columns if col in result.columns
+            ] + [col for col in df_extra_part.columns]
+            result = result[ordered_columns]
+
+        return result
+
+    else:
+        # No extra columns - use normal validation
+        return validate_dataframe(df_stats, schema)
+
+
+def _process_custom_bands(df_extra: pd.DataFrame, custom_bands) -> pd.DataFrame:
+    """
+    Process custom bands according to user specifications.
+
+    Parameters
+    ----------
+    df_extra : pd.DataFrame
+        DataFrame with extra columns
+    custom_bands : list or dict
+        Custom band specifications
+
+    Returns
+    -------
+    pd.DataFrame
+        Processed DataFrame with custom bands
+    """
+    if isinstance(custom_bands, list):
+        # Just preserve specified columns as-is
+        custom_band_cols = [col for col in custom_bands if col in df_extra.columns]
+        if custom_band_cols:
+            logger.info(f"Preserving custom bands as-is: {custom_band_cols}")
+            return df_extra[custom_band_cols]
+        else:
+            logger.warning(
+                f"None of the specified custom bands {custom_bands} found in DataFrame"
+            )
+            return df_extra
+
+    elif isinstance(custom_bands, dict):
+        # Apply type conversions
+        result_df = df_extra.copy()
+
+        for band_name, target_type in custom_bands.items():
+            if band_name in result_df.columns:
+                try:
+                    if target_type == "float64":
+                        result_df[band_name] = pd.to_numeric(
+                            result_df[band_name], errors="coerce"
+                        ).astype("float64")
+                    elif target_type == "float32":
+                        result_df[band_name] = pd.to_numeric(
+                            result_df[band_name], errors="coerce"
+                        ).astype("float32")
+                    elif target_type == "int64":
+                        result_df[band_name] = pd.to_numeric(
+                            result_df[band_name], errors="coerce"
+                        ).astype(
+                            "Int64"
+                        )  # Nullable int
+                    elif target_type == "string":
+                        result_df[band_name] = result_df[band_name].astype("string")
+                    elif target_type == "bool":
+                        result_df[band_name] = result_df[band_name].astype("bool")
+
+                    logger.info(f"Converted {band_name} to {target_type}")
+
+                except Exception as e:
+                    logger.warning(
+                        f"Failed to convert {band_name} to {target_type}: {e}"
+                    )
+            else:
+                logger.warning(f"Custom band {band_name} not found in DataFrame")
+
+        return result_df
+
+    else:
+        # Unknown format, just return as-is
+        logger.warning(
+            f"Unknown custom_bands format: {type(custom_bands)}. Preserving all extra columns as-is."
+        )
+        return df_extra
+
+
+# Fix the duplicate logging issue
+def log_missing_columns(df_stats: pd.DataFrame, template_schema: pa.DataFrameSchema):
+    # Remove the duplicate logger creation line
+    # logger = setup_logger(__name__)  # DELETE THIS LINE
+
+    # Use the existing module-level logger (line 18: logger = StdoutLogger(__name__))
+
+    # Extract the expected columns from the DataFrameSchema
+    template_columns = list(template_schema.columns.keys())
+    df_stats_columns = df_stats.columns.tolist()
+
+    # Find missing and extra columns
+    missing_in_df = [col for col in template_columns if col not in df_stats_columns]
+    extra_in_df = [col for col in df_stats_columns if col not in template_columns]
+
+    # Log missing schema columns
+    if missing_in_df:
+        logger.warning(f"Missing expected schema columns: {missing_in_df}")
+    else:
+        logger.info("All expected schema columns found in DataFrame.")
+
+    # Log extra columns (will be preserved)
+    if extra_in_df:
+        logger.info(f"Extra columns found (will be preserved): {extra_in_df}")
+    else:
+        logger.info("No extra columns found in DataFrame.")