openforis-whisp 2.0.0a5__py3-none-any.whl → 2.0.0b1__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as published to a supported public registry. It is provided for informational purposes only.
@@ -29,9 +29,7 @@ try:
 except Exception as e:
     print("Error in default EE initialization:", e)

-from openforis_whisp.datasets import (
-    combine_datasets,
-)
+from openforis_whisp.datasets import combine_datasets, combine_custom_bands

 from openforis_whisp.stats import (
     whisp_stats_ee_to_ee,
@@ -56,6 +54,7 @@ from openforis_whisp.parameters.config_runtime import (

 from openforis_whisp.reformat import (
     validate_dataframe_using_lookups,
+    validate_dataframe_using_lookups_flexible,
     validate_dataframe,
     create_schema_from_dataframe,
     load_schema_if_any_file_changed,
@@ -1215,40 +1215,6 @@ def nci_ocs2020_prep():

 ###Combining datasets

-###Combining datasets
-
-# def combine_datasets():
-#     """Combines datasets into a single multiband image, with fallback if assets are missing."""
-#     img_combined = ee.Image(1).rename(geometry_area_column)
-
-#     # Combine images directly
-#     for img in [func() for func in list_functions()]:
-#         try:
-#             img_combined = img_combined.addBands(img)
-#         except ee.EEException as e:
-#             # logger.error(f"Error adding image: {e}")
-#             print(f"Error adding image: {e}")
-
-#     try:
-#         # Attempt to print band names to check for errors
-#         print(img_combined.bandNames().getInfo())
-#     except ee.EEException as e:
-#         # logger.error(f"Error printing band names: {e}")
-#         # logger.info("Running code for filtering to only valid datasets due to error in input")
-#         print("using valid datasets filter due to error in input")
-#         # Validate images
-#         images_to_test = [func() for func in list_functions()]
-#         valid_imgs = keep_valid_images(images_to_test)  # Validate images
-
-#         # Retry combining images after validation
-#         img_combined = ee.Image(1).rename(geometry_area_column)
-#         for img in valid_imgs:
-#             img_combined = img_combined.addBands(img)
-
-#     img_combined = img_combined.multiply(ee.Image.pixelArea())
-
-#     return img_combined
-

 def combine_datasets(national_codes=None):
     """Combines datasets into a single multiband image, with fallback if assets are missing."""
@@ -1380,5 +1346,32 @@ def ee_image_checker(image):
        return False


-# print(combine_valid_datasets().bandNames().getInfo())
-# print(combine_datasets().bandNames().getInfo())
+# preparation steps for multiband image with area per pixel values
+# function for notebook environment
+# user provides custom_images dict and custom_bands_info dict
+def combine_custom_bands(custom_images, custom_bands_info):
+    """
+    Combine custom Earth Engine images into a single multiband image with area conversion.
+
+    Returns
+    -------
+    ee.Image
+        Combined bands converted to area values
+    """
+    # ... existing validation code ...
+
+    # Step 3: Rename and combine images
+    band_names = list(custom_bands_info.keys())
+
+    # Start with first image
+    custom_ee_image = custom_images[band_names[0]].rename(band_names[0])
+
+    # Add remaining images if any
+    for name in band_names[1:]:
+        next_image = custom_images[name].rename(name)
+        custom_ee_image = custom_ee_image.addBands(next_image)
+
+    # Convert to area values
+    custom_ee_image = custom_ee_image.multiply(ee.Image.pixelArea())
+
+    return custom_ee_image  # Only return the image
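
A minimal usage sketch of the new helper, assuming an authenticated Earth Engine session. Both arguments are dicts keyed by the desired output band name; only the keys of custom_bands_info are consumed by the combination step shown above, so the metadata values below are placeholders, and the elided validation code may impose further requirements:

import ee
from openforis_whisp.datasets import combine_custom_bands

ee.Initialize()

# Keys become the band names of the combined image
custom_images = {
    "my_band_a": ee.Image.constant(1),
    "my_band_b": ee.Image.constant(0),
}
custom_bands_info = {
    "my_band_a": {},  # placeholder metadata; required contents not shown in this diff
    "my_band_b": {},
}

combined = combine_custom_bands(custom_images, custom_bands_info)
print(combined.bandNames().getInfo())  # ['my_band_a', 'my_band_b']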
@@ -2,7 +2,7 @@ name,order,ISO2_code,theme,theme_timber,use_for_risk,use_for_risk_timber,exclude
 EUFO_2020,10,,treecover,naturally_reg_2020,1,1,0,float32,1,0,g_jrc_gfc_2020_prep
 GLAD_Primary,20,,treecover,primary,1,1,0,float32,1,0,g_glad_pht_prep
 TMF_undist,30,,treecover,primary,1,1,0,float32,1,0,g_jrc_tmf_undisturbed_prep
-GFC_TC_2020,50,,treecover,naturally_reg_2020,1,1,0,float32,1,0,g_glad_gfc_10pc_prep
+GFC_TC_2020,50,,treecover,naturally_reg_2020,1,1,0,float32,1,0,g_gfc_tc_2020_prep
 Forest_FDaP,60,,treecover,naturally_reg_2020,1,1,0,float32,1,0,g_glad_gfc_10pc_prep
 ESA_TC_2020,70,,treecover,naturally_reg_2020,1,1,0,float32,1,0,g_esa_worldcover_trees_prep
 TMF_plant,80,,commodities,NA,1,1,0,float32,1,0,g_jrc_tmf_plantation_prep
@@ -199,3 +199,4 @@ nBR_INPE_TCamz_pasture_2020,2422,BR,commodities,NA,1,1,0,float32,1,0,nbr_terracl
 nBR_INPE_TCcer_pasture_2020,2423,BR,commodities,NA,1,1,0,float32,1,0,nbr_terraclass_cer20_ac_prep
 nBR_MapBiomas_col9_pasture_2020,2424,BR,commodities,NA,1,1,0,float32,1,0,nbr_mapbiomasc9_pasture_prep
 nCI_Cocoa_bnetd,3000,CI,commodities,NA,1,1,0,float32,1,0,nci_ocs2020_prep
+
@@ -38,6 +38,7 @@ def validate_dataframe_using_lookups(
    Returns:
        pd.DataFrame: The validated DataFrame.
    """
+
    # Load the schema
    schema = load_schema_if_any_file_changed(file_paths, national_codes=national_codes)

@@ -237,11 +238,11 @@ def create_schema_from_dataframe(schema_df: pd.DataFrame) -> pa.DataFrameSchema:
    return schema


-def setup_logger(name):
-    # Create and configure logger
-    logging.basicConfig(level=logging.INFO)
-    logger = logging.getLogger(name)
-    return logger
+# def setup_logger(name):
+#     # Create and configure logger
+#     logging.basicConfig(level=logging.INFO)
+#     logger = logging.getLogger(name)
+#     return logger


def log_missing_columns(df_stats: pd.DataFrame, template_schema: pa.DataFrameSchema):
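
The comment inside the new log_missing_columns (later in this diff) points to a single module-level logger (logger = StdoutLogger(__name__) at line 18 of the module); setup_logger above is retired in its favour. A minimal sketch of that pattern using the standard library, since StdoutLogger's own API is not shown in this diff:

import logging

# Configure once at import time; calling a setup_logger() helper from several
# functions re-runs configuration and can produce duplicate handlers/output.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def example():
    logger.info("all functions share the module-level logger")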
@@ -301,164 +302,166 @@ def setup_logger(name):
    return logger


-# def filter_lookup_by_country_codes(
-#     lookup_df: pd.DataFrame, national_codes: list
-# ) -> pd.DataFrame:
-#     """
-#     Filter lookup DataFrame to include only:
-#     1. Global columns (prefixed with 'g_')
-#     2. General columns (not country-specific)
-#     3. Country-specific columns matching the provided ISO2 codes
-
-#     Args:
-#         lookup_df (pd.DataFrame): The lookup DataFrame used to create the schema
-#         national_codes (list): List of ISO2 country codes to include
-
-#     Returns:
-#         pd.DataFrame: Filtered lookup DataFrame
-#     """
-#     if not national_codes:
-#         return lookup_df
-
-#     # Normalize national_codes to lowercase for case-insensitive comparison
-#     normalized_codes = [
-#         code.lower() for code in national_codes if isinstance(code, str)
-#     ]
-
-#     # Keep track of rows to filter out
-#     rows_to_remove = []
-
-#     # Process each row in the lookup DataFrame
-#     for idx, row in lookup_df.iterrows():
-#         col_name = row["name"]
-
-#         # Skip if not a column name entry
-#         if pd.isna(col_name):
-#             continue
-
-#         # Always keep global columns (g_) and columns that aren't country-specific
-#         if col_name.startswith("g_"):
-#             continue
-
-#         # Check if this is a country-specific column (nXX_)
-#         is_country_column = False
-#         matched_country = False
-
-#         # Look for pattern nXX_ which would indicate a country-specific column
-#         for i in range(len(col_name) - 3):
-#             if (
-#                 col_name[i : i + 1].lower() == "n"
-#                 and len(col_name) > i + 3
-#                 and col_name[i + 3 : i + 4] == "_"
-#             ):
-#                 country_code = col_name[i + 1 : i + 3].lower()
-#                 is_country_column = True
-#                 if country_code in normalized_codes:
-#                     matched_country = True
-#                 break
-
-#         # If it's a country column but doesn't match our list, flag for removal
-#         if is_country_column and not matched_country:
-#             rows_to_remove.append(idx)
-
-#     # Filter out rows for countries not in our list
-#     if rows_to_remove:
-#         return lookup_df.drop(rows_to_remove)
-
-#     # return lookup_df
-# def filter_lookup_by_country_codes(
-#     lookup_df: pd.DataFrame, national_codes: list = None
-# ) -> pd.DataFrame:
-#     """
-#     Filter lookup DataFrame to include only:
-#     1. Global columns (prefixed with 'g_')
-#     2. General columns (not country-specific)
-#     3. Country-specific columns matching the provided ISO2 codes (if national_codes provided)
-
-#     If no national_codes are provided, ALL country-specific columns are filtered out.
-
-#     Args:
-#         lookup_df (pd.DataFrame): The lookup DataFrame used to create the schema
-#         national_codes (list, optional): List of ISO2 country codes to include.
-#             If None, all country-specific columns are removed.
-
-#     Returns:
-#         pd.DataFrame: Filtered lookup DataFrame
-#     """
-
-#     # Normalize national_codes to lowercase for case-insensitive comparison
-#     if national_codes:
-#         normalized_codes = [
-#             code.lower() for code in national_codes if isinstance(code, str)
-#         ]
-#     else:
-#         normalized_codes = []
-
-#     # Keep track of rows to remove
-#     rows_to_remove = []
-
-#     # Process each row in the lookup DataFrame
-#     for idx, row in lookup_df.iterrows():
-#         col_name = row["name"]
-
-#         # Skip if not a column name entry
-#         if pd.isna(col_name):
-#             continue
-
-#         # Always keep global columns (g_) and general columns
-#         if col_name.startswith("g_"):
-#             continue
-
-#         # Check if this is a country-specific column (nXX_)
-#         is_country_column = False
-#         matched_country = False
-
-#         # Look for pattern nXX_ which indicates a country-specific column
-#         for i in range(len(col_name) - 3):
-#             if (
-#                 col_name[i : i + 1].lower() == "n"
-#                 and len(col_name) > i + 3
-#                 and col_name[i + 3 : i + 4] == "_"
-#             ):
-#                 country_code = col_name[i + 1 : i + 3].lower()
-#                 is_country_column = True
-
-#                 # Only match if we have national_codes AND this country is in the list
-#                 if national_codes and country_code in normalized_codes:
-#                     matched_country = True
-#                 break
-
-#         # Remove country-specific columns that don't match our criteria:
-#         # - If no national_codes provided: remove ALL country columns
-#         # - If national_codes provided: remove country columns NOT in the list
-#         if is_country_column and not matched_country:
-#             rows_to_remove.append(idx)
-
-#     # Filter out flagged rows
-#     if rows_to_remove:
-#         print(f"Filtering out {(rows_to_remove)} country-specific row(s) not matching criteria")
-#         filtered_df = lookup_df.drop(rows_to_remove)
-
-#     # Filter out flagged rows
-#     if rows_to_remove:
-#         # Create detailed debug info
-#         removed_rows_info = []
-#         for idx in rows_to_remove:
-#             row_name = lookup_df.loc[idx, "name"]
-#             removed_rows_info.append({
-#                 'index': idx,
-#                 'name': row_name
-#             })
-
-#         # Extract just the column names for easy viewing
-#         removed_column_names = [info['name'] for info in removed_rows_info]
-
-
-#         print(f"Filtered out {len(rows_to_remove)} country-specific row(s) not matching criteria")
-#         print(f"Removed column names: {removed_column_names}")
-#     return filtered_df
+def filter_lookup_by_country_codes(
+    lookup_df: pd.DataFrame, national_codes: list
+) -> pd.DataFrame:
+    """
+    Filter lookup DataFrame to include only:
+    1. Global columns (prefixed with 'g_')
+    2. General columns (not country-specific)
+    3. Country-specific columns matching the provided ISO2 codes
+
+    Args:
+        lookup_df (pd.DataFrame): The lookup DataFrame used to create the schema
+        national_codes (list): List of ISO2 country codes to include
+
+    Returns:
+        pd.DataFrame: Filtered lookup DataFrame
+    """
+    if not national_codes:
+        return lookup_df
+
+    # Normalize national_codes to lowercase for case-insensitive comparison
+    normalized_codes = [
+        code.lower() for code in national_codes if isinstance(code, str)
+    ]
+
+    # Keep track of rows to filter out
+    rows_to_remove = []
+
+    # Process each row in the lookup DataFrame
+    for idx, row in lookup_df.iterrows():
+        col_name = row["name"]
+
+        # Skip if not a column name entry
+        if pd.isna(col_name):
+            continue
+
+        # Always keep global columns (g_) and columns that aren't country-specific
+        if col_name.startswith("g_"):
+            continue
+
+        # Check if this is a country-specific column (nXX_)
+        is_country_column = False
+        matched_country = False
+
+        # Look for pattern nXX_ which would indicate a country-specific column
+        for i in range(len(col_name) - 3):
+            if (
+                col_name[i : i + 1].lower() == "n"
+                and len(col_name) > i + 3
+                and col_name[i + 3 : i + 4] == "_"
+            ):
+                country_code = col_name[i + 1 : i + 3].lower()
+                is_country_column = True
+                if country_code in normalized_codes:
+                    matched_country = True
+                break
+
+        # If it's a country column but doesn't match our list, flag for removal
+        if is_country_column and not matched_country:
+            rows_to_remove.append(idx)
+
+    # Filter out rows for countries not in our list
+    if rows_to_remove:
+        return lookup_df.drop(rows_to_remove)
+

    # return lookup_df
+def filter_lookup_by_country_codes(
+    lookup_df: pd.DataFrame, national_codes: list = None
+) -> pd.DataFrame:
+    """
+    Filter lookup DataFrame to include only:
+    1. Global columns (prefixed with 'g_')
+    2. General columns (not country-specific)
+    3. Country-specific columns matching the provided ISO2 codes (if national_codes provided)
+
+    If no national_codes are provided, ALL country-specific columns are filtered out.
+
+    Args:
+        lookup_df (pd.DataFrame): The lookup DataFrame used to create the schema
+        national_codes (list, optional): List of ISO2 country codes to include.
+            If None, all country-specific columns are removed.
+
+    Returns:
+        pd.DataFrame: Filtered lookup DataFrame
+    """
+
+    # Normalize national_codes to lowercase for case-insensitive comparison
+    if national_codes:
+        normalized_codes = [
+            code.lower() for code in national_codes if isinstance(code, str)
+        ]
+    else:
+        normalized_codes = []
+
+    # Keep track of rows to remove
+    rows_to_remove = []
+
+    # Process each row in the lookup DataFrame
+    for idx, row in lookup_df.iterrows():
+        col_name = row["name"]
+
+        # Skip if not a column name entry
+        if pd.isna(col_name):
+            continue
+
+        # Always keep global columns (g_) and general columns
+        if col_name.startswith("g_"):
+            continue
+
+        # Check if this is a country-specific column (nXX_)
+        is_country_column = False
+        matched_country = False
+
+        # Look for pattern nXX_ which indicates a country-specific column
+        for i in range(len(col_name) - 3):
+            if (
+                col_name[i : i + 1].lower() == "n"
+                and len(col_name) > i + 3
+                and col_name[i + 3 : i + 4] == "_"
+            ):
+                country_code = col_name[i + 1 : i + 3].lower()
+                is_country_column = True
+
+                # Only match if we have national_codes AND this country is in the list
+                if national_codes and country_code in normalized_codes:
+                    matched_country = True
+                break
+
+        # Remove country-specific columns that don't match our criteria:
+        # - If no national_codes provided: remove ALL country columns
+        # - If national_codes provided: remove country columns NOT in the list
+        if is_country_column and not matched_country:
+            rows_to_remove.append(idx)
+
+    # Filter out flagged rows
+    if rows_to_remove:
+        print(
+            f"Filtering out {(rows_to_remove)} country-specific row(s) not matching criteria"
+        )
+        filtered_df = lookup_df.drop(rows_to_remove)
+
+    # Filter out flagged rows
+    # if rows_to_remove:
+    #     # Create detailed debug info
+    #     removed_rows_info = []
+    #     for idx in rows_to_remove:
+    #         row_name = lookup_df.loc[idx, "name"]
+    #         removed_rows_info.append({
+    #             'index': idx,
+    #             'name': row_name
+    #         })
+
+    #     # Extract just the column names for easy viewing
+    #     removed_column_names = [info['name'] for info in removed_rows_info]
+
+    #     print(f"Filtered out {len(rows_to_remove)} country-specific row(s) not matching criteria")
+    #     print(f"Removed column names: {removed_column_names}")
+    #     # return filtered_df
+
+    return lookup_df


def filter_lookup_by_country_codes(
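
Both replacement definitions implement the same column-naming convention: rows whose name starts with g_ are global and always kept, while rows embedding an nXX_ country prefix are kept only when XX is in national_codes. A toy illustration, with names taken from the lookup CSV earlier in this diff (note the module ends up defining filter_lookup_by_country_codes more than once, so the mask-based definition in the next hunk is the one that finally binds):

import pandas as pd

lookup_df = pd.DataFrame(
    {
        "name": [
            "g_jrc_gfc_2020_prep",           # global: always kept
            "nci_ocs2020_prep",              # CI-specific: kept only for "CI"
            "nbr_mapbiomasc9_pasture_prep",  # BR-specific: dropped here
        ]
    }
)

filtered = filter_lookup_by_country_codes(lookup_df, national_codes=["CI"])
# Expected: ['g_jrc_gfc_2020_prep', 'nci_ocs2020_prep']
print(filtered["name"].tolist())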
@@ -493,3 +496,201 @@ def filter_lookup_by_country_codes(
    )

    return lookup_df[mask]
+
+
+def validate_dataframe_using_lookups_flexible(
+    df_stats: pd.DataFrame,
+    file_paths: list = None,
+    national_codes: list = None,
+    custom_bands=None,
+) -> pd.DataFrame:
+    """
+    Load schema and validate DataFrame while handling custom bands properly.
+
+    Parameters
+    ----------
+    df_stats : pd.DataFrame
+        DataFrame to validate
+    file_paths : list, optional
+        Schema file paths
+    national_codes : list, optional
+        Country codes for filtering
+    custom_bands : list or dict or None, optional
+        Custom band information:
+        - List: ['band1', 'band2'] - only preserves these specific bands
+        - Dict: {'band1': 'float64', 'band2': 'int64'} - validates these specific bands with types
+        - None: excludes ALL custom bands (strict mode)
+
+    Returns
+    -------
+    pd.DataFrame
+        Validated DataFrame with custom bands handled according to specification
+    """
+    # Load default schema
+    schema = load_schema_if_any_file_changed(file_paths, national_codes=national_codes)
+    schema_columns = list(schema.columns.keys())
+
+    # Identify extra columns
+    df_columns = df_stats.columns.tolist()
+    extra_columns = [col for col in df_columns if col not in schema_columns]
+    schema_only_columns = [col for col in df_columns if col in schema_columns]
+
+    if extra_columns:
+        logger.info(f"Found {len(extra_columns)} extra columns: {extra_columns}")
+
+        # Split DataFrame
+        df_schema_part = (
+            df_stats[schema_only_columns].copy()
+            if schema_only_columns
+            else pd.DataFrame()
+        )
+        df_extra_part = df_stats[extra_columns].copy()
+
+        # Validate schema columns if any exist
+        if not df_schema_part.empty:
+            try:
+                validated_schema_part = validate_dataframe(df_schema_part, schema)
+            except Exception as e:
+                logger.error(f"Schema validation failed: {e}")
+                validated_schema_part = (
+                    df_schema_part  # Keep original if validation fails
+                )
+        else:
+            validated_schema_part = pd.DataFrame()
+
+        # ========== KEY FIX: Handle custom_bands=None properly ==========
+        if custom_bands is None:
+            # STRICT MODE: Exclude all custom bands when None
+            logger.info("custom_bands=None: Excluding all custom bands (strict mode)")
+            # Return only the schema columns, no extra columns
+            return (
+                validated_schema_part
+                if not validated_schema_part.empty
+                else pd.DataFrame()
+            )
+
+        elif custom_bands is not None:
+            # Process custom bands as specified
+            df_extra_part = _process_custom_bands(df_extra_part, custom_bands)
+
+        # Combine results
+        if not validated_schema_part.empty and not df_extra_part.empty:
+            result = pd.concat([validated_schema_part, df_extra_part], axis=1)
+        elif not validated_schema_part.empty:
+            result = validated_schema_part
+        else:
+            result = df_extra_part
+
+        # Reorder: schema columns first, then extra columns
+        if not validated_schema_part.empty:
+            ordered_columns = [
+                col for col in schema_columns if col in result.columns
+            ] + [col for col in df_extra_part.columns]
+            result = result[ordered_columns]
+
+        return result
+
+    else:
+        # No extra columns - use normal validation
+        return validate_dataframe(df_stats, schema)
+
+
+def _process_custom_bands(df_extra: pd.DataFrame, custom_bands) -> pd.DataFrame:
+    """
+    Process custom bands according to user specifications.
+
+    Parameters
+    ----------
+    df_extra : pd.DataFrame
+        DataFrame with extra columns
+    custom_bands : list or dict
+        Custom band specifications
+
+    Returns
+    -------
+    pd.DataFrame
+        Processed DataFrame with custom bands
+    """
+    if isinstance(custom_bands, list):
+        # Just preserve specified columns as-is
+        custom_band_cols = [col for col in custom_bands if col in df_extra.columns]
+        if custom_band_cols:
+            logger.info(f"Preserving custom bands as-is: {custom_band_cols}")
+            return df_extra[custom_band_cols]
+        else:
+            logger.warning(
+                f"None of the specified custom bands {custom_bands} found in DataFrame"
+            )
+            return df_extra
+
+    elif isinstance(custom_bands, dict):
+        # Apply type conversions
+        result_df = df_extra.copy()
+
+        for band_name, target_type in custom_bands.items():
+            if band_name in result_df.columns:
+                try:
+                    if target_type == "float64":
+                        result_df[band_name] = pd.to_numeric(
+                            result_df[band_name], errors="coerce"
+                        ).astype("float64")
+                    elif target_type == "float32":
+                        result_df[band_name] = pd.to_numeric(
+                            result_df[band_name], errors="coerce"
+                        ).astype("float32")
+                    elif target_type == "int64":
+                        result_df[band_name] = pd.to_numeric(
+                            result_df[band_name], errors="coerce"
+                        ).astype(
+                            "Int64"
+                        )  # Nullable int
+                    elif target_type == "string":
+                        result_df[band_name] = result_df[band_name].astype("string")
+                    elif target_type == "bool":
+                        result_df[band_name] = result_df[band_name].astype("bool")
+
+                    logger.info(f"Converted {band_name} to {target_type}")
+
+                except Exception as e:
+                    logger.warning(
+                        f"Failed to convert {band_name} to {target_type}: {e}"
+                    )
+            else:
+                logger.warning(f"Custom band {band_name} not found in DataFrame")
+
+        return result_df
+
+    else:
+        # Unknown format, just return as-is
+        logger.warning(
+            f"Unknown custom_bands format: {type(custom_bands)}. Preserving all extra columns as-is."
+        )
+        return df_extra
+
+
+# Fix the duplicate logging issue
+def log_missing_columns(df_stats: pd.DataFrame, template_schema: pa.DataFrameSchema):
+    # Remove the duplicate logger creation line
+    # logger = setup_logger(__name__)  # DELETE THIS LINE
+
+    # Use the existing module-level logger (line 18: logger = StdoutLogger(__name__))
+
+    # Extract the expected columns from the DataFrameSchema
+    template_columns = list(template_schema.columns.keys())
+    df_stats_columns = df_stats.columns.tolist()
+
+    # Find missing and extra columns
+    missing_in_df = [col for col in template_columns if col not in df_stats_columns]
+    extra_in_df = [col for col in df_stats_columns if col not in template_columns]
+
+    # Log missing schema columns
+    if missing_in_df:
+        logger.warning(f"Missing expected schema columns: {missing_in_df}")
+    else:
+        logger.info("All expected schema columns found in DataFrame.")
+
+    # Log extra columns (will be preserved)
+    if extra_in_df:
+        logger.info(f"Extra columns found (will be preserved): {extra_in_df}")
+    else:
+        logger.info("No extra columns found in DataFrame.")