openforis-whisp 2.0.0b2__py3-none-any.whl → 3.0.0a1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -81,7 +81,10 @@ def load_schema_if_any_file_changed(file_paths=None, national_codes=None):
81
81
  or load_schema_if_any_file_changed._last_cache_key != current_cache_key
82
82
  ):
83
83
 
84
- print(f"Creating schema for national_codes: {national_codes}")
84
+ # Suppress verbose output
85
+ if logger.level <= logging.INFO:
86
+ logger.debug(f"Creating schema for national_codes: {national_codes}")
87
+ # else: suppress entirely
85
88
 
86
89
  # Load and combine lookup files
87
90
  combined_lookup_df = append_csvs_to_dataframe(file_paths)
@@ -102,7 +105,10 @@ def load_schema_if_any_file_changed(file_paths=None, national_codes=None):
102
105
 
103
106
  return schema
104
107
  else:
105
- print(f"Using cached schema for national_codes: {national_codes}")
108
+ # Suppress verbose output
109
+ if logger.level <= logging.INFO:
110
+ logger.debug(f"Using cached schema for national_codes: {national_codes}")
111
+ # else: suppress entirely
106
112
  return load_schema_if_any_file_changed._cached_schema
107
113
 
108
114
 
@@ -694,3 +700,193 @@ def log_missing_columns(df_stats: pd.DataFrame, template_schema: pa.DataFrameSch
694
700
  logger.info(f"Extra columns found (will be preserved): {extra_in_df}")
695
701
  else:
696
702
  logger.info("No extra columns found in DataFrame.")
703
+
704
+
705
def format_stats_dataframe(
    df,
    area_col="Area_sum",
    decimal_places=2,
    unit_type="ha",
    stats_unit_type_column="Unit",
    strip_suffix="_sum",
    remove_columns=True,
    remove_columns_suffix="_median",
    convert_water_flag=True,
    water_flag_column="In_waterbody_sum",
    water_flag_threshold=0.5,
    sort_column="plotId",
):
    """Format raw per-plot stats columns into a presentation-ready DataFrame.

    Processing order (the input frame is never modified):

    1. Drop columns ending with ``remove_columns_suffix`` (if ``remove_columns``).
    2. Compute the water flag from the *raw* water and area values, i.e. before
       any unit conversion, so both sides of the ratio share the same units.
    3. Convert every column ending with ``strip_suffix`` (except ``area_col``
       and columns whose name contains ``"admin_code"`` or ``"water_flag"``)
       to hectares (value / 10000) or to percent of ``area_col``; any other
       ``unit_type`` just rounds the raw values. Converted columns lose the
       suffix (``'Cocoa_sum'`` -> ``'Cocoa'``).
    4. Convert ``area_col`` itself to hectares (value / 10000). This happens
       whenever the column exists, regardless of ``unit_type`` and of whether
       any stat columns were found.
    5. Drop all original suffixed columns (excluded ones are dropped too) and
       append the converted columns in a single concat.
    6. Fill ``stats_unit_type_column`` with ``unit_type`` for every row.
    7. Store the boolean water flag under ``water_flag_column`` with
       ``strip_suffix`` removed (``'In_waterbody_sum'`` -> ``'In_waterbody'``),
       replacing the converted numeric column of the same name if present.
    8. Rename ``area_col`` by stripping ``strip_suffix`` and sort by
       ``sort_column`` when that column exists.

    NOTE(review): the division by 10000 assumes the raw sums are in square
    metres -- confirm against the producer of these columns.

    Parameters
    ----------
    df : pd.DataFrame
        Input DataFrame with stats columns.
    area_col : str
        Name of the area column (default 'Area_sum').
    decimal_places : int
        Decimal places for rounding (default 2).
    unit_type : str
        'ha' or 'percent' (default 'ha'); other values keep the raw sums.
    stats_unit_type_column : str
        Column name that receives ``unit_type`` (default 'Unit').
    strip_suffix : str or None
        Suffix identifying stat columns, stripped from their names
        (default '_sum'). ``None`` or ``''`` disables stat conversion and
        renaming; only the area scaling, unit column, water flag and sorting
        are applied.
    remove_columns : bool
        Whether to remove columns with ``remove_columns_suffix`` (default True).
    remove_columns_suffix : str
        Suffix of columns to remove (default '_median').
    convert_water_flag : bool
        Whether to derive the boolean water flag (default True).
    water_flag_column : str
        Name of the water stat column (default 'In_waterbody_sum'). The first
        existing column among ``water_flag_column``,
        ``water_flag_column + strip_suffix`` and ``'water' + strip_suffix`` is
        used as the source; if none exists the flag step is skipped.
    water_flag_threshold : float
        The flag is True where water / area is strictly greater than this
        value (default 0.5). Rows with zero or missing area yield False.
    sort_column : str or None
        Column to sort by, or None to skip sorting (default "plotId").

    Returns
    -------
    pd.DataFrame
        New DataFrame with converted values and updated column names.

    Raises
    ------
    KeyError
        If ``area_col`` is missing while it is needed (percent conversion or
        water flag computation).
    """
    # None and "" both mean "no suffix handling"; normalising once keeps every
    # later check consistent (slicing with -len("") would yield an empty name).
    suffix = strip_suffix or ""

    def _strip(name):
        """Return *name* without the trailing stat suffix, if it has one."""
        if suffix and name.endswith(suffix):
            return name[: -len(suffix)]
        return name

    def _is_stat(name):
        """Tell whether *name* is a suffixed stat column (never the area column)."""
        return (
            bool(suffix)
            and isinstance(name, str)
            and name != area_col
            and name.endswith(suffix)
        )

    def _water_flag(frame):
        """Return the boolean water flag Series, or None if no water column exists."""
        candidates = [water_flag_column]
        if suffix:
            candidates.append(water_flag_column + suffix)
        candidates.append("water" + suffix)
        source = next((c for c in candidates if c in frame.columns), None)
        if source is None:
            return None
        # Zero area becomes NaN so the ratio is NaN (-> False) instead of inf.
        total_area = frame[area_col].replace(0, float("nan"))
        return ((frame[source] / total_area) > water_flag_threshold).astype(bool)

    # 1) Work on a copy so the caller's frame is never mutated.
    out = df.copy()

    # 2) Optionally drop median (or other) columns.
    if remove_columns and remove_columns_suffix:
        cols_to_drop = [
            c
            for c in out.columns
            if isinstance(c, str) and c.endswith(remove_columns_suffix)
        ]
        logger.debug(f"Dropping columns ending with '{remove_columns_suffix}'")
        logger.debug(f"Columns before drop: {cols_to_drop}")
        logger.debug(f"Columns ending with '{remove_columns_suffix}': {cols_to_drop}")
        out = out.drop(columns=cols_to_drop)
        logger.debug(
            f"After dropping '{remove_columns_suffix}': {out.columns.tolist()}"
        )
        logger.debug(
            f"Columns after drop: {[c for c in out.columns if isinstance(c, str) and c.endswith(remove_columns_suffix)]}"
        )

    # 3) Compute the water flag now, while the water and area columns still
    #    hold raw values in the same units and the source column still exists.
    water_flag = _water_flag(out) if convert_water_flag else None

    # 4) Collect stat columns to convert. Intermediate admin/context columns
    #    are not converted; they are simply dropped with the other suffixed
    #    columns in step 7.
    columns_to_exclude_from_stripping = ("admin_code", "water_flag")
    stat_cols = [
        c
        for c in out.columns
        if _is_stat(c)
        and not any(exc in c for exc in columns_to_exclude_from_stripping)
    ]
    logger.debug(
        f"Stat columns after excluding intermediate admin/context columns: {stat_cols}"
    )

    # 5) Build the converted columns in a dict so they can be attached with a
    #    single concat (avoids fragmenting the frame column by column).
    converted = {}
    if stat_cols:
        if unit_type == "percent":
            # Zero area -> NaN so percentages are NaN rather than inf.
            area = out[area_col].replace(0, float("nan"))
        for col in stat_cols:
            if unit_type == "ha":
                values = out[col] / 10000
            elif unit_type == "percent":
                values = (out[col] / area) * 100
            else:
                # Unknown unit type: keep the raw sums.
                values = out[col]
            converted[_strip(col)] = values.round(decimal_places)

    # 6) Scale the area column itself exactly once, independent of whether any
    #    stat columns were found.
    if area_col in out.columns:
        out[area_col] = (out[area_col] / 10000).round(decimal_places)

    # 7) Remove the original suffixed columns (area_col is kept) and attach the
    #    converted ones. Testing the dict rather than DataFrame.empty keeps the
    #    converted columns for zero-row inputs as well.
    out = out.drop(columns=[c for c in out.columns if _is_stat(c)])
    if converted:
        out = pd.concat([out, pd.DataFrame(converted, index=out.index)], axis=1)

    # 8) Fill stats unit type column.
    out[stats_unit_type_column] = unit_type

    # 9) Store the boolean water flag under the suffix-stripped name; this
    #    replaces the numeric converted column of the same name, if any.
    if water_flag is not None:
        out[_strip(water_flag_column)] = water_flag

    # 10) Rename the area column by stripping the suffix.
    out = out.rename(columns={area_col: _strip(area_col)})

    # 11) Reorder by the sort column if present.
    if sort_column is not None and sort_column in out.columns:
        out = out.sort_values(sort_column).reset_index(drop=True)

    # 12) Return a defragmented copy.
    return out.copy()