openforis-whisp 2.0.0b2__py3-none-any.whl → 3.0.0a1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- openforis_whisp/__init__.py +35 -4
- openforis_whisp/advanced_stats.py +2070 -0
- openforis_whisp/data_checks.py +642 -0
- openforis_whisp/data_conversion.py +86 -44
- openforis_whisp/datasets.py +298 -225
- openforis_whisp/logger.py +26 -0
- openforis_whisp/parameters/__init__.py +0 -0
- openforis_whisp/parameters/lookup_gaul1_admin.py +18663 -0
- openforis_whisp/reformat.py +198 -2
- openforis_whisp/stats.py +488 -68
- {openforis_whisp-2.0.0b2.dist-info → openforis_whisp-3.0.0a1.dist-info}/METADATA +1 -1
- openforis_whisp-3.0.0a1.dist-info/RECORD +20 -0
- openforis_whisp-2.0.0b2.dist-info/RECORD +0 -16
- {openforis_whisp-2.0.0b2.dist-info → openforis_whisp-3.0.0a1.dist-info}/LICENSE +0 -0
- {openforis_whisp-2.0.0b2.dist-info → openforis_whisp-3.0.0a1.dist-info}/WHEEL +0 -0
openforis_whisp/reformat.py
CHANGED
|
@@ -81,7 +81,10 @@ def load_schema_if_any_file_changed(file_paths=None, national_codes=None):
|
|
|
81
81
|
or load_schema_if_any_file_changed._last_cache_key != current_cache_key
|
|
82
82
|
):
|
|
83
83
|
|
|
84
|
-
|
|
84
|
+
# Suppress verbose output
|
|
85
|
+
if logger.level <= logging.INFO:
|
|
86
|
+
logger.debug(f"Creating schema for national_codes: {national_codes}")
|
|
87
|
+
# else: suppress entirely
|
|
85
88
|
|
|
86
89
|
# Load and combine lookup files
|
|
87
90
|
combined_lookup_df = append_csvs_to_dataframe(file_paths)
|
|
@@ -102,7 +105,10 @@ def load_schema_if_any_file_changed(file_paths=None, national_codes=None):
|
|
|
102
105
|
|
|
103
106
|
return schema
|
|
104
107
|
else:
|
|
105
|
-
|
|
108
|
+
# Suppress verbose output
|
|
109
|
+
if logger.level <= logging.INFO:
|
|
110
|
+
logger.debug(f"Using cached schema for national_codes: {national_codes}")
|
|
111
|
+
# else: suppress entirely
|
|
106
112
|
return load_schema_if_any_file_changed._cached_schema
|
|
107
113
|
|
|
108
114
|
|
|
@@ -694,3 +700,193 @@ def log_missing_columns(df_stats: pd.DataFrame, template_schema: pa.DataFrameSch
|
|
|
694
700
|
logger.info(f"Extra columns found (will be preserved): {extra_in_df}")
|
|
695
701
|
else:
|
|
696
702
|
logger.info("No extra columns found in DataFrame.")
|
|
703
|
+
|
|
704
|
+
|
|
705
|
+
def format_stats_dataframe(
    df,
    area_col="Area_sum",
    decimal_places=2,
    unit_type="ha",
    stats_unit_type_column="Unit",
    strip_suffix="_sum",
    remove_columns=True,
    remove_columns_suffix="_median",
    convert_water_flag=True,
    water_flag_column="In_waterbody_sum",
    water_flag_threshold=0.5,
    sort_column="plotId",
):
    """Format raw per-plot stats columns into hectares or percent.

    Steps, applied to a copy of ``df`` (the caller's frame is never mutated):

    - Drops columns ending with ``remove_columns_suffix`` if ``remove_columns``.
    - Converts every column ending with ``strip_suffix`` (except ``area_col``
      and columns whose name contains ``admin_code`` or ``water_flag``) and
      stores the result under the name with the suffix removed
      ('Cocoa_sum' -> 'Cocoa').
    - Divides ``area_col`` by 10000 (raw sums are assumed to be square metres,
      as in the original conversion) and renames it without the suffix. This
      happens whether or not any stat columns exist.
    - Removes all remaining raw columns ending with ``strip_suffix``.
    - Fills ``stats_unit_type_column`` with ``unit_type`` for every row.
    - Optionally writes a boolean water flag: True where
      ``raw water sum / raw area`` exceeds ``water_flag_threshold``.

    The water ratio is computed from the *raw* values before any unit
    conversion or rounding, so it is independent of ``unit_type``. The flag is
    stored under ``water_flag_column`` with ``strip_suffix`` removed (e.g.
    'In_waterbody'), replacing the numeric converted column of the same name.

    Parameters
    ----------
    df : pd.DataFrame
        Input DataFrame with stats columns
    area_col : str
        Name of area column (default 'Area_sum')
    decimal_places : int
        Decimal places for rounding (default 2)
    unit_type : str
        'ha' or 'percent' (default 'ha'); any other value copies the raw
        sums, rounded
    stats_unit_type_column : str
        Column name for unit type (default 'Unit')
    strip_suffix : str
        Suffix identifying stat columns, stripped from their names
        (default '_sum'). If empty or None, no column is treated as a stat
        column and nothing is renamed.
    remove_columns : bool
        Whether to remove columns with remove_columns_suffix (default True)
    remove_columns_suffix : str
        Suffix for columns to remove (default '_median')
    convert_water_flag : bool
        Whether to convert water flag to boolean (default True)
    water_flag_column : str
        Name of water flag column (default 'In_waterbody_sum'). The raw water
        stat is looked up under this name, then this name + ``strip_suffix``,
        then 'water' + ``strip_suffix``; the first one present is used.
    water_flag_threshold : float
        Threshold for water flag ratio (default 0.5)
    sort_column : str
        Column to sort by, or None to skip sorting (default "plotId")

    Returns
    -------
    pd.DataFrame
        Formatted DataFrame with converted values and updated column names

    Raises
    ------
    KeyError
        If ``area_col`` is missing but is needed, i.e. for percent conversion
        or for the water flag.
    """
    sq_m_per_ha = 10000
    nan = float("nan")
    # Intermediate admin/context columns are dropped outright, never converted.
    columns_to_exclude_from_stripping = ("admin_code", "water_flag")

    def _ends_with(name, suffix):
        # An empty/None suffix matches nothing (str.endswith("") is always True).
        return bool(suffix) and isinstance(name, str) and name.endswith(suffix)

    def _strip(name):
        if _ends_with(name, strip_suffix):
            return name[: -len(strip_suffix)]
        return name

    # 1) Work on a copy so the caller's DataFrame is left untouched
    out = df.copy()

    # 2) Optionally drop median (or other) columns
    if remove_columns and remove_columns_suffix:
        logger.debug(f"Dropping columns ending with '{remove_columns_suffix}'")
        cols_to_drop = [
            c for c in out.columns if _ends_with(c, remove_columns_suffix)
        ]
        logger.debug(f"Columns before drop: {cols_to_drop}")
        logger.debug(
            f"Columns ending with '{remove_columns_suffix}': {cols_to_drop}"
        )
        keep_mask = [not _ends_with(c, remove_columns_suffix) for c in out.columns]
        out = out.loc[:, keep_mask]
        logger.debug(
            f"After dropping '{remove_columns_suffix}': {out.columns.tolist()}"
        )
        logger.debug(
            f"Columns after drop: {[c for c in out.columns if _ends_with(c, remove_columns_suffix)]}"
        )

    # 3) Compute the water flag NOW, from raw values. Doing it after the
    #    conversion (as before) never found the column, because every raw
    #    suffixed column is removed in step 6, and mixed units when it did.
    water_flag = None
    if convert_water_flag:
        candidates = [water_flag_column]
        if strip_suffix:
            candidates.append(water_flag_column + strip_suffix)
        # also check the generic 'water' stat
        candidates.append("water" + (strip_suffix or ""))
        water_col = next((c for c in candidates if c in out.columns), None)
        if water_col is not None:
            # zero area -> NaN ratio -> comparison is False (no flag)
            total_area = out[area_col].replace(0, nan)
            ratio = out[water_col] / total_area
            water_flag = (ratio > water_flag_threshold).astype(bool)

    # 4) Collect stat columns to convert
    stat_cols = [
        c
        for c in out.columns
        if _ends_with(c, strip_suffix)
        and c != area_col
        and not any(exc in c for exc in columns_to_exclude_from_stripping)
    ]
    logger.debug(
        f"Stat columns after excluding intermediate admin/context columns: {stat_cols}"
    )

    # 5) Build all converted columns first, then concat once (avoids fragmentation)
    converted = {}
    if stat_cols:
        if unit_type == "ha":
            for col in stat_cols:
                converted[_strip(col)] = (out[col] / sq_m_per_ha).round(
                    decimal_places
                )
        elif unit_type == "percent":
            # percent is relative to the raw (unconverted) area
            area_nonzero = out[area_col].replace(0, nan)
            for col in stat_cols:
                converted[_strip(col)] = ((out[col] / area_nonzero) * 100).round(
                    decimal_places
                )
        else:
            # unknown unit type: just copy the raw sums
            for col in stat_cols:
                converted[_strip(col)] = out[col].round(decimal_places)
    converted_stats_df = pd.DataFrame(converted, index=out.index)

    # 6) Remove the raw suffixed columns (but keep area_col)
    keep_mask = [
        not (_ends_with(c, strip_suffix) and c != area_col) for c in out.columns
    ]
    out = out.loc[:, keep_mask].copy()

    # 7) The area is always reported in hectares, even when there were no
    #    stat columns to convert.
    if area_col in out.columns:
        out[area_col] = (out[area_col] / sq_m_per_ha).round(decimal_places)

    # 8) Attach converted stats. Test the column count, not `.empty`, so a
    #    zero-row input still gets its output columns.
    if len(converted_stats_df.columns) > 0:
        out = pd.concat([out, converted_stats_df], axis=1)

    # 9) Fill stats unit type column
    out[stats_unit_type_column] = unit_type

    # 10) Store the boolean water flag under the suffix-stripped name
    if water_flag is not None:
        out[_strip(water_flag_column)] = water_flag.to_numpy()

    # 11) Rename area_col by stripping the suffix
    out = out.rename(columns={area_col: _strip(area_col)})

    # 12) Sort by sort_column if requested and present
    if sort_column is not None and sort_column in out.columns:
        out = out.sort_values(sort_column).reset_index(drop=True)

    # 13) Defragment final DataFrame and return
    return out.copy()
|