geocif 0.1.46__tar.gz → 0.1.47__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {geocif-0.1.46/geocif.egg-info → geocif-0.1.47}/PKG-INFO +1 -1
- {geocif-0.1.46 → geocif-0.1.47}/geocif/analysis.py +7 -5
- {geocif-0.1.46 → geocif-0.1.47}/geocif/experiments.py +3 -9
- {geocif-0.1.46 → geocif-0.1.47}/geocif/geocif.py +204 -42
- {geocif-0.1.46 → geocif-0.1.47}/geocif/indices_runner.py +2 -2
- {geocif-0.1.46 → geocif-0.1.47}/geocif/indices_runner_v2.py +2 -2
- {geocif-0.1.46 → geocif-0.1.47}/geocif/ml/correlations.py +3 -3
- geocif-0.1.47/geocif/ml/misc.py +33 -0
- {geocif-0.1.46 → geocif-0.1.47}/geocif/ml/output.py +0 -2
- {geocif-0.1.46 → geocif-0.1.47}/geocif/ml/stages.py +18 -9
- {geocif-0.1.46 → geocif-0.1.47}/geocif/ml/trainers.py +22 -0
- {geocif-0.1.46 → geocif-0.1.47/geocif.egg-info}/PKG-INFO +1 -1
- {geocif-0.1.46 → geocif-0.1.47}/geocif.egg-info/SOURCES.txt +1 -1
- {geocif-0.1.46 → geocif-0.1.47}/setup.py +1 -1
- geocif-0.1.46/geocif/ml/correlations_backup.py +0 -412
- {geocif-0.1.46 → geocif-0.1.47}/LICENSE +0 -0
- {geocif-0.1.46 → geocif-0.1.47}/MANIFEST.in +0 -0
- {geocif-0.1.46 → geocif-0.1.47}/README.md +0 -0
- {geocif-0.1.46 → geocif-0.1.47}/geocif/__init__.py +0 -0
- {geocif-0.1.46 → geocif-0.1.47}/geocif/agmet/__init__.py +0 -0
- {geocif-0.1.46 → geocif-0.1.47}/geocif/agmet/geoagmet.py +0 -0
- {geocif-0.1.46 → geocif-0.1.47}/geocif/agmet/plot.py +0 -0
- {geocif-0.1.46 → geocif-0.1.47}/geocif/agmet/utils.py +0 -0
- {geocif-0.1.46 → geocif-0.1.47}/geocif/backup/__init__.py +0 -0
- {geocif-0.1.46 → geocif-0.1.47}/geocif/backup/constants.py +0 -0
- {geocif-0.1.46 → geocif-0.1.47}/geocif/backup/features.py +0 -0
- {geocif-0.1.46 → geocif-0.1.47}/geocif/backup/geo.py +0 -0
- {geocif-0.1.46 → geocif-0.1.47}/geocif/backup/geocif.py +0 -0
- {geocif-0.1.46 → geocif-0.1.47}/geocif/backup/metadata.py +0 -0
- {geocif-0.1.46 → geocif-0.1.47}/geocif/backup/models.py +0 -0
- {geocif-0.1.46 → geocif-0.1.47}/geocif/cei/__init__.py +0 -0
- {geocif-0.1.46 → geocif-0.1.47}/geocif/cei/definitions.py +0 -0
- {geocif-0.1.46 → geocif-0.1.47}/geocif/cei/indices.py +0 -0
- {geocif-0.1.46 → geocif-0.1.47}/geocif/logger.py +0 -0
- {geocif-0.1.46 → geocif-0.1.47}/geocif/ml/__init__.py +0 -0
- {geocif-0.1.46 → geocif-0.1.47}/geocif/ml/embedding.py +0 -0
- {geocif-0.1.46 → geocif-0.1.47}/geocif/ml/feature_engineering.py +0 -0
- {geocif-0.1.46 → geocif-0.1.47}/geocif/ml/feature_selection.py +0 -0
- {geocif-0.1.46 → geocif-0.1.47}/geocif/ml/outliers.py +0 -0
- {geocif-0.1.46 → geocif-0.1.47}/geocif/ml/outlook.py +0 -0
- {geocif-0.1.46 → geocif-0.1.47}/geocif/ml/spatial_autocorrelation.py +0 -0
- {geocif-0.1.46 → geocif-0.1.47}/geocif/ml/stats.py +0 -0
- {geocif-0.1.46 → geocif-0.1.47}/geocif/ml/trend.py +0 -0
- {geocif-0.1.46 → geocif-0.1.47}/geocif/ml/xai.py +0 -0
- {geocif-0.1.46 → geocif-0.1.47}/geocif/playground/__init__.py +0 -0
- {geocif-0.1.46 → geocif-0.1.47}/geocif/playground/automl.py +0 -0
- {geocif-0.1.46 → geocif-0.1.47}/geocif/playground/misc.py +0 -0
- {geocif-0.1.46 → geocif-0.1.47}/geocif/utils.py +0 -0
- {geocif-0.1.46 → geocif-0.1.47}/geocif/viz/__init__.py +0 -0
- {geocif-0.1.46 → geocif-0.1.47}/geocif/viz/plot.py +0 -0
- {geocif-0.1.46 → geocif-0.1.47}/geocif.egg-info/dependency_links.txt +0 -0
- {geocif-0.1.46 → geocif-0.1.47}/geocif.egg-info/not-zip-safe +0 -0
- {geocif-0.1.46 → geocif-0.1.47}/geocif.egg-info/top_level.txt +0 -0
- {geocif-0.1.46 → geocif-0.1.47}/requirements.txt +0 -0
- {geocif-0.1.46 → geocif-0.1.47}/setup.cfg +0 -0
- {geocif-0.1.46 → geocif-0.1.47}/tests/test_geocif.py +0 -0
--- geocif-0.1.46/geocif/analysis.py
+++ geocif-0.1.47/geocif/analysis.py
@@ -162,8 +162,8 @@ class Geoanalysis:
             return pd.DataFrame(), pd.DataFrame()
 
         df_metrics = self._compute_metrics(df)
-
-
+        df_metrics = self._process_metrics(df_metrics)
+        self._plot_metrics(df_metrics)
 
         df_regional_metrics_by_year = self._compute_regional_metrics(
             df, by="Harvest Year"
@@ -172,8 +172,10 @@ class Geoanalysis:
             df_regional_metrics_by_year
         )
         df_regional_metrics = self._average_mape(df_regional_metrics_by_year)
-
-        self._store_results(
+
+        self._store_results(
+            df_metrics, df_regional_metrics, df_regional_metrics_by_year
+        )
 
         df_national_yield = self._compute_national_yield(df)
         self._plot_national_yield(df_national_yield)
@@ -193,7 +195,7 @@ class Geoanalysis:
             .apply(self.annual_metrics)
             .reset_index()
         )
-
+
         return df_metrics.pivot_table(
             index=["Country", "Model", "Harvest Year", "Stage Name", "Stage Range"],
             columns="level_5",
--- geocif-0.1.46/geocif/experiments.py
+++ geocif-0.1.47/geocif/experiments.py
@@ -85,9 +85,7 @@ def run(path_config_files=[Path("../config/geocif.txt")]):
 
     # Experiment: lag_years
     logger.info("Experiment 3: lag_years")
-    parser = main(
-        inputs, logger, parser, "ML", "lag_years", "int", [1, 2, 3, 4, 5]
-    )
+    parser = main(inputs, logger, parser, "ML", "lag_years", "int", [1, 2, 3, 4, 5])
 
     # Experiment: lag_yield_as_feature
     logger.info("Experiment 4: lag_yield_as_feature")
@@ -103,9 +101,7 @@ def run(path_config_files=[Path("../config/geocif.txt")]):
 
     # Experiment: median_years
     logger.info("Experiment 5: median_years")
-    parser = main(
-        inputs, logger, parser, "ML", "median_years", "int", [2, 3, 4, 5]
-    )
+    parser = main(inputs, logger, parser, "ML", "median_years", "int", [2, 3, 4, 5])
 
     # Experiment: median_yield_as_feature
     logger.info("Experiment 6: median_yield_as_feature")
@@ -133,9 +129,7 @@ def run(path_config_files=[Path("../config/geocif.txt")]):
 
     # Experiment: optimize
     logger.info("Experiment 8: optimize")
-    parser = main(
-        inputs, logger, parser, "DEFAULT", "optimize", "bool", [True, False]
-    )
+    parser = main(inputs, logger, parser, "DEFAULT", "optimize", "bool", [True, False])
 
 
 if __name__ == "__main__":
--- geocif-0.1.46/geocif/geocif.py
+++ geocif-0.1.47/geocif/geocif.py
@@ -108,7 +108,6 @@ class Geocif:
         Config file: ML
         ====================================================================
         """
-        self.use_ceis = ast.literal_eval(self.parser.get("ML", "use_ceis"))
         self.model_type = self.parser.get("ML", "model_type")
         self.fraction_simulate = self.parser.getint("ML", "fraction_simulate")
         self.analogous_year_yield_as_feature = self.parser.getboolean(
@@ -117,10 +116,10 @@ class Geocif:
         self.plot_map_for_correlation_plot = self.parser.getboolean(
             "ML", "plot_map_for_correlation_plot"
         )
-        self.correlation_threshold = self.parser.getfloat(
-            "ML", "correlation_threshold"
+        self.correlation_threshold = self.parser.getfloat("ML", "correlation_threshold")
+        self.include_lat_lon_as_feature = self.parser.getboolean(
+            "ML", "include_lat_lon_as_feature"
         )
-        self.include_lat_lon = self.parser.getboolean("ML", "include_lat_lon")
         self.spatial_autocorrelation = self.parser.getboolean(
             "ML", "spatial_autocorrelation"
         )
@@ -153,6 +152,9 @@ class Geocif:
             self.parser.get("ML", "cat_features")
         )
 
+        self.use_cumulative_features = self.parser.getboolean(
+            "DEFAULT", "use_cumulative_features"
+        )
         """
         ====================================================================
         Variables, Paths
@@ -198,6 +200,9 @@ class Geocif:
 
         self.db_path = self.dir_db / self.db_forecasts
 
+        # Store config file in database
+        output.config_to_db(self.db_path, self.parser, self.today)
+
         # self.pickle_file = self.base_dir / self.parser.get("outlook", "pickle_file")
         # obj_pickle = outlook.Outlook(self.pickle_file)
         # self.df_outlook = obj_pickle.read_outlook_file()
@@ -224,18 +229,29 @@ class Geocif:
         y_train = df_region[target_col]
 
         if self.ml_model:
-            self.
-
-
-
-
+            if self.model_name in ["cumulative_1", "cumulative_2", "cumulative_3"]:
+                all_features = X_train.columns
+
+                # Select the columns with use_ceis in it
+                self.selected_features = [
+                    column
+                    for column in all_features
+                    if any(cei in column for cei in self.use_ceis)
+                ]
+            else:
+                self.logger.info(f"Selecting features for {self.country} {self.crop}")
+                selector, _, self.selected_features = fs.select_features(
+                    X_train, y_train, method=self.feature_selection
+                )
+                self.logger.info(f"Selected features: {self.selected_features}")
 
         """ Update model to include conformal estimates """
-        if "lat" not in self.selected_features and self.include_lat_lon:
+        if "lat" not in self.selected_features and self.include_lat_lon_as_feature:
             self.selected_features.append("lat")
-        if "lon" not in self.selected_features and self.include_lat_lon:
+        if "lon" not in self.selected_features and self.include_lat_lon_as_feature:
             self.selected_features.append("lon")
         X_train = df_region[self.selected_features + self.cat_features]
+
         dir_output = (
             self.dir_analysis
             / self.country
@@ -312,8 +328,38 @@ class Geocif:
                 self.best_hyperparams = {}
             elif self.model_name in ["cubist"]:
                 self.model.fit(X_train, y_train)
-
-
+            elif self.model_name in [
+                "cumulative_1",
+                "cumulative_2",
+                "cumulative_3",
+            ]:
+                from sklearn.preprocessing import StandardScaler, LabelEncoder
+
+                # Standardize the numeric features
+                scaler = StandardScaler()
+                X_numeric = X_train.iloc[:, :3]
+                X_scaled_numeric = pd.DataFrame(
+                    scaler.fit_transform(X_numeric),
+                    columns=X_numeric.columns,
+                    index=X_train.index,
+                )
+
+                # Encode the Region as categorical
+                le = LabelEncoder()
+                X_region = pd.Series(
+                    le.fit_transform(X_train["Region"]),
+                    name="Region",
+                    index=X_train.index,
+                )
+
+                # Combine scaled numeric features and encoded region
+                X_train_scaled = pd.concat([X_scaled_numeric, X_region], axis=1)
+
+                self.model.fit(X_train_scaled, y_train)
+        except Exception as e:
+            self.logger.error(
+                f"Error fitting model for {self.country} {self.crop} {e}"
+            )
 
     def predict(self, df_region, scaler=None):
         """
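Note: the new `cumulative_*` branch in `fit` standardizes the first three columns and label-encodes `Region` before fitting. Below is a minimal standalone sketch of that preprocessing; the feature names `feat_a`/`feat_b`/`feat_c` are illustrative only, not taken from the package.

```python
# Sketch of the preprocessing in the new cumulative_* fit branch:
# scale the numeric columns, label-encode Region, then recombine.
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder

X_train = pd.DataFrame(
    {
        "feat_a": [1.0, 2.0, 3.0, 4.0],   # illustrative numeric feature
        "feat_b": [10.0, 20.0, 30.0, 40.0],
        "feat_c": [0.1, 0.2, 0.3, 0.4],
        "Region": ["north", "south", "north", "east"],
    }
)

scaler = StandardScaler()
X_numeric = X_train.iloc[:, :3]  # first three columns are the numeric features
X_scaled_numeric = pd.DataFrame(
    scaler.fit_transform(X_numeric),
    columns=X_numeric.columns,
    index=X_train.index,
)

le = LabelEncoder()
X_region = pd.Series(
    le.fit_transform(X_train["Region"]), name="Region", index=X_train.index
)

X_train_scaled = pd.concat([X_scaled_numeric, X_region], axis=1)
print(X_train_scaled)
```

Worth noting: the matching `predict` branch below fits a fresh `StandardScaler` and `LabelEncoder` on the test rows rather than reusing the ones fit during training, so train and test features end up standardized against different statistics.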
@@ -360,6 +406,33 @@ class Geocif:
                 X_test, Z_test, clusters_test.astype("object")
             )
             best_hyperparameters = self.model.fe_model.get_params().copy()
+        elif self.model_name in ["cumulative_1", "cumulative_2", "cumulative_3"]:
+            from sklearn.preprocessing import StandardScaler, LabelEncoder
+
+            # Standardize the numeric features
+            scaler = StandardScaler()
+            X_numeric = X_test.iloc[:, :3]
+            try:
+                X_scaled_numeric = pd.DataFrame(
+                    scaler.fit_transform(X_numeric),
+                    columns=X_numeric.columns,
+                    index=X_test.index,
+                )
+            except:
+                breakpoint()
+
+            # Encode the Region as categorical
+            le = LabelEncoder()
+            X_region = pd.Series(
+                le.fit_transform(X_test["Region"]),
+                name="Region",
+                index=X_test.index,
+            )
+
+            # Combine scaled numeric features and encoded region
+            X_test_scaled = pd.concat([X_scaled_numeric, X_region], axis=1)
+            y_pred = self.model.predict(X_test_scaled)
+            best_hyperparameters = {}  # self.model.get_params().copy()
         elif self.model_name == "geospaNN":
             import torch
             import geospaNN
@@ -501,7 +574,9 @@ class Geocif:
             "Crop",
             "Harvest Year",
             "Stage Name",
+            "Time",
         ]
+
         df.index = df.apply(
             lambda row: "_".join([str(row[col]) for col in index_columns]), axis=1
         )
@@ -513,28 +588,37 @@ class Geocif:
 
     def create_feature_names(self, stages_features, selected_features):
         """
+        Create feature names for machine learning stages.
 
         Args:
-            stages_features:
-            selected_features:
+            stages_features (list): List of features for different stages.
+            selected_features (dict): Dictionary of selected features.
 
         Returns:
-
+            None
         """
+        # Assert stages_features is a list
+        assert isinstance(stages_features, list), "stages_features should be a list"
+
         # Clear out feature names
         self.feature_names = []
 
-        """
+        """
+        Select stages that will be used for ML
         1. method = "latest" - Select the latest stage
         2. method = "fraction" - Select a fraction (1-100) of all stages
         """
+        method = "fraction"
+        if self.model_name in ["cumulative_1", "cumulative_2", "cumulative_3"]:
+            method = "latest"
+
         stages_features = stages.select_stages_for_ml(
-            stages_features, method=
+            stages_features, method=method, n=60
         )
 
         for stage in stages_features:
             # Convert each element of stage to str and join with _
-            _stage = "_".join(
+            _stage = "_".join(map(str, stage))
 
             # Create a list appending _stage to each element of combined_keys
             _tmp = [f"{col}_{_stage}" for col in self.combined_keys]
@@ -543,17 +627,33 @@ class Geocif:
             parts = _t.split("_")
             cei = parts[0] if parts[1].isdigit() else "_".join(parts[:2])
 
-
-
-
-
-
-
-
-
-
-            self.
-
+            try:
+                if self.model_name in [
+                    "cumulative_1",
+                    "cumulative_2",
+                    "cumulative_3",
+                ]:
+                    dict_fn = stages.get_stage_information_dict(_t, self.method)
+                    tmp_col = f"{dict_fn['CEI']}"
+
+                    if tmp_col in self.df_train.columns:
+                        self.feature_names.append(tmp_col)
+                else:
+                    # Check if any element of dict_selected_features is in _t
+                    if selected_features["CEI"].any():
+                        for x in selected_features["CEI"].values:
+                            if x not in cei:
+                                continue
+
+                            dict_fn = stages.get_stage_information_dict(
+                                _t, self.method
+                            )
+                            tmp_col = f"{dict_fn['CEI']} {dict_fn['Stage Name']}"
+
+                            if tmp_col in self.df_train.columns:
+                                self.feature_names.append(tmp_col)
+            except:
+                breakpoint()
         self.feature_names = list(set(self.feature_names))
 
         if self.median_yield_as_feature:
@@ -565,16 +665,14 @@ class Geocif:
             self.feature_names.append(f"t -{i} {self.target}")
 
         if self.analogous_year_yield_as_feature:
-            self.feature_names.append("Analogous Year")
-            self.feature_names.append("Analogous Year Yield")
+            self.feature_names.extend(["Analogous Year", "Analogous Year Yield"])
 
         if self.use_outlook_as_feature:
             self.feature_names.append("FCST")
 
         # Add lat and lon to feature names
-        if self.include_lat_lon:
-            self.feature_names.append("lat")
-            self.feature_names.append("lon")
+        if self.include_lat_lon_as_feature:
+            self.feature_names.extend(["lat", "lon"])
 
         self.selected_features = []
 
@@ -598,6 +696,8 @@ class Geocif:
         for idx, region in enumerate(pbar):
             if self.model_name in ["linear", "gam"]:
                 self.create_feature_names(stages, dict_best_cei[region][0:3].tolist())
+            elif self.model_name in ["cumulative_1", "cumulative_2", "cumulative_3"]:
+                self.create_feature_names(stages, {})
             elif self.ml_model:
                 self.create_feature_names(stages, dict_selected_features[region])
             elif self.model_name in ["median"]:
@@ -727,11 +827,52 @@ class Geocif:
         parts = all_cei_columns[-1].split("_")
         cei = parts[0] if parts[1].isdigit() else "_".join(parts[:2])
 
-        #
-
-
-
-
+        # For each region, find the column with the longest string in cei_column
+        group_by = ["Region"]
+        groups = df.groupby(group_by)
+        if self.use_cumulative_features:
+            frames = []
+            for name, group in groups:
+                # Drop columns with all NaNs
+                group.dropna(axis=1, how="all", inplace=True)
+
+                cei_column = group[
+                    group.columns[group.columns.str.contains(cei)]
+                ].columns
+                max_cei_col = max(cei_column, key=len)
+                self.stage_info = stages.get_stage_information_dict(
+                    max_cei_col, self.method
+                )
+
+                # Subset dataframes to columns that contain self.stage_info["Stage_ID"]
+                all_columns = group.columns[
+                    group.columns.str.contains(self.stage_info["Stage_ID"])
+                ].tolist()
+
+                group = group[
+                    self.fixed_columns
+                    + [self.target]
+                    + self.statistics_columns
+                    + all_columns
+                ]
+                # rename all_columns to self.stage_info["CEI"]
+                group.rename(
+                    columns={
+                        col: stages.get_stage_information_dict(col, self.method)["CEI"]
+                        for col in all_columns
+                    },
+                    inplace=True,
+                )
+
+                frames.append(group)
+
+            df = pd.concat(frames)
+        else:
+            # HACK: Get feature name with GD4 in it to extract first and last stage id and name
+            cei_column = df[df.columns[df.columns.str.contains(cei)]].columns
+            # Select the longest string in cei_column
+            cei_col = max(cei_column, key=len)
+            self.stage_info = stages.get_stage_information_dict(cei_col, self.method)
 
         # Change column name
         # e.g. 'vDTR_7_6_5_4_3_2_1_37_36_35_34_33_32_31' to 'vDTR Mar 1-Oct 27'
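The per-region loop above collapses each region's stage-suffixed CEI columns down to plain CEI names, keeping only the longest observed stage window per region. A small self-contained sketch of that collapse, with invented column names (only the general shape follows the diff):

```python
# Illustrative collapse of stage-suffixed columns to bare CEI names,
# keeping each region's longest stage window. All names are made up.
import pandas as pd

df = pd.DataFrame(
    {
        "Region": ["north", "north", "south", "south"],
        "Yield": [1.2, 1.4, 2.0, 2.1],
        "AUC_NDVI_13_12_11": [0.5, 0.6, None, None],
        "AUC_NDVI_13_12_11_10": [None, None, 0.7, 0.8],
    }
)

frames = []
for name, group in df.groupby("Region"):
    group = group.dropna(axis=1, how="all")  # drop windows unused by this region
    cei_cols = [c for c in group.columns if c.startswith("AUC_NDVI")]
    longest = max(cei_cols, key=len)  # the longest stage window for this region
    group = group.rename(columns={longest: "AUC_NDVI"})
    frames.append(group[["Region", "Yield", "AUC_NDVI"]])

print(pd.concat(frames))
```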
@@ -795,12 +936,14 @@ class Geocif:
 
         mask = self.df_results["Stage_ID"].isin(_stages)
         df = self.df_results[mask]
-
         """ Select which CEI categories to use for ML """
         if "all" in self.use_ceis:
             pass
         else:
-
+            if self.select_cei_by == "Type":
+                df = df[df["Type"].isin(self.use_ceis)]
+            elif self.select_cei_by == "Index":
+                df = df[df["Index"].isin(self.use_ceis)]
 
         """ Convert this dataframe into an ML ready format and save to disk """
         df = self.create_ml_dataframe(df)
@@ -874,6 +1017,8 @@ class Geocif:
         if self.spatial_autocorrelation:
             sa.compute_spatial_autocorrelation(self.df_results, **dict_kwargs)
 
+        dict_selected_features = {}
+        dict_best_cei = {}
         if self.correlation_plots:
             self.logger.info(f"Correlation plot for {self.country} {self.crop}")
             (
@@ -949,6 +1094,8 @@ class Geocif:
         self.model_name = model
         self.experiment_name = self.parser.get("ML", "experiment_name")
         self.ml_model = self.parser.getboolean(self.model_name, "ML_model")
+        self.select_cei_by = self.parser.get(self.model_name, "select_cei_by")
+        self.use_ceis = ast.literal_eval(self.parser.get(self.model_name, "use_ceis"))
         self.model_names = ast.literal_eval(self.parser.get(self.country, "models"))
         self.optimize = self.parser.getboolean(self.country, "optimize")
         self.fraction_loocv = self.parser.getfloat(self.country, "fraction_loocv")
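The two new reads above mean each model section of the config now carries `select_cei_by` and `use_ceis`. A hypothetical stanza exercising those `parser.get` calls; only the option names come from the diff, the values are invented:

```python
# Hypothetical config matching the new per-model option reads.
import ast
from configparser import ConfigParser

parser = ConfigParser()
parser.read_string(
    """
[cumulative_1]
ML_model = False
select_cei_by = Index
use_ceis = ['AUC_NDVI']
"""
)

print(parser.getboolean("cumulative_1", "ML_model"))        # False
print(parser.get("cumulative_1", "select_cei_by"))          # Index
print(ast.literal_eval(parser.get("cumulative_1", "use_ceis")))  # ['AUC_NDVI']
```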
@@ -960,6 +1107,21 @@ class Geocif:
             self.estimate_ci = False
             self.check_yield_trend = False
             self.estimate_ci_for_all = False
+        elif self.model_name in ["cumulative_1", "cumulative_2", "cumulative_3"]:
+            self.correlation_plots = False
+            self.lag_yield_as_feature = False
+            self.median_yield_as_feature = False
+            self.median_area_as_feature = False
+            self.analogous_year_yield_as_feature = False
+            self.last_year_yield_as_feature = False
+            self.include_lat_lon_as_feature = False
+            self.do_xai = False
+            self.estimate_ci = False
+            self.estimate_ci_for_all = False
+            self.check_yield_trend = False
+            self.cluster_strategy = "single"
+            self.select_cei_by = "Index"
+            self.use_cumulative_features = True
         else:
             self.do_xai = self.parser.getboolean("ML", "do_xai")
             self.estimate_ci = self.parser.getboolean("ML", "estimate_ci")
--- geocif-0.1.46/geocif/indices_runner.py
+++ geocif-0.1.47/geocif/indices_runner.py
@@ -173,8 +173,8 @@ class cei_runner(base.BaseGeo):
             or "south_africa_maize" in i[3]
             or "mozambique_maize" in i[3]
             or "united_states_of_america" in i[3]
-
-
+            or "russian_federation" in i[3]
+            or "ukraine" in i[3]
         ]
         # "malawi" in i[2]]
 
--- geocif-0.1.46/geocif/indices_runner_v2.py
+++ geocif-0.1.47/geocif/indices_runner_v2.py
@@ -47,7 +47,7 @@ class cei_runner(base.BaseGeo):
 
         self.dir_input = Path(self.parser.get("PATHS", "dir_input"))
         self.base_dir = Path(
-            r"D:\Users\ritvik\projects\GEOGLAM\Output\countries\
+            r"D:\Users\ritvik\projects\GEOGLAM\Output\countries\illinois"
         )  # Path(self.parser.get("PATHS", "dir_crop_inputs"))
         self.do_parallel = self.parser.getboolean("DEFAULT", "do_parallel")
 
@@ -164,7 +164,7 @@ class cei_runner(base.BaseGeo):
         # Only keep those entries in combinations where the third elemt is
         # mozambique, south_africa, angola or dem_people's_rep_of_korea
         # This is done to test the code for these countries
-        combinations = [i for i in combinations if "
+        combinations = [i for i in combinations if "illinois_maize_s1" in i[3]]
 
         if True:
             num_cpu = int(cpu_count() * 0.5)
--- geocif-0.1.46/geocif/ml/correlations.py
+++ geocif-0.1.47/geocif/ml/correlations.py
@@ -157,8 +157,8 @@ def plot_feature_corr_by_time(df, **kwargs):
     cbar_ax.tick_params(axis="both", which="major", labelsize=5)
 
     _country = country.title().replace("_", " ")
-    _region_name = region_name
-    _crop =
+    _region_name = region_name if not national_correlation else ""
+    _crop = crop.title().replace("_", " ")
     if not national_correlation:
         fname = f"{country}_{crop}_{id}_corr_feature_by_time.png"
     else:
@@ -304,7 +304,7 @@ def all_correlated_feature_by_time(df, **kwargs):
             )
 
             kwargs["region_id"] = region_id
-            _region_names = "
+            _region_names = ", ".join([str(x) for x in group['Region'].unique()])
             kwargs["region_name"] = _region_names
             plot_feature_corr_by_time(df_tmp, **kwargs)
             # For each element in dict_best_cei, add the type of the cei
--- /dev/null
+++ geocif-0.1.47/geocif/ml/misc.py
@@ -0,0 +1,33 @@
+import pandas as pd
+import matplotlib.pyplot as plt
+import seaborn as sns
+from taipy.gui import Gui
+
+# Load the dataset
+file_path = r'D:\Users\ritvik\projects\GEOGLAM\Output\ml\analysis\July_05_2024\russian_federation\maize\cumulative_1\2010\X_train_1.csv'  # Update with the correct file path
+df = pd.read_csv(file_path)
+print(df.head())
+# Define a function to create the plot
+def plot_auc_ndvi(data):
+    fig, ax = plt.subplots(figsize=(14, 8))
+    sns.lineplot(data=data, x="Harvest Year", y="AUC_NDVI Oct 7-Mar 25", hue="Region", marker="o", ax=ax)
+    ax.set_title("Trends of AUC_NDVI by Region (Oct 7 - Mar 25)")
+    ax.set_xlabel("Harvest Year")
+    ax.set_ylabel("AUC_NDVI Oct 7 - Mar 25")
+    ax.legend(title="Region", bbox_to_anchor=(1.05, 1), loc='upper left')
+    plt.show()
+    return fig
+
+# Create the plot and save it
+plot_fig = plot_auc_ndvi(df)
+
+# Define the Taipy page with the plot
+page = """
+# Trends of AUC_NDVI by Region
+
+<|{plot_fig}|chart|>
+"""
+
+# Create and run the GUI
+gui = Gui(page)
+gui.run()
--- geocif-0.1.46/geocif/ml/output.py
+++ geocif-0.1.47/geocif/ml/output.py
@@ -107,7 +107,6 @@ def store(db_path, experiment_id, df, model, model_name):
     try:
         utils.to_db(db_path, experiment_id, df)
     except Exception as e:
-        breakpoint()
         print(f"Error: {e}")
 
     index_columns = ["Country", "Region", "Crop", "Harvest Year", "Stages"]
@@ -128,7 +127,6 @@ def store(db_path, experiment_id, df, model, model_name):
         df_model.index.set_names(["Index"], inplace=True)
         utils.to_db(db_path, "models", df_model)
     except Exception as e:
-        breakpoint()
         print(f"Error: {e}")
 
     con.commit()
--- geocif-0.1.46/geocif/ml/stages.py
+++ geocif-0.1.47/geocif/ml/stages.py
@@ -1,4 +1,5 @@
 import numpy as np
+from typing import Union
 
 from geocif import utils
 
@@ -277,23 +278,31 @@ def update_feature_names(df, method):
     return df
 
 
-def convert_stage_string(stage_info, to_array=True):
+def convert_stage_string(stage_info: Union[str, np.ndarray], to_array: bool = True) -> Union[np.ndarray, str]:
     """
-
-    output: array([13, 12, 11])
-    or vice versa if to_array = False
+    Converts a string of stage information to a numpy array or vice versa.
 
     Args:
-        stage_info:
-        to_array:
+        stage_info: A string of stages separated by underscores or a numpy array of stages e.g. '13_12_11'
+        to_array: A boolean indicating the direction of conversion. If True, converts string to numpy array e.g. array([13, 12, 11])
+            If False, converts numpy array to string.
 
     Returns:
+        A numpy array of stages if to_array is True, or a string of stages if to_array is False.
 
+    Raises:
+        ValueError: If the input format is incorrect.
     """
     if to_array:
-
-
+        if not isinstance(stage_info, str):
+            raise ValueError("Expected a string for stage_info when to_array is True.")
+        try:
+            stages = np.array([int(stage) for stage in stage_info.split("_")])
+        except ValueError:
+            raise ValueError("Stage info string should contain integers separated by underscores.")
     else:
-
+        if not isinstance(stage_info, np.ndarray):
+            raise ValueError("Expected a numpy array for stage_info when to_array is False.")
+        stages = "_".join(map(str, stage_info))
 
     return stages
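With the tightened type handling above, `convert_stage_string` round-trips between the two representations described in its docstring. A short usage sketch, assuming the geocif package is importable:

```python
# Round-trip through convert_stage_string, mirroring the docstring examples.
import numpy as np
from geocif.ml import stages

arr = stages.convert_stage_string("13_12_11", to_array=True)
print(arr)  # -> array([13, 12, 11])

s = stages.convert_stage_string(np.array([13, 12, 11]), to_array=False)
print(s)  # -> '13_12_11'
```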
--- geocif-0.1.46/geocif/ml/trainers.py
+++ geocif-0.1.47/geocif/ml/trainers.py
@@ -2,6 +2,7 @@ import multiprocessing as mp
 
 import numpy as np
 import optuna
+import pandas as pd
 from catboost import CatBoostRegressor
 from sklearn.metrics import root_mean_squared_error
 from sklearn.model_selection import train_test_split
@@ -289,6 +290,27 @@ def auto_train(
         model = LinearGAM(n_splines=25, spline_order=3).gridsearch(
             X_train.values, y_train.values, lam=np.logspace(-3, 3, 11)
         )
+    elif model_name == "cumulative_1":
+        from pygam import GAM, s, f, te
+
+        # compute index of column Region
+        region_idx = X_train.columns.get_loc("Region")
+
+        model = GAM(s(0) + f(region_idx))
+    elif model_name == "cumulative_2":
+        from pygam import GAM, s, f, te
+
+        # compute index of column Region
+        region_idx = X_train.columns.get_loc("Region")
+
+        model = GAM(s(0) + s(1) + te(0, 1) + f(region_idx))
+    elif model_name == "cumulative_3":
+        from pygam import GAM, s, f, te
+
+        # compute index of column Region
+        region_idx = X_train.columns.get_loc("Region")
+
+        model = GAM(s(0) + s(1) + s(2) + te(0, 1) + te(0, 2) + te(1, 2) + f(region_idx))
     elif model_name == "geospaNN":
         import torch
         import geospaNN
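The three `cumulative_*` models differ only in their pygam term structure: smooths on the numeric features, tensor interactions between them (for `cumulative_2`/`cumulative_3`), and a factor term on the label-encoded `Region` column. A minimal sketch fitting a `cumulative_2`-style model on synthetic data; the feature meanings here are illustrative only:

```python
# cumulative_2-style pygam model on synthetic data: splines on two numeric
# features, their tensor interaction, and a factor term for Region.
import numpy as np
from pygam import GAM, s, f, te

rng = np.random.default_rng(0)
n = 200
X = np.column_stack(
    [
        rng.normal(size=n),          # column 0: numeric feature
        rng.normal(size=n),          # column 1: numeric feature
        rng.integers(0, 4, size=n),  # column 2: label-encoded Region
    ]
)
y = 2.0 * X[:, 0] + 0.5 * X[:, 1] + rng.normal(scale=0.1, size=n)

model = GAM(s(0) + s(1) + te(0, 1) + f(2))
model.fit(X, y)
print(model.predict(X[:5]))
```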
--- geocif-0.1.46/geocif.egg-info/SOURCES.txt
+++ geocif-0.1.47/geocif.egg-info/SOURCES.txt
@@ -33,10 +33,10 @@ geocif/cei/definitions.py
 geocif/cei/indices.py
 geocif/ml/__init__.py
 geocif/ml/correlations.py
-geocif/ml/correlations_backup.py
 geocif/ml/embedding.py
 geocif/ml/feature_engineering.py
 geocif/ml/feature_selection.py
+geocif/ml/misc.py
 geocif/ml/outliers.py
 geocif/ml/outlook.py
 geocif/ml/output.py
--- geocif-0.1.46/geocif/ml/correlations_backup.py
+++ /dev/null
@@ -1,412 +0,0 @@
-import os
-
-import matplotlib.pyplot as plt
-import palettable as pal
-import pandas as pd
-import seaborn as sns
-from tqdm import tqdm
-
-from geocif import utils
-from geocif.ml import embedding
-from geocif.ml import stages
-
-
-def most_correlated_feature_by_time(df_train, simulation_stages, target_col):
-    """
-
-    Args:
-        df_train:
-        simulation_stages:
-        target_col:
-
-    Returns:
-
-    """
-    frames = []
-
-    stages = [simulation_stages[: idx + 1] for idx in range(len(simulation_stages))]
-
-    # Only select columns that have been observed till the current stage
-    for stage in tqdm(stages, leave=False, desc="Compute most correlated feature"):
-        current_feature_set = [
-            col for col in df_train.columns if col.endswith(f"_{stage[-1]}")
-        ]
-
-        # Get the most correlated feature for each region
-        top_feature_by_region, counter = embedding.get_top_correlated_features(
-            df_train[current_feature_set + ["Region"]],
-            df_train[target_col],
-        )
-
-        # Create a dataframe with the most common top feature and number of occurrences over timestep
-        _feature = counter.most_common(1)[0][0]
-        # Loop through top_feature_by_region and find the average score for _feature
-        # Calculate the average score for 'DTR_36'
-        _feature_scores = [
-            value[1][0]
-            for key, value in top_feature_by_region.items()
-            if _feature in value[0]
-        ]
-        average_score = sum(_feature_scores) / len(_feature_scores)
-        _feature = utils.remove_last_part(_feature)
-
-        df = pd.DataFrame(
-            {
-                "Stage": [stage[-1]],
-                "Date": [utils.dict_growth_stages[stage[-1]]],
-                "Feature with Highest Correlation": [counter.most_common(1)[0][0]],
-                "Feature Category": [_feature],
-                "Score": [average_score],
-                # "Type": [ci.dict_indices[_feature][0]],
-                "Number of Occurrences": [counter.most_common(1)[0][1]],
-                # "Current Feature Set": [current_feature_set],
-            }
-        )
-        frames.append(df)
-
-    df_most_corr_feature_by_time = pd.concat(frames)
-
-
-def plot_feature_corr_by_time(df, **kwargs):
-    country = kwargs.get("country")
-    crop = kwargs.get("crop")
-    dir_output = kwargs.get("dir_output")
-    forecast_season = kwargs.get("forecast_season")
-    national_correlation = kwargs.get("national_correlation")
-    group_by = kwargs.get("groupby")
-
-    # Setup the figure and gridspec
-    fig = plt.figure(figsize=(10, 5))
-    gs = fig.add_gridspec(
-        3, 2, height_ratios=[6, 5, 1], width_ratios=[5, 1.5], hspace=0.6, wspace=0.0
-    )
-
-    # Assign subplots
-    ax_heatmap = fig.add_subplot(gs[0:2, 0])
-    ax_map = fig.add_subplot(gs[0, 1])
-    cbar_ax = fig.add_subplot(gs[2, 0])
-    ax4 = fig.add_subplot(gs[2, 1])
-
-    # Transpose and reverse the columns of the dataframe
-    #breakpoint()
-    ## Only select foll. columns:
-
-    df = df[
-        [
-            "TG",
-            "TG10p",
-            "DTR",
-            "vDTR",
-            "R99p",
-            "RX5day",
-            "MEAN_ESI4WK",
-        ]
-    ]
-    df_transpose = df.T
-    df = df_transpose[df_transpose.columns[::-1]]
-
-    # Split column names and only use value before space
-    df.columns = df.columns.str.split(" ").str[0]
-    # In row names, replace ESI4WK by ES
-    df.index = df.index.str.replace("MEAN_ESI4WK", "ZScore_ES")
-    df.index = df.index.str.replace("R99p", "MEAN_SM")
-    df.index = df.index.str.replace("RX5day", "AUC_SM")
-    # Remove the last row
-    # Select the first, third and fifth column
-    df = df[["Dec", "Feb", "Apr"]]
-    # Rename Dec to Planting - Early Vegetative
-    # Rename Feb to Early Vegetative - Senescence
-    # Rename Apr to Senescence - Harvest
-    df.columns = ["Planting - Early Vegetative", "Early Vegetative - Senescence", "Senescence - Harvest"]
-    ax_heatmap = sns.heatmap(
-        df,
-        ax=ax_heatmap,
-        annot=True,
-        cmap=pal.cartocolors.diverging.Earth_5.get_mpl_colormap(),
-        fmt=".2f",
-        square=False,
-        linewidths=0.5,
-        linecolor="white",
-        cbar_ax=cbar_ax,
-        cbar_kws={"orientation": "horizontal"},  # , "shrink": 0.5},
-        annot_kws={"size": 6},
-        xticklabels=True,
-        yticklabels=True,
-    )
-    ax_heatmap.tick_params(left=False, bottom=False)
-
-    # Plot the map using GeoPandas
-    dg_country = kwargs.get("dg_country")
-
-    ax_map = dg_country.plot(
-        ax=ax_map,
-        color="white",
-        edgecolor="black",
-        linewidth=1.0,
-        facecolor=None,
-        legend=False,
-    )
-
-    if not national_correlation:
-        id = kwargs["region_id"]
-        dg_region = dg_country[dg_country[group_by] == id]
-        ax_map = dg_region.plot(
-            ax=ax_map, color="blue", edgecolor="blue", linewidth=1.0, legend=False
-        )
-        # Set title with color blue
-        ax_map.set_title(f"Region: {id}", color="blue")
-
-    # No colorbar for the map
-    ax_map.axis("off")
-    # Remove borders
-    ax_map.spines["top"].set_visible(False)
-    ax_map.spines["right"].set_visible(False)
-    ax_map.spines["bottom"].set_visible(False)
-    ax_map.spines["left"].set_visible(False)
-    # ax4 should not be visible
-    ax4.axis("off")
-
-    # Add colorbar label
-    # cbar_ax.set_xlabel("Correlation Coefficient", labelpad=3, size="small")
-    cbar_ax.set_title("Correlation Coefficient", loc="left", size="small")
-    ax_heatmap.set_xticklabels(
-        ax_heatmap.get_xticklabels(), size="x-small", rotation=0, fontsize=7
-    )
-    ax_heatmap.set_yticklabels(ax_heatmap.get_yticklabels(), size="x-small", fontsize=7)
-    ax_heatmap.set_xlabel("")
-    ax_heatmap.set_ylabel(" ")
-    # Reduce font size of ticks of colorbar
-    cbar_ax.tick_params(axis="both", which="major", labelsize=6)
-
-    _country = country.title().replace("_", " ")
-    _crop = crop.title().replace("_", " ")
-    if not national_correlation:
-        fname = f"{country}_{crop}_{id}_corr_feature_by_time.png"
-    else:
-        fname = f"{country}_{crop}_corr_feature_by_time.png"
-    ax_heatmap.set_title(f"{_country}\n{_crop}")
-
-    # plt.tight_layout()
-    os.makedirs(dir_output, exist_ok=True)
-    plt.savefig(dir_output / fname, dpi=250)
-    plt.close()
-
-
-def _all_correlated_feature_by_time(df, **kwargs):
-    """
-
-    Args:
-        df:
-        **kwargs:
-
-    Returns:
-
-    """
-    frames = []
-    all_stages = kwargs.get("all_stages")
-    target_col = kwargs.get("target_col")
-    method = kwargs.get("method")
-
-    longest_stage = max(all_stages, key=len)
-
-    # Split the original string into a list of its parts
-    longest_stage = longest_stage.split("_")
-
-    # Generate the list of strings as described by the user, removing one element from the start each time
-    stages_features = ["_".join(longest_stage[i:]) for i in range(len(longest_stage))]
-
-    # Drop columns with no yield information
-    df = df.dropna(subset=[target_col])
-
-    # Only select columns that have been observed till the current stage
-    pbar = tqdm(stages_features, total=len(stages_features), leave=False)
-    for stage in pbar:
-        pbar.set_description(f"Calculating correlations")
-        pbar.update()
-
-        stage_name = stages.get_stage_information_dict(f"GD4_{stage}", method)[
-            "Stage Name"
-        ]
-        # starting_stage = stage_name.split("-")[0]
-        current_feature_set = [col for col in df.columns if stage_name in col]
-
-        # Get the most correlated feature for each region
-        df_tmp = embedding.get_all_features_correlation(
-            df[current_feature_set + ["Region"]], df[target_col], method
-        )
-
-        frames.append(df_tmp)
-
-    df_results = pd.concat(frames)
-    if not df_results.empty:
-        # Exclude Region column
-        df_results = df_results.drop(columns="Region")
-        # Groupby Dekad and compute mean of all columns apart from Region
-        df_results = df_results.groupby(method).mean()
-
-        all_stage_names = []
-        for stage in stages_features:
-            _tmp = stages.get_stage_information_dict(f"GD4_{stage}", method)[
-                "Stage Name"
-            ]
-            all_stage_names.append(_tmp)
-
-        df_results = df_results.reindex(all_stage_names)
-
-        # Drop rows with all NaN values
-        df_results = df_results.dropna(how="all")
-
-        # Split the index based on - and only keep the first element
-        df_results.index = df_results.index.str.split("-").str[0]
-
-        return df_results
-    else:
-        return pd.DataFrame()
-
-
-def all_correlated_feature_by_time(df, **kwargs):
-    """
-
-    Args:
-        df:
-        **kwargs:
-
-    Returns:
-
-    """
-    THRESHOLD = 0.1
-    national_correlation = kwargs.get("national_correlation")
-    group_by = kwargs.get("groupby")
-    combined_dict = kwargs.get("combined_dict")
-
-    dict_selected_features = {}
-    dict_best_cei = {}
-
-    if not national_correlation:
-        groups = df.groupby(group_by)
-        for region_id, group in tqdm(
-            groups, desc=f"Compute all correlated feature by {group_by}", leave=False
-        ):
-            df_corr = _all_correlated_feature_by_time(group, **kwargs)
-
-            # Remove columns with more than 50% NaN values
-            df_corr = df_corr.dropna(thresh=len(df_corr) / 2, axis=1)
-
-            if not df_corr.empty:
-                df_tmp = df_corr[df_corr.columns[(abs(df_corr.mean()) > THRESHOLD)]]
-                # Add the columns to dict_selected_features along with the absolute mean value
-                absolute_medians = df_tmp.abs().median()
-
-                # Create a DataFrame to display the column names and their absolute median values
-                absolute_median_df = absolute_medians.reset_index()
-                absolute_median_df.columns = ['CEI', 'Median']
-
-                # Add the CEI and Median value to dict_selected_features
-                dict_selected_features[region_id] = absolute_median_df
-
-                df_tmp2 = (
-                    df_tmp.median(axis=0)
-                    .abs()
-                    .sort_values(ascending=False)
-                    .reset_index()
-                )
-                df_tmp2.columns = ["Metric", "Value"]
-                # Add another column based on Type of Metric
-                for idx, row in df_tmp2.iterrows():
-                    df_tmp2.loc[idx, "Type"] = combined_dict[row[0]][0]
-
-                # Compute median of each CEI and sort the dataframe based on the absolute value of the median
-                dict_best_cei[region_id] = (
-                    df_tmp2.groupby("Type")
-                    .max()
-                    .reset_index()
-                    .sort_values("Value", ascending=False)["Metric"]
-                    .values
-                )
-
-                kwargs["region_id"] = region_id
-                plot_feature_corr_by_time(df_tmp, **kwargs)
-                # For each element in dict_best_cei, add the type of the cei
-            else:
-                # HACK
-                df_corr = _all_correlated_feature_by_time(df, **kwargs)
-
-                df_tmp = df_corr[df_corr.columns[(abs(df_corr.mean()) > THRESHOLD)]]
-                # Add the columns to dict_selected_features along with the absolute mean value
-                absolute_medians = df_tmp.abs().median()
-
-                # Create a DataFrame to display the column names and their absolute median values
-                absolute_median_df = absolute_medians.reset_index()
-                absolute_median_df.columns = ['CEI', 'Median']
-
-                # Add the CEI and Median value to dict_selected_features
-                dict_selected_features[region_id] = absolute_median_df
-                dict_best_cei[region_id] = {}
-    else:
-        df_corr = _all_correlated_feature_by_time(df, **kwargs)
-        df_tmp = df_corr[df_corr.columns[(abs(df_corr.mean()) > THRESHOLD)]]
-        # Add the columns to dict_selected_features along with the absolute mean value
-        absolute_medians = df_tmp.abs().median()
-
-        # Create a DataFrame to display the column names and their absolute median values
-        absolute_median_df = absolute_medians.reset_index()
-        absolute_median_df.columns = ['CEI', 'Median']
-
-        # Add the CEI and Median value to dict_selected_features
-        dict_selected_features[0] = absolute_median_df
-
-        plot_feature_corr_by_time(df_corr, **kwargs)
-
-    return dict_selected_features, dict_best_cei
-
-
-def feature_correlation_by_time(**kwargs):
-    raise NotImplementedError()
-
-    frames = []
-    simulation_stages = kwargs.get("simulation_stages")
-    df_train = kwargs.get("df_train")
-    target_col = kwargs.get("target_col")
-
-    stages = [simulation_stages[: idx + 1] for idx in range(len(simulation_stages))]
-
-    # Only select columns that have been observed till the current stage
-    for stage in tqdm(stages, leave=False, desc="Compute feature correlation by time"):
-        current_feature_set = [
-            col for col in df_train.columns if col.endswith(f"_{stage[-1]}")
-        ]
-
-        # Get the most correlated feature for each region
-        top_feature_by_region, counter = embedding.compute_feature_correlations(
-            df_train[current_feature_set + ["Region"]],
-            df_train[target_col],
-            "all",
-        )
-
-        # Create a dataframe with the most common top feature and number of occurrences over timestep
-        _feature = counter.most_common(1)[0][0]
-        # Loop through top_feature_by_region and find the average score for _feature
-        # Calculate the average score for 'DTR_36'
-        _feature_scores = [
-            value[1][0]
-            for key, value in top_feature_by_region.items()
-            if _feature in value[0]
-        ]
-        average_score = sum(_feature_scores) / len(_feature_scores)
-        _feature = utils.remove_last_part(_feature)
-
-        df = pd.DataFrame(
-            {
-                "Stage": [stage[-1]],
-                "Date": [utils.dict_growth_stages[stage[-1]]],
-                "Feature with Highest Correlation": [counter.most_common(1)[0][0]],
-                "Feature Category": [_feature],
-                "Score": [average_score],
-                # "Type": [ci.dict_indices[_feature][0]],
-                "Number of Occurrences": [counter.most_common(1)[0][1]],
-                # "Current Feature Set": [current_feature_set],
-            }
-        )
-        frames.append(df)
-
-    df_corr_feature_by_time = pd.concat(frames)