geocif 0.1.26__tar.gz → 0.1.28__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {geocif-0.1.26/geocif.egg-info → geocif-0.1.28}/PKG-INFO +1 -1
- {geocif-0.1.26 → geocif-0.1.28}/geocif/analysis.py +102 -93
- {geocif-0.1.26 → geocif-0.1.28}/geocif/geocif.py +48 -51
- {geocif-0.1.26 → geocif-0.1.28}/geocif/indices_runner.py +7 -7
- {geocif-0.1.26 → geocif-0.1.28}/geocif/ml/correlations.py +6 -2
- {geocif-0.1.26 → geocif-0.1.28}/geocif/ml/feature_selection.py +49 -3
- {geocif-0.1.26 → geocif-0.1.28}/geocif/ml/stats.py +5 -0
- {geocif-0.1.26 → geocif-0.1.28}/geocif/ml/trainers.py +9 -12
- {geocif-0.1.26 → geocif-0.1.28}/geocif/viz/plot.py +1 -1
- {geocif-0.1.26 → geocif-0.1.28/geocif.egg-info}/PKG-INFO +1 -1
- {geocif-0.1.26 → geocif-0.1.28}/setup.py +1 -1
- {geocif-0.1.26 → geocif-0.1.28}/LICENSE +0 -0
- {geocif-0.1.26 → geocif-0.1.28}/MANIFEST.in +0 -0
- {geocif-0.1.26 → geocif-0.1.28}/README.md +0 -0
- {geocif-0.1.26 → geocif-0.1.28}/geocif/__init__.py +0 -0
- {geocif-0.1.26 → geocif-0.1.28}/geocif/agmet/__init__.py +0 -0
- {geocif-0.1.26 → geocif-0.1.28}/geocif/agmet/geoagmet.py +0 -0
- {geocif-0.1.26 → geocif-0.1.28}/geocif/agmet/plot.py +0 -0
- {geocif-0.1.26 → geocif-0.1.28}/geocif/agmet/utils.py +0 -0
- {geocif-0.1.26 → geocif-0.1.28}/geocif/backup/__init__.py +0 -0
- {geocif-0.1.26 → geocif-0.1.28}/geocif/backup/constants.py +0 -0
- {geocif-0.1.26 → geocif-0.1.28}/geocif/backup/features.py +0 -0
- {geocif-0.1.26 → geocif-0.1.28}/geocif/backup/geo.py +0 -0
- {geocif-0.1.26 → geocif-0.1.28}/geocif/backup/geocif.py +0 -0
- {geocif-0.1.26 → geocif-0.1.28}/geocif/backup/metadata.py +0 -0
- {geocif-0.1.26 → geocif-0.1.28}/geocif/backup/models.py +0 -0
- {geocif-0.1.26 → geocif-0.1.28}/geocif/cei/__init__.py +0 -0
- {geocif-0.1.26 → geocif-0.1.28}/geocif/cei/definitions.py +0 -0
- {geocif-0.1.26 → geocif-0.1.28}/geocif/cei/indices.py +0 -0
- {geocif-0.1.26 → geocif-0.1.28}/geocif/logger.py +0 -0
- {geocif-0.1.26 → geocif-0.1.28}/geocif/ml/__init__.py +0 -0
- {geocif-0.1.26 → geocif-0.1.28}/geocif/ml/embedding.py +0 -0
- {geocif-0.1.26 → geocif-0.1.28}/geocif/ml/feature_engineering.py +0 -0
- {geocif-0.1.26 → geocif-0.1.28}/geocif/ml/outliers.py +0 -0
- {geocif-0.1.26 → geocif-0.1.28}/geocif/ml/outlook.py +0 -0
- {geocif-0.1.26 → geocif-0.1.28}/geocif/ml/output.py +0 -0
- {geocif-0.1.26 → geocif-0.1.28}/geocif/ml/stages.py +0 -0
- {geocif-0.1.26 → geocif-0.1.28}/geocif/ml/trend.py +0 -0
- {geocif-0.1.26 → geocif-0.1.28}/geocif/ml/xai.py +0 -0
- {geocif-0.1.26 → geocif-0.1.28}/geocif/playground/__init__.py +0 -0
- {geocif-0.1.26 → geocif-0.1.28}/geocif/playground/automl.py +0 -0
- {geocif-0.1.26 → geocif-0.1.28}/geocif/playground/misc.py +0 -0
- {geocif-0.1.26 → geocif-0.1.28}/geocif/utils.py +0 -0
- {geocif-0.1.26 → geocif-0.1.28}/geocif/viz/__init__.py +0 -0
- {geocif-0.1.26 → geocif-0.1.28}/geocif.egg-info/SOURCES.txt +0 -0
- {geocif-0.1.26 → geocif-0.1.28}/geocif.egg-info/dependency_links.txt +0 -0
- {geocif-0.1.26 → geocif-0.1.28}/geocif.egg-info/not-zip-safe +0 -0
- {geocif-0.1.26 → geocif-0.1.28}/geocif.egg-info/top_level.txt +0 -0
- {geocif-0.1.26 → geocif-0.1.28}/requirements.txt +0 -0
- {geocif-0.1.26 → geocif-0.1.28}/setup.cfg +0 -0
- {geocif-0.1.26 → geocif-0.1.28}/tests/test_geocif.py +0 -0
--- geocif-0.1.26/geocif/analysis.py
+++ geocif-0.1.28/geocif/analysis.py
@@ -374,17 +374,18 @@ class Geoanalysis:
 
         # Remove df_tmp from df_model
         df_model = df_model.drop(df_tmp.index)
-
         # Plot the histogram of MAPE
         # Create bins for '% of total Area (ha)' and 'MAPE'
+        bin_edges = np.linspace(0, df_model["% of total Area (ha)"].max() + 1, 5 + 1)
+
         df_model["Area Bins"] = pd.cut(
             df_model["% of total Area (ha)"],
-            bins=
+            bins=bin_edges,
             precision=0,
         )
         df_model["MAPE Bins"] = pd.cut(
             df_model["Mean Absolute Percentage Error"],
-            bins=
+            bins=5,  # [0, 5, 10, 15, 20, 25, 30, 50, max(df_model["Mean Absolute Percentage Error"])],
             right=False,
             precision=1,
         )
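The first hunk above replaces pd.cut's bare bin count for the area column with explicit edges from np.linspace, so the bin boundaries no longer depend on each input's observed range. A minimal standalone sketch of the difference, on toy data rather than geocif's:

    import numpy as np
    import pandas as pd

    area = pd.Series([1.0, 12.5, 40.0, 75.0, 99.0])  # stand-in for '% of total Area (ha)'

    # bins=5: pandas derives the edges from this particular series' min and max
    auto_bins = pd.cut(area, bins=5, precision=0)

    # Explicit edges: the same boundaries apply to every dataframe that is binned
    bin_edges = np.linspace(0, area.max() + 1, 5 + 1)
    fixed_bins = pd.cut(area, bins=bin_edges, precision=0)

    print(auto_bins.cat.categories)
    print(fixed_bins.cat.categories)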
@@ -444,7 +445,6 @@ class Geoanalysis:
         plt.xlabel("Mean Absolute Percentage Error (%)")
         plt.ylabel("Density")
         plt.legend(title="Country", title_fontsize="13")
-
         plt.savefig(self.dir_analysis / f"mape_histogram_{model}.png", dpi=250)
         plt.close()
 
@@ -460,6 +460,9 @@ class Geoanalysis:
         countries = df_model["Country"].unique().tolist()
         # make it title case and replace _ with space
         countries = [country.title().replace("_", " ") for country in countries]
+        countries = ["Malawi"]
+        df_model = df_model[df_model["Country"].isin(countries)]
+        self.dg = self.dg[self.dg["ADM0_NAME"].isin(countries)]
         plot.plot_df_shpfile(
             self.dg,  # dataframe containing adm1 name and polygon
             df_model,  # dataframe containing information that will be mapped
@@ -468,9 +471,9 @@ class Geoanalysis:
             name_col=col,  # Which column to plot
             dir_out=self.dir_analysis,  # Output directory
             fname=fname,  # Output file name
-            label=f"
+            label=f"MAPE (%)",
             vmin=df_model[col].min(),
-            vmax=
+            vmax=df_model[col].max(),
             cmap=pal.scientific.sequential.Bamako_20_r,
             series="sequential",
             show_bg=False,
@@ -488,6 +491,10 @@ class Geoanalysis:
         for model in models:
             df_model = df_plot[df_plot["Model"] == model]
 
+            countries = ["malawi"]
+            df_model = df_model[df_model["Country"].isin(countries)]
+            self.dg = self.dg[self.dg["ADM0_NAME"].isin(["Malawi", "malawi"])]
+
             countries = df_model["Country"].unique().tolist()
             if len(countries) > 1:
                 self.dir_plot = self.dir_analysis
@@ -502,6 +509,7 @@ class Geoanalysis:
                 + df_model["Region"].str.lower().str.replace("_", " ")
             )
 
+
             # Change Harvest year to type int
             df_model["Harvest Year"] = df_model["Harvest Year"].astype(int)
             annotate_region_column = (
@@ -517,83 +525,84 @@ class Geoanalysis:
             df_time_period = df_harvest_year[
                 df_harvest_year["Stage Name"] == time_period
             ]
- … (77 removed lines, old 520-596, not expanded in the published diff)
+            #
+            # """ % of total area """
+            if idx == 0:
+                fname = f"{self.country}_{self.crop}_perc_area.png"
+                col = "% of total Area (ha)"
+                plot.plot_df_shpfile(
+                    self.dg,  # dataframe containing adm1 name and polygon
+                    df_model,  # dataframe containing information that will be mapped
+                    merge_col="Country Region",  # Column on which to merge
+                    name_country=countries,  # Plot global map
+                    name_col=col,  # Which column to plot
+                    dir_out=self.dir_plot / str(year),  # Output directory
+                    fname=fname,  # Output file name
+                    label=f"% of Total Area (ha)\n{self.crop.title()}",
+                    vmin=df_model[col].min(),
+                    vmax=df_model[col].max(),
+                    cmap=pal.scientific.sequential.Bamako_20_r,
+                    series="sequential",
+                    show_bg=False,
+                    annotate_regions=False,
+                    annotate_region_column=annotate_region_column,
+                    loc_legend="lower left",
+                )
+            #
+            # # """ Unique regions """
+            fname = f"{self.country}_{self.crop}_region_ID.png"
+            col = "Region_ID"
+            df_model[col] = df_model[col].astype(int) + 1
+            if len(df_model["Region_ID"].unique() > 1):
+                # Create a dictionary with each region assigned a unique integer identifier and name
+                dict_region = {
+                    int(key): key for key in df_time_period["Region_ID"].unique()
+                }
+                plot.plot_df_shpfile(
+                    self.dg,  # dataframe containing adm1 name and polygon
+                    df_model,  # dataframe containing information that will be mapped
+                    dict_lup=dict_region,
+                    merge_col="Country Region",  # Column on which to merge
+                    name_country=countries,  # Plot global map
+                    name_col=col,  # Which column to plot
+                    dir_out=self.dir_plot / str(year),  # Output directory
+                    fname=fname,  # Output file name
+                    label=f"Region Cluster\n{self.crop.title()}",
+                    vmin=df_model[col].min(),
+                    vmax=df_model[col].max(),
+                    cmap=pal.tableau.Tableau_20.mpl_colors,
+                    series="qualitative",
+                    show_bg=False,
+                    alpha_feature=1,
+                    use_key=True,
+                    annotate_regions=False,
+                    annotate_region_column=annotate_region_column,
+                    loc_legend="lower left",
+                )
+            # breakpoint()
+
+            # """ Anomaly """
+            # fname = (
+            #     f"{fname_prefix}_{self.crop}_{time_period}_{year}_anomaly.png"
+            # )
+            # plot.plot_df_shpfile(
+            #     self.dg,  # dataframe containing adm1 name and polygon
+            #     df_harvest_year,  # dataframe containing information that will be mapped
+            #     merge_col="Country Region",  # Column on which to merge
+            #     name_country=countries,  # Plot global map
+            #     name_col="Anomaly",  # Which column to plot
+            #     dir_out=self.dir_plot / str(year),  # Output directory
+            #     fname=fname,  # Output file name
+            #     label=f"% of {self.number_lag_years}-year Median Yield\n{self.crop.title()}, {year}",
+            #     vmin=df_harvest_year["Anomaly"].min(),
+            #     vmax=110,  # df_harvest_year["Anomaly"].max(),
+            #     cmap=pal.cartocolors.diverging.Geyser_5_r,
+            #     series="sequential",
+            #     show_bg=False,
+            #     annotate_regions=False,
+            #     annotate_region_column=annotate_region_column,
+            #     loc_legend="lower left",
+            # )
 
             """ Predicted Yield """
             fname = f"{fname_prefix}_{self.crop}_{time_period}_{year}_predicted_yield.png"
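One detail in the new block: `if len(df_model["Region_ID"].unique() > 1):` takes len() of a boolean array, so the test is truthy whenever the column has at least one unique value rather than more than one; the closing parenthesis presumably belongs after len(...). A two-line check:

    import numpy as np

    unique_ids = np.unique(np.array([7, 7, 7]))  # a single region
    print(len(unique_ids > 1))   # 1 -> truthy: length of the boolean mask
    print(len(unique_ids) > 1)   # False: the likely intended comparison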
@@ -605,7 +614,7 @@ class Geoanalysis:
                 name_col="Predicted Yield (tn per ha)",  # Which column to plot
                 dir_out=self.dir_plot / str(year),  # Output directory
                 fname=fname,  # Output file name
-                label=f"
+                label=f"Predicted Yield (Mg/ha)\n{self.crop.title()}, {year}",
                 vmin=df_harvest_year[self.predicted].min(),
                 vmax=df_harvest_year[self.predicted].max(),
                 cmap=pal.scientific.sequential.Bamako_20_r,
@@ -749,10 +758,10 @@ class Geoanalysis:
         name_shapefile = df[df["Option"] == "boundary_file"]["Value"].values[0]
 
         for crop in crops:
-            # Does a table with the name {country}
-            table = f"{country}
+            # Does a table with the name {country}_{crop} exist in the database?
+            table = f"{country}_{crop}"
             if self.table_exists(self.db_path, table):
-                self.dict_config[f"{country}
+                self.dict_config[f"{country}_{crop}"] = {
                     "method": method,
                     "crops": crop,
                     "models": models,
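table_exists itself is not part of this diff; for a SQLite file like the db_path used here, a check along these lines is typical (a sketch, not the package's actual implementation):

    import sqlite3

    def table_exists(db_path, table):
        # Look the table up in sqlite_master
        with sqlite3.connect(db_path) as conn:
            cur = conn.execute(
                "SELECT 1 FROM sqlite_master WHERE type='table' AND name=?", (table,)
            )
            return cur.fetchone() is not None

Called as table_exists(db_path, f"{country}_{crop}"), it mirrors the completed table-naming scheme above.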
@@ -789,7 +798,7 @@ class Geoanalysis:
             self.dg["ADM0_NAME"] + " " + self.dg["ADM2_NAME"]
         )
         # Make it lower case
-        self.dg["Country Region"] = self.dg["Country Region"].str.lower()
+        self.dg["Country Region"] = self.dg["Country Region"].str.lower().replace("_", " ")
 
 
 def run(path_config_files=[Path("../config/geocif.txt")]):
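A subtlety in this one-liner: on a Series, .replace("_", " ") only swaps elements that are exactly "_"; substring replacement needs .str.replace. A quick check:

    import pandas as pd

    s = pd.Series(["angola_maize", "_"])
    print(s.replace("_", " ").tolist())      # ['angola_maize', ' '] - whole-value match only
    print(s.str.replace("_", " ").tolist())  # ['angola maize', ' '] - substring replacement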
@@ -800,16 +809,16 @@ def run(path_config_files=[Path("../config/geocif.txt")]):
     """ Loop over each country, crop, model combination in dict_config """
     frames = []
     for country_crop, value in obj.dict_config.items():
-        obj.
-        obj.
+        obj.crop = value["crops"]
+        # to get country, remove obj.crops from country_crop
+        obj.country = country_crop.replace(f"_{obj.crop}", "")
 
         obj.admin_zone = value["admin_zone"]
         obj.boundary_file = value["name_shapefile"]
         obj.method = value["method"]
-        obj.number_lag_years =
+        obj.number_lag_years = 5
 
-        obj.table = f"{obj.country}
-        breakpoint()
+        obj.table = f"{obj.country}_{obj.crop}"
         models = value["models"]
         for model in models:
             obj.model = model

--- geocif-0.1.26/geocif/geocif.py
+++ geocif-0.1.28/geocif/geocif.py
@@ -93,6 +93,7 @@ class Geocif:
         self.countries = ast.literal_eval(self.parser.get("DEFAULT", "countries"))
         self.do_parallel = self.parser.getboolean("DEFAULT", "do_parallel")
         self.update_input_file = self.parser.getboolean("DEFAULT", "update_input_file")
+        self.correlation_plots = self.parser.getboolean("DEFAULT", "correlation_plots")
         self.national_correlation = self.parser.getboolean(
             "DEFAULT", "national_correlation"
         )
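The new correlation_plots flag is read with ConfigParser.getboolean, which accepts 1/0, yes/no, true/false, and on/off in the config file. A self-contained check:

    from configparser import ConfigParser

    parser = ConfigParser()
    parser.read_string("[DEFAULT]\ncorrelation_plots = yes\n")
    # getboolean maps 1/0, yes/no, true/false, on/off to bool
    print(parser.getboolean("DEFAULT", "correlation_plots"))  # True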
@@ -143,8 +144,8 @@ class Geocif:
         """
         # If ML model is run for individual region or cluster, then Region_ID is the same for each region
         # or cluster and therefore redundant for the ML model
-        if self.cluster_strategy in ["individual", "auto_detect"]:
-            self.cat_features.remove("Region_ID")
+        #if self.cluster_strategy in ["individual", "auto_detect"]:
+        #    self.cat_features.remove("Region_ID")
 
         self.fixed_columns: list = [
             "Country",
@@ -264,6 +265,12 @@ class Geocif:
                 verbose=False,
                 # callbacks=[TQDMCallback(self.best_hyperparams["iterations"])],
             )
+        elif self.model_name == "merf":
+            Z_train = np.ones((len(X_train), 1))
+            clusters_train = df_region["Region"]
+            clusters_train.reset_index(drop=True, inplace=True)
+
+            self.model.fit(X_train, Z_train, clusters_train.astype("object"), y_train.values)
         elif self.model_name == "linear":
             self.model.fit(X_train_scaled, y_train)
         elif self.model_name == "gam":
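On the new merf branch: the merf package fits y = f(X) + Z·b_cluster + e, and a Z of all ones, as built above, gives each cluster its own random intercept. A toy end-to-end sketch with synthetic data (assumes merf >= 1.0, whose fit takes (X, Z, clusters, y); a RandomForest stands in for the CatBoost fixed-effects model geocif uses):

    import numpy as np
    import pandas as pd
    from merf import MERF
    from sklearn.ensemble import RandomForestRegressor

    rng = np.random.default_rng(0)
    X = pd.DataFrame(rng.normal(size=(200, 3)), columns=["f1", "f2", "f3"])
    clusters = pd.Series(rng.choice(["north", "south"], size=200)).astype("object")
    Z = np.ones((len(X), 1))  # one column of ones -> random intercept per cluster
    y = 2 * X["f1"] + (clusters == "north") * 0.5 + rng.normal(scale=0.1, size=200)

    model = MERF(RandomForestRegressor(n_estimators=50), max_iterations=5)
    model.fit(X, Z, clusters, y.values)   # same call shape as the diff
    y_hat = model.predict(X, Z, clusters)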
@@ -273,23 +280,6 @@ class Geocif:
                 self.model.fit(X_train, y_train)
             except:
                 self.logger.error(f"Error fitting model for {self.country} {self.crop}")
-        # if self.cluster_strategy == "individual" or len(X_train) == 1:
-        #     self.model.fit(
-        #         X_train,
-        #         y_train,
-        #         cat_features=self.cat_features,
-        #         verbose=False,
-        #         # callbacks=[TQDMCallback(self.best_hyperparams["iterations"])],
-        #     )
-        # elif self.cluster_strategy in ["auto_detect", "single"]:
-        #     # Use MERF
-        #     Z_train = np.ones((len(X_train), 1))
-        #     clusters_train = df_region["Region"]
-        #     clusters_train.reset_index(drop=True, inplace=True)
-        #
-        #     self.model.fit(X_train, Z_train, clusters_train.astype("object"), y_train.values)
-        #     # change clusters_train to object dtype
-        #     # clusters_train = clusters_train.astype("object")
 
     def predict(self, df_region, scaler=None):
         """
@@ -313,7 +303,6 @@ class Geocif:
                 len(X_test), df_region[f"Last Year {self.target}"].values
             )
         else:
-            best_hyperparameters = self.model.get_params().copy()
             if self.model_name in ["linear", "gam"]:
                 # Drop cat_features from X_test
                 X_test = X_test.drop(
@@ -327,12 +316,17 @@ class Geocif:
             if self.estimate_ci:
                 if self.estimate_ci_for_all or self.forecast_season == self.today_year:
                     y_pred, y_pred_ci = self.model.predict(X_test, alpha=0.1)
+                    best_hyperparameters = self.model.get_params().copy()
+            elif self.model_name == "merf":
+                Z_test = np.ones((len(X_test), 1))
+                clusters_test = df_region["Region"]
+                clusters_test.reset_index(drop=True, inplace=True)
+
+                y_pred = self.model.predict(X_test, Z_test, clusters_test.astype("object"))
+                best_hyperparameters = self.model.fe_model.get_params().copy()
             else:
                 y_pred = self.model.predict(X_test)
-
-            # clusters_test = df_region["Region"]
-            # clusters_test.reset_index(drop=True, inplace=True)
-            # y_pred = self.model.predict(X_test, Z_test, clusters_test.astype("object"))
+                best_hyperparameters = self.model.get_params().copy()
 
             if self.check_yield_trend:
                 # Get information for retrending
@@ -353,7 +347,7 @@ class Geocif:
 
             # Create a dataframe with forecast results
             shp = len(X_test)
-            experiment_id = f"{self.country}
+            experiment_id = f"{self.country}_{self.crop}"
             now = ar.utcnow().to("America/New_York").format("MMMM-DD-YYYY HH:mm:ss")
             selected_features = self.selected_features + self.cat_features
             df = pd.DataFrame(
@@ -431,6 +425,7 @@ class Geocif:
         # Create an index based on following columns
         index_columns = [
             "Model",
+            "Cluster Strategy"
             "Country",
             "Region",
             "Crop",
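Note that the inserted "Cluster Strategy" literal has no trailing comma; Python concatenates adjacent string literals, so the list gains one fused element instead of two. A two-line demonstration:

    index_columns = ["Model", "Cluster Strategy" "Country", "Region"]
    print(index_columns)  # ['Model', 'Cluster StrategyCountry', 'Region']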
@@ -464,7 +459,7 @@ class Geocif:
         2. method = "fraction" - Select a fraction (1-100) of all stages
         """
         stages_features = stages.select_stages_for_ml(
-            stages_features, method="fraction", n=
+            stages_features, method="fraction", n=60
         )
 
         for stage in stages_features:
@@ -723,6 +718,7 @@ class Geocif:
         )
 
         # cat_features should be converted to category type
+
         df[self.cat_features] = df[self.cat_features].astype("category")
 
         """ Heatmap of correlation of various features with yield at each time step"""
@@ -743,30 +739,31 @@ class Geocif:
             how="outer",
         )
 
- … (24 removed lines, old 746-769, not expanded in the published diff)
+        if self.correlation_plots:
+            dict_kwargs = {}
+            dict_kwargs["all_stages"] = self.all_stages
+            dict_kwargs["target_col"] = self.target
+            dict_kwargs["country"] = self.country
+            dict_kwargs["crop"] = self.crop
+            dict_kwargs["dir_output"] = (
+                self.dir_analysis
+                / self.country
+                / self.crop
+                / self.model_name
+                / str(self.forecast_season)
+            )
+            dict_kwargs["forecast_season"] = self.forecast_season
+            dict_kwargs["method"] = self.method
+            dict_kwargs["national_correlation"] = self.national_correlation
+            dict_kwargs["groupby"] = self.correlation_plot_groupby
+            dict_kwargs["dg_country"] = self.dg_country
+            dict_kwargs["combined_dict"] = self.combined_dict
+
+            self.logger.info(f"Correlation plot for {self.country} {self.crop}")
+            (
+                dict_selected_features,
+                dict_best_cei,
+            ) = correlations.all_correlated_feature_by_time(df, **dict_kwargs)
 
         """ Separate into train and test datasets based on forecast_season """
         mask = df["Harvest Year"] == self.forecast_season
@@ -841,7 +838,7 @@ class Geocif:
         self.all_seasons = self.df_results["Harvest Year"].unique()
 
         """ If not using a ML model then set XAI and CI to False """
-        if not self.ml_model or self.model_name in ["linear", "gam"]:
+        if not self.ml_model or self.model_name in ["linear", "gam", "merf"]:
             self.do_xai = False
             self.estimate_ci = False
             self.check_yield_trend = False

--- geocif-0.1.26/geocif/indices_runner.py
+++ geocif-0.1.28/geocif/indices_runner.py
@@ -155,21 +155,21 @@ class cei_runner(base.BaseGeo):
                 "ndvi",
                 True,
             )
-            for year in range(
+            for year in range(2023, ar.utcnow().year + 1)
             for status, path, filename, admin_zone, category in combinations
         ]
 
         # Only keep those entries in combinations where the third elemt is
         # mozambique, south_africa, angola or dem_people's_rep_of_korea
         # This is done to test the code for these countries
- … (2 removed lines, old 165-166, not expanded in the published diff)
+        combinations = [i for i in combinations if "angola_maize" in i[3] or
+                        "lesotho_maize" in i[3] or
                         # "namibia" in i[2] or
                         # "united_republic_of_tanzania" in i[2] or
- … (2 removed lines, old 169-170, not expanded in the published diff)
+                        "zambia_maize" in i[3] or
+                        "zimbabwe_maize" in i[3] or
                         # "south_africa" in i[2] or
- … (1 removed line, old 172, not expanded in the published diff)
+                        "mozambique_maize" in i[3]]
                         # "malawi" in i[2]]
 
         if self.do_parallel:
@@ -201,7 +201,7 @@ def run(path_config_files=[]):
     indices.validate_index_definitions()
 
     for method in [
-        "
+        "biweekly_r",  # "dekad_r" # "dekad_r"
     ]:  # , "full_season", "phenological_stages", "fraction_season"]:
         obj = cei_runner(path_config_files)
         obj.main(method)

--- geocif-0.1.26/geocif/ml/correlations.py
+++ geocif-0.1.28/geocif/ml/correlations.py
@@ -274,8 +274,12 @@ def all_correlated_feature_by_time(df, **kwargs):
         # For each element in dict_best_cei, add the type of the cei
         else:
             # HACK
-            dict_selected_features[region_id] = dict_selected_features[0]
-            dict_best_cei[region_id] = dict_best_cei[0]
+            df_corr = _all_correlated_feature_by_time(df, **kwargs)
+            dict_selected_features[region_id] = df_corr.columns
+            dict_best_cei[region_id] = {}
+
+            #dict_selected_features[region_id] = dict_selected_features[0]
+            #dict_best_cei[region_id] = dict_best_cei[0]
         # Combine all unique values from the existing dictionary elements
         # combined_metrics = set()
         # for key in dict_selected_features:

--- geocif-0.1.26/geocif/ml/feature_selection.py
+++ geocif-0.1.28/geocif/ml/feature_selection.py
@@ -1,4 +1,5 @@
 import numpy as np
+from tqdm import tqdm
 from sklearn.ensemble import RandomForestRegressor
 
 
@@ -27,6 +28,7 @@ def select_features(X, y, method="RFE", min_features_to_select=3):
     # # You would adjust the threshold based on new criteria since variances have been normalized.
     # selector = VarianceThreshold(threshold=scaled_data.var().mean())
     # X = selector.fit_transform(scaled_data)
+    selector = None
 
     # Fill in columns with median of that column
     X = X.fillna(X.median())
@@ -47,17 +49,58 @@ def select_features(X, y, method="RFE", min_features_to_select=3):
     if method == "SelectKBest":
         from sklearn.feature_selection import SelectKBest, f_regression
 
-        k =
-        selector = SelectKBest(score_func=f_regression, k=
+        k = 15  # Number of features to select
+        selector = SelectKBest(score_func=f_regression, k=k)
 
         # Fit the selector to the data and transform the data to select the best features
-        X_new = selector.fit_transform(X, y)
+        try:
+            X_new = selector.fit_transform(X, y)
+        except:
+            breakpoint()
 
         # Get the selected feature indices
         selected_features = selector.get_support(indices=True)
 
         # Get the selected feature names
         selected_features = X.columns[selected_features].tolist()
+    elif method == "SHAP":
+        import pandas as pd
+        from catboost import CatBoostRegressor
+        from fasttreeshap import TreeExplainer as FastTreeExplainer
+        from sklearn.model_selection import cross_val_score
+
+        model = CatBoostRegressor(n_estimators=500, verbose=0, use_best_model=False)
+        model.fit(X, y)
+
+        explainer = FastTreeExplainer(model)
+        shap_values = explainer.shap_values(X)
+
+        # Step 5: Summarize the SHAP values for feature importance
+        shap_importances = np.mean(np.abs(shap_values), axis=0)
+        shap_importance_df = pd.DataFrame({
+            'feature': X.columns,
+            'importance': shap_importances
+        }).sort_values(by='importance', ascending=False)
+
+        def evaluate_model_with_n_features(N, X_train, y_train):
+            top_features = shap_importance_df['feature'].head(N).values
+            X_train_selected = X_train[top_features]
+            selector = CatBoostRegressor(n_estimators=500, random_state=42, verbose=0)
+            scores = cross_val_score(selector, X_train_selected, y_train, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
+
+            return np.mean(scores)
+
+        # Evaluate model performance with different number of features
+        nrange = [5, 10, 15, 20, 25, 30]
+        cv_scores = []
+        for N in tqdm(nrange):
+            cv_scores.append(evaluate_model_with_n_features(N, X, y))
+
+        # Select the number of features that gives the best cross-validation score (lowest MSE)
+        optimal_N = nrange[np.argmax(cv_scores)]
+
+        # Use optimal N to select features
+        selected_features = shap_importance_df['feature'].head(optimal_N).values.tolist()
     elif method == "feature_engine":
         from feature_engine.selection import SmartCorrelatedSelection
 
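The new SHAP branch ranks features by mean absolute SHAP value and then cross-validates the cut-off N. A condensed, self-contained version of the ranking step (using the shap package in place of fasttreeshap and a RandomForest in place of CatBoost):

    import numpy as np
    import pandas as pd
    import shap
    from sklearn.datasets import make_regression
    from sklearn.ensemble import RandomForestRegressor

    X_arr, y = make_regression(n_samples=200, n_features=8, random_state=0)
    X = pd.DataFrame(X_arr, columns=[f"f{i}" for i in range(8)])

    model = RandomForestRegressor(n_estimators=50, random_state=0).fit(X, y)
    shap_values = shap.TreeExplainer(model).shap_values(X)  # (n_samples, n_features)

    # Mean |SHAP| per feature, largest contributors first
    importance = pd.Series(np.abs(shap_values).mean(axis=0), index=X.columns)
    print(importance.sort_values(ascending=False).head(5))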
@@ -110,6 +153,9 @@ def select_features(X, y, method="RFE", min_features_to_select=3):
         selector.fit(X.values, y.values)
         selected_features_mask = selector.support_
         selected_features = X.columns[selected_features_mask].tolist()
+        tentative_features = X.columns[selector.support_weak_].tolist()
+
+        selected_features = selected_features + tentative_features
     elif method == "Leshy":
         import arfs.feature_selection.allrelevant as arfsgroot
         from catboost import CatBoostRegressor
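support_ and support_weak_ follow BorutaPy's convention: confirmed features versus tentative ones that neither beat nor lost to their shadow copies; the change now keeps both sets. A sketch assuming the boruta package is the selector in play here:

    import pandas as pd
    from boruta import BorutaPy
    from sklearn.datasets import make_regression
    from sklearn.ensemble import RandomForestRegressor

    X_arr, y = make_regression(n_samples=150, n_features=10, n_informative=3, random_state=0)
    X = pd.DataFrame(X_arr, columns=[f"f{i}" for i in range(10)])

    rf = RandomForestRegressor(n_estimators=100, random_state=0)
    selector = BorutaPy(rf, n_estimators="auto", random_state=0)
    selector.fit(X.values, y)

    confirmed = X.columns[selector.support_].tolist()       # passed the shadow test
    tentative = X.columns[selector.support_weak_].tolist()  # inconclusive, now kept too
    print(confirmed + tentative)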
--- geocif-0.1.26/geocif/ml/stats.py
+++ geocif-0.1.28/geocif/ml/stats.py
@@ -205,6 +205,11 @@ def add_statistics(dir_stats, df, country, crop, admin_zone, stats, method, target_col):
         "value",
     ]
 
+    # Replace any inf or 0 values by NaN
+    yield_value = yield_value.replace([0, np.inf, -np.inf], np.nan)
+    area_value = area_value.replace([0, np.inf, -np.inf], np.nan)
+    prod_value = prod_value.replace([0, np.inf, -np.inf], np.nan)
+
     if not yield_value.empty:
         group.loc[:, target_col] = yield_value.values[0]
         group.loc[:, "Area (ha)"] = area_value.values[0]
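The added guard in isolation: zeros and infinities in the source statistics become NaN, so they propagate as missing values instead of masquerading as real yields:

    import numpy as np
    import pandas as pd

    yield_value = pd.Series([0.0, 2.5, np.inf, -np.inf, 1.8])
    print(yield_value.replace([0, np.inf, -np.inf], np.nan).tolist())
    # [nan, 2.5, nan, nan, 1.8]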
--- geocif-0.1.26/geocif/ml/trainers.py
+++ geocif-0.1.28/geocif/ml/trainers.py
@@ -252,7 +252,7 @@ def auto_train(
     else:
         hyperparams = {}
 
-    if model_name
+    if model_name in ["catboost", "merf"]:
         hyperparams = {
             "depth": 6,
             "learning_rate": 0.01,
@@ -265,17 +265,14 @@ def auto_train(
             "random_seed": seed,
             "verbose": False,
         }
- … (8 removed lines, old 268-275, not expanded in the published diff)
-        # # For all features with AUC in name, set monotone_constraints to 1, rest are 0
-        # # monotone_constraints = [1 if "AUC_" in ftr else 0 for ftr in feature_names]
-        # model = CatBoostRegressor(**hyperparams, cat_features=cat_features)
+        if model_name == "catboost":
+            model = CatBoostRegressor(**hyperparams, cat_features=cat_features)
+        elif model_name == "merf":
+            from merf import MERF
+
+            hyperparams["iterations"] = 1000
+            regr = CatBoostRegressor(**hyperparams, cat_features=cat_features)
+            model = MERF(regr, max_iterations=10)
     elif model_name == "linear":
         from sklearn.linear_model import LassoCV
 

--- geocif-0.1.26/geocif/viz/plot.py
+++ geocif-0.1.28/geocif/viz/plot.py
@@ -332,7 +332,7 @@ def plot_df_shpfile(
         cb.ax.set_title(
             label, fontsize=8, fontweight="semibold", fontfamily="Arial"
         )
-        cb.ax.set_xticklabels(ticks, fontsize=
+        cb.ax.set_xticklabels(ticks, fontsize=4, fontfamily="Arial")
 
         # Use BoundaryNorm to create discrete levels
         # sm = plt.cm.ScalarMappable(cmap=cmap.mpl_colormap, norm=norm)