PyPI - geocif - Versions diffs - 0.2.24__tar.gz → 0.2.26__tar.gz - Mend

geocif 0.2.24.tar.gz → 0.2.26.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (83)

{geocif-0.2.24/geocif.egg-info → geocif-0.2.26}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: geocif
-Version: 0.2.24
+Version: 0.2.26
 Summary: Models to visualize and forecast crop conditions and yields
 Home-page: https://ritviksahajpal.github.io/yield_forecasting/
 Author: Ritvik Sahajpal

{geocif-0.2.24 → geocif-0.2.26}/geocif/geocif.py RENAMED Viewed

@@ -179,6 +179,13 @@ class Geocif:
             "Production (tn)",
         ]
+        if self.model_type == "REGRESSION":
+            self.target_column = (
+                f"Detrended {self.target}" if self.check_yield_trend else self.target
+            )
+        elif self.model_type == "CLASSIFICATION":
+            self.target_column = self.target_class
         self.combined_dict = {
             **di.dict_indices,
             **di.dict_ndvi,
@@ -204,7 +211,30 @@ class Geocif:
         # obj_pickle = outlook.Outlook(self.pickle_file)
         # self.df_outlook = obj_pickle.read_outlook_file()
-    def train(self, df_region, scaler=None):
+    def apply_feature_selector(self, dir_output):
+        if self.model_name in ["cumulative_1", "cumulative_2", "cumulative_3"]:
+            all_features = self.X_train.columns
+            # Select the columns with use_ceis in it
+            self.selected_features = [
+                column
+                for column in all_features
+                if any(cei in column for cei in self.use_ceis)
+            ]
+        else:
+            self.logger.info(f"Selecting features for {self.country} {self.crop}")
+            selector, _, self.selected_features = fs.select_features(
+                self.X_train, self.y_train, method=self.feature_selection, dir_output=dir_output
+            )
+            self.logger.info(f"Selected features: {self.selected_features}")
+        """ Update model to include conformal estimates """
+        if "lat" not in self.selected_features and self.include_lat_lon_as_feature:
+            self.selected_features.append("lat")
+        if "lon" not in self.selected_features and self.include_lat_lon_as_feature:
+            self.selected_features.append("lon")
+    def train_model(self, df_region, scaler=None):
         """
         Args:
@@ -214,55 +244,9 @@ class Geocif:
         Returns:
         """
-        """ Perform feature selection """
-        if self.model_type == "REGRESSION":
-            target_column = (
-                f"Detrended {self.target}" if self.check_yield_trend else self.target
-            )
-        elif self.model_type == "CLASSIFICATION":
-            target_column = self.target_class
-        # Drop rows where target_column is NaN
-        df_region = df_region.dropna(subset=[target_column])
-        X_train = df_region[self.feature_names]
-        # Drop any columns with NaNs
-        X_train = X_train.dropna(axis=1, how="any")
-        y_train = df_region[target_column]
         if self.ml_model:
-            if self.model_name in ["cumulative_1", "cumulative_2", "cumulative_3"]:
-                all_features = X_train.columns
-                # Select the columns with use_ceis in it
-                self.selected_features = [
-                    column
-                    for column in all_features
-                    if any(cei in column for cei in self.use_ceis)
-                ]
-            else:
-                self.logger.info(f"Selecting features for {self.country} {self.crop}")
-                selector, _, self.selected_features = fs.select_features(
-                    X_train, y_train, method=self.feature_selection
-                )
-                self.logger.info(f"Selected features: {self.selected_features}")
-            """ Update model to include conformal estimates """
-            if "lat" not in self.selected_features and self.include_lat_lon_as_feature:
-                self.selected_features.append("lat")
-            if "lon" not in self.selected_features and self.include_lat_lon_as_feature:
-                self.selected_features.append("lon")
             X_train = df_region[self.selected_features + self.cat_features]
-            dir_output = (
-                self.dir_analysis
-                / self.country
-                / self.crop
-                / self.model_name
-                / str(self.forecast_season)
-            )
             region_id = df_region["Region_ID"].unique()[0]
             X_train.to_csv(dir_output / f"X_train_{region_id}.csv", index=False)
             if scaler:
@@ -284,9 +268,9 @@ class Geocif:
                 "Harvest Year",
                 df_region[self.selected_features + self.cat_features + [self.target]],
                 X_train_scaled,
-                y_train,
+                self.y_train,
                 feature_names=self.selected_features,
-                target_col=target_column,
+                target_col=self.target_column,
                 optimize=self.optimize,
                 fraction_loocv=self.fraction_loocv,
                 cat_features=self.cat_features,
@@ -303,7 +287,7 @@ class Geocif:
                 if self.model_name == "catboost":
                     self.model.fit(
                         X_train,
-                        y_train,
+                        self.y_train,
                         cat_features=self.cat_features,
                         verbose=True,
                     )
@@ -313,16 +297,16 @@ class Geocif:
                             item for item in self.cat_features if item != "Harvest Year"
                         ]
                     )
-                    self.model.fit(X_train, y_train)
+                    self.model.fit(X_train, self.y_train)
                 elif self.model_name == "ydf":
                     # Combine X_train and y_train
-                    df_train = pd.concat([X_train, y_train], axis=1)
+                    df_train = pd.concat([X_train, self.y_train], axis=1)
                     self.model = self.model.train(df_train)
                 elif self.model_name == "geospaNN":
                     self.model.fit(
                         X_train,
-                        y_train,
+                        self.y_train,
                         # callbacks=[TQDMCallback(self.best_hyperparams["iterations"])],
                     )
                 elif self.model_name == "merf":
@@ -334,15 +318,15 @@ class Geocif:
                         X_train,
                         Z_train,
                         clusters_train.astype("object"),
-                        y_train.values,
+                        self.y_train.values,
                     )
                 elif self.model_name == "linear":
-                    self.model.fit(X_train_scaled, y_train)
+                    self.model.fit(X_train_scaled, self.y_train)
                 elif self.model_name == "gam":
-                    self.model.fit(X_train_scaled.values, y_train.values)
+                    self.model.fit(X_train_scaled.values, self.y_train.values)
                     self.best_hyperparams = {}
                 elif self.model_name in ["cubist"]:
-                    self.model.fit(X_train, y_train)
+                    self.model.fit(X_train, self.y_train)
                 elif self.model_name in [
                     "cumulative_1",
                     "cumulative_2",
@@ -377,7 +361,7 @@ class Geocif:
                     # Combine scaled numeric features and encoded region
                     X_train_scaled = pd.concat([X_scaled_numeric, X_region], axis=1)
-                    self.model.fit(X_train_scaled, y_train)
+                    self.model.fit(X_train_scaled, self.y_train)
             except Exception as e:
                 self.logger.error(
                     f"Error fitting model for {self.country} {self.crop} {e}"
@@ -782,6 +766,14 @@ class Geocif:
         Returns:
         """
+        dir_output = (
+            self.dir_analysis
+            / self.country
+            / self.crop
+            / self.model_name
+            / str(self.forecast_season)
+        )
         from sklearn.preprocessing import StandardScaler
         scaler = StandardScaler() if self.model_name in ["linear", "gam"] else None
@@ -807,8 +799,6 @@ class Geocif:
             mask_train = self.df_train["Region_ID"] == region
             mask_test = self.df_test["Region_ID"] == region
-            num_regions_in_cluster = self.df_train[mask_train]["Region"].unique()
             if self.cluster_strategy == "individual":
                 region_name = self.df_train["Region"].unique()[idx]
                 pbar.set_description(f"Fit/Predict for {region_name}")
@@ -836,12 +826,20 @@ class Geocif:
             if self.last_year_yield_as_feature:
                 common_columns += [f"Last Year {self.target}"]
-            """ Train """
+            """ Feature selection and then Train """
             # Filter dataframe based on region and self.feature_names
             df_region_train = self.df_train[mask_train]
             df_region_train = df_region_train[self.fixed_columns + common_columns]
             df_region_train.reset_index(drop=True, inplace=True)
-            self.train(df_region_train, scaler)
+            df_region_train = df_region_train.dropna(subset=[self.target_column])
+            self.X_train = df_region_train[self.feature_names]
+            # Drop any columns with NaNs
+            self.X_train.dropna(axis=1, how="any", inplace=True)
+            self.y_train = df_region_train[self.target_column]
+            breakpoint()
+            self.apply_feature_selector(dir_output)
+            self.train_model(df_region_train, scaler, dir_output)
             """ Predict """
             if self.check_yield_trend:

{geocif-0.2.24 → geocif-0.2.26}/geocif/ml/feature_selection.py RENAMED Viewed

@@ -34,10 +34,11 @@ def are_all_features_non_eo(features):
 def select_features(
     X, y,
-    method="RFE",
+    method="multi",
     min_features_to_select=3,
     threshold_nan=0.2,
-    threshold_unique=0.6
+    threshold_unique=0.6,
+    dir_output="."
 ):
     """
     Feature-selection wrapper supporting many methods plus a new 'multi' option.
@@ -75,8 +76,9 @@ def select_features(
     # --- multi-method ensemble -------------------------------
     if method == "multi":
         counter = Counter()
+        models = ["BorutaPy", "mrmr"]
         # run three selectors and count feature picks
-        for sub_m in ["BorutaPy", "mrmr"]:
+        for sub_m in models:
             _, _, feats = select_features(
                 X_clean, y,
                 method=sub_m,
@@ -84,7 +86,6 @@ def select_features(
                 threshold_nan=threshold_nan,
                 threshold_unique=threshold_unique
             )
-            print(sub_m, feats)
             counter.update(feats)
         # union of all features
@@ -97,12 +98,11 @@ def select_features(
         fig = freq.plot(kind="bar", width=0.9).get_figure()
         plt.title("Feature selection frequency across methods")
         plt.xlabel("Feature")
-        plt.ylabel("Times selected (out of 3)")
+        plt.ylabel(f"Times selected (out of {len(models)})")
         plt.tight_layout()
-        out_dir = Path("feature_selection_multi")
-        out_dir.mkdir(parents=True, exist_ok=True)
-        fig.savefig(out_dir / "feature_selection_frequency.png", dpi=300)
+        dir_output = dir_output / Path("feature_selection")
+        fig.savefig(dir_output / "feature_selection_frequency.png", dpi=300)
         plt.close(fig)
         return None, X_out, combined

{geocif-0.2.24 → geocif-0.2.26}/geocif/ml/trainers.py RENAMED Viewed

@@ -268,7 +268,7 @@ def auto_train(
             loss_function = "MAPE" if model_type == "REGRESSION" else "MultiClass"
             bootstrap_type = "Bernoulli" if model_type == "CLASSIFICATION" else "MVS"
             hyperparams = {
-                "iterations": 2500,
+                "iterations": 1500,
                 "learning_rate": 0.025,
                 "depth": 6,
                 "subsample": 1.0,
@@ -278,7 +278,7 @@ def auto_train(
                 "loss_function": loss_function,
                 "early_stopping_rounds": 20,
                 "random_seed": seed,
-                "verbose": True,
+                "verbose": False,
             }
             if model_name == "catboost":

{geocif-0.2.24 → geocif-0.2.26/geocif.egg-info}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: geocif
-Version: 0.2.24
+Version: 0.2.26
 Summary: Models to visualize and forecast crop conditions and yields
 Home-page: https://ritviksahajpal.github.io/yield_forecasting/
 Author: Ritvik Sahajpal

{geocif-0.2.24 → geocif-0.2.26}/setup.py RENAMED Viewed

@@ -50,6 +50,6 @@ setup(
     test_suite="tests",
     tests_require=test_requirements,
     url="https://ritviksahajpal.github.io/yield_forecasting/",
-    version="0.2.24",
+    version="0.2.26",
     zip_safe=False,
 )