PyPI - geocif - Versions diffs - 0.2.59__tar.gz → 0.2.60__tar.gz - Mend

geocif 0.2.59.tar.gz → 0.2.60.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (89) hide show

{geocif-0.2.59/geocif.egg-info → geocif-0.2.60}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: geocif
-Version: 0.2.59
+Version: 0.2.60
 Summary: Models to visualize and forecast crop conditions and yields
 Home-page: https://ritviksahajpal.github.io/yield_forecasting/
 Author: Ritvik Sahajpal

{geocif-0.2.59 → geocif-0.2.60}/geocif/geocif.py RENAMED Viewed

@@ -249,140 +249,155 @@ class Geocif:
         Returns:
         """
-        if self.ml_model:
-            X_train = df_region[self.selected_features + self.cat_features]
+        X_train = df_region[self.selected_features + self.cat_features]
+        # Drop columns in X_train that have any NaNs, log the number of columns dropped
+        initial_columns = X_train.shape[1]
+        X_train = X_train.dropna(axis=1, how="any")
+        dropped_columns = initial_columns - X_train.shape[1]
+        if dropped_columns > 0:
+            self.logger.info(
+                f"Dropped {dropped_columns} columns with NaNs from X_train for {self.country} {self.crop}"
+            )
+        # Reset index of X_train
+        X_train.reset_index(drop=True, inplace=True)
+        region_id = df_region["Region_ID"].unique()[0]
+        X_train.to_csv(dir_output / f"X_train_{region_id}.csv", index=False)
+        if scaler:
+            X_train_nocat = X_train.drop(
+                columns=[
+                    item for item in self.cat_features if item != "Harvest Year"
+                ]
+            )
+            X_train_scaled = scaler.fit_transform(X_train_nocat)
+        else:
+            X_train_scaled = X_train
+        """ Train model """
+        self.best_hyperparams, self.model = trainers.auto_train(
+            self.cluster_strategy,
+            self.model_name,
+            self.model_type,
+            False,
+            "Harvest Year",
+            df_region[self.selected_features + self.cat_features + [self.target]],
+            X_train_scaled,
+            self.y_train,
+            feature_names=self.selected_features,
+            target_col=self.target_column,
+            optimize=self.optimize,
+            fraction_loocv=self.fraction_loocv,
+            cat_features=self.cat_features,
+        )
+        """ Estimate CI only if flag is True """
+        if self.estimate_ci:
+            if self.estimate_ci_for_all or self.forecast_season == self.today_year:
+                self.model = trainers.estimate_ci(
+                    self.model_type, self.model_name, self.model
+                )
-            region_id = df_region["Region_ID"].unique()[0]
-            X_train.to_csv(dir_output / f"X_train_{region_id}.csv", index=False)
-            if scaler:
-                X_train_nocat = X_train.drop(
+        try:
+            if self.model_name == "catboost":
+                self.model.fit(
+                    X_train,
+                    self.y_train,
+                    cat_features=self.cat_features,
+                    verbose=False,
+                )
+            elif self.model_name in ["tabpfn"]:
+                # Identify the column indices for cat_features in X_train
+                if self.cat_features is None:
+                    cat_feature_indices = []
+                cat_feature_indices = [X_train.columns.get_loc(col) for col in self.cat_features if
+                    col in X_train.columns]
+                self.model.fit(X_train, self.y_train, categorical_feature_indices=cat_feature_indices)
+            elif self.model_name in ["ngboost", "oblique"]:
+                X_train = X_train.drop(
                     columns=[
                         item for item in self.cat_features if item != "Harvest Year"
                     ]
                 )
-                X_train_scaled = scaler.fit_transform(X_train_nocat)
-            else:
-                X_train_scaled = X_train
-            """ Train model """
-            self.best_hyperparams, self.model = trainers.auto_train(
-                self.cluster_strategy,
-                self.model_name,
-                self.model_type,
-                False,
-                "Harvest Year",
-                df_region[self.selected_features + self.cat_features + [self.target]],
-                X_train_scaled,
-                self.y_train,
-                feature_names=self.selected_features,
-                target_col=self.target_column,
-                optimize=self.optimize,
-                fraction_loocv=self.fraction_loocv,
-                cat_features=self.cat_features,
-            )
-            """ Estimate CI only if flag is True """
-            if self.estimate_ci:
-                if self.estimate_ci_for_all or self.forecast_season == self.today_year:
-                    self.model = trainers.estimate_ci(
-                        self.model_type, self.model_name, self.model
-                    )
-            try:
-                if self.model_name == "catboost":
-                    self.model.fit(
-                        X_train,
-                        self.y_train,
-                        cat_features=self.cat_features,
-                        verbose=False,
-                    )
-                elif self.model_name in ["tabpfn"]:
-                    # Identify the column indices for cat_features in X_train
-                    if self.cat_features is None:
-                        cat_feature_indices = []
-                    cat_feature_indices = [X_train.columns.get_loc(col) for col in self.cat_features if
-                        col in X_train.columns]
-                    self.model.fit(X_train, self.y_train, categorical_feature_indices=cat_feature_indices)
-                elif self.model_name in ["ngboost", "oblique"]:
-                    X_train = X_train.drop(
-                        columns=[
-                            item for item in self.cat_features if item != "Harvest Year"
-                        ]
-                    )
+                self.model.fit(X_train, self.y_train)
+            elif self.model_name == "ydf":
+                # Combine X_train and y_train
+                df_train = pd.concat([X_train, self.y_train], axis=1)
-                    self.model.fit(X_train, self.y_train)
-                elif self.model_name == "ydf":
-                    # Combine X_train and y_train
-                    df_train = pd.concat([X_train, self.y_train], axis=1)
-                    self.model = self.model.train(df_train)
-                elif self.model_name == "geospaNN":
-                    self.model.fit(
-                        X_train,
-                        self.y_train,
-                        # callbacks=[TQDMCallback(self.best_hyperparams["iterations"])],
-                    )
-                elif self.model_name == "merf":
-                    Z_train = np.ones((len(X_train), 1))
-                    clusters_train = df_region["Region"]
-                    clusters_train.reset_index(drop=True, inplace=True)
-                    self.model.fit(
-                        X_train,
-                        Z_train,
-                        clusters_train.astype("object"),
-                        self.y_train.values,
-                    )
-                elif self.model_name == "linear":
-                    self.model.fit(X_train_scaled, self.y_train)
-                elif self.model_name == "gam":
-                    self.model.fit(X_train_scaled, self.y_train.values)
-                    self.best_hyperparams = {}
-                elif self.model_name in ["cubist"]:
-                    self.model.fit(X_train, self.y_train)
-                elif self.model_name in [
-                    "cumulative_1",
-                    "cumulative_2",
-                    "cumulative_3",
-                ]:
-                    from sklearn.preprocessing import StandardScaler, LabelEncoder
-                    if self.model_name == "cumulative_1":
-                        num_columns = 1
-                    elif self.model_name == "cumulative_2":
-                        num_columns = 2
-                    elif self.model_name == "cumulative_3":
-                        num_columns = 3
-                    # Standardize the numeric features
-                    scaler = StandardScaler()
-                    X_numeric = X_train.iloc[:, :num_columns]
-                    X_scaled_numeric = pd.DataFrame(
-                        scaler.fit_transform(X_numeric),
-                        columns=X_numeric.columns,
-                        index=X_train.index,
-                    )
+                self.model = self.model.train(df_train)
+            elif self.model_name == "geospaNN":
+                self.model.fit(
+                    X_train,
+                    self.y_train,
+                    # callbacks=[TQDMCallback(self.best_hyperparams["iterations"])],
+                )
+            elif self.model_name == "merf":
+                Z_train = np.ones((len(X_train), 1))
+                clusters_train = df_region["Region"]
+                clusters_train.reset_index(drop=True, inplace=True)
+                self.model.fit(
+                    X_train,
+                    Z_train,
+                    clusters_train.astype("object"),
+                    self.y_train.values,
+                )
+            elif self.model_name == "linear":
+                self.model.fit(X_train_scaled, self.y_train)
+            elif self.model_name == "gam":
+                self.model.fit(X_train_scaled, self.y_train.values)
+                self.best_hyperparams = {}
+            elif self.model_name in ["cubist"]:
+                self.model.fit(X_train, self.y_train)
+            elif self.model_name in [
+                "cumulative_1",
+                "cumulative_2",
+                "cumulative_3",
+            ]:
+                from sklearn.preprocessing import StandardScaler, LabelEncoder
-                    # Encode the Region as categorical
-                    le = LabelEncoder()
-                    X_region = pd.Series(
-                        le.fit_transform(X_train["Region"]),
-                        name="Region",
-                        index=X_train.index,
-                    )
+                if self.model_name == "cumulative_1":
+                    num_columns = 1
+                elif self.model_name == "cumulative_2":
+                    num_columns = 2
+                elif self.model_name == "cumulative_3":
+                    num_columns = 3
-                    # Combine scaled numeric features and encoded region
-                    X_train_scaled = pd.concat([X_scaled_numeric, X_region], axis=1)
+                # Standardize the numeric features
+                scaler = StandardScaler()
+                X_numeric = X_train.iloc[:, :num_columns]
+                X_scaled_numeric = pd.DataFrame(
+                    scaler.fit_transform(X_numeric),
+                    columns=X_numeric.columns,
+                    index=X_train.index,
+                )
-                    self.model.fit(X_train_scaled, self.y_train)
-                elif self.model_name in ["desreg"]:
-                    self.model.fit(X_train, self.y_train)
-            except Exception as e:
-                self.logger.error(
-                    f"Error fitting model for {self.country} {self.crop} {e}"
+                # Encode the Region as categorical
+                le = LabelEncoder()
+                X_region = pd.Series(
+                    le.fit_transform(X_train["Region"]),
+                    name="Region",
+                    index=X_train.index,
                 )
+                # Combine scaled numeric features and encoded region
+                X_train_scaled = pd.concat([X_scaled_numeric, X_region], axis=1)
+                self.model.fit(X_train_scaled, self.y_train)
+            elif self.model_name in ["desreg"]:
+                # Convert any string columns to categorical
+                # Fit the model
+                breakpoint()
+                self.model.fit(X_train, self.y_train)
+        except Exception as e:
+            self.logger.error(
+                f"Error fitting model for {self.country} {self.crop} {e}"
+            )
+            breakpoint()
     def predict(self, df_region, scaler=None):
         """
         Predict yield for the current stage
@@ -864,7 +879,8 @@ class Geocif:
             elif self.cluster_strategy in ["auto_detect", "single"]:
                 pbar.set_description(f"Fit/Predict for group {idx + 1}")
                 pbar.update()
-            self.train_model(df_region_train, dir_output, scaler)
+            if self.ml_model:
+                self.train_model(df_region_train, dir_output, scaler)
             """ Predict """
             if self.check_yield_trend:
@@ -1057,9 +1073,9 @@ class Geocif:
             df, self.all_seasons_with_yield, self.number_median_years, self.target
         )
-        df = fe.compute_user_median_statistics(df, range(2018, 2023))
+        df = fe.compute_user_median_statistics(df, range(2018, 2023), self.target)
-        df = fe.compute_user_median_statistics(df, range(2013, 2018))
+        df = fe.compute_user_median_statistics(df, range(2013, 2018), self.target)
         if self.median_area_as_feature:
             df = fe.compute_median_statistics(

{geocif-0.2.59 → geocif-0.2.60}/geocif/ml/trainers.py RENAMED Viewed

@@ -329,7 +329,21 @@ def auto_train(
             model_tabpfn = AutoTabPFNRegressor(max_time=600,
                                                # categorical_feature_indices=cat_feature_indices,
                                                ignore_pretraining_limits=True)
-            model = DESRegression(regressors_list=[model_catboost, model_tabpfn])
+            import ydf
+            templates = ydf.GradientBoostedTreesLearner.hyperparameter_templates()
+            task = ydf.Task.REGRESSION if model_type == "REGRESSION" else ydf.Task.CLASSIFICATION
+            model_ydf = ydf.GradientBoostedTreesLearner(
+                label=target_col, task=task,
+                growing_strategy='BEST_FIRST_GLOBAL',
+                categorical_algorithm='RANDOM',
+                split_axis='SPARSE_OBLIQUE',
+                sparse_oblique_normalization='MIN_MAX',
+                sparse_oblique_num_projections_exponent=2.0
+            )
+            hyperparams = templates["benchmark_rank1v1"]
+            model = DESRegression(regressors_list=[model_catboost, model_ydf])
         elif model_name == "ngboost":
             if model_type == "REGRESSION":
                 from ngboost import NGBRegressor

{geocif-0.2.59 → geocif-0.2.60/geocif.egg-info}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: geocif
-Version: 0.2.59
+Version: 0.2.60
 Summary: Models to visualize and forecast crop conditions and yields
 Home-page: https://ritviksahajpal.github.io/yield_forecasting/
 Author: Ritvik Sahajpal

{geocif-0.2.59 → geocif-0.2.60}/setup.py RENAMED Viewed

@@ -50,6 +50,6 @@ setup(
     test_suite="tests",
     tests_require=test_requirements,
     url="https://ritviksahajpal.github.io/yield_forecasting/",
-    version="0.2.59",
+    version="0.2.60",
     zip_safe=False,
 )