PyPI - geocif - Versions diffs - 0.2.2__tar.gz → 0.2.4__tar.gz - Mend

geocif 0.2.2__tar.gz → 0.2.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (88) hide show

{geocif-0.2.2/geocif.egg-info → geocif-0.2.4}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: geocif
-Version: 0.2.2
+Version: 0.2.4
 Summary: Models to visualize and forecast crop conditions and yields
 Home-page: https://ritviksahajpal.github.io/yield_forecasting/
 Author: Ritvik Sahajpal

{geocif-0.2.2 → geocif-0.2.4}/geocif/cei/indices.py RENAMED Viewed

@@ -94,7 +94,7 @@ def standardize_dataframe(df: pd.DataFrame, vi_var: str) -> pd.DataFrame:
         if df[vi_var].max() > 1:
             df[vi_var] = (df[vi_var] - 50) / 200
-    # Exclude seasons before 2001 if that’s your logic
+    # HACK Exclude seasons before 2001
     df = df[df["Season"] >= 2001]
     return df
@@ -507,7 +507,7 @@ class CEIs:
         if not self.redo:
             # If harvest_year is older than last year and file exists, skip
             if (self.harvest_year < (current_year - 1)) and cei_file.is_file():
-                logger.info("CEI file exists and year is old. Skipping: %s", cei_file)
+                logger.info(f"CEI file exists, skipping: {cei_file}")
                 return None
         return intermediate_file

{geocif-0.2.2 → geocif-0.2.4}/geocif/geocif.py RENAMED Viewed

@@ -179,6 +179,13 @@ class Geocif:
             "Production (tn)",
         ]
+        if self.model_type == "REGRESSION":
+            self.target_column = (
+                f"Detrended {self.target}" if self.check_yield_trend else self.target
+            )
+        elif self.model_type == "CLASSIFICATION":
+            self.target_column = self.target_class
         self.combined_dict = {
             **di.dict_indices,
             **di.dict_ndvi,
@@ -204,65 +211,47 @@ class Geocif:
         # obj_pickle = outlook.Outlook(self.pickle_file)
         # self.df_outlook = obj_pickle.read_outlook_file()
-    def train(self, df_region, scaler=None):
+    def apply_feature_selector(self, region, dir_output):
+        if self.model_name in ["cumulative_1", "cumulative_2", "cumulative_3"]:
+            all_features = self.X_train.columns
+            # Select the columns with use_ceis in it
+            self.selected_features = [
+                column
+                for column in all_features
+                if any(cei in column for cei in self.use_ceis)
+            ]
+        else:
+            self.logger.info(f"Selecting features for {self.country} {self.crop}")
+            selector, _, self.selected_features = fs.select_features(
+                self.X_train,
+                self.y_train,
+                method=self.feature_selection,
+                dir_output=dir_output,
+                region=region
+            )
+            self.logger.info(f"Selected features: {self.selected_features}")
+        """ Update model to include conformal estimates """
+        if "lat" not in self.selected_features and self.include_lat_lon_as_feature:
+            self.selected_features.append("lat")
+        if "lon" not in self.selected_features and self.include_lat_lon_as_feature:
+            self.selected_features.append("lon")
+    def train_model(self, df_region, dir_output, scaler=None):
         """
         Args:
             df_region:
+            dir_output:
             scaler:
         Returns:
         """
-        """ Perform feature selection """
-        if self.model_type == "REGRESSION":
-            target_column = (
-                f"Detrended {self.target}" if self.check_yield_trend else self.target
-            )
-        elif self.model_type == "CLASSIFICATION":
-            target_column = self.target_class
-        # Drop rows where target_column is NaN
-        df_region = df_region.dropna(subset=[target_column])
-        X_train = df_region[self.feature_names]
-        # Drop any columns with NaNs
-        X_train = X_train.dropna(axis=1, how="any")
-        y_train = df_region[target_column]
         if self.ml_model:
-            if self.model_name in ["cumulative_1", "cumulative_2", "cumulative_3"]:
-                all_features = X_train.columns
-                # Select the columns with use_ceis in it
-                self.selected_features = [
-                    column
-                    for column in all_features
-                    if any(cei in column for cei in self.use_ceis)
-                ]
-            else:
-                self.logger.info(f"Selecting features for {self.country} {self.crop}")
-                selector, _, self.selected_features = fs.select_features(
-                    X_train, y_train, method=self.feature_selection
-                )
-                self.logger.info(f"Selected features: {self.selected_features}")
-            """ Update model to include conformal estimates """
-            if "lat" not in self.selected_features and self.include_lat_lon_as_feature:
-                self.selected_features.append("lat")
-            if "lon" not in self.selected_features and self.include_lat_lon_as_feature:
-                self.selected_features.append("lon")
             X_train = df_region[self.selected_features + self.cat_features]
-            dir_output = (
-                self.dir_analysis
-                / self.country
-                / self.crop
-                / self.model_name
-                / str(self.forecast_season)
-            )
             region_id = df_region["Region_ID"].unique()[0]
             X_train.to_csv(dir_output / f"X_train_{region_id}.csv", index=False)
             if scaler:
@@ -284,9 +273,9 @@ class Geocif:
                 "Harvest Year",
                 df_region[self.selected_features + self.cat_features + [self.target]],
                 X_train_scaled,
-                y_train,
+                self.y_train,
                 feature_names=self.selected_features,
-                target_col=target_column,
+                target_col=self.target_column,
                 optimize=self.optimize,
                 fraction_loocv=self.fraction_loocv,
                 cat_features=self.cat_features,
@@ -303,9 +292,9 @@ class Geocif:
                 if self.model_name == "catboost":
                     self.model.fit(
                         X_train,
-                        y_train,
+                        self.y_train,
                         cat_features=self.cat_features,
-                        verbose=True,
+                        verbose=False,
                     )
                 elif self.model_name in ["ngboost", "oblique", "tabpfn"]:
                     X_train = X_train.drop(
@@ -313,16 +302,16 @@ class Geocif:
                             item for item in self.cat_features if item != "Harvest Year"
                         ]
                     )
-                    self.model.fit(X_train, y_train)
+                    self.model.fit(X_train, self.y_train)
                 elif self.model_name == "ydf":
                     # Combine X_train and y_train
-                    df_train = pd.concat([X_train, y_train], axis=1)
+                    df_train = pd.concat([X_train, self.y_train], axis=1)
                     self.model = self.model.train(df_train)
                 elif self.model_name == "geospaNN":
                     self.model.fit(
                         X_train,
-                        y_train,
+                        self.y_train,
                         # callbacks=[TQDMCallback(self.best_hyperparams["iterations"])],
                     )
                 elif self.model_name == "merf":
@@ -334,15 +323,15 @@ class Geocif:
                         X_train,
                         Z_train,
                         clusters_train.astype("object"),
-                        y_train.values,
+                        self.y_train.values,
                     )
                 elif self.model_name == "linear":
-                    self.model.fit(X_train_scaled, y_train)
+                    self.model.fit(X_train_scaled, self.y_train)
                 elif self.model_name == "gam":
-                    self.model.fit(X_train_scaled.values, y_train.values)
+                    self.model.fit(X_train_scaled, self.y_train.values)
                     self.best_hyperparams = {}
                 elif self.model_name in ["cubist"]:
-                    self.model.fit(X_train, y_train)
+                    self.model.fit(X_train, self.y_train)
                 elif self.model_name in [
                     "cumulative_1",
                     "cumulative_2",
@@ -377,7 +366,7 @@ class Geocif:
                     # Combine scaled numeric features and encoded region
                     X_train_scaled = pd.concat([X_scaled_numeric, X_region], axis=1)
-                    self.model.fit(X_train_scaled, y_train)
+                    self.model.fit(X_train_scaled, self.y_train)
             except Exception as e:
                 self.logger.error(
                     f"Error fitting model for {self.country} {self.crop} {e}"
@@ -782,6 +771,14 @@ class Geocif:
         Returns:
         """
+        dir_output = (
+            self.dir_analysis
+            / self.country
+            / self.crop
+            / self.model_name
+            / str(self.forecast_season)
+        )
         from sklearn.preprocessing import StandardScaler
         scaler = StandardScaler() if self.model_name in ["linear", "gam"] else None
@@ -789,7 +786,7 @@ class Geocif:
         """ Train, Predict, Explain and Store results for each region """
         pbar = tqdm(self.df_train["Region_ID"].unique(), leave=False)
         for idx, region in enumerate(pbar):
-            if self.model_name in ["linear", "gam"]:
+            if self.model_name in ["linear"]:
                 self.create_feature_names(stages, dict_best_cei[region][0:3].tolist())
             elif self.model_name in ["cumulative_1", "cumulative_2", "cumulative_3"]:
                 self.create_feature_names(stages, {})
@@ -807,16 +804,6 @@ class Geocif:
             mask_train = self.df_train["Region_ID"] == region
             mask_test = self.df_test["Region_ID"] == region
-            num_regions_in_cluster = self.df_train[mask_train]["Region"].unique()
-            if self.cluster_strategy == "individual":
-                region_name = self.df_train["Region"].unique()[idx]
-                pbar.set_description(f"Fit/Predict for {region_name}")
-                pbar.update()
-            elif self.cluster_strategy in ["auto_detect", "single"]:
-                pbar.set_description(f"Fit/Predict for group {idx + 1}")
-                pbar.update()
             common_columns = (
                 [self.target, self.target_class]
                 + self.statistics_columns
@@ -836,12 +823,43 @@ class Geocif:
             if self.last_year_yield_as_feature:
                 common_columns += [f"Last Year {self.target}"]
-            """ Train """
+            """ Feature selection and then Train """
             # Filter dataframe based on region and self.feature_names
             df_region_train = self.df_train[mask_train]
             df_region_train = df_region_train[self.fixed_columns + common_columns]
             df_region_train.reset_index(drop=True, inplace=True)
-            self.train(df_region_train, scaler)
+            df_region_train = df_region_train.dropna(subset=[self.target_column])
+            self.X_train = df_region_train[self.feature_names]
+            # Drop any columns with NaNs except the lag yield columns
+            lag_prefix = "t -"
+            lag_cols = [c for c in self.X_train.columns if c.startswith(lag_prefix)]
+            self.X_train = (
+                self.X_train
+                .drop(columns=lag_cols)  # temporarily remove the lag-yield cols
+                .dropna(axis=1, how="any")  # drop cols with any NA left
+                .join(self.X_train[lag_cols])  # add lag-yield cols back untouched
+            )
+            # Some models cannot handle any NaN values, so gapfill them
+            if self.model_name in ["gam", "linear"]:
+                for col in self.X_train.columns:
+                    if self.X_train[col].isnull().any():
+                        median = self.X_train[col].median()
+                        self.X_train[col].fillna(median, inplace=True)
+            self.y_train = df_region_train[self.target_column]
+            self.apply_feature_selector(region, dir_output)
+            if self.cluster_strategy == "individual":
+                region_name = self.df_train["Region"].unique()[idx]
+                pbar.set_description(f"Fit/Predict for {region_name}")
+                pbar.update()
+            elif self.cluster_strategy in ["auto_detect", "single"]:
+                pbar.set_description(f"Fit/Predict for group {idx + 1}")
+                pbar.update()
+            self.train_model(df_region_train, dir_output, scaler)
             """ Predict """
             if self.check_yield_trend:
@@ -1040,17 +1058,27 @@ class Geocif:
         if self.median_area_as_feature:
             df = fe.compute_median_statistics(
-                df, self.all_seasons_with_yield, self.number_median_years, "Area (ha)"
+                df,
+                self.all_seasons_with_yield,
+                self.number_median_years,
+                "Area (ha)"
             )
         if self.lag_yield_as_feature:
             df = fe.compute_lag_yield(
-                df, self.all_seasons_with_yield, self.number_lag_years, self.target
+                df,
+                self.all_seasons_with_yield,
+                self.forecast_season,
+                self.number_lag_years,
+                self.target
             )
         if self.analogous_year_yield_as_feature:
             df = fe.compute_analogous_yield(
-                df, self.all_seasons_with_yield, self.number_median_years, self.target
+                df,
+                self.all_seasons_with_yield,
+                self.number_median_years,
+                self.target
             )
         # Create Region_ID column based on Region column category code
@@ -1066,6 +1094,8 @@ class Geocif:
             # Region_ID should be type category
             df["Region_ID"] = df["Region_ID"].astype("category")
+        else:
+            raise ValueError(f"Unsupported cluster strategy {self.cluster_strategy}")
         return df
@@ -1247,7 +1277,10 @@ class Geocif:
                 )
                 pbar.update()
-                self.loop_ml(stage, dict_selected_features, dict_best_cei)
+                try:
+                    self.loop_ml(stage, dict_selected_features, dict_best_cei)
+                except Exception as e:
+                    self.logger.error(e)
         wandb.finish()
     def setup(self, forecast_season, model):

{geocif-0.2.2 → geocif-0.2.4}/geocif/indices_runner.py RENAMED Viewed

@@ -165,7 +165,7 @@ class cei_runner(base.BaseGeo):
         combinations = [
             i
             for i in combinations
-            if "ukraine" in i[3]
+            if "ethiopia" in i[3]
             # or "lesotho_maize" in i[3] or
             # #   "namibia_" in i[2] or
             # "united_republic_of_tanzania_maize" in i[3]
@@ -174,13 +174,13 @@ class cei_runner(base.BaseGeo):
             # or "south_africa_maize" in i[3]
             # or "mozambique_maize" in i[3]
             # or "united_states_of_america" in i[3]
-            or "russian_federation" in i[3]
+            #or "russian_federation" in i[3]
             # or "ukraine" in i[3]
         ]
         #                 "malawi" in i[2]]
         if self.do_parallel:
-            num_cpu = int(cpu_count() * 0.6)
+            num_cpu = int(cpu_count() * 0.75)
             with Pool(num_cpu) as p:
                 for i, _ in enumerate(p.imap_unordered(indices.process, combinations)):
                     pass

{geocif-0.2.2 → geocif-0.2.4}/geocif/ml/correlations.py RENAMED Viewed

@@ -3,7 +3,6 @@ import os
 import matplotlib.pyplot as plt
 import palettable as pal
 import pandas as pd
-import seaborn as sns
 from tqdm import tqdm
 from geocif import utils
@@ -68,6 +67,8 @@ def most_correlated_feature_by_time(df_train, simulation_stages, target_col):
 def plot_feature_corr_by_time(df, **kwargs):
+    import seaborn as sns
     country = kwargs.get("country")
     crop = kwargs.get("crop")
     dir_output = kwargs.get("dir_output")
@@ -295,16 +296,13 @@ def all_correlated_feature_by_time(df, **kwargs):
                     df_tmp2.loc[idx, "Type"] = combined_dict[row[0]][0]
                 # Compute median of each CEI and sort the dataframe based on the absolute value of the median
-                try:
-                    dict_best_cei[region_id] = (
-                        df_tmp2.groupby("Type")
-                        .max()
-                        .reset_index()
-                        .sort_values("Value", ascending=False)["Metric"]
-                        .values
-                    )
-                except:
-                    breakpoint()
+                dict_best_cei[region_id] = (
+                    df_tmp2.groupby("Type")
+                    .max()
+                    .reset_index()
+                    .sort_values("Value", ascending=False)["Metric"]
+                    .values
+                )
                 kwargs["region_id"] = region_id
                 _region_names = ", ".join([str(x) for x in group['Region'].unique()])

{geocif-0.2.2 → geocif-0.2.4}/geocif/ml/embedding.py RENAMED Viewed

@@ -3,6 +3,7 @@ from collections import Counter
 import numpy as np
 import pandas as pd
 from scipy.stats import pearsonr as pearsonr
+from tqdm import tqdm
 def extract_regions(X, y, regions=[]):
@@ -32,10 +33,7 @@ def _compute_correlations(X, y):
         f_series = X[feature]
         # Ignore NaN values in either y or f_series
-        try:
-            mask = ~(np.isnan(y) | np.isnan(f_series))
-        except:
-            breakpoint()
+        mask = ~(np.isnan(y) | np.isnan(f_series))
         y_filtered = y[mask]
         f_series_filtered = f_series[mask]
@@ -107,57 +105,47 @@ def get_top_correlated_features(inputs, targets):
     return feature_by_region, counter
-def get_all_features_correlation(inputs, targets, method):
+def get_all_features_correlation(inputs: pd.DataFrame,
+                                 targets: pd.Series,
+                                 method: str) -> pd.DataFrame:
     """
-    Get the top correlated features for each region
-    :param inputs: pd.DataFrame, input data
-    :param targets: pd.Series, target data
-    :param method: str, method to use to find the top correlated features
+    Fast version – identical output, no length-mismatch on regions whose
+    feature names contain no spaces.
     """
-    frames = []
-    for region_id in inputs["Region"].unique():
-        X, y = extract_regions(inputs, targets, regions=[region_id])
+    numeric_cols = inputs.select_dtypes(include=[np.number]).columns.tolist()
-        feature_correlations = _compute_correlations(X, y)
+    df_all = inputs[numeric_cols + ["Region"]].copy()
+    df_all["__target__"] = targets.values
-        # Exclude any nan values
-        feature_correlations = {
-            k: v for k, v in feature_correlations.items() if not np.isnan(v)
-        }
+    frames: list[pd.DataFrame] = []
-        if not feature_correlations:
+    for region_id, g in tqdm(df_all.groupby("Region", sort=False), leave=False):
+        corr = g[numeric_cols].corrwith(g["__target__"]).round(3).dropna()
+        if corr.empty:
             continue
-        split_keys = []
-        for key in feature_correlations.keys():
-            parts = key.split(" ")
-            cei = parts[0]
-            time_period = " ".join(parts[1:])
-            split_keys.append([cei, time_period])
-        # split_keys = [key.rsplit("_", 1) for key in feature_correlations.keys()]
-        values = list(feature_correlations.values())
-        # Creating a DataFrame
-        df = pd.DataFrame(split_keys, columns=["Metric", method])
-        df["Value"] = values
-        # Pivot the DataFrame so each metric becomes a column name and include the year as a separate column
-        df_pivoted = df.pivot_table(
-            index=method, columns="Metric", values="Value", aggfunc="first"
-        ).reset_index()
-        df_pivoted["Region"] = region_id
-        # Move the 'Region' column to the front
-        cols = df_pivoted.columns.tolist()
-        cols = cols[-1:] + cols[:-1]
-        df_pivoted = df_pivoted[cols]
+        # ---- safe split: always two columns --------------------------------
+        split = (
+            pd.Series(corr.index)            # guarantees a Series
+              .str.split(" ", n=1, expand=True)
+        )
+        if split.shape[1] == 1:              # no spaces in any feature name
+            split[1] = ""                    # match legacy behaviour
+        split.columns = [0, 1]               # make column labels predictable
+        df_region = (
+            pd.DataFrame({
+                "Metric": split[0].values,
+                method:  split[1].values,
+                "Value": corr.values         # same length as above
+            })
+            .pivot_table(index=method, columns="Metric",
+                         values="Value", aggfunc="first")
+            .reset_index()
+        )
+        df_region.insert(0, "Region", region_id)
+        frames.append(df_region)
-        frames.append(df_pivoted)
+    return pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
-    if len(frames):
-        feature_by_region = pd.concat(frames)
-    else:
-        feature_by_region = pd.DataFrame()
-    return feature_by_region

{geocif-0.2.2 → geocif-0.2.4}/geocif/ml/feature_engineering.py RENAMED Viewed

@@ -39,23 +39,32 @@ def compute_last_year_yield(df, target_col="Yield (tn per ha)"):
     return df
-def compute_closest_years(all_years, harvest_year, number_lag_years):
+def compute_closest_years(all_years, harvest_year, number_lag_years, only_historic=False):
     """
     Finds the historical years closest to a given harvest year,
-    excluding any future year (harvest_year itself and beyond).
+    excluding any future year (harvest_year itself and beyond) based on the only_historic flag.
     Args:
         all_years (array-like): List or array of all years to consider.
         harvest_year (int): The year from which to compute distance.
         number_lag_years (int): Number of closest years to return.
+        only_historic (bool): If True, only consider years before the harvest year.
     Returns:
         list: The historical years closest to the given harvest year.
               Returns an empty list if no historical years exist.
     """
     # Exclude the harvest year before computation to simplify logic
-    filtered_years = [year for year in all_years if year != harvest_year]
+    if only_historic:
+        filtered_years = [year for year in all_years if year < harvest_year]
+    else:
+        filtered_years = [year for year in all_years if year != harvest_year]
+    # If no historical years exist, return an empty list
+    if not filtered_years:
+        return []
+    # Sort the years based on their absolute difference from the harvest year
     closest_years = np.array(filtered_years)[
         np.argsort(np.abs(np.array(filtered_years) - harvest_year))[:number_lag_years]
     ]
@@ -150,7 +159,7 @@ def compute_user_median_statistics(df, user_years, target_col="Yield (tn per ha)
 def compute_lag_yield(
-    df, all_seasons_with_yield, number_lag_years, target_col="Yield (tn per ha)"
+    df, all_seasons_with_yield, forecast_season, number_lag_years, target_col="Yield (tn per ha)"
 ):
     # For the number of years specified in self.number_lag_years, add the yield of that number of years
     # ago to the dataframe
@@ -169,7 +178,7 @@ def compute_lag_yield(
         for harvest_year in unique_years:
             closest_years = compute_closest_years(
-                all_seasons_with_yield, harvest_year, number_lag_years
+                all_seasons_with_yield, harvest_year, number_lag_years, only_historic=True
             )
             # For each year in the closest years, add the yield to the dataframe as a new column

geocif 0.2.2__tar.gz → 0.2.4__tar.gz

geocif 0.2.2__tar.gz → 0.2.4__tar.gz