geocif 0.2.43__tar.gz → 0.2.45__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {geocif-0.2.43/geocif.egg-info → geocif-0.2.45}/PKG-INFO +1 -1
- {geocif-0.2.43 → geocif-0.2.45}/geocif/cei/indices.py +1 -1
- {geocif-0.2.43 → geocif-0.2.45}/geocif/geocif.py +11 -2
- {geocif-0.2.43 → geocif-0.2.45}/geocif/indices_runner.py +3 -3
- {geocif-0.2.43 → geocif-0.2.45}/geocif/ml/correlations.py +0 -1
- {geocif-0.2.43 → geocif-0.2.45}/geocif/ml/embedding.py +35 -44
- {geocif-0.2.43 → geocif-0.2.45}/geocif/ml/output.py +17 -0
- {geocif-0.2.43 → geocif-0.2.45}/geocif/ml/trainers.py +1 -8
- {geocif-0.2.43 → geocif-0.2.45}/geocif/mm.py +16 -0
- {geocif-0.2.43 → geocif-0.2.45/geocif.egg-info}/PKG-INFO +1 -1
- {geocif-0.2.43 → geocif-0.2.45}/setup.py +1 -1
- {geocif-0.2.43 → geocif-0.2.45}/LICENSE +0 -0
- {geocif-0.2.43 → geocif-0.2.45}/MANIFEST.in +0 -0
- {geocif-0.2.43 → geocif-0.2.45}/README.md +0 -0
- {geocif-0.2.43 → geocif-0.2.45}/geocif/__init__.py +0 -0
- {geocif-0.2.43 → geocif-0.2.45}/geocif/agmet/__init__.py +0 -0
- {geocif-0.2.43 → geocif-0.2.45}/geocif/agmet/geoagmet.py +0 -0
- {geocif-0.2.43 → geocif-0.2.45}/geocif/agmet/plot.py +0 -0
- {geocif-0.2.43 → geocif-0.2.45}/geocif/agmet/utils.py +0 -0
- {geocif-0.2.43 → geocif-0.2.45}/geocif/analysis.py +0 -0
- {geocif-0.2.43 → geocif-0.2.45}/geocif/backup/__init__.py +0 -0
- {geocif-0.2.43 → geocif-0.2.45}/geocif/backup/constants.py +0 -0
- {geocif-0.2.43 → geocif-0.2.45}/geocif/backup/features.py +0 -0
- {geocif-0.2.43 → geocif-0.2.45}/geocif/backup/geo.py +0 -0
- {geocif-0.2.43 → geocif-0.2.45}/geocif/backup/geocif.py +0 -0
- {geocif-0.2.43 → geocif-0.2.45}/geocif/backup/metadata.py +0 -0
- {geocif-0.2.43 → geocif-0.2.45}/geocif/backup/models.py +0 -0
- {geocif-0.2.43 → geocif-0.2.45}/geocif/cei/__init__.py +0 -0
- {geocif-0.2.43 → geocif-0.2.45}/geocif/cei/definitions.py +0 -0
- {geocif-0.2.43 → geocif-0.2.45}/geocif/experiments.py +0 -0
- {geocif-0.2.43 → geocif-0.2.45}/geocif/geocif_runner.py +0 -0
- {geocif-0.2.43 → geocif-0.2.45}/geocif/indices_runner_angola.py +0 -0
- {geocif-0.2.43 → geocif-0.2.45}/geocif/indices_runner_madagascar.py +0 -0
- {geocif-0.2.43 → geocif-0.2.45}/geocif/indices_runner_malawi.py +0 -0
- {geocif-0.2.43 → geocif-0.2.45}/geocif/indices_runner_mozambique.py +0 -0
- {geocif-0.2.43 → geocif-0.2.45}/geocif/indices_runner_south_africa.py +0 -0
- {geocif-0.2.43 → geocif-0.2.45}/geocif/indices_runner_zambia.py +0 -0
- {geocif-0.2.43 → geocif-0.2.45}/geocif/indices_runner_zimbabwe.py +0 -0
- {geocif-0.2.43 → geocif-0.2.45}/geocif/logger.py +0 -0
- {geocif-0.2.43 → geocif-0.2.45}/geocif/ml/__init__.py +0 -0
- {geocif-0.2.43 → geocif-0.2.45}/geocif/ml/feature_engineering.py +0 -0
- {geocif-0.2.43 → geocif-0.2.45}/geocif/ml/feature_selection.py +0 -0
- {geocif-0.2.43 → geocif-0.2.45}/geocif/ml/outliers.py +0 -0
- {geocif-0.2.43 → geocif-0.2.45}/geocif/ml/outlook.py +0 -0
- {geocif-0.2.43 → geocif-0.2.45}/geocif/ml/spatial_autocorrelation.py +0 -0
- {geocif-0.2.43 → geocif-0.2.45}/geocif/ml/stages.py +0 -0
- {geocif-0.2.43 → geocif-0.2.45}/geocif/ml/stats.py +0 -0
- {geocif-0.2.43 → geocif-0.2.45}/geocif/ml/trend.py +0 -0
- {geocif-0.2.43 → geocif-0.2.45}/geocif/ml/xai.py +0 -0
- {geocif-0.2.43 → geocif-0.2.45}/geocif/playground/__init__.py +0 -0
- {geocif-0.2.43 → geocif-0.2.45}/geocif/playground/aa.py +0 -0
- {geocif-0.2.43 → geocif-0.2.45}/geocif/playground/area.py +0 -0
- {geocif-0.2.43 → geocif-0.2.45}/geocif/playground/automl.py +0 -0
- {geocif-0.2.43 → geocif-0.2.45}/geocif/playground/download_esi.py +0 -0
- {geocif-0.2.43 → geocif-0.2.45}/geocif/playground/enso.py +0 -0
- {geocif-0.2.43 → geocif-0.2.45}/geocif/playground/eval.py +0 -0
- {geocif-0.2.43 → geocif-0.2.45}/geocif/playground/gamtest.py +0 -0
- {geocif-0.2.43 → geocif-0.2.45}/geocif/playground/gee_access.py +0 -0
- {geocif-0.2.43 → geocif-0.2.45}/geocif/playground/misc.py +0 -0
- {geocif-0.2.43 → geocif-0.2.45}/geocif/playground/play_xagg.py +0 -0
- {geocif-0.2.43 → geocif-0.2.45}/geocif/playground/reg.py +0 -0
- {geocif-0.2.43 → geocif-0.2.45}/geocif/playground/sustain.py +0 -0
- {geocif-0.2.43 → geocif-0.2.45}/geocif/playground/test_catboost.py +0 -0
- {geocif-0.2.43 → geocif-0.2.45}/geocif/playground/tmp.py +0 -0
- {geocif-0.2.43 → geocif-0.2.45}/geocif/playground/tmp2.py +0 -0
- {geocif-0.2.43 → geocif-0.2.45}/geocif/playground/tmp3.py +0 -0
- {geocif-0.2.43 → geocif-0.2.45}/geocif/playground/tmp4.py +0 -0
- {geocif-0.2.43 → geocif-0.2.45}/geocif/playground/tmp5.py +0 -0
- {geocif-0.2.43 → geocif-0.2.45}/geocif/playground/wolayita.py +0 -0
- {geocif-0.2.43 → geocif-0.2.45}/geocif/playground/wolayita_maize_mask.py +0 -0
- {geocif-0.2.43 → geocif-0.2.45}/geocif/playground/wolayita_v2.py +0 -0
- {geocif-0.2.43 → geocif-0.2.45}/geocif/playground/wolayita_v3.py +0 -0
- {geocif-0.2.43 → geocif-0.2.45}/geocif/risk/__init__.py +0 -0
- {geocif-0.2.43 → geocif-0.2.45}/geocif/risk/impact_assessment.py +0 -0
- {geocif-0.2.43 → geocif-0.2.45}/geocif/utils.py +0 -0
- {geocif-0.2.43 → geocif-0.2.45}/geocif/viz/__init__.py +0 -0
- {geocif-0.2.43 → geocif-0.2.45}/geocif/viz/gt.py +0 -0
- {geocif-0.2.43 → geocif-0.2.45}/geocif/viz/plot.py +0 -0
- {geocif-0.2.43 → geocif-0.2.45}/geocif/viz/tmp.py +0 -0
- {geocif-0.2.43 → geocif-0.2.45}/geocif/viz/viz_ml.py +0 -0
- {geocif-0.2.43 → geocif-0.2.45}/geocif.egg-info/SOURCES.txt +0 -0
- {geocif-0.2.43 → geocif-0.2.45}/geocif.egg-info/dependency_links.txt +0 -0
- {geocif-0.2.43 → geocif-0.2.45}/geocif.egg-info/not-zip-safe +0 -0
- {geocif-0.2.43 → geocif-0.2.45}/geocif.egg-info/top_level.txt +0 -0
- {geocif-0.2.43 → geocif-0.2.45}/requirements.txt +0 -0
- {geocif-0.2.43 → geocif-0.2.45}/setup.cfg +0 -0
- {geocif-0.2.43 → geocif-0.2.45}/tests/test_geocif.py +0 -0
geocif/cei/indices.py
@@ -67,7 +67,7 @@ def standardize_dataframe(df: pd.DataFrame, vi_var: str) -> pd.DataFrame:
     if "time" not in df.columns:
         # Use year + day of year if no time column
         df["time"] = pd.to_datetime(
-            df["year"].astype(str) + df["
+            df["year"].astype(str) + df["Doy"].astype(str),
             format="%Y%j"
         )
     else:
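The fix above completes the date construction: the year column is concatenated with the day-of-year column and parsed with `format="%Y%j"`. A minimal sketch of that parsing on a hypothetical frame (the column names mirror `year`/`Doy`; the `zfill(3)` padding is added here for clarity and is not part of the diff):

```python
# Hypothetical illustration of parsing year + day-of-year with "%Y%j".
import pandas as pd

df = pd.DataFrame({"year": [2021, 2021, 2022], "Doy": [32, 150, 7]})

# Zero-pad the day of year so every string is exactly 7 characters (%Y%j).
df["time"] = pd.to_datetime(
    df["year"].astype(str) + df["Doy"].astype(str).str.zfill(3),
    format="%Y%j",
)
print(df["time"].tolist())  # Timestamps: 2021-02-01, 2021-05-30, 2022-01-07
```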
geocif/geocif.py
@@ -328,8 +328,7 @@ class Geocif:
         elif self.model_name == "linear":
             self.model.fit(X_train_scaled, self.y_train)
         elif self.model_name == "gam":
-
-            self.model.fit(X_train_scaled.values, self.y_train.values)
+            self.model.fit(X_train_scaled, self.y_train.values)
             self.best_hyperparams = {}
         elif self.model_name in ["cubist"]:
             self.model.fit(X_train, self.y_train)
geocif/geocif.py
@@ -842,6 +841,14 @@ class Geocif:
             .dropna(axis=1, how="any")  # drop cols with any NA left
             .join(self.X_train[lag_cols])  # add lag-yield cols back untouched
         )
+        # Some models cannot handle any NaN values, so gapfill them
+        if self.model_name in ["gam", "linear"]:
+            for col in self.X_train.columns:
+                if self.X_train[col].isnull().any():
+                    breakpoint()
+                    median = self.X_train[col].median()
+                    self.X_train[col].fillna(median, inplace=True)
+
         self.y_train = df_region_train[self.target_column]
 
         self.apply_feature_selector(region, dir_output)
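The new block imputes missing values column by column with the median for model types (`gam`, `linear`) that cannot handle NaNs. A vectorized sketch of the same idea on a hypothetical frame (an illustration of median imputation, not the package's exact routine, which loops per column as shown in the hunk):

```python
# Hedged sketch: fill NaNs with per-column medians in one call.
import numpy as np
import pandas as pd

X_train = pd.DataFrame({
    "ndvi": [0.41, np.nan, 0.55, 0.62],
    "precip": [10.0, 12.5, np.nan, 9.0],
})

# fillna with a Series of medians fills each column by matching index labels.
X_train = X_train.fillna(X_train.median(numeric_only=True))
assert not X_train.isnull().any().any()
```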
geocif/geocif.py
@@ -1088,6 +1095,8 @@ class Geocif:
 
             # Region_ID should be type category
             df["Region_ID"] = df["Region_ID"].astype("category")
+        else:
+            raise ValueError(f"Unsupported cluster strategy {self.cluster_strategy}")
 
         return df
 
geocif/indices_runner.py
@@ -165,7 +165,7 @@ class cei_runner(base.BaseGeo):
         combinations = [
             i
             for i in combinations
-            if "
+            if "ethiopia" in i[3]
             # or "lesotho_maize" in i[3] or
             # # "namibia_" in i[2] or
             # "united_republic_of_tanzania_maize" in i[3]
geocif/indices_runner.py
@@ -174,13 +174,13 @@ class cei_runner(base.BaseGeo):
             # or "south_africa_maize" in i[3]
             # or "mozambique_maize" in i[3]
             # or "united_states_of_america" in i[3]
-            or "russian_federation" in i[3]
+            #or "russian_federation" in i[3]
             # or "ukraine" in i[3]
         ]
         # "malawi" in i[2]]
 
         if self.do_parallel:
-            num_cpu = int(cpu_count() * 0.
+            num_cpu = int(cpu_count() * 0.75)
             with Pool(num_cpu) as p:
                 for i, _ in enumerate(p.imap_unordered(indices.process, combinations)):
                     pass
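The second change sets the worker pool to roughly 75% of the available cores. A generic, self-contained sketch of the same `Pool`/`imap_unordered` pattern (the `process` function and `combinations` list below are stand-ins, not geocif's):

```python
# Stand-in example of draining work through a pool sized to ~75% of CPUs.
from multiprocessing import Pool, cpu_count

def process(combo):
    country, crop = combo
    return f"{country}-{crop}"

if __name__ == "__main__":
    combinations = [("ethiopia", "maize"), ("malawi", "maize"), ("zambia", "maize")]
    num_cpu = max(1, int(cpu_count() * 0.75))  # leave a few cores free
    with Pool(num_cpu) as p:
        for result in p.imap_unordered(process, combinations):
            pass  # results arrive as workers finish; order is not guaranteed
```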
geocif/ml/correlations.py
@@ -200,7 +200,6 @@ def _all_correlated_feature_by_time(df, **kwargs):
 
     # Only select columns that have been observed till the current stage
     pbar = tqdm(stages_features, total=len(stages_features), leave=False)
-
     for stage in pbar:
         pbar.set_description(f"Calculating correlations")
         pbar.update()
geocif/ml/embedding.py
@@ -3,6 +3,7 @@ from collections import Counter
 import numpy as np
 import pandas as pd
 from scipy.stats import pearsonr as pearsonr
+from tqdm import tqdm
 
 
 def extract_regions(X, y, regions=[]):
geocif/ml/embedding.py
@@ -104,57 +105,47 @@ def get_top_correlated_features(inputs, targets):
     return feature_by_region, counter
 
 
-def get_all_features_correlation(inputs
+def get_all_features_correlation(inputs: pd.DataFrame,
+                                 targets: pd.Series,
+                                 method: str) -> pd.DataFrame:
     """
-
-
-    :param targets: pd.Series, target data
-    :param method: str, method to use to find the top correlated features
+    Fast version – identical output, no length-mismatch on regions whose
+    feature names contain no spaces.
     """
-
-    for region_id in inputs["Region"].unique():
-        X, y = extract_regions(inputs, targets, regions=[region_id])
+    numeric_cols = inputs.select_dtypes(include=[np.number]).columns.tolist()
 
-
+    df_all = inputs[numeric_cols + ["Region"]].copy()
+    df_all["__target__"] = targets.values
 
-
-        feature_correlations = {
-            k: v for k, v in feature_correlations.items() if not np.isnan(v)
-        }
+    frames: list[pd.DataFrame] = []
 
-
+    for region_id, g in tqdm(df_all.groupby("Region", sort=False), leave=False):
+        corr = g[numeric_cols].corrwith(g["__target__"]).round(3).dropna()
+        if corr.empty:
             continue
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        )
-
-
-        cols = df_pivoted.columns.tolist()
-        cols = cols[-1:] + cols[:-1]
-        df_pivoted = df_pivoted[cols]
+        # ---- safe split: always two columns --------------------------------
+        split = (
+            pd.Series(corr.index)  # guarantees a Series
+            .str.split(" ", n=1, expand=True)
+        )
+        if split.shape[1] == 1:  # no spaces in any feature name
+            split[1] = ""  # match legacy behaviour
+        split.columns = [0, 1]  # make column labels predictable
+
+        df_region = (
+            pd.DataFrame({
+                "Metric": split[0].values,
+                method: split[1].values,
+                "Value": corr.values  # same length as above
+            })
+            .pivot_table(index=method, columns="Metric",
+                         values="Value", aggfunc="first")
+            .reset_index()
+        )
+        df_region.insert(0, "Region", region_id)
+        frames.append(df_region)
 
-
+    return pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
 
-    if len(frames):
-        feature_by_region = pd.concat(frames)
-    else:
-        feature_by_region = pd.DataFrame()
 
-    return feature_by_region
geocif/ml/output.py
@@ -6,6 +6,20 @@ import pandas as pd
 from geocif import utils
 
 
+def make_serializable(hparams):
+    serializable = hparams.copy()
+
+    # Convert callbacks to strings
+    if 'callbacks' in serializable:
+        serializable['callbacks'] = [str(cb) for cb in serializable['callbacks']]
+
+    # Convert terms to string
+    if 'terms' in serializable:
+        serializable['terms'] = str(serializable['terms'])
+
+    return serializable
+
+
 def config_to_dict(parser):
     """
     Reads a configuration file and returns the configuration as a nested dictionary.
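`make_serializable` stringifies the entries of a hyperparameter dict that a plain text store cannot hold directly, such as callback objects or pygam term objects. A small usage sketch with a hypothetical hyperparameter dict:

```python
# Hypothetical usage of the new helper before writing results out.
import json

def make_serializable(hparams):
    serializable = hparams.copy()
    if "callbacks" in serializable:
        serializable["callbacks"] = [str(cb) for cb in serializable["callbacks"]]
    if "terms" in serializable:
        serializable["terms"] = str(serializable["terms"])
    return serializable

class EarlyStopping:  # stand-in for a non-serializable callback object
    def __repr__(self):
        return "EarlyStopping(rounds=50)"

hparams = {"learning_rate": 0.05, "callbacks": [EarlyStopping()], "terms": object()}
print(json.dumps(make_serializable(hparams)))  # json.dumps(hparams) would raise TypeError
```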
geocif/ml/output.py
@@ -103,6 +117,9 @@ def store(db_path, experiment_id, df, model, model_name):
     for col in df.select_dtypes(include=["category"]).columns:
         df[col] = df[col].astype(str)
 
+    # Convert all columns to string
+    df['Best Hyperparameters'] = df['Best Hyperparameters'].apply(make_serializable)
+
     # Output results to database
     try:
         utils.to_db(db_path, experiment_id, df)
geocif/ml/trainers.py
@@ -350,14 +350,7 @@ def auto_train(
             "cumulative_2": s(0) + s(1) + te(0, 1) + f(2),
             "cumulative_3": s(0) + s(1) + s(2) + te(0, 1) + te(0, 2) + te(1, 2) + f(3),
         }
-
-        # Fill nans with medians
-        for col in X_train.columns:
-            if X_train[col].isnull().any():
-                median = X_train[col].median()
-                X_train[col].fillna(median, inplace=True)
-    except:
-        breakpoint()
+        breakpoint()
         formula = gam_formulas.get(model_name, gam_cls(n_splines=25, spline_order=3))
         model = gam_cls(n_splines=25, spline_order=3).gridsearch(X_train, y_train.values, lam=np.logspace(-3, 3, 11)) if model_name.startswith("gam") else formula
     elif model_name == "geospaNN":
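The retained line selects the GAM's smoothing penalty by grid search over `lam = np.logspace(-3, 3, 11)`. A standalone sketch of that pygam call on synthetic data (illustrative only; geocif passes its own `X_train`/`y_train` and model class):

```python
# Standalone sketch of pygam's gridsearch over the smoothing penalty lam.
import numpy as np
from pygam import LinearGAM

rng = np.random.default_rng(0)
X = rng.uniform(0, 10, size=(300, 1))
y = np.sin(X[:, 0]) + rng.normal(scale=0.2, size=300)

# Try 11 log-spaced lam values and keep the best-scoring model.
gam = LinearGAM(n_splines=25, spline_order=3).gridsearch(
    X, y, lam=np.logspace(-3, 3, 11)
)
gam.summary()
```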
geocif/mm.py
@@ -15,6 +15,22 @@
 
 import os
 
+from pygam import LinearGAM, GammaGAM, s
+import numpy as np, pandas as pd
+
+X = np.random.uniform(0, 5, 500)[:, None]
+y_pos = 2 * np.exp(0.3*X.squeeze()) + np.random.gamma(shape=2, scale=1, size=500)
+
+# Bad idea – LinearGAM on skewed positive data
+lin = LinearGAM(s(0)).fit(X, y_pos)
+
+# Appropriate – GammaGAM with log link
+gam = GammaGAM(terms, fit_intercept=True)
+
+print("LinearGAM R2:", lin.statistics_['pseudo_r2']['explained_deviance'])
+print("GammaGAM R2:", gam.statistics_['pseudo_r2']['explained_deviance'])
+
+breakpoint()
 # Set R_HOME environment variable before importing rpy2
 os.environ["R_HOME"] = f"{os.environ['CONDA_PREFIX']}\Lib\R"
 
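The block added to `mm.py` sketches a comparison between `LinearGAM` and `GammaGAM` on skewed positive data, but as committed it references an undefined `terms` object and never fits the `GammaGAM`, so the second `statistics_` lookup would fail before the `breakpoint()`. A hedged, runnable variant of the same comparison (synthetic data; an `s(0)` spline term is assumed in place of `terms`):

```python
# Runnable variant of the mm.py comparison, with an assumed s(0) term
# and both models actually fitted before their statistics are read.
import numpy as np
from pygam import LinearGAM, GammaGAM, s

rng = np.random.default_rng(42)
X = rng.uniform(0, 5, 500)[:, None]
# Skewed, strictly positive response
y_pos = 2 * np.exp(0.3 * X.squeeze()) + rng.gamma(shape=2.0, scale=1.0, size=500)

lin = LinearGAM(s(0)).fit(X, y_pos)   # identity link, Gaussian errors
gam = GammaGAM(s(0)).fit(X, y_pos)    # log link, Gamma errors

print("LinearGAM explained deviance:", lin.statistics_["pseudo_r2"]["explained_deviance"])
print("GammaGAM  explained deviance:", gam.statistics_["pseudo_r2"]["explained_deviance"])
```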