geocif 0.1.29__tar.gz → 0.1.31__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {geocif-0.1.29/geocif.egg-info → geocif-0.1.31}/PKG-INFO +1 -1
- {geocif-0.1.29 → geocif-0.1.31}/geocif/geocif.py +39 -23
- {geocif-0.1.29 → geocif-0.1.31}/geocif/indices_runner.py +1 -1
- {geocif-0.1.29 → geocif-0.1.31}/geocif/ml/correlations.py +29 -10
- {geocif-0.1.29 → geocif-0.1.31}/geocif/ml/embedding.py +6 -2
- {geocif-0.1.29 → geocif-0.1.31}/geocif/ml/feature_engineering.py +66 -10
- {geocif-0.1.29 → geocif-0.1.31}/geocif/ml/feature_selection.py +21 -9
- {geocif-0.1.29 → geocif-0.1.31}/geocif/ml/outliers.py +30 -14
- geocif-0.1.31/geocif/ml/spatial_autocorrelation.py +224 -0
- {geocif-0.1.29 → geocif-0.1.31}/geocif/ml/stages.py +12 -4
- {geocif-0.1.29 → geocif-0.1.31}/geocif/ml/stats.py +72 -25
- {geocif-0.1.29 → geocif-0.1.31}/geocif/ml/trainers.py +9 -3
- {geocif-0.1.29 → geocif-0.1.31}/geocif/ml/trend.py +3 -1
- {geocif-0.1.29 → geocif-0.1.31/geocif.egg-info}/PKG-INFO +1 -1
- {geocif-0.1.29 → geocif-0.1.31}/geocif.egg-info/SOURCES.txt +1 -0
- {geocif-0.1.29 → geocif-0.1.31}/setup.py +1 -1
- {geocif-0.1.29 → geocif-0.1.31}/LICENSE +0 -0
- {geocif-0.1.29 → geocif-0.1.31}/MANIFEST.in +0 -0
- {geocif-0.1.29 → geocif-0.1.31}/README.md +0 -0
- {geocif-0.1.29 → geocif-0.1.31}/geocif/__init__.py +0 -0
- {geocif-0.1.29 → geocif-0.1.31}/geocif/agmet/__init__.py +0 -0
- {geocif-0.1.29 → geocif-0.1.31}/geocif/agmet/geoagmet.py +0 -0
- {geocif-0.1.29 → geocif-0.1.31}/geocif/agmet/plot.py +0 -0
- {geocif-0.1.29 → geocif-0.1.31}/geocif/agmet/utils.py +0 -0
- {geocif-0.1.29 → geocif-0.1.31}/geocif/analysis.py +0 -0
- {geocif-0.1.29 → geocif-0.1.31}/geocif/backup/__init__.py +0 -0
- {geocif-0.1.29 → geocif-0.1.31}/geocif/backup/constants.py +0 -0
- {geocif-0.1.29 → geocif-0.1.31}/geocif/backup/features.py +0 -0
- {geocif-0.1.29 → geocif-0.1.31}/geocif/backup/geo.py +0 -0
- {geocif-0.1.29 → geocif-0.1.31}/geocif/backup/geocif.py +0 -0
- {geocif-0.1.29 → geocif-0.1.31}/geocif/backup/metadata.py +0 -0
- {geocif-0.1.29 → geocif-0.1.31}/geocif/backup/models.py +0 -0
- {geocif-0.1.29 → geocif-0.1.31}/geocif/cei/__init__.py +0 -0
- {geocif-0.1.29 → geocif-0.1.31}/geocif/cei/definitions.py +0 -0
- {geocif-0.1.29 → geocif-0.1.31}/geocif/cei/indices.py +0 -0
- {geocif-0.1.29 → geocif-0.1.31}/geocif/logger.py +0 -0
- {geocif-0.1.29 → geocif-0.1.31}/geocif/ml/__init__.py +0 -0
- {geocif-0.1.29 → geocif-0.1.31}/geocif/ml/outlook.py +0 -0
- {geocif-0.1.29 → geocif-0.1.31}/geocif/ml/output.py +0 -0
- {geocif-0.1.29 → geocif-0.1.31}/geocif/ml/xai.py +0 -0
- {geocif-0.1.29 → geocif-0.1.31}/geocif/playground/__init__.py +0 -0
- {geocif-0.1.29 → geocif-0.1.31}/geocif/playground/automl.py +0 -0
- {geocif-0.1.29 → geocif-0.1.31}/geocif/playground/misc.py +0 -0
- {geocif-0.1.29 → geocif-0.1.31}/geocif/utils.py +0 -0
- {geocif-0.1.29 → geocif-0.1.31}/geocif/viz/__init__.py +0 -0
- {geocif-0.1.29 → geocif-0.1.31}/geocif/viz/plot.py +0 -0
- {geocif-0.1.29 → geocif-0.1.31}/geocif.egg-info/dependency_links.txt +0 -0
- {geocif-0.1.29 → geocif-0.1.31}/geocif.egg-info/not-zip-safe +0 -0
- {geocif-0.1.29 → geocif-0.1.31}/geocif.egg-info/top_level.txt +0 -0
- {geocif-0.1.29 → geocif-0.1.31}/requirements.txt +0 -0
- {geocif-0.1.29 → geocif-0.1.31}/setup.cfg +0 -0
- {geocif-0.1.29 → geocif-0.1.31}/tests/test_geocif.py +0 -0
geocif/geocif.py

@@ -17,6 +17,7 @@ from tqdm import tqdm
 from geocif import logger as log
 from .cei import definitions as di
 from .ml import correlations
+from .ml import spatial_autocorrelation as sa
 from .ml import feature_engineering as fe
 from .ml import feature_selection as fs
 from .ml import output

@@ -112,6 +113,10 @@ class Geocif:
         self.analogous_year_yield_as_feature = self.parser.getboolean(
             "ML", "analogous_year_yield_as_feature"
         )
+        self.spatial_autocorrelation = self.parser.getboolean(
+            "ML", "spatial_autocorrelation"
+        )
+        self.sa_method = self.parser.get("ML", "sa_method")
         self.last_year_yield_as_feature = self.parser.getboolean(
             "ML", "last_year_yield_as_feature"
         )

@@ -350,6 +355,8 @@ class Geocif:
         experiment_id = f"{self.country}_{self.crop}"
         now = ar.utcnow().to("America/New_York").format("MMMM-DD-YYYY HH:mm:ss")
         selected_features = self.selected_features + self.cat_features
+        # Compute percentage difference between y_pred and y_test
+        ape = np.abs((y_pred - y_test) / y_test) * 100
         df = pd.DataFrame(
             {
                 "Experiment_ID": np.full(shp, experiment_id),

@@ -367,12 +374,13 @@ class Geocif:
                 "Starting Stage": np.full(shp, self.stage_info["Starting Stage"]),
                 "Ending Stage": np.full(shp, self.stage_info["Ending Stage"]),
                 "Model": np.full(shp, self.model_name),
-                "Area (ha)": df_region["Area (ha)"].values,
                 "Region_ID": df_region["Region_ID"].values,
                 "Region": df_region["Region"].values,
                 "Harvest Year": df_region["Harvest Year"].values,
+                "Area (ha)": df_region["Area (ha)"].values,
                 f"Observed {self.target}": np.around(y_test, 3).ravel(),
                 f"Predicted {self.target}": np.around(y_pred, 3).ravel(),
+                f"APE": np.around(ape, 3).ravel(),
             }
         )

@@ -412,6 +420,12 @@ class Geocif:
         except:
             breakpoint()

+        # if self.spatial_autocorrelation:
+        #     # Compute spatial autocorrelation
+        #     df = sa.compute_spatial_autocorrelation(
+        #         self.dg_country
+        #     )
+
         for col in [
             f"Median {self.target}",
             "Analogous Year",

@@ -425,7 +439,7 @@ class Geocif:
         # Create an index based on following columns
         index_columns = [
             "Model",
-            "Cluster Strategy"
+            "Cluster Strategy",
             "Country",
             "Region",
             "Crop",

@@ -709,7 +723,7 @@ class Geocif:
         """ Convert this dataframe into an ML ready format and save to disk """
         df = self.create_ml_dataframe(df)
         dir_output = (
-            self.dir_analysis / self.country / self.crop / str(self.forecast_season)
+            self.dir_analysis / self.country / self.crop / self.model_name / str(self.forecast_season)
         )
         os.makedirs(dir_output, exist_ok=True)
         df.to_csv(

@@ -718,7 +732,6 @@ class Geocif:
         )

         # cat_features should be converted to category type
-
         df[self.cat_features] = df[self.cat_features].astype("category")

         """ Heatmap of correlation of various features with yield at each time step"""

@@ -739,26 +752,29 @@ class Geocif:
             how="outer",
         )

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        dict_kwargs["combined_dict"] = self.combined_dict
+        dict_kwargs = {}
+        dict_kwargs["all_stages"] = self.all_stages
+        dict_kwargs["target_col"] = self.target
+        dict_kwargs["country"] = self.country
+        dict_kwargs["crop"] = self.crop
+        dict_kwargs["dir_output"] = (
+            self.dir_analysis
+            / self.country
+            / self.crop
+            / self.model_name
+            / str(self.forecast_season)
+        )
+        dict_kwargs["forecast_season"] = self.forecast_season
+        dict_kwargs["method"] = self.method
+        dict_kwargs["national_correlation"] = self.national_correlation
+        dict_kwargs["groupby"] = self.correlation_plot_groupby
+        dict_kwargs["dg_country"] = self.dg_country
+        dict_kwargs["combined_dict"] = self.combined_dict

+        if self.spatial_autocorrelation:
+            sa.compute_spatial_autocorrelation(self.df_results, **dict_kwargs)
+
+        if self.correlation_plots:
             self.logger.info(f"Correlation plot for {self.country} {self.crop}")
             (
                 dict_selected_features,
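The two `self.parser` reads added above expect matching keys in the `[ML]` section of the run configuration. A minimal sketch of those entries and how `configparser` resolves them; the file contents other than the two keys, and the `queen` value for `sa_method`, are illustrative assumptions rather than something taken from this diff:

```python
from configparser import ConfigParser

# Only the two [ML] keys below are implied by the diff; the values are placeholders.
config_text = """
[ML]
spatial_autocorrelation = true
sa_method = queen
"""

parser = ConfigParser()
parser.read_string(config_text)

# Mirrors the reads added to Geocif.__init__
spatial_autocorrelation = parser.getboolean("ML", "spatial_autocorrelation")  # -> True
sa_method = parser.get("ML", "sa_method")  # -> "queen" (illustrative value)
print(spatial_autocorrelation, sa_method)
```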
geocif/ml/correlations.py

@@ -28,7 +28,9 @@ def most_correlated_feature_by_time(df_train, simulation_stages, target_col):

     # Only select columns that have been observed till the current stage
     for stage in tqdm(stages, leave=False, desc="Compute most correlated feature"):
-        current_feature_set = [
+        current_feature_set = [
+            col for col in df_train.columns if col.endswith(f"_{stage[-1]}")
+        ]

         # Get the most correlated feature for each region
         top_feature_by_region, counter = embedding.get_top_correlated_features(

@@ -41,7 +43,9 @@ def most_correlated_feature_by_time(df_train, simulation_stages, target_col):
         # Loop through top_feature_by_region and find the average score for _feature
         # Calculate the average score for 'DTR_36'
         _feature_scores = [
-            value[1][0]
+            value[1][0]
+            for key, value in top_feature_by_region.items()
+            if _feature in value[0]
         ]
         average_score = sum(_feature_scores) / len(_feature_scores)
         _feature = utils.remove_last_part(_feature)

@@ -137,7 +141,9 @@ def plot_feature_corr_by_time(df, **kwargs):
     # Add colorbar label
     # cbar_ax.set_xlabel("Correlation Coefficient", labelpad=3, size="small")
     cbar_ax.set_title("Correlation Coefficient", loc="left", size="small")
-    ax_heatmap.set_xticklabels(
+    ax_heatmap.set_xticklabels(
+        ax_heatmap.get_xticklabels(), size="x-small", rotation=0, fontsize=5
+    )
     ax_heatmap.set_yticklabels(ax_heatmap.get_yticklabels(), size="x-small", fontsize=5)
     ax_heatmap.set_xlabel("")
     ax_heatmap.set_ylabel(" ")

@@ -190,7 +196,9 @@ def _all_correlated_feature_by_time(df, **kwargs):
         pbar.set_description(f"Calculating correlations")
         pbar.update()

-        stage_name = stages.get_stage_information_dict(f"GD4_{stage}", method)[
+        stage_name = stages.get_stage_information_dict(f"GD4_{stage}", method)[
+            "Stage Name"
+        ]
         # starting_stage = stage_name.split("-")[0]
         current_feature_set = [col for col in df.columns if stage_name in col]

@@ -210,7 +218,9 @@ def _all_correlated_feature_by_time(df, **kwargs):

     all_stage_names = []
     for stage in stages_features:
-        _tmp = stages.get_stage_information_dict(f"GD4_{stage}", method)[
+        _tmp = stages.get_stage_information_dict(f"GD4_{stage}", method)[
+            "Stage Name"
+        ]
         all_stage_names.append(_tmp)

     df_results = df_results.reindex(all_stage_names)

@@ -254,7 +264,12 @@ def all_correlated_feature_by_time(df, **kwargs):
         df_tmp = df_corr[df_corr.columns[(df_corr.mean() > 0.1)]]
         dict_selected_features[region_id] = df_tmp.columns

-        df_tmp2 =
+        df_tmp2 = (
+            df_tmp.median(axis=0)
+            .abs()
+            .sort_values(ascending=False)
+            .reset_index()
+        )
         df_tmp2.columns = ["Metric", "Value"]
         # Add another column based on Type of Metric
         for idx, row in df_tmp2.iterrows():

@@ -278,8 +293,8 @@ def all_correlated_feature_by_time(df, **kwargs):
         dict_selected_features[region_id] = df_corr.columns
         dict_best_cei[region_id] = {}

-        #dict_selected_features[region_id] = dict_selected_features[0]
-        #dict_best_cei[region_id] = dict_best_cei[0]
+        # dict_selected_features[region_id] = dict_selected_features[0]
+        # dict_best_cei[region_id] = dict_best_cei[0]
     # Combine all unique values from the existing dictionary elements
     # combined_metrics = set()
     # for key in dict_selected_features:

@@ -310,7 +325,9 @@ def feature_correlation_by_time(**kwargs):

     # Only select columns that have been observed till the current stage
     for stage in tqdm(stages, leave=False, desc="Compute feature correlation by time"):
-        current_feature_set = [
+        current_feature_set = [
+            col for col in df_train.columns if col.endswith(f"_{stage[-1]}")
+        ]

         # Get the most correlated feature for each region
         top_feature_by_region, counter = embedding.compute_feature_correlations(

@@ -324,7 +341,9 @@ def feature_correlation_by_time(**kwargs):
         # Loop through top_feature_by_region and find the average score for _feature
         # Calculate the average score for 'DTR_36'
         _feature_scores = [
-            value[1][0]
+            value[1][0]
+            for key, value in top_feature_by_region.items()
+            if _feature in value[0]
         ]
         average_score = sum(_feature_scores) / len(_feature_scores)
         _feature = utils.remove_last_part(_feature)
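These hunks mostly re-wrap long statements, but the underlying pattern is: for each growth stage, keep only the columns whose names end with that stage's suffix and correlate them with the target. A minimal, self-contained sketch of that pattern; the column names, the toy data, and the Spearman choice are illustrative assumptions:

```python
import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
df_train = pd.DataFrame(
    {
        "DTR_36": rng.normal(size=50),      # stage-36 feature (made-up values)
        "GDD_36": rng.normal(size=50),      # stage-36 feature
        "DTR_37": rng.normal(size=50),      # different stage, should be excluded
        "Yield (tn per ha)": rng.normal(loc=3, size=50),
    }
)

stage = ("GD4", "36")  # illustrative stage tuple; only the last element is used as a suffix
current_feature_set = [
    col for col in df_train.columns if col.endswith(f"_{stage[-1]}")
]

# Correlation of each stage-specific feature with the target
corr = df_train[current_feature_set].corrwith(
    df_train["Yield (tn per ha)"], method="spearman"
)
print(corr.sort_values(ascending=False))
```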
geocif/ml/embedding.py

@@ -79,7 +79,9 @@ def get_top_correlated_features(inputs, targets):
         feature_correlations = _compute_correlations(X, y)

         # Exclude any nan values
-        feature_correlations = {
+        feature_correlations = {
+            k: v for k, v in feature_correlations.items() if not np.isnan(v)
+        }

         if not feature_correlations:
             continue

@@ -113,7 +115,9 @@ def get_all_features_correlation(inputs, targets, method):
         feature_correlations = _compute_correlations(X, y)

         # Exclude any nan values
-        feature_correlations = {
+        feature_correlations = {
+            k: v for k, v in feature_correlations.items() if not np.isnan(v)
+        }

         if not feature_correlations:
             continue
geocif/ml/feature_engineering.py

@@ -21,11 +21,15 @@ def compute_last_year_yield(df, target_col="Yield (tn per ha)"):
     # Initialize the new column with NaNs
     df[f"Last Year {target_col}"] = np.nan

-    for region, group in tqdm(
+    for region, group in tqdm(
+        df.groupby("Region"), desc="Last year yields", leave=False
+    ):
         unique_years = group["Harvest Year"].unique()

         for harvest_year in unique_years:
-            mask = (group["Harvest Year"] == harvest_year - 1) & (
+            mask = (group["Harvest Year"] == harvest_year - 1) & (
+                group["Region"] == region
+            )
             last_year_yield = group.loc[mask, target_col].values
             if last_year_yield:
                 df.loc[

@@ -89,7 +93,9 @@ def compute_median_yield(
         closest_years = compute_closest_years(
             all_seasons_with_yield, harvest_year, number_median_years
         )
-        mask = (group["Harvest Year"].isin(closest_years)) & (
+        mask = (group["Harvest Year"].isin(closest_years)) & (
+            group["Region"] == region
+        )
         median_yield = group.loc[mask, target_col].median()
         df.loc[
             (df["Region"] == region) & (df["Harvest Year"] == harvest_year),

@@ -99,7 +105,9 @@ def compute_median_yield(
     return df


-def compute_lag_yield(
+def compute_lag_yield(
+    df, all_seasons_with_yield, number_lag_years, target_col="Yield (tn per ha)"
+):
     # For the number of years specified in self.number_lag_years, add the yield of that number of years
     # ago to the dataframe
     # For example, if number_lag_years is 3, then the yield of each year upto 3 years ago will be added

@@ -125,7 +133,9 @@ def compute_lag_yield(df, all_seasons_with_yield, number_lag_years, target_col="
             col = f"t -{idx + 1} {target_col}"

             mask_group_year = group["Harvest Year"] == year
-            mask_region = (df["Region"] == region) & (
+            mask_region = (df["Region"] == region) & (
+                df["Harvest Year"] == harvest_year
+            )
             yield_value = group.loc[mask_group_year, target_col].values

             if yield_value.size > 0:

@@ -181,11 +191,15 @@ def compute_analogous_yield(
     all_years = df["Harvest Year"].unique()

     for harvest_year in tqdm(all_years, desc="Computing analogous yields", leave=False):
-        lag_years = compute_closest_years(
+        lag_years = compute_closest_years(
+            all_seasons_with_yield, harvest_year, number_lag_years
+        )

         for region in df["Region"].unique():
             # Filter current year and region dataset
-            df_current = df[
+            df_current = df[
+                (df["Harvest Year"] == harvest_year) & (df["Region"] == region)
+            ]
             # Filter dataset for lag years and the same region
             df_lag = df[(df["Harvest Year"].isin(lag_years)) & (df["Region"] == region)]

@@ -242,6 +256,7 @@ def detect_clusters(df, target_col="Yield (tn per ha)"):

     # Suppress warnings in this function
     import warnings
+
     warnings.filterwarnings("ignore")

     from kneed import KneeLocator

@@ -291,7 +306,9 @@ def detect_clusters(df, target_col="Yield (tn per ha)"):
         inertia.append(kmeans.inertia_)

     # Use KneeLocator to find the elbow point automatically
-    knee_locator = KneeLocator(
+    knee_locator = KneeLocator(
+        range_of_clusters, inertia, curve="convex", direction="decreasing"
+    )

     # # Plot the Elbow Method for visual confirmation
     # plt.figure(figsize=(10, 6))

@@ -306,7 +323,9 @@ def detect_clusters(df, target_col="Yield (tn per ha)"):
     # Use the detected number of clusters
     optimal_clusters = knee_locator.knee
     if optimal_clusters:
-        optimal_clusters =
+        optimal_clusters = (
+            optimal_clusters + 1 if optimal_clusters > 1 else optimal_clusters
+        )

         # Apply K-Means clustering with the detected optimal number of clusters
         kmeans = KMeans(n_clusters=optimal_clusters, random_state=42)

@@ -321,6 +340,43 @@ def detect_clusters(df, target_col="Yield (tn per ha)"):
         )
     else:
         # If no optimal_clusters is found, then assign all regions to a single cluster
-        clusters_assigned = pd.DataFrame(
+        clusters_assigned = pd.DataFrame(
+            {"Region": df_yield_pivot.index, "Region_ID": 1}
+        )

     return clusters_assigned
+
+
+# breakpoint()
+
+# from libpysal.weights import Queen, Rook
+# from pysal.lib import weights
+# from scipy.linalg import eigh
+#
+# breakpoint()
+# df = pd.DataFrame()
+#
+# # Create a spatial weights matrix (e.g., Queen contiguity)
+# w = weights.Queen.from_dataframe(dg)
+#
+# # Transform weights to row-standardized form
+# w.transform = 'r'
+#
+# # Convert the weights matrix to a dense format for eigen decomposition
+# W_dense = w.full()[0]
+#
+# # Compute eigenvalues and eigenvectors
+# eigenvalues, eigenvectors = eigh(W_dense)
+#
+# # Sort eigenvalues and corresponding eigenvectors
+# sorted_indices = np.argsort(eigenvalues)[::-1]
+# eigenvalues = eigenvalues[sorted_indices]
+# eigenvectors = eigenvectors[:, sorted_indices]
+#
+# # Select a subset of eigenvectors (e.g., first 10)
+# selected_eigenvectors = eigenvectors[:, :2]
+#
+# breakpoint()
+# # Add eigenvectors to the GeoDataFrame
+# for i in range(selected_eigenvectors.shape[1]):
+#     df[f'EV_{i + 1}'] = selected_eigenvectors[:, i]
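The `detect_clusters` changes above re-wrap the `KneeLocator` call and the cluster-count adjustment. As a standalone illustration of that elbow-detection step, here is a minimal sketch with `kneed` and scikit-learn; the synthetic data and the cluster range are made up for the example and are not taken from the package:

```python
import numpy as np
from kneed import KneeLocator
from sklearn.cluster import KMeans

rng = np.random.default_rng(42)
# Three synthetic groups standing in for region-level yield profiles
X = np.vstack(
    [rng.normal(loc=c, scale=0.3, size=(20, 5)) for c in (1.0, 3.0, 5.0)]
)

range_of_clusters = range(1, 9)
inertia = []
for k in range_of_clusters:
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10).fit(X)
    inertia.append(kmeans.inertia_)

# Elbow point: where inertia stops dropping sharply
knee_locator = KneeLocator(
    list(range_of_clusters), inertia, curve="convex", direction="decreasing"
)
optimal_clusters = knee_locator.knee
print("Detected number of clusters:", optimal_clusters)
```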
geocif/ml/feature_selection.py

@@ -77,16 +77,22 @@ def select_features(X, y, method="RFE", min_features_to_select=3):

         # Step 5: Summarize the SHAP values for feature importance
         shap_importances = np.mean(np.abs(shap_values), axis=0)
-        shap_importance_df = pd.DataFrame(
-
-
-        }).sort_values(by='importance', ascending=False)
+        shap_importance_df = pd.DataFrame(
+            {"feature": X.columns, "importance": shap_importances}
+        ).sort_values(by="importance", ascending=False)

         def evaluate_model_with_n_features(N, X_train, y_train):
-            top_features = shap_importance_df[
+            top_features = shap_importance_df["feature"].head(N).values
             X_train_selected = X_train[top_features]
             selector = CatBoostRegressor(n_estimators=500, random_state=42, verbose=0)
-            scores = cross_val_score(
+            scores = cross_val_score(
+                selector,
+                X_train_selected,
+                y_train,
+                cv=5,
+                scoring="neg_mean_squared_error",
+                n_jobs=-1,
+            )

             return np.mean(scores)

@@ -100,7 +106,9 @@ def select_features(X, y, method="RFE", min_features_to_select=3):
         optimal_N = nrange[np.argmax(cv_scores)]

         # Use optimal N to select features
-        selected_features =
+        selected_features = (
+            shap_importance_df["feature"].head(optimal_N).values.tolist()
+        )
     elif method == "feature_engine":
         from feature_engine.selection import SmartCorrelatedSelection

@@ -202,7 +210,9 @@ def select_features(X, y, method="RFE", min_features_to_select=3):
         }
         model = CatBoostRegressor(**hyperparams)

-        selector = BorutaShap(
+        selector = BorutaShap(
+            model=model, importance_measure="shap", classification=False
+        )
         selector.fit(
             X=X,
             y=y,

@@ -237,7 +247,9 @@ def select_features(X, y, method="RFE", min_features_to_select=3):
     elif method == "RFE":
         from sklearn.feature_selection import RFE

-        selector = RFE(
+        selector = RFE(
+            forest, n_features_to_select=min_features_to_select, step=1, verbose=1
+        )
         selector = selector.fit(X, y)
         selected_features_mask = selector.support_
         selected_features = X.columns[selected_features_mask].tolist()
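The `select_features` changes restore the SHAP-importance branch: rank features by mean |SHAP value|, then pick the feature count N whose top-N subset maximizes a cross-validated score. A compact sketch of that idea on synthetic data; the hyperparameters, data, and search range here are illustrative, not the package's defaults:

```python
import numpy as np
import pandas as pd
import shap
from catboost import CatBoostRegressor
from sklearn.model_selection import cross_val_score

rng = np.random.default_rng(0)
X = pd.DataFrame(rng.normal(size=(200, 8)), columns=[f"f{i}" for i in range(8)])
y = 2 * X["f0"] - X["f1"] + rng.normal(scale=0.1, size=200)  # only f0, f1 matter

# Fit once to obtain SHAP importances
model = CatBoostRegressor(n_estimators=200, random_state=42, verbose=0).fit(X, y)
shap_values = shap.TreeExplainer(model).shap_values(X)
shap_importances = np.mean(np.abs(shap_values), axis=0)
shap_importance_df = pd.DataFrame(
    {"feature": X.columns, "importance": shap_importances}
).sort_values(by="importance", ascending=False)

def evaluate_model_with_n_features(n):
    # Cross-validated score of a model trained on the top-n features only
    top_features = shap_importance_df["feature"].head(n).values
    selector = CatBoostRegressor(n_estimators=200, random_state=42, verbose=0)
    return cross_val_score(
        selector, X[top_features], y, cv=5, scoring="neg_mean_squared_error"
    ).mean()

nrange = range(1, 6)
cv_scores = [evaluate_model_with_n_features(n) for n in nrange]
optimal_N = list(nrange)[int(np.argmax(cv_scores))]
selected_features = shap_importance_df["feature"].head(optimal_N).tolist()
print(optimal_N, selected_features)
```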
geocif/ml/outliers.py

@@ -94,19 +94,23 @@ if __name__ == "__main__":

         if not os.path.isfile(BASE_DIR / crop / f"adm_crop_production_z_{crop}.csv"):
             # In rows where admin_2 != "none", replace admin_1 with admin_2
-            df_fewsnet_sub.loc[
-                "admin_2"
-            ]
+            df_fewsnet_sub.loc[
+                df_fewsnet_sub["admin_2"] != "none", "admin_1"
+            ] = df_fewsnet_sub["admin_2"]

             df_output = find_outlier(df_fewsnet_sub)

-            df_output.to_csv(
+            df_output.to_csv(
+                BASE_DIR / crop / f"adm_crop_production_z_{crop}.csv", index=False
+            )
         else:
-            df_output = pd.read_csv(
+            df_output = pd.read_csv(
+                BASE_DIR / crop / f"adm_crop_production_z_{crop}.csv"
+            )

-            df_fewsnet_sub.loc[
-                "admin_2"
-            ]
+            df_fewsnet_sub.loc[
+                df_fewsnet_sub["admin_2"] != "none", "admin_1"
+            ] = df_fewsnet_sub["admin_2"]

         # Create a column called Z-Score Category based on the value of the z-score
         # The categories are:

@@ -142,7 +146,9 @@ if __name__ == "__main__":
         df_fewsnet_sub["harvest_year"] = df_fewsnet_sub["harvest_year"].astype(int)

         df_yield = df_fewsnet_sub[mask & (df_fewsnet_sub["indicator"] == "yield")]
-        df_production = df_fewsnet_sub[
+        df_production = df_fewsnet_sub[
+            mask & (df_fewsnet_sub["indicator"] == "production")
+        ]
         df_area = df_fewsnet_sub[mask & (df_fewsnet_sub["indicator"] == "area")]

         df_yield["harvest_year"] = df_yield["harvest_year"].astype(int)

@@ -158,9 +164,13 @@ if __name__ == "__main__":
         # Add 3 subplots, first for area
         plt.figure(figsize=(10, 10))
         plt.subplot(3, 1, 1)
-        plt.plot(
+        plt.plot(
+            df_yield[mask]["harvest_year"].astype(int), df_yield[mask]["value"]
+        )
         # Add a circle for each year where yield is available
-        plt.scatter(
+        plt.scatter(
+            df_yield[mask]["harvest_year"].astype(int), df_yield[mask]["value"]
+        )
         # Draw a horizontal line at the average df_yield[mask]["value"]
         plt.axhline(df_yield[mask]["value"].mean(), color="red", linestyle="--")
         # Place a tick on x-axis at every year and make labels vertical

@@ -195,13 +205,17 @@ if __name__ == "__main__":
             df_production[mask]["value"],
         )
         # Place a tick on x-axis at every year
-        plt.xticks(
+        plt.xticks(
+            df_production[mask]["harvest_year"].astype(int)[::2], rotation=90
+        )
         plt.xlabel("Year")
         plt.ylabel("Production")

         plt.subplot(3, 1, 3)
         plt.plot(df_area[mask]["harvest_year"].astype(int), df_area[mask]["value"])
-        plt.scatter(
+        plt.scatter(
+            df_area[mask]["harvest_year"].astype(int), df_area[mask]["value"]
+        )
         # Place a tick on x-axis at every year
         plt.xticks(df_area[mask]["harvest_year"].astype(int)[::2], rotation=90)
         plt.xlabel("Year")

@@ -210,7 +224,9 @@ if __name__ == "__main__":
         try:
             os.makedirs(BASE_DIR / crop, exist_ok=True)
             plt.savefig(
-                BASE_DIR
+                BASE_DIR
+                / crop
+                / f"{fnid}_{country}_{admin_1}_{crop}_{season_name}.png"
             )
         except:
             breakpoint()
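The outlier script above works with per-region z-scores of FEWS NET statistics (the `adm_crop_production_z_{crop}.csv` output) before bucketing them into categories; the hunks shown only reformat plotting and I/O calls. As a reminder of the underlying statistic, here is a minimal sketch of a grouped z-score; the toy column names follow the lower-case FEWS NET style used above, but the exact logic inside `find_outlier` is not shown in this diff:

```python
import pandas as pd

df = pd.DataFrame(
    {
        "admin_1": ["A", "A", "A", "B", "B", "B"],
        "harvest_year": [2019, 2020, 2021, 2019, 2020, 2021],
        "value": [1.0, 1.2, 3.5, 2.0, 2.1, 1.9],
    }
)

# z-score of each observation relative to its own admin unit
grouped = df.groupby("admin_1")["value"]
df["z_score"] = (df["value"] - grouped.transform("mean")) / grouped.transform("std")
print(df)
```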
geocif/ml/spatial_autocorrelation.py (new file)

@@ -0,0 +1,224 @@
+import warnings
+
+from tqdm import tqdm
+import matplotlib.pyplot as plt
+import pandas as pd
+from pysal.lib import weights
+
+warnings.filterwarnings("ignore")
+
+
+def validate_inputs(df_results, required_columns):
+    """
+
+    Args:
+        df_results:
+        required_columns:
+
+    Returns:
+
+    """
+    if not all(column in df_results.columns for column in required_columns):
+        raise ValueError(
+            f"df_results must contain the following columns: {required_columns}"
+        )
+
+
+def preprocess_data(df_results, dg_country):
+    """
+
+    Args:
+        df_results:
+        dg_country:
+
+    Returns:
+
+    """
+    df = df_results.drop_duplicates()
+    df = df.dropna(subset=["Yield (tn per ha)"])
+
+    dg_country = dg_country.drop_duplicates(subset="Country Region")
+    dg_country = dg_country.dropna(subset=["Country Region", "Region_ID", "geometry"])
+
+    df["Country Region"] = (df["Country"] + " " + df["Region"]).str.lower()
+    dg_country["Country Region"] = dg_country["Country Region"].str.lower()
+    dg_country = dg_country[dg_country["Country Region"].isin(df["Country Region"])]
+
+    dg_country.reset_index(drop=True, inplace=True)
+
+    merged_df = dg_country.merge(df, on="Country Region", how="inner")
+
+    return merged_df
+
+
+def create_base_weights(merged_df):
+    """
+
+    Args:
+        merged_df:
+
+    Returns:
+
+    """
+    dg = merged_df[["Country Region", "geometry"]].drop_duplicates()
+
+    try:
+        w_base = weights.Queen.from_dataframe(dg)
+    except Exception as e:
+        raise RuntimeError(f"Failed to create spatial weights: {e}")
+
+    no_neighbors = [
+        index for index, neighbors in w_base.neighbors.items() if len(neighbors) == 0
+    ]
+    if no_neighbors:
+        dg = dg.drop(index=no_neighbors[0]).reset_index(drop=True)
+        w_base = weights.Queen.from_dataframe(dg[["Country Region", "geometry"]])
+
+    return w_base, dg
+
+
+def create_weights_for_year(dg_country, regions_with_data):
+    """
+
+    Args:
+        dg_country:
+        regions_with_data:
+
+    Returns:
+
+    """
+    dg = dg_country[dg_country["Country Region"].isin(regions_with_data)]
+    dg = dg.reset_index(drop=True)
+
+    wt = weights.Queen.from_dataframe(dg)
+
+    no_neighbors = [
+        index for index, neighbors in wt.neighbors.items() if len(neighbors) == 0
+    ]
+    if no_neighbors:
+        dg = dg.drop(index=no_neighbors[0]).reset_index(drop=True)
+        wt = weights.Queen.from_dataframe(dg[["Country Region", "geometry"]])
+
+    return wt, dg
+
+
+def compute_morans_i(merged_df):
+    """
+
+    Args:
+        merged_df:
+        dg_country:
+
+    Returns:
+
+    """
+    from pysal.explore import esda
+
+    # Drop any regions with missing data
+    merged_df = merged_df.dropna(subset=["Yield (tn per ha)"])
+
+    years = merged_df["Harvest Year"].unique()
+    results = {"Harvest Year": [], "Moran's I": [], "p-value": [], "Significant": []}
+
+    for year in tqdm(years, desc="Compute Moran's I"):
+        year_data = merged_df[merged_df["Harvest Year"] == year]
+        regions_with_data = year_data["Country Region"].unique()
+        year_data = year_data[year_data["Country Region"].isin(regions_with_data)]
+
+        y = year_data[["Country Region", "Region", "Yield (tn per ha)"]].drop_duplicates()
+        dg_country = year_data[["Country Region", "geometry"]].drop_duplicates()
+
+        if len(y) > 1:
+            w, x = create_weights_for_year(dg_country, regions_with_data)
+            y = y[y["Country Region"].isin(x["Country Region"])]
+
+            try:
+                mi = esda.Moran(y["Yield (tn per ha)"].values, w, permutations=999)
+            except:
+                breakpoint()
+            results["Harvest Year"].append(year)
+            try:
+                results["Moran's I"].append(mi.I)
+            except:
+                breakpoint()
+            results["p-value"].append(mi.p_sim)
+            results["Significant"].append(mi.p_sim < 0.1)
+        else:
+            results["Harvest Year"].append(year)
+            results["Moran's I"].append(None)
+            results["p-value"].append(None)
+            results["Significant"].append(False)
+
+    return pd.DataFrame(results)
+
+
+def plot_morans_i_time_series(results_df, country, crop, dir_output):
+    """
+
+    Args:
+        results_df:
+        country:
+        crop:
+        dir_output:
+
+    Returns:
+
+    """
+    plt.figure(figsize=(10, 6))
+
+    significant = results_df[results_df["Significant"]]
+    plt.scatter(
+        significant["Harvest Year"],
+        significant["Moran's I"],
+        color="red",
+        label="Significant (p < 0.1)",
+    )
+
+    not_significant = results_df[~results_df["Significant"]]
+    plt.plot(
+        not_significant["Harvest Year"],
+        not_significant["Moran's I"],
+        marker="o",
+        linestyle="-",
+        color="blue",
+        label="Non-Significant",
+    )
+
+    plt.ylabel("Moran's I")
+    plt.legend()
+    plt.grid(True)
+    plt.savefig(dir_output / f"{country}_{crop}.png")
+    plt.close()
+
+
+def compute_spatial_autocorrelation(df_results, **kwargs):
+    """
+
+    Args:
+        df_results:
+        **kwargs:
+
+    Returns:
+
+    """
+    country = kwargs.get("country")
+    crop = kwargs.get("crop")
+    dg_country = kwargs.get("dg_country")
+    dir_output = kwargs.get("dir_output")
+
+    required_columns = [
+        "Country",
+        "Crop",
+        "Region",
+        "Harvest Year",
+        "Yield (tn per ha)",
+    ]
+    validate_inputs(df_results, required_columns)
+
+    merged_df = preprocess_data(df_results, dg_country)
+    if merged_df.empty:
+        raise ValueError("No valid data available after preprocessing")
+
+    results_df = compute_morans_i(merged_df)
+
+    plot_morans_i_time_series(results_df, country, crop, dir_output)
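The new module computes one Moran's I statistic per harvest year from region-level yields and Queen-contiguity weights (999 permutations, p < 0.1 flagged as significant), then plots the time series. A minimal usage sketch, assuming a boundary layer readable by geopandas and a results table with the required columns; the file names and the `Country Region`/`Region_ID` attributes on the boundary layer are assumptions based on `preprocess_data`, not fixed inputs documented by the package:

```python
from pathlib import Path

import geopandas as gpd
import pandas as pd

from geocif.ml import spatial_autocorrelation as sa

# Region boundaries; must carry "Country Region", "Region_ID" and "geometry"
dg_country = gpd.read_file("admin1_boundaries.shp")  # placeholder path

# Model results; must carry Country, Crop, Region, Harvest Year, Yield (tn per ha)
df_results = pd.read_csv("ml_results.csv")  # placeholder path

out_dir = Path("analysis/malawi/maize")  # placeholder output directory
out_dir.mkdir(parents=True, exist_ok=True)

sa.compute_spatial_autocorrelation(
    df_results,
    country="malawi",      # used only for the output file name
    crop="maize",          # used only for the output file name
    dg_country=dg_country,
    dir_output=out_dir,    # Path-like; the plot is saved as <country>_<crop>.png
)
```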
geocif/ml/stages.py

@@ -17,7 +17,9 @@ def add_stage_information(df, method):
    df["Stage"] = df["Stage"].astype(str)

    df["Stage_ID"] = df["Stage"]
-    df["Stage Range"] = df["Stage"].apply(
+    df["Stage Range"] = df["Stage"].apply(
+        lambda x: "_".join([x.split("_")[0], x.split("_")[-1]])
+    )

    # Create a column with starting stage and ending stage
    # Stage looks like this: 13_12_11

@@ -34,14 +36,18 @@ def add_stage_information(df, method):
        dict = utils.dict_growth_stages_biweekly
    elif "monthly" in method:
        dict = utils.dict_growth_stages_monthly
-    df["Stage Names"] =
+    df["Stage Names"] = (
+        df["Starting Stage"].map(dict) + " - " + df["Ending Stage"].map(dict)
+    )

    # Group by Region, Harvest Year
    # For each group, add a column called Percentage Season
    # that is the percentage of the season that has passed based on the number of rows
    # in the group
    grouped = df.groupby(["Region", "Harvest Year"])
-    df["Percentage Season"] =
+    df["Percentage Season"] = (
+        grouped.cumcount() * 100.0 / grouped["CEI"].transform("size")
+    )

    return df

@@ -186,7 +192,9 @@ def get_stage_information_dict(stage_str, method):
    end_stage = parts[-1]

    # Exclude cei from the stage_str string
-    stage_info["Stage_ID"] =
+    stage_info["Stage_ID"] = (
+        "_".join(parts[1:]) if parts[1].isdigit() else "_".join(parts[2:])
+    )

    stage_info["CEI"] = cei
    stage_info["Stage Range"] = "_".join([start_stage, end_stage])
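The restored `Percentage Season` expression divides each row's position within its (Region, Harvest Year) group by the group size. A tiny illustration of that cumcount/transform pattern on made-up data:

```python
import pandas as pd

df = pd.DataFrame(
    {
        "Region": ["A"] * 4 + ["B"] * 2,
        "Harvest Year": [2020] * 6,
        "CEI": [10, 11, 12, 13, 20, 21],
    }
)

grouped = df.groupby(["Region", "Harvest Year"])
# cumcount() numbers rows within each group; transform("size") gives the group length
df["Percentage Season"] = grouped.cumcount() * 100.0 / grouped["CEI"].transform("size")
print(df)
# Region A rows get 0, 25, 50, 75; region B rows get 0, 50
```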
geocif/ml/stats.py

@@ -29,33 +29,54 @@ def get_yld_prd(df, name_crop, cntr, region, calendar_year, region_column="ADM1_
    # df_tmp = df.loc[mask_adm1]

    df_tmp = df.copy()
-    if name_crop ==
-        if cntr ==
-            df_tmp = df.loc[df.Season ==
-        elif cntr ==
-            df_tmp = df.loc[df.Season ==
-        elif cntr ==
-            df_tmp = df.loc[df.Season ==
-        elif cntr ==
-            df_tmp = df.loc[df.Season ==
-    elif name_crop ==
-
-
-
-
-
-
+    if name_crop == "rice":
+        if cntr == "Viet nam":
+            df_tmp = df.loc[df.Season == "Spring Paddy"]
+        elif cntr == "Thailand":
+            df_tmp = df.loc[df.Season == "Major Season"]
+        elif cntr == "China":
+            df_tmp = df.loc[df.Season == "Single-cropping and Middle-season Rice"]
+        elif cntr == "India":
+            df_tmp = df.loc[df.Season == "Kharif"]
+    elif name_crop == "maize" and cntr in [
+        "Austria",
+        "Belgium",
+        "Bulgaria",
+        "Croatia",
+        "Czech Republic",
+        "Denmark",
+        "Germany",
+        "Greece",
+        "Hungary",
+        "Italy",
+        "Lithuania",
+        "Luxembourg",
+        "Netherlands",
+        "Poland",
+        "Portugal",
+        "Romania",
+        "Slovakia",
+        "Slovenia",
+        "Spain",
+        "Sweden",
+        "United Kingdom",
+    ]:
+        df_tmp = df.loc[df.Season == "Grain Maize and Corn-cob-mix"]
+    elif name_crop == "maize" and cntr in ["France"]:
+        df_tmp = df.loc[df.Season == "Green Maize"]

    if not df_tmp.empty:
-        if cntr !=
-            mask_tmp_country = (
+        if cntr != "Vietnam":
+            mask_tmp_country = (
+                df_tmp["ADM0_NAME"].str.lower() == cntr.replace("_", " ").lower()
+            )
        else:
-            mask_tmp_country =
+            mask_tmp_country = df_tmp["ADM0_NAME"].str.lower() == "viet nam"
        if region:
-            mask_tmp_adm1 =
+            mask_tmp_adm1 = df_tmp[region_column].str.lower() == region.lower()
        else:
            # ADM1_NAME column should be NaN to get country level stats
-            mask_tmp_adm1 =
+            mask_tmp_adm1 = df_tmp[region_column].isnull()

    val = df_tmp.loc[mask_tmp_country & mask_tmp_adm1][calendar_year]

@@ -145,7 +166,16 @@ def add_GEOGLAM_statistics(dir_stats, df, stats, method, admin_zone):
    return df


-def add_statistics(
+def add_statistics(
+    dir_stats,
+    df,
+    country,
+    crop,
+    admin_zone,
+    stats,
+    method,
+    target_col="Yield (tn per ha)",
+):
    """

    Args:

@@ -166,7 +196,9 @@ def add_statistics(dir_stats, df, country, crop, admin_zone, stats, method, targ

    # HACK
    if country == "Afghanistan":
-        df_fewsnet.loc[:, "product"] =
+        df_fewsnet.loc[:, "product"] = (
+            df_fewsnet["season_name"] + " " + df_fewsnet["product"]
+        )
    # Check if country and crop exist in the fewsnet database
    mask = (df_fewsnet["country"] == country) & (df_fewsnet["product"] == crop)

@@ -183,12 +215,27 @@ def add_statistics(dir_stats, df, country, crop, admin_zone, stats, method, targ
        mask_region = df_fewsnet[admin_zone] == region
        mask_yield = (
            df_fewsnet["crop_production_system"].isin(
-                [
+                [
+                    "none",
+                    "Small-scale (PS)",
+                    "Commercial (PS)",
+                    "All (PS)",
+                    "irrigated",
+                    "rainfed",
+                ]
            )
            & (df_fewsnet["harvest_year"] == harvest_year)
            & (df_fewsnet["product"] == crop)
            & df_fewsnet["season_name"].isin(
-                [
+                [
+                    "Main",
+                    "Meher",
+                    "Main harvest",
+                    "Annual",
+                    "Summer",
+                    "Spring",
+                    "Winter",
+                ]
            )
            & (df_fewsnet["indicator"].isin(["yield", "area", "production"]))
        )
geocif/ml/trainers.py

@@ -84,7 +84,9 @@ def optuna_objective(model, df, feature_names, target_col, cat_features=[]):
    y = df[target_col]

    # Divide the data into training and validation sets
-    train_X, val_X, train_y, val_y = train_test_split(
+    train_X, val_X, train_y, val_y = train_test_split(
+        X, y, test_size=0.2, random_state=0
+    )

    model.fit(
        train_X,

@@ -134,7 +136,9 @@ def optimized_model(
        params = {
            "depth": trial.suggest_int("depth", 1, 7),
            "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.1),
-            "iterations": trial.suggest_int(
+            "iterations": trial.suggest_int(
+                "iterations", low=1000, high=5000, step=500
+            ),
            "subsample": trial.suggest_float("subsample", 1.0, 1.0),
            "random_strength": trial.suggest_float("random_strength", 0.3, 1.0),
            "reg_lambda": trial.suggest_float("reg_lambda", 1e-8, 1.0),

@@ -177,7 +181,9 @@ def optimized_model(
    optuna.logging.set_verbosity(optuna.logging.WARNING) # Disable verbose
    sampler = optuna.samplers.TPESampler(seed=seed)
    study = optuna.create_study(sampler=sampler, direction="minimize")
-    study.optimize(
+    study.optimize(
+        _optuna_objective, n_trials=n_trials, n_jobs=int(mp.cpu_count() * 0.4)
+    )
    if study.best_trial is None:
        raise ValueError("Optimization failed to complete any trials.")
    hyperparams = study.best_trial.params
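The trainer changes only re-wrap the Optuna calls; the pattern is a TPE-sampled study that minimizes a validation metric and then reads `study.best_trial.params`. A self-contained sketch with a toy objective; the search space bounds match the hunk above, but the loss function and trial count are illustrative, not the package's defaults:

```python
import optuna

def objective(trial):
    # Stand-in for the CatBoost validation loss used in trainers.py
    depth = trial.suggest_int("depth", 1, 7)
    learning_rate = trial.suggest_float("learning_rate", 0.01, 0.1)
    iterations = trial.suggest_int("iterations", low=1000, high=5000, step=500)
    # Toy loss with a known minimum, just to exercise the sampler
    return (depth - 4) ** 2 + (learning_rate - 0.05) ** 2 + abs(iterations - 2000) / 1000

optuna.logging.set_verbosity(optuna.logging.WARNING)
sampler = optuna.samplers.TPESampler(seed=42)
study = optuna.create_study(sampler=sampler, direction="minimize")
study.optimize(objective, n_trials=25)

hyperparams = study.best_trial.params
print(hyperparams)
```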
geocif/ml/trend.py

@@ -74,7 +74,9 @@ def compute_trend(detrended_data, future_time_points=None):
    model = detrended_data.trend_model[0]

    if model_type == "mean":
-        trend_component = model.predict(
+        trend_component = model.predict(
+            np.ones(len(future_time_points)), has_constant="add"
+        )
    elif model_type == "linear":
        X_linear = add_constant(future_time_points, has_constant="add")
        trend_component = model.predict(X_linear)
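For the `compute_trend` branch above, the linear case fits an OLS trend on time and predicts it at future time points via `add_constant`. A minimal sketch of that linear branch with statsmodels; the data is synthetic, and the `mean` branch in the diff depends on how geocif stores its fitted trend model, so it is not reproduced here:

```python
import numpy as np
import statsmodels.api as sm

# Synthetic yield series with a mild upward trend
time_points = np.arange(2000, 2020)
y = 2.0 + 0.03 * (time_points - 2000) + np.random.default_rng(0).normal(0, 0.1, 20)

# Fit the linear trend model on a constant plus time
model = sm.OLS(y, sm.add_constant(time_points)).fit()

# Predict the trend component at future time points
future_time_points = np.arange(2020, 2025)
X_linear = sm.add_constant(future_time_points, has_constant="add")
trend_component = model.predict(X_linear)
print(trend_component)
```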