PyPI - geocif - Versions diffs - 0.1.31__tar.gz → 0.1.32__tar.gz - Mend

geocif 0.1.31 (tar.gz) → 0.1.32 (tar.gz)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (52) hide show

{geocif-0.1.31/geocif.egg-info → geocif-0.1.32}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: geocif
-Version: 0.1.31
+Version: 0.1.32
 Summary: Models to visualize and forecast crop conditions and yields
 Home-page: https://ritviksahajpal.github.io/yield_forecasting/
 Author: Ritvik Sahajpal

{geocif-0.1.31 → geocif-0.1.32}/geocif/analysis.py RENAMED Viewed

@@ -156,48 +156,78 @@ class Geoanalysis:
     def analyze(self):
         self.logger.info(f"Analyze {self.country} {self.crop}")
-        # Remove rows with missing values in Observed Yield (tn per ha)
-        df = self.df_analysis.dropna(subset=["Observed Yield (tn per ha)"])
+        df = self._clean_data()
         if df.empty:
             return pd.DataFrame(), pd.DataFrame()
-        # For each Harvest Year, Stages combination, compute
-        # RMSE, NSE, R2, MAE, MAPE, PBIAS
-        df_metrics = df.groupby(
-            ["Country", "Model", "Harvest Year", "Stage Name", "Stage Range"]
-        ).apply(self.annual_metrics)
+        df_metrics = self._compute_metrics(df)
+        df_metrics = self._process_metrics(df_metrics)
+        self._plot_metrics(df_metrics)
+        df_regional_metrics_by_year = self._compute_regional_metrics(
+            df, by="Harvest Year"
+        )
+        df_regional_metrics_by_year = self._select_top_years(
+            df_regional_metrics_by_year
+        )
+        df_regional_metrics = self._average_mape(df_regional_metrics_by_year)
+        self._store_results(
+            df_metrics, df_regional_metrics, df_regional_metrics_by_year
+        )
+        df_national_yield = self._compute_national_yield(df)
+        self._plot_national_yield(df_national_yield)
+        return df_metrics, df_regional_metrics, df_national_yield
-        df_metrics = df_metrics.reset_index()
+    def _clean_data(self):
+        # Remove rows with missing values in Observed Yield (tn per ha)
+        return self.df_analysis.dropna(subset=["Observed Yield (tn per ha)"])
+    def _compute_metrics(self, df):
+        # For each Harvest Year, Stages combination, compute metrics
+        df_metrics = (
+            df.groupby(
+                ["Country", "Model", "Harvest Year", "Stage Name", "Stage Range"]
+            )
+            .apply(self.annual_metrics)
+            .reset_index()
+        )
+        return df_metrics.pivot_table(
+            index=["Country", "Model", "Harvest Year", "Stage Name", "Stage Range"],
+            columns="level_5",
+            values=0,
+        ).reset_index()
+    def _process_metrics(self, df_metrics):
         # Assign each unique Stage Name a unique integer identifier
         df_metrics["Stage_ID"] = pd.Categorical(df_metrics["Stage Name"]).codes
         # Order by Harvest Year and Number Stages (ascending)
-        df_metrics = df_metrics.sort_values(
-            by=["Harvest Year", "Stage_ID"], ascending=[True, True]
-        )
+        df_metrics = df_metrics.sort_values(by=["Harvest Year", "Stage_ID"])
         # Add columns with the name of the country and crop
         df_metrics["Country"] = self.country
         df_metrics["Crop"] = self.crop
         # Add stage information for plotting
-        df_metrics = self.add_stage_information(df_metrics)
+        return self.add_stage_information(df_metrics)
-        # Rename level_2 to Metric and 0 to Value
-        # df_metrics = df_metrics.rename(columns={"level_2": "Metric", 0: "Value"})
-        # breakpoint()
-        # df_metrics.to_csv(r'D:\Users\ritvik\projects\GEOGLAM\Output\fao\dekad\ml\analysis\February-28-2024\ethiopia_maize\ab1.csv')
-        for metric in [
+    def _plot_metrics(self, df_metrics):
+        metrics = [
             "Root Mean Square Error",
-            # "Nash-Sutcliff Efficiency",
             "$r^2$",
             "Mean Absolute Error",
             "Mean Absolute\nPercentage Error",
             "Percentage Bias",
-        ]:
+        ]
+        for metric in metrics:
             self.plot_metric(df_metrics, metric)
+    def _compute_regional_metrics(self, df, by=None):
         cols = [
             "Country",
             "Region",
@@ -208,64 +238,193 @@ class Geoanalysis:
             "Stage Range",
         ]
-        # For each Stages combination, compute MAPE
-        df_regional_metrics = (
-            df.groupby(cols + ["Harvest Year"])
-            .apply(self.regional_metrics)
-            .reset_index()
-        )
+        if by:
+            return df.groupby(cols + [by]).apply(self.regional_metrics).reset_index()
+        else:
+            return df.groupby(cols).apply(self.regional_metrics).reset_index()
-        # HACK
-        # For each Country, Region, harvest Year combination, select the 10 years with least MAPE
-        df_regional_metrics = (
+    def _select_top_years(self, df_regional_metrics):
+        return (
             df_regional_metrics.groupby(["Country", "Region"])
             .apply(lambda x: self.select_top_N_years(x, 10))
             .reset_index(drop=True)
         )
-        # Determine average MAPE for each Country, Region, Model, Crop, Stage Name, Stage Range
-        df_regional_metrics = (
-            df_regional_metrics.groupby(cols)["Mean Absolute Percentage Error"]
-            .mean()
-            .reset_index()
-        )
-        # Create an index based on following columns
+    def _average_mape(self, df_regional_metrics):
         cols = [
             "Country",
-            "Crop",
+            "Region",
+            "% of total Area (ha)",
             "Model",
-            "Harvest Year",
+            "Crop",
             "Stage Name",
+            "Stage Range",
         ]
+        return (
+            df_regional_metrics.groupby(cols)["Mean Absolute Percentage Error"]
+            .mean()
+            .reset_index()
+        )
+    def _store_results(
+        self, df_metrics, df_regional_metrics, df_regional_metrics_by_year
+    ):
+        # Create an index based on specific columns
         df_metrics.index = df_metrics.apply(
-            lambda row: "_".join([str(row[col]) for col in cols]), axis=1
+            lambda row: "_".join(
+                [
+                    str(row[col])
+                    for col in [
+                        "Country",
+                        "Crop",
+                        "Model",
+                        "Harvest Year",
+                        "Stage Name",
+                    ]
+                ]
+            ),
+            axis=1,
         )
         df_metrics.index.set_names(["Index"], inplace=True)
-        cols = [
-            "Country",
-            "Region",
-            "Model",
-            "Crop",
-            "Stage Name",
-        ]
         df_regional_metrics.index = df_regional_metrics.apply(
-            lambda row: "_".join([str(row[col]) for col in cols]), axis=1
+            lambda row: "_".join(
+                [
+                    str(row[col])
+                    for col in ["Country", "Region", "Model", "Crop", "Stage Name"]
+                ]
+            ),
+            axis=1,
         )
         df_regional_metrics.index.set_names(["Index"], inplace=True)
+        df_regional_metrics_by_year.index = df_regional_metrics_by_year.apply(
+            lambda row: "_".join(
+                [
+                    str(row[col])
+                    for col in [
+                        "Country",
+                        "Region",
+                        "Model",
+                        "Crop",
+                        "Stage Name",
+                        "Harvest Year",
+                    ]
+                ]
+            ),
+            axis=1,
+        )
+        df_regional_metrics_by_year.index.set_names(["Index"], inplace=True)
         # Format with 3 places after the decimal point
         df_metrics = df_metrics.round(3)
         df_regional_metrics = df_regional_metrics.round(3)
+        df_regional_metrics_by_year = df_regional_metrics_by_year.round(3)
         # Store results in database
-        con = sqlite3.connect(self.db_path)
-        utils.to_db(self.db_path, f"country_metrics", df_metrics)
-        utils.to_db(self.db_path, f"regional_metrics", df_regional_metrics)
+        with sqlite3.connect(self.db_path) as con:
+            utils.to_db(self.db_path, "country_metrics", df_metrics)
+            utils.to_db(self.db_path, "regional_metrics", df_regional_metrics)
+            utils.to_db(
+                self.db_path, "regional_metrics_by_year", df_regional_metrics_by_year
+            )
-        con.commit()
-        con.close()
+            con.commit()
+    def _compute_national_yield(self, df):
+        # Compute observed and predicted national yield by multiplying Yield (tn per ha) by Area (ha)
+        observed = "Observed Yield (tn per ha)"
+        predicted = "Predicted Yield (tn per ha)"
+        area_ha = "Area (ha)"
+        df.loc[:, observed] = df[observed] * df[area_ha]
+        df.loc[:, predicted] = df[predicted] * df[area_ha]
+        # Group by Country and Harvest Year, then sum the National Yield and Area
+        df_national_yield = (
+            df.groupby(["Country", "Harvest Year"])
+            .agg({observed: "sum", predicted: "sum", area_ha: "sum"})
+            .reset_index()
+        )
+        # Compute observed and predicted yield per ha for each Harvest Year
+        df_national_yield[observed] = (
+            df_national_yield[observed] / df_national_yield[area_ha]
+        )
+        df_national_yield[predicted] = (
+            df_national_yield[predicted] / df_national_yield[area_ha]
+        )
+        return df_national_yield
+    def _plot_national_yield(self, df_national_yield, use_different_colors=True):
+        from sklearn.metrics import (
+            mean_squared_error,
+            r2_score,
+            mean_absolute_percentage_error,
+        )
+        x = df_national_yield["Harvest Year"]
+        y_observed = df_national_yield["Observed Yield (tn per ha)"]
+        y_predicted = df_national_yield["Predicted Yield (tn per ha)"]
+        with plt.style.context("science"):
+            plt.figure(figsize=(10, 6))
+            import palettable as pal
+            colors = pal.tableau.Tableau_20.mpl_colors
+            colors = colors[: len(x)]
+            # Add dashed gray grid lines with alpha=0.5
+            plt.grid(True, linestyle="--", alpha=0.5)
+            for i in range(len(x)):
+                plt.scatter(y_observed[i], y_predicted[i], color=colors[i], label=x[i])
+            # X and Y-axis range from 0 to the maximum observed/predicted yield * 1.25
+            max_yield = max(y_observed.max(), y_predicted.max()) * 1.25
+            plt.xlim(0, max_yield)
+            plt.ylim(0, max_yield)
+            # Add a line diagonally representing 1:1
+            plt.plot([0, max_yield], [0, max_yield], color="gray", linestyle="--")
+            # Calculate metrics
+            rmse = np.sqrt(mean_squared_error(y_observed, y_predicted))
+            mape = mean_absolute_percentage_error(y_observed, y_predicted)
+            r2 = r2_score(y_observed, y_predicted)
+            # Annotate metrics
+            textstr = "\n".join(
+                (
+                    f"RMSE: {rmse:.2f} tn/ha",
+                    f"MAPE: {mape:.2%}",
+                    f"R²: {r2:.2f}",
+                )
+            )
+            plt.gca().annotate(
+                textstr,
+                xy=(0.05, 0.95),
+                xycoords="axes fraction",
+                fontsize=12,
+                verticalalignment="top",
+            )
+            plt.xlabel("Observed Yield (tn/ha)")
+            plt.ylabel("Predicted Yield (tn/ha)")
+            # Place legend outside the plot to the right without a border
+            plt.legend(
+                title="Year",
+                bbox_to_anchor=(1.05, 1),
+                loc="upper left",
+                edgecolor="none",
+            )
+            plt.tight_layout()
+            fname = f"scatter_{self.country}_{self.crop}.png"
+            plt.savefig(self.dir_analysis / fname, dpi=250)
+            plt.close()
     def get_historic_production(self):
         # Read in historic production data
@@ -342,148 +501,6 @@ class Geoanalysis:
         return self.df_analysis
-    def map_regional(self):
-        con = sqlite3.connect(self.db_path)
-        # Read from database, where country and crop match
-        query = "SELECT * FROM country_metrics"
-        df_country = pd.read_sql_query(query, con)
-        query = "SELECT * FROM regional_metrics"
-        df_regional = pd.read_sql_query(query, con)
-        # Plot a histogram of the MAPE, different color for each country
-        # Plotting the histograms with KDE for each country
-        df_regional["Country"] = (
-            df_regional["Country"].str.replace("_", " ").str.title()
-        )
-        df_regional["Model"] = df_regional["Model"].str.title()
-        # Plotting the histogram with a smaller bin size for greater detail
-        # Plotting the KDE for each country, ensuring each step works
-        models = df_regional["Model"].unique()
-        for model in models:
-            df_model = df_regional[df_regional["Model"] == model]
-            # HACK: Drop rows where '% of total Area (ha)' is less than 1% and Mean Absolute Percentage Error is > 50%
-            # or where the Mean Absolute Percentage Error is greater than 50% if the '% of total Area (ha)' is greater than 1%
-            df_tmp = df_model[
-                (df_model["% of total Area (ha)"] < 1)
-                & (df_model["Mean Absolute Percentage Error"] > 50)
-                & (df_model["Country"].isin(["Angola", "United Republic Of Tanzania"]))
-            ]
-            # Remove df_tmp from df_model
-            df_model = df_model.drop(df_tmp.index)
-            # Plot the histogram of MAPE
-            # Create bins for '% of total Area (ha)' and 'MAPE'
-            bin_edges = np.linspace(0, df_model["% of total Area (ha)"].max() + 1, 5 + 1)
-            df_model["Area Bins"] = pd.cut(
-                df_model["% of total Area (ha)"],
-                bins=bin_edges,
-                precision=0,
-            )
-            df_model["MAPE Bins"] = pd.cut(
-                df_model["Mean Absolute Percentage Error"],
-                bins=5,  # [0, 5, 10, 15, 20, 25, 30, 50, max(df_model["Mean Absolute Percentage Error"])],
-                right=False,
-                precision=1,
-            )
-            # Count occurrences of MAPE values for each area bin
-            area_mape_counts = (
-                df_model.groupby(["Area Bins", "MAPE Bins"])
-                .size()
-                .unstack(fill_value=0)
-            )
-            # Create the heatmap
-            plt.figure(figsize=(10, 8))
-            ax = sns.heatmap(
-                area_mape_counts,
-                annot=True,
-                square=True,
-                cmap=pal.scientific.sequential.Bamako_20_r.mpl_colormap,
-                fmt="d",
-            )
-            # Do not color or annotate cells with 0
-            for text in ax.texts:
-                if text.get_text() == "0":
-                    text.set_text("")
-                    text.set_color("white")
-            # plt.title("Heatmap of MAPE Bins vs % Total Area Bins")
-            plt.ylabel("% of Total Area (ha) Bins")
-            plt.xlabel("MAPE Bins")
-            # Adjust y-axis labels to horizontal
-            ax.set_yticklabels(ax.get_yticklabels(), rotation=0)
-            # Invert y-axis to have the highest bin at the top
-            ax.invert_yaxis()
-            plt.savefig(self.dir_analysis / f"heatmap_{model}.png", dpi=250)
-            plt.close()
-            # Plot the KDE of MAPE
-            plt.figure(figsize=(12, 8))
-            for label, group_data in df_model.groupby("Country"):
-                sns.kdeplot(
-                    group_data["Mean Absolute Percentage Error"],
-                    label=label,
-                    clip=(0, None),
-                    # bins=len(group_data),
-                    # kde=True,
-                )
-            # Add minor ticks on the x-axis
-            plt.minorticks_on()
-            # Setting the title and labels
-            plt.title(
-                f"Kernel Density Estimation of Mean Absolute Percentage Error by Country - {model}"
-            )
-            plt.xlabel("Mean Absolute Percentage Error (%)")
-            plt.ylabel("Density")
-            plt.legend(title="Country", title_fontsize="13")
-            plt.savefig(self.dir_analysis / f"mape_histogram_{model}.png", dpi=250)
-            plt.close()
-            # Map MAPE at regional level
-            df_model["Country Region"] = (
-                df_model["Country"].str.lower().str.replace("_", " ")
-                + " "
-                + df_model["Region"].str.lower()
-            )
-            fname = f"mape_{self.crop}_{model}.png"
-            col = "Mean Absolute Percentage Error"
-            countries = df_model["Country"].unique().tolist()
-            # make it title case and replace _ with space
-            countries = [country.title().replace("_", " ") for country in countries]
-            countries = ["Malawi"]
-            df_model = df_model[df_model["Country"].isin(countries)]
-            self.dg = self.dg[self.dg["ADM0_NAME"].isin(countries)]
-            plot.plot_df_shpfile(
-                self.dg,  # dataframe containing adm1 name and polygon
-                df_model,  # dataframe containing information that will be mapped
-                merge_col="Country Region",  # Column on which to merge
-                name_country=countries,
-                name_col=col,  # Which column to plot
-                dir_out=self.dir_analysis,  # Output directory
-                fname=fname,  # Output file name
-                label=f"MAPE (%)",
-                vmin=df_model[col].min(),
-                vmax=df_model[col].max(),
-                cmap=pal.scientific.sequential.Bamako_20_r,
-                series="sequential",
-                show_bg=False,
-                annotate_regions=False,
-                loc_legend="lower left",
-            )
-        con.commit()
-        con.close()
     def map(self, df_plot):
         # df_plot = self.df_analysis.copy()
         models = df_plot["Model"].unique()
@@ -491,10 +508,6 @@ class Geoanalysis:
         for model in models:
             df_model = df_plot[df_plot["Model"] == model]
-            countries = ["malawi"]
-            df_model = df_model[df_model["Country"].isin(countries)]
-            self.dg = self.dg[self.dg["ADM0_NAME"].isin(["Malawi", "malawi"])]
             countries = df_model["Country"].unique().tolist()
             if len(countries) > 1:
                 self.dir_plot = self.dir_analysis
@@ -509,7 +522,6 @@ class Geoanalysis:
                 + df_model["Region"].str.lower().str.replace("_", " ")
             )
             # Change Harvest year to type int
             df_model["Harvest Year"] = df_model["Harvest Year"].astype(int)
             annotate_region_column = (
@@ -525,8 +537,8 @@ class Geoanalysis:
                     df_time_period = df_harvest_year[
                         df_harvest_year["Stage Name"] == time_period
                     ]
-    #
-    #                 """ % of total area """
+                    #
+                    #                 """ % of total area """
                     if idx == 0:
                         fname = f"{self.country}_{self.crop}_perc_area.png"
                         col = "% of total Area (ha)"
@@ -548,15 +560,16 @@ class Geoanalysis:
                             annotate_region_column=annotate_region_column,
                             loc_legend="lower left",
                         )
-    #
-    #                 #     """ Unique regions """
+                    #
+                    #                 #     """ Unique regions """
                     fname = f"{self.country}_{self.crop}_region_ID.png"
                     col = "Region_ID"
                     df_model[col] = df_model[col].astype(int) + 1
                     if len(df_model["Region_ID"].unique() > 1):
                         # Create a dictionary with each region assigned a unique integer identifier and name
                         dict_region = {
-                            int(key): key for key in df_time_period["Region_ID"].unique()
+                            int(key): key
+                            for key in df_time_period["Region_ID"].unique()
                         }
                         plot.plot_df_shpfile(
                             self.dg,  # dataframe containing adm1 name and polygon
@@ -579,7 +592,7 @@ class Geoanalysis:
                             annotate_region_column=annotate_region_column,
                             loc_legend="lower left",
                         )
-    #                     breakpoint()
+                    #                     breakpoint()
                     # """ Anomaly """
                     # fname = (
@@ -690,10 +703,10 @@ class Geoanalysis:
     def execute(self):
         self.query()
-        aa = self.preprocess()
+        df = self.preprocess()
         self.analyze()
-        return aa
+        return df
     def get_config_data(self):
         try:
@@ -737,7 +750,6 @@ class Geoanalysis:
         """
         self.dict_config = {}
-        self.get_config_data()
         self.observed = "Observed Yield (tn per ha)"
         self.predicted = "Predicted Yield (tn per ha)"
@@ -798,12 +810,232 @@ class Geoanalysis:
             self.dg["ADM0_NAME"] + " " + self.dg["ADM2_NAME"]
         )
         # Make it lower case
-        self.dg["Country Region"] = self.dg["Country Region"].str.lower().replace("_", " ")
+        self.dg["Country Region"] = (
+            self.dg["Country Region"].str.lower().replace("_", " ")
+        )
+@dataclass
+class RegionalMapper(Geoanalysis):
+    path_config_files: List[Path] = field(default_factory=list)
+    logger: log = None
+    parser: ConfigParser = field(default_factory=ConfigParser)
+    def __post_init__(self):
+        # Call the parent class constructor
+        super().__post_init__()
+        self.get_config_data()
+        self.setup()
+    def map_regional(self):
+        """Main function to read data and generate plots."""
+        self.read_data()
+        self.clean_data()
+        self.plot_heatmap()
+        self.plot_kde()
+        self.plot_mape_map()
+        self.plot_mape_by_year()
+    def read_data(self):
+        """Read data from the database."""
+        con = sqlite3.connect(self.db_path)
+        query = "SELECT * FROM regional_metrics"
+        self.df_regional = pd.read_sql_query(query, con)
+        query = "SELECT * FROM regional_metrics_by_year"
+        self.df_regional_by_year = pd.read_sql_query(query, con)
+        con.close()
+    def clean_data(self):
+        """Clean and format the data."""
+        self.df_regional["Country"] = (
+            self.df_regional["Country"].str.replace("_", " ").str.title()
+        )
+        self.df_regional["Model"] = self.df_regional["Model"].str.title()
+    def plot_heatmap(self):
+        """Generate heatmaps of MAPE bins vs. % total area bins."""
+        models = self.df_regional["Model"].unique()
+        for model in models:
+            df_model = self.df_regional[self.df_regional["Model"] == model]
+            # HACK: Drop rows where '% of total Area (ha)' is less than 0.5% and Mean Absolute Percentage Error is > 100%
+            # (both conditions must hold for a row to be dropped; every other row is kept)
+            df_tmp = df_model[
+                (df_model["% of total Area (ha)"] < 0.5)
+                & (df_model["Mean Absolute Percentage Error"] > 100)
+            ]
+            df_model = df_model.drop(df_tmp.index)
+            bin_edges = np.linspace(0, df_model["% of total Area (ha)"].max() + 1, 6)
+            df_model["Area Bins"] = pd.cut(
+                df_model["% of total Area (ha)"], bins=bin_edges, precision=0
+            )
+            df_model["MAPE Bins"] = pd.cut(
+                df_model["Mean Absolute Percentage Error"],
+                bins=5,
+                right=False,
+                precision=1,
+            )
+            area_mape_counts = (
+                df_model.groupby(["Area Bins", "MAPE Bins"])
+                .size()
+                .unstack(fill_value=0)
+            )
+            self._plot_heatmap(area_mape_counts, model)
+    def _plot_heatmap(self, area_mape_counts, model):
+        """
+        Plot heatmap helper function
+        Args:
+            area_mape_counts:
+            model:
+        Returns:
+        """
+        plt.figure(figsize=(10, 8))
+        ax = sns.heatmap(
+            area_mape_counts,
+            annot=True,
+            square=True,
+            cmap=pal.scientific.sequential.Bamako_20_r.mpl_colormap,
+            fmt="d",
+        )
+        for text in ax.texts:
+            if text.get_text() == "0":
+                text.set_text("")
+                text.set_color("white")
+        plt.ylabel("% of Total Area (ha) Bins")
+        plt.xlabel("MAPE Bins")
+        ax.set_yticklabels(ax.get_yticklabels(), rotation=0)
+        ax.invert_yaxis()
+        plt.tight_layout()
+        plt.savefig(self.dir_analysis / f"heatmap_{model}.png", dpi=250)
+        plt.close()
+    def plot_kde(self):
+        """Generate KDE plots of MAPE for each country."""
+        models = self.df_regional["Model"].unique()
+        for model in models:
+            df_model = self.df_regional[self.df_regional["Model"] == model]
+            # HACK: Drop rows where '% of total Area (ha)' is less than 0.5% and Mean Absolute Percentage Error is > 100%
+            # (both conditions must hold for a row to be dropped; every other row is kept)
+            df_tmp = df_model[
+                (df_model["% of total Area (ha)"] < 0.5)
+                & (df_model["Mean Absolute Percentage Error"] > 100)
+            ]
+            df_model = df_model.drop(df_tmp.index)
+            with plt.style.context("science"):
+                plt.figure(figsize=(12, 8))
+                for label, group_data in df_model.groupby("Country"):
+                    sns.histplot(
+                        group_data["Mean Absolute Percentage Error"],
+                        label=label,
+                        # clip=(0, None),
+                    )
+                # Plot a dashed gray line at x=20
+                plt.axvline(x=20, color="gray", linestyle="--")
+                plt.minorticks_on()
+                plt.xlabel("Mean Absolute Percentage Error (%)")
+                plt.ylabel("Frequency")
+                plt.legend(title="Country", title_fontsize="13")
+                plt.tight_layout()
+                plt.savefig(self.dir_analysis / f"mape_histogram_{model}.png", dpi=250)
+                plt.close()
+    def plot_mape_map(self):
+        """Plot the map of MAPE."""
+        self.df_regional["Country Region"] = (
+            self.df_regional["Country"].str.lower().str.replace("_", " ")
+            + " "
+            + self.df_regional["Region"].str.lower()
+        )
+        models = self.df_regional["Model"].unique()
+        for model in models:
+            df_model = self.df_regional[self.df_regional["Model"] == model]
+            # HACK: Drop rows where '% of total Area (ha)' is less than 0.5% and Mean Absolute Percentage Error is > 100%
+            # (both conditions must hold for a row to be dropped; every other row is kept)
+            df_tmp = df_model[
+                (df_model["% of total Area (ha)"] < 0.5)
+                & (df_model["Mean Absolute Percentage Error"] > 100)
+            ]
+            df_model = df_model.drop(df_tmp.index)
+            fname = f"mape_{self.crop}_{df_model['Model'].iloc[0]}.png"
+            col = "Mean Absolute Percentage Error"
+            countries = df_model["Country"].unique().tolist()
+            countries = [country.title().replace("_", " ") for country in countries]
+            df = df_model[df_model["Country"].isin(countries)]
+            self.dg = self.dg[self.dg["ADM0_NAME"].isin(countries)]
+            plot.plot_df_shpfile(
+                self.dg,
+                df,
+                merge_col="Country Region",
+                name_country=countries,
+                name_col=col,
+                dir_out=self.dir_analysis,
+                fname=fname,
+                label="MAPE (%)",
+                vmin=df[col].min(),
+                vmax=df[col].max(),
+                cmap=pal.scientific.sequential.Bamako_20_r,
+                series="sequential",
+                show_bg=False,
+                annotate_regions=False,
+                loc_legend="lower left",
+            )
+    def plot_mape_by_year(self):
+        """Compute MAPE by year and plot using a bar chart."""
+        # Compute the Mean Absolute Percentage Error (MAPE) by year
+        mape_by_year = (
+            self.df_regional_by_year.groupby("Harvest Year")[
+                "Mean Absolute Percentage Error"
+            ]
+            .mean()
+            .reset_index()
+        )
+        # Plot MAPE by year
+        with plt.style.context("science"):
+            plt.figure(figsize=(10, 6))
+            sns.barplot(
+                x="Harvest Year", y="Mean Absolute Percentage Error", data=mape_by_year
+            )
+            # Draw a dashed gray line at y=20
+            plt.axhline(y=20, color="gray", linestyle="--")
+            plt.title("Mean Absolute Percentage Error by Year")
+            plt.xlabel("Year")
+            plt.ylabel("Mean Absolute Percentage Error (%)")
+            plt.xticks(rotation=0)
+            plt.tight_layout()
+            plt.savefig(self.dir_analysis / "mape_by_year.png", dpi=250)
+            plt.close()
 def run(path_config_files=[Path("../config/geocif.txt")]):
     logger, parser = log.setup_logger_parser(path_config_files)
     obj = Geoanalysis(path_config_files, logger, parser)
+    obj.get_config_data()
     obj.setup()
     """ Loop over each country, crop, model combination in dict_config """
@@ -826,11 +1058,14 @@ def run(path_config_files=[Path("../config/geocif.txt")]):
             df_tmp = obj.execute()
             frames.append(df_tmp)
-    dk = pd.concat(frames)
+    df = pd.concat(frames)
+    """ Map regional error metrics """
+    mapper = RegionalMapper(path_config_files, logger, parser)
+    mapper.map_regional()
-    # Map the metrics
-    # obj.map_regional()
-    obj.map(dk)
+    """ For each country, plot yields, conditions, anomalies, etc. """
+    obj.map(df)
 if __name__ == "__main__":

{geocif-0.1.31 → geocif-0.1.32}/geocif/geocif.py RENAMED Viewed

@@ -108,6 +108,7 @@ class Geocif:
                                 Config file: ML
         ====================================================================
         """
+        self.use_ceis = ast.literal_eval(self.parser.get("ML", "use_ceis"))
         self.model_type = self.parser.get("ML", "model_type")
         self.fraction_simulate = self.parser.getint("ML", "fraction_simulate")
         self.analogous_year_yield_as_feature = self.parser.getboolean(
@@ -149,7 +150,7 @@ class Geocif:
         """
         # If ML model is run for individual region or cluster, then Region_ID is the same for each region
         # or cluster and therefore redundant for the ML model
-        #if self.cluster_strategy in ["individual", "auto_detect"]:
+        # if self.cluster_strategy in ["individual", "auto_detect"]:
         #    self.cat_features.remove("Region_ID")
         self.fixed_columns: list = [
@@ -223,7 +224,11 @@ class Geocif:
             """ Update model to include conformal estimates """
             X_train = df_region[self.selected_features + self.cat_features]
             dir_output = (
-                self.dir_analysis / self.country / self.crop / self.model_name / str(self.forecast_season)
+                self.dir_analysis
+                / self.country
+                / self.crop
+                / self.model_name
+                / str(self.forecast_season)
             )
             region_id = df_region["Region_ID"].unique()[0]
@@ -275,7 +280,12 @@ class Geocif:
                     clusters_train = df_region["Region"]
                     clusters_train.reset_index(drop=True, inplace=True)
-                    self.model.fit(X_train, Z_train, clusters_train.astype("object"), y_train.values)
+                    self.model.fit(
+                        X_train,
+                        Z_train,
+                        clusters_train.astype("object"),
+                        y_train.values,
+                    )
                 elif self.model_name == "linear":
                     self.model.fit(X_train_scaled, y_train)
                 elif self.model_name == "gam":
@@ -327,7 +337,9 @@ class Geocif:
                 clusters_test = df_region["Region"]
                 clusters_test.reset_index(drop=True, inplace=True)
-                y_pred = self.model.predict(X_test, Z_test, clusters_test.astype("object"))
+                y_pred = self.model.predict(
+                    X_test, Z_test, clusters_test.astype("object")
+                )
                 best_hyperparameters = self.model.fe_model.get_params().copy()
             else:
                 y_pred = self.model.predict(X_test)
@@ -609,6 +621,15 @@ class Geocif:
                 model = self.model
             output.store(self.db_path, experiment_id, df, model, self.model_name)
+    def get_cei_column_names(self, df):
+        all_cei_columns = [
+            col
+            for col in df.columns
+            if col not in self.fixed_columns + [self.target] + self.statistics_columns
+        ]
+        return all_cei_columns
     def create_ml_dataframe(self, df):
         """
         Create ML ready dataframe
@@ -650,23 +671,22 @@ class Geocif:
         # Flatten the multi-index columns
         df.columns = [f"{i}_{j}" if j != "" else f"{i}" for i, j in df.columns]
+        # Get all the columns apart from the fixed columns, target column and stats columns
+        all_cei_columns = self.get_cei_column_names(df)
+        parts = all_cei_columns[-1].split("_")
+        cei = parts[0] if parts[1].isdigit() else "_".join(parts[:2])
         # HACK: Get feature name with GD4 in it to extract first and last stage id and name
-        GD4_column = df[df.columns[df.columns.str.contains("GD4")]].columns
-        # Select the longest string in GD4_column
-        GD4_col = max(GD4_column, key=len)
-        self.stage_info = stages.get_stage_information_dict(GD4_col, self.method)
+        cei_column = df[df.columns[df.columns.str.contains(cei)]].columns
+        # Select the longest string in cei_column
+        cei_col = max(cei_column, key=len)
+        self.stage_info = stages.get_stage_information_dict(cei_col, self.method)
         # Change column name
         # e.g. 'vDTR_7_6_5_4_3_2_1_37_36_35_34_33_32_31' to 'vDTR Mar 1-Oct 27'
         df = stages.update_feature_names(df, self.method)
-        # Get all the columns apart from the fixed columns, target column and stats columns
-        all_cei_columns = [
-            col
-            for col in df.columns
-            if col not in self.fixed_columns + [self.target] + self.statistics_columns
-        ]
+        all_cei_columns = self.get_cei_column_names(df)
         # Fill in any missing values with 0
         df.loc[:, all_cei_columns].fillna(0, inplace=True)
@@ -720,10 +740,20 @@ class Geocif:
         mask = self.df_results["Stage_ID"].isin(_stages)
         df = self.df_results[mask]
+        """ Select which CEI categories to use for ML """
+        if "all" in self.use_ceis:
+            pass
+        else:
+            df = df[df["Type"].isin(self.use_ceis)]
         """ Convert this dataframe into an ML ready format and save to disk """
         df = self.create_ml_dataframe(df)
         dir_output = (
-            self.dir_analysis / self.country / self.crop / self.model_name / str(self.forecast_season)
+            self.dir_analysis
+            / self.country
+            / self.crop
+            / self.model_name
+            / str(self.forecast_season)
         )
         os.makedirs(dir_output, exist_ok=True)
         df.to_csv(
@@ -772,7 +802,7 @@ class Geocif:
         dict_kwargs["combined_dict"] = self.combined_dict
         if self.spatial_autocorrelation:
-           sa.compute_spatial_autocorrelation(self.df_results, **dict_kwargs)
+            sa.compute_spatial_autocorrelation(self.df_results, **dict_kwargs)
         if self.correlation_plots:
             self.logger.info(f"Correlation plot for {self.country} {self.crop}")

{geocif-0.1.31 → geocif-0.1.32}/geocif/indices_runner.py RENAMED Viewed

@@ -162,14 +162,16 @@ class cei_runner(base.BaseGeo):
         # Only keep those entries in combinations where the third element is
         # mozambique, south_africa, angola or dem_people's_rep_of_korea
         # This is done to test the code for these countries
-        combinations = [i for i in combinations if "angola_maize" in i[3] or
-                       "lesotho_maize" in i[3] or
-        #                 "namibia" in i[2] or
-        #                 "united_republic_of_tanzania" in i[2] or
-                         "zambia_maize" in i[3] or
-                         "zimbabwe_maize" in i[3] or
-        #                 "south_africa" in i[2] or
-                      "mozambique_maize" in i[3]]
+        combinations = [
+            i
+            for i in combinations
+            if "angola_maize" in i[3] or "lesotho_maize" in i[3] or
+            #                 "namibia" in i[2] or
+            #                 "united_republic_of_tanzania" in i[2] or
+            "zambia_maize" in i[3] or "zimbabwe_maize" in i[3] or
+            #                 "south_africa" in i[2] or
+            "mozambique_maize" in i[3]
+        ]
         #                 "malawi" in i[2]]
         if self.do_parallel:

{geocif-0.1.31 → geocif-0.1.32}/geocif/ml/spatial_autocorrelation.py RENAMED Viewed

@@ -97,8 +97,10 @@ def create_weights_for_year(dg_country, regions_with_data):
     ]
     if no_neighbors:
         dg = dg.drop(index=no_neighbors[0]).reset_index(drop=True)
-        wt = weights.Queen.from_dataframe(dg[["Country Region", "geometry"]])
+        try:
+            wt = weights.Queen.from_dataframe(dg[["Country Region", "geometry"]])
+        except:
+            breakpoint()
     return wt, dg
@@ -125,13 +127,15 @@ def compute_morans_i(merged_df):
         regions_with_data = year_data["Country Region"].unique()
         year_data = year_data[year_data["Country Region"].isin(regions_with_data)]
-        y = year_data[["Country Region", "Region", "Yield (tn per ha)"]].drop_duplicates()
+        y = year_data[
+            ["Country Region", "Region", "Yield (tn per ha)"]
+        ].drop_duplicates()
         dg_country = year_data[["Country Region", "geometry"]].drop_duplicates()
-        if len(y) > 1:
-            w, x = create_weights_for_year(dg_country, regions_with_data)
-            y = y[y["Country Region"].isin(x["Country Region"])]
+        w, x = create_weights_for_year(dg_country, regions_with_data)
+        y = y[y["Country Region"].isin(x["Country Region"])]
+        if len(y) > 1:
             try:
                 mi = esda.Moran(y["Yield (tn per ha)"].values, w, permutations=999)
             except:

{geocif-0.1.31 → geocif-0.1.32}/geocif/viz/plot.py RENAMED Viewed

@@ -332,7 +332,7 @@ def plot_df_shpfile(
             cb.ax.set_title(
                 label, fontsize=8, fontweight="semibold", fontfamily="Arial"
             )
-            cb.ax.set_xticklabels(ticks, fontsize=4, fontfamily="Arial")
+            cb.ax.set_xticklabels(ticks, fontsize=5, fontfamily="Arial")
             # Use BoundaryNorm to create discrete levels
             # sm = plt.cm.ScalarMappable(cmap=cmap.mpl_colormap, norm=norm)
@@ -394,9 +394,9 @@ def plot_df_shpfile(
                 _name_country, buffer=1.0
             )  # left, right, bottom, top
             # Hack: Add space to the top for adding title
-            extent[3] = extent[3] + 5
+            extent[3] = extent[3] + 2
             # Add some space to the bottom for adding legend and colorbar
-            extent[2] = extent[2] - 4
+            extent[2] = extent[2] - 3
             ax.set_extent(extent)
         elif name_country == "world":
             ax.add_feature(
@@ -419,14 +419,12 @@ def plot_df_shpfile(
     #     ax.tick_params(bottom=False, labelbottom=False, left=False, labelleft=False)
     #     ax.axis("off")
-    plt.tight_layout()
     # cbar.ax.tick_params(labelsize=8)
     # if series == "sequential":
     #     cbar.ax.tick_params(size=2, width=0.5, which="both")
     #     cbar.outline.set_visible(False)
     # plt.tight_layout()
     try:
-        print(fname)
         plt.savefig(dir_out / fname, dpi=350, bbox_inches="tight")
         plt.close(fig)
     except:

{geocif-0.1.31 → geocif-0.1.32/geocif.egg-info}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: geocif
-Version: 0.1.31
+Version: 0.1.32
 Summary: Models to visualize and forecast crop conditions and yields
 Home-page: https://ritviksahajpal.github.io/yield_forecasting/
 Author: Ritvik Sahajpal

{geocif-0.1.31 → geocif-0.1.32}/setup.py RENAMED Viewed

@@ -50,6 +50,6 @@ setup(
     test_suite="tests",
     tests_require=test_requirements,
     url="https://ritviksahajpal.github.io/yield_forecasting/",
-    version="0.1.31",
+    version="0.1.32",
     zip_safe=False,
 )