geocif 0.1.33__tar.gz → 0.1.35__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54)
  1. {geocif-0.1.33/geocif.egg-info → geocif-0.1.35}/PKG-INFO +1 -1
  2. {geocif-0.1.33 → geocif-0.1.35}/geocif/analysis.py +5 -5
  3. {geocif-0.1.33 → geocif-0.1.35}/geocif/cei/indices.py +12 -3
  4. {geocif-0.1.33 → geocif-0.1.35}/geocif/indices_runner.py +5 -4
  5. geocif-0.1.35/geocif/indices_runner_v2.py +207 -0
  6. {geocif-0.1.33 → geocif-0.1.35}/geocif/ml/correlations.py +35 -16
  7. geocif-0.1.35/geocif/ml/correlations_backup.py +412 -0
  8. {geocif-0.1.33 → geocif-0.1.35}/geocif/ml/stages.py +6 -3
  9. {geocif-0.1.33 → geocif-0.1.35}/geocif/playground/misc.py +72 -2
  10. {geocif-0.1.33 → geocif-0.1.35/geocif.egg-info}/PKG-INFO +1 -1
  11. {geocif-0.1.33 → geocif-0.1.35}/geocif.egg-info/SOURCES.txt +2 -0
  12. {geocif-0.1.33 → geocif-0.1.35}/setup.py +1 -1
  13. {geocif-0.1.33 → geocif-0.1.35}/LICENSE +0 -0
  14. {geocif-0.1.33 → geocif-0.1.35}/MANIFEST.in +0 -0
  15. {geocif-0.1.33 → geocif-0.1.35}/README.md +0 -0
  16. {geocif-0.1.33 → geocif-0.1.35}/geocif/__init__.py +0 -0
  17. {geocif-0.1.33 → geocif-0.1.35}/geocif/agmet/__init__.py +0 -0
  18. {geocif-0.1.33 → geocif-0.1.35}/geocif/agmet/geoagmet.py +0 -0
  19. {geocif-0.1.33 → geocif-0.1.35}/geocif/agmet/plot.py +0 -0
  20. {geocif-0.1.33 → geocif-0.1.35}/geocif/agmet/utils.py +0 -0
  21. {geocif-0.1.33 → geocif-0.1.35}/geocif/backup/__init__.py +0 -0
  22. {geocif-0.1.33 → geocif-0.1.35}/geocif/backup/constants.py +0 -0
  23. {geocif-0.1.33 → geocif-0.1.35}/geocif/backup/features.py +0 -0
  24. {geocif-0.1.33 → geocif-0.1.35}/geocif/backup/geo.py +0 -0
  25. {geocif-0.1.33 → geocif-0.1.35}/geocif/backup/geocif.py +0 -0
  26. {geocif-0.1.33 → geocif-0.1.35}/geocif/backup/metadata.py +0 -0
  27. {geocif-0.1.33 → geocif-0.1.35}/geocif/backup/models.py +0 -0
  28. {geocif-0.1.33 → geocif-0.1.35}/geocif/cei/__init__.py +0 -0
  29. {geocif-0.1.33 → geocif-0.1.35}/geocif/cei/definitions.py +0 -0
  30. {geocif-0.1.33 → geocif-0.1.35}/geocif/geocif.py +0 -0
  31. {geocif-0.1.33 → geocif-0.1.35}/geocif/logger.py +0 -0
  32. {geocif-0.1.33 → geocif-0.1.35}/geocif/ml/__init__.py +0 -0
  33. {geocif-0.1.33 → geocif-0.1.35}/geocif/ml/embedding.py +0 -0
  34. {geocif-0.1.33 → geocif-0.1.35}/geocif/ml/feature_engineering.py +0 -0
  35. {geocif-0.1.33 → geocif-0.1.35}/geocif/ml/feature_selection.py +0 -0
  36. {geocif-0.1.33 → geocif-0.1.35}/geocif/ml/outliers.py +0 -0
  37. {geocif-0.1.33 → geocif-0.1.35}/geocif/ml/outlook.py +0 -0
  38. {geocif-0.1.33 → geocif-0.1.35}/geocif/ml/output.py +0 -0
  39. {geocif-0.1.33 → geocif-0.1.35}/geocif/ml/spatial_autocorrelation.py +0 -0
  40. {geocif-0.1.33 → geocif-0.1.35}/geocif/ml/stats.py +0 -0
  41. {geocif-0.1.33 → geocif-0.1.35}/geocif/ml/trainers.py +0 -0
  42. {geocif-0.1.33 → geocif-0.1.35}/geocif/ml/trend.py +0 -0
  43. {geocif-0.1.33 → geocif-0.1.35}/geocif/ml/xai.py +0 -0
  44. {geocif-0.1.33 → geocif-0.1.35}/geocif/playground/__init__.py +0 -0
  45. {geocif-0.1.33 → geocif-0.1.35}/geocif/playground/automl.py +0 -0
  46. {geocif-0.1.33 → geocif-0.1.35}/geocif/utils.py +0 -0
  47. {geocif-0.1.33 → geocif-0.1.35}/geocif/viz/__init__.py +0 -0
  48. {geocif-0.1.33 → geocif-0.1.35}/geocif/viz/plot.py +0 -0
  49. {geocif-0.1.33 → geocif-0.1.35}/geocif.egg-info/dependency_links.txt +0 -0
  50. {geocif-0.1.33 → geocif-0.1.35}/geocif.egg-info/not-zip-safe +0 -0
  51. {geocif-0.1.33 → geocif-0.1.35}/geocif.egg-info/top_level.txt +0 -0
  52. {geocif-0.1.33 → geocif-0.1.35}/requirements.txt +0 -0
  53. {geocif-0.1.33 → geocif-0.1.35}/setup.cfg +0 -0
  54. {geocif-0.1.33 → geocif-0.1.35}/tests/test_geocif.py +0 -0
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: geocif
-Version: 0.1.33
+Version: 0.1.35
 Summary: Models to visualize and forecast crop conditions and yields
 Home-page: https://ritviksahajpal.github.io/yield_forecasting/
 Author: Ritvik Sahajpal
@@ -162,8 +162,8 @@ class Geoanalysis:
             return pd.DataFrame(), pd.DataFrame()

         df_metrics = self._compute_metrics(df)
-        df_metrics = self._process_metrics(df_metrics)
-        self._plot_metrics(df_metrics)
+        # df_metrics = self._process_metrics(df_metrics)
+        # self._plot_metrics(df_metrics)

         df_regional_metrics_by_year = self._compute_regional_metrics(
             df, by="Harvest Year"
@@ -172,9 +172,9 @@ class Geoanalysis:
             df_regional_metrics_by_year
         )
         df_regional_metrics = self._average_mape(df_regional_metrics_by_year)
-
+        breakpoint()
         self._store_results(
-            df_metrics, df_regional_metrics, df_regional_metrics_by_year
+            None, df_regional_metrics, df_regional_metrics_by_year
        )

         df_national_yield = self._compute_national_yield(df)
@@ -195,7 +195,7 @@ class Geoanalysis:
             .apply(self.annual_metrics)
             .reset_index()
         )
-
+        breakpoint()
         return df_metrics.pivot_table(
             index=["Country", "Model", "Harvest Year", "Stage Name", "Stage Range"],
             columns="level_5",
@@ -393,6 +393,7 @@ class CEIs:
             / self.admin_zone
             / self.country
         )
+
         os.makedirs(self.dir_output, exist_ok=True)
         os.makedirs(self.dir_intermediate, exist_ok=True)

@@ -465,7 +466,7 @@ class CEIs:

         extended_stages_list = []
         if self.method in ["phenological_stages", "fraction_season", "full_season"]:
-            extended_stages_list = [stages]
+            extended_stages_list = stages
         elif self.method in ["dekad_r", "biweekly_r", "monthly_r"]:
             # reverse stages
             stages = stages[::-1]
@@ -566,10 +567,10 @@ class CEIs:

         """
         if self.method in ["phenological_stages", "fraction_season"]:
-            mask = df_harvest_year_region[col].isin(stages)
+            mask = df_harvest_year_region[col].isin([stages])
             df_time_period = df_harvest_year_region[mask]

-            mask = df_all_years[col].isin(stages)
+            mask = df_all_years[col].isin([stages])
             df_base_period = df_all_years[mask]
         elif self.method in [
             "dekad",
@@ -605,6 +606,10 @@ class CEIs:
         Returns:

         """
+        # If stage is not a list then convert it to a list
+        if not isinstance(stage, list):
+            stage = [stage]
+
         columns = [
             "Description",
             "CEI",
@@ -721,6 +726,10 @@ class CEIs:
         :param index_details:
         :return:
         """
+        # If stage is not a list then convert it to a list
+        if not isinstance(stage, list):
+            stage = [stage]
+
         df = df[df["bounds"] == 1]
         # Exclude lat, lon, time, bounds and time_bounds columns
         df = df.drop(columns=["lat", "lon", "time", "bounds", "time_bounds"])
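The two new `isinstance` guards and the switch to `isin([stages])` address the same pitfall: `pandas.Series.isin` only accepts list-like arguments, so passing a bare string stage label raises `TypeError`. A minimal sketch of the pattern, with hypothetical stage labels:

```python
import pandas as pd

def coerce_to_list(stage):
    # Mirror the diff's guard: wrap a scalar stage in a list
    return stage if isinstance(stage, list) else [stage]

s = pd.Series(["sowing", "flowering", "harvest"])

# s.isin("flowering") would raise TypeError: isin() needs a list-like
print(s.isin(coerce_to_list("flowering")).tolist())           # [False, True, False]
print(s.isin(coerce_to_list(["sowing", "harvest"])).tolist())  # [True, False, True]
```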
@@ -165,11 +165,12 @@ class cei_runner(base.BaseGeo):
         combinations = [
             i
             for i in combinations
-            if "angola_maize" in i[3] or "lesotho_maize" in i[3] or
-            # "namibia" in i[2] or
-            # "united_republic_of_tanzania" in i[2] or
+            if "angola_maize" in i[3] or
+            "lesotho_maize" in i[3] or
+            # "namibia_" in i[2] or
+            "united_republic_of_tanzania_maize" in i[3] or
             "zambia_maize" in i[3] or "zimbabwe_maize" in i[3] or
-            # "south_africa" in i[2] or
+            "south_africa_maize" in i[3] or
             "mozambique_maize" in i[3]
         ]
         # "malawi" in i[2]]
@@ -0,0 +1,207 @@
+import itertools
+import warnings
+from multiprocessing import Pool, cpu_count
+from pathlib import Path
+
+import arrow as ar
+import pandas as pd
+from tqdm import tqdm
+
+warnings.filterwarnings("ignore")
+
+from .cei import indices
+from geoprepare import base
+
+
+def remove_duplicates(lst):
+    """
+
+    :param lst:
+    :return:
+    """
+    return list(set([i for i in lst]))
+
+
+def get_admin_zone(country, dg_shp):
+    admin_zone = "admin_1"
+    country = country.title().replace(" ", "_")
+
+    # Read in shapefile
+    dg_country = dg_shp[dg_shp["ADMIN0"] == country]
+
+    # Is the ADMIN2 column all None? If so, return admin_1 else return admin_2
+    if dg_country.empty:
+        admin_zone = "admin_1"
+    elif not dg_country["ADMIN2"].isna().all():
+        admin_zone = "admin_2"
+
+    return admin_zone
+
+
+class cei_runner(base.BaseGeo):
+    def __init__(self, path_config_file):
+        super().__init__(path_config_file)
+
+        # Parse configuration files
+        self.parse_config()
+
+        self.dir_input = Path(self.parser.get("PATHS", "dir_input"))
+        self.base_dir = Path(self.parser.get("PATHS", "dir_crop_inputs"))
+        self.do_parallel = self.parser.getboolean("DEFAULT", "do_parallel")
+
+    def collect_files(self):
+        """
+        1. Collect all the files which contain EO information
+        2. Exclude files from the `processed` directory if a file with the
+           same name is already in the processed_include_fall directory
+        3. Create a dataframe that contains the following columns:
+           - directory: name of directory where file is located
+           - path: full path to file
+           - filename: name of file
+        :return: Return the dataframe created above
+        """
+        import geopandas as gp
+
+        dg_shp = gp.read_file(
+            self.dir_input
+            / "Global_Datasets"
+            / "Regions"
+            / "Shps"
+            / "adm_shapefile.shp",
+            engine="pyogrio",
+        )
+
+        # Collect all the files which contain EO information
+        df_files = pd.DataFrame(columns=["directory", "path", "filename", "admin_zone"])
+        for filepath in self.base_dir.rglob("*.csv"):
+            country = filepath.parents[0].name
+
+            admin_zone = get_admin_zone(country, dg_shp)
+
+            # If country is not in cc.COUNTRIES then skip
+            # HACK: Skip korea for now, as it is giving errors
+            if country == "republic_of_korea":
+                continue
+
+            # Get name of directory one level up
+            process_type = filepath.parents[1].name
+
+            # Get name of file
+            filename = filepath.name
+
+            # Add to dataframe
+            df_files.loc[len(df_files)] = [process_type, filepath, filename, admin_zone]
+
+        # Exclude those rows where directory is processed and file is already in
+        # processed_include_fall directory
+        no_fall = df_files["directory"] == "processed"
+        include_fall = df_files[df_files["directory"] == "processed_include_fall"][
+            "filename"
+        ]
+
+        df_files = df_files[~(no_fall & (df_files["filename"].isin(include_fall)))]
+
+        return df_files
+
+    def process_combinations(self, df, method):
+        """
+        Create a list of tuples of the following:
+        - directory: name of directory where file is located
+        - path: full path to file
+        - filename: name of file
+        - method: whether to compute indices for phenological stages or not
+        This tuple will be used as input to the `process` function
+        :param df:
+        :param method:
+        :return:
+        """
+        combinations = []
+
+        for index, row in tqdm(df.iterrows()):
+            combinations.extend(
+                list(
+                    itertools.product([row[0]], [row[1]], [row[2]], [row[3]], [method])
+                )
+            )
+
+        combinations = remove_duplicates(combinations)
+
+        return combinations
+
+    def main(self, method):
+        """
+
+        :param method:
+        :return:
+        """
+        # Create a dataframe of the files to be analyzed
+        df_files = self.collect_files()
+
+        combinations = self.process_combinations(df_files, method)
+
+        # Add an element to the tuple to indicate the season
+        # Last element is the redo flag which is True if the analysis is to be
+        # redone and False otherwise. Analysis is always redone for the current
+        # year and last year whether the file exists or not
+        combinations = [
+            (
+                self.parser,
+                status,
+                path,
+                filename,
+                admin_zone,
+                category,
+                year,
+                "ndvi",
+                False,  # redo
+            )
+            for year in range(2024, ar.utcnow().year + 1)
+            for status, path, filename, admin_zone, category in combinations
+        ]
+
+        # Only keep those entries in combinations whose filename (fourth
+        # element) matches malawi_maize_s1
+        # This is done to test the code for this country
+        combinations = [
+            i
+            for i in combinations
+            if "malawi_maize_s1" in i[3]
+        ]
+
+        if False:
+            num_cpu = int(cpu_count() * 0.3)
+            with Pool(num_cpu) as p:
+                for i, _ in enumerate(p.imap_unordered(indices.process, combinations)):
+                    pass
+        else:
+            # Use the code below if you want to test without parallelization or
+            # if you want to debug by using pdb
+            pbar = tqdm(combinations)
+            for i, val in enumerate(pbar):
+                pbar.set_description(
+                    f"Main loop {combinations[i][2]} {combinations[i][5]}"
+                )
+                indices.process(val)
+
+
+def run(path_config_files=[]):
+    """
+
+    Args:
+        path_config_files:
+
+    Returns:
+
+    """
+    # Check dictionary keys to have no spaces
+    indices.validate_index_definitions()
+
+    for method in [
+        "biweekly_r",  # "dekad_r"
+    ]:  # , "full_season", "phenological_stages", "fraction_season"]:
+        obj = cei_runner(path_config_files)
+        obj.main(method)
+
+
+if __name__ == "__main__":
+    run()
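The core of `main` is an expand-then-filter step: every collected file row is crossed with every year to build one task tuple per (file, year), then tuples are kept only if the filename matches the country under test. A standalone sketch of that idiom, with made-up rows and with the leading config-parser element omitted (so the filename sits at index 2 here rather than index 3):

```python
# Hypothetical (status, path, filename, admin_zone, category) rows
rows = [
    ("processed", "/data/malawi/malawi_maize_s1.csv", "malawi_maize_s1.csv", "admin_1", "maize"),
    ("processed", "/data/kenya/kenya_maize.csv", "kenya_maize.csv", "admin_1", "maize"),
]
years = range(2024, 2026)

# One task per (row, year) pair
tasks = [
    (status, path, filename, admin_zone, category, year)
    for year in years
    for status, path, filename, admin_zone, category in rows
]

# Keep only the file under test
tasks = [t for t in tasks if "malawi_maize_s1" in t[2]]
print(len(tasks))  # 2: one task per year for the Malawi file
```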
@@ -246,6 +246,7 @@ def all_correlated_feature_by_time(df, **kwargs):
     Returns:

     """
+    THRESHOLD = 0.1
     national_correlation = kwargs.get("national_correlation")
     group_by = kwargs.get("groupby")
     combined_dict = kwargs.get("combined_dict")
@@ -260,9 +261,20 @@ def all_correlated_feature_by_time(df, **kwargs):
         ):
             df_corr = _all_correlated_feature_by_time(group, **kwargs)

+            # Remove columns with more than 50% NaN values
+            df_corr = df_corr.dropna(thresh=len(df_corr) / 2, axis=1)
+
             if not df_corr.empty:
-                df_tmp = df_corr[df_corr.columns[(df_corr.mean() > 0.1)]]
-                dict_selected_features[region_id] = df_tmp.columns
+                df_tmp = df_corr[df_corr.columns[(abs(df_corr.mean()) > THRESHOLD)]]
+                # Record the surviving columns along with their absolute median value
+                absolute_medians = df_tmp.abs().median()
+
+                # Create a DataFrame with the column names and their absolute median values
+                absolute_median_df = absolute_medians.reset_index()
+                absolute_median_df.columns = ["CEI", "Median"]
+
+                # Add the CEI and Median value to dict_selected_features
+                dict_selected_features[region_id] = absolute_median_df

                 df_tmp2 = (
                     df_tmp.median(axis=0)
@@ -290,24 +302,31 @@ def all_correlated_feature_by_time(df, **kwargs):
         else:
             # HACK
             df_corr = _all_correlated_feature_by_time(df, **kwargs)
-            dict_selected_features[region_id] = df_corr.columns
-            dict_best_cei[region_id] = {}

-            # dict_selected_features[region_id] = dict_selected_features[0]
-            # dict_best_cei[region_id] = dict_best_cei[0]
-            # Combine all unique values from the existing dictionary elements
-            # combined_metrics = set()
-            # for key in dict_selected_features:
-            #     breakpoint()
-            #     combined_metrics.update(dict_selected_features[key])
-            #
-            # # Add the combined set as a new element with key 3
-            # dict_selected_features[region_id] = sorted(list(combined_metrics))
+            df_tmp = df_corr[df_corr.columns[(abs(df_corr.mean()) > THRESHOLD)]]
+            # Record the surviving columns along with their absolute median value
+            absolute_medians = df_tmp.abs().median()
+
+            # Create a DataFrame with the column names and their absolute median values
+            absolute_median_df = absolute_medians.reset_index()
+            absolute_median_df.columns = ["CEI", "Median"]
+
+            # Add the CEI and Median value to dict_selected_features
+            dict_selected_features[region_id] = absolute_median_df
+            dict_best_cei[region_id] = {}
     else:
         df_corr = _all_correlated_feature_by_time(df, **kwargs)
-        dict_selected_features[0] = df_corr.columns
+        df_tmp = df_corr[df_corr.columns[(abs(df_corr.mean()) > THRESHOLD)]]
+        # Record the surviving columns along with their absolute median value
+        absolute_medians = df_tmp.abs().median()
+
+        # Create a DataFrame with the column names and their absolute median values
+        absolute_median_df = absolute_medians.reset_index()
+        absolute_median_df.columns = ["CEI", "Median"]
+
+        # Add the CEI and Median value to dict_selected_features
+        dict_selected_features[0] = absolute_median_df

-    df_corr = df_corr[df_corr.columns[(df_corr.mean() > 0.1)]]
     plot_feature_corr_by_time(df_corr, **kwargs)

     return dict_selected_features, dict_best_cei
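The reworked selection keeps any CEI column whose mean correlation exceeds the threshold in absolute value (so strongly negative correlates are no longer discarded, as they were by the old `mean() > 0.1` test) and records each survivor's absolute median. A small self-contained sketch of that filter with made-up correlation values:

```python
import pandas as pd

THRESHOLD = 0.1
# Rows: stages; columns: hypothetical CEIs with per-stage correlations
df_corr = pd.DataFrame(
    {"TG": [0.4, 0.5, 0.3], "DTR": [-0.6, -0.4, -0.5], "R99p": [0.05, -0.02, 0.01]}
)

# Keep columns with |mean correlation| > THRESHOLD; DTR survives despite being negative
df_tmp = df_corr[df_corr.columns[abs(df_corr.mean()) > THRESHOLD]]

# Record each surviving CEI with its absolute median correlation
absolute_median_df = df_tmp.abs().median().reset_index()
absolute_median_df.columns = ["CEI", "Median"]
print(absolute_median_df)
#    CEI  Median
# 0   TG     0.4
# 1  DTR     0.5
```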
@@ -0,0 +1,412 @@
+import os
+
+import matplotlib.pyplot as plt
+import palettable as pal
+import pandas as pd
+import seaborn as sns
+from tqdm import tqdm
+
+from geocif import utils
+from geocif.ml import embedding
+from geocif.ml import stages
+
+
+def most_correlated_feature_by_time(df_train, simulation_stages, target_col):
+    """
+
+    Args:
+        df_train:
+        simulation_stages:
+        target_col:
+
+    Returns:
+
+    """
+    frames = []
+
+    stages = [simulation_stages[: idx + 1] for idx in range(len(simulation_stages))]
+
+    # Only select columns that have been observed till the current stage
+    for stage in tqdm(stages, leave=False, desc="Compute most correlated feature"):
+        current_feature_set = [
+            col for col in df_train.columns if col.endswith(f"_{stage[-1]}")
+        ]
+
+        # Get the most correlated feature for each region
+        top_feature_by_region, counter = embedding.get_top_correlated_features(
+            df_train[current_feature_set + ["Region"]],
+            df_train[target_col],
+        )
+
+        # Create a dataframe with the most common top feature and number of occurrences over timestep
+        _feature = counter.most_common(1)[0][0]
+        # Loop through top_feature_by_region and find the average score for
+        # _feature (e.g. 'DTR_36')
+        _feature_scores = [
+            value[1][0]
+            for key, value in top_feature_by_region.items()
+            if _feature in value[0]
+        ]
+        average_score = sum(_feature_scores) / len(_feature_scores)
+        _feature = utils.remove_last_part(_feature)
+
+        df = pd.DataFrame(
+            {
+                "Stage": [stage[-1]],
+                "Date": [utils.dict_growth_stages[stage[-1]]],
+                "Feature with Highest Correlation": [counter.most_common(1)[0][0]],
+                "Feature Category": [_feature],
+                "Score": [average_score],
+                # "Type": [ci.dict_indices[_feature][0]],
+                "Number of Occurrences": [counter.most_common(1)[0][1]],
+                # "Current Feature Set": [current_feature_set],
+            }
+        )
+        frames.append(df)
+
+    df_most_corr_feature_by_time = pd.concat(frames)
+
+
+def plot_feature_corr_by_time(df, **kwargs):
+    country = kwargs.get("country")
+    crop = kwargs.get("crop")
+    dir_output = kwargs.get("dir_output")
+    forecast_season = kwargs.get("forecast_season")
+    national_correlation = kwargs.get("national_correlation")
+    group_by = kwargs.get("groupby")
+
+    # Setup the figure and gridspec
+    fig = plt.figure(figsize=(10, 5))
+    gs = fig.add_gridspec(
+        3, 2, height_ratios=[6, 5, 1], width_ratios=[5, 1.5], hspace=0.6, wspace=0.0
+    )
+
+    # Assign subplots
+    ax_heatmap = fig.add_subplot(gs[0:2, 0])
+    ax_map = fig.add_subplot(gs[0, 1])
+    cbar_ax = fig.add_subplot(gs[2, 0])
+    ax4 = fig.add_subplot(gs[2, 1])
+
+    # Transpose and reverse the columns of the dataframe
+    # breakpoint()
+    # Only select the following columns:
+    df = df[
+        [
+            "TG",
+            "TG10p",
+            "DTR",
+            "vDTR",
+            "R99p",
+            "RX5day",
+            "MEAN_ESI4WK",
+        ]
+    ]
+    df_transpose = df.T
+    df = df_transpose[df_transpose.columns[::-1]]
+
+    # Split column names and only use value before space
+    df.columns = df.columns.str.split(" ").str[0]
+    # In row names, replace ESI4WK by ES
+    df.index = df.index.str.replace("MEAN_ESI4WK", "ZScore_ES")
+    df.index = df.index.str.replace("R99p", "MEAN_SM")
+    df.index = df.index.str.replace("RX5day", "AUC_SM")
+    # Select the first, third and fifth column
+    df = df[["Dec", "Feb", "Apr"]]
+    # Rename the columns to the growth periods they span
+    df.columns = [
+        "Planting - Early Vegetative",
+        "Early Vegetative - Senescence",
+        "Senescence - Harvest",
+    ]
+    ax_heatmap = sns.heatmap(
+        df,
+        ax=ax_heatmap,
+        annot=True,
+        cmap=pal.cartocolors.diverging.Earth_5.get_mpl_colormap(),
+        fmt=".2f",
+        square=False,
+        linewidths=0.5,
+        linecolor="white",
+        cbar_ax=cbar_ax,
+        cbar_kws={"orientation": "horizontal"},  # , "shrink": 0.5},
+        annot_kws={"size": 6},
+        xticklabels=True,
+        yticklabels=True,
+    )
+    ax_heatmap.tick_params(left=False, bottom=False)
+
+    # Plot the map using GeoPandas
+    dg_country = kwargs.get("dg_country")
+
+    ax_map = dg_country.plot(
+        ax=ax_map,
+        color="white",
+        edgecolor="black",
+        linewidth=1.0,
+        facecolor=None,
+        legend=False,
+    )
+
+    if not national_correlation:
+        id = kwargs["region_id"]
+        dg_region = dg_country[dg_country[group_by] == id]
+        ax_map = dg_region.plot(
+            ax=ax_map, color="blue", edgecolor="blue", linewidth=1.0, legend=False
+        )
+        # Set title with color blue
+        ax_map.set_title(f"Region: {id}", color="blue")
+
+    # No colorbar for the map
+    ax_map.axis("off")
+    # Remove borders
+    ax_map.spines["top"].set_visible(False)
+    ax_map.spines["right"].set_visible(False)
+    ax_map.spines["bottom"].set_visible(False)
+    ax_map.spines["left"].set_visible(False)
+    # ax4 should not be visible
+    ax4.axis("off")
+
+    # Add colorbar label
+    # cbar_ax.set_xlabel("Correlation Coefficient", labelpad=3, size="small")
+    cbar_ax.set_title("Correlation Coefficient", loc="left", size="small")
+    ax_heatmap.set_xticklabels(
+        ax_heatmap.get_xticklabels(), size="x-small", rotation=0, fontsize=7
+    )
+    ax_heatmap.set_yticklabels(ax_heatmap.get_yticklabels(), size="x-small", fontsize=7)
+    ax_heatmap.set_xlabel("")
+    ax_heatmap.set_ylabel(" ")
+    # Reduce font size of ticks of colorbar
+    cbar_ax.tick_params(axis="both", which="major", labelsize=6)
+
+    _country = country.title().replace("_", " ")
+    _crop = crop.title().replace("_", " ")
+    if not national_correlation:
+        fname = f"{country}_{crop}_{id}_corr_feature_by_time.png"
+    else:
+        fname = f"{country}_{crop}_corr_feature_by_time.png"
+    ax_heatmap.set_title(f"{_country}\n{_crop}")
+
+    # plt.tight_layout()
+    os.makedirs(dir_output, exist_ok=True)
+    plt.savefig(dir_output / fname, dpi=250)
+    plt.close()
+
+
+def _all_correlated_feature_by_time(df, **kwargs):
+    """
+
+    Args:
+        df:
+        **kwargs:
+
+    Returns:
+
+    """
+    frames = []
+    all_stages = kwargs.get("all_stages")
+    target_col = kwargs.get("target_col")
+    method = kwargs.get("method")
+
+    longest_stage = max(all_stages, key=len)
+
+    # Split the original string into a list of its parts
+    longest_stage = longest_stage.split("_")
+
+    # Generate the list of strings, removing one element from the start each time
+    stages_features = ["_".join(longest_stage[i:]) for i in range(len(longest_stage))]
+
+    # Drop columns with no yield information
+    df = df.dropna(subset=[target_col])
+
+    # Only select columns that have been observed till the current stage
+    pbar = tqdm(stages_features, total=len(stages_features), leave=False)
+    for stage in pbar:
+        pbar.set_description("Calculating correlations")
+        pbar.update()
+
+        stage_name = stages.get_stage_information_dict(f"GD4_{stage}", method)[
+            "Stage Name"
+        ]
+        # starting_stage = stage_name.split("-")[0]
+        current_feature_set = [col for col in df.columns if stage_name in col]
+
+        # Get the most correlated feature for each region
+        df_tmp = embedding.get_all_features_correlation(
+            df[current_feature_set + ["Region"]], df[target_col], method
+        )
+
+        frames.append(df_tmp)
+
+    df_results = pd.concat(frames)
+    if not df_results.empty:
+        # Exclude Region column
+        df_results = df_results.drop(columns="Region")
+        # Groupby Dekad and compute mean of all columns apart from Region
+        df_results = df_results.groupby(method).mean()
+
+        all_stage_names = []
+        for stage in stages_features:
+            _tmp = stages.get_stage_information_dict(f"GD4_{stage}", method)[
+                "Stage Name"
+            ]
+            all_stage_names.append(_tmp)
+
+        df_results = df_results.reindex(all_stage_names)
+
+        # Drop rows with all NaN values
+        df_results = df_results.dropna(how="all")
+
+        # Split the index based on - and only keep the first element
+        df_results.index = df_results.index.str.split("-").str[0]
+
+        return df_results
+    else:
+        return pd.DataFrame()
+
+
+def all_correlated_feature_by_time(df, **kwargs):
+    """
+
+    Args:
+        df:
+        **kwargs:
+
+    Returns:
+
+    """
+    THRESHOLD = 0.1
+    national_correlation = kwargs.get("national_correlation")
+    group_by = kwargs.get("groupby")
+    combined_dict = kwargs.get("combined_dict")
+
+    dict_selected_features = {}
+    dict_best_cei = {}
+
+    if not national_correlation:
+        groups = df.groupby(group_by)
+        for region_id, group in tqdm(
+            groups, desc=f"Compute all correlated feature by {group_by}", leave=False
+        ):
+            df_corr = _all_correlated_feature_by_time(group, **kwargs)
+
+            # Remove columns with more than 50% NaN values
+            df_corr = df_corr.dropna(thresh=len(df_corr) / 2, axis=1)
+
+            if not df_corr.empty:
+                df_tmp = df_corr[df_corr.columns[(abs(df_corr.mean()) > THRESHOLD)]]
+                # Record the surviving columns along with their absolute median value
+                absolute_medians = df_tmp.abs().median()
+
+                # Create a DataFrame with the column names and their absolute median values
+                absolute_median_df = absolute_medians.reset_index()
+                absolute_median_df.columns = ["CEI", "Median"]
+
+                # Add the CEI and Median value to dict_selected_features
+                dict_selected_features[region_id] = absolute_median_df
+
+                df_tmp2 = (
+                    df_tmp.median(axis=0)
+                    .abs()
+                    .sort_values(ascending=False)
+                    .reset_index()
+                )
+                df_tmp2.columns = ["Metric", "Value"]
+                # Add another column based on Type of Metric
+                for idx, row in df_tmp2.iterrows():
+                    df_tmp2.loc[idx, "Type"] = combined_dict[row[0]][0]
+
+                # Compute median of each CEI and sort the dataframe based on the absolute value of the median
+                dict_best_cei[region_id] = (
+                    df_tmp2.groupby("Type")
+                    .max()
+                    .reset_index()
+                    .sort_values("Value", ascending=False)["Metric"]
+                    .values
+                )
+
+                kwargs["region_id"] = region_id
+                plot_feature_corr_by_time(df_tmp, **kwargs)
+                # For each element in dict_best_cei, add the type of the cei
+            else:
+                # HACK
+                df_corr = _all_correlated_feature_by_time(df, **kwargs)
+
+                df_tmp = df_corr[df_corr.columns[(abs(df_corr.mean()) > THRESHOLD)]]
+                # Record the surviving columns along with their absolute median value
+                absolute_medians = df_tmp.abs().median()
+
+                # Create a DataFrame with the column names and their absolute median values
+                absolute_median_df = absolute_medians.reset_index()
+                absolute_median_df.columns = ["CEI", "Median"]
+
+                # Add the CEI and Median value to dict_selected_features
+                dict_selected_features[region_id] = absolute_median_df
+                dict_best_cei[region_id] = {}
+    else:
+        df_corr = _all_correlated_feature_by_time(df, **kwargs)
+        df_tmp = df_corr[df_corr.columns[(abs(df_corr.mean()) > THRESHOLD)]]
+        # Record the surviving columns along with their absolute median value
+        absolute_medians = df_tmp.abs().median()
+
+        # Create a DataFrame with the column names and their absolute median values
+        absolute_median_df = absolute_medians.reset_index()
+        absolute_median_df.columns = ["CEI", "Median"]
+
+        # Add the CEI and Median value to dict_selected_features
+        dict_selected_features[0] = absolute_median_df
+
+        plot_feature_corr_by_time(df_corr, **kwargs)
+
+    return dict_selected_features, dict_best_cei
+
+
+def feature_correlation_by_time(**kwargs):
+    raise NotImplementedError()
+
+    frames = []
+    simulation_stages = kwargs.get("simulation_stages")
+    df_train = kwargs.get("df_train")
+    target_col = kwargs.get("target_col")
+
+    stages = [simulation_stages[: idx + 1] for idx in range(len(simulation_stages))]
+
+    # Only select columns that have been observed till the current stage
+    for stage in tqdm(stages, leave=False, desc="Compute feature correlation by time"):
+        current_feature_set = [
+            col for col in df_train.columns if col.endswith(f"_{stage[-1]}")
+        ]
+
+        # Get the most correlated feature for each region
+        top_feature_by_region, counter = embedding.compute_feature_correlations(
+            df_train[current_feature_set + ["Region"]],
+            df_train[target_col],
+            "all",
+        )
+
+        # Create a dataframe with the most common top feature and number of occurrences over timestep
+        _feature = counter.most_common(1)[0][0]
+        # Loop through top_feature_by_region and find the average score for
+        # _feature (e.g. 'DTR_36')
+        _feature_scores = [
+            value[1][0]
+            for key, value in top_feature_by_region.items()
+            if _feature in value[0]
+        ]
+        average_score = sum(_feature_scores) / len(_feature_scores)
+        _feature = utils.remove_last_part(_feature)
+
+        df = pd.DataFrame(
+            {
+                "Stage": [stage[-1]],
+                "Date": [utils.dict_growth_stages[stage[-1]]],
+                "Feature with Highest Correlation": [counter.most_common(1)[0][0]],
+                "Feature Category": [_feature],
+                "Score": [average_score],
+                # "Type": [ci.dict_indices[_feature][0]],
+                "Number of Occurrences": [counter.most_common(1)[0][1]],
+                # "Current Feature Set": [current_feature_set],
+            }
+        )
+        frames.append(df)
+
+    df_corr_feature_by_time = pd.concat(frames)
@@ -144,10 +144,13 @@ def select_stages_for_ml(stages_features, method="latest", n=100):
144
144
 
145
145
  selected_stages = []
146
146
  if method == "latest":
147
+ # Find the longest array in the list of arrays
148
+ selected_stages = [max(stages_features, key=len)]
149
+
147
150
  # Only select those arrays in the list of arrays that are starting with latest_stage
148
- for stage in stages_features:
149
- if stage[0] == latest_stage[0]:
150
- selected_stages.append(stage)
151
+ # for stage in stages_features:
152
+ # if stage[0] == latest_stage[0]:
153
+ # selected_stages.append(stage)
151
154
  elif method == "fraction":
152
155
  # Filter arrays with exactly 2 elements
153
156
  two_element_arrays = []
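With this change, `method="latest"` reduces to picking the single longest stage sequence rather than matching sequences against `latest_stage`. The core of that is just the built-in `max(..., key=len)`; a quick illustration with hypothetical cumulative stage arrays:

```python
# Hypothetical cumulative stage sequences (newest stage first)
stages_features = [
    [37],
    [37, 36],
    [37, 36, 35, 34],
    [37, 36, 35],
]

# "latest" now keeps only the longest sequence, i.e. the full season to date
selected_stages = [max(stages_features, key=len)]
print(selected_stages)  # [[37, 36, 35, 34]]
```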
@@ -1,6 +1,76 @@
-import pandas as pd
+import geopandas as gpd
+import pygmt
 import matplotlib.pyplot as plt
-import matplotlib.patches as patches
+from matplotlib.lines import Line2D
+import matplotlib.patches as mpatches
+import os
+
+filtered_shapefile_path = r"D:\Users\ritvik\projects\GEOGLAM\Input\Global_Datasets\Regions\Shps\filtered_shapefile5.shp"
+
+if not os.path.isfile(filtered_shapefile_path):
+
+    # Load the shapefile using GeoPandas
+    shapefile_path = r"D:\Users\ritvik\projects\GEOGLAM\Input\Global_Datasets\Regions\Shps\adm_shapefile.shp"
+    gdf = gpd.read_file(shapefile_path, engine="pyogrio")
+
+    # Only keep one row per ADMIN0
+    gdf = gdf.drop_duplicates(subset="ADMIN0")
+
+    sh2_path = r"D:\Users\ritvik\projects\GEOGLAM\Input\Global_Datasets\Regions\Shps\Level_1.shp"
+    gdf2 = gpd.read_file(sh2_path, engine="pyogrio")
+
+    # Subset gdf2 to the USA
+    gdf2 = gdf2[gdf2["ADM0_NAME"].isin(["United States of America"])]
+
+    # Exclude Alaska and Hawaii from the USA
+    gdf2 = gdf2[~gdf2["ADM1_NAME"].isin(["Alaska", "Hawaii"])]
+
+    # Now combine all the states into one polygon
+    gdf2 = gdf2.dissolve(by="ADM0_NAME")
+    gdf2 = gdf2.reset_index()
+
+    # Rename ADM0_NAME to ADMIN0 for consistency
+    gdf2.rename(columns={"ADM0_NAME": "ADMIN0"}, inplace=True)
+
+    # Only keep ADMIN0 and geometry columns in gdf and gdf2
+    gdf = gdf[["ADMIN0", "geometry"]]
+    gdf2 = gdf2[["ADMIN0", "geometry"]]
+
+    # Merge gdf and gdf2
+    import pandas as pd
+    gdf = pd.concat([gdf, gdf2], ignore_index=True)
+
+    # Save the filtered shapefile as a temporary file
+    gdf.to_file(filtered_shapefile_path)
+else:
+    gdf = gpd.read_file(filtered_shapefile_path, engine="pyogrio")
+
+# Create the global map with highlighted countries
+fig = pygmt.Figure()
+
+# Define the region of interest and projection
+# fig.basemap(region="g", projection="R12c/20", frame=True)
+fig.basemap(region=[-135, 60, -35, 53], projection="Q12c", frame=True)
+
+# Use the coast function to draw land and water
+fig.coast(land="lightgray", water="lightcyan")
+
+# Highlight the countries using the filtered shapefile
+fig.plot(data=filtered_shapefile_path, pen="0.35p,black")
+
+# Add hatches to Pakistan and Afghanistan
+gdf_filled = gdf[gdf["ADMIN0"].isin(["Pakistan", "Afghanistan"])]
+for _, row in gdf_filled.iterrows():
+    fill_gdf = gpd.GeoDataFrame([row], columns=gdf.columns)
+    with pygmt.helpers.GMTTempFile() as tmpfile:
+        fill_gdf.to_file(tmpfile.name, driver="GeoJSON")
+        fig.plot(data=tmpfile.name, pen="0.35p,black", fill="black@50+h")
+
+# Save the figure
+fig.savefig("global_choropleth_highlighted_v1.png", dpi=1000)
+
+# Show the figure
+fig.show()

 import matplotlib.pyplot as plt
 import cartopy.crs as ccrs
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: geocif
-Version: 0.1.33
+Version: 0.1.35
 Summary: Models to visualize and forecast crop conditions and yields
 Home-page: https://ritviksahajpal.github.io/yield_forecasting/
 Author: Ritvik Sahajpal
@@ -8,6 +8,7 @@ geocif/__init__.py
 geocif/analysis.py
 geocif/geocif.py
 geocif/indices_runner.py
+geocif/indices_runner_v2.py
 geocif/logger.py
 geocif/utils.py
 geocif.egg-info/PKG-INFO
@@ -31,6 +32,7 @@ geocif/cei/definitions.py
 geocif/cei/indices.py
 geocif/ml/__init__.py
 geocif/ml/correlations.py
+geocif/ml/correlations_backup.py
 geocif/ml/embedding.py
 geocif/ml/feature_engineering.py
 geocif/ml/feature_selection.py
@@ -50,6 +50,6 @@ setup(
     test_suite="tests",
     tests_require=test_requirements,
     url="https://ritviksahajpal.github.io/yield_forecasting/",
-    version="0.1.33",
+    version="0.1.35",
     zip_safe=False,
 )