geocif 0.1.60__tar.gz → 0.1.62__tar.gz
This diff compares publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their respective public registries.
- {geocif-0.1.60/geocif.egg-info → geocif-0.1.62}/PKG-INFO +1 -1
- {geocif-0.1.60 → geocif-0.1.62}/geocif/analysis.py +38 -25
- {geocif-0.1.60 → geocif-0.1.62}/geocif/geocif.py +12 -2
- {geocif-0.1.60 → geocif-0.1.62}/geocif/indices_runner_angola.py +2 -2
- {geocif-0.1.60 → geocif-0.1.62}/geocif/ml/stats.py +5 -2
- {geocif-0.1.60 → geocif-0.1.62}/geocif/ml/trainers.py +1 -1
- geocif-0.1.62/geocif/playground/area.py +117 -0
- geocif-0.1.62/geocif/viz/tmp.py +268 -0
- {geocif-0.1.60 → geocif-0.1.62/geocif.egg-info}/PKG-INFO +1 -1
- {geocif-0.1.60 → geocif-0.1.62}/geocif.egg-info/SOURCES.txt +2 -0
- {geocif-0.1.60 → geocif-0.1.62}/setup.py +1 -1
- {geocif-0.1.60 → geocif-0.1.62}/LICENSE +0 -0
- {geocif-0.1.60 → geocif-0.1.62}/MANIFEST.in +0 -0
- {geocif-0.1.60 → geocif-0.1.62}/README.md +0 -0
- {geocif-0.1.60 → geocif-0.1.62}/geocif/__init__.py +0 -0
- {geocif-0.1.60 → geocif-0.1.62}/geocif/agmet/__init__.py +0 -0
- {geocif-0.1.60 → geocif-0.1.62}/geocif/agmet/geoagmet.py +0 -0
- {geocif-0.1.60 → geocif-0.1.62}/geocif/agmet/plot.py +0 -0
- {geocif-0.1.60 → geocif-0.1.62}/geocif/agmet/utils.py +0 -0
- {geocif-0.1.60 → geocif-0.1.62}/geocif/backup/__init__.py +0 -0
- {geocif-0.1.60 → geocif-0.1.62}/geocif/backup/constants.py +0 -0
- {geocif-0.1.60 → geocif-0.1.62}/geocif/backup/features.py +0 -0
- {geocif-0.1.60 → geocif-0.1.62}/geocif/backup/geo.py +0 -0
- {geocif-0.1.60 → geocif-0.1.62}/geocif/backup/geocif.py +0 -0
- {geocif-0.1.60 → geocif-0.1.62}/geocif/backup/metadata.py +0 -0
- {geocif-0.1.60 → geocif-0.1.62}/geocif/backup/models.py +0 -0
- {geocif-0.1.60 → geocif-0.1.62}/geocif/cei/__init__.py +0 -0
- {geocif-0.1.60 → geocif-0.1.62}/geocif/cei/definitions.py +0 -0
- {geocif-0.1.60 → geocif-0.1.62}/geocif/cei/indices.py +0 -0
- {geocif-0.1.60 → geocif-0.1.62}/geocif/experiments.py +0 -0
- {geocif-0.1.60 → geocif-0.1.62}/geocif/geocif_runner.py +0 -0
- {geocif-0.1.60 → geocif-0.1.62}/geocif/indices_runner.py +0 -0
- {geocif-0.1.60 → geocif-0.1.62}/geocif/indices_runner_madagascar.py +0 -0
- {geocif-0.1.60 → geocif-0.1.62}/geocif/indices_runner_malawi.py +0 -0
- {geocif-0.1.60 → geocif-0.1.62}/geocif/indices_runner_mozambique.py +0 -0
- {geocif-0.1.60 → geocif-0.1.62}/geocif/indices_runner_south_africa.py +0 -0
- {geocif-0.1.60 → geocif-0.1.62}/geocif/indices_runner_zambia.py +0 -0
- {geocif-0.1.60 → geocif-0.1.62}/geocif/indices_runner_zimbabwe.py +0 -0
- {geocif-0.1.60 → geocif-0.1.62}/geocif/logger.py +0 -0
- {geocif-0.1.60 → geocif-0.1.62}/geocif/ml/__init__.py +0 -0
- {geocif-0.1.60 → geocif-0.1.62}/geocif/ml/correlations.py +0 -0
- {geocif-0.1.60 → geocif-0.1.62}/geocif/ml/embedding.py +0 -0
- {geocif-0.1.60 → geocif-0.1.62}/geocif/ml/feature_engineering.py +0 -0
- {geocif-0.1.60 → geocif-0.1.62}/geocif/ml/feature_selection.py +0 -0
- {geocif-0.1.60 → geocif-0.1.62}/geocif/ml/outliers.py +0 -0
- {geocif-0.1.60 → geocif-0.1.62}/geocif/ml/outlook.py +0 -0
- {geocif-0.1.60 → geocif-0.1.62}/geocif/ml/output.py +0 -0
- {geocif-0.1.60 → geocif-0.1.62}/geocif/ml/spatial_autocorrelation.py +0 -0
- {geocif-0.1.60 → geocif-0.1.62}/geocif/ml/stages.py +0 -0
- {geocif-0.1.60 → geocif-0.1.62}/geocif/ml/trend.py +0 -0
- {geocif-0.1.60 → geocif-0.1.62}/geocif/ml/xai.py +0 -0
- {geocif-0.1.60 → geocif-0.1.62}/geocif/mm.py +0 -0
- {geocif-0.1.60 → geocif-0.1.62}/geocif/playground/__init__.py +0 -0
- {geocif-0.1.60 → geocif-0.1.62}/geocif/playground/aa.py +0 -0
- {geocif-0.1.60 → geocif-0.1.62}/geocif/playground/automl.py +0 -0
- {geocif-0.1.60 → geocif-0.1.62}/geocif/playground/download_esi.py +0 -0
- {geocif-0.1.60 → geocif-0.1.62}/geocif/playground/enso.py +0 -0
- {geocif-0.1.60 → geocif-0.1.62}/geocif/playground/eval.py +0 -0
- {geocif-0.1.60 → geocif-0.1.62}/geocif/playground/gamtest.py +0 -0
- {geocif-0.1.60 → geocif-0.1.62}/geocif/playground/gee_access.py +0 -0
- {geocif-0.1.60 → geocif-0.1.62}/geocif/playground/misc.py +0 -0
- {geocif-0.1.60 → geocif-0.1.62}/geocif/playground/play_xagg.py +0 -0
- {geocif-0.1.60 → geocif-0.1.62}/geocif/playground/reg.py +0 -0
- {geocif-0.1.60 → geocif-0.1.62}/geocif/playground/sustain.py +0 -0
- {geocif-0.1.60 → geocif-0.1.62}/geocif/playground/test_catboost.py +0 -0
- {geocif-0.1.60 → geocif-0.1.62}/geocif/playground/tmp.py +0 -0
- {geocif-0.1.60 → geocif-0.1.62}/geocif/playground/tmp2.py +0 -0
- {geocif-0.1.60 → geocif-0.1.62}/geocif/playground/tmp3.py +0 -0
- {geocif-0.1.60 → geocif-0.1.62}/geocif/playground/tmp4.py +0 -0
- {geocif-0.1.60 → geocif-0.1.62}/geocif/playground/tmp5.py +0 -0
- {geocif-0.1.60 → geocif-0.1.62}/geocif/risk/__init__.py +0 -0
- {geocif-0.1.60 → geocif-0.1.62}/geocif/risk/impact_assessment.py +0 -0
- {geocif-0.1.60 → geocif-0.1.62}/geocif/utils.py +0 -0
- {geocif-0.1.60 → geocif-0.1.62}/geocif/viz/__init__.py +0 -0
- {geocif-0.1.60 → geocif-0.1.62}/geocif/viz/plot.py +0 -0
- {geocif-0.1.60 → geocif-0.1.62}/geocif.egg-info/dependency_links.txt +0 -0
- {geocif-0.1.60 → geocif-0.1.62}/geocif.egg-info/not-zip-safe +0 -0
- {geocif-0.1.60 → geocif-0.1.62}/geocif.egg-info/top_level.txt +0 -0
- {geocif-0.1.60 → geocif-0.1.62}/requirements.txt +0 -0
- {geocif-0.1.60 → geocif-0.1.62}/setup.cfg +0 -0
- {geocif-0.1.60 → geocif-0.1.62}/tests/test_geocif.py +0 -0
{geocif-0.1.60 → geocif-0.1.62}/geocif/analysis.py

@@ -186,11 +186,6 @@ class Geoanalysis:
         return df_metrics, df_regional_metrics, df_national_yield
 
     def _clean_data(self):
-        # Hack exclude 2012 if country == "illinois"
-        if self.country == "illinois":
-            self.df_analysis = self.df_analysis[
-                self.df_analysis["Harvest Year"] != 2012
-            ]
         # Remove rows with missing values in Observed Yield (tn per ha)
         return self.df_analysis.dropna(subset=["Observed Yield (tn per ha)"])
 

@@ -543,34 +538,52 @@ class Geoanalysis:
         country = self.country.title().replace("_", " ")
         crop = self.crop.title().replace("_", " ")
         file = dir_statistics / f"{country}_{crop}_statistics_s1_{self.method}.csv"
-
-
-        df_historic = df_historic[["Region", "Harvest Year", "Yield (tn per ha)"]]
+        df_all = pd.read_csv(file)
 
-        #
-
+        # Keep only the relevant columns and drop NaNs
+        df_all = df_all[["Region", "Harvest Year", "Yield (tn per ha)"]].dropna()
 
-        #
-        years
+        # --- For computing the % of total production ---
+        # Determine unique years and sort them (in case they aren't already)
+        years = sorted(df_all["Harvest Year"].unique())
+        # Subset dataframe to include only the last 5 years of the dataset
+        last_five_years = years[-5:]
+        df_recent = df_all[df_all["Harvest Year"].isin(last_five_years)]
 
-        #
-
-
-        # For each region, compute the % of the total production
-        df_historic = (
-            df_historic.groupby("Region")["Yield (tn per ha)"]
+        # For each region, compute the % of total production (using yield sum over the last five years)
+        df_pct = (
+            df_recent.groupby("Region")["Yield (tn per ha)"]
             .sum()
             .pipe(lambda x: x / x.sum() * 100)
             .to_frame(name="% of total Area (ha)")
             .reset_index()
         )
-
-        #
-        #
-
-
-
-
+
+        # --- For computing median yields ---
+        # Compute median yield for 2014 - 2018
+        df_median_2014_2018 = (
+            df_all[df_all["Harvest Year"].between(2014, 2018)]
+            .groupby("Region")["Yield (tn per ha)"]
+            .median()
+            .rename(f"Median Yield (tn per ha) (2014-2018)")
+            .reset_index()
+        )
+
+        # Compute median yield for 2013 - 2017
+        df_median_2013_2017 = (
+            df_all[df_all["Harvest Year"].between(2013, 2017)]
+            .groupby("Region")["Yield (tn per ha)"]
+            .median()
+            .rename("Median Yield (tn per ha) (2013-2017)")
+            .reset_index()
+        )
+
+        # Merge the median yield columns with the % of total production dataframe
+        df_historic = (
+            df_pct
+            .merge(df_median_2014_2018, on="Region", how="left")
+            .merge(df_median_2013_2017, on="Region", how="left")
+        )
 
         return df_historic
 
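The reworked block above now derives everything from the statistics CSV: a percent-of-total share over the last five years of data, plus two fixed-window median-yield baselines merged back in by region. A minimal sketch of the same pipeline on synthetic data (column names are taken from the diff; the toy values and the `median_window` helper are illustrative only):

```python
import pandas as pd

# Toy stand-in for the statistics CSV read in analysis.py (column names from the diff)
df_all = pd.DataFrame({
    "Region": ["A", "A", "B", "B"] * 3,
    "Harvest Year": [2013, 2014, 2015, 2016, 2017, 2018] * 2,
    "Yield (tn per ha)": [1.0, 1.2, 2.0, 2.2, 1.1, 2.1] * 2,
})

# % of total production over the last five years of data
years = sorted(df_all["Harvest Year"].unique())
df_recent = df_all[df_all["Harvest Year"].isin(years[-5:])]
df_pct = (
    df_recent.groupby("Region")["Yield (tn per ha)"]
    .sum()
    .pipe(lambda x: x / x.sum() * 100)
    .to_frame(name="% of total Area (ha)")
    .reset_index()
)

# Fixed-window median baselines, merged on Region (hypothetical helper)
def median_window(lo, hi):
    return (
        df_all[df_all["Harvest Year"].between(lo, hi)]
        .groupby("Region")["Yield (tn per ha)"]
        .median()
        .rename(f"Median Yield (tn per ha) ({lo}-{hi})")
        .reset_index()
    )

df_historic = (
    df_pct
    .merge(median_window(2014, 2018), on="Region", how="left")
    .merge(median_window(2013, 2017), on="Region", how="left")
)
print(df_historic)
```

One quirk worth noting in the hunk itself: the share is computed from summed yields, but it is stored under the column name "% of total Area (ha)", so the label and the quantity disagree.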
{geocif-0.1.60 → geocif-0.1.62}/geocif/geocif.py

@@ -587,6 +587,16 @@ class Geocif:
            df_region[f"Median {self.target}"].values, 3
        )
 
+        if f"Median {self.target} (2014-2018)" in df_region.columns:
+            df.loc[:, f"Median {self.target} (2014-2018)"] = np.around(
+                df_region[f"Median {self.target} (2014-2018)"].values, 3
+            )
+
+        if f"Median {self.target} (2013-2017)" in df_region.columns:
+            df.loc[:, f"Median {self.target} (2013-2017)"] = np.around(
+                df_region[f"Median {self.target} (2013-2017)"].values, 3
+            )
+
        if self.estimate_ci:
            if self.estimate_ci_for_all or self.forecast_season == self.today_year:
                # Iterate over each element in y_pred_ci

@@ -730,8 +740,6 @@
 
        if self.median_yield_as_feature:
            self.feature_names.append(f"Median {self.target}")
-            self.feature_names.append(f"Median {self.target} (2014-2018)")
-            self.feature_names.append(f"Median {self.target} (2013-2017)")
 
        if self.lag_yield_as_feature:
            # For the number of years specified in self.number_lag_years

@@ -801,6 +809,8 @@
            + self.statistics_columns
            + self.feature_names
            + [f"Median {self.target}"]
+            + [f"Median {self.target} (2014-2018)"]
+            + [f"Median {self.target} (2013-2017)"]
            + ["Region_ID"]
        )
        if self.check_yield_trend:
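Taken together, the three geocif.py hunks stop feeding the windowed medians to the model as features, but still carry them through to the output frame, guarded by a column-existence check so runs without those baselines keep working. A minimal sketch of that guard pattern (the frames here are hypothetical stand-ins for `df` and `df_region`):

```python
import numpy as np
import pandas as pd

# Hypothetical stand-ins for df / df_region in geocif.py
df = pd.DataFrame({"Region": ["A", "B"]})
df_region = pd.DataFrame({"Median Yield (tn per ha) (2014-2018)": [1.234567, 2.345678]})

# Only copy a median column into the output if the source frame actually has it
col = "Median Yield (tn per ha) (2014-2018)"
if col in df_region.columns:
    df.loc[:, col] = np.around(df_region[col].values, 3)
print(df)
```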
{geocif-0.1.60 → geocif-0.1.62}/geocif/indices_runner_angola.py

@@ -12,7 +12,7 @@ warnings.filterwarnings("ignore")
 from .cei import indices
 from geoprepare import base
 
-country = "
+country = "wolayita"
 
 def remove_duplicates(lst):
     """

@@ -174,7 +174,7 @@ class cei_runner(base.BaseGeo):
        combinations = [i for i in combinations if f"{country}_maize_s1" in i[3]]
 
        if True:
-            num_cpu = int(cpu_count() * 0.
+            num_cpu = int(cpu_count() * 0.5)
            with Pool(num_cpu) as p:
                for i, _ in enumerate(p.imap_unordered(indices.process, combinations)):
                    pass
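The runner now pins its worker pool to half the available cores (the old multiplier is truncated in the diff display, so the previous fraction is unknown). A small sketch of the pattern, with a hypothetical stand-in for `indices.process`; the `max(1, ...)` floor is an added safeguard, not part of the source:

```python
from multiprocessing import Pool, cpu_count

def process(combination):
    # Hypothetical stand-in for indices.process
    return combination

if __name__ == "__main__":
    combinations = list(range(10))
    # Use roughly half the cores, but never fewer than one
    num_cpu = max(1, int(cpu_count() * 0.5))
    with Pool(num_cpu) as p:
        for _ in p.imap_unordered(process, combinations):
            pass
```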
{geocif-0.1.60 → geocif-0.1.62}/geocif/ml/stats.py

@@ -80,8 +80,11 @@ def get_yld_prd(df, name_crop, cntr, region, calendar_year, region_column="ADM1_
 
     # CM_Season should be 1 for the Main season
     # TODO: Make this user specified
-
-
+    if "CM_Season" in df_tmp.columns:
+        mask_cm_season = df_tmp["CM_Season"] == 1
+        val = df_tmp.loc[mask_tmp_country & mask_tmp_adm1 & mask_cm_season][calendar_year]
+    else:
+        val = df_tmp.loc[mask_tmp_country & mask_tmp_adm1][calendar_year]
 
     try:
         if val.isnull().all():
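The stats.py change makes the main-season filter conditional on the table actually having a `CM_Season` column instead of assuming it. A sketch of the guard on toy data; only `CM_Season`, the mask names, and `calendar_year` come from the diff, and the table layout (the `country` and `ADM1_NAME` columns and their values) is an assumption for illustration:

```python
import pandas as pd

# Toy stand-in for the table filtered in get_yld_prd (layout assumed)
df_tmp = pd.DataFrame({
    "country": ["angola", "angola"],
    "ADM1_NAME": ["Bie", "Bie"],
    "CM_Season": [1, 2],
    "2019": [1.4, 0.9],
})
mask_tmp_country = df_tmp["country"] == "angola"
mask_tmp_adm1 = df_tmp["ADM1_NAME"] == "Bie"
calendar_year = "2019"

# Apply the main-season filter only when the column exists
if "CM_Season" in df_tmp.columns:
    mask_cm_season = df_tmp["CM_Season"] == 1
    val = df_tmp.loc[mask_tmp_country & mask_tmp_adm1 & mask_cm_season][calendar_year]
else:
    val = df_tmp.loc[mask_tmp_country & mask_tmp_adm1][calendar_year]
print(val)
```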
{geocif-0.1.60 → geocif-0.1.62}/geocif/ml/trainers.py

@@ -268,7 +268,7 @@ def auto_train(
    loss_function = "MAPE" if model_type == "REGRESSION" else "MultiClass"
    bootstrap_type = "Bernoulli" if model_type == "CLASSIFICATION" else "MVS"
    hyperparams = {
-        "iterations":
+        "iterations": 2500,
        "learning_rate": 0.025,
        "depth": 6,
        "subsample": 1.0,
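With `iterations` now pinned at 2500 (the old value is truncated in the diff display), the dict is complete enough to drive CatBoost directly. A sketch assuming the `catboost` package; the keys shown in the hunk and the loss/bootstrap wiring come from the diff context, while the `verbose` flag and the synthetic data are added for illustration (note that MAPE requires non-zero targets):

```python
from catboost import CatBoostRegressor
import numpy as np

model_type = "REGRESSION"
loss_function = "MAPE" if model_type == "REGRESSION" else "MultiClass"
bootstrap_type = "Bernoulli" if model_type == "CLASSIFICATION" else "MVS"
hyperparams = {
    "iterations": 2500,
    "learning_rate": 0.025,
    "depth": 6,
    "subsample": 1.0,
    "loss_function": loss_function,
    "bootstrap_type": bootstrap_type,
    "verbose": False,  # assumption: keep training output quiet
}

# Tiny synthetic regression problem, just to show the dict is consumable as-is
X = np.random.rand(100, 4)
y = X @ np.array([1.0, 2.0, 0.5, -1.0]) + 5.0  # strictly positive targets for MAPE
model = CatBoostRegressor(**hyperparams)
model.fit(X, y)
print(model.predict(X[:3]))
```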
geocif-0.1.62/geocif/playground/area.py

@@ -0,0 +1,117 @@
+import pandas as pd
+import numpy as np
+import matplotlib.pyplot as plt
+import seaborn as sns
+
+# 1. Read the CSV
+df = pd.read_csv(r"C:\Users\ritvik\Downloads\ET_AgStats.csv")
+
+# 2. Filter for the crop "Maize (Corn)"
+df = df[df['DNL_SourceCrop'] == 'Maize (Corn)']
+
+# 3. Remove rows where "Area Planted: ha" is "NA" or "NC"
+df = df[df['Area Planted: ha'] != 'NA']
+df = df[df['Area Planted: ha'] != 'NC']
+
+df = df[df['Yield: MT/ha'] != 'NA']
+df = df[df['Yield: MT/ha'] != 'NC']
+df = df[df['Yield: MT/ha'] != '#REF!']
+
+# Remove rows where Admin 2 is null
+df = df.dropna(subset=['Admin 2'])
+df = df.dropna(subset=['Yield: MT/ha'])
+
+# 4. Convert "Area Planted: ha" to float by removing commas
+df['Area Planted: ha'] = (
+    df['Area Planted: ha']
+    .str.replace(',', '', regex=False)
+    .astype(float)
+)
+
+df['Yield: MT/ha'] = (
+    df['Yield: MT/ha']
+    .str.replace(',', '', regex=False)
+    .astype(float)
+)
+
+# 5. Group by [region, season] to calculate z-scores
+grouped = df.groupby(['Admin 2', 'Season'])
+anomalies_list = []
+
+for (region, season), group_data in grouped:
+    mean_area = group_data['Area Planted: ha'].mean()
+    std_area = group_data['Area Planted: ha'].std()
+
+    # Avoid division by zero
+    if std_area == 0:
+        group_data['Z_score'] = 0
+    else:
+        group_data['Z_score'] = (group_data['Area Planted: ha'] - mean_area) / std_area
+
+    # Flag anomalies if abs(z-score) > 3
+    group_data['Anomaly'] = group_data['Z_score'].apply(lambda x: 'Yes' if abs(x) > 3 else 'No')
+
+    anomalies_list.append(group_data)
+
+# 6. Concatenate grouped data back together
+df_analyzed = pd.concat(anomalies_list)
+
+# 7. Filter to see only anomalies
+df_anomalies = df_analyzed[df_analyzed['Anomaly'] == 'Yes']
+
+# 8. Print full dataset with anomaly flags and the subset of anomalies
+print("All data with anomaly flags:")
+print(df_analyzed)
+
+print("\nDetected anomalies:")
+print(df_anomalies)
+df_anomalies.to_csv(r"df_anomalies_v2.csv", index=False)
+
+# 11. Distribution of "Yield: MT/ha"
+
+plt.figure(figsize=(8, 5))
+sns.histplot(df['Yield: MT/ha'], kde=True, bins=30)
+plt.title('Distribution of Yield (MT/ha)')
+plt.xlabel('Yield (MT/ha)')
+plt.ylabel('Count')
+plt.tight_layout()
+plt.show()
+
+# count number of values where yield < 1
+low_yield = df[df['Yield: MT/ha'] < 1].shape[0]
+total = df.shape[0]
+print(f"Number of records with yield < 1: {low_yield} / {total}")
+breakpoint()
+# 9. Bar chart of number of anomalies per Season
+anomalies_by_season = df_anomalies['Season'].value_counts()
+plt.figure(figsize=(8, 5))
+anomalies_by_season.plot(kind='bar')
+plt.title('Number of Anomalies per Season')
+plt.xlabel('Season')
+plt.ylabel('Count of Anomalies')
+plt.tight_layout()
+plt.show()
+
+# 10. Heatmap of anomalies by Region (rows) and Year (columns)
+
+# Ensure "Year" is numeric for pivoting
+df_anomalies['Year'] = pd.to_numeric(df_anomalies['Year'], errors='coerce')
+
+# Count how many anomalies per (region, year)
+heatmap_data = df_anomalies.groupby(['Admin 1', 'Year']).size().unstack(fill_value=0)
+
+# Plot the heatmap
+plt.figure(figsize=(10, 6))
+sns.heatmap(
+    heatmap_data,
+    annot=True,
+    cmap='Blues',
+    fmt='d'
+)
+plt.title('Number of Anomalies by Region and Year')
+plt.xlabel('Year')
+plt.ylabel('Region')
+plt.tight_layout()
+plt.show()
+
+
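area.py computes the z-scores with a Python loop that mutates each group slice, which pandas will typically flag with `SettingWithCopyWarning`. An equivalent vectorized sketch using `groupby(...).transform`, on toy data, with the same |z| > 3 rule and the same std == 0 handling as the script:

```python
import numpy as np
import pandas as pd

# Toy stand-in for the filtered AgStats frame
df = pd.DataFrame({
    "Admin 2": ["X"] * 12 + ["Y"] * 3,
    "Season": ["Meher"] * 15,
    "Area Planted: ha": [100.0] * 11 + [500.0] + [10.0, 10.0, 10.0],
})

grp = df.groupby(["Admin 2", "Season"])["Area Planted: ha"]
mean_area = grp.transform("mean")
std_area = grp.transform("std")

# Same z-score rule as area.py, with std == 0 mapped to 0 instead of NaN/inf
df["Z_score"] = ((df["Area Planted: ha"] - mean_area) / std_area).where(std_area != 0, 0.0)
df["Anomaly"] = np.where(df["Z_score"].abs() > 3, "Yes", "No")
print(df[df["Anomaly"] == "Yes"])
```

`transform` broadcasts the per-group mean and standard deviation back onto the original rows, so no per-group loop or concat step is needed.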
geocif-0.1.62/geocif/viz/tmp.py

@@ -0,0 +1,268 @@
+import geopandas as gpd
+import pandas as pd
+import matplotlib.pyplot as plt
+import palettable as pal
+import matplotlib.colors as mcolors
+
+import pandas as pd
+import glob
+import os
+
+# 1. Specify the directory containing your .dta files:
+data_dir = r"C:\Users\ritvik\Downloads\maize_yield\maize_yield"
+
+# 2. Use glob to find all .dta files in that directory:
+dta_files = glob.glob(os.path.join(data_dir, "*.dta"))
+
+# 3. Read each .dta file into a pandas DataFrame and store in a list:
+dataframes = [pd.read_stata(f) for f in dta_files]
+
+# 4. Concatenate them all into one DataFrame (row-wise):
+merged_df = pd.concat(dataframes, ignore_index=True)
+
+merged_df['ZONE'] = merged_df['ZONE'].astype(int)
+merged_df['DIST'] = merged_df['DIST'].astype(int)
+
+# create a column called W_CODE which is set up as follows
+# create a string by converting ZONE column to string and append 0
+# to the left of the string to make it 2 characters long
+# then do the same with DIST column
+# finally concatenate the two strings
+merged_df['W_CODE'] = merged_df['ZONE'].astype(str).str.zfill(2) + merged_df['DIST'].astype(str).str.zfill(2)
+
+merged_df['W_CODE'] = '7' + merged_df['W_CODE']
+
+# Remove the .0 at the end of the string in W_CODE
+merged_df['W_CODE'] = merged_df['W_CODE'].str.replace('.0', '')
+merged_df['W_CODE'] = merged_df['W_CODE'].astype(int)
+
+dg = gpd.read_file(r"D:\Users\ritvik\projects\GEOGLAM\Input\countries\wolayita\wolayita_dissolved.shp")
+dg = dg[['W_CODE', 'W_NAME']]
+
+# Merge the two dataframes on W_CODE
+merged_df = pd.merge(merged_df, dg, on='W_CODE', how='left')
+
+# Remove rows where PROD98CQ or AREAH are null
+merged_df = merged_df.dropna(subset=['PROD98CQ', 'AREAH'])
+
+# Compte yield column
+merged_df['yield'] = merged_df['PROD98CQ'] / merged_df['AREAH']
+
+# create a new dataframe which computes average yield by W_NAME for each year
+df_avg_yield = merged_df.groupby(['W_NAME', 'YEAR'])['yield'].mean().reset_index()
+
+# Change W_NAME column to title case
+df_avg_yield['W_NAME'] = df_avg_yield['W_NAME'].str.title()
+
+# Change YEAR to int
+df_avg_yield['YEAR'] = df_avg_yield['YEAR'].astype(int)
+
+# Convert to a format where each YEAR is converted to int and becomes a column and yield is the value
+df_avg_yield = df_avg_yield.pivot(index='W_NAME', columns='YEAR', values='yield')
+
+# Remove YEAR as column name and W_NAME as index name
+df_avg_yield.index.name = None
+df_avg_yield.columns.name = None
+
+df_avg_yield.to_csv('wolayita_yields.csv')
+
+breakpoint()
+# 5. (Optional) Inspect the merged DataFrame
+print(merged_df.head())
+print(len(merged_df))
+merged_df.to_csv('merged_df.csv', index=False)
+breakpoint()
+
+import pandas as pd
+import matplotlib.pyplot as plt
+import seaborn as sns
+
+import geopandas as gpd
+dg = gpd.read_file(r"D:\Users\ritvik\projects\GEOGLAM\wolayita.shp")
+dg = dg[dg['Z_NAME'] == "Wolayita"]
+
+# Dissolve on W_NAME column
+dg = dg.dissolve(by="W_NAME")
+
+# save to disk
+dg.to_file(r"D:\Users\ritvik\projects\GEOGLAM\Input\countries\wolayita\wolayita_dissolved.shp")
+
+breakpoint()
+# 1. Load the dataset
+df = pd.read_csv('merged_df.csv')
+
+# 2. Ensure we have a 'yield' column.
+#    If not present, we compute yield as Maize_Production / Maize_Area.
+if 'yield' not in df.columns:
+    if 'PROD98CQ' in df.columns and 'AREAH' in df.columns:
+        # Compute yield in tonnes per hectare (or adjust unit if needed)
+        df['yield'] = df['PROD98CQ'] / df['AREAH']
+    else:
+        raise ValueError("The required columns to compute yield are missing.")
+
+# 3. Calculate percentage of missing data for yield
+missing_pct_yield = df['yield'].isnull().mean() * 100
+print(f"Percentage of missing data for yield: {missing_pct_yield:.2f}%")
+
+# 4. Check if some years have more or less data
+#    Count the number of records for each year
+year_counts = df['YEAR'].value_counts().sort_index()
+print("\nNumber of records per year:")
+print(year_counts)
+
+# 5. Plot histogram of yield distributions by year
+import seaborn as sns
+
+# Instead of looping and plotting histograms, we can use a boxplot
+plt.figure(figsize=(12, 8))
+
+sns.boxplot(x='YEAR', y='yield', data=df)
+
+# Add labels and title
+plt.xlabel("")
+plt.ylabel("Yield")
+
+plt.show()
+
+
+# Group by YEAR and get size (number of rows)
+df_year_counts = df.groupby('YEAR').size().reset_index(name='Count')
+# Sort by YEAR if you want ascending year order
+df_year_counts.sort_values(by='YEAR', inplace=True)
+
+plt.figure(figsize=(10, 6))
+sns.barplot(data=df_year_counts, x='YEAR', y='Count', color='skyblue', edgecolor='black')
+
+plt.xlabel("")
+plt.ylabel("Number of Yield Records")
+plt.xticks(rotation=45)  # Rotate x labels if needed
+plt.tight_layout()  # Adjust layout to avoid clipping
+plt.show()
+
+
+import pandas as pd
+import numpy as np
+import seaborn as sns
+import matplotlib.pyplot as plt
+
+# 1. Group by FA and YEAR, then calculate the mean yield
+fa_year_yield = df.groupby(['FA', 'YEAR'])['yield'].mean().reset_index()
+
+# 2. Pivot so rows = FA, columns = YEAR, values = average yield
+fa_year_pivot = fa_year_yield.pivot(index='FA', columns='YEAR', values='yield')
+
+# 3. Create the heatmap
+plt.figure(figsize=(12, 8))
+sns.heatmap(
+    fa_year_pivot,
+    cmap='viridis',   # color map; try 'coolwarm' or others
+    annot=False,      # show numeric values in each cell
+    fmt=".2f",        # format numbers (2 decimal places)
+    linewidths=.5     # line width between cells
+)
+
+plt.title("Heatmap of Average Yield by FA and YEAR")
+plt.xlabel("YEAR")
+plt.ylabel("FA")
+plt.tight_layout()
+plt.show()
+
+breakpoint()
+
+
+# --- Read and preprocess your main shapefile ---
+dg = gpd.read_file(r"D:\Users\ritvik\projects\GEOGLAM\safrica.shp")
+
+# remove rows where both ADMIN1 and ADMIN2 are null
+dg = dg.dropna(subset=["ADMIN1", "ADMIN2"], how="all")
+
+# if ADMIN2 is not null then replace ADMIN1 with ADMIN2 values
+dg["ADMIN1"] = dg["ADMIN2"].combine_first(dg["ADMIN1"])
+
+# --- Read your CSV and merge on ADMIN1 ---
+df = pd.read_csv(r"C:\Users\ritvik\Downloads\geocif.csv")
+
+dg = dg.merge(
+    df[["ADMIN1", 'Predicted Yield (tn per ha)',
+        'Median Yield (tn per ha) (2013-2017)', 'Predicted/Median']],
+    on="ADMIN1",
+    how="left"
+)
+
+# --- Create a dissolved national boundary GeoDataFrame ---
+boundary_gdf = dg.dissolve(by="ADMIN0")
+
+# --- Colormap and normalization setup ---
+cmap = pal.colorbrewer.get_map("BrBG", "diverging", 11).mpl_colormap
+norm = mcolors.TwoSlopeNorm(vmin=-40, vcenter=0, vmax=40)
+
+# --- First map: Predicted/Median ---
+fig, ax = plt.subplots(figsize=(10, 6))
+
+# 1) Plot the main layer
+dg.plot(
+    column="Predicted/Median",
+    cmap=cmap,
+    norm=norm,
+    legend=True,
+    ax=ax,
+    edgecolor='gray',
+    linewidth=0.2,
+    legend_kwds={
+        "shrink": 0.5,
+        "pad": 0.002,
+        "orientation": "horizontal"
+    }
+)
+
+url = "https://naciscdn.org/naturalearth/110m/cultural/ne_110m_admin_0_countries.zip"
+
+world = gpd.read_file(url)
+world = world[world['ADMIN'].isin(['South Africa', 'Angola', 'Malawi', 'Zambia'])]
+
+# 2) Plot the dissolved national boundaries on top
+world.plot(
+    ax=ax,
+    color="none",       # No fill
+    edgecolor="black",  # Outline color
+    linewidth=0.5
+)
+
+ax.set_title("Maize Yield Forecast % Anomaly")
+plt.axis("off")
+plt.tight_layout()
+plt.savefig("aa.png", dpi=300)
+plt.close()
+
+
+# --- Second map: Median Yield (2013-2017) ---
+# fig, ax = plt.subplots(figsize=(10, 6))
+#
+# # 1) Plot the main layer
+# dg.plot(
+#     column="Median Yield (tn per ha) (2013-2017)",
+#     cmap=cmap,
+#     legend=True,
+#     ax=ax,
+#     legend_kwds={
+#         "shrink": 0.5,
+#         "pad": 0.002,
+#         "orientation": "horizontal"
+#     }
+# )
+#
+# # 2) Plot the dissolved national boundaries on top
+# boundary_gdf.plot(
+#     ax=ax,
+#     color="none",
+#     edgecolor="black",
+#     linewidth=1
+# )
+#
+# ax.set_title("Median Maize Yield (2013-2017)")
+# plt.axis("off")
+# plt.tight_layout()
+# plt.show()
+# plt.close()
+
+breakpoint()
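The mapping section of tmp.py centers a diverging BrBG palette on zero with `TwoSlopeNorm`, so a -40% and a +40% anomaly sit at opposite ends of the scale with equal visual weight. A dependency-light sketch of just that normalization, using matplotlib's built-in `"BrBG"` colormap in place of palettable (the anomaly values are illustrative):

```python
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
import numpy as np

# Diverging colormap centered on 0: anomalies from -40% to +40% map symmetrically
norm = mcolors.TwoSlopeNorm(vmin=-40, vcenter=0, vmax=40)
cmap = plt.get_cmap("BrBG")  # built-in BrBG, standing in for palettable's version

anomalies = np.array([-35, -10, 0, 5, 25])
fig, ax = plt.subplots(figsize=(6, 1.5))
ax.imshow(anomalies[np.newaxis, :], cmap=cmap, norm=norm, aspect="auto")
ax.set_yticks([])
ax.set_xticks(range(len(anomalies)))
ax.set_xticklabels(anomalies)
ax.set_title("% anomaly mapped through TwoSlopeNorm")
plt.tight_layout()
plt.savefig("twoslope_demo.png", dpi=150)
```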
{geocif-0.1.60 → geocif-0.1.62}/geocif.egg-info/SOURCES.txt

@@ -55,6 +55,7 @@ geocif/ml/trend.py
 geocif/ml/xai.py
 geocif/playground/__init__.py
 geocif/playground/aa.py
+geocif/playground/area.py
 geocif/playground/automl.py
 geocif/playground/download_esi.py
 geocif/playground/enso.py

@@ -75,4 +76,5 @@ geocif/risk/__init__.py
 geocif/risk/impact_assessment.py
 geocif/viz/__init__.py
 geocif/viz/plot.py
+geocif/viz/tmp.py
 tests/test_geocif.py