geocif 0.1.60__tar.gz → 0.1.62__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (81)
  1. {geocif-0.1.60/geocif.egg-info → geocif-0.1.62}/PKG-INFO +1 -1
  2. {geocif-0.1.60 → geocif-0.1.62}/geocif/analysis.py +38 -25
  3. {geocif-0.1.60 → geocif-0.1.62}/geocif/geocif.py +12 -2
  4. {geocif-0.1.60 → geocif-0.1.62}/geocif/indices_runner_angola.py +2 -2
  5. {geocif-0.1.60 → geocif-0.1.62}/geocif/ml/stats.py +5 -2
  6. {geocif-0.1.60 → geocif-0.1.62}/geocif/ml/trainers.py +1 -1
  7. geocif-0.1.62/geocif/playground/area.py +117 -0
  8. geocif-0.1.62/geocif/viz/tmp.py +268 -0
  9. {geocif-0.1.60 → geocif-0.1.62/geocif.egg-info}/PKG-INFO +1 -1
  10. {geocif-0.1.60 → geocif-0.1.62}/geocif.egg-info/SOURCES.txt +2 -0
  11. {geocif-0.1.60 → geocif-0.1.62}/setup.py +1 -1
  12. {geocif-0.1.60 → geocif-0.1.62}/LICENSE +0 -0
  13. {geocif-0.1.60 → geocif-0.1.62}/MANIFEST.in +0 -0
  14. {geocif-0.1.60 → geocif-0.1.62}/README.md +0 -0
  15. {geocif-0.1.60 → geocif-0.1.62}/geocif/__init__.py +0 -0
  16. {geocif-0.1.60 → geocif-0.1.62}/geocif/agmet/__init__.py +0 -0
  17. {geocif-0.1.60 → geocif-0.1.62}/geocif/agmet/geoagmet.py +0 -0
  18. {geocif-0.1.60 → geocif-0.1.62}/geocif/agmet/plot.py +0 -0
  19. {geocif-0.1.60 → geocif-0.1.62}/geocif/agmet/utils.py +0 -0
  20. {geocif-0.1.60 → geocif-0.1.62}/geocif/backup/__init__.py +0 -0
  21. {geocif-0.1.60 → geocif-0.1.62}/geocif/backup/constants.py +0 -0
  22. {geocif-0.1.60 → geocif-0.1.62}/geocif/backup/features.py +0 -0
  23. {geocif-0.1.60 → geocif-0.1.62}/geocif/backup/geo.py +0 -0
  24. {geocif-0.1.60 → geocif-0.1.62}/geocif/backup/geocif.py +0 -0
  25. {geocif-0.1.60 → geocif-0.1.62}/geocif/backup/metadata.py +0 -0
  26. {geocif-0.1.60 → geocif-0.1.62}/geocif/backup/models.py +0 -0
  27. {geocif-0.1.60 → geocif-0.1.62}/geocif/cei/__init__.py +0 -0
  28. {geocif-0.1.60 → geocif-0.1.62}/geocif/cei/definitions.py +0 -0
  29. {geocif-0.1.60 → geocif-0.1.62}/geocif/cei/indices.py +0 -0
  30. {geocif-0.1.60 → geocif-0.1.62}/geocif/experiments.py +0 -0
  31. {geocif-0.1.60 → geocif-0.1.62}/geocif/geocif_runner.py +0 -0
  32. {geocif-0.1.60 → geocif-0.1.62}/geocif/indices_runner.py +0 -0
  33. {geocif-0.1.60 → geocif-0.1.62}/geocif/indices_runner_madagascar.py +0 -0
  34. {geocif-0.1.60 → geocif-0.1.62}/geocif/indices_runner_malawi.py +0 -0
  35. {geocif-0.1.60 → geocif-0.1.62}/geocif/indices_runner_mozambique.py +0 -0
  36. {geocif-0.1.60 → geocif-0.1.62}/geocif/indices_runner_south_africa.py +0 -0
  37. {geocif-0.1.60 → geocif-0.1.62}/geocif/indices_runner_zambia.py +0 -0
  38. {geocif-0.1.60 → geocif-0.1.62}/geocif/indices_runner_zimbabwe.py +0 -0
  39. {geocif-0.1.60 → geocif-0.1.62}/geocif/logger.py +0 -0
  40. {geocif-0.1.60 → geocif-0.1.62}/geocif/ml/__init__.py +0 -0
  41. {geocif-0.1.60 → geocif-0.1.62}/geocif/ml/correlations.py +0 -0
  42. {geocif-0.1.60 → geocif-0.1.62}/geocif/ml/embedding.py +0 -0
  43. {geocif-0.1.60 → geocif-0.1.62}/geocif/ml/feature_engineering.py +0 -0
  44. {geocif-0.1.60 → geocif-0.1.62}/geocif/ml/feature_selection.py +0 -0
  45. {geocif-0.1.60 → geocif-0.1.62}/geocif/ml/outliers.py +0 -0
  46. {geocif-0.1.60 → geocif-0.1.62}/geocif/ml/outlook.py +0 -0
  47. {geocif-0.1.60 → geocif-0.1.62}/geocif/ml/output.py +0 -0
  48. {geocif-0.1.60 → geocif-0.1.62}/geocif/ml/spatial_autocorrelation.py +0 -0
  49. {geocif-0.1.60 → geocif-0.1.62}/geocif/ml/stages.py +0 -0
  50. {geocif-0.1.60 → geocif-0.1.62}/geocif/ml/trend.py +0 -0
  51. {geocif-0.1.60 → geocif-0.1.62}/geocif/ml/xai.py +0 -0
  52. {geocif-0.1.60 → geocif-0.1.62}/geocif/mm.py +0 -0
  53. {geocif-0.1.60 → geocif-0.1.62}/geocif/playground/__init__.py +0 -0
  54. {geocif-0.1.60 → geocif-0.1.62}/geocif/playground/aa.py +0 -0
  55. {geocif-0.1.60 → geocif-0.1.62}/geocif/playground/automl.py +0 -0
  56. {geocif-0.1.60 → geocif-0.1.62}/geocif/playground/download_esi.py +0 -0
  57. {geocif-0.1.60 → geocif-0.1.62}/geocif/playground/enso.py +0 -0
  58. {geocif-0.1.60 → geocif-0.1.62}/geocif/playground/eval.py +0 -0
  59. {geocif-0.1.60 → geocif-0.1.62}/geocif/playground/gamtest.py +0 -0
  60. {geocif-0.1.60 → geocif-0.1.62}/geocif/playground/gee_access.py +0 -0
  61. {geocif-0.1.60 → geocif-0.1.62}/geocif/playground/misc.py +0 -0
  62. {geocif-0.1.60 → geocif-0.1.62}/geocif/playground/play_xagg.py +0 -0
  63. {geocif-0.1.60 → geocif-0.1.62}/geocif/playground/reg.py +0 -0
  64. {geocif-0.1.60 → geocif-0.1.62}/geocif/playground/sustain.py +0 -0
  65. {geocif-0.1.60 → geocif-0.1.62}/geocif/playground/test_catboost.py +0 -0
  66. {geocif-0.1.60 → geocif-0.1.62}/geocif/playground/tmp.py +0 -0
  67. {geocif-0.1.60 → geocif-0.1.62}/geocif/playground/tmp2.py +0 -0
  68. {geocif-0.1.60 → geocif-0.1.62}/geocif/playground/tmp3.py +0 -0
  69. {geocif-0.1.60 → geocif-0.1.62}/geocif/playground/tmp4.py +0 -0
  70. {geocif-0.1.60 → geocif-0.1.62}/geocif/playground/tmp5.py +0 -0
  71. {geocif-0.1.60 → geocif-0.1.62}/geocif/risk/__init__.py +0 -0
  72. {geocif-0.1.60 → geocif-0.1.62}/geocif/risk/impact_assessment.py +0 -0
  73. {geocif-0.1.60 → geocif-0.1.62}/geocif/utils.py +0 -0
  74. {geocif-0.1.60 → geocif-0.1.62}/geocif/viz/__init__.py +0 -0
  75. {geocif-0.1.60 → geocif-0.1.62}/geocif/viz/plot.py +0 -0
  76. {geocif-0.1.60 → geocif-0.1.62}/geocif.egg-info/dependency_links.txt +0 -0
  77. {geocif-0.1.60 → geocif-0.1.62}/geocif.egg-info/not-zip-safe +0 -0
  78. {geocif-0.1.60 → geocif-0.1.62}/geocif.egg-info/top_level.txt +0 -0
  79. {geocif-0.1.60 → geocif-0.1.62}/requirements.txt +0 -0
  80. {geocif-0.1.60 → geocif-0.1.62}/setup.cfg +0 -0
  81. {geocif-0.1.60 → geocif-0.1.62}/tests/test_geocif.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: geocif
3
- Version: 0.1.60
3
+ Version: 0.1.62
4
4
  Summary: Models to visualize and forecast crop conditions and yields
5
5
  Home-page: https://ritviksahajpal.github.io/yield_forecasting/
6
6
  Author: Ritvik Sahajpal
@@ -186,11 +186,6 @@ class Geoanalysis:
186
186
  return df_metrics, df_regional_metrics, df_national_yield
187
187
 
188
188
  def _clean_data(self):
189
- # Hack exclude 2012 if country == "illinois"
190
- if self.country == "illinois":
191
- self.df_analysis = self.df_analysis[
192
- self.df_analysis["Harvest Year"] != 2012
193
- ]
194
189
  # Remove rows with missing values in Observed Yield (tn per ha)
195
190
  return self.df_analysis.dropna(subset=["Observed Yield (tn per ha)"])
196
191
 
@@ -543,34 +538,52 @@ class Geoanalysis:
543
538
  country = self.country.title().replace("_", " ")
544
539
  crop = self.crop.title().replace("_", " ")
545
540
  file = dir_statistics / f"{country}_{crop}_statistics_s1_{self.method}.csv"
546
- df_historic = pd.read_csv(file)
547
-
548
- df_historic = df_historic[["Region", "Harvest Year", "Yield (tn per ha)"]]
541
+ df_all = pd.read_csv(file)
549
542
 
550
- # Drop rows with NaN values
551
- df_historic = df_historic.dropna()
543
+ # Keep only the relevant columns and drop NaNs
544
+ df_all = df_all[["Region", "Harvest Year", "Yield (tn per ha)"]].dropna()
552
545
 
553
- # Determine unique years
554
- years = df_historic["Harvest Year"].unique()
546
+ # --- For computing the % of total production ---
547
+ # Determine unique years and sort them (in case they aren't already)
548
+ years = sorted(df_all["Harvest Year"].unique())
549
+ # Subset dataframe to include only the last 5 years of the dataset
550
+ last_five_years = years[-5:]
551
+ df_recent = df_all[df_all["Harvest Year"].isin(last_five_years)]
555
552
 
556
- # Subset dataframe to only include the last years of the dataset
557
- df_historic = df_historic[df_historic["Harvest Year"].isin(years[-5:])]
558
-
559
- # For each region, compute the % of the total production
560
- df_historic = (
561
- df_historic.groupby("Region")["Yield (tn per ha)"]
553
+ # For each region, compute the % of total production (using yield sum over the last five years)
554
+ df_pct = (
555
+ df_recent.groupby("Region")["Yield (tn per ha)"]
562
556
  .sum()
563
557
  .pipe(lambda x: x / x.sum() * 100)
564
558
  .to_frame(name="% of total Area (ha)")
565
559
  .reset_index()
566
560
  )
567
- # Find median yield for each region
568
- # df_historic = (
569
- # df_historic.groupby("Region")["Yield (tn per ha)"]
570
- # .median()
571
- # .to_frame(name="Median Yield (tn per ha)")
572
- # .reset_index()
573
- # )
561
+
562
+ # --- For computing median yields ---
563
+ # Compute median yield for 2014 - 2018
564
+ df_median_2014_2018 = (
565
+ df_all[df_all["Harvest Year"].between(2014, 2018)]
566
+ .groupby("Region")["Yield (tn per ha)"]
567
+ .median()
568
+ .rename(f"Median Yield (tn per ha) (2014-2018)")
569
+ .reset_index()
570
+ )
571
+
572
+ # Compute median yield for 2013 - 2017
573
+ df_median_2013_2017 = (
574
+ df_all[df_all["Harvest Year"].between(2013, 2017)]
575
+ .groupby("Region")["Yield (tn per ha)"]
576
+ .median()
577
+ .rename("Median Yield (tn per ha) (2013-2017)")
578
+ .reset_index()
579
+ )
580
+
581
+ # Merge the median yield columns with the % of total production dataframe
582
+ df_historic = (
583
+ df_pct
584
+ .merge(df_median_2014_2018, on="Region", how="left")
585
+ .merge(df_median_2013_2017, on="Region", how="left")
586
+ )
574
587
 
575
588
  return df_historic
576
589
 
@@ -587,6 +587,16 @@ class Geocif:
587
587
  df_region[f"Median {self.target}"].values, 3
588
588
  )
589
589
 
590
+ if f"Median {self.target} (2014-2018)" in df_region.columns:
591
+ df.loc[:, f"Median {self.target} (2014-2018)"] = np.around(
592
+ df_region[f"Median {self.target} (2014-2018)"].values, 3
593
+ )
594
+
595
+ if f"Median {self.target} (2013-2017)" in df_region.columns:
596
+ df.loc[:, f"Median {self.target} (2013-2017)"] = np.around(
597
+ df_region[f"Median {self.target} (2013-2017)"].values, 3
598
+ )
599
+
590
600
  if self.estimate_ci:
591
601
  if self.estimate_ci_for_all or self.forecast_season == self.today_year:
592
602
  # Iterate over each element in y_pred_ci
@@ -730,8 +740,6 @@ class Geocif:
730
740
 
731
741
  if self.median_yield_as_feature:
732
742
  self.feature_names.append(f"Median {self.target}")
733
- self.feature_names.append(f"Median {self.target} (2014-2018)")
734
- self.feature_names.append(f"Median {self.target} (2013-2017)")
735
743
 
736
744
  if self.lag_yield_as_feature:
737
745
  # For the number of years specified in self.number_lag_years
@@ -801,6 +809,8 @@ class Geocif:
801
809
  + self.statistics_columns
802
810
  + self.feature_names
803
811
  + [f"Median {self.target}"]
812
+ + [f"Median {self.target} (2014-2018)"]
813
+ + [f"Median {self.target} (2013-2017)"]
804
814
  + ["Region_ID"]
805
815
  )
806
816
  if self.check_yield_trend:
@@ -12,7 +12,7 @@ warnings.filterwarnings("ignore")
12
12
  from .cei import indices
13
13
  from geoprepare import base
14
14
 
15
- country = "angola"
15
+ country = "wolayita"
16
16
 
17
17
  def remove_duplicates(lst):
18
18
  """
@@ -174,7 +174,7 @@ class cei_runner(base.BaseGeo):
174
174
  combinations = [i for i in combinations if f"{country}_maize_s1" in i[3]]
175
175
 
176
176
  if True:
177
- num_cpu = int(cpu_count() * 0.1)
177
+ num_cpu = int(cpu_count() * 0.5)
178
178
  with Pool(num_cpu) as p:
179
179
  for i, _ in enumerate(p.imap_unordered(indices.process, combinations)):
180
180
  pass
@@ -80,8 +80,11 @@ def get_yld_prd(df, name_crop, cntr, region, calendar_year, region_column="ADM1_
80
80
 
81
81
  # CM_Season should be 1 for the Main season
82
82
  # TODO: Make this user specified
83
- mask_cm_season = df_tmp["CM_Season"] == 1
84
- val = df_tmp.loc[mask_tmp_country & mask_tmp_adm1 & mask_cm_season][calendar_year]
83
+ if "CM_Season" in df_tmp.columns:
84
+ mask_cm_season = df_tmp["CM_Season"] == 1
85
+ val = df_tmp.loc[mask_tmp_country & mask_tmp_adm1 & mask_cm_season][calendar_year]
86
+ else:
87
+ val = df_tmp.loc[mask_tmp_country & mask_tmp_adm1][calendar_year]
85
88
 
86
89
  try:
87
90
  if val.isnull().all():
@@ -268,7 +268,7 @@ def auto_train(
268
268
  loss_function = "MAPE" if model_type == "REGRESSION" else "MultiClass"
269
269
  bootstrap_type = "Bernoulli" if model_type == "CLASSIFICATION" else "MVS"
270
270
  hyperparams = {
271
- "iterations": 3500,
271
+ "iterations": 2500,
272
272
  "learning_rate": 0.025,
273
273
  "depth": 6,
274
274
  "subsample": 1.0,
@@ -0,0 +1,117 @@
1
+ import pandas as pd
2
+ import numpy as np
3
+ import matplotlib.pyplot as plt
4
+ import seaborn as sns
5
+
6
+ # 1. Read the CSV
7
+ df = pd.read_csv(r"C:\Users\ritvik\Downloads\ET_AgStats.csv")
8
+
9
+ # 2. Filter for the crop "Maize (Corn)"
10
+ df = df[df['DNL_SourceCrop'] == 'Maize (Corn)']
11
+
12
+ # 3. Remove rows where "Area Planted: ha" is "NA" or "NC"
13
+ df = df[df['Area Planted: ha'] != 'NA']
14
+ df = df[df['Area Planted: ha'] != 'NC']
15
+
16
+ df = df[df['Yield: MT/ha'] != 'NA']
17
+ df = df[df['Yield: MT/ha'] != 'NC']
18
+ df = df[df['Yield: MT/ha'] != '#REF!']
19
+
20
+ # Remove rows where Admin 2 is null
21
+ df = df.dropna(subset=['Admin 2'])
22
+ df = df.dropna(subset=['Yield: MT/ha'])
23
+
24
+ # 4. Convert "Area Planted: ha" to float by removing commas
25
+ df['Area Planted: ha'] = (
26
+ df['Area Planted: ha']
27
+ .str.replace(',', '', regex=False)
28
+ .astype(float)
29
+ )
30
+
31
+ df['Yield: MT/ha'] = (
32
+ df['Yield: MT/ha']
33
+ .str.replace(',', '', regex=False)
34
+ .astype(float)
35
+ )
36
+
37
+ # 5. Group by [region, season] to calculate z-scores
38
+ grouped = df.groupby(['Admin 2', 'Season'])
39
+ anomalies_list = []
40
+
41
+ for (region, season), group_data in grouped:
42
+ mean_area = group_data['Area Planted: ha'].mean()
43
+ std_area = group_data['Area Planted: ha'].std()
44
+
45
+ # Avoid division by zero
46
+ if std_area == 0:
47
+ group_data['Z_score'] = 0
48
+ else:
49
+ group_data['Z_score'] = (group_data['Area Planted: ha'] - mean_area) / std_area
50
+
51
+ # Flag anomalies if abs(z-score) > 3
52
+ group_data['Anomaly'] = group_data['Z_score'].apply(lambda x: 'Yes' if abs(x) > 3 else 'No')
53
+
54
+ anomalies_list.append(group_data)
55
+
56
+ # 6. Concatenate grouped data back together
57
+ df_analyzed = pd.concat(anomalies_list)
58
+
59
+ # 7. Filter to see only anomalies
60
+ df_anomalies = df_analyzed[df_analyzed['Anomaly'] == 'Yes']
61
+
62
+ # 8. Print full dataset with anomaly flags and the subset of anomalies
63
+ print("All data with anomaly flags:")
64
+ print(df_analyzed)
65
+
66
+ print("\nDetected anomalies:")
67
+ print(df_anomalies)
68
+ df_anomalies.to_csv(r"df_anomalies_v2.csv", index=False)
69
+
70
+ # 11. Distribution of "Yield: MT/ha"
71
+
72
+ plt.figure(figsize=(8, 5))
73
+ sns.histplot(df['Yield: MT/ha'], kde=True, bins=30)
74
+ plt.title('Distribution of Yield (MT/ha)')
75
+ plt.xlabel('Yield (MT/ha)')
76
+ plt.ylabel('Count')
77
+ plt.tight_layout()
78
+ plt.show()
79
+
80
+ # count number of values where yield < 1
81
+ low_yield = df[df['Yield: MT/ha'] < 1].shape[0]
82
+ total = df.shape[0]
83
+ print(f"Number of records with yield < 1: {low_yield} / {total}")
84
+ breakpoint()
85
+ # 9. Bar chart of number of anomalies per Season
86
+ anomalies_by_season = df_anomalies['Season'].value_counts()
87
+ plt.figure(figsize=(8, 5))
88
+ anomalies_by_season.plot(kind='bar')
89
+ plt.title('Number of Anomalies per Season')
90
+ plt.xlabel('Season')
91
+ plt.ylabel('Count of Anomalies')
92
+ plt.tight_layout()
93
+ plt.show()
94
+
95
+ # 10. Heatmap of anomalies by Region (rows) and Year (columns)
96
+
97
+ # Ensure "Year" is numeric for pivoting
98
+ df_anomalies['Year'] = pd.to_numeric(df_anomalies['Year'], errors='coerce')
99
+
100
+ # Count how many anomalies per (region, year)
101
+ heatmap_data = df_anomalies.groupby(['Admin 1', 'Year']).size().unstack(fill_value=0)
102
+
103
+ # Plot the heatmap
104
+ plt.figure(figsize=(10, 6))
105
+ sns.heatmap(
106
+ heatmap_data,
107
+ annot=True,
108
+ cmap='Blues',
109
+ fmt='d'
110
+ )
111
+ plt.title('Number of Anomalies by Region and Year')
112
+ plt.xlabel('Year')
113
+ plt.ylabel('Region')
114
+ plt.tight_layout()
115
+ plt.show()
116
+
117
+
@@ -0,0 +1,268 @@
1
+ import geopandas as gpd
2
+ import pandas as pd
3
+ import matplotlib.pyplot as plt
4
+ import palettable as pal
5
+ import matplotlib.colors as mcolors
6
+
7
+ import pandas as pd
8
+ import glob
9
+ import os
10
+
11
+ # 1. Specify the directory containing your .dta files:
12
+ data_dir = r"C:\Users\ritvik\Downloads\maize_yield\maize_yield"
13
+
14
+ # 2. Use glob to find all .dta files in that directory:
15
+ dta_files = glob.glob(os.path.join(data_dir, "*.dta"))
16
+
17
+ # 3. Read each .dta file into a pandas DataFrame and store in a list:
18
+ dataframes = [pd.read_stata(f) for f in dta_files]
19
+
20
+ # 4. Concatenate them all into one DataFrame (row-wise):
21
+ merged_df = pd.concat(dataframes, ignore_index=True)
22
+
23
+ merged_df['ZONE'] = merged_df['ZONE'].astype(int)
24
+ merged_df['DIST'] = merged_df['DIST'].astype(int)
25
+
26
+ # create a column called W_CODE which is set up as follows
27
+ # create a string by converting ZONE column to string and append 0
28
+ # to the left of the string to make it 2 characters long
29
+ # then do the same with DIST column
30
+ # finally concatenate the two strings
31
+ merged_df['W_CODE'] = merged_df['ZONE'].astype(str).str.zfill(2) + merged_df['DIST'].astype(str).str.zfill(2)
32
+
33
+ merged_df['W_CODE'] = '7' + merged_df['W_CODE']
34
+
35
+ # Remove the .0 at the end of the string in W_CODE
36
+ merged_df['W_CODE'] = merged_df['W_CODE'].str.replace('.0', '')
37
+ merged_df['W_CODE'] = merged_df['W_CODE'].astype(int)
38
+
39
+ dg = gpd.read_file(r"D:\Users\ritvik\projects\GEOGLAM\Input\countries\wolayita\wolayita_dissolved.shp")
40
+ dg = dg[['W_CODE', 'W_NAME']]
41
+
42
+ # Merge the two dataframes on W_CODE
43
+ merged_df = pd.merge(merged_df, dg, on='W_CODE', how='left')
44
+
45
+ # Remove rows where PROD98CQ or AREAH are null
46
+ merged_df = merged_df.dropna(subset=['PROD98CQ', 'AREAH'])
47
+
48
+ # Compute yield column
49
+ merged_df['yield'] = merged_df['PROD98CQ'] / merged_df['AREAH']
50
+
51
+ # create a new dataframe which computes average yield by W_NAME for each year
52
+ df_avg_yield = merged_df.groupby(['W_NAME', 'YEAR'])['yield'].mean().reset_index()
53
+
54
+ # Change W_NAME column to title case
55
+ df_avg_yield['W_NAME'] = df_avg_yield['W_NAME'].str.title()
56
+
57
+ # Change YEAR to int
58
+ df_avg_yield['YEAR'] = df_avg_yield['YEAR'].astype(int)
59
+
60
+ # Convert to a format where each YEAR is converted to int and becomes a column and yield is the value
61
+ df_avg_yield = df_avg_yield.pivot(index='W_NAME', columns='YEAR', values='yield')
62
+
63
+ # Remove YEAR as column name and W_NAME as index name
64
+ df_avg_yield.index.name = None
65
+ df_avg_yield.columns.name = None
66
+
67
+ df_avg_yield.to_csv('wolayita_yields.csv')
68
+
69
+ breakpoint()
70
+ # 5. (Optional) Inspect the merged DataFrame
71
+ print(merged_df.head())
72
+ print(len(merged_df))
73
+ merged_df.to_csv('merged_df.csv', index=False)
74
+ breakpoint()
75
+
76
+ import pandas as pd
77
+ import matplotlib.pyplot as plt
78
+ import seaborn as sns
79
+
80
+ import geopandas as gpd
81
+ dg = gpd.read_file(r"D:\Users\ritvik\projects\GEOGLAM\wolayita.shp")
82
+ dg = dg[dg['Z_NAME'] == "Wolayita"]
83
+
84
+ # Dissolve on W_NAME column
85
+ dg = dg.dissolve(by="W_NAME")
86
+
87
+ # save to disk
88
+ dg.to_file(r"D:\Users\ritvik\projects\GEOGLAM\Input\countries\wolayita\wolayita_dissolved.shp")
89
+
90
+ breakpoint()
91
+ # 1. Load the dataset
92
+ df = pd.read_csv('merged_df.csv')
93
+
94
+ # 2. Ensure we have a 'yield' column.
95
+ # If not present, we compute yield as PROD98CQ / AREAH.
96
+ if 'yield' not in df.columns:
97
+ if 'PROD98CQ' in df.columns and 'AREAH' in df.columns:
98
+ # Compute yield in tonnes per hectare (or adjust unit if needed)
99
+ df['yield'] = df['PROD98CQ'] / df['AREAH']
100
+ else:
101
+ raise ValueError("The required columns to compute yield are missing.")
102
+
103
+ # 3. Calculate percentage of missing data for yield
104
+ missing_pct_yield = df['yield'].isnull().mean() * 100
105
+ print(f"Percentage of missing data for yield: {missing_pct_yield:.2f}%")
106
+
107
+ # 4. Check if some years have more or less data
108
+ # Count the number of records for each year
109
+ year_counts = df['YEAR'].value_counts().sort_index()
110
+ print("\nNumber of records per year:")
111
+ print(year_counts)
112
+
113
+ # 5. Plot yield distributions by year (as a boxplot)
114
+ import seaborn as sns
115
+
116
+ # Instead of looping and plotting histograms, we can use a boxplot
117
+ plt.figure(figsize=(12, 8))
118
+
119
+ sns.boxplot(x='YEAR', y='yield', data=df)
120
+
121
+ # Add labels and title
122
+ plt.xlabel("")
123
+ plt.ylabel("Yield")
124
+
125
+ plt.show()
126
+
127
+
128
+ # Group by YEAR and get size (number of rows)
129
+ df_year_counts = df.groupby('YEAR').size().reset_index(name='Count')
130
+ # Sort by YEAR if you want ascending year order
131
+ df_year_counts.sort_values(by='YEAR', inplace=True)
132
+
133
+ plt.figure(figsize=(10, 6))
134
+ sns.barplot(data=df_year_counts, x='YEAR', y='Count', color='skyblue', edgecolor='black')
135
+
136
+ plt.xlabel("")
137
+ plt.ylabel("Number of Yield Records")
138
+ plt.xticks(rotation=45) # Rotate x labels if needed
139
+ plt.tight_layout() # Adjust layout to avoid clipping
140
+ plt.show()
141
+
142
+
143
+ import pandas as pd
144
+ import numpy as np
145
+ import seaborn as sns
146
+ import matplotlib.pyplot as plt
147
+
148
+ # 1. Group by FA and YEAR, then calculate the mean yield
149
+ fa_year_yield = df.groupby(['FA', 'YEAR'])['yield'].mean().reset_index()
150
+
151
+ # 2. Pivot so rows = FA, columns = YEAR, values = average yield
152
+ fa_year_pivot = fa_year_yield.pivot(index='FA', columns='YEAR', values='yield')
153
+
154
+ # 3. Create the heatmap
155
+ plt.figure(figsize=(12, 8))
156
+ sns.heatmap(
157
+ fa_year_pivot,
158
+ cmap='viridis', # color map; try 'coolwarm' or others
159
+ annot=False, # numeric values are not drawn; set to True to show them in each cell
160
+ fmt=".2f", # format numbers (2 decimal places)
161
+ linewidths=.5 # line width between cells
162
+ )
163
+
164
+ plt.title("Heatmap of Average Yield by FA and YEAR")
165
+ plt.xlabel("YEAR")
166
+ plt.ylabel("FA")
167
+ plt.tight_layout()
168
+ plt.show()
169
+
170
+ breakpoint()
171
+
172
+
173
+ # --- Read and preprocess your main shapefile ---
174
+ dg = gpd.read_file(r"D:\Users\ritvik\projects\GEOGLAM\safrica.shp")
175
+
176
+ # remove rows where both ADMIN1 and ADMIN2 are null
177
+ dg = dg.dropna(subset=["ADMIN1", "ADMIN2"], how="all")
178
+
179
+ # if ADMIN2 is not null then replace ADMIN1 with ADMIN2 values
180
+ dg["ADMIN1"] = dg["ADMIN2"].combine_first(dg["ADMIN1"])
181
+
182
+ # --- Read your CSV and merge on ADMIN1 ---
183
+ df = pd.read_csv(r"C:\Users\ritvik\Downloads\geocif.csv")
184
+
185
+ dg = dg.merge(
186
+ df[["ADMIN1", 'Predicted Yield (tn per ha)',
187
+ 'Median Yield (tn per ha) (2013-2017)', 'Predicted/Median']],
188
+ on="ADMIN1",
189
+ how="left"
190
+ )
191
+
192
+ # --- Create a dissolved national boundary GeoDataFrame ---
193
+ boundary_gdf = dg.dissolve(by="ADMIN0")
194
+
195
+ # --- Colormap and normalization setup ---
196
+ cmap = pal.colorbrewer.get_map("BrBG", "diverging", 11).mpl_colormap
197
+ norm = mcolors.TwoSlopeNorm(vmin=-40, vcenter=0, vmax=40)
198
+
199
+ # --- First map: Predicted/Median ---
200
+ fig, ax = plt.subplots(figsize=(10, 6))
201
+
202
+ # 1) Plot the main layer
203
+ dg.plot(
204
+ column="Predicted/Median",
205
+ cmap=cmap,
206
+ norm=norm,
207
+ legend=True,
208
+ ax=ax,
209
+ edgecolor='gray',
210
+ linewidth=0.2,
211
+ legend_kwds={
212
+ "shrink": 0.5,
213
+ "pad": 0.002,
214
+ "orientation": "horizontal"
215
+ }
216
+ )
217
+
218
+ url = "https://naciscdn.org/naturalearth/110m/cultural/ne_110m_admin_0_countries.zip"
219
+
220
+ world = gpd.read_file(url)
221
+ world = world[world['ADMIN'].isin(['South Africa', 'Angola', 'Malawi', 'Zambia'])]
222
+
223
+ # 2) Plot the Natural Earth country outlines on top
224
+ world.plot(
225
+ ax=ax,
226
+ color="none", # No fill
227
+ edgecolor="black", # Outline color
228
+ linewidth=0.5
229
+ )
230
+
231
+ ax.set_title("Maize Yield Forecast % Anomaly")
232
+ plt.axis("off")
233
+ plt.tight_layout()
234
+ plt.savefig("aa.png", dpi=300)
235
+ plt.close()
236
+
237
+
238
+ # --- Second map: Median Yield (2013-2017) ---
239
+ # fig, ax = plt.subplots(figsize=(10, 6))
240
+ #
241
+ # # 1) Plot the main layer
242
+ # dg.plot(
243
+ # column="Median Yield (tn per ha) (2013-2017)",
244
+ # cmap=cmap,
245
+ # legend=True,
246
+ # ax=ax,
247
+ # legend_kwds={
248
+ # "shrink": 0.5,
249
+ # "pad": 0.002,
250
+ # "orientation": "horizontal"
251
+ # }
252
+ # )
253
+ #
254
+ # # 2) Plot the dissolved national boundaries on top
255
+ # boundary_gdf.plot(
256
+ # ax=ax,
257
+ # color="none",
258
+ # edgecolor="black",
259
+ # linewidth=1
260
+ # )
261
+ #
262
+ # ax.set_title("Median Maize Yield (2013-2017)")
263
+ # plt.axis("off")
264
+ # plt.tight_layout()
265
+ # plt.show()
266
+ # plt.close()
267
+
268
+ breakpoint()
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: geocif
3
- Version: 0.1.60
3
+ Version: 0.1.62
4
4
  Summary: Models to visualize and forecast crop conditions and yields
5
5
  Home-page: https://ritviksahajpal.github.io/yield_forecasting/
6
6
  Author: Ritvik Sahajpal
@@ -55,6 +55,7 @@ geocif/ml/trend.py
55
55
  geocif/ml/xai.py
56
56
  geocif/playground/__init__.py
57
57
  geocif/playground/aa.py
58
+ geocif/playground/area.py
58
59
  geocif/playground/automl.py
59
60
  geocif/playground/download_esi.py
60
61
  geocif/playground/enso.py
@@ -75,4 +76,5 @@ geocif/risk/__init__.py
75
76
  geocif/risk/impact_assessment.py
76
77
  geocif/viz/__init__.py
77
78
  geocif/viz/plot.py
79
+ geocif/viz/tmp.py
78
80
  tests/test_geocif.py
@@ -50,6 +50,6 @@ setup(
50
50
  test_suite="tests",
51
51
  tests_require=test_requirements,
52
52
  url="https://ritviksahajpal.github.io/yield_forecasting/",
53
- version="0.1.60",
53
+ version="0.1.62",
54
54
  zip_safe=False,
55
55
  )
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes