geocif 0.1.60__tar.gz → 0.1.62__tar.gz
This diff compares publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their respective public registries.
- {geocif-0.1.60/geocif.egg-info → geocif-0.1.62}/PKG-INFO +1 -1
- {geocif-0.1.60 → geocif-0.1.62}/geocif/analysis.py +38 -25
- {geocif-0.1.60 → geocif-0.1.62}/geocif/geocif.py +12 -2
- {geocif-0.1.60 → geocif-0.1.62}/geocif/indices_runner_angola.py +2 -2
- {geocif-0.1.60 → geocif-0.1.62}/geocif/ml/stats.py +5 -2
- {geocif-0.1.60 → geocif-0.1.62}/geocif/ml/trainers.py +1 -1
- geocif-0.1.62/geocif/playground/area.py +117 -0
- geocif-0.1.62/geocif/viz/tmp.py +268 -0
- {geocif-0.1.60 → geocif-0.1.62/geocif.egg-info}/PKG-INFO +1 -1
- {geocif-0.1.60 → geocif-0.1.62}/geocif.egg-info/SOURCES.txt +2 -0
- {geocif-0.1.60 → geocif-0.1.62}/setup.py +1 -1
- {geocif-0.1.60 → geocif-0.1.62}/LICENSE +0 -0
- {geocif-0.1.60 → geocif-0.1.62}/MANIFEST.in +0 -0
- {geocif-0.1.60 → geocif-0.1.62}/README.md +0 -0
- {geocif-0.1.60 → geocif-0.1.62}/geocif/__init__.py +0 -0
- {geocif-0.1.60 → geocif-0.1.62}/geocif/agmet/__init__.py +0 -0
- {geocif-0.1.60 → geocif-0.1.62}/geocif/agmet/geoagmet.py +0 -0
- {geocif-0.1.60 → geocif-0.1.62}/geocif/agmet/plot.py +0 -0
- {geocif-0.1.60 → geocif-0.1.62}/geocif/agmet/utils.py +0 -0
- {geocif-0.1.60 → geocif-0.1.62}/geocif/backup/__init__.py +0 -0
- {geocif-0.1.60 → geocif-0.1.62}/geocif/backup/constants.py +0 -0
- {geocif-0.1.60 → geocif-0.1.62}/geocif/backup/features.py +0 -0
- {geocif-0.1.60 → geocif-0.1.62}/geocif/backup/geo.py +0 -0
- {geocif-0.1.60 → geocif-0.1.62}/geocif/backup/geocif.py +0 -0
- {geocif-0.1.60 → geocif-0.1.62}/geocif/backup/metadata.py +0 -0
- {geocif-0.1.60 → geocif-0.1.62}/geocif/backup/models.py +0 -0
- {geocif-0.1.60 → geocif-0.1.62}/geocif/cei/__init__.py +0 -0
- {geocif-0.1.60 → geocif-0.1.62}/geocif/cei/definitions.py +0 -0
- {geocif-0.1.60 → geocif-0.1.62}/geocif/cei/indices.py +0 -0
- {geocif-0.1.60 → geocif-0.1.62}/geocif/experiments.py +0 -0
- {geocif-0.1.60 → geocif-0.1.62}/geocif/geocif_runner.py +0 -0
- {geocif-0.1.60 → geocif-0.1.62}/geocif/indices_runner.py +0 -0
- {geocif-0.1.60 → geocif-0.1.62}/geocif/indices_runner_madagascar.py +0 -0
- {geocif-0.1.60 → geocif-0.1.62}/geocif/indices_runner_malawi.py +0 -0
- {geocif-0.1.60 → geocif-0.1.62}/geocif/indices_runner_mozambique.py +0 -0
- {geocif-0.1.60 → geocif-0.1.62}/geocif/indices_runner_south_africa.py +0 -0
- {geocif-0.1.60 → geocif-0.1.62}/geocif/indices_runner_zambia.py +0 -0
- {geocif-0.1.60 → geocif-0.1.62}/geocif/indices_runner_zimbabwe.py +0 -0
- {geocif-0.1.60 → geocif-0.1.62}/geocif/logger.py +0 -0
- {geocif-0.1.60 → geocif-0.1.62}/geocif/ml/__init__.py +0 -0
- {geocif-0.1.60 → geocif-0.1.62}/geocif/ml/correlations.py +0 -0
- {geocif-0.1.60 → geocif-0.1.62}/geocif/ml/embedding.py +0 -0
- {geocif-0.1.60 → geocif-0.1.62}/geocif/ml/feature_engineering.py +0 -0
- {geocif-0.1.60 → geocif-0.1.62}/geocif/ml/feature_selection.py +0 -0
- {geocif-0.1.60 → geocif-0.1.62}/geocif/ml/outliers.py +0 -0
- {geocif-0.1.60 → geocif-0.1.62}/geocif/ml/outlook.py +0 -0
- {geocif-0.1.60 → geocif-0.1.62}/geocif/ml/output.py +0 -0
- {geocif-0.1.60 → geocif-0.1.62}/geocif/ml/spatial_autocorrelation.py +0 -0
- {geocif-0.1.60 → geocif-0.1.62}/geocif/ml/stages.py +0 -0
- {geocif-0.1.60 → geocif-0.1.62}/geocif/ml/trend.py +0 -0
- {geocif-0.1.60 → geocif-0.1.62}/geocif/ml/xai.py +0 -0
- {geocif-0.1.60 → geocif-0.1.62}/geocif/mm.py +0 -0
- {geocif-0.1.60 → geocif-0.1.62}/geocif/playground/__init__.py +0 -0
- {geocif-0.1.60 → geocif-0.1.62}/geocif/playground/aa.py +0 -0
- {geocif-0.1.60 → geocif-0.1.62}/geocif/playground/automl.py +0 -0
- {geocif-0.1.60 → geocif-0.1.62}/geocif/playground/download_esi.py +0 -0
- {geocif-0.1.60 → geocif-0.1.62}/geocif/playground/enso.py +0 -0
- {geocif-0.1.60 → geocif-0.1.62}/geocif/playground/eval.py +0 -0
- {geocif-0.1.60 → geocif-0.1.62}/geocif/playground/gamtest.py +0 -0
- {geocif-0.1.60 → geocif-0.1.62}/geocif/playground/gee_access.py +0 -0
- {geocif-0.1.60 → geocif-0.1.62}/geocif/playground/misc.py +0 -0
- {geocif-0.1.60 → geocif-0.1.62}/geocif/playground/play_xagg.py +0 -0
- {geocif-0.1.60 → geocif-0.1.62}/geocif/playground/reg.py +0 -0
- {geocif-0.1.60 → geocif-0.1.62}/geocif/playground/sustain.py +0 -0
- {geocif-0.1.60 → geocif-0.1.62}/geocif/playground/test_catboost.py +0 -0
- {geocif-0.1.60 → geocif-0.1.62}/geocif/playground/tmp.py +0 -0
- {geocif-0.1.60 → geocif-0.1.62}/geocif/playground/tmp2.py +0 -0
- {geocif-0.1.60 → geocif-0.1.62}/geocif/playground/tmp3.py +0 -0
- {geocif-0.1.60 → geocif-0.1.62}/geocif/playground/tmp4.py +0 -0
- {geocif-0.1.60 → geocif-0.1.62}/geocif/playground/tmp5.py +0 -0
- {geocif-0.1.60 → geocif-0.1.62}/geocif/risk/__init__.py +0 -0
- {geocif-0.1.60 → geocif-0.1.62}/geocif/risk/impact_assessment.py +0 -0
- {geocif-0.1.60 → geocif-0.1.62}/geocif/utils.py +0 -0
- {geocif-0.1.60 → geocif-0.1.62}/geocif/viz/__init__.py +0 -0
- {geocif-0.1.60 → geocif-0.1.62}/geocif/viz/plot.py +0 -0
- {geocif-0.1.60 → geocif-0.1.62}/geocif.egg-info/dependency_links.txt +0 -0
- {geocif-0.1.60 → geocif-0.1.62}/geocif.egg-info/not-zip-safe +0 -0
- {geocif-0.1.60 → geocif-0.1.62}/geocif.egg-info/top_level.txt +0 -0
- {geocif-0.1.60 → geocif-0.1.62}/requirements.txt +0 -0
- {geocif-0.1.60 → geocif-0.1.62}/setup.cfg +0 -0
- {geocif-0.1.60 → geocif-0.1.62}/tests/test_geocif.py +0 -0
{geocif-0.1.60 → geocif-0.1.62}/geocif/analysis.py

@@ -186,11 +186,6 @@ class Geoanalysis:
         return df_metrics, df_regional_metrics, df_national_yield
 
     def _clean_data(self):
-        # Hack exclude 2012 if country == "illinois"
-        if self.country == "illinois":
-            self.df_analysis = self.df_analysis[
-                self.df_analysis["Harvest Year"] != 2012
-            ]
         # Remove rows with missing values in Observed Yield (tn per ha)
         return self.df_analysis.dropna(subset=["Observed Yield (tn per ha)"])
 

@@ -543,34 +538,52 @@ class Geoanalysis:
         country = self.country.title().replace("_", " ")
         crop = self.crop.title().replace("_", " ")
         file = dir_statistics / f"{country}_{crop}_statistics_s1_{self.method}.csv"
-
-
-        df_historic = df_historic[["Region", "Harvest Year", "Yield (tn per ha)"]]
+        df_all = pd.read_csv(file)
 
-        #
-
+        # Keep only the relevant columns and drop NaNs
+        df_all = df_all[["Region", "Harvest Year", "Yield (tn per ha)"]].dropna()
 
-        #
-        years
+        # --- For computing the % of total production ---
+        # Determine unique years and sort them (in case they aren't already)
+        years = sorted(df_all["Harvest Year"].unique())
+        # Subset dataframe to include only the last 5 years of the dataset
+        last_five_years = years[-5:]
+        df_recent = df_all[df_all["Harvest Year"].isin(last_five_years)]
 
-        #
-
-
-        # For each region, compute the % of the total production
-        df_historic = (
-            df_historic.groupby("Region")["Yield (tn per ha)"]
+        # For each region, compute the % of total production (using yield sum over the last five years)
+        df_pct = (
+            df_recent.groupby("Region")["Yield (tn per ha)"]
             .sum()
             .pipe(lambda x: x / x.sum() * 100)
             .to_frame(name="% of total Area (ha)")
             .reset_index()
         )
-
-        #
-        #
-
-
-
-
+
+        # --- For computing median yields ---
+        # Compute median yield for 2014 - 2018
+        df_median_2014_2018 = (
+            df_all[df_all["Harvest Year"].between(2014, 2018)]
+            .groupby("Region")["Yield (tn per ha)"]
+            .median()
+            .rename(f"Median Yield (tn per ha) (2014-2018)")
+            .reset_index()
+        )
+
+        # Compute median yield for 2013 - 2017
+        df_median_2013_2017 = (
+            df_all[df_all["Harvest Year"].between(2013, 2017)]
+            .groupby("Region")["Yield (tn per ha)"]
+            .median()
+            .rename("Median Yield (tn per ha) (2013-2017)")
+            .reset_index()
+        )
+
+        # Merge the median yield columns with the % of total production dataframe
+        df_historic = (
+            df_pct
+            .merge(df_median_2014_2018, on="Region", how="left")
+            .merge(df_median_2013_2017, on="Region", how="left")
+        )
 
         return df_historic
 
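The reworked block above now derives everything from the statistics CSV: a percent-of-total share over the last five years of data, plus two fixed-window median-yield baselines merged back in by region. A minimal sketch of the same pipeline on synthetic data (column names are taken from the diff; the toy values and the `median_window` helper are illustrative only):

```python
import pandas as pd

# Toy stand-in for the statistics CSV read in analysis.py (column names from the diff)
df_all = pd.DataFrame({
    "Region": ["A", "A", "B", "B"] * 3,
    "Harvest Year": [2013, 2014, 2015, 2016, 2017, 2018] * 2,
    "Yield (tn per ha)": [1.0, 1.2, 2.0, 2.2, 1.1, 2.1] * 2,
})

# % of total production over the last five years of data
years = sorted(df_all["Harvest Year"].unique())
df_recent = df_all[df_all["Harvest Year"].isin(years[-5:])]
df_pct = (
    df_recent.groupby("Region")["Yield (tn per ha)"]
    .sum()
    .pipe(lambda x: x / x.sum() * 100)
    .to_frame(name="% of total Area (ha)")
    .reset_index()
)

# Fixed-window median baselines, merged on Region (hypothetical helper)
def median_window(lo, hi):
    return (
        df_all[df_all["Harvest Year"].between(lo, hi)]
        .groupby("Region")["Yield (tn per ha)"]
        .median()
        .rename(f"Median Yield (tn per ha) ({lo}-{hi})")
        .reset_index()
    )

df_historic = (
    df_pct
    .merge(median_window(2014, 2018), on="Region", how="left")
    .merge(median_window(2013, 2017), on="Region", how="left")
)
print(df_historic)
```

One quirk worth noting in the hunk itself: the share is computed from summed yields, but it is stored under the column name "% of total Area (ha)", so the label and the quantity disagree.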
{geocif-0.1.60 → geocif-0.1.62}/geocif/geocif.py

@@ -587,6 +587,16 @@ class Geocif:
            df_region[f"Median {self.target}"].values, 3
        )
 
+        if f"Median {self.target} (2014-2018)" in df_region.columns:
+            df.loc[:, f"Median {self.target} (2014-2018)"] = np.around(
+                df_region[f"Median {self.target} (2014-2018)"].values, 3
+            )
+
+        if f"Median {self.target} (2013-2017)" in df_region.columns:
+            df.loc[:, f"Median {self.target} (2013-2017)"] = np.around(
+                df_region[f"Median {self.target} (2013-2017)"].values, 3
+            )
+
        if self.estimate_ci:
            if self.estimate_ci_for_all or self.forecast_season == self.today_year:
                # Iterate over each element in y_pred_ci

@@ -730,8 +740,6 @@
 
        if self.median_yield_as_feature:
            self.feature_names.append(f"Median {self.target}")
-            self.feature_names.append(f"Median {self.target} (2014-2018)")
-            self.feature_names.append(f"Median {self.target} (2013-2017)")
 
        if self.lag_yield_as_feature:
            # For the number of years specified in self.number_lag_years

@@ -801,6 +809,8 @@
            + self.statistics_columns
            + self.feature_names
            + [f"Median {self.target}"]
+            + [f"Median {self.target} (2014-2018)"]
+            + [f"Median {self.target} (2013-2017)"]
            + ["Region_ID"]
        )
        if self.check_yield_trend:
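Taken together, the three geocif.py hunks stop feeding the windowed medians to the model as features, but still carry them through to the output frame, guarded by a column-existence check so runs without those baselines keep working. A minimal sketch of that guard pattern (the frames here are hypothetical stand-ins for `df` and `df_region`):

```python
import numpy as np
import pandas as pd

# Hypothetical stand-ins for df / df_region in geocif.py
df = pd.DataFrame({"Region": ["A", "B"]})
df_region = pd.DataFrame({"Median Yield (tn per ha) (2014-2018)": [1.234567, 2.345678]})

# Only copy a median column into the output if the source frame actually has it
col = "Median Yield (tn per ha) (2014-2018)"
if col in df_region.columns:
    df.loc[:, col] = np.around(df_region[col].values, 3)
print(df)
```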
{geocif-0.1.60 → geocif-0.1.62}/geocif/indices_runner_angola.py

@@ -12,7 +12,7 @@ warnings.filterwarnings("ignore")
 from .cei import indices
 from geoprepare import base
 
-country = "
+country = "wolayita"
 
 def remove_duplicates(lst):
     """

@@ -174,7 +174,7 @@ class cei_runner(base.BaseGeo):
        combinations = [i for i in combinations if f"{country}_maize_s1" in i[3]]
 
        if True:
-            num_cpu = int(cpu_count() * 0.
+            num_cpu = int(cpu_count() * 0.5)
            with Pool(num_cpu) as p:
                for i, _ in enumerate(p.imap_unordered(indices.process, combinations)):
                    pass
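The runner now pins its worker pool to half the available cores (the old multiplier is truncated in the diff display, so the previous fraction is unknown). A small sketch of the pattern, with a hypothetical stand-in for `indices.process`; the `max(1, ...)` floor is an added safeguard, not part of the source:

```python
from multiprocessing import Pool, cpu_count

def process(combination):
    # Hypothetical stand-in for indices.process
    return combination

if __name__ == "__main__":
    combinations = list(range(10))
    # Use roughly half the cores, but never fewer than one
    num_cpu = max(1, int(cpu_count() * 0.5))
    with Pool(num_cpu) as p:
        for _ in p.imap_unordered(process, combinations):
            pass
```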
{geocif-0.1.60 → geocif-0.1.62}/geocif/ml/stats.py

@@ -80,8 +80,11 @@ def get_yld_prd(df, name_crop, cntr, region, calendar_year, region_column="ADM1_
 
     # CM_Season should be 1 for the Main season
     # TODO: Make this user specified
-
-
+    if "CM_Season" in df_tmp.columns:
+        mask_cm_season = df_tmp["CM_Season"] == 1
+        val = df_tmp.loc[mask_tmp_country & mask_tmp_adm1 & mask_cm_season][calendar_year]
+    else:
+        val = df_tmp.loc[mask_tmp_country & mask_tmp_adm1][calendar_year]
 
     try:
         if val.isnull().all():
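The stats.py change makes the main-season filter conditional on the table actually having a `CM_Season` column instead of assuming it. A sketch of the guard on toy data; only `CM_Season`, the mask names, and `calendar_year` come from the diff, and the table layout (the `country` and `ADM1_NAME` columns and their values) is an assumption for illustration:

```python
import pandas as pd

# Toy stand-in for the table filtered in get_yld_prd (layout assumed)
df_tmp = pd.DataFrame({
    "country": ["angola", "angola"],
    "ADM1_NAME": ["Bie", "Bie"],
    "CM_Season": [1, 2],
    "2019": [1.4, 0.9],
})
mask_tmp_country = df_tmp["country"] == "angola"
mask_tmp_adm1 = df_tmp["ADM1_NAME"] == "Bie"
calendar_year = "2019"

# Apply the main-season filter only when the column exists
if "CM_Season" in df_tmp.columns:
    mask_cm_season = df_tmp["CM_Season"] == 1
    val = df_tmp.loc[mask_tmp_country & mask_tmp_adm1 & mask_cm_season][calendar_year]
else:
    val = df_tmp.loc[mask_tmp_country & mask_tmp_adm1][calendar_year]
print(val)
```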
{geocif-0.1.60 → geocif-0.1.62}/geocif/ml/trainers.py

@@ -268,7 +268,7 @@ def auto_train(
    loss_function = "MAPE" if model_type == "REGRESSION" else "MultiClass"
    bootstrap_type = "Bernoulli" if model_type == "CLASSIFICATION" else "MVS"
    hyperparams = {
-        "iterations":
+        "iterations": 2500,
        "learning_rate": 0.025,
        "depth": 6,
        "subsample": 1.0,
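With `iterations` now pinned at 2500 (the old value is truncated in the diff display), the dict is complete enough to drive CatBoost directly. A sketch assuming the `catboost` package; the keys shown in the hunk and the loss/bootstrap wiring come from the diff context, while the `verbose` flag and the synthetic data are added for illustration (note that MAPE requires non-zero targets):

```python
from catboost import CatBoostRegressor
import numpy as np

model_type = "REGRESSION"
loss_function = "MAPE" if model_type == "REGRESSION" else "MultiClass"
bootstrap_type = "Bernoulli" if model_type == "CLASSIFICATION" else "MVS"
hyperparams = {
    "iterations": 2500,
    "learning_rate": 0.025,
    "depth": 6,
    "subsample": 1.0,
    "loss_function": loss_function,
    "bootstrap_type": bootstrap_type,
    "verbose": False,  # assumption: keep training output quiet
}

# Tiny synthetic regression problem, just to show the dict is consumable as-is
X = np.random.rand(100, 4)
y = X @ np.array([1.0, 2.0, 0.5, -1.0]) + 5.0  # strictly positive targets for MAPE
model = CatBoostRegressor(**hyperparams)
model.fit(X, y)
print(model.predict(X[:3]))
```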
geocif-0.1.62/geocif/playground/area.py

@@ -0,0 +1,117 @@
+import pandas as pd
+import numpy as np
+import matplotlib.pyplot as plt
+import seaborn as sns
+
+# 1. Read the CSV
+df = pd.read_csv(r"C:\Users\ritvik\Downloads\ET_AgStats.csv")
+
+# 2. Filter for the crop "Maize (Corn)"
+df = df[df['DNL_SourceCrop'] == 'Maize (Corn)']
+
+# 3. Remove rows where "Area Planted: ha" is "NA" or "NC"
+df = df[df['Area Planted: ha'] != 'NA']
+df = df[df['Area Planted: ha'] != 'NC']
+
+df = df[df['Yield: MT/ha'] != 'NA']
+df = df[df['Yield: MT/ha'] != 'NC']
+df = df[df['Yield: MT/ha'] != '#REF!']
+
+# Remove rows where Admin 2 is null
+df = df.dropna(subset=['Admin 2'])
+df = df.dropna(subset=['Yield: MT/ha'])
+
+# 4. Convert "Area Planted: ha" to float by removing commas
+df['Area Planted: ha'] = (
+    df['Area Planted: ha']
+    .str.replace(',', '', regex=False)
+    .astype(float)
+)
+
+df['Yield: MT/ha'] = (
+    df['Yield: MT/ha']
+    .str.replace(',', '', regex=False)
+    .astype(float)
+)
+
+# 5. Group by [region, season] to calculate z-scores
+grouped = df.groupby(['Admin 2', 'Season'])
+anomalies_list = []
+
+for (region, season), group_data in grouped:
+    mean_area = group_data['Area Planted: ha'].mean()
+    std_area = group_data['Area Planted: ha'].std()
+
+    # Avoid division by zero
+    if std_area == 0:
+        group_data['Z_score'] = 0
+    else:
+        group_data['Z_score'] = (group_data['Area Planted: ha'] - mean_area) / std_area
+
+    # Flag anomalies if abs(z-score) > 3
+    group_data['Anomaly'] = group_data['Z_score'].apply(lambda x: 'Yes' if abs(x) > 3 else 'No')
+
+    anomalies_list.append(group_data)
+
+# 6. Concatenate grouped data back together
+df_analyzed = pd.concat(anomalies_list)
+
+# 7. Filter to see only anomalies
+df_anomalies = df_analyzed[df_analyzed['Anomaly'] == 'Yes']
+
+# 8. Print full dataset with anomaly flags and the subset of anomalies
+print("All data with anomaly flags:")
+print(df_analyzed)
+
+print("\nDetected anomalies:")
+print(df_anomalies)
+df_anomalies.to_csv(r"df_anomalies_v2.csv", index=False)
+
+# 11. Distribution of "Yield: MT/ha"
+
+plt.figure(figsize=(8, 5))
+sns.histplot(df['Yield: MT/ha'], kde=True, bins=30)
+plt.title('Distribution of Yield (MT/ha)')
+plt.xlabel('Yield (MT/ha)')
+plt.ylabel('Count')
+plt.tight_layout()
+plt.show()
+
+# count number of values where yield < 1
+low_yield = df[df['Yield: MT/ha'] < 1].shape[0]
+total = df.shape[0]
+print(f"Number of records with yield < 1: {low_yield} / {total}")
+breakpoint()
+# 9. Bar chart of number of anomalies per Season
+anomalies_by_season = df_anomalies['Season'].value_counts()
+plt.figure(figsize=(8, 5))
+anomalies_by_season.plot(kind='bar')
+plt.title('Number of Anomalies per Season')
+plt.xlabel('Season')
+plt.ylabel('Count of Anomalies')
+plt.tight_layout()
+plt.show()
+
+# 10. Heatmap of anomalies by Region (rows) and Year (columns)
+
+# Ensure "Year" is numeric for pivoting
+df_anomalies['Year'] = pd.to_numeric(df_anomalies['Year'], errors='coerce')
+
+# Count how many anomalies per (region, year)
+heatmap_data = df_anomalies.groupby(['Admin 1', 'Year']).size().unstack(fill_value=0)
+
+# Plot the heatmap
+plt.figure(figsize=(10, 6))
+sns.heatmap(
+    heatmap_data,
+    annot=True,
+    cmap='Blues',
+    fmt='d'
+)
+plt.title('Number of Anomalies by Region and Year')
+plt.xlabel('Year')
+plt.ylabel('Region')
+plt.tight_layout()
+plt.show()
+
+
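area.py computes the z-scores with a Python loop that mutates each group slice, which pandas will typically flag with `SettingWithCopyWarning`. An equivalent vectorized sketch using `groupby(...).transform`, on toy data, with the same |z| > 3 rule and the same std == 0 handling as the script:

```python
import numpy as np
import pandas as pd

# Toy stand-in for the filtered AgStats frame
df = pd.DataFrame({
    "Admin 2": ["X"] * 12 + ["Y"] * 3,
    "Season": ["Meher"] * 15,
    "Area Planted: ha": [100.0] * 11 + [500.0] + [10.0, 10.0, 10.0],
})

grp = df.groupby(["Admin 2", "Season"])["Area Planted: ha"]
mean_area = grp.transform("mean")
std_area = grp.transform("std")

# Same z-score rule as area.py, with std == 0 mapped to 0 instead of NaN/inf
df["Z_score"] = ((df["Area Planted: ha"] - mean_area) / std_area).where(std_area != 0, 0.0)
df["Anomaly"] = np.where(df["Z_score"].abs() > 3, "Yes", "No")
print(df[df["Anomaly"] == "Yes"])
```

`transform` broadcasts the per-group mean and standard deviation back onto the original rows, so no per-group loop or concat step is needed.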
geocif-0.1.62/geocif/viz/tmp.py

@@ -0,0 +1,268 @@
+import geopandas as gpd
+import pandas as pd
+import matplotlib.pyplot as plt
+import palettable as pal
+import matplotlib.colors as mcolors
+
+import pandas as pd
+import glob
+import os
+
+# 1. Specify the directory containing your .dta files:
+data_dir = r"C:\Users\ritvik\Downloads\maize_yield\maize_yield"
+
+# 2. Use glob to find all .dta files in that directory:
+dta_files = glob.glob(os.path.join(data_dir, "*.dta"))
+
+# 3. Read each .dta file into a pandas DataFrame and store in a list:
+dataframes = [pd.read_stata(f) for f in dta_files]
+
+# 4. Concatenate them all into one DataFrame (row-wise):
+merged_df = pd.concat(dataframes, ignore_index=True)
+
+merged_df['ZONE'] = merged_df['ZONE'].astype(int)
+merged_df['DIST'] = merged_df['DIST'].astype(int)
+
+# create a column called W_CODE which is set up as follows
+# create a string by converting ZONE column to string and append 0
+# to the left of the string to make it 2 characters long
+# then do the same with DIST column
+# finally concatenate the two strings
+merged_df['W_CODE'] = merged_df['ZONE'].astype(str).str.zfill(2) + merged_df['DIST'].astype(str).str.zfill(2)
+
+merged_df['W_CODE'] = '7' + merged_df['W_CODE']
+
+# Remove the .0 at the end of the string in W_CODE
+merged_df['W_CODE'] = merged_df['W_CODE'].str.replace('.0', '')
+merged_df['W_CODE'] = merged_df['W_CODE'].astype(int)
+
+dg = gpd.read_file(r"D:\Users\ritvik\projects\GEOGLAM\Input\countries\wolayita\wolayita_dissolved.shp")
+dg = dg[['W_CODE', 'W_NAME']]
+
+# Merge the two dataframes on W_CODE
+merged_df = pd.merge(merged_df, dg, on='W_CODE', how='left')
+
+# Remove rows where PROD98CQ or AREAH are null
+merged_df = merged_df.dropna(subset=['PROD98CQ', 'AREAH'])
+
+# Compte yield column
+merged_df['yield'] = merged_df['PROD98CQ'] / merged_df['AREAH']
+
+# create a new dataframe which computes average yield by W_NAME for each year
+df_avg_yield = merged_df.groupby(['W_NAME', 'YEAR'])['yield'].mean().reset_index()
+
+# Change W_NAME column to title case
+df_avg_yield['W_NAME'] = df_avg_yield['W_NAME'].str.title()
+
+# Change YEAR to int
+df_avg_yield['YEAR'] = df_avg_yield['YEAR'].astype(int)
+
+# Convert to a format where each YEAR is converted to int and becomes a column and yield is the value
+df_avg_yield = df_avg_yield.pivot(index='W_NAME', columns='YEAR', values='yield')
+
+# Remove YEAR as column name and W_NAME as index name
+df_avg_yield.index.name = None
+df_avg_yield.columns.name = None
+
+df_avg_yield.to_csv('wolayita_yields.csv')
+
+breakpoint()
+# 5. (Optional) Inspect the merged DataFrame
+print(merged_df.head())
+print(len(merged_df))
+merged_df.to_csv('merged_df.csv', index=False)
+breakpoint()
+
+import pandas as pd
+import matplotlib.pyplot as plt
+import seaborn as sns
+
+import geopandas as gpd
+dg = gpd.read_file(r"D:\Users\ritvik\projects\GEOGLAM\wolayita.shp")
+dg = dg[dg['Z_NAME'] == "Wolayita"]
+
+# Dissolve on W_NAME column
+dg = dg.dissolve(by="W_NAME")
+
+# save to disk
+dg.to_file(r"D:\Users\ritvik\projects\GEOGLAM\Input\countries\wolayita\wolayita_dissolved.shp")
+
+breakpoint()
+# 1. Load the dataset
+df = pd.read_csv('merged_df.csv')
+
+# 2. Ensure we have a 'yield' column.
+#    If not present, we compute yield as Maize_Production / Maize_Area.
+if 'yield' not in df.columns:
+    if 'PROD98CQ' in df.columns and 'AREAH' in df.columns:
+        # Compute yield in tonnes per hectare (or adjust unit if needed)
+        df['yield'] = df['PROD98CQ'] / df['AREAH']
+    else:
+        raise ValueError("The required columns to compute yield are missing.")
+
+# 3. Calculate percentage of missing data for yield
+missing_pct_yield = df['yield'].isnull().mean() * 100
+print(f"Percentage of missing data for yield: {missing_pct_yield:.2f}%")
+
+# 4. Check if some years have more or less data
+#    Count the number of records for each year
+year_counts = df['YEAR'].value_counts().sort_index()
+print("\nNumber of records per year:")
+print(year_counts)
+
+# 5. Plot histogram of yield distributions by year
+import seaborn as sns
+
+# Instead of looping and plotting histograms, we can use a boxplot
+plt.figure(figsize=(12, 8))
+
+sns.boxplot(x='YEAR', y='yield', data=df)
+
+# Add labels and title
+plt.xlabel("")
+plt.ylabel("Yield")
+
+plt.show()
+
+
+# Group by YEAR and get size (number of rows)
+df_year_counts = df.groupby('YEAR').size().reset_index(name='Count')
+# Sort by YEAR if you want ascending year order
+df_year_counts.sort_values(by='YEAR', inplace=True)
+
+plt.figure(figsize=(10, 6))
+sns.barplot(data=df_year_counts, x='YEAR', y='Count', color='skyblue', edgecolor='black')
+
+plt.xlabel("")
+plt.ylabel("Number of Yield Records")
+plt.xticks(rotation=45)  # Rotate x labels if needed
+plt.tight_layout()  # Adjust layout to avoid clipping
+plt.show()
+
+
+import pandas as pd
+import numpy as np
+import seaborn as sns
+import matplotlib.pyplot as plt
+
+# 1. Group by FA and YEAR, then calculate the mean yield
+fa_year_yield = df.groupby(['FA', 'YEAR'])['yield'].mean().reset_index()
+
+# 2. Pivot so rows = FA, columns = YEAR, values = average yield
+fa_year_pivot = fa_year_yield.pivot(index='FA', columns='YEAR', values='yield')
+
+# 3. Create the heatmap
+plt.figure(figsize=(12, 8))
+sns.heatmap(
+    fa_year_pivot,
+    cmap='viridis',   # color map; try 'coolwarm' or others
+    annot=False,      # show numeric values in each cell
+    fmt=".2f",        # format numbers (2 decimal places)
+    linewidths=.5     # line width between cells
+)
+
+plt.title("Heatmap of Average Yield by FA and YEAR")
+plt.xlabel("YEAR")
+plt.ylabel("FA")
+plt.tight_layout()
+plt.show()
+
+breakpoint()
+
+
+# --- Read and preprocess your main shapefile ---
+dg = gpd.read_file(r"D:\Users\ritvik\projects\GEOGLAM\safrica.shp")
+
+# remove rows where both ADMIN1 and ADMIN2 are null
+dg = dg.dropna(subset=["ADMIN1", "ADMIN2"], how="all")
+
+# if ADMIN2 is not null then replace ADMIN1 with ADMIN2 values
+dg["ADMIN1"] = dg["ADMIN2"].combine_first(dg["ADMIN1"])
+
+# --- Read your CSV and merge on ADMIN1 ---
+df = pd.read_csv(r"C:\Users\ritvik\Downloads\geocif.csv")
+
+dg = dg.merge(
+    df[["ADMIN1", 'Predicted Yield (tn per ha)',
+        'Median Yield (tn per ha) (2013-2017)', 'Predicted/Median']],
+    on="ADMIN1",
+    how="left"
+)
+
+# --- Create a dissolved national boundary GeoDataFrame ---
+boundary_gdf = dg.dissolve(by="ADMIN0")
+
+# --- Colormap and normalization setup ---
+cmap = pal.colorbrewer.get_map("BrBG", "diverging", 11).mpl_colormap
+norm = mcolors.TwoSlopeNorm(vmin=-40, vcenter=0, vmax=40)
+
+# --- First map: Predicted/Median ---
+fig, ax = plt.subplots(figsize=(10, 6))
+
+# 1) Plot the main layer
+dg.plot(
+    column="Predicted/Median",
+    cmap=cmap,
+    norm=norm,
+    legend=True,
+    ax=ax,
+    edgecolor='gray',
+    linewidth=0.2,
+    legend_kwds={
+        "shrink": 0.5,
+        "pad": 0.002,
+        "orientation": "horizontal"
+    }
+)
+
+url = "https://naciscdn.org/naturalearth/110m/cultural/ne_110m_admin_0_countries.zip"
+
+world = gpd.read_file(url)
+world = world[world['ADMIN'].isin(['South Africa', 'Angola', 'Malawi', 'Zambia'])]
+
+# 2) Plot the dissolved national boundaries on top
+world.plot(
+    ax=ax,
+    color="none",       # No fill
+    edgecolor="black",  # Outline color
+    linewidth=0.5
+)
+
+ax.set_title("Maize Yield Forecast % Anomaly")
+plt.axis("off")
+plt.tight_layout()
+plt.savefig("aa.png", dpi=300)
+plt.close()
+
+
+# --- Second map: Median Yield (2013-2017) ---
+# fig, ax = plt.subplots(figsize=(10, 6))
+#
+# # 1) Plot the main layer
+# dg.plot(
+#     column="Median Yield (tn per ha) (2013-2017)",
+#     cmap=cmap,
+#     legend=True,
+#     ax=ax,
+#     legend_kwds={
+#         "shrink": 0.5,
+#         "pad": 0.002,
+#         "orientation": "horizontal"
+#     }
+# )
+#
+# # 2) Plot the dissolved national boundaries on top
+# boundary_gdf.plot(
+#     ax=ax,
+#     color="none",
+#     edgecolor="black",
+#     linewidth=1
+# )
+#
+# ax.set_title("Median Maize Yield (2013-2017)")
+# plt.axis("off")
+# plt.tight_layout()
+# plt.show()
+# plt.close()
+
+breakpoint()
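The mapping section of tmp.py centers a diverging BrBG palette on zero with `TwoSlopeNorm`, so a -40% and a +40% anomaly sit at opposite ends of the scale with equal visual weight. A dependency-light sketch of just that normalization, using matplotlib's built-in `"BrBG"` colormap in place of palettable (the anomaly values are illustrative):

```python
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
import numpy as np

# Diverging colormap centered on 0: anomalies from -40% to +40% map symmetrically
norm = mcolors.TwoSlopeNorm(vmin=-40, vcenter=0, vmax=40)
cmap = plt.get_cmap("BrBG")  # built-in BrBG, standing in for palettable's version

anomalies = np.array([-35, -10, 0, 5, 25])
fig, ax = plt.subplots(figsize=(6, 1.5))
ax.imshow(anomalies[np.newaxis, :], cmap=cmap, norm=norm, aspect="auto")
ax.set_yticks([])
ax.set_xticks(range(len(anomalies)))
ax.set_xticklabels(anomalies)
ax.set_title("% anomaly mapped through TwoSlopeNorm")
plt.tight_layout()
plt.savefig("twoslope_demo.png", dpi=150)
```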
{geocif-0.1.60 → geocif-0.1.62}/geocif.egg-info/SOURCES.txt

@@ -55,6 +55,7 @@ geocif/ml/trend.py
 geocif/ml/xai.py
 geocif/playground/__init__.py
 geocif/playground/aa.py
+geocif/playground/area.py
 geocif/playground/automl.py
 geocif/playground/download_esi.py
 geocif/playground/enso.py

@@ -75,4 +76,5 @@ geocif/risk/__init__.py
 geocif/risk/impact_assessment.py
 geocif/viz/__init__.py
 geocif/viz/plot.py
+geocif/viz/tmp.py
 tests/test_geocif.py