geocif 0.1.61__tar.gz → 0.1.63__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {geocif-0.1.61/geocif.egg-info → geocif-0.1.63}/PKG-INFO +1 -1
- {geocif-0.1.61 → geocif-0.1.63}/geocif/analysis.py +38 -20
- {geocif-0.1.61 → geocif-0.1.63}/geocif/geocif.py +22 -5
- {geocif-0.1.61 → geocif-0.1.63}/geocif/geocif_runner.py +35 -34
- {geocif-0.1.61 → geocif-0.1.63}/geocif/indices_runner_angola.py +3 -3
- {geocif-0.1.61 → geocif-0.1.63}/geocif/ml/stats.py +5 -2
- {geocif-0.1.61 → geocif-0.1.63}/geocif/ml/trainers.py +5 -1
- geocif-0.1.63/geocif/playground/area.py +117 -0
- geocif-0.1.63/geocif/viz/tmp.py +268 -0
- {geocif-0.1.61 → geocif-0.1.63/geocif.egg-info}/PKG-INFO +1 -1
- {geocif-0.1.61 → geocif-0.1.63}/geocif.egg-info/SOURCES.txt +2 -0
- {geocif-0.1.61 → geocif-0.1.63}/setup.py +1 -1
- {geocif-0.1.61 → geocif-0.1.63}/LICENSE +0 -0
- {geocif-0.1.61 → geocif-0.1.63}/MANIFEST.in +0 -0
- {geocif-0.1.61 → geocif-0.1.63}/README.md +0 -0
- {geocif-0.1.61 → geocif-0.1.63}/geocif/__init__.py +0 -0
- {geocif-0.1.61 → geocif-0.1.63}/geocif/agmet/__init__.py +0 -0
- {geocif-0.1.61 → geocif-0.1.63}/geocif/agmet/geoagmet.py +0 -0
- {geocif-0.1.61 → geocif-0.1.63}/geocif/agmet/plot.py +0 -0
- {geocif-0.1.61 → geocif-0.1.63}/geocif/agmet/utils.py +0 -0
- {geocif-0.1.61 → geocif-0.1.63}/geocif/backup/__init__.py +0 -0
- {geocif-0.1.61 → geocif-0.1.63}/geocif/backup/constants.py +0 -0
- {geocif-0.1.61 → geocif-0.1.63}/geocif/backup/features.py +0 -0
- {geocif-0.1.61 → geocif-0.1.63}/geocif/backup/geo.py +0 -0
- {geocif-0.1.61 → geocif-0.1.63}/geocif/backup/geocif.py +0 -0
- {geocif-0.1.61 → geocif-0.1.63}/geocif/backup/metadata.py +0 -0
- {geocif-0.1.61 → geocif-0.1.63}/geocif/backup/models.py +0 -0
- {geocif-0.1.61 → geocif-0.1.63}/geocif/cei/__init__.py +0 -0
- {geocif-0.1.61 → geocif-0.1.63}/geocif/cei/definitions.py +0 -0
- {geocif-0.1.61 → geocif-0.1.63}/geocif/cei/indices.py +0 -0
- {geocif-0.1.61 → geocif-0.1.63}/geocif/experiments.py +0 -0
- {geocif-0.1.61 → geocif-0.1.63}/geocif/indices_runner.py +0 -0
- {geocif-0.1.61 → geocif-0.1.63}/geocif/indices_runner_madagascar.py +0 -0
- {geocif-0.1.61 → geocif-0.1.63}/geocif/indices_runner_malawi.py +0 -0
- {geocif-0.1.61 → geocif-0.1.63}/geocif/indices_runner_mozambique.py +0 -0
- {geocif-0.1.61 → geocif-0.1.63}/geocif/indices_runner_south_africa.py +0 -0
- {geocif-0.1.61 → geocif-0.1.63}/geocif/indices_runner_zambia.py +0 -0
- {geocif-0.1.61 → geocif-0.1.63}/geocif/indices_runner_zimbabwe.py +0 -0
- {geocif-0.1.61 → geocif-0.1.63}/geocif/logger.py +0 -0
- {geocif-0.1.61 → geocif-0.1.63}/geocif/ml/__init__.py +0 -0
- {geocif-0.1.61 → geocif-0.1.63}/geocif/ml/correlations.py +0 -0
- {geocif-0.1.61 → geocif-0.1.63}/geocif/ml/embedding.py +0 -0
- {geocif-0.1.61 → geocif-0.1.63}/geocif/ml/feature_engineering.py +0 -0
- {geocif-0.1.61 → geocif-0.1.63}/geocif/ml/feature_selection.py +0 -0
- {geocif-0.1.61 → geocif-0.1.63}/geocif/ml/outliers.py +0 -0
- {geocif-0.1.61 → geocif-0.1.63}/geocif/ml/outlook.py +0 -0
- {geocif-0.1.61 → geocif-0.1.63}/geocif/ml/output.py +0 -0
- {geocif-0.1.61 → geocif-0.1.63}/geocif/ml/spatial_autocorrelation.py +0 -0
- {geocif-0.1.61 → geocif-0.1.63}/geocif/ml/stages.py +0 -0
- {geocif-0.1.61 → geocif-0.1.63}/geocif/ml/trend.py +0 -0
- {geocif-0.1.61 → geocif-0.1.63}/geocif/ml/xai.py +0 -0
- {geocif-0.1.61 → geocif-0.1.63}/geocif/mm.py +0 -0
- {geocif-0.1.61 → geocif-0.1.63}/geocif/playground/__init__.py +0 -0
- {geocif-0.1.61 → geocif-0.1.63}/geocif/playground/aa.py +0 -0
- {geocif-0.1.61 → geocif-0.1.63}/geocif/playground/automl.py +0 -0
- {geocif-0.1.61 → geocif-0.1.63}/geocif/playground/download_esi.py +0 -0
- {geocif-0.1.61 → geocif-0.1.63}/geocif/playground/enso.py +0 -0
- {geocif-0.1.61 → geocif-0.1.63}/geocif/playground/eval.py +0 -0
- {geocif-0.1.61 → geocif-0.1.63}/geocif/playground/gamtest.py +0 -0
- {geocif-0.1.61 → geocif-0.1.63}/geocif/playground/gee_access.py +0 -0
- {geocif-0.1.61 → geocif-0.1.63}/geocif/playground/misc.py +0 -0
- {geocif-0.1.61 → geocif-0.1.63}/geocif/playground/play_xagg.py +0 -0
- {geocif-0.1.61 → geocif-0.1.63}/geocif/playground/reg.py +0 -0
- {geocif-0.1.61 → geocif-0.1.63}/geocif/playground/sustain.py +0 -0
- {geocif-0.1.61 → geocif-0.1.63}/geocif/playground/test_catboost.py +0 -0
- {geocif-0.1.61 → geocif-0.1.63}/geocif/playground/tmp.py +0 -0
- {geocif-0.1.61 → geocif-0.1.63}/geocif/playground/tmp2.py +0 -0
- {geocif-0.1.61 → geocif-0.1.63}/geocif/playground/tmp3.py +0 -0
- {geocif-0.1.61 → geocif-0.1.63}/geocif/playground/tmp4.py +0 -0
- {geocif-0.1.61 → geocif-0.1.63}/geocif/playground/tmp5.py +0 -0
- {geocif-0.1.61 → geocif-0.1.63}/geocif/risk/__init__.py +0 -0
- {geocif-0.1.61 → geocif-0.1.63}/geocif/risk/impact_assessment.py +0 -0
- {geocif-0.1.61 → geocif-0.1.63}/geocif/utils.py +0 -0
- {geocif-0.1.61 → geocif-0.1.63}/geocif/viz/__init__.py +0 -0
- {geocif-0.1.61 → geocif-0.1.63}/geocif/viz/plot.py +0 -0
- {geocif-0.1.61 → geocif-0.1.63}/geocif.egg-info/dependency_links.txt +0 -0
- {geocif-0.1.61 → geocif-0.1.63}/geocif.egg-info/not-zip-safe +0 -0
- {geocif-0.1.61 → geocif-0.1.63}/geocif.egg-info/top_level.txt +0 -0
- {geocif-0.1.61 → geocif-0.1.63}/requirements.txt +0 -0
- {geocif-0.1.61 → geocif-0.1.63}/setup.cfg +0 -0
- {geocif-0.1.61 → geocif-0.1.63}/tests/test_geocif.py +0 -0
geocif/analysis.py
@@ -538,34 +538,52 @@ class Geoanalysis:
         country = self.country.title().replace("_", " ")
         crop = self.crop.title().replace("_", " ")
         file = dir_statistics / f"{country}_{crop}_statistics_s1_{self.method}.csv"
-
+        df_all = pd.read_csv(file)
 
-
+        # Keep only the relevant columns and drop NaNs
+        df_all = df_all[["Region", "Harvest Year", "Yield (tn per ha)"]].dropna()
 
-        #
-
+        # --- For computing the % of total production ---
+        # Determine unique years and sort them (in case they aren't already)
+        years = sorted(df_all["Harvest Year"].unique())
+        # Subset dataframe to include only the last 5 years of the dataset
+        last_five_years = years[-5:]
+        df_recent = df_all[df_all["Harvest Year"].isin(last_five_years)]
 
-        #
-
-
-        # Subset dataframe to only include the last years of the dataset
-        df_historic = df_historic[df_historic["Harvest Year"].isin(years[-5:])]
-
-        # For each region, compute the % of the total production
-        df_historic = (
-            df_historic.groupby("Region")["Yield (tn per ha)"]
+        # For each region, compute the % of total production (using yield sum over the last five years)
+        df_pct = (
+            df_recent.groupby("Region")["Yield (tn per ha)"]
             .sum()
             .pipe(lambda x: x / x.sum() * 100)
            .to_frame(name="% of total Area (ha)")
             .reset_index()
         )
-
-        #
-        #
-
-
-
-
+
+        # --- For computing median yields ---
+        # Compute median yield for 2014 - 2018
+        df_median_2014_2018 = (
+            df_all[df_all["Harvest Year"].between(2014, 2018)]
+            .groupby("Region")["Yield (tn per ha)"]
+            .median()
+            .rename(f"Median Yield (tn per ha) (2014-2018)")
+            .reset_index()
+        )
+
+        # Compute median yield for 2013 - 2017
+        df_median_2013_2017 = (
+            df_all[df_all["Harvest Year"].between(2013, 2017)]
+            .groupby("Region")["Yield (tn per ha)"]
+            .median()
+            .rename("Median Yield (tn per ha) (2013-2017)")
+            .reset_index()
+        )
+
+        # Merge the median yield columns with the % of total production dataframe
+        df_historic = (
+            df_pct
+            .merge(df_median_2014_2018, on="Region", how="left")
+            .merge(df_median_2013_2017, on="Region", how="left")
+        )
 
         return df_historic
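Net effect of this hunk: the statistics CSV is reduced to three columns, each region's share of summed yield over the last five seasons is computed, and two fixed-window median yields are merged in. Note that the output column keeps the name "% of total Area (ha)" even though the quantity is a share of summed yield. A minimal sketch of the same pandas pattern on toy data (column names taken from the diff; the values are illustrative):

    import pandas as pd

    # Toy stand-in for the statistics CSV read above
    df_all = pd.DataFrame({
        "Region": ["A", "A", "B", "B"],
        "Harvest Year": [2016, 2017, 2016, 2017],
        "Yield (tn per ha)": [1.0, 2.0, 3.0, 4.0],
    })

    years = sorted(df_all["Harvest Year"].unique())
    df_recent = df_all[df_all["Harvest Year"].isin(years[-5:])]

    # Each region's share of the summed yield over the recent window
    df_pct = (
        df_recent.groupby("Region")["Yield (tn per ha)"]
        .sum()
        .pipe(lambda x: x / x.sum() * 100)  # normalize to percent
        .to_frame(name="% of total Area (ha)")
        .reset_index()
    )

    # Fixed-window median, merged back per region
    df_median = (
        df_all[df_all["Harvest Year"].between(2014, 2018)]
        .groupby("Region")["Yield (tn per ha)"]
        .median()
        .rename("Median Yield (tn per ha) (2014-2018)")
        .reset_index()
    )
    df_historic = df_pct.merge(df_median, on="Region", how="left")
    print(df_historic)  # shares sum to 100; one median column per window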
geocif/geocif.py
@@ -308,9 +308,12 @@ class Geocif:
                 cat_features=self.cat_features,
                 verbose=False,
             )
-        elif self.model_name
-
-
+        elif self.model_name in ["ngboost", "oblique", "tabpfn"]:
+            X_train = X_train.drop(
+                columns=[
+                    item for item in self.cat_features if item != "Harvest Year"
+                ]
+            )
             self.model.fit(X_train, y_train)
         elif self.model_name == "ydf":
             # Combine X_train and y_train
@@ -517,6 +520,14 @@ class Geocif:
         elif self.model_name == "ydf":
             y_pred = self.model.evaluate(X_test)
             best_hyperparameters = {}
+        elif self.model_name == "tabpfn":
+            X_test = X_test.drop(
+                columns=[
+                    item for item in self.cat_features if item != "Harvest Year"
+                ]
+            )
+            y_pred = self.model.predict(X_test)
+            best_hyperparameters = {}
         else:
             y_pred = self.model.predict(X_test)
             best_hyperparameters = self.model.get_params().copy()
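This hunk and the fit-time hunk above drop the same categorical columns, so models that cannot ingest categoricals (ngboost, oblique, tabpfn) see an identical schema at fit and predict time. A small runnable sketch of that symmetry (the frame, target, and `cat_features` list are illustrative, and `LinearRegression` stands in for the models named in the diff):

    import pandas as pd
    from sklearn.linear_model import LinearRegression

    cat_features = ["Region", "Harvest Year"]  # stand-in for self.cat_features

    def drop_unsupported(df):
        # Keep "Harvest Year" (numeric), drop the remaining categoricals
        return df.drop(columns=[c for c in cat_features if c != "Harvest Year"])

    X_train = pd.DataFrame({
        "Region": ["A", "B"],
        "Harvest Year": [2016, 2017],
        "ndvi": [0.4, 0.6],
    })
    y_train = [1.2, 2.3]

    model = LinearRegression()
    model.fit(drop_unsupported(X_train), y_train)      # same columns...
    y_pred = model.predict(drop_unsupported(X_train))  # ...at predict time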
@@ -740,8 +751,6 @@ class Geocif:
 
         if self.median_yield_as_feature:
             self.feature_names.append(f"Median {self.target}")
-            self.feature_names.append(f"Median {self.target} (2014-2018)")
-            self.feature_names.append(f"Median {self.target} (2013-2017)")
 
         if self.lag_yield_as_feature:
             # For the number of years specified in self.number_lag_years
@@ -811,6 +820,8 @@ class Geocif:
             + self.statistics_columns
             + self.feature_names
             + [f"Median {self.target}"]
+            + [f"Median {self.target} (2014-2018)"]
+            + [f"Median {self.target} (2013-2017)"]
             + ["Region_ID"]
         )
         if self.check_yield_trend:
@@ -1280,6 +1291,9 @@ class Geocif:
             self.cluster_strategy = "single"
             self.select_cei_by = "Index"
             self.use_cumulative_features = True
+        elif self.model_name in ["tabpfn"]:
+            self.do_xai = False
+            self.estimate_ci = False
         elif self.model_name in ["oblique", "ydf"]:
             self.do_xai = False
             self.estimate_ci = False
@@ -1360,6 +1374,9 @@ class Geocif:
         if self.country == "nepal":
             self.dg["ADM0_NAME"] = "nepal"
             self.dg["Country Region"] = self.dg["ADM0_NAME"] + " " + self.dg["PR_NAME"]
+        elif self.country == "wolayita":
+            self.dg["ADM0_NAME"] = "ethiopia"
+            self.dg["Country Region"] = self.dg["ADM0_NAME"] + " " + self.dg["W_NAME"]
         elif self.admin_zone == "admin_1":
             self.dg["Country Region"] = (
                 self.dg["ADM0_NAME"] + " " + self.dg["ADM1_NAME"]
geocif/geocif_runner.py
@@ -26,40 +26,41 @@ def loop_execute(inputs):
     Returns:
 
     """
-    from pycallgraph2 import Config, PyCallGraph, GlobbingFilter
-    from pycallgraph2.output import GraphvizOutput
-
-    graphviz = GraphvizOutput()
-    graphviz.output_file = "geocif_visualization.png"
-    plt.rcParams["figure.dpi"] = 600
-    config = Config(max_depth=5)
-    config.trace_filter = GlobbingFilter(
-        exclude=[
-            "pycallgraph.*",
-            "torch*",
-        ]
-    )
-
-    with PyCallGraph(output=graphviz, config=config):
-        project_name, country, crop, season, model, logger, parser, index = inputs
-
-        logger.info("=====================================================")
-        logger.info(f"\tStarting GEOCIF: {country} {crop} {season} {model}")
-        logger.info("=====================================================")
-
-        obj = geocif.Geocif(logger=logger,
-                            parser=parser,
-                            project_name=project_name)
-        obj.read_data(country, crop, season)
-
-        # Store config file in database, only execute this for
-        # the first iteration of the loop
-        if index == 0:
-            output.config_to_db(obj.db_path, obj.parser, obj.today)
-
-        # Setup metadata and run ML code
-        obj.setup(season, model)
-        obj.execute()
+    # from pycallgraph2 import Config, PyCallGraph, GlobbingFilter
+    # from pycallgraph2.output import GraphvizOutput
+    #
+    # graphviz = GraphvizOutput()
+    # graphviz.output_file = "geocif_visualization.png"
+    # plt.rcParams["figure.dpi"] = 600
+    # config = Config(max_depth=5)
+    # config.trace_filter = GlobbingFilter(
+    #     exclude=[
+    #         "pycallgraph.*",
+    #         "torch*",
+    #     ]
+    # )
+    #
+    # with PyCallGraph(output=graphviz, config=config):
+    project_name, country, crop, season, model, logger, parser, index = inputs
+
+    logger.info("=====================================================")
+    logger.info(f"\tStarting GEOCIF: {country} {crop} {season} {model}")
+    logger.info("=====================================================")
+
+    obj = geocif.Geocif(logger=logger,
+                        parser=parser,
+                        project_name=project_name)
+    obj.read_data(country, crop, season)
+
+    # Store config file in database, only execute this for
+    # the first iteration of the loop
+    if index == 0:
+        output.config_to_db(obj.db_path, obj.parser, obj.today)
+
+    # Setup metadata and run ML code
+    obj.setup(season, model)
+    if obj.simulation_stages:
+        obj.execute()
 
 
 def gather_inputs(parser):
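The call-graph profiling is disabled by commenting it out and dedenting the body of `loop_execute`; the only behavioral addition is the `if obj.simulation_stages:` guard before `obj.execute()`. If the profiler is ever needed again, the commented block corresponds roughly to this usage (a sketch; it assumes pycallgraph2 plus a working Graphviz install, and `work()` is a placeholder for the loop body):

    from pycallgraph2 import Config, PyCallGraph, GlobbingFilter
    from pycallgraph2.output import GraphvizOutput

    graphviz = GraphvizOutput()
    graphviz.output_file = "geocif_visualization.png"
    config = Config(max_depth=5)
    config.trace_filter = GlobbingFilter(exclude=["pycallgraph.*", "torch*"])

    def work():
        sum(range(1000))  # placeholder for the profiled body

    with PyCallGraph(output=graphviz, config=config):
        work()  # everything called in here lands in the output PNG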
geocif/indices_runner_angola.py
@@ -12,7 +12,7 @@ warnings.filterwarnings("ignore")
 from .cei import indices
 from geoprepare import base
 
-country = "
+country = "ethiopia"
 
 def remove_duplicates(lst):
     """
@@ -171,10 +171,10 @@ class cei_runner(base.BaseGeo):
         # Only keep those entries in combinations where the third elemt is
         # mozambique, south_africa, angola or dem_people's_rep_of_korea
         # This is done to test the code for these countries
-        combinations = [i for i in combinations if f"{country}
+        combinations = [i for i in combinations if f"{country}_wheat_s1" in i[3]]
 
         if True:
-            num_cpu = int(cpu_count() * 0.
+            num_cpu = int(cpu_count() * 0.9)
             with Pool(num_cpu) as p:
                 for i, _ in enumerate(p.imap_unordered(indices.process, combinations)):
                     pass
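The worker-count change pins the pool at 90% of available cores (the old fraction is truncated in the diff view). For reference, the surrounding pattern is the standard `Pool.imap_unordered` fan-out; a self-contained sketch with a dummy `process` in place of `indices.process`:

    from multiprocessing import Pool, cpu_count

    def process(combination):
        return combination  # dummy stand-in for indices.process

    if __name__ == "__main__":
        combinations = [("proj", "ethiopia", "wheat", f"s{i}") for i in range(8)]
        num_cpu = max(1, int(cpu_count() * 0.9))  # leave ~10% of cores free
        with Pool(num_cpu) as p:
            # Results arrive in completion order and are discarded,
            # matching the runner's fire-and-forget loop
            for _ in p.imap_unordered(process, combinations):
                pass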
geocif/ml/stats.py
@@ -80,8 +80,11 @@ def get_yld_prd(df, name_crop, cntr, region, calendar_year, region_column="ADM1_
 
     # CM_Season should be 1 for the Main season
     # TODO: Make this user specified
-    mask_cm_season = df_tmp["CM_Season"] == 1
-    val = df_tmp.loc[mask_tmp_country & mask_tmp_adm1 & mask_cm_season][calendar_year]
+    if "CM_Season" in df_tmp.columns:
+        mask_cm_season = df_tmp["CM_Season"] == 1
+        val = df_tmp.loc[mask_tmp_country & mask_tmp_adm1 & mask_cm_season][calendar_year]
+    else:
+        val = df_tmp.loc[mask_tmp_country & mask_tmp_adm1][calendar_year]
 
     try:
         if val.isnull().all():
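The guard makes the Main-season filter conditional on the `CM_Season` column actually existing, so statistics sources without that column no longer raise a `KeyError`. A compact runnable version of the same mask logic (toy frame; mask names follow the diff):

    import pandas as pd

    df_tmp = pd.DataFrame({
        "ADM0_NAME": ["ethiopia", "ethiopia"],
        "ADM1_NAME": ["Wolayita", "Wolayita"],
        "CM_Season": [1, 2],  # may be absent in some sources
        "2018": [1.4, 0.9],
    })
    mask_tmp_country = df_tmp["ADM0_NAME"] == "ethiopia"
    mask_tmp_adm1 = df_tmp["ADM1_NAME"] == "Wolayita"

    mask = mask_tmp_country & mask_tmp_adm1
    if "CM_Season" in df_tmp.columns:
        mask &= df_tmp["CM_Season"] == 1  # Main season only
    val = df_tmp.loc[mask]["2018"]
    print(val)  # only the CM_Season == 1 row survives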
geocif/ml/trainers.py
@@ -268,7 +268,7 @@ def auto_train(
     loss_function = "MAPE" if model_type == "REGRESSION" else "MultiClass"
     bootstrap_type = "Bernoulli" if model_type == "CLASSIFICATION" else "MVS"
     hyperparams = {
-        "iterations":
+        "iterations": 2500,
         "learning_rate": 0.025,
         "depth": 6,
         "subsample": 1.0,
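The truncated diff view hides the old `iterations` value; the new dict pins it at 2500 boosting rounds. For context, a dict like this is typically unpacked straight into the CatBoost constructor; a hedged sketch (assumes the catboost package is installed, and the rest of the `auto_train` wiring is not shown in this diff):

    from catboost import CatBoostRegressor

    hyperparams = {
        "iterations": 2500,       # number of boosting rounds
        "learning_rate": 0.025,
        "depth": 6,
        "subsample": 1.0,
        "loss_function": "MAPE",  # per the REGRESSION branch above
        "bootstrap_type": "MVS",
        "verbose": False,
    }
    model = CatBoostRegressor(**hyperparams)
    # model.fit(X_train, y_train, cat_features=cat_features)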
@@ -300,6 +300,10 @@ def auto_train(
             n_estimators=1500, max_depth=20, max_features=n_features**2,
             feature_combinations=n_features, n_jobs=-1, random_state=42
         )
+    elif model_name == "tabpfn":
+        from tabpfn_extensions.post_hoc_ensembles.sklearn_interface import AutoTabPFNRegressor
+
+        model = AutoTabPFNRegressor()
    elif model_name == "ngboost":
         if model_type == "REGRESSION":
             from ngboost import NGBRegressor
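`AutoTabPFNRegressor` is imported from a module named `sklearn_interface`, so a scikit-learn-style fit/predict API is assumed below. A minimal smoke-test sketch (assumes the `tabpfn_extensions` package and its model weights are available; the zero-argument constructor mirrors the diff):

    import numpy as np
    from tabpfn_extensions.post_hoc_ensembles.sklearn_interface import (
        AutoTabPFNRegressor,
    )

    rng = np.random.default_rng(42)
    X = rng.random((100, 5))
    y = X @ np.array([1.0, 0.5, 0.0, -0.5, 2.0])

    model = AutoTabPFNRegressor()  # defaults, as in trainers.py
    model.fit(X, y)
    print(model.predict(X[:3]))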
geocif/playground/area.py (new file)
@@ -0,0 +1,117 @@
+import pandas as pd
+import numpy as np
+import matplotlib.pyplot as plt
+import seaborn as sns
+
+# 1. Read the CSV
+df = pd.read_csv(r"C:\Users\ritvik\Downloads\ET_AgStats.csv")
+
+# 2. Filter for the crop "Maize (Corn)"
+df = df[df['DNL_SourceCrop'] == 'Maize (Corn)']
+
+# 3. Remove rows where "Area Planted: ha" is "NA" or "NC"
+df = df[df['Area Planted: ha'] != 'NA']
+df = df[df['Area Planted: ha'] != 'NC']
+
+df = df[df['Yield: MT/ha'] != 'NA']
+df = df[df['Yield: MT/ha'] != 'NC']
+df = df[df['Yield: MT/ha'] != '#REF!']
+
+# Remove rows where Admin 2 is null
+df = df.dropna(subset=['Admin 2'])
+df = df.dropna(subset=['Yield: MT/ha'])
+
+# 4. Convert "Area Planted: ha" to float by removing commas
+df['Area Planted: ha'] = (
+    df['Area Planted: ha']
+    .str.replace(',', '', regex=False)
+    .astype(float)
+)
+
+df['Yield: MT/ha'] = (
+    df['Yield: MT/ha']
+    .str.replace(',', '', regex=False)
+    .astype(float)
+)
+
+# 5. Group by [region, season] to calculate z-scores
+grouped = df.groupby(['Admin 2', 'Season'])
+anomalies_list = []
+
+for (region, season), group_data in grouped:
+    mean_area = group_data['Area Planted: ha'].mean()
+    std_area = group_data['Area Planted: ha'].std()
+
+    # Avoid division by zero
+    if std_area == 0:
+        group_data['Z_score'] = 0
+    else:
+        group_data['Z_score'] = (group_data['Area Planted: ha'] - mean_area) / std_area
+
+    # Flag anomalies if abs(z-score) > 3
+    group_data['Anomaly'] = group_data['Z_score'].apply(lambda x: 'Yes' if abs(x) > 3 else 'No')
+
+    anomalies_list.append(group_data)
+
+# 6. Concatenate grouped data back together
+df_analyzed = pd.concat(anomalies_list)
+
+# 7. Filter to see only anomalies
+df_anomalies = df_analyzed[df_analyzed['Anomaly'] == 'Yes']
+
+# 8. Print full dataset with anomaly flags and the subset of anomalies
+print("All data with anomaly flags:")
+print(df_analyzed)
+
+print("\nDetected anomalies:")
+print(df_anomalies)
+df_anomalies.to_csv(r"df_anomalies_v2.csv", index=False)
+
+# 11. Distribution of "Yield: MT/ha"
+
+plt.figure(figsize=(8, 5))
+sns.histplot(df['Yield: MT/ha'], kde=True, bins=30)
+plt.title('Distribution of Yield (MT/ha)')
+plt.xlabel('Yield (MT/ha)')
+plt.ylabel('Count')
+plt.tight_layout()
+plt.show()
+
+# count number of values where yield < 1
+low_yield = df[df['Yield: MT/ha'] < 1].shape[0]
+total = df.shape[0]
+print(f"Number of records with yield < 1: {low_yield} / {total}")
+breakpoint()
+# 9. Bar chart of number of anomalies per Season
+anomalies_by_season = df_anomalies['Season'].value_counts()
+plt.figure(figsize=(8, 5))
+anomalies_by_season.plot(kind='bar')
+plt.title('Number of Anomalies per Season')
+plt.xlabel('Season')
+plt.ylabel('Count of Anomalies')
+plt.tight_layout()
+plt.show()
+
+# 10. Heatmap of anomalies by Region (rows) and Year (columns)
+
+# Ensure "Year" is numeric for pivoting
+df_anomalies['Year'] = pd.to_numeric(df_anomalies['Year'], errors='coerce')
+
+# Count how many anomalies per (region, year)
+heatmap_data = df_anomalies.groupby(['Admin 1', 'Year']).size().unstack(fill_value=0)
+
+# Plot the heatmap
+plt.figure(figsize=(10, 6))
+sns.heatmap(
+    heatmap_data,
+    annot=True,
+    cmap='Blues',
+    fmt='d'
+)
+plt.title('Number of Anomalies by Region and Year')
+plt.xlabel('Year')
+plt.ylabel('Region')
+plt.tight_layout()
+plt.show()
+
+
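The per-group z-score loop in `area.py` mutates slices of each group, which pandas typically flags with a `SettingWithCopyWarning`. The same flagging can be done vectorized with `groupby().transform()`; a sketch on toy data using the script's column names:

    import pandas as pd

    df = pd.DataFrame({
        "Admin 2": ["x", "x", "x", "y", "y", "y"],
        "Season": ["Meher"] * 6,
        "Area Planted: ha": [10.0, 11.0, 300.0, 5.0, 6.0, 5.5],
    })

    g = df.groupby(["Admin 2", "Season"])["Area Planted: ha"]
    std = g.transform("std")
    # Zero/NaN std collapses to 0 rather than dividing by zero
    df["Z_score"] = ((df["Area Planted: ha"] - g.transform("mean")) / std).where(std > 0, 0.0)
    df["Anomaly"] = df["Z_score"].abs().gt(3).map({True: "Yes", False: "No"})
    print(df[["Admin 2", "Z_score", "Anomaly"]])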
geocif/viz/tmp.py (new file)
@@ -0,0 +1,268 @@
+import geopandas as gpd
+import pandas as pd
+import matplotlib.pyplot as plt
+import palettable as pal
+import matplotlib.colors as mcolors
+
+import pandas as pd
+import glob
+import os
+
+# 1. Specify the directory containing your .dta files:
+data_dir = r"C:\Users\ritvik\Downloads\maize_yield\maize_yield"
+
+# 2. Use glob to find all .dta files in that directory:
+dta_files = glob.glob(os.path.join(data_dir, "*.dta"))
+
+# 3. Read each .dta file into a pandas DataFrame and store in a list:
+dataframes = [pd.read_stata(f) for f in dta_files]
+
+# 4. Concatenate them all into one DataFrame (row-wise):
+merged_df = pd.concat(dataframes, ignore_index=True)
+
+merged_df['ZONE'] = merged_df['ZONE'].astype(int)
+merged_df['DIST'] = merged_df['DIST'].astype(int)
+
+# create a column called W_CODE which is set up as follows
+# create a string by converting ZONE column to string and append 0
+# to the left of the string to make it 2 characters long
+# then do the same with DIST column
+# finally concatenate the two strings
+merged_df['W_CODE'] = merged_df['ZONE'].astype(str).str.zfill(2) + merged_df['DIST'].astype(str).str.zfill(2)
+
+merged_df['W_CODE'] = '7' + merged_df['W_CODE']
+
+# Remove the .0 at the end of the string in W_CODE
+merged_df['W_CODE'] = merged_df['W_CODE'].str.replace('.0', '')
+merged_df['W_CODE'] = merged_df['W_CODE'].astype(int)
+
+dg = gpd.read_file(r"D:\Users\ritvik\projects\GEOGLAM\Input\countries\wolayita\wolayita_dissolved.shp")
+dg = dg[['W_CODE', 'W_NAME']]
+
+# Merge the two dataframes on W_CODE
+merged_df = pd.merge(merged_df, dg, on='W_CODE', how='left')
+
+# Remove rows where PROD98CQ or AREAH are null
+merged_df = merged_df.dropna(subset=['PROD98CQ', 'AREAH'])
+
+# Compte yield column
+merged_df['yield'] = merged_df['PROD98CQ'] / merged_df['AREAH']
+
+# create a new dataframe which computes average yield by W_NAME for each year
+df_avg_yield = merged_df.groupby(['W_NAME', 'YEAR'])['yield'].mean().reset_index()
+
+# Change W_NAME column to title case
+df_avg_yield['W_NAME'] = df_avg_yield['W_NAME'].str.title()
+
+# Change YEAR to int
+df_avg_yield['YEAR'] = df_avg_yield['YEAR'].astype(int)
+
+# Convert to a format where each YEAR is converted to int and becomes a column and yield is the value
+df_avg_yield = df_avg_yield.pivot(index='W_NAME', columns='YEAR', values='yield')
+
+# Remove YEAR as column name and W_NAME as index name
+df_avg_yield.index.name = None
+df_avg_yield.columns.name = None
+
+df_avg_yield.to_csv('wolayita_yields.csv')
+
+breakpoint()
+# 5. (Optional) Inspect the merged DataFrame
+print(merged_df.head())
+print(len(merged_df))
+merged_df.to_csv('merged_df.csv', index=False)
+breakpoint()
+
+import pandas as pd
+import matplotlib.pyplot as plt
+import seaborn as sns
+
+import geopandas as gpd
+dg = gpd.read_file(r"D:\Users\ritvik\projects\GEOGLAM\wolayita.shp")
+dg = dg[dg['Z_NAME'] == "Wolayita"]
+
+# Dissolve on W_NAME column
+dg = dg.dissolve(by="W_NAME")
+
+# save to disk
+dg.to_file(r"D:\Users\ritvik\projects\GEOGLAM\Input\countries\wolayita\wolayita_dissolved.shp")
+
+breakpoint()
+# 1. Load the dataset
+df = pd.read_csv('merged_df.csv')
+
+# 2. Ensure we have a 'yield' column.
+#    If not present, we compute yield as Maize_Production / Maize_Area.
+if 'yield' not in df.columns:
+    if 'PROD98CQ' in df.columns and 'AREAH' in df.columns:
+        # Compute yield in tonnes per hectare (or adjust unit if needed)
+        df['yield'] = df['PROD98CQ'] / df['AREAH']
+    else:
+        raise ValueError("The required columns to compute yield are missing.")
+
+# 3. Calculate percentage of missing data for yield
+missing_pct_yield = df['yield'].isnull().mean() * 100
+print(f"Percentage of missing data for yield: {missing_pct_yield:.2f}%")
+
+# 4. Check if some years have more or less data
+#    Count the number of records for each year
+year_counts = df['YEAR'].value_counts().sort_index()
+print("\nNumber of records per year:")
+print(year_counts)
+
+# 5. Plot histogram of yield distributions by year
+import seaborn as sns
+
+# Instead of looping and plotting histograms, we can use a boxplot
+plt.figure(figsize=(12, 8))
+
+sns.boxplot(x='YEAR', y='yield', data=df)
+
+# Add labels and title
+plt.xlabel("")
+plt.ylabel("Yield")
+
+plt.show()
+
+
+# Group by YEAR and get size (number of rows)
+df_year_counts = df.groupby('YEAR').size().reset_index(name='Count')
+# Sort by YEAR if you want ascending year order
+df_year_counts.sort_values(by='YEAR', inplace=True)
+
+plt.figure(figsize=(10, 6))
+sns.barplot(data=df_year_counts, x='YEAR', y='Count', color='skyblue', edgecolor='black')
+
+plt.xlabel("")
+plt.ylabel("Number of Yield Records")
+plt.xticks(rotation=45)  # Rotate x labels if needed
+plt.tight_layout()  # Adjust layout to avoid clipping
+plt.show()
+
+
+import pandas as pd
+import numpy as np
+import seaborn as sns
+import matplotlib.pyplot as plt
+
+# 1. Group by FA and YEAR, then calculate the mean yield
+fa_year_yield = df.groupby(['FA', 'YEAR'])['yield'].mean().reset_index()
+
+# 2. Pivot so rows = FA, columns = YEAR, values = average yield
+fa_year_pivot = fa_year_yield.pivot(index='FA', columns='YEAR', values='yield')
+
+# 3. Create the heatmap
+plt.figure(figsize=(12, 8))
+sns.heatmap(
+    fa_year_pivot,
+    cmap='viridis',  # color map; try 'coolwarm' or others
+    annot=False,     # show numeric values in each cell
+    fmt=".2f",       # format numbers (2 decimal places)
+    linewidths=.5    # line width between cells
+)
+
+plt.title("Heatmap of Average Yield by FA and YEAR")
+plt.xlabel("YEAR")
+plt.ylabel("FA")
+plt.tight_layout()
+plt.show()
+
+breakpoint()
+
+
+# --- Read and preprocess your main shapefile ---
+dg = gpd.read_file(r"D:\Users\ritvik\projects\GEOGLAM\safrica.shp")
+
+# remove rows where both ADMIN1 and ADMIN2 are null
+dg = dg.dropna(subset=["ADMIN1", "ADMIN2"], how="all")
+
+# if ADMIN2 is not null then replace ADMIN1 with ADMIN2 values
+dg["ADMIN1"] = dg["ADMIN2"].combine_first(dg["ADMIN1"])
+
+# --- Read your CSV and merge on ADMIN1 ---
+df = pd.read_csv(r"C:\Users\ritvik\Downloads\geocif.csv")
+
+dg = dg.merge(
+    df[["ADMIN1", 'Predicted Yield (tn per ha)',
+        'Median Yield (tn per ha) (2013-2017)', 'Predicted/Median']],
+    on="ADMIN1",
+    how="left"
+)
+
+# --- Create a dissolved national boundary GeoDataFrame ---
+boundary_gdf = dg.dissolve(by="ADMIN0")
+
+# --- Colormap and normalization setup ---
+cmap = pal.colorbrewer.get_map("BrBG", "diverging", 11).mpl_colormap
+norm = mcolors.TwoSlopeNorm(vmin=-40, vcenter=0, vmax=40)
+
+# --- First map: Predicted/Median ---
+fig, ax = plt.subplots(figsize=(10, 6))
+
+# 1) Plot the main layer
+dg.plot(
+    column="Predicted/Median",
+    cmap=cmap,
+    norm=norm,
+    legend=True,
+    ax=ax,
+    edgecolor='gray',
+    linewidth=0.2,
+    legend_kwds={
+        "shrink": 0.5,
+        "pad": 0.002,
+        "orientation": "horizontal"
+    }
+)
+
+url = "https://naciscdn.org/naturalearth/110m/cultural/ne_110m_admin_0_countries.zip"
+
+world = gpd.read_file(url)
+world = world[world['ADMIN'].isin(['South Africa', 'Angola', 'Malawi', 'Zambia'])]
+
+# 2) Plot the dissolved national boundaries on top
+world.plot(
+    ax=ax,
+    color="none",       # No fill
+    edgecolor="black",  # Outline color
+    linewidth=0.5
+)
+
+ax.set_title("Maize Yield Forecast % Anomaly")
+plt.axis("off")
+plt.tight_layout()
+plt.savefig("aa.png", dpi=300)
+plt.close()
+
+
+# --- Second map: Median Yield (2013-2017) ---
+# fig, ax = plt.subplots(figsize=(10, 6))
+#
+# # 1) Plot the main layer
+# dg.plot(
+#     column="Median Yield (tn per ha) (2013-2017)",
+#     cmap=cmap,
+#     legend=True,
+#     ax=ax,
+#     legend_kwds={
+#         "shrink": 0.5,
+#         "pad": 0.002,
+#         "orientation": "horizontal"
+#     }
+# )
+#
+# # 2) Plot the dissolved national boundaries on top
+# boundary_gdf.plot(
+#     ax=ax,
+#     color="none",
+#     edgecolor="black",
+#     linewidth=1
+# )
+#
+# ax.set_title("Median Maize Yield (2013-2017)")
+# plt.axis("off")
+# plt.tight_layout()
+# plt.show()
+# plt.close()
+
+breakpoint()
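One detail worth noting in `tmp.py`: the trailing-`.0` cleanup is only needed when the codes pass through a float dtype, and `.str.replace('.0', '')` without `regex=False` is treated as a regular expression by older pandas defaults (any character followed by a zero). Both issues disappear if the codes are formatted from ints directly; a sketch of the equivalent W_CODE construction on toy values:

    import pandas as pd

    merged_df = pd.DataFrame({"ZONE": [1.0, 12.0], "DIST": [3.0, 7.0]})

    merged_df["W_CODE"] = (
        "7"
        + merged_df["ZONE"].astype(int).astype(str).str.zfill(2)
        + merged_df["DIST"].astype(int).astype(str).str.zfill(2)
    ).astype(int)
    print(merged_df["W_CODE"].tolist())  # [70103, 71207]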
geocif.egg-info/SOURCES.txt
@@ -55,6 +55,7 @@ geocif/ml/trend.py
 geocif/ml/xai.py
 geocif/playground/__init__.py
 geocif/playground/aa.py
+geocif/playground/area.py
 geocif/playground/automl.py
 geocif/playground/download_esi.py
 geocif/playground/enso.py
@@ -75,4 +76,5 @@ geocif/risk/__init__.py
 geocif/risk/impact_assessment.py
 geocif/viz/__init__.py
 geocif/viz/plot.py
+geocif/viz/tmp.py
 tests/test_geocif.py