geocif 0.1.61__tar.gz → 0.1.63__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (81) hide show
  1. {geocif-0.1.61/geocif.egg-info → geocif-0.1.63}/PKG-INFO +1 -1
  2. {geocif-0.1.61 → geocif-0.1.63}/geocif/analysis.py +38 -20
  3. {geocif-0.1.61 → geocif-0.1.63}/geocif/geocif.py +22 -5
  4. {geocif-0.1.61 → geocif-0.1.63}/geocif/geocif_runner.py +35 -34
  5. {geocif-0.1.61 → geocif-0.1.63}/geocif/indices_runner_angola.py +3 -3
  6. {geocif-0.1.61 → geocif-0.1.63}/geocif/ml/stats.py +5 -2
  7. {geocif-0.1.61 → geocif-0.1.63}/geocif/ml/trainers.py +5 -1
  8. geocif-0.1.63/geocif/playground/area.py +117 -0
  9. geocif-0.1.63/geocif/viz/tmp.py +268 -0
  10. {geocif-0.1.61 → geocif-0.1.63/geocif.egg-info}/PKG-INFO +1 -1
  11. {geocif-0.1.61 → geocif-0.1.63}/geocif.egg-info/SOURCES.txt +2 -0
  12. {geocif-0.1.61 → geocif-0.1.63}/setup.py +1 -1
  13. {geocif-0.1.61 → geocif-0.1.63}/LICENSE +0 -0
  14. {geocif-0.1.61 → geocif-0.1.63}/MANIFEST.in +0 -0
  15. {geocif-0.1.61 → geocif-0.1.63}/README.md +0 -0
  16. {geocif-0.1.61 → geocif-0.1.63}/geocif/__init__.py +0 -0
  17. {geocif-0.1.61 → geocif-0.1.63}/geocif/agmet/__init__.py +0 -0
  18. {geocif-0.1.61 → geocif-0.1.63}/geocif/agmet/geoagmet.py +0 -0
  19. {geocif-0.1.61 → geocif-0.1.63}/geocif/agmet/plot.py +0 -0
  20. {geocif-0.1.61 → geocif-0.1.63}/geocif/agmet/utils.py +0 -0
  21. {geocif-0.1.61 → geocif-0.1.63}/geocif/backup/__init__.py +0 -0
  22. {geocif-0.1.61 → geocif-0.1.63}/geocif/backup/constants.py +0 -0
  23. {geocif-0.1.61 → geocif-0.1.63}/geocif/backup/features.py +0 -0
  24. {geocif-0.1.61 → geocif-0.1.63}/geocif/backup/geo.py +0 -0
  25. {geocif-0.1.61 → geocif-0.1.63}/geocif/backup/geocif.py +0 -0
  26. {geocif-0.1.61 → geocif-0.1.63}/geocif/backup/metadata.py +0 -0
  27. {geocif-0.1.61 → geocif-0.1.63}/geocif/backup/models.py +0 -0
  28. {geocif-0.1.61 → geocif-0.1.63}/geocif/cei/__init__.py +0 -0
  29. {geocif-0.1.61 → geocif-0.1.63}/geocif/cei/definitions.py +0 -0
  30. {geocif-0.1.61 → geocif-0.1.63}/geocif/cei/indices.py +0 -0
  31. {geocif-0.1.61 → geocif-0.1.63}/geocif/experiments.py +0 -0
  32. {geocif-0.1.61 → geocif-0.1.63}/geocif/indices_runner.py +0 -0
  33. {geocif-0.1.61 → geocif-0.1.63}/geocif/indices_runner_madagascar.py +0 -0
  34. {geocif-0.1.61 → geocif-0.1.63}/geocif/indices_runner_malawi.py +0 -0
  35. {geocif-0.1.61 → geocif-0.1.63}/geocif/indices_runner_mozambique.py +0 -0
  36. {geocif-0.1.61 → geocif-0.1.63}/geocif/indices_runner_south_africa.py +0 -0
  37. {geocif-0.1.61 → geocif-0.1.63}/geocif/indices_runner_zambia.py +0 -0
  38. {geocif-0.1.61 → geocif-0.1.63}/geocif/indices_runner_zimbabwe.py +0 -0
  39. {geocif-0.1.61 → geocif-0.1.63}/geocif/logger.py +0 -0
  40. {geocif-0.1.61 → geocif-0.1.63}/geocif/ml/__init__.py +0 -0
  41. {geocif-0.1.61 → geocif-0.1.63}/geocif/ml/correlations.py +0 -0
  42. {geocif-0.1.61 → geocif-0.1.63}/geocif/ml/embedding.py +0 -0
  43. {geocif-0.1.61 → geocif-0.1.63}/geocif/ml/feature_engineering.py +0 -0
  44. {geocif-0.1.61 → geocif-0.1.63}/geocif/ml/feature_selection.py +0 -0
  45. {geocif-0.1.61 → geocif-0.1.63}/geocif/ml/outliers.py +0 -0
  46. {geocif-0.1.61 → geocif-0.1.63}/geocif/ml/outlook.py +0 -0
  47. {geocif-0.1.61 → geocif-0.1.63}/geocif/ml/output.py +0 -0
  48. {geocif-0.1.61 → geocif-0.1.63}/geocif/ml/spatial_autocorrelation.py +0 -0
  49. {geocif-0.1.61 → geocif-0.1.63}/geocif/ml/stages.py +0 -0
  50. {geocif-0.1.61 → geocif-0.1.63}/geocif/ml/trend.py +0 -0
  51. {geocif-0.1.61 → geocif-0.1.63}/geocif/ml/xai.py +0 -0
  52. {geocif-0.1.61 → geocif-0.1.63}/geocif/mm.py +0 -0
  53. {geocif-0.1.61 → geocif-0.1.63}/geocif/playground/__init__.py +0 -0
  54. {geocif-0.1.61 → geocif-0.1.63}/geocif/playground/aa.py +0 -0
  55. {geocif-0.1.61 → geocif-0.1.63}/geocif/playground/automl.py +0 -0
  56. {geocif-0.1.61 → geocif-0.1.63}/geocif/playground/download_esi.py +0 -0
  57. {geocif-0.1.61 → geocif-0.1.63}/geocif/playground/enso.py +0 -0
  58. {geocif-0.1.61 → geocif-0.1.63}/geocif/playground/eval.py +0 -0
  59. {geocif-0.1.61 → geocif-0.1.63}/geocif/playground/gamtest.py +0 -0
  60. {geocif-0.1.61 → geocif-0.1.63}/geocif/playground/gee_access.py +0 -0
  61. {geocif-0.1.61 → geocif-0.1.63}/geocif/playground/misc.py +0 -0
  62. {geocif-0.1.61 → geocif-0.1.63}/geocif/playground/play_xagg.py +0 -0
  63. {geocif-0.1.61 → geocif-0.1.63}/geocif/playground/reg.py +0 -0
  64. {geocif-0.1.61 → geocif-0.1.63}/geocif/playground/sustain.py +0 -0
  65. {geocif-0.1.61 → geocif-0.1.63}/geocif/playground/test_catboost.py +0 -0
  66. {geocif-0.1.61 → geocif-0.1.63}/geocif/playground/tmp.py +0 -0
  67. {geocif-0.1.61 → geocif-0.1.63}/geocif/playground/tmp2.py +0 -0
  68. {geocif-0.1.61 → geocif-0.1.63}/geocif/playground/tmp3.py +0 -0
  69. {geocif-0.1.61 → geocif-0.1.63}/geocif/playground/tmp4.py +0 -0
  70. {geocif-0.1.61 → geocif-0.1.63}/geocif/playground/tmp5.py +0 -0
  71. {geocif-0.1.61 → geocif-0.1.63}/geocif/risk/__init__.py +0 -0
  72. {geocif-0.1.61 → geocif-0.1.63}/geocif/risk/impact_assessment.py +0 -0
  73. {geocif-0.1.61 → geocif-0.1.63}/geocif/utils.py +0 -0
  74. {geocif-0.1.61 → geocif-0.1.63}/geocif/viz/__init__.py +0 -0
  75. {geocif-0.1.61 → geocif-0.1.63}/geocif/viz/plot.py +0 -0
  76. {geocif-0.1.61 → geocif-0.1.63}/geocif.egg-info/dependency_links.txt +0 -0
  77. {geocif-0.1.61 → geocif-0.1.63}/geocif.egg-info/not-zip-safe +0 -0
  78. {geocif-0.1.61 → geocif-0.1.63}/geocif.egg-info/top_level.txt +0 -0
  79. {geocif-0.1.61 → geocif-0.1.63}/requirements.txt +0 -0
  80. {geocif-0.1.61 → geocif-0.1.63}/setup.cfg +0 -0
  81. {geocif-0.1.61 → geocif-0.1.63}/tests/test_geocif.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: geocif
3
- Version: 0.1.61
3
+ Version: 0.1.63
4
4
  Summary: Models to visualize and forecast crop conditions and yields
5
5
  Home-page: https://ritviksahajpal.github.io/yield_forecasting/
6
6
  Author: Ritvik Sahajpal
@@ -538,34 +538,52 @@ class Geoanalysis:
538
538
  country = self.country.title().replace("_", " ")
539
539
  crop = self.crop.title().replace("_", " ")
540
540
  file = dir_statistics / f"{country}_{crop}_statistics_s1_{self.method}.csv"
541
- df_historic = pd.read_csv(file)
541
+ df_all = pd.read_csv(file)
542
542
 
543
- df_historic = df_historic[["Region", "Harvest Year", "Yield (tn per ha)"]]
543
+ # Keep only the relevant columns and drop NaNs
544
+ df_all = df_all[["Region", "Harvest Year", "Yield (tn per ha)"]].dropna()
544
545
 
545
- # Drop rows with NaN values
546
- df_historic = df_historic.dropna()
546
+ # --- For computing the % of total production ---
547
+ # Determine unique years and sort them (in case they aren't already)
548
+ years = sorted(df_all["Harvest Year"].unique())
549
+ # Subset dataframe to include only the last 5 years of the dataset
550
+ last_five_years = years[-5:]
551
+ df_recent = df_all[df_all["Harvest Year"].isin(last_five_years)]
547
552
 
548
- # Determine unique years
549
- years = df_historic["Harvest Year"].unique()
550
-
551
- # Subset dataframe to only include the last years of the dataset
552
- df_historic = df_historic[df_historic["Harvest Year"].isin(years[-5:])]
553
-
554
- # For each region, compute the % of the total production
555
- df_historic = (
556
- df_historic.groupby("Region")["Yield (tn per ha)"]
553
+ # For each region, compute the % of total production (using yield sum over the last five years)
554
+ df_pct = (
555
+ df_recent.groupby("Region")["Yield (tn per ha)"]
557
556
  .sum()
558
557
  .pipe(lambda x: x / x.sum() * 100)
559
558
  .to_frame(name="% of total Area (ha)")
560
559
  .reset_index()
561
560
  )
562
- # Find median yield for each region
563
- # df_historic = (
564
- # df_historic.groupby("Region")["Yield (tn per ha)"]
565
- # .median()
566
- # .to_frame(name="Median Yield (tn per ha)")
567
- # .reset_index()
568
- # )
561
+
562
+ # --- For computing median yields ---
563
+ # Compute median yield for 2014 - 2018
564
+ df_median_2014_2018 = (
565
+ df_all[df_all["Harvest Year"].between(2014, 2018)]
566
+ .groupby("Region")["Yield (tn per ha)"]
567
+ .median()
568
+ .rename(f"Median Yield (tn per ha) (2014-2018)")
569
+ .reset_index()
570
+ )
571
+
572
+ # Compute median yield for 2013 - 2017
573
+ df_median_2013_2017 = (
574
+ df_all[df_all["Harvest Year"].between(2013, 2017)]
575
+ .groupby("Region")["Yield (tn per ha)"]
576
+ .median()
577
+ .rename("Median Yield (tn per ha) (2013-2017)")
578
+ .reset_index()
579
+ )
580
+
581
+ # Merge the median yield columns with the % of total production dataframe
582
+ df_historic = (
583
+ df_pct
584
+ .merge(df_median_2014_2018, on="Region", how="left")
585
+ .merge(df_median_2013_2017, on="Region", how="left")
586
+ )
569
587
 
570
588
  return df_historic
571
589
 
@@ -308,9 +308,12 @@ class Geocif:
308
308
  cat_features=self.cat_features,
309
309
  verbose=False,
310
310
  )
311
- elif self.model_name == "ngboost":
312
- self.model.fit(X_train, y_train)
313
- elif self.model_name in ["oblique"]:
311
+ elif self.model_name in ["ngboost", "oblique", "tabpfn"]:
312
+ X_train = X_train.drop(
313
+ columns=[
314
+ item for item in self.cat_features if item != "Harvest Year"
315
+ ]
316
+ )
314
317
  self.model.fit(X_train, y_train)
315
318
  elif self.model_name == "ydf":
316
319
  # Combine X_train and y_train
@@ -517,6 +520,14 @@ class Geocif:
517
520
  elif self.model_name == "ydf":
518
521
  y_pred = self.model.evaluate(X_test)
519
522
  best_hyperparameters = {}
523
+ elif self.model_name == "tabpfn":
524
+ X_test = X_test.drop(
525
+ columns=[
526
+ item for item in self.cat_features if item != "Harvest Year"
527
+ ]
528
+ )
529
+ y_pred = self.model.predict(X_test)
530
+ best_hyperparameters = {}
520
531
  else:
521
532
  y_pred = self.model.predict(X_test)
522
533
  best_hyperparameters = self.model.get_params().copy()
@@ -740,8 +751,6 @@ class Geocif:
740
751
 
741
752
  if self.median_yield_as_feature:
742
753
  self.feature_names.append(f"Median {self.target}")
743
- self.feature_names.append(f"Median {self.target} (2014-2018)")
744
- self.feature_names.append(f"Median {self.target} (2013-2017)")
745
754
 
746
755
  if self.lag_yield_as_feature:
747
756
  # For the number of years specified in self.number_lag_years
@@ -811,6 +820,8 @@ class Geocif:
811
820
  + self.statistics_columns
812
821
  + self.feature_names
813
822
  + [f"Median {self.target}"]
823
+ + [f"Median {self.target} (2014-2018)"]
824
+ + [f"Median {self.target} (2013-2017)"]
814
825
  + ["Region_ID"]
815
826
  )
816
827
  if self.check_yield_trend:
@@ -1280,6 +1291,9 @@ class Geocif:
1280
1291
  self.cluster_strategy = "single"
1281
1292
  self.select_cei_by = "Index"
1282
1293
  self.use_cumulative_features = True
1294
+ elif self.model_name in ["tabpfn"]:
1295
+ self.do_xai = False
1296
+ self.estimate_ci = False
1283
1297
  elif self.model_name in ["oblique", "ydf"]:
1284
1298
  self.do_xai = False
1285
1299
  self.estimate_ci = False
@@ -1360,6 +1374,9 @@ class Geocif:
1360
1374
  if self.country == "nepal":
1361
1375
  self.dg["ADM0_NAME"] = "nepal"
1362
1376
  self.dg["Country Region"] = self.dg["ADM0_NAME"] + " " + self.dg["PR_NAME"]
1377
+ elif self.country == "wolayita":
1378
+ self.dg["ADM0_NAME"] = "ethiopia"
1379
+ self.dg["Country Region"] = self.dg["ADM0_NAME"] + " " + self.dg["W_NAME"]
1363
1380
  elif self.admin_zone == "admin_1":
1364
1381
  self.dg["Country Region"] = (
1365
1382
  self.dg["ADM0_NAME"] + " " + self.dg["ADM1_NAME"]
@@ -26,40 +26,41 @@ def loop_execute(inputs):
26
26
  Returns:
27
27
 
28
28
  """
29
- from pycallgraph2 import Config, PyCallGraph, GlobbingFilter
30
- from pycallgraph2.output import GraphvizOutput
31
-
32
- graphviz = GraphvizOutput()
33
- graphviz.output_file = "geocif_visualization.png"
34
- plt.rcParams["figure.dpi"] = 600
35
- config = Config(max_depth=5)
36
- config.trace_filter = GlobbingFilter(
37
- exclude=[
38
- "pycallgraph.*",
39
- ]
40
- )
41
-
42
- with PyCallGraph(output=graphviz, config=config):
43
- project_name, country, crop, season, model, logger, parser, index = inputs
44
-
45
- logger.info("=====================================================")
46
- logger.info(f"\tStarting GEOCIF: {country} {crop} {season} {model}")
47
- logger.info("=====================================================")
48
-
49
- obj = geocif.Geocif(logger=logger,
50
- parser=parser,
51
- project_name=project_name)
52
- obj.read_data(country, crop, season)
53
-
54
- # Store config file in database, only execute this for
55
- # the first iteration of the loop
56
- if index == 0:
57
- output.config_to_db(obj.db_path, obj.parser, obj.today)
58
-
59
- # Setup metadata and run ML code
60
- obj.setup(season, model)
61
- if obj.simulation_stages:
62
- obj.execute()
29
+ # from pycallgraph2 import Config, PyCallGraph, GlobbingFilter
30
+ # from pycallgraph2.output import GraphvizOutput
31
+ #
32
+ # graphviz = GraphvizOutput()
33
+ # graphviz.output_file = "geocif_visualization.png"
34
+ # plt.rcParams["figure.dpi"] = 600
35
+ # config = Config(max_depth=5)
36
+ # config.trace_filter = GlobbingFilter(
37
+ # exclude=[
38
+ # "pycallgraph.*",
39
+ # "torch*",
40
+ # ]
41
+ # )
42
+ #
43
+ # with PyCallGraph(output=graphviz, config=config):
44
+ project_name, country, crop, season, model, logger, parser, index = inputs
45
+
46
+ logger.info("=====================================================")
47
+ logger.info(f"\tStarting GEOCIF: {country} {crop} {season} {model}")
48
+ logger.info("=====================================================")
49
+
50
+ obj = geocif.Geocif(logger=logger,
51
+ parser=parser,
52
+ project_name=project_name)
53
+ obj.read_data(country, crop, season)
54
+
55
+ # Store config file in database, only execute this for
56
+ # the first iteration of the loop
57
+ if index == 0:
58
+ output.config_to_db(obj.db_path, obj.parser, obj.today)
59
+
60
+ # Setup metadata and run ML code
61
+ obj.setup(season, model)
62
+ if obj.simulation_stages:
63
+ obj.execute()
63
64
 
64
65
 
65
66
  def gather_inputs(parser):
@@ -12,7 +12,7 @@ warnings.filterwarnings("ignore")
12
12
  from .cei import indices
13
13
  from geoprepare import base
14
14
 
15
- country = "angola"
15
+ country = "ethiopia"
16
16
 
17
17
  def remove_duplicates(lst):
18
18
  """
@@ -171,10 +171,10 @@ class cei_runner(base.BaseGeo):
171
171
  # Only keep those entries in combinations where the third element is
172
172
  # mozambique, south_africa, angola or dem_people's_rep_of_korea
173
173
  # This is done to test the code for these countries
174
- combinations = [i for i in combinations if f"{country}_maize_s1" in i[3]]
174
+ combinations = [i for i in combinations if f"{country}_wheat_s1" in i[3]]
175
175
 
176
176
  if True:
177
- num_cpu = int(cpu_count() * 0.1)
177
+ num_cpu = int(cpu_count() * 0.9)
178
178
  with Pool(num_cpu) as p:
179
179
  for i, _ in enumerate(p.imap_unordered(indices.process, combinations)):
180
180
  pass
@@ -80,8 +80,11 @@ def get_yld_prd(df, name_crop, cntr, region, calendar_year, region_column="ADM1_
80
80
 
81
81
  # CM_Season should be 1 for the Main season
82
82
  # TODO: Make this user specified
83
- mask_cm_season = df_tmp["CM_Season"] == 1
84
- val = df_tmp.loc[mask_tmp_country & mask_tmp_adm1 & mask_cm_season][calendar_year]
83
+ if "CM_Season" in df_tmp.columns:
84
+ mask_cm_season = df_tmp["CM_Season"] == 1
85
+ val = df_tmp.loc[mask_tmp_country & mask_tmp_adm1 & mask_cm_season][calendar_year]
86
+ else:
87
+ val = df_tmp.loc[mask_tmp_country & mask_tmp_adm1][calendar_year]
85
88
 
86
89
  try:
87
90
  if val.isnull().all():
@@ -268,7 +268,7 @@ def auto_train(
268
268
  loss_function = "MAPE" if model_type == "REGRESSION" else "MultiClass"
269
269
  bootstrap_type = "Bernoulli" if model_type == "CLASSIFICATION" else "MVS"
270
270
  hyperparams = {
271
- "iterations": 3500,
271
+ "iterations": 2500,
272
272
  "learning_rate": 0.025,
273
273
  "depth": 6,
274
274
  "subsample": 1.0,
@@ -300,6 +300,10 @@ def auto_train(
300
300
  n_estimators=1500, max_depth=20, max_features=n_features**2,
301
301
  feature_combinations=n_features, n_jobs=-1, random_state=42
302
302
  )
303
+ elif model_name == "tabpfn":
304
+ from tabpfn_extensions.post_hoc_ensembles.sklearn_interface import AutoTabPFNRegressor
305
+
306
+ model = AutoTabPFNRegressor()
303
307
  elif model_name == "ngboost":
304
308
  if model_type == "REGRESSION":
305
309
  from ngboost import NGBRegressor
@@ -0,0 +1,117 @@
1
+ import pandas as pd
2
+ import numpy as np
3
+ import matplotlib.pyplot as plt
4
+ import seaborn as sns
5
+
6
+ # 1. Read the CSV
7
+ df = pd.read_csv(r"C:\Users\ritvik\Downloads\ET_AgStats.csv")
8
+
9
+ # 2. Filter for the crop "Maize (Corn)"
10
+ df = df[df['DNL_SourceCrop'] == 'Maize (Corn)']
11
+
12
+ # 3. Remove rows where "Area Planted: ha" is "NA" or "NC"
13
+ df = df[df['Area Planted: ha'] != 'NA']
14
+ df = df[df['Area Planted: ha'] != 'NC']
15
+
16
+ df = df[df['Yield: MT/ha'] != 'NA']
17
+ df = df[df['Yield: MT/ha'] != 'NC']
18
+ df = df[df['Yield: MT/ha'] != '#REF!']
19
+
20
+ # Remove rows where Admin 2 is null
21
+ df = df.dropna(subset=['Admin 2'])
22
+ df = df.dropna(subset=['Yield: MT/ha'])
23
+
24
+ # 4. Convert "Area Planted: ha" to float by removing commas
25
+ df['Area Planted: ha'] = (
26
+ df['Area Planted: ha']
27
+ .str.replace(',', '', regex=False)
28
+ .astype(float)
29
+ )
30
+
31
+ df['Yield: MT/ha'] = (
32
+ df['Yield: MT/ha']
33
+ .str.replace(',', '', regex=False)
34
+ .astype(float)
35
+ )
36
+
37
+ # 5. Group by [region, season] to calculate z-scores
38
+ grouped = df.groupby(['Admin 2', 'Season'])
39
+ anomalies_list = []
40
+
41
+ for (region, season), group_data in grouped:
42
+ mean_area = group_data['Area Planted: ha'].mean()
43
+ std_area = group_data['Area Planted: ha'].std()
44
+
45
+ # Avoid division by zero
46
+ if std_area == 0:
47
+ group_data['Z_score'] = 0
48
+ else:
49
+ group_data['Z_score'] = (group_data['Area Planted: ha'] - mean_area) / std_area
50
+
51
+ # Flag anomalies if abs(z-score) > 3
52
+ group_data['Anomaly'] = group_data['Z_score'].apply(lambda x: 'Yes' if abs(x) > 3 else 'No')
53
+
54
+ anomalies_list.append(group_data)
55
+
56
+ # 6. Concatenate grouped data back together
57
+ df_analyzed = pd.concat(anomalies_list)
58
+
59
+ # 7. Filter to see only anomalies
60
+ df_anomalies = df_analyzed[df_analyzed['Anomaly'] == 'Yes']
61
+
62
+ # 8. Print full dataset with anomaly flags and the subset of anomalies
63
+ print("All data with anomaly flags:")
64
+ print(df_analyzed)
65
+
66
+ print("\nDetected anomalies:")
67
+ print(df_anomalies)
68
+ df_anomalies.to_csv(r"df_anomalies_v2.csv", index=False)
69
+
70
+ # 11. Distribution of "Yield: MT/ha"
71
+
72
+ plt.figure(figsize=(8, 5))
73
+ sns.histplot(df['Yield: MT/ha'], kde=True, bins=30)
74
+ plt.title('Distribution of Yield (MT/ha)')
75
+ plt.xlabel('Yield (MT/ha)')
76
+ plt.ylabel('Count')
77
+ plt.tight_layout()
78
+ plt.show()
79
+
80
+ # count number of values where yield < 1
81
+ low_yield = df[df['Yield: MT/ha'] < 1].shape[0]
82
+ total = df.shape[0]
83
+ print(f"Number of records with yield < 1: {low_yield} / {total}")
84
+ breakpoint()
85
+ # 9. Bar chart of number of anomalies per Season
86
+ anomalies_by_season = df_anomalies['Season'].value_counts()
87
+ plt.figure(figsize=(8, 5))
88
+ anomalies_by_season.plot(kind='bar')
89
+ plt.title('Number of Anomalies per Season')
90
+ plt.xlabel('Season')
91
+ plt.ylabel('Count of Anomalies')
92
+ plt.tight_layout()
93
+ plt.show()
94
+
95
+ # 10. Heatmap of anomalies by Region (rows) and Year (columns)
96
+
97
+ # Ensure "Year" is numeric for pivoting
98
+ df_anomalies['Year'] = pd.to_numeric(df_anomalies['Year'], errors='coerce')
99
+
100
+ # Count how many anomalies per (region, year)
101
+ heatmap_data = df_anomalies.groupby(['Admin 1', 'Year']).size().unstack(fill_value=0)
102
+
103
+ # Plot the heatmap
104
+ plt.figure(figsize=(10, 6))
105
+ sns.heatmap(
106
+ heatmap_data,
107
+ annot=True,
108
+ cmap='Blues',
109
+ fmt='d'
110
+ )
111
+ plt.title('Number of Anomalies by Region and Year')
112
+ plt.xlabel('Year')
113
+ plt.ylabel('Region')
114
+ plt.tight_layout()
115
+ plt.show()
116
+
117
+
@@ -0,0 +1,268 @@
1
+ import geopandas as gpd
2
+ import pandas as pd
3
+ import matplotlib.pyplot as plt
4
+ import palettable as pal
5
+ import matplotlib.colors as mcolors
6
+
7
+ import pandas as pd
8
+ import glob
9
+ import os
10
+
11
+ # 1. Specify the directory containing your .dta files:
12
+ data_dir = r"C:\Users\ritvik\Downloads\maize_yield\maize_yield"
13
+
14
+ # 2. Use glob to find all .dta files in that directory:
15
+ dta_files = glob.glob(os.path.join(data_dir, "*.dta"))
16
+
17
+ # 3. Read each .dta file into a pandas DataFrame and store in a list:
18
+ dataframes = [pd.read_stata(f) for f in dta_files]
19
+
20
+ # 4. Concatenate them all into one DataFrame (row-wise):
21
+ merged_df = pd.concat(dataframes, ignore_index=True)
22
+
23
+ merged_df['ZONE'] = merged_df['ZONE'].astype(int)
24
+ merged_df['DIST'] = merged_df['DIST'].astype(int)
25
+
26
+ # create a column called W_CODE which is set up as follows
27
+ # create a string by converting ZONE column to string and append 0
28
+ # to the left of the string to make it 2 characters long
29
+ # then do the same with DIST column
30
+ # finally concatenate the two strings
31
+ merged_df['W_CODE'] = merged_df['ZONE'].astype(str).str.zfill(2) + merged_df['DIST'].astype(str).str.zfill(2)
32
+
33
+ merged_df['W_CODE'] = '7' + merged_df['W_CODE']
34
+
35
+ # Remove the .0 at the end of the string in W_CODE
36
+ merged_df['W_CODE'] = merged_df['W_CODE'].str.replace('.0', '')
37
+ merged_df['W_CODE'] = merged_df['W_CODE'].astype(int)
38
+
39
+ dg = gpd.read_file(r"D:\Users\ritvik\projects\GEOGLAM\Input\countries\wolayita\wolayita_dissolved.shp")
40
+ dg = dg[['W_CODE', 'W_NAME']]
41
+
42
+ # Merge the two dataframes on W_CODE
43
+ merged_df = pd.merge(merged_df, dg, on='W_CODE', how='left')
44
+
45
+ # Remove rows where PROD98CQ or AREAH are null
46
+ merged_df = merged_df.dropna(subset=['PROD98CQ', 'AREAH'])
47
+
48
+ # Compute yield column
49
+ merged_df['yield'] = merged_df['PROD98CQ'] / merged_df['AREAH']
50
+
51
+ # create a new dataframe which computes average yield by W_NAME for each year
52
+ df_avg_yield = merged_df.groupby(['W_NAME', 'YEAR'])['yield'].mean().reset_index()
53
+
54
+ # Change W_NAME column to title case
55
+ df_avg_yield['W_NAME'] = df_avg_yield['W_NAME'].str.title()
56
+
57
+ # Change YEAR to int
58
+ df_avg_yield['YEAR'] = df_avg_yield['YEAR'].astype(int)
59
+
60
+ # Convert to a format where each YEAR is converted to int and becomes a column and yield is the value
61
+ df_avg_yield = df_avg_yield.pivot(index='W_NAME', columns='YEAR', values='yield')
62
+
63
+ # Remove YEAR as column name and W_NAME as index name
64
+ df_avg_yield.index.name = None
65
+ df_avg_yield.columns.name = None
66
+
67
+ df_avg_yield.to_csv('wolayita_yields.csv')
68
+
69
+ breakpoint()
70
+ # 5. (Optional) Inspect the merged DataFrame
71
+ print(merged_df.head())
72
+ print(len(merged_df))
73
+ merged_df.to_csv('merged_df.csv', index=False)
74
+ breakpoint()
75
+
76
+ import pandas as pd
77
+ import matplotlib.pyplot as plt
78
+ import seaborn as sns
79
+
80
+ import geopandas as gpd
81
+ dg = gpd.read_file(r"D:\Users\ritvik\projects\GEOGLAM\wolayita.shp")
82
+ dg = dg[dg['Z_NAME'] == "Wolayita"]
83
+
84
+ # Dissolve on W_NAME column
85
+ dg = dg.dissolve(by="W_NAME")
86
+
87
+ # save to disk
88
+ dg.to_file(r"D:\Users\ritvik\projects\GEOGLAM\Input\countries\wolayita\wolayita_dissolved.shp")
89
+
90
+ breakpoint()
91
+ # 1. Load the dataset
92
+ df = pd.read_csv('merged_df.csv')
93
+
94
+ # 2. Ensure we have a 'yield' column.
95
+ # If not present, we compute yield as Maize_Production / Maize_Area.
96
+ if 'yield' not in df.columns:
97
+ if 'PROD98CQ' in df.columns and 'AREAH' in df.columns:
98
+ # Compute yield in tonnes per hectare (or adjust unit if needed)
99
+ df['yield'] = df['PROD98CQ'] / df['AREAH']
100
+ else:
101
+ raise ValueError("The required columns to compute yield are missing.")
102
+
103
+ # 3. Calculate percentage of missing data for yield
104
+ missing_pct_yield = df['yield'].isnull().mean() * 100
105
+ print(f"Percentage of missing data for yield: {missing_pct_yield:.2f}%")
106
+
107
+ # 4. Check if some years have more or less data
108
+ # Count the number of records for each year
109
+ year_counts = df['YEAR'].value_counts().sort_index()
110
+ print("\nNumber of records per year:")
111
+ print(year_counts)
112
+
113
+ # 5. Plot histogram of yield distributions by year
114
+ import seaborn as sns
115
+
116
+ # Instead of looping and plotting histograms, we can use a boxplot
117
+ plt.figure(figsize=(12, 8))
118
+
119
+ sns.boxplot(x='YEAR', y='yield', data=df)
120
+
121
+ # Add labels and title
122
+ plt.xlabel("")
123
+ plt.ylabel("Yield")
124
+
125
+ plt.show()
126
+
127
+
128
+ # Group by YEAR and get size (number of rows)
129
+ df_year_counts = df.groupby('YEAR').size().reset_index(name='Count')
130
+ # Sort by YEAR if you want ascending year order
131
+ df_year_counts.sort_values(by='YEAR', inplace=True)
132
+
133
+ plt.figure(figsize=(10, 6))
134
+ sns.barplot(data=df_year_counts, x='YEAR', y='Count', color='skyblue', edgecolor='black')
135
+
136
+ plt.xlabel("")
137
+ plt.ylabel("Number of Yield Records")
138
+ plt.xticks(rotation=45) # Rotate x labels if needed
139
+ plt.tight_layout() # Adjust layout to avoid clipping
140
+ plt.show()
141
+
142
+
143
+ import pandas as pd
144
+ import numpy as np
145
+ import seaborn as sns
146
+ import matplotlib.pyplot as plt
147
+
148
+ # 1. Group by FA and YEAR, then calculate the mean yield
149
+ fa_year_yield = df.groupby(['FA', 'YEAR'])['yield'].mean().reset_index()
150
+
151
+ # 2. Pivot so rows = FA, columns = YEAR, values = average yield
152
+ fa_year_pivot = fa_year_yield.pivot(index='FA', columns='YEAR', values='yield')
153
+
154
+ # 3. Create the heatmap
155
+ plt.figure(figsize=(12, 8))
156
+ sns.heatmap(
157
+ fa_year_pivot,
158
+ cmap='viridis', # color map; try 'coolwarm' or others
159
+ annot=False, # set to True to show numeric values in each cell
160
+ fmt=".2f", # format numbers (2 decimal places)
161
+ linewidths=.5 # line width between cells
162
+ )
163
+
164
+ plt.title("Heatmap of Average Yield by FA and YEAR")
165
+ plt.xlabel("YEAR")
166
+ plt.ylabel("FA")
167
+ plt.tight_layout()
168
+ plt.show()
169
+
170
+ breakpoint()
171
+
172
+
173
+ # --- Read and preprocess your main shapefile ---
174
+ dg = gpd.read_file(r"D:\Users\ritvik\projects\GEOGLAM\safrica.shp")
175
+
176
+ # remove rows where both ADMIN1 and ADMIN2 are null
177
+ dg = dg.dropna(subset=["ADMIN1", "ADMIN2"], how="all")
178
+
179
+ # if ADMIN2 is not null then replace ADMIN1 with ADMIN2 values
180
+ dg["ADMIN1"] = dg["ADMIN2"].combine_first(dg["ADMIN1"])
181
+
182
+ # --- Read your CSV and merge on ADMIN1 ---
183
+ df = pd.read_csv(r"C:\Users\ritvik\Downloads\geocif.csv")
184
+
185
+ dg = dg.merge(
186
+ df[["ADMIN1", 'Predicted Yield (tn per ha)',
187
+ 'Median Yield (tn per ha) (2013-2017)', 'Predicted/Median']],
188
+ on="ADMIN1",
189
+ how="left"
190
+ )
191
+
192
+ # --- Create a dissolved national boundary GeoDataFrame ---
193
+ boundary_gdf = dg.dissolve(by="ADMIN0")
194
+
195
+ # --- Colormap and normalization setup ---
196
+ cmap = pal.colorbrewer.get_map("BrBG", "diverging", 11).mpl_colormap
197
+ norm = mcolors.TwoSlopeNorm(vmin=-40, vcenter=0, vmax=40)
198
+
199
+ # --- First map: Predicted/Median ---
200
+ fig, ax = plt.subplots(figsize=(10, 6))
201
+
202
+ # 1) Plot the main layer
203
+ dg.plot(
204
+ column="Predicted/Median",
205
+ cmap=cmap,
206
+ norm=norm,
207
+ legend=True,
208
+ ax=ax,
209
+ edgecolor='gray',
210
+ linewidth=0.2,
211
+ legend_kwds={
212
+ "shrink": 0.5,
213
+ "pad": 0.002,
214
+ "orientation": "horizontal"
215
+ }
216
+ )
217
+
218
+ url = "https://naciscdn.org/naturalearth/110m/cultural/ne_110m_admin_0_countries.zip"
219
+
220
+ world = gpd.read_file(url)
221
+ world = world[world['ADMIN'].isin(['South Africa', 'Angola', 'Malawi', 'Zambia'])]
222
+
223
+ # 2) Plot the dissolved national boundaries on top
224
+ world.plot(
225
+ ax=ax,
226
+ color="none", # No fill
227
+ edgecolor="black", # Outline color
228
+ linewidth=0.5
229
+ )
230
+
231
+ ax.set_title("Maize Yield Forecast % Anomaly")
232
+ plt.axis("off")
233
+ plt.tight_layout()
234
+ plt.savefig("aa.png", dpi=300)
235
+ plt.close()
236
+
237
+
238
+ # --- Second map: Median Yield (2013-2017) ---
239
+ # fig, ax = plt.subplots(figsize=(10, 6))
240
+ #
241
+ # # 1) Plot the main layer
242
+ # dg.plot(
243
+ # column="Median Yield (tn per ha) (2013-2017)",
244
+ # cmap=cmap,
245
+ # legend=True,
246
+ # ax=ax,
247
+ # legend_kwds={
248
+ # "shrink": 0.5,
249
+ # "pad": 0.002,
250
+ # "orientation": "horizontal"
251
+ # }
252
+ # )
253
+ #
254
+ # # 2) Plot the dissolved national boundaries on top
255
+ # boundary_gdf.plot(
256
+ # ax=ax,
257
+ # color="none",
258
+ # edgecolor="black",
259
+ # linewidth=1
260
+ # )
261
+ #
262
+ # ax.set_title("Median Maize Yield (2013-2017)")
263
+ # plt.axis("off")
264
+ # plt.tight_layout()
265
+ # plt.show()
266
+ # plt.close()
267
+
268
+ breakpoint()
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: geocif
3
- Version: 0.1.61
3
+ Version: 0.1.63
4
4
  Summary: Models to visualize and forecast crop conditions and yields
5
5
  Home-page: https://ritviksahajpal.github.io/yield_forecasting/
6
6
  Author: Ritvik Sahajpal
@@ -55,6 +55,7 @@ geocif/ml/trend.py
55
55
  geocif/ml/xai.py
56
56
  geocif/playground/__init__.py
57
57
  geocif/playground/aa.py
58
+ geocif/playground/area.py
58
59
  geocif/playground/automl.py
59
60
  geocif/playground/download_esi.py
60
61
  geocif/playground/enso.py
@@ -75,4 +76,5 @@ geocif/risk/__init__.py
75
76
  geocif/risk/impact_assessment.py
76
77
  geocif/viz/__init__.py
77
78
  geocif/viz/plot.py
79
+ geocif/viz/tmp.py
78
80
  tests/test_geocif.py
@@ -50,6 +50,6 @@ setup(
50
50
  test_suite="tests",
51
51
  tests_require=test_requirements,
52
52
  url="https://ritviksahajpal.github.io/yield_forecasting/",
53
- version="0.1.61",
53
+ version="0.1.63",
54
54
  zip_safe=False,
55
55
  )
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes