geocif 0.1.29__tar.gz → 0.1.30__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52)
  1. {geocif-0.1.29/geocif.egg-info → geocif-0.1.30}/PKG-INFO +1 -1
  2. {geocif-0.1.29 → geocif-0.1.30}/geocif/geocif.py +35 -22
  3. {geocif-0.1.29 → geocif-0.1.30}/geocif/indices_runner.py +1 -1
  4. {geocif-0.1.29 → geocif-0.1.30}/geocif/ml/correlations.py +29 -10
  5. {geocif-0.1.29 → geocif-0.1.30}/geocif/ml/embedding.py +6 -2
  6. {geocif-0.1.29 → geocif-0.1.30}/geocif/ml/feature_engineering.py +66 -10
  7. {geocif-0.1.29 → geocif-0.1.30}/geocif/ml/feature_selection.py +21 -9
  8. {geocif-0.1.29 → geocif-0.1.30}/geocif/ml/outliers.py +30 -14
  9. geocif-0.1.30/geocif/ml/spatial_autocorrelation.py +205 -0
  10. {geocif-0.1.29 → geocif-0.1.30}/geocif/ml/stages.py +12 -4
  11. {geocif-0.1.29 → geocif-0.1.30}/geocif/ml/stats.py +72 -25
  12. {geocif-0.1.29 → geocif-0.1.30}/geocif/ml/trainers.py +9 -3
  13. {geocif-0.1.29 → geocif-0.1.30}/geocif/ml/trend.py +3 -1
  14. {geocif-0.1.29 → geocif-0.1.30/geocif.egg-info}/PKG-INFO +1 -1
  15. {geocif-0.1.29 → geocif-0.1.30}/geocif.egg-info/SOURCES.txt +1 -0
  16. {geocif-0.1.29 → geocif-0.1.30}/setup.py +1 -1
  17. {geocif-0.1.29 → geocif-0.1.30}/LICENSE +0 -0
  18. {geocif-0.1.29 → geocif-0.1.30}/MANIFEST.in +0 -0
  19. {geocif-0.1.29 → geocif-0.1.30}/README.md +0 -0
  20. {geocif-0.1.29 → geocif-0.1.30}/geocif/__init__.py +0 -0
  21. {geocif-0.1.29 → geocif-0.1.30}/geocif/agmet/__init__.py +0 -0
  22. {geocif-0.1.29 → geocif-0.1.30}/geocif/agmet/geoagmet.py +0 -0
  23. {geocif-0.1.29 → geocif-0.1.30}/geocif/agmet/plot.py +0 -0
  24. {geocif-0.1.29 → geocif-0.1.30}/geocif/agmet/utils.py +0 -0
  25. {geocif-0.1.29 → geocif-0.1.30}/geocif/analysis.py +0 -0
  26. {geocif-0.1.29 → geocif-0.1.30}/geocif/backup/__init__.py +0 -0
  27. {geocif-0.1.29 → geocif-0.1.30}/geocif/backup/constants.py +0 -0
  28. {geocif-0.1.29 → geocif-0.1.30}/geocif/backup/features.py +0 -0
  29. {geocif-0.1.29 → geocif-0.1.30}/geocif/backup/geo.py +0 -0
  30. {geocif-0.1.29 → geocif-0.1.30}/geocif/backup/geocif.py +0 -0
  31. {geocif-0.1.29 → geocif-0.1.30}/geocif/backup/metadata.py +0 -0
  32. {geocif-0.1.29 → geocif-0.1.30}/geocif/backup/models.py +0 -0
  33. {geocif-0.1.29 → geocif-0.1.30}/geocif/cei/__init__.py +0 -0
  34. {geocif-0.1.29 → geocif-0.1.30}/geocif/cei/definitions.py +0 -0
  35. {geocif-0.1.29 → geocif-0.1.30}/geocif/cei/indices.py +0 -0
  36. {geocif-0.1.29 → geocif-0.1.30}/geocif/logger.py +0 -0
  37. {geocif-0.1.29 → geocif-0.1.30}/geocif/ml/__init__.py +0 -0
  38. {geocif-0.1.29 → geocif-0.1.30}/geocif/ml/outlook.py +0 -0
  39. {geocif-0.1.29 → geocif-0.1.30}/geocif/ml/output.py +0 -0
  40. {geocif-0.1.29 → geocif-0.1.30}/geocif/ml/xai.py +0 -0
  41. {geocif-0.1.29 → geocif-0.1.30}/geocif/playground/__init__.py +0 -0
  42. {geocif-0.1.29 → geocif-0.1.30}/geocif/playground/automl.py +0 -0
  43. {geocif-0.1.29 → geocif-0.1.30}/geocif/playground/misc.py +0 -0
  44. {geocif-0.1.29 → geocif-0.1.30}/geocif/utils.py +0 -0
  45. {geocif-0.1.29 → geocif-0.1.30}/geocif/viz/__init__.py +0 -0
  46. {geocif-0.1.29 → geocif-0.1.30}/geocif/viz/plot.py +0 -0
  47. {geocif-0.1.29 → geocif-0.1.30}/geocif.egg-info/dependency_links.txt +0 -0
  48. {geocif-0.1.29 → geocif-0.1.30}/geocif.egg-info/not-zip-safe +0 -0
  49. {geocif-0.1.29 → geocif-0.1.30}/geocif.egg-info/top_level.txt +0 -0
  50. {geocif-0.1.29 → geocif-0.1.30}/requirements.txt +0 -0
  51. {geocif-0.1.29 → geocif-0.1.30}/setup.cfg +0 -0
  52. {geocif-0.1.29 → geocif-0.1.30}/tests/test_geocif.py +0 -0
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: geocif
- Version: 0.1.29
+ Version: 0.1.30
  Summary: Models to visualize and forecast crop conditions and yields
  Home-page: https://ritviksahajpal.github.io/yield_forecasting/
  Author: Ritvik Sahajpal
@@ -17,6 +17,7 @@ from tqdm import tqdm
  from geocif import logger as log
  from .cei import definitions as di
  from .ml import correlations
+ from .ml import spatial_autocorrelation as sa
  from .ml import feature_engineering as fe
  from .ml import feature_selection as fs
  from .ml import output
@@ -112,6 +113,10 @@ class Geocif:
  self.analogous_year_yield_as_feature = self.parser.getboolean(
  "ML", "analogous_year_yield_as_feature"
  )
+ self.spatial_autocorrelation = self.parser.getboolean(
+ "ML", "spatial_autocorrelation"
+ )
+ self.sa_method = self.parser.get("ML", "sa_method")
  self.last_year_yield_as_feature = self.parser.getboolean(
  "ML", "last_year_yield_as_feature"
  )
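
Note: the two new options above are read from the [ML] section of the run configuration with configparser-style calls (getboolean/get). The config file itself is not part of this diff, so the snippet below is only a sketch of what the corresponding entries and reads might look like; the key names come from the parser calls above, the values are illustrative assumptions.

from configparser import ConfigParser

# Hypothetical [ML] entries backing the two new parser calls; only the key
# names are taken from the diff, the values are made up for illustration.
config_text = (
    "[ML]\n"
    "spatial_autocorrelation = True\n"
    "sa_method = morans_i\n"
)

parser = ConfigParser()
parser.read_string(config_text)

spatial_autocorrelation = parser.getboolean("ML", "spatial_autocorrelation")  # -> True
sa_method = parser.get("ML", "sa_method")  # -> "morans_i"
print(spatial_autocorrelation, sa_method)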
@@ -367,10 +372,10 @@ class Geocif:
  "Starting Stage": np.full(shp, self.stage_info["Starting Stage"]),
  "Ending Stage": np.full(shp, self.stage_info["Ending Stage"]),
  "Model": np.full(shp, self.model_name),
- "Area (ha)": df_region["Area (ha)"].values,
  "Region_ID": df_region["Region_ID"].values,
  "Region": df_region["Region"].values,
  "Harvest Year": df_region["Harvest Year"].values,
+ "Area (ha)": df_region["Area (ha)"].values,
  f"Observed {self.target}": np.around(y_test, 3).ravel(),
  f"Predicted {self.target}": np.around(y_pred, 3).ravel(),
  }
@@ -412,6 +417,12 @@ class Geocif:
  except:
  breakpoint()

+ # if self.spatial_autocorrelation:
+ # # Compute spatial autocorrelation
+ # df = sa.compute_spatial_autocorrelation(
+ # self.dg_country
+ # )
+
  for col in [
  f"Median {self.target}",
  "Analogous Year",
@@ -425,7 +436,7 @@
  # Create an index based on following columns
  index_columns = [
  "Model",
- "Cluster Strategy"
+ "Cluster Strategy",
  "Country",
  "Region",
  "Crop",
@@ -718,7 +729,6 @@ class Geocif:
  )

  # cat_features should be converted to category type
-
  df[self.cat_features] = df[self.cat_features].astype("category")

  """ Heatmap of correlation of various features with yield at each time step"""
@@ -739,32 +749,35 @@
  how="outer",
  )

- if self.correlation_plots:
- dict_kwargs = {}
- dict_kwargs["all_stages"] = self.all_stages
- dict_kwargs["target_col"] = self.target
- dict_kwargs["country"] = self.country
- dict_kwargs["crop"] = self.crop
- dict_kwargs["dir_output"] = (
- self.dir_analysis
- / self.country
- / self.crop
- / self.model_name
- / str(self.forecast_season)
- )
- dict_kwargs["forecast_season"] = self.forecast_season
- dict_kwargs["method"] = self.method
- dict_kwargs["national_correlation"] = self.national_correlation
- dict_kwargs["groupby"] = self.correlation_plot_groupby
- dict_kwargs["dg_country"] = self.dg_country
- dict_kwargs["combined_dict"] = self.combined_dict
+ dict_kwargs = {}
+ dict_kwargs["all_stages"] = self.all_stages
+ dict_kwargs["target_col"] = self.target
+ dict_kwargs["country"] = self.country
+ dict_kwargs["crop"] = self.crop
+ dict_kwargs["dir_output"] = (
+ self.dir_analysis
+ / self.country
+ / self.crop
+ / self.model_name
+ / str(self.forecast_season)
+ )
+ dict_kwargs["forecast_season"] = self.forecast_season
+ dict_kwargs["method"] = self.method
+ dict_kwargs["national_correlation"] = self.national_correlation
+ dict_kwargs["groupby"] = self.correlation_plot_groupby
+ dict_kwargs["dg_country"] = self.dg_country
+ dict_kwargs["combined_dict"] = self.combined_dict

+ if self.correlation_plots:
  self.logger.info(f"Correlation plot for {self.country} {self.crop}")
  (
  dict_selected_features,
  dict_best_cei,
  ) = correlations.all_correlated_feature_by_time(df, **dict_kwargs)

+ if self.spatial_autocorrelation:
+ sa.compute_spatial_autocorrelation(self.df_results, **dict_kwargs)
+
  """ Separate into train and test datasets based on forecast_season """
  mask = df["Harvest Year"] == self.forecast_season
  self.df_train = df[~mask]
@@ -155,7 +155,7 @@ class cei_runner(base.BaseGeo):
  "ndvi",
  False, # redo
  )
- for year in range(2023, ar.utcnow().year + 1)
+ for year in range(2001, ar.utcnow().year + 1)
  for status, path, filename, admin_zone, category in combinations
  ]
@@ -28,7 +28,9 @@ def most_correlated_feature_by_time(df_train, simulation_stages, target_col):

  # Only select columns that have been observed till the current stage
  for stage in tqdm(stages, leave=False, desc="Compute most correlated feature"):
- current_feature_set = [col for col in df_train.columns if col.endswith(f"_{stage[-1]}")]
+ current_feature_set = [
+ col for col in df_train.columns if col.endswith(f"_{stage[-1]}")
+ ]

  # Get the most correlated feature for each region
  top_feature_by_region, counter = embedding.get_top_correlated_features(
@@ -41,7 +43,9 @@ def most_correlated_feature_by_time(df_train, simulation_stages, target_col):
  # Loop through top_feature_by_region and find the average score for _feature
  # Calculate the average score for 'DTR_36'
  _feature_scores = [
- value[1][0] for key, value in top_feature_by_region.items() if _feature in value[0]
+ value[1][0]
+ for key, value in top_feature_by_region.items()
+ if _feature in value[0]
  ]
  average_score = sum(_feature_scores) / len(_feature_scores)
  _feature = utils.remove_last_part(_feature)
@@ -137,7 +141,9 @@ def plot_feature_corr_by_time(df, **kwargs):
  # Add colorbar label
  # cbar_ax.set_xlabel("Correlation Coefficient", labelpad=3, size="small")
  cbar_ax.set_title("Correlation Coefficient", loc="left", size="small")
- ax_heatmap.set_xticklabels(ax_heatmap.get_xticklabels(), size="x-small", rotation=0, fontsize=5)
+ ax_heatmap.set_xticklabels(
+ ax_heatmap.get_xticklabels(), size="x-small", rotation=0, fontsize=5
+ )
  ax_heatmap.set_yticklabels(ax_heatmap.get_yticklabels(), size="x-small", fontsize=5)
  ax_heatmap.set_xlabel("")
  ax_heatmap.set_ylabel(" ")
@@ -190,7 +196,9 @@ def _all_correlated_feature_by_time(df, **kwargs):
  pbar.set_description(f"Calculating correlations")
  pbar.update()

- stage_name = stages.get_stage_information_dict(f"GD4_{stage}", method)["Stage Name"]
+ stage_name = stages.get_stage_information_dict(f"GD4_{stage}", method)[
+ "Stage Name"
+ ]
  # starting_stage = stage_name.split("-")[0]
  current_feature_set = [col for col in df.columns if stage_name in col]

@@ -210,7 +218,9 @@

  all_stage_names = []
  for stage in stages_features:
- _tmp = stages.get_stage_information_dict(f"GD4_{stage}", method)["Stage Name"]
+ _tmp = stages.get_stage_information_dict(f"GD4_{stage}", method)[
+ "Stage Name"
+ ]
  all_stage_names.append(_tmp)

  df_results = df_results.reindex(all_stage_names)
@@ -254,7 +264,12 @@ def all_correlated_feature_by_time(df, **kwargs):
  df_tmp = df_corr[df_corr.columns[(df_corr.mean() > 0.1)]]
  dict_selected_features[region_id] = df_tmp.columns

- df_tmp2 = df_tmp.median(axis=0).abs().sort_values(ascending=False).reset_index()
+ df_tmp2 = (
+ df_tmp.median(axis=0)
+ .abs()
+ .sort_values(ascending=False)
+ .reset_index()
+ )
  df_tmp2.columns = ["Metric", "Value"]
  # Add another column based on Type of Metric
  for idx, row in df_tmp2.iterrows():
@@ -278,8 +293,8 @@ def all_correlated_feature_by_time(df, **kwargs):
  dict_selected_features[region_id] = df_corr.columns
  dict_best_cei[region_id] = {}

- #dict_selected_features[region_id] = dict_selected_features[0]
- #dict_best_cei[region_id] = dict_best_cei[0]
+ # dict_selected_features[region_id] = dict_selected_features[0]
+ # dict_best_cei[region_id] = dict_best_cei[0]
  # Combine all unique values from the existing dictionary elements
  # combined_metrics = set()
  # for key in dict_selected_features:
@@ -310,7 +325,9 @@ def feature_correlation_by_time(**kwargs):

  # Only select columns that have been observed till the current stage
  for stage in tqdm(stages, leave=False, desc="Compute feature correlation by time"):
- current_feature_set = [col for col in df_train.columns if col.endswith(f"_{stage[-1]}")]
+ current_feature_set = [
+ col for col in df_train.columns if col.endswith(f"_{stage[-1]}")
+ ]

  # Get the most correlated feature for each region
  top_feature_by_region, counter = embedding.compute_feature_correlations(
@@ -324,7 +341,9 @@
  # Loop through top_feature_by_region and find the average score for _feature
  # Calculate the average score for 'DTR_36'
  _feature_scores = [
- value[1][0] for key, value in top_feature_by_region.items() if _feature in value[0]
+ value[1][0]
+ for key, value in top_feature_by_region.items()
+ if _feature in value[0]
  ]
  average_score = sum(_feature_scores) / len(_feature_scores)
  _feature = utils.remove_last_part(_feature)
@@ -79,7 +79,9 @@ def get_top_correlated_features(inputs, targets):
  feature_correlations = _compute_correlations(X, y)

  # Exclude any nan values
- feature_correlations = {k: v for k, v in feature_correlations.items() if not np.isnan(v)}
+ feature_correlations = {
+ k: v for k, v in feature_correlations.items() if not np.isnan(v)
+ }

  if not feature_correlations:
  continue
@@ -113,7 +115,9 @@ def get_all_features_correlation(inputs, targets, method):
  feature_correlations = _compute_correlations(X, y)

  # Exclude any nan values
- feature_correlations = {k: v for k, v in feature_correlations.items() if not np.isnan(v)}
+ feature_correlations = {
+ k: v for k, v in feature_correlations.items() if not np.isnan(v)
+ }

  if not feature_correlations:
  continue
@@ -21,11 +21,15 @@ def compute_last_year_yield(df, target_col="Yield (tn per ha)"):
  # Initialize the new column with NaNs
  df[f"Last Year {target_col}"] = np.nan

- for region, group in tqdm(df.groupby("Region"), desc="Last year yields", leave=False):
+ for region, group in tqdm(
+ df.groupby("Region"), desc="Last year yields", leave=False
+ ):
  unique_years = group["Harvest Year"].unique()

  for harvest_year in unique_years:
- mask = (group["Harvest Year"] == harvest_year - 1) & (group["Region"] == region)
+ mask = (group["Harvest Year"] == harvest_year - 1) & (
+ group["Region"] == region
+ )
  last_year_yield = group.loc[mask, target_col].values
  if last_year_yield:
  df.loc[
@@ -89,7 +93,9 @@ def compute_median_yield(
  closest_years = compute_closest_years(
  all_seasons_with_yield, harvest_year, number_median_years
  )
- mask = (group["Harvest Year"].isin(closest_years)) & (group["Region"] == region)
+ mask = (group["Harvest Year"].isin(closest_years)) & (
+ group["Region"] == region
+ )
  median_yield = group.loc[mask, target_col].median()
  df.loc[
  (df["Region"] == region) & (df["Harvest Year"] == harvest_year),
@@ -99,7 +105,9 @@
  return df


- def compute_lag_yield(df, all_seasons_with_yield, number_lag_years, target_col="Yield (tn per ha)"):
+ def compute_lag_yield(
+ df, all_seasons_with_yield, number_lag_years, target_col="Yield (tn per ha)"
+ ):
  # For the number of years specified in self.number_lag_years, add the yield of that number of years
  # ago to the dataframe
  # For example, if number_lag_years is 3, then the yield of each year upto 3 years ago will be added
@@ -125,7 +133,9 @@ def compute_lag_yield(df, all_seasons_with_yield, number_lag_years, target_col="
  col = f"t -{idx + 1} {target_col}"

  mask_group_year = group["Harvest Year"] == year
- mask_region = (df["Region"] == region) & (df["Harvest Year"] == harvest_year)
+ mask_region = (df["Region"] == region) & (
+ df["Harvest Year"] == harvest_year
+ )
  yield_value = group.loc[mask_group_year, target_col].values

  if yield_value.size > 0:
@@ -181,11 +191,15 @@ def compute_analogous_yield(
  all_years = df["Harvest Year"].unique()

  for harvest_year in tqdm(all_years, desc="Computing analogous yields", leave=False):
- lag_years = compute_closest_years(all_seasons_with_yield, harvest_year, number_lag_years)
+ lag_years = compute_closest_years(
+ all_seasons_with_yield, harvest_year, number_lag_years
+ )

  for region in df["Region"].unique():
  # Filter current year and region dataset
- df_current = df[(df["Harvest Year"] == harvest_year) & (df["Region"] == region)]
+ df_current = df[
+ (df["Harvest Year"] == harvest_year) & (df["Region"] == region)
+ ]
  # Filter dataset for lag years and the same region
  df_lag = df[(df["Harvest Year"].isin(lag_years)) & (df["Region"] == region)]

@@ -242,6 +256,7 @@ def detect_clusters(df, target_col="Yield (tn per ha)"):

  # Suppress warnings in this function
  import warnings
+
  warnings.filterwarnings("ignore")

  from kneed import KneeLocator
@@ -291,7 +306,9 @@ def detect_clusters(df, target_col="Yield (tn per ha)"):
  inertia.append(kmeans.inertia_)

  # Use KneeLocator to find the elbow point automatically
- knee_locator = KneeLocator(range_of_clusters, inertia, curve="convex", direction="decreasing")
+ knee_locator = KneeLocator(
+ range_of_clusters, inertia, curve="convex", direction="decreasing"
+ )

  # # Plot the Elbow Method for visual confirmation
  # plt.figure(figsize=(10, 6))
@@ -306,7 +323,9 @@
  # Use the detected number of clusters
  optimal_clusters = knee_locator.knee
  if optimal_clusters:
- optimal_clusters = optimal_clusters + 1 if optimal_clusters > 1 else optimal_clusters
+ optimal_clusters = (
+ optimal_clusters + 1 if optimal_clusters > 1 else optimal_clusters
+ )

  # Apply K-Means clustering with the detected optimal number of clusters
  kmeans = KMeans(n_clusters=optimal_clusters, random_state=42)
@@ -321,6 +340,43 @@
  )
  else:
  # If no optimal_clusters is found, then assign all regions to a single cluster
- clusters_assigned = pd.DataFrame({"Region": df_yield_pivot.index, "Region_ID": 1})
+ clusters_assigned = pd.DataFrame(
+ {"Region": df_yield_pivot.index, "Region_ID": 1}
+ )

  return clusters_assigned
+
+
+ # breakpoint()
+
+ # from libpysal.weights import Queen, Rook
+ # from pysal.lib import weights
+ # from scipy.linalg import eigh
+ #
+ # breakpoint()
+ # df = pd.DataFrame()
+ #
+ # # Create a spatial weights matrix (e.g., Queen contiguity)
+ # w = weights.Queen.from_dataframe(dg)
+ #
+ # # Transform weights to row-standardized form
+ # w.transform = 'r'
+ #
+ # # Convert the weights matrix to a dense format for eigen decomposition
+ # W_dense = w.full()[0]
+ #
+ # # Compute eigenvalues and eigenvectors
+ # eigenvalues, eigenvectors = eigh(W_dense)
+ #
+ # # Sort eigenvalues and corresponding eigenvectors
+ # sorted_indices = np.argsort(eigenvalues)[::-1]
+ # eigenvalues = eigenvalues[sorted_indices]
+ # eigenvectors = eigenvectors[:, sorted_indices]
+ #
+ # # Select a subset of eigenvectors (e.g., first 10)
+ # selected_eigenvectors = eigenvectors[:, :2]
+ #
+ # breakpoint()
+ # # Add eigenvectors to the GeoDataFrame
+ # for i in range(selected_eigenvectors.shape[1]):
+ # df[f'EV_{i + 1}'] = selected_eigenvectors[:, i]
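
The commented-out block added above sketches an eigenvector spatial filtering idea: build a Queen-contiguity weights matrix over the region polygons, eigendecompose it, and keep the leading eigenvectors as candidate spatial features (EV_1, EV_2, ...). A runnable version of that sketch follows; the 2x2 grid of toy polygons, the libpysal import path, and the choice of two eigenvectors are assumptions for illustration, not part of the package.

import numpy as np
import geopandas as gpd
from shapely.geometry import box
from libpysal import weights
from scipy.linalg import eigh

# Four adjacent unit squares standing in for admin regions (made up).
dg = gpd.GeoDataFrame(
    {"Region": ["north-west", "south-west", "south-east", "north-east"]},
    geometry=[box(0, 1, 1, 2), box(0, 0, 1, 1), box(1, 0, 2, 1), box(1, 1, 2, 2)],
)

# Queen-contiguity weights, row-standardized as in the commented-out code.
w = weights.Queen.from_dataframe(dg)
w.transform = "r"

# Dense weights matrix -> eigen decomposition. eigh() assumes a symmetric
# matrix, so with row-standardized weights it effectively operates on the
# symmetrized matrix, mirroring the original sketch.
W_dense = w.full()[0]
eigenvalues, eigenvectors = eigh(W_dense)

# Sort by descending eigenvalue and keep the two leading eigenvectors.
order = np.argsort(eigenvalues)[::-1]
eigenvectors = eigenvectors[:, order]
selected_eigenvectors = eigenvectors[:, :2]

# Attach them as candidate spatial features.
for i in range(selected_eigenvectors.shape[1]):
    dg[f"EV_{i + 1}"] = selected_eigenvectors[:, i]
print(dg[["Region", "EV_1", "EV_2"]])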
@@ -77,16 +77,22 @@ def select_features(X, y, method="RFE", min_features_to_select=3):

  # Step 5: Summarize the SHAP values for feature importance
  shap_importances = np.mean(np.abs(shap_values), axis=0)
- shap_importance_df = pd.DataFrame({
- 'feature': X.columns,
- 'importance': shap_importances
- }).sort_values(by='importance', ascending=False)
+ shap_importance_df = pd.DataFrame(
+ {"feature": X.columns, "importance": shap_importances}
+ ).sort_values(by="importance", ascending=False)

  def evaluate_model_with_n_features(N, X_train, y_train):
- top_features = shap_importance_df['feature'].head(N).values
+ top_features = shap_importance_df["feature"].head(N).values
  X_train_selected = X_train[top_features]
  selector = CatBoostRegressor(n_estimators=500, random_state=42, verbose=0)
- scores = cross_val_score(selector, X_train_selected, y_train, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
+ scores = cross_val_score(
+ selector,
+ X_train_selected,
+ y_train,
+ cv=5,
+ scoring="neg_mean_squared_error",
+ n_jobs=-1,
+ )

  return np.mean(scores)

@@ -100,7 +106,9 @@ def select_features(X, y, method="RFE", min_features_to_select=3):
  optimal_N = nrange[np.argmax(cv_scores)]

  # Use optimal N to select features
- selected_features = shap_importance_df['feature'].head(optimal_N).values.tolist()
+ selected_features = (
+ shap_importance_df["feature"].head(optimal_N).values.tolist()
+ )
  elif method == "feature_engine":
  from feature_engine.selection import SmartCorrelatedSelection

@@ -202,7 +210,9 @@ def select_features(X, y, method="RFE", min_features_to_select=3):
  }
  model = CatBoostRegressor(**hyperparams)

- selector = BorutaShap(model=model, importance_measure="shap", classification=False)
+ selector = BorutaShap(
+ model=model, importance_measure="shap", classification=False
+ )
  selector.fit(
  X=X,
  y=y,
@@ -237,7 +247,9 @@ def select_features(X, y, method="RFE", min_features_to_select=3):
  elif method == "RFE":
  from sklearn.feature_selection import RFE

- selector = RFE(forest, n_features_to_select=min_features_to_select, step=1, verbose=1)
+ selector = RFE(
+ forest, n_features_to_select=min_features_to_select, step=1, verbose=1
+ )
  selector = selector.fit(X, y)
  selected_features_mask = selector.support_
  selected_features = X.columns[selected_features_mask].tolist()
@@ -94,19 +94,23 @@ if __name__ == "__main__":

  if not os.path.isfile(BASE_DIR / crop / f"adm_crop_production_z_{crop}.csv"):
  # In rows where admin_2 != "none", replace admin_1 with admin_2
- df_fewsnet_sub.loc[df_fewsnet_sub["admin_2"] != "none", "admin_1"] = df_fewsnet_sub[
- "admin_2"
- ]
+ df_fewsnet_sub.loc[
+ df_fewsnet_sub["admin_2"] != "none", "admin_1"
+ ] = df_fewsnet_sub["admin_2"]

  df_output = find_outlier(df_fewsnet_sub)

- df_output.to_csv(BASE_DIR / crop / f"adm_crop_production_z_{crop}.csv", index=False)
+ df_output.to_csv(
+ BASE_DIR / crop / f"adm_crop_production_z_{crop}.csv", index=False
+ )
  else:
- df_output = pd.read_csv(BASE_DIR / crop / f"adm_crop_production_z_{crop}.csv")
+ df_output = pd.read_csv(
+ BASE_DIR / crop / f"adm_crop_production_z_{crop}.csv"
+ )

- df_fewsnet_sub.loc[df_fewsnet_sub["admin_2"] != "none", "admin_1"] = df_fewsnet_sub[
- "admin_2"
- ]
+ df_fewsnet_sub.loc[
+ df_fewsnet_sub["admin_2"] != "none", "admin_1"
+ ] = df_fewsnet_sub["admin_2"]

  # Create a column called Z-Score Category based on the value of the z-score
  # The categories are:
@@ -142,7 +146,9 @@
  df_fewsnet_sub["harvest_year"] = df_fewsnet_sub["harvest_year"].astype(int)

  df_yield = df_fewsnet_sub[mask & (df_fewsnet_sub["indicator"] == "yield")]
- df_production = df_fewsnet_sub[mask & (df_fewsnet_sub["indicator"] == "production")]
+ df_production = df_fewsnet_sub[
+ mask & (df_fewsnet_sub["indicator"] == "production")
+ ]
  df_area = df_fewsnet_sub[mask & (df_fewsnet_sub["indicator"] == "area")]

  df_yield["harvest_year"] = df_yield["harvest_year"].astype(int)
@@ -158,9 +164,13 @@
  # Add 3 subplots, first for area
  plt.figure(figsize=(10, 10))
  plt.subplot(3, 1, 1)
- plt.plot(df_yield[mask]["harvest_year"].astype(int), df_yield[mask]["value"])
+ plt.plot(
+ df_yield[mask]["harvest_year"].astype(int), df_yield[mask]["value"]
+ )
  # Add a circle for each year where yield is available
- plt.scatter(df_yield[mask]["harvest_year"].astype(int), df_yield[mask]["value"])
+ plt.scatter(
+ df_yield[mask]["harvest_year"].astype(int), df_yield[mask]["value"]
+ )
  # Draw a horizontal line at the average df_yield[mask]["value"]
  plt.axhline(df_yield[mask]["value"].mean(), color="red", linestyle="--")
  # Place a tick on x-axis at every year and make labels vertical
@@ -195,13 +205,17 @@
  df_production[mask]["value"],
  )
  # Place a tick on x-axis at every year
- plt.xticks(df_production[mask]["harvest_year"].astype(int)[::2], rotation=90)
+ plt.xticks(
+ df_production[mask]["harvest_year"].astype(int)[::2], rotation=90
+ )
  plt.xlabel("Year")
  plt.ylabel("Production")

  plt.subplot(3, 1, 3)
  plt.plot(df_area[mask]["harvest_year"].astype(int), df_area[mask]["value"])
- plt.scatter(df_area[mask]["harvest_year"].astype(int), df_area[mask]["value"])
+ plt.scatter(
+ df_area[mask]["harvest_year"].astype(int), df_area[mask]["value"]
+ )
  # Place a tick on x-axis at every year
  plt.xticks(df_area[mask]["harvest_year"].astype(int)[::2], rotation=90)
  plt.xlabel("Year")
@@ -210,7 +224,9 @@
  try:
  os.makedirs(BASE_DIR / crop, exist_ok=True)
  plt.savefig(
- BASE_DIR / crop / f"{fnid}_{country}_{admin_1}_{crop}_{season_name}.png"
+ BASE_DIR
+ / crop
+ / f"{fnid}_{country}_{admin_1}_{crop}_{season_name}.png"
  )
  except:
  breakpoint()
@@ -0,0 +1,205 @@
+ import pandas as pd
+ from pysal.lib import weights
+ from pysal.explore import esda
+ import matplotlib.pyplot as plt
+
+
+ def validate_inputs(df_results, required_columns):
+ """
+
+ Args:
+ df_results:
+ required_columns:
+
+ Returns:
+
+ """
+ if not all(column in df_results.columns for column in required_columns):
+ raise ValueError(
+ f"df_results must contain the following columns: {required_columns}"
+ )
+
+
+ def preprocess_data(df_results, dg_country):
+ """
+
+ Args:
+ df_results:
+ dg_country:
+
+ Returns:
+
+ """
+ df = df_results.drop_duplicates()
+ df = df.dropna(subset=["Yield (tn per ha)"])
+
+ dg_country = dg_country.drop_duplicates(subset="Country Region")
+ dg_country = dg_country.dropna(subset=["Country Region", "Region_ID", "geometry"])
+
+ df["Country Region"] = (df["Country"] + " " + df["Region"]).str.lower()
+ dg_country["Country Region"] = dg_country["Country Region"].str.lower()
+ dg_country = dg_country[dg_country["Country Region"].isin(df["Country Region"])]
+
+ merged_df = dg_country.merge(df, on="Country Region", how="inner")
+
+ return merged_df, dg_country
+
+
+ def create_base_weights(dg_country):
+ """
+
+ Args:
+ dg_country:
+
+ Returns:
+
+ """
+ dg_subset = dg_country[["Country Region", "geometry"]].drop_duplicates()
+
+ try:
+ w_base = weights.Queen.from_dataframe(dg_subset)
+ except Exception as e:
+ raise RuntimeError(f"Failed to create spatial weights: {e}")
+
+ no_neighbors = [
+ index for index, neighbors in w_base.neighbors.items() if len(neighbors) == 0
+ ]
+ if no_neighbors:
+ print(f"Removing {len(no_neighbors)} polygons with 0 neighbors")
+ dg_country = dg_country.drop(index=no_neighbors).reset_index(drop=True)
+ w_base = weights.Queen.from_dataframe(
+ dg_country[["Country Region", "geometry"]]
+ )
+
+ return w_base, dg_country
+
+
+ def create_weights_for_year(dg_country, regions_with_data):
+ """
+
+ Args:
+ dg_country:
+ regions_with_data:
+
+ Returns:
+
+ """
+ dg_subset = dg_country[dg_country["Country Region"].isin(regions_with_data)]
+
+ wt = weights.Queen.from_dataframe(dg_subset)
+
+ return wt
+
+
+ def compute_morans_i(merged_df, dg_country):
+ """
+
+ Args:
+ merged_df:
+ dg_country:
+
+ Returns:
+
+ """
+ years = merged_df["Harvest Year"].unique()
+ results = {"Harvest Year": [], "Moran's I": [], "p-value": [], "Significant": []}
+
+ for year in years:
+ year_data = merged_df[merged_df["Harvest Year"] == year]
+ regions_with_data = year_data["Country Region"].unique()
+ year_data = year_data[year_data["Country Region"].isin(regions_with_data)]
+
+ y = year_data[["Region", "Yield (tn per ha)"]].drop_duplicates()
+
+ if len(y) > 1:
+ w = create_weights_for_year(dg_country, regions_with_data)
+
+ try:
+ mi = esda.Moran(y["Yield (tn per ha)"].values, w, permutations=999)
+ except:
+ breakpoint()
+ results["Harvest Year"].append(year)
+ results["Moran's I"].append(mi.I)
+ results["p-value"].append(mi.p_sim)
+ results["Significant"].append(mi.p_sim < 0.1)
+ else:
+ results["Harvest Year"].append(year)
+ results["Moran's I"].append(None)
+ results["p-value"].append(None)
+ results["Significant"].append(False)
+
+ return pd.DataFrame(results)
+
+
+ def plot_moransi_time_series(results_df, country, crop, dir_output):
+ """
+
+ Args:
+ results_df:
+ country:
+ crop:
+ dir_output:
+
+ Returns:
+
+ """
+ plt.figure(figsize=(10, 6))
+
+ significant = results_df[results_df["Significant"]]
+ plt.scatter(
+ significant["Harvest Year"],
+ significant["Moran's I"],
+ color="red",
+ label="Significant (p < 0.1)",
+ )
+
+ not_significant = results_df[~results_df["Significant"]]
+ plt.plot(
+ not_significant["Harvest Year"],
+ not_significant["Moran's I"],
+ marker="o",
+ linestyle="-",
+ color="blue",
+ label="Non-Significant",
+ )
+
+ plt.ylabel("Moran's I")
+ plt.legend()
+ plt.grid(True)
+ plt.savefig(dir_output / f"{country}_{crop}.png")
+ plt.close()
+
+
+ def compute_spatial_autocorrelation(df_results, **kwargs):
+ """
+
+ Args:
+ df_results:
+ **kwargs:
+
+ Returns:
+
+ """
+ country = kwargs.get("country")
+ crop = kwargs.get("crop")
+ dg_country = kwargs.get("dg_country")
+ dir_output = kwargs.get("dir_output")
+
+ required_columns = [
+ "Country",
+ "Crop",
+ "Region",
+ "Harvest Year",
+ "Yield (tn per ha)",
+ ]
+ validate_inputs(df_results, required_columns)
+
+ merged_df, dg_country = preprocess_data(df_results, dg_country)
+ if merged_df.empty:
+ raise ValueError("No valid data available after preprocessing")
+
+ w_base, dg_country = create_base_weights(dg_country)
+
+ results_df = compute_morans_i(merged_df, dg_country)
+
+ plot_moransi_time_series(results_df, country, crop, dir_output)
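
The new module exposes compute_spatial_autocorrelation() as its single entry point; geocif.py calls it with self.df_results and the shared dict_kwargs (country, crop, dg_country, dir_output). For each harvest year it computes Moran's I on regional yields with esda.Moran (999 permutations) and saves a time-series plot of the statistic. Below is a minimal standalone sketch of that call; the toy polygons and yield values are assumptions for illustration only, while the column names mirror validate_inputs() and preprocess_data().

from pathlib import Path

import geopandas as gpd
import pandas as pd
from shapely.geometry import box

from geocif.ml import spatial_autocorrelation as sa

# Four adjacent unit squares standing in for admin regions (made up).
regions = ["north-west", "south-west", "south-east", "north-east"]
dg_country = gpd.GeoDataFrame(
    {
        "Country Region": [f"kenya {r}" for r in regions],
        "Region_ID": [1, 1, 2, 2],
    },
    geometry=[box(0, 1, 1, 2), box(0, 0, 1, 1), box(1, 0, 2, 1), box(1, 1, 2, 2)],
)

# Model output shaped the way validate_inputs() expects (values made up).
df_results = pd.DataFrame(
    {
        "Country": "kenya",
        "Crop": "maize",
        "Region": regions * 2,
        "Harvest Year": [2021] * 4 + [2022] * 4,
        "Yield (tn per ha)": [1.2, 1.4, 2.1, 2.3, 1.1, 1.3, 2.2, 2.4],
    }
)

sa.compute_spatial_autocorrelation(
    df_results,
    country="kenya",
    crop="maize",
    dg_country=dg_country,
    dir_output=Path("."),  # Moran's I time-series plot saved as ./kenya_maize.png
)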
@@ -17,7 +17,9 @@ def add_stage_information(df, method):
  df["Stage"] = df["Stage"].astype(str)

  df["Stage_ID"] = df["Stage"]
- df["Stage Range"] = df["Stage"].apply(lambda x: "_".join([x.split("_")[0], x.split("_")[-1]]))
+ df["Stage Range"] = df["Stage"].apply(
+ lambda x: "_".join([x.split("_")[0], x.split("_")[-1]])
+ )

  # Create a column with starting stage and ending stage
  # Stage looks like this: 13_12_11
@@ -34,14 +36,18 @@
  dict = utils.dict_growth_stages_biweekly
  elif "monthly" in method:
  dict = utils.dict_growth_stages_monthly
- df["Stage Names"] = df["Starting Stage"].map(dict) + " - " + df["Ending Stage"].map(dict)
+ df["Stage Names"] = (
+ df["Starting Stage"].map(dict) + " - " + df["Ending Stage"].map(dict)
+ )

  # Group by Region, Harvest Year
  # For each group, add a column called Percentage Season
  # that is the percentage of the season that has passed based on the number of rows
  # in the group
  grouped = df.groupby(["Region", "Harvest Year"])
- df["Percentage Season"] = grouped.cumcount() * 100.0 / grouped["CEI"].transform("size")
+ df["Percentage Season"] = (
+ grouped.cumcount() * 100.0 / grouped["CEI"].transform("size")
+ )

  return df

@@ -186,7 +192,9 @@ def get_stage_information_dict(stage_str, method):
  end_stage = parts[-1]

  # Exclude cei from the stage_str string
- stage_info["Stage_ID"] = "_".join(parts[1:]) if parts[1].isdigit() else "_".join(parts[2:])
+ stage_info["Stage_ID"] = (
+ "_".join(parts[1:]) if parts[1].isdigit() else "_".join(parts[2:])
+ )

  stage_info["CEI"] = cei
  stage_info["Stage Range"] = "_".join([start_stage, end_stage])
@@ -29,33 +29,54 @@ def get_yld_prd(df, name_crop, cntr, region, calendar_year, region_column="ADM1_
  # df_tmp = df.loc[mask_adm1]

  df_tmp = df.copy()
- if name_crop == 'rice':
- if cntr == 'Viet nam':
- df_tmp = df.loc[df.Season == 'Spring Paddy']
- elif cntr == 'Thailand':
- df_tmp = df.loc[df.Season == 'Major Season']
- elif cntr == 'China':
- df_tmp = df.loc[df.Season == 'Single-cropping and Middle-season Rice']
- elif cntr == 'India':
- df_tmp = df.loc[df.Season == 'Kharif']
- elif name_crop == 'maize' and \
- cntr in ['Austria', 'Belgium', 'Bulgaria', 'Croatia', 'Czech Republic', 'Denmark','Germany', 'Greece', 'Hungary',
- 'Italy', 'Lithuania', 'Luxembourg', 'Netherlands', 'Poland', 'Portugal', 'Romania', 'Slovakia', 'Slovenia', 'Spain',
- 'Sweden', 'United Kingdom']:
- df_tmp = df.loc[df.Season == 'Grain Maize and Corn-cob-mix']
- elif name_crop == 'maize' and cntr in ['France']:
- df_tmp = df.loc[df.Season == 'Green Maize']
+ if name_crop == "rice":
+ if cntr == "Viet nam":
+ df_tmp = df.loc[df.Season == "Spring Paddy"]
+ elif cntr == "Thailand":
+ df_tmp = df.loc[df.Season == "Major Season"]
+ elif cntr == "China":
+ df_tmp = df.loc[df.Season == "Single-cropping and Middle-season Rice"]
+ elif cntr == "India":
+ df_tmp = df.loc[df.Season == "Kharif"]
+ elif name_crop == "maize" and cntr in [
+ "Austria",
+ "Belgium",
+ "Bulgaria",
+ "Croatia",
+ "Czech Republic",
+ "Denmark",
+ "Germany",
+ "Greece",
+ "Hungary",
+ "Italy",
+ "Lithuania",
+ "Luxembourg",
+ "Netherlands",
+ "Poland",
+ "Portugal",
+ "Romania",
+ "Slovakia",
+ "Slovenia",
+ "Spain",
+ "Sweden",
+ "United Kingdom",
+ ]:
+ df_tmp = df.loc[df.Season == "Grain Maize and Corn-cob-mix"]
+ elif name_crop == "maize" and cntr in ["France"]:
+ df_tmp = df.loc[df.Season == "Green Maize"]

  if not df_tmp.empty:
- if cntr != 'Vietnam':
- mask_tmp_country = (df_tmp['ADM0_NAME'].str.lower() == cntr.replace('_', ' ').lower())
+ if cntr != "Vietnam":
+ mask_tmp_country = (
+ df_tmp["ADM0_NAME"].str.lower() == cntr.replace("_", " ").lower()
+ )
  else:
- mask_tmp_country = (df_tmp['ADM0_NAME'].str.lower() == 'viet nam')
+ mask_tmp_country = df_tmp["ADM0_NAME"].str.lower() == "viet nam"
  if region:
- mask_tmp_adm1 = (df_tmp[region_column].str.lower() == region.lower())
+ mask_tmp_adm1 = df_tmp[region_column].str.lower() == region.lower()
  else:
  # ADM1_NAME column should be NaN to get country level stats
- mask_tmp_adm1 = (df_tmp[region_column].isnull())
+ mask_tmp_adm1 = df_tmp[region_column].isnull()

  val = df_tmp.loc[mask_tmp_country & mask_tmp_adm1][calendar_year]

@@ -145,7 +166,16 @@ def add_GEOGLAM_statistics(dir_stats, df, stats, method, admin_zone):
  return df


- def add_statistics(dir_stats, df, country, crop, admin_zone, stats, method, target_col="Yield (tn per ha)"):
+ def add_statistics(
+ dir_stats,
+ df,
+ country,
+ crop,
+ admin_zone,
+ stats,
+ method,
+ target_col="Yield (tn per ha)",
+ ):
  """

  Args:
@@ -166,7 +196,9 @@ def add_statistics(dir_stats, df, country, crop, admin_zone, stats, method, targ

  # HACK
  if country == "Afghanistan":
- df_fewsnet.loc[:, "product"] = df_fewsnet["season_name"] + " " + df_fewsnet["product"]
+ df_fewsnet.loc[:, "product"] = (
+ df_fewsnet["season_name"] + " " + df_fewsnet["product"]
+ )
  # Check if country and crop exist in the fewsnet database
  mask = (df_fewsnet["country"] == country) & (df_fewsnet["product"] == crop)

@@ -183,12 +215,27 @@
  mask_region = df_fewsnet[admin_zone] == region
  mask_yield = (
  df_fewsnet["crop_production_system"].isin(
- ["none", "Small-scale (PS)", "Commercial (PS)", "All (PS)", "irrigated", "rainfed"]
+ [
+ "none",
+ "Small-scale (PS)",
+ "Commercial (PS)",
+ "All (PS)",
+ "irrigated",
+ "rainfed",
+ ]
  )
  & (df_fewsnet["harvest_year"] == harvest_year)
  & (df_fewsnet["product"] == crop)
  & df_fewsnet["season_name"].isin(
- ["Main", "Meher", "Main harvest", "Annual", "Summer", "Spring", "Winter"]
+ [
+ "Main",
+ "Meher",
+ "Main harvest",
+ "Annual",
+ "Summer",
+ "Spring",
+ "Winter",
+ ]
  )
  & (df_fewsnet["indicator"].isin(["yield", "area", "production"]))
  )
@@ -84,7 +84,9 @@ def optuna_objective(model, df, feature_names, target_col, cat_features=[]):
  y = df[target_col]

  # Divide the data into training and validation sets
- train_X, val_X, train_y, val_y = train_test_split(X, y, test_size=0.2, random_state=0)
+ train_X, val_X, train_y, val_y = train_test_split(
+ X, y, test_size=0.2, random_state=0
+ )

  model.fit(
  train_X,
@@ -134,7 +136,9 @@ def optimized_model(
  params = {
  "depth": trial.suggest_int("depth", 1, 7),
  "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.1),
- "iterations": trial.suggest_int("iterations", low=1000, high=5000, step=500),
+ "iterations": trial.suggest_int(
+ "iterations", low=1000, high=5000, step=500
+ ),
  "subsample": trial.suggest_float("subsample", 1.0, 1.0),
  "random_strength": trial.suggest_float("random_strength", 0.3, 1.0),
  "reg_lambda": trial.suggest_float("reg_lambda", 1e-8, 1.0),
@@ -177,7 +181,9 @@
  optuna.logging.set_verbosity(optuna.logging.WARNING) # Disable verbose
  sampler = optuna.samplers.TPESampler(seed=seed)
  study = optuna.create_study(sampler=sampler, direction="minimize")
- study.optimize(_optuna_objective, n_trials=n_trials, n_jobs=int(mp.cpu_count() * 0.4))
+ study.optimize(
+ _optuna_objective, n_trials=n_trials, n_jobs=int(mp.cpu_count() * 0.4)
+ )
  if study.best_trial is None:
  raise ValueError("Optimization failed to complete any trials.")
  hyperparams = study.best_trial.params
@@ -74,7 +74,9 @@ def compute_trend(detrended_data, future_time_points=None):
  model = detrended_data.trend_model[0]

  if model_type == "mean":
- trend_component = model.predict(np.ones(len(future_time_points)), has_constant="add")
+ trend_component = model.predict(
+ np.ones(len(future_time_points)), has_constant="add"
+ )
  elif model_type == "linear":
  X_linear = add_constant(future_time_points, has_constant="add")
  trend_component = model.predict(X_linear)
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: geocif
- Version: 0.1.29
+ Version: 0.1.30
  Summary: Models to visualize and forecast crop conditions and yields
  Home-page: https://ritviksahajpal.github.io/yield_forecasting/
  Author: Ritvik Sahajpal
@@ -37,6 +37,7 @@ geocif/ml/feature_selection.py
  geocif/ml/outliers.py
  geocif/ml/outlook.py
  geocif/ml/output.py
+ geocif/ml/spatial_autocorrelation.py
  geocif/ml/stages.py
  geocif/ml/stats.py
  geocif/ml/trainers.py
@@ -50,6 +50,6 @@ setup(
  test_suite="tests",
  tests_require=test_requirements,
  url="https://ritviksahajpal.github.io/yield_forecasting/",
- version="0.1.29",
+ version="0.1.30",
  zip_safe=False,
  )