geocif 0.1.29__tar.gz → 0.1.31__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {geocif-0.1.29/geocif.egg-info → geocif-0.1.31}/PKG-INFO +1 -1
- {geocif-0.1.29 → geocif-0.1.31}/geocif/geocif.py +39 -23
- {geocif-0.1.29 → geocif-0.1.31}/geocif/indices_runner.py +1 -1
- {geocif-0.1.29 → geocif-0.1.31}/geocif/ml/correlations.py +29 -10
- {geocif-0.1.29 → geocif-0.1.31}/geocif/ml/embedding.py +6 -2
- {geocif-0.1.29 → geocif-0.1.31}/geocif/ml/feature_engineering.py +66 -10
- {geocif-0.1.29 → geocif-0.1.31}/geocif/ml/feature_selection.py +21 -9
- {geocif-0.1.29 → geocif-0.1.31}/geocif/ml/outliers.py +30 -14
- geocif-0.1.31/geocif/ml/spatial_autocorrelation.py +224 -0
- {geocif-0.1.29 → geocif-0.1.31}/geocif/ml/stages.py +12 -4
- {geocif-0.1.29 → geocif-0.1.31}/geocif/ml/stats.py +72 -25
- {geocif-0.1.29 → geocif-0.1.31}/geocif/ml/trainers.py +9 -3
- {geocif-0.1.29 → geocif-0.1.31}/geocif/ml/trend.py +3 -1
- {geocif-0.1.29 → geocif-0.1.31/geocif.egg-info}/PKG-INFO +1 -1
- {geocif-0.1.29 → geocif-0.1.31}/geocif.egg-info/SOURCES.txt +1 -0
- {geocif-0.1.29 → geocif-0.1.31}/setup.py +1 -1
- {geocif-0.1.29 → geocif-0.1.31}/LICENSE +0 -0
- {geocif-0.1.29 → geocif-0.1.31}/MANIFEST.in +0 -0
- {geocif-0.1.29 → geocif-0.1.31}/README.md +0 -0
- {geocif-0.1.29 → geocif-0.1.31}/geocif/__init__.py +0 -0
- {geocif-0.1.29 → geocif-0.1.31}/geocif/agmet/__init__.py +0 -0
- {geocif-0.1.29 → geocif-0.1.31}/geocif/agmet/geoagmet.py +0 -0
- {geocif-0.1.29 → geocif-0.1.31}/geocif/agmet/plot.py +0 -0
- {geocif-0.1.29 → geocif-0.1.31}/geocif/agmet/utils.py +0 -0
- {geocif-0.1.29 → geocif-0.1.31}/geocif/analysis.py +0 -0
- {geocif-0.1.29 → geocif-0.1.31}/geocif/backup/__init__.py +0 -0
- {geocif-0.1.29 → geocif-0.1.31}/geocif/backup/constants.py +0 -0
- {geocif-0.1.29 → geocif-0.1.31}/geocif/backup/features.py +0 -0
- {geocif-0.1.29 → geocif-0.1.31}/geocif/backup/geo.py +0 -0
- {geocif-0.1.29 → geocif-0.1.31}/geocif/backup/geocif.py +0 -0
- {geocif-0.1.29 → geocif-0.1.31}/geocif/backup/metadata.py +0 -0
- {geocif-0.1.29 → geocif-0.1.31}/geocif/backup/models.py +0 -0
- {geocif-0.1.29 → geocif-0.1.31}/geocif/cei/__init__.py +0 -0
- {geocif-0.1.29 → geocif-0.1.31}/geocif/cei/definitions.py +0 -0
- {geocif-0.1.29 → geocif-0.1.31}/geocif/cei/indices.py +0 -0
- {geocif-0.1.29 → geocif-0.1.31}/geocif/logger.py +0 -0
- {geocif-0.1.29 → geocif-0.1.31}/geocif/ml/__init__.py +0 -0
- {geocif-0.1.29 → geocif-0.1.31}/geocif/ml/outlook.py +0 -0
- {geocif-0.1.29 → geocif-0.1.31}/geocif/ml/output.py +0 -0
- {geocif-0.1.29 → geocif-0.1.31}/geocif/ml/xai.py +0 -0
- {geocif-0.1.29 → geocif-0.1.31}/geocif/playground/__init__.py +0 -0
- {geocif-0.1.29 → geocif-0.1.31}/geocif/playground/automl.py +0 -0
- {geocif-0.1.29 → geocif-0.1.31}/geocif/playground/misc.py +0 -0
- {geocif-0.1.29 → geocif-0.1.31}/geocif/utils.py +0 -0
- {geocif-0.1.29 → geocif-0.1.31}/geocif/viz/__init__.py +0 -0
- {geocif-0.1.29 → geocif-0.1.31}/geocif/viz/plot.py +0 -0
- {geocif-0.1.29 → geocif-0.1.31}/geocif.egg-info/dependency_links.txt +0 -0
- {geocif-0.1.29 → geocif-0.1.31}/geocif.egg-info/not-zip-safe +0 -0
- {geocif-0.1.29 → geocif-0.1.31}/geocif.egg-info/top_level.txt +0 -0
- {geocif-0.1.29 → geocif-0.1.31}/requirements.txt +0 -0
- {geocif-0.1.29 → geocif-0.1.31}/setup.cfg +0 -0
- {geocif-0.1.29 → geocif-0.1.31}/tests/test_geocif.py +0 -0
geocif/geocif.py

@@ -17,6 +17,7 @@ from tqdm import tqdm
 from geocif import logger as log
 from .cei import definitions as di
 from .ml import correlations
+from .ml import spatial_autocorrelation as sa
 from .ml import feature_engineering as fe
 from .ml import feature_selection as fs
 from .ml import output

@@ -112,6 +113,10 @@ class Geocif:
         self.analogous_year_yield_as_feature = self.parser.getboolean(
             "ML", "analogous_year_yield_as_feature"
         )
+        self.spatial_autocorrelation = self.parser.getboolean(
+            "ML", "spatial_autocorrelation"
+        )
+        self.sa_method = self.parser.get("ML", "sa_method")
         self.last_year_yield_as_feature = self.parser.getboolean(
             "ML", "last_year_yield_as_feature"
         )

@@ -350,6 +355,8 @@ class Geocif:
         experiment_id = f"{self.country}_{self.crop}"
         now = ar.utcnow().to("America/New_York").format("MMMM-DD-YYYY HH:mm:ss")
         selected_features = self.selected_features + self.cat_features
+        # Compute percentage difference between y_pred and y_test
+        ape = np.abs((y_pred - y_test) / y_test) * 100
         df = pd.DataFrame(
             {
                 "Experiment_ID": np.full(shp, experiment_id),

@@ -367,12 +374,13 @@ class Geocif:
                 "Starting Stage": np.full(shp, self.stage_info["Starting Stage"]),
                 "Ending Stage": np.full(shp, self.stage_info["Ending Stage"]),
                 "Model": np.full(shp, self.model_name),
-                "Area (ha)": df_region["Area (ha)"].values,
                 "Region_ID": df_region["Region_ID"].values,
                 "Region": df_region["Region"].values,
                 "Harvest Year": df_region["Harvest Year"].values,
+                "Area (ha)": df_region["Area (ha)"].values,
                 f"Observed {self.target}": np.around(y_test, 3).ravel(),
                 f"Predicted {self.target}": np.around(y_pred, 3).ravel(),
+                f"APE": np.around(ape, 3).ravel(),
             }
         )

@@ -412,6 +420,12 @@ class Geocif:
         except:
             breakpoint()

+        # if self.spatial_autocorrelation:
+        #     # Compute spatial autocorrelation
+        #     df = sa.compute_spatial_autocorrelation(
+        #         self.dg_country
+        #     )
+
         for col in [
             f"Median {self.target}",
             "Analogous Year",

@@ -425,7 +439,7 @@ class Geocif:
         # Create an index based on following columns
         index_columns = [
             "Model",
-            "Cluster Strategy"
+            "Cluster Strategy",
             "Country",
             "Region",
             "Crop",

@@ -709,7 +723,7 @@ class Geocif:
         """ Convert this dataframe into an ML ready format and save to disk """
         df = self.create_ml_dataframe(df)
         dir_output = (
-            self.dir_analysis / self.country / self.crop / str(self.forecast_season)
+            self.dir_analysis / self.country / self.crop / self.model_name / str(self.forecast_season)
         )
         os.makedirs(dir_output, exist_ok=True)
         df.to_csv(

@@ -718,7 +732,6 @@ class Geocif:
         )

         # cat_features should be converted to category type
-
         df[self.cat_features] = df[self.cat_features].astype("category")

         """ Heatmap of correlation of various features with yield at each time step"""

@@ -739,26 +752,29 @@ class Geocif:
             how="outer",
         )

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        dict_kwargs["combined_dict"] = self.combined_dict
+        dict_kwargs = {}
+        dict_kwargs["all_stages"] = self.all_stages
+        dict_kwargs["target_col"] = self.target
+        dict_kwargs["country"] = self.country
+        dict_kwargs["crop"] = self.crop
+        dict_kwargs["dir_output"] = (
+            self.dir_analysis
+            / self.country
+            / self.crop
+            / self.model_name
+            / str(self.forecast_season)
+        )
+        dict_kwargs["forecast_season"] = self.forecast_season
+        dict_kwargs["method"] = self.method
+        dict_kwargs["national_correlation"] = self.national_correlation
+        dict_kwargs["groupby"] = self.correlation_plot_groupby
+        dict_kwargs["dg_country"] = self.dg_country
+        dict_kwargs["combined_dict"] = self.combined_dict

+        if self.spatial_autocorrelation:
+            sa.compute_spatial_autocorrelation(self.df_results, **dict_kwargs)
+
+        if self.correlation_plots:
             self.logger.info(f"Correlation plot for {self.country} {self.crop}")
             (
                 dict_selected_features,
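The two `self.parser` reads added above expect matching keys in the `[ML]` section of the run configuration. A minimal sketch of those entries and how `configparser` resolves them; the file contents other than the two keys, and the `queen` value for `sa_method`, are illustrative assumptions rather than something taken from this diff:

```python
from configparser import ConfigParser

# Only the two [ML] keys below are implied by the diff; the values are placeholders.
config_text = """
[ML]
spatial_autocorrelation = true
sa_method = queen
"""

parser = ConfigParser()
parser.read_string(config_text)

# Mirrors the reads added to Geocif.__init__
spatial_autocorrelation = parser.getboolean("ML", "spatial_autocorrelation")  # -> True
sa_method = parser.get("ML", "sa_method")  # -> "queen" (illustrative value)
print(spatial_autocorrelation, sa_method)
```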
geocif/ml/correlations.py

@@ -28,7 +28,9 @@ def most_correlated_feature_by_time(df_train, simulation_stages, target_col):

     # Only select columns that have been observed till the current stage
     for stage in tqdm(stages, leave=False, desc="Compute most correlated feature"):
-        current_feature_set = [
+        current_feature_set = [
+            col for col in df_train.columns if col.endswith(f"_{stage[-1]}")
+        ]

         # Get the most correlated feature for each region
         top_feature_by_region, counter = embedding.get_top_correlated_features(

@@ -41,7 +43,9 @@ def most_correlated_feature_by_time(df_train, simulation_stages, target_col):
         # Loop through top_feature_by_region and find the average score for _feature
         # Calculate the average score for 'DTR_36'
         _feature_scores = [
-            value[1][0]
+            value[1][0]
+            for key, value in top_feature_by_region.items()
+            if _feature in value[0]
         ]
         average_score = sum(_feature_scores) / len(_feature_scores)
         _feature = utils.remove_last_part(_feature)

@@ -137,7 +141,9 @@ def plot_feature_corr_by_time(df, **kwargs):
     # Add colorbar label
     # cbar_ax.set_xlabel("Correlation Coefficient", labelpad=3, size="small")
     cbar_ax.set_title("Correlation Coefficient", loc="left", size="small")
-    ax_heatmap.set_xticklabels(
+    ax_heatmap.set_xticklabels(
+        ax_heatmap.get_xticklabels(), size="x-small", rotation=0, fontsize=5
+    )
     ax_heatmap.set_yticklabels(ax_heatmap.get_yticklabels(), size="x-small", fontsize=5)
     ax_heatmap.set_xlabel("")
     ax_heatmap.set_ylabel(" ")

@@ -190,7 +196,9 @@ def _all_correlated_feature_by_time(df, **kwargs):
         pbar.set_description(f"Calculating correlations")
         pbar.update()

-        stage_name = stages.get_stage_information_dict(f"GD4_{stage}", method)[
+        stage_name = stages.get_stage_information_dict(f"GD4_{stage}", method)[
+            "Stage Name"
+        ]
         # starting_stage = stage_name.split("-")[0]
         current_feature_set = [col for col in df.columns if stage_name in col]

@@ -210,7 +218,9 @@ def _all_correlated_feature_by_time(df, **kwargs):

     all_stage_names = []
     for stage in stages_features:
-        _tmp = stages.get_stage_information_dict(f"GD4_{stage}", method)[
+        _tmp = stages.get_stage_information_dict(f"GD4_{stage}", method)[
+            "Stage Name"
+        ]
         all_stage_names.append(_tmp)

     df_results = df_results.reindex(all_stage_names)

@@ -254,7 +264,12 @@ def all_correlated_feature_by_time(df, **kwargs):
         df_tmp = df_corr[df_corr.columns[(df_corr.mean() > 0.1)]]
         dict_selected_features[region_id] = df_tmp.columns

-        df_tmp2 =
+        df_tmp2 = (
+            df_tmp.median(axis=0)
+            .abs()
+            .sort_values(ascending=False)
+            .reset_index()
+        )
         df_tmp2.columns = ["Metric", "Value"]
         # Add another column based on Type of Metric
         for idx, row in df_tmp2.iterrows():

@@ -278,8 +293,8 @@ def all_correlated_feature_by_time(df, **kwargs):
         dict_selected_features[region_id] = df_corr.columns
         dict_best_cei[region_id] = {}

-        #dict_selected_features[region_id] = dict_selected_features[0]
-        #dict_best_cei[region_id] = dict_best_cei[0]
+        # dict_selected_features[region_id] = dict_selected_features[0]
+        # dict_best_cei[region_id] = dict_best_cei[0]
     # Combine all unique values from the existing dictionary elements
     # combined_metrics = set()
     # for key in dict_selected_features:

@@ -310,7 +325,9 @@ def feature_correlation_by_time(**kwargs):

     # Only select columns that have been observed till the current stage
     for stage in tqdm(stages, leave=False, desc="Compute feature correlation by time"):
-        current_feature_set = [
+        current_feature_set = [
+            col for col in df_train.columns if col.endswith(f"_{stage[-1]}")
+        ]

         # Get the most correlated feature for each region
         top_feature_by_region, counter = embedding.compute_feature_correlations(

@@ -324,7 +341,9 @@ def feature_correlation_by_time(**kwargs):
         # Loop through top_feature_by_region and find the average score for _feature
         # Calculate the average score for 'DTR_36'
         _feature_scores = [
-            value[1][0]
+            value[1][0]
+            for key, value in top_feature_by_region.items()
+            if _feature in value[0]
         ]
         average_score = sum(_feature_scores) / len(_feature_scores)
         _feature = utils.remove_last_part(_feature)
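These hunks mostly re-wrap long statements, but the underlying pattern is: for each growth stage, keep only the columns whose names end with that stage's suffix and correlate them with the target. A minimal, self-contained sketch of that pattern; the column names, the toy data, and the Spearman choice are illustrative assumptions:

```python
import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
df_train = pd.DataFrame(
    {
        "DTR_36": rng.normal(size=50),      # stage-36 feature (made-up values)
        "GDD_36": rng.normal(size=50),      # stage-36 feature
        "DTR_37": rng.normal(size=50),      # different stage, should be excluded
        "Yield (tn per ha)": rng.normal(loc=3, size=50),
    }
)

stage = ("GD4", "36")  # illustrative stage tuple; only the last element is used as a suffix
current_feature_set = [
    col for col in df_train.columns if col.endswith(f"_{stage[-1]}")
]

# Correlation of each stage-specific feature with the target
corr = df_train[current_feature_set].corrwith(
    df_train["Yield (tn per ha)"], method="spearman"
)
print(corr.sort_values(ascending=False))
```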
geocif/ml/embedding.py

@@ -79,7 +79,9 @@ def get_top_correlated_features(inputs, targets):
         feature_correlations = _compute_correlations(X, y)

         # Exclude any nan values
-        feature_correlations = {
+        feature_correlations = {
+            k: v for k, v in feature_correlations.items() if not np.isnan(v)
+        }

         if not feature_correlations:
             continue

@@ -113,7 +115,9 @@ def get_all_features_correlation(inputs, targets, method):
         feature_correlations = _compute_correlations(X, y)

         # Exclude any nan values
-        feature_correlations = {
+        feature_correlations = {
+            k: v for k, v in feature_correlations.items() if not np.isnan(v)
+        }

         if not feature_correlations:
             continue
geocif/ml/feature_engineering.py

@@ -21,11 +21,15 @@ def compute_last_year_yield(df, target_col="Yield (tn per ha)"):
     # Initialize the new column with NaNs
     df[f"Last Year {target_col}"] = np.nan

-    for region, group in tqdm(
+    for region, group in tqdm(
+        df.groupby("Region"), desc="Last year yields", leave=False
+    ):
         unique_years = group["Harvest Year"].unique()

         for harvest_year in unique_years:
-            mask = (group["Harvest Year"] == harvest_year - 1) & (
+            mask = (group["Harvest Year"] == harvest_year - 1) & (
+                group["Region"] == region
+            )
             last_year_yield = group.loc[mask, target_col].values
             if last_year_yield:
                 df.loc[

@@ -89,7 +93,9 @@ def compute_median_yield(
         closest_years = compute_closest_years(
             all_seasons_with_yield, harvest_year, number_median_years
         )
-        mask = (group["Harvest Year"].isin(closest_years)) & (
+        mask = (group["Harvest Year"].isin(closest_years)) & (
+            group["Region"] == region
+        )
         median_yield = group.loc[mask, target_col].median()
         df.loc[
             (df["Region"] == region) & (df["Harvest Year"] == harvest_year),

@@ -99,7 +105,9 @@ def compute_median_yield(
     return df


-def compute_lag_yield(
+def compute_lag_yield(
+    df, all_seasons_with_yield, number_lag_years, target_col="Yield (tn per ha)"
+):
     # For the number of years specified in self.number_lag_years, add the yield of that number of years
     # ago to the dataframe
     # For example, if number_lag_years is 3, then the yield of each year upto 3 years ago will be added

@@ -125,7 +133,9 @@ def compute_lag_yield(df, all_seasons_with_yield, number_lag_years, target_col="
             col = f"t -{idx + 1} {target_col}"

             mask_group_year = group["Harvest Year"] == year
-            mask_region = (df["Region"] == region) & (
+            mask_region = (df["Region"] == region) & (
+                df["Harvest Year"] == harvest_year
+            )
             yield_value = group.loc[mask_group_year, target_col].values

             if yield_value.size > 0:

@@ -181,11 +191,15 @@ def compute_analogous_yield(
     all_years = df["Harvest Year"].unique()

     for harvest_year in tqdm(all_years, desc="Computing analogous yields", leave=False):
-        lag_years = compute_closest_years(
+        lag_years = compute_closest_years(
+            all_seasons_with_yield, harvest_year, number_lag_years
+        )

         for region in df["Region"].unique():
             # Filter current year and region dataset
-            df_current = df[
+            df_current = df[
+                (df["Harvest Year"] == harvest_year) & (df["Region"] == region)
+            ]
             # Filter dataset for lag years and the same region
             df_lag = df[(df["Harvest Year"].isin(lag_years)) & (df["Region"] == region)]

@@ -242,6 +256,7 @@ def detect_clusters(df, target_col="Yield (tn per ha)"):

     # Suppress warnings in this function
     import warnings
+
     warnings.filterwarnings("ignore")

     from kneed import KneeLocator

@@ -291,7 +306,9 @@ def detect_clusters(df, target_col="Yield (tn per ha)"):
         inertia.append(kmeans.inertia_)

     # Use KneeLocator to find the elbow point automatically
-    knee_locator = KneeLocator(
+    knee_locator = KneeLocator(
+        range_of_clusters, inertia, curve="convex", direction="decreasing"
+    )

     # # Plot the Elbow Method for visual confirmation
     # plt.figure(figsize=(10, 6))

@@ -306,7 +323,9 @@ def detect_clusters(df, target_col="Yield (tn per ha)"):
     # Use the detected number of clusters
     optimal_clusters = knee_locator.knee
     if optimal_clusters:
-        optimal_clusters =
+        optimal_clusters = (
+            optimal_clusters + 1 if optimal_clusters > 1 else optimal_clusters
+        )

         # Apply K-Means clustering with the detected optimal number of clusters
         kmeans = KMeans(n_clusters=optimal_clusters, random_state=42)

@@ -321,6 +340,43 @@ def detect_clusters(df, target_col="Yield (tn per ha)"):
         )
     else:
         # If no optimal_clusters is found, then assign all regions to a single cluster
-        clusters_assigned = pd.DataFrame(
+        clusters_assigned = pd.DataFrame(
+            {"Region": df_yield_pivot.index, "Region_ID": 1}
+        )

     return clusters_assigned
+
+
+# breakpoint()
+
+# from libpysal.weights import Queen, Rook
+# from pysal.lib import weights
+# from scipy.linalg import eigh
+#
+# breakpoint()
+# df = pd.DataFrame()
+#
+# # Create a spatial weights matrix (e.g., Queen contiguity)
+# w = weights.Queen.from_dataframe(dg)
+#
+# # Transform weights to row-standardized form
+# w.transform = 'r'
+#
+# # Convert the weights matrix to a dense format for eigen decomposition
+# W_dense = w.full()[0]
+#
+# # Compute eigenvalues and eigenvectors
+# eigenvalues, eigenvectors = eigh(W_dense)
+#
+# # Sort eigenvalues and corresponding eigenvectors
+# sorted_indices = np.argsort(eigenvalues)[::-1]
+# eigenvalues = eigenvalues[sorted_indices]
+# eigenvectors = eigenvectors[:, sorted_indices]
+#
+# # Select a subset of eigenvectors (e.g., first 10)
+# selected_eigenvectors = eigenvectors[:, :2]
+#
+# breakpoint()
+# # Add eigenvectors to the GeoDataFrame
+# for i in range(selected_eigenvectors.shape[1]):
+#     df[f'EV_{i + 1}'] = selected_eigenvectors[:, i]
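The `detect_clusters` changes above re-wrap the `KneeLocator` call and the cluster-count adjustment. As a standalone illustration of that elbow-detection step, here is a minimal sketch with `kneed` and scikit-learn; the synthetic data and the cluster range are made up for the example and are not taken from the package:

```python
import numpy as np
from kneed import KneeLocator
from sklearn.cluster import KMeans

rng = np.random.default_rng(42)
# Three synthetic groups standing in for region-level yield profiles
X = np.vstack(
    [rng.normal(loc=c, scale=0.3, size=(20, 5)) for c in (1.0, 3.0, 5.0)]
)

range_of_clusters = range(1, 9)
inertia = []
for k in range_of_clusters:
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10).fit(X)
    inertia.append(kmeans.inertia_)

# Elbow point: where inertia stops dropping sharply
knee_locator = KneeLocator(
    list(range_of_clusters), inertia, curve="convex", direction="decreasing"
)
optimal_clusters = knee_locator.knee
print("Detected number of clusters:", optimal_clusters)
```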
geocif/ml/feature_selection.py

@@ -77,16 +77,22 @@ def select_features(X, y, method="RFE", min_features_to_select=3):

         # Step 5: Summarize the SHAP values for feature importance
         shap_importances = np.mean(np.abs(shap_values), axis=0)
-        shap_importance_df = pd.DataFrame(
-
-
-        }).sort_values(by='importance', ascending=False)
+        shap_importance_df = pd.DataFrame(
+            {"feature": X.columns, "importance": shap_importances}
+        ).sort_values(by="importance", ascending=False)

         def evaluate_model_with_n_features(N, X_train, y_train):
-            top_features = shap_importance_df[
+            top_features = shap_importance_df["feature"].head(N).values
             X_train_selected = X_train[top_features]
             selector = CatBoostRegressor(n_estimators=500, random_state=42, verbose=0)
-            scores = cross_val_score(
+            scores = cross_val_score(
+                selector,
+                X_train_selected,
+                y_train,
+                cv=5,
+                scoring="neg_mean_squared_error",
+                n_jobs=-1,
+            )

             return np.mean(scores)

@@ -100,7 +106,9 @@ def select_features(X, y, method="RFE", min_features_to_select=3):
         optimal_N = nrange[np.argmax(cv_scores)]

         # Use optimal N to select features
-        selected_features =
+        selected_features = (
+            shap_importance_df["feature"].head(optimal_N).values.tolist()
+        )
     elif method == "feature_engine":
         from feature_engine.selection import SmartCorrelatedSelection

@@ -202,7 +210,9 @@ def select_features(X, y, method="RFE", min_features_to_select=3):
         }
         model = CatBoostRegressor(**hyperparams)

-        selector = BorutaShap(
+        selector = BorutaShap(
+            model=model, importance_measure="shap", classification=False
+        )
         selector.fit(
             X=X,
             y=y,

@@ -237,7 +247,9 @@ def select_features(X, y, method="RFE", min_features_to_select=3):
     elif method == "RFE":
         from sklearn.feature_selection import RFE

-        selector = RFE(
+        selector = RFE(
+            forest, n_features_to_select=min_features_to_select, step=1, verbose=1
+        )
         selector = selector.fit(X, y)
         selected_features_mask = selector.support_
         selected_features = X.columns[selected_features_mask].tolist()
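The `select_features` changes restore the SHAP-importance branch: rank features by mean |SHAP value|, then pick the feature count N whose top-N subset maximizes a cross-validated score. A compact sketch of that idea on synthetic data; the hyperparameters, data, and search range here are illustrative, not the package's defaults:

```python
import numpy as np
import pandas as pd
import shap
from catboost import CatBoostRegressor
from sklearn.model_selection import cross_val_score

rng = np.random.default_rng(0)
X = pd.DataFrame(rng.normal(size=(200, 8)), columns=[f"f{i}" for i in range(8)])
y = 2 * X["f0"] - X["f1"] + rng.normal(scale=0.1, size=200)  # only f0, f1 matter

# Fit once to obtain SHAP importances
model = CatBoostRegressor(n_estimators=200, random_state=42, verbose=0).fit(X, y)
shap_values = shap.TreeExplainer(model).shap_values(X)
shap_importances = np.mean(np.abs(shap_values), axis=0)
shap_importance_df = pd.DataFrame(
    {"feature": X.columns, "importance": shap_importances}
).sort_values(by="importance", ascending=False)

def evaluate_model_with_n_features(n):
    # Cross-validated score of a model trained on the top-n features only
    top_features = shap_importance_df["feature"].head(n).values
    selector = CatBoostRegressor(n_estimators=200, random_state=42, verbose=0)
    return cross_val_score(
        selector, X[top_features], y, cv=5, scoring="neg_mean_squared_error"
    ).mean()

nrange = range(1, 6)
cv_scores = [evaluate_model_with_n_features(n) for n in nrange]
optimal_N = list(nrange)[int(np.argmax(cv_scores))]
selected_features = shap_importance_df["feature"].head(optimal_N).tolist()
print(optimal_N, selected_features)
```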
geocif/ml/outliers.py

@@ -94,19 +94,23 @@ if __name__ == "__main__":

         if not os.path.isfile(BASE_DIR / crop / f"adm_crop_production_z_{crop}.csv"):
             # In rows where admin_2 != "none", replace admin_1 with admin_2
-            df_fewsnet_sub.loc[
-                "admin_2"
-            ]
+            df_fewsnet_sub.loc[
+                df_fewsnet_sub["admin_2"] != "none", "admin_1"
+            ] = df_fewsnet_sub["admin_2"]

             df_output = find_outlier(df_fewsnet_sub)

-            df_output.to_csv(
+            df_output.to_csv(
+                BASE_DIR / crop / f"adm_crop_production_z_{crop}.csv", index=False
+            )
         else:
-            df_output = pd.read_csv(
+            df_output = pd.read_csv(
+                BASE_DIR / crop / f"adm_crop_production_z_{crop}.csv"
+            )

-            df_fewsnet_sub.loc[
-                "admin_2"
-            ]
+            df_fewsnet_sub.loc[
+                df_fewsnet_sub["admin_2"] != "none", "admin_1"
+            ] = df_fewsnet_sub["admin_2"]

         # Create a column called Z-Score Category based on the value of the z-score
         # The categories are:

@@ -142,7 +146,9 @@ if __name__ == "__main__":
         df_fewsnet_sub["harvest_year"] = df_fewsnet_sub["harvest_year"].astype(int)

         df_yield = df_fewsnet_sub[mask & (df_fewsnet_sub["indicator"] == "yield")]
-        df_production = df_fewsnet_sub[
+        df_production = df_fewsnet_sub[
+            mask & (df_fewsnet_sub["indicator"] == "production")
+        ]
         df_area = df_fewsnet_sub[mask & (df_fewsnet_sub["indicator"] == "area")]

         df_yield["harvest_year"] = df_yield["harvest_year"].astype(int)

@@ -158,9 +164,13 @@ if __name__ == "__main__":
         # Add 3 subplots, first for area
         plt.figure(figsize=(10, 10))
         plt.subplot(3, 1, 1)
-        plt.plot(
+        plt.plot(
+            df_yield[mask]["harvest_year"].astype(int), df_yield[mask]["value"]
+        )
         # Add a circle for each year where yield is available
-        plt.scatter(
+        plt.scatter(
+            df_yield[mask]["harvest_year"].astype(int), df_yield[mask]["value"]
+        )
         # Draw a horizontal line at the average df_yield[mask]["value"]
         plt.axhline(df_yield[mask]["value"].mean(), color="red", linestyle="--")
         # Place a tick on x-axis at every year and make labels vertical

@@ -195,13 +205,17 @@ if __name__ == "__main__":
             df_production[mask]["value"],
         )
         # Place a tick on x-axis at every year
-        plt.xticks(
+        plt.xticks(
+            df_production[mask]["harvest_year"].astype(int)[::2], rotation=90
+        )
         plt.xlabel("Year")
         plt.ylabel("Production")

         plt.subplot(3, 1, 3)
         plt.plot(df_area[mask]["harvest_year"].astype(int), df_area[mask]["value"])
-        plt.scatter(
+        plt.scatter(
+            df_area[mask]["harvest_year"].astype(int), df_area[mask]["value"]
+        )
         # Place a tick on x-axis at every year
         plt.xticks(df_area[mask]["harvest_year"].astype(int)[::2], rotation=90)
         plt.xlabel("Year")

@@ -210,7 +224,9 @@ if __name__ == "__main__":
         try:
             os.makedirs(BASE_DIR / crop, exist_ok=True)
             plt.savefig(
-                BASE_DIR
+                BASE_DIR
+                / crop
+                / f"{fnid}_{country}_{admin_1}_{crop}_{season_name}.png"
             )
         except:
             breakpoint()
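The outlier script above works with per-region z-scores of FEWS NET statistics (the `adm_crop_production_z_{crop}.csv` output) before bucketing them into categories; the hunks shown only reformat plotting and I/O calls. As a reminder of the underlying statistic, here is a minimal sketch of a grouped z-score; the toy column names follow the lower-case FEWS NET style used above, but the exact logic inside `find_outlier` is not shown in this diff:

```python
import pandas as pd

df = pd.DataFrame(
    {
        "admin_1": ["A", "A", "A", "B", "B", "B"],
        "harvest_year": [2019, 2020, 2021, 2019, 2020, 2021],
        "value": [1.0, 1.2, 3.5, 2.0, 2.1, 1.9],
    }
)

# z-score of each observation relative to its own admin unit
grouped = df.groupby("admin_1")["value"]
df["z_score"] = (df["value"] - grouped.transform("mean")) / grouped.transform("std")
print(df)
```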
geocif/ml/spatial_autocorrelation.py (new file)

@@ -0,0 +1,224 @@
+import warnings
+
+from tqdm import tqdm
+import matplotlib.pyplot as plt
+import pandas as pd
+from pysal.lib import weights
+
+warnings.filterwarnings("ignore")
+
+
+def validate_inputs(df_results, required_columns):
+    """
+
+    Args:
+        df_results:
+        required_columns:
+
+    Returns:
+
+    """
+    if not all(column in df_results.columns for column in required_columns):
+        raise ValueError(
+            f"df_results must contain the following columns: {required_columns}"
+        )
+
+
+def preprocess_data(df_results, dg_country):
+    """
+
+    Args:
+        df_results:
+        dg_country:
+
+    Returns:
+
+    """
+    df = df_results.drop_duplicates()
+    df = df.dropna(subset=["Yield (tn per ha)"])
+
+    dg_country = dg_country.drop_duplicates(subset="Country Region")
+    dg_country = dg_country.dropna(subset=["Country Region", "Region_ID", "geometry"])
+
+    df["Country Region"] = (df["Country"] + " " + df["Region"]).str.lower()
+    dg_country["Country Region"] = dg_country["Country Region"].str.lower()
+    dg_country = dg_country[dg_country["Country Region"].isin(df["Country Region"])]
+
+    dg_country.reset_index(drop=True, inplace=True)
+
+    merged_df = dg_country.merge(df, on="Country Region", how="inner")
+
+    return merged_df
+
+
+def create_base_weights(merged_df):
+    """
+
+    Args:
+        merged_df:
+
+    Returns:
+
+    """
+    dg = merged_df[["Country Region", "geometry"]].drop_duplicates()
+
+    try:
+        w_base = weights.Queen.from_dataframe(dg)
+    except Exception as e:
+        raise RuntimeError(f"Failed to create spatial weights: {e}")
+
+    no_neighbors = [
+        index for index, neighbors in w_base.neighbors.items() if len(neighbors) == 0
+    ]
+    if no_neighbors:
+        dg = dg.drop(index=no_neighbors[0]).reset_index(drop=True)
+        w_base = weights.Queen.from_dataframe(dg[["Country Region", "geometry"]])
+
+    return w_base, dg
+
+
+def create_weights_for_year(dg_country, regions_with_data):
+    """
+
+    Args:
+        dg_country:
+        regions_with_data:
+
+    Returns:
+
+    """
+    dg = dg_country[dg_country["Country Region"].isin(regions_with_data)]
+    dg = dg.reset_index(drop=True)
+
+    wt = weights.Queen.from_dataframe(dg)
+
+    no_neighbors = [
+        index for index, neighbors in wt.neighbors.items() if len(neighbors) == 0
+    ]
+    if no_neighbors:
+        dg = dg.drop(index=no_neighbors[0]).reset_index(drop=True)
+        wt = weights.Queen.from_dataframe(dg[["Country Region", "geometry"]])
+
+    return wt, dg
+
+
+def compute_morans_i(merged_df):
+    """
+
+    Args:
+        merged_df:
+        dg_country:
+
+    Returns:
+
+    """
+    from pysal.explore import esda
+
+    # Drop any regions with missing data
+    merged_df = merged_df.dropna(subset=["Yield (tn per ha)"])
+
+    years = merged_df["Harvest Year"].unique()
+    results = {"Harvest Year": [], "Moran's I": [], "p-value": [], "Significant": []}
+
+    for year in tqdm(years, desc="Compute Moran's I"):
+        year_data = merged_df[merged_df["Harvest Year"] == year]
+        regions_with_data = year_data["Country Region"].unique()
+        year_data = year_data[year_data["Country Region"].isin(regions_with_data)]
+
+        y = year_data[["Country Region", "Region", "Yield (tn per ha)"]].drop_duplicates()
+        dg_country = year_data[["Country Region", "geometry"]].drop_duplicates()
+
+        if len(y) > 1:
+            w, x = create_weights_for_year(dg_country, regions_with_data)
+            y = y[y["Country Region"].isin(x["Country Region"])]
+
+            try:
+                mi = esda.Moran(y["Yield (tn per ha)"].values, w, permutations=999)
+            except:
+                breakpoint()
+            results["Harvest Year"].append(year)
+            try:
+                results["Moran's I"].append(mi.I)
+            except:
+                breakpoint()
+            results["p-value"].append(mi.p_sim)
+            results["Significant"].append(mi.p_sim < 0.1)
+        else:
+            results["Harvest Year"].append(year)
+            results["Moran's I"].append(None)
+            results["p-value"].append(None)
+            results["Significant"].append(False)
+
+    return pd.DataFrame(results)
+
+
+def plot_morans_i_time_series(results_df, country, crop, dir_output):
+    """
+
+    Args:
+        results_df:
+        country:
+        crop:
+        dir_output:
+
+    Returns:
+
+    """
+    plt.figure(figsize=(10, 6))
+
+    significant = results_df[results_df["Significant"]]
+    plt.scatter(
+        significant["Harvest Year"],
+        significant["Moran's I"],
+        color="red",
+        label="Significant (p < 0.1)",
+    )
+
+    not_significant = results_df[~results_df["Significant"]]
+    plt.plot(
+        not_significant["Harvest Year"],
+        not_significant["Moran's I"],
+        marker="o",
+        linestyle="-",
+        color="blue",
+        label="Non-Significant",
+    )
+
+    plt.ylabel("Moran's I")
+    plt.legend()
+    plt.grid(True)
+    plt.savefig(dir_output / f"{country}_{crop}.png")
+    plt.close()
+
+
+def compute_spatial_autocorrelation(df_results, **kwargs):
+    """
+
+    Args:
+        df_results:
+        **kwargs:
+
+    Returns:
+
+    """
+    country = kwargs.get("country")
+    crop = kwargs.get("crop")
+    dg_country = kwargs.get("dg_country")
+    dir_output = kwargs.get("dir_output")
+
+    required_columns = [
+        "Country",
+        "Crop",
+        "Region",
+        "Harvest Year",
+        "Yield (tn per ha)",
+    ]
+    validate_inputs(df_results, required_columns)
+
+    merged_df = preprocess_data(df_results, dg_country)
+    if merged_df.empty:
+        raise ValueError("No valid data available after preprocessing")
+
+    results_df = compute_morans_i(merged_df)
+
+    plot_morans_i_time_series(results_df, country, crop, dir_output)
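The new module computes one Moran's I statistic per harvest year from region-level yields and Queen-contiguity weights (999 permutations, p < 0.1 flagged as significant), then plots the time series. A minimal usage sketch, assuming a boundary layer readable by geopandas and a results table with the required columns; the file names and the `Country Region`/`Region_ID` attributes on the boundary layer are assumptions based on `preprocess_data`, not fixed inputs documented by the package:

```python
from pathlib import Path

import geopandas as gpd
import pandas as pd

from geocif.ml import spatial_autocorrelation as sa

# Region boundaries; must carry "Country Region", "Region_ID" and "geometry"
dg_country = gpd.read_file("admin1_boundaries.shp")  # placeholder path

# Model results; must carry Country, Crop, Region, Harvest Year, Yield (tn per ha)
df_results = pd.read_csv("ml_results.csv")  # placeholder path

out_dir = Path("analysis/malawi/maize")  # placeholder output directory
out_dir.mkdir(parents=True, exist_ok=True)

sa.compute_spatial_autocorrelation(
    df_results,
    country="malawi",      # used only for the output file name
    crop="maize",          # used only for the output file name
    dg_country=dg_country,
    dir_output=out_dir,    # Path-like; the plot is saved as <country>_<crop>.png
)
```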
geocif/ml/stages.py

@@ -17,7 +17,9 @@ def add_stage_information(df, method):
    df["Stage"] = df["Stage"].astype(str)

    df["Stage_ID"] = df["Stage"]
-    df["Stage Range"] = df["Stage"].apply(
+    df["Stage Range"] = df["Stage"].apply(
+        lambda x: "_".join([x.split("_")[0], x.split("_")[-1]])
+    )

    # Create a column with starting stage and ending stage
    # Stage looks like this: 13_12_11

@@ -34,14 +36,18 @@ def add_stage_information(df, method):
        dict = utils.dict_growth_stages_biweekly
    elif "monthly" in method:
        dict = utils.dict_growth_stages_monthly
-    df["Stage Names"] =
+    df["Stage Names"] = (
+        df["Starting Stage"].map(dict) + " - " + df["Ending Stage"].map(dict)
+    )

    # Group by Region, Harvest Year
    # For each group, add a column called Percentage Season
    # that is the percentage of the season that has passed based on the number of rows
    # in the group
    grouped = df.groupby(["Region", "Harvest Year"])
-    df["Percentage Season"] =
+    df["Percentage Season"] = (
+        grouped.cumcount() * 100.0 / grouped["CEI"].transform("size")
+    )

    return df

@@ -186,7 +192,9 @@ def get_stage_information_dict(stage_str, method):
    end_stage = parts[-1]

    # Exclude cei from the stage_str string
-    stage_info["Stage_ID"] =
+    stage_info["Stage_ID"] = (
+        "_".join(parts[1:]) if parts[1].isdigit() else "_".join(parts[2:])
+    )

    stage_info["CEI"] = cei
    stage_info["Stage Range"] = "_".join([start_stage, end_stage])
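The restored `Percentage Season` expression divides each row's position within its (Region, Harvest Year) group by the group size. A tiny illustration of that cumcount/transform pattern on made-up data:

```python
import pandas as pd

df = pd.DataFrame(
    {
        "Region": ["A"] * 4 + ["B"] * 2,
        "Harvest Year": [2020] * 6,
        "CEI": [10, 11, 12, 13, 20, 21],
    }
)

grouped = df.groupby(["Region", "Harvest Year"])
# cumcount() numbers rows within each group; transform("size") gives the group length
df["Percentage Season"] = grouped.cumcount() * 100.0 / grouped["CEI"].transform("size")
print(df)
# Region A rows get 0, 25, 50, 75; region B rows get 0, 50
```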
geocif/ml/stats.py

@@ -29,33 +29,54 @@ def get_yld_prd(df, name_crop, cntr, region, calendar_year, region_column="ADM1_
    # df_tmp = df.loc[mask_adm1]

    df_tmp = df.copy()
-    if name_crop ==
-        if cntr ==
-            df_tmp = df.loc[df.Season ==
-        elif cntr ==
-            df_tmp = df.loc[df.Season ==
-        elif cntr ==
-            df_tmp = df.loc[df.Season ==
-        elif cntr ==
-            df_tmp = df.loc[df.Season ==
-    elif name_crop ==
-
-
-
-
-
-
+    if name_crop == "rice":
+        if cntr == "Viet nam":
+            df_tmp = df.loc[df.Season == "Spring Paddy"]
+        elif cntr == "Thailand":
+            df_tmp = df.loc[df.Season == "Major Season"]
+        elif cntr == "China":
+            df_tmp = df.loc[df.Season == "Single-cropping and Middle-season Rice"]
+        elif cntr == "India":
+            df_tmp = df.loc[df.Season == "Kharif"]
+    elif name_crop == "maize" and cntr in [
+        "Austria",
+        "Belgium",
+        "Bulgaria",
+        "Croatia",
+        "Czech Republic",
+        "Denmark",
+        "Germany",
+        "Greece",
+        "Hungary",
+        "Italy",
+        "Lithuania",
+        "Luxembourg",
+        "Netherlands",
+        "Poland",
+        "Portugal",
+        "Romania",
+        "Slovakia",
+        "Slovenia",
+        "Spain",
+        "Sweden",
+        "United Kingdom",
+    ]:
+        df_tmp = df.loc[df.Season == "Grain Maize and Corn-cob-mix"]
+    elif name_crop == "maize" and cntr in ["France"]:
+        df_tmp = df.loc[df.Season == "Green Maize"]

    if not df_tmp.empty:
-        if cntr !=
-            mask_tmp_country = (
+        if cntr != "Vietnam":
+            mask_tmp_country = (
+                df_tmp["ADM0_NAME"].str.lower() == cntr.replace("_", " ").lower()
+            )
        else:
-            mask_tmp_country =
+            mask_tmp_country = df_tmp["ADM0_NAME"].str.lower() == "viet nam"
        if region:
-            mask_tmp_adm1 =
+            mask_tmp_adm1 = df_tmp[region_column].str.lower() == region.lower()
        else:
            # ADM1_NAME column should be NaN to get country level stats
-            mask_tmp_adm1 =
+            mask_tmp_adm1 = df_tmp[region_column].isnull()

    val = df_tmp.loc[mask_tmp_country & mask_tmp_adm1][calendar_year]

@@ -145,7 +166,16 @@ def add_GEOGLAM_statistics(dir_stats, df, stats, method, admin_zone):
    return df


-def add_statistics(
+def add_statistics(
+    dir_stats,
+    df,
+    country,
+    crop,
+    admin_zone,
+    stats,
+    method,
+    target_col="Yield (tn per ha)",
+):
    """

    Args:

@@ -166,7 +196,9 @@ def add_statistics(dir_stats, df, country, crop, admin_zone, stats, method, targ

    # HACK
    if country == "Afghanistan":
-        df_fewsnet.loc[:, "product"] =
+        df_fewsnet.loc[:, "product"] = (
+            df_fewsnet["season_name"] + " " + df_fewsnet["product"]
+        )
    # Check if country and crop exist in the fewsnet database
    mask = (df_fewsnet["country"] == country) & (df_fewsnet["product"] == crop)

@@ -183,12 +215,27 @@ def add_statistics(dir_stats, df, country, crop, admin_zone, stats, method, targ
        mask_region = df_fewsnet[admin_zone] == region
        mask_yield = (
            df_fewsnet["crop_production_system"].isin(
-                [
+                [
+                    "none",
+                    "Small-scale (PS)",
+                    "Commercial (PS)",
+                    "All (PS)",
+                    "irrigated",
+                    "rainfed",
+                ]
            )
            & (df_fewsnet["harvest_year"] == harvest_year)
            & (df_fewsnet["product"] == crop)
            & df_fewsnet["season_name"].isin(
-                [
+                [
+                    "Main",
+                    "Meher",
+                    "Main harvest",
+                    "Annual",
+                    "Summer",
+                    "Spring",
+                    "Winter",
+                ]
            )
            & (df_fewsnet["indicator"].isin(["yield", "area", "production"]))
        )
geocif/ml/trainers.py

@@ -84,7 +84,9 @@ def optuna_objective(model, df, feature_names, target_col, cat_features=[]):
    y = df[target_col]

    # Divide the data into training and validation sets
-    train_X, val_X, train_y, val_y = train_test_split(
+    train_X, val_X, train_y, val_y = train_test_split(
+        X, y, test_size=0.2, random_state=0
+    )

    model.fit(
        train_X,

@@ -134,7 +136,9 @@ def optimized_model(
        params = {
            "depth": trial.suggest_int("depth", 1, 7),
            "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.1),
-            "iterations": trial.suggest_int(
+            "iterations": trial.suggest_int(
+                "iterations", low=1000, high=5000, step=500
+            ),
            "subsample": trial.suggest_float("subsample", 1.0, 1.0),
            "random_strength": trial.suggest_float("random_strength", 0.3, 1.0),
            "reg_lambda": trial.suggest_float("reg_lambda", 1e-8, 1.0),

@@ -177,7 +181,9 @@ def optimized_model(
    optuna.logging.set_verbosity(optuna.logging.WARNING) # Disable verbose
    sampler = optuna.samplers.TPESampler(seed=seed)
    study = optuna.create_study(sampler=sampler, direction="minimize")
-    study.optimize(
+    study.optimize(
+        _optuna_objective, n_trials=n_trials, n_jobs=int(mp.cpu_count() * 0.4)
+    )
    if study.best_trial is None:
        raise ValueError("Optimization failed to complete any trials.")
    hyperparams = study.best_trial.params
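The trainer changes only re-wrap the Optuna calls; the pattern is a TPE-sampled study that minimizes a validation metric and then reads `study.best_trial.params`. A self-contained sketch with a toy objective; the search space bounds match the hunk above, but the loss function and trial count are illustrative, not the package's defaults:

```python
import optuna

def objective(trial):
    # Stand-in for the CatBoost validation loss used in trainers.py
    depth = trial.suggest_int("depth", 1, 7)
    learning_rate = trial.suggest_float("learning_rate", 0.01, 0.1)
    iterations = trial.suggest_int("iterations", low=1000, high=5000, step=500)
    # Toy loss with a known minimum, just to exercise the sampler
    return (depth - 4) ** 2 + (learning_rate - 0.05) ** 2 + abs(iterations - 2000) / 1000

optuna.logging.set_verbosity(optuna.logging.WARNING)
sampler = optuna.samplers.TPESampler(seed=42)
study = optuna.create_study(sampler=sampler, direction="minimize")
study.optimize(objective, n_trials=25)

hyperparams = study.best_trial.params
print(hyperparams)
```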
geocif/ml/trend.py

@@ -74,7 +74,9 @@ def compute_trend(detrended_data, future_time_points=None):
    model = detrended_data.trend_model[0]

    if model_type == "mean":
-        trend_component = model.predict(
+        trend_component = model.predict(
+            np.ones(len(future_time_points)), has_constant="add"
+        )
    elif model_type == "linear":
        X_linear = add_constant(future_time_points, has_constant="add")
        trend_component = model.predict(X_linear)
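For the `compute_trend` branch above, the linear case fits an OLS trend on time and predicts it at future time points via `add_constant`. A minimal sketch of that linear branch with statsmodels; the data is synthetic, and the `mean` branch in the diff depends on how geocif stores its fitted trend model, so it is not reproduced here:

```python
import numpy as np
import statsmodels.api as sm

# Synthetic yield series with a mild upward trend
time_points = np.arange(2000, 2020)
y = 2.0 + 0.03 * (time_points - 2000) + np.random.default_rng(0).normal(0, 0.1, 20)

# Fit the linear trend model on a constant plus time
model = sm.OLS(y, sm.add_constant(time_points)).fit()

# Predict the trend component at future time points
future_time_points = np.arange(2020, 2025)
X_linear = sm.add_constant(future_time_points, has_constant="add")
trend_component = model.predict(X_linear)
print(trend_component)
```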