geocif 0.1.31__tar.gz → 0.1.32__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {geocif-0.1.31/geocif.egg-info → geocif-0.1.32}/PKG-INFO +1 -1
- {geocif-0.1.31 → geocif-0.1.32}/geocif/analysis.py +448 -213
- {geocif-0.1.31 → geocif-0.1.32}/geocif/geocif.py +47 -17
- {geocif-0.1.31 → geocif-0.1.32}/geocif/indices_runner.py +10 -8
- {geocif-0.1.31 → geocif-0.1.32}/geocif/ml/spatial_autocorrelation.py +10 -6
- {geocif-0.1.31 → geocif-0.1.32}/geocif/viz/plot.py +3 -5
- {geocif-0.1.31 → geocif-0.1.32/geocif.egg-info}/PKG-INFO +1 -1
- {geocif-0.1.31 → geocif-0.1.32}/setup.py +1 -1
- {geocif-0.1.31 → geocif-0.1.32}/LICENSE +0 -0
- {geocif-0.1.31 → geocif-0.1.32}/MANIFEST.in +0 -0
- {geocif-0.1.31 → geocif-0.1.32}/README.md +0 -0
- {geocif-0.1.31 → geocif-0.1.32}/geocif/__init__.py +0 -0
- {geocif-0.1.31 → geocif-0.1.32}/geocif/agmet/__init__.py +0 -0
- {geocif-0.1.31 → geocif-0.1.32}/geocif/agmet/geoagmet.py +0 -0
- {geocif-0.1.31 → geocif-0.1.32}/geocif/agmet/plot.py +0 -0
- {geocif-0.1.31 → geocif-0.1.32}/geocif/agmet/utils.py +0 -0
- {geocif-0.1.31 → geocif-0.1.32}/geocif/backup/__init__.py +0 -0
- {geocif-0.1.31 → geocif-0.1.32}/geocif/backup/constants.py +0 -0
- {geocif-0.1.31 → geocif-0.1.32}/geocif/backup/features.py +0 -0
- {geocif-0.1.31 → geocif-0.1.32}/geocif/backup/geo.py +0 -0
- {geocif-0.1.31 → geocif-0.1.32}/geocif/backup/geocif.py +0 -0
- {geocif-0.1.31 → geocif-0.1.32}/geocif/backup/metadata.py +0 -0
- {geocif-0.1.31 → geocif-0.1.32}/geocif/backup/models.py +0 -0
- {geocif-0.1.31 → geocif-0.1.32}/geocif/cei/__init__.py +0 -0
- {geocif-0.1.31 → geocif-0.1.32}/geocif/cei/definitions.py +0 -0
- {geocif-0.1.31 → geocif-0.1.32}/geocif/cei/indices.py +0 -0
- {geocif-0.1.31 → geocif-0.1.32}/geocif/logger.py +0 -0
- {geocif-0.1.31 → geocif-0.1.32}/geocif/ml/__init__.py +0 -0
- {geocif-0.1.31 → geocif-0.1.32}/geocif/ml/correlations.py +0 -0
- {geocif-0.1.31 → geocif-0.1.32}/geocif/ml/embedding.py +0 -0
- {geocif-0.1.31 → geocif-0.1.32}/geocif/ml/feature_engineering.py +0 -0
- {geocif-0.1.31 → geocif-0.1.32}/geocif/ml/feature_selection.py +0 -0
- {geocif-0.1.31 → geocif-0.1.32}/geocif/ml/outliers.py +0 -0
- {geocif-0.1.31 → geocif-0.1.32}/geocif/ml/outlook.py +0 -0
- {geocif-0.1.31 → geocif-0.1.32}/geocif/ml/output.py +0 -0
- {geocif-0.1.31 → geocif-0.1.32}/geocif/ml/stages.py +0 -0
- {geocif-0.1.31 → geocif-0.1.32}/geocif/ml/stats.py +0 -0
- {geocif-0.1.31 → geocif-0.1.32}/geocif/ml/trainers.py +0 -0
- {geocif-0.1.31 → geocif-0.1.32}/geocif/ml/trend.py +0 -0
- {geocif-0.1.31 → geocif-0.1.32}/geocif/ml/xai.py +0 -0
- {geocif-0.1.31 → geocif-0.1.32}/geocif/playground/__init__.py +0 -0
- {geocif-0.1.31 → geocif-0.1.32}/geocif/playground/automl.py +0 -0
- {geocif-0.1.31 → geocif-0.1.32}/geocif/playground/misc.py +0 -0
- {geocif-0.1.31 → geocif-0.1.32}/geocif/utils.py +0 -0
- {geocif-0.1.31 → geocif-0.1.32}/geocif/viz/__init__.py +0 -0
- {geocif-0.1.31 → geocif-0.1.32}/geocif.egg-info/SOURCES.txt +0 -0
- {geocif-0.1.31 → geocif-0.1.32}/geocif.egg-info/dependency_links.txt +0 -0
- {geocif-0.1.31 → geocif-0.1.32}/geocif.egg-info/not-zip-safe +0 -0
- {geocif-0.1.31 → geocif-0.1.32}/geocif.egg-info/top_level.txt +0 -0
- {geocif-0.1.31 → geocif-0.1.32}/requirements.txt +0 -0
- {geocif-0.1.31 → geocif-0.1.32}/setup.cfg +0 -0
- {geocif-0.1.31 → geocif-0.1.32}/tests/test_geocif.py +0 -0
@@ -156,48 +156,78 @@ class Geoanalysis:
|
|
156
156
|
|
157
157
|
def analyze(self):
|
158
158
|
self.logger.info(f"Analyze {self.country} {self.crop}")
|
159
|
-
# Remove rows with missing values in Observed Yield (tn per ha)
|
160
|
-
df = self.df_analysis.dropna(subset=["Observed Yield (tn per ha)"])
|
161
159
|
|
160
|
+
df = self._clean_data()
|
162
161
|
if df.empty:
|
163
162
|
return pd.DataFrame(), pd.DataFrame()
|
164
163
|
|
165
|
-
|
166
|
-
|
167
|
-
|
168
|
-
|
169
|
-
|
164
|
+
df_metrics = self._compute_metrics(df)
|
165
|
+
df_metrics = self._process_metrics(df_metrics)
|
166
|
+
self._plot_metrics(df_metrics)
|
167
|
+
|
168
|
+
df_regional_metrics_by_year = self._compute_regional_metrics(
|
169
|
+
df, by="Harvest Year"
|
170
|
+
)
|
171
|
+
df_regional_metrics_by_year = self._select_top_years(
|
172
|
+
df_regional_metrics_by_year
|
173
|
+
)
|
174
|
+
df_regional_metrics = self._average_mape(df_regional_metrics_by_year)
|
175
|
+
|
176
|
+
self._store_results(
|
177
|
+
df_metrics, df_regional_metrics, df_regional_metrics_by_year
|
178
|
+
)
|
179
|
+
|
180
|
+
df_national_yield = self._compute_national_yield(df)
|
181
|
+
self._plot_national_yield(df_national_yield)
|
182
|
+
|
183
|
+
return df_metrics, df_regional_metrics, df_national_yield
|
170
184
|
|
171
|
-
|
185
|
+
def _clean_data(self):
|
186
|
+
# Remove rows with missing values in Observed Yield (tn per ha)
|
187
|
+
return self.df_analysis.dropna(subset=["Observed Yield (tn per ha)"])
|
188
|
+
|
189
|
+
def _compute_metrics(self, df):
|
190
|
+
# For each Harvest Year, Stages combination, compute metrics
|
191
|
+
df_metrics = (
|
192
|
+
df.groupby(
|
193
|
+
["Country", "Model", "Harvest Year", "Stage Name", "Stage Range"]
|
194
|
+
)
|
195
|
+
.apply(self.annual_metrics)
|
196
|
+
.reset_index()
|
197
|
+
)
|
198
|
+
|
199
|
+
return df_metrics.pivot_table(
|
200
|
+
index=["Country", "Model", "Harvest Year", "Stage Name", "Stage Range"],
|
201
|
+
columns="level_5",
|
202
|
+
values=0,
|
203
|
+
).reset_index()
|
204
|
+
|
205
|
+
def _process_metrics(self, df_metrics):
|
172
206
|
# Assign each unique Stage Name a unique integer identifier
|
173
207
|
df_metrics["Stage_ID"] = pd.Categorical(df_metrics["Stage Name"]).codes
|
174
208
|
|
175
209
|
# Order by Harvest Year and Number Stages (ascending)
|
176
|
-
df_metrics = df_metrics.sort_values(
|
177
|
-
by=["Harvest Year", "Stage_ID"], ascending=[True, True]
|
178
|
-
)
|
210
|
+
df_metrics = df_metrics.sort_values(by=["Harvest Year", "Stage_ID"])
|
179
211
|
|
180
212
|
# Add columns with the name of the country and crop
|
181
213
|
df_metrics["Country"] = self.country
|
182
214
|
df_metrics["Crop"] = self.crop
|
183
215
|
|
184
216
|
# Add stage information for plotting
|
185
|
-
|
217
|
+
return self.add_stage_information(df_metrics)
|
186
218
|
|
187
|
-
|
188
|
-
|
189
|
-
# breakpoint()
|
190
|
-
# df_metrics.to_csv(r'D:\Users\ritvik\projects\GEOGLAM\Output\fao\dekad\ml\analysis\February-28-2024\ethiopia_maize\ab1.csv')
|
191
|
-
for metric in [
|
219
|
+
def _plot_metrics(self, df_metrics):
|
220
|
+
metrics = [
|
192
221
|
"Root Mean Square Error",
|
193
|
-
# "Nash-Sutcliff Efficiency",
|
194
222
|
"$r^2$",
|
195
223
|
"Mean Absolute Error",
|
196
224
|
"Mean Absolute\nPercentage Error",
|
197
225
|
"Percentage Bias",
|
198
|
-
]
|
226
|
+
]
|
227
|
+
for metric in metrics:
|
199
228
|
self.plot_metric(df_metrics, metric)
|
200
229
|
|
230
|
+
def _compute_regional_metrics(self, df, by=None):
|
201
231
|
cols = [
|
202
232
|
"Country",
|
203
233
|
"Region",
|
@@ -208,64 +238,193 @@ class Geoanalysis:
|
|
208
238
|
"Stage Range",
|
209
239
|
]
|
210
240
|
|
211
|
-
|
212
|
-
|
213
|
-
|
214
|
-
.apply(self.regional_metrics)
|
215
|
-
.reset_index()
|
216
|
-
)
|
241
|
+
if by:
|
242
|
+
return df.groupby(cols + [by]).apply(self.regional_metrics).reset_index()
|
243
|
+
else:
|
244
|
+
return df.groupby(cols).apply(self.regional_metrics).reset_index()
|
217
245
|
|
218
|
-
|
219
|
-
|
220
|
-
df_regional_metrics = (
|
246
|
+
def _select_top_years(self, df_regional_metrics):
|
247
|
+
return (
|
221
248
|
df_regional_metrics.groupby(["Country", "Region"])
|
222
249
|
.apply(lambda x: self.select_top_N_years(x, 10))
|
223
250
|
.reset_index(drop=True)
|
224
251
|
)
|
225
252
|
|
226
|
-
|
227
|
-
df_regional_metrics = (
|
228
|
-
df_regional_metrics.groupby(cols)["Mean Absolute Percentage Error"]
|
229
|
-
.mean()
|
230
|
-
.reset_index()
|
231
|
-
)
|
232
|
-
|
233
|
-
# Create an index based on following columns
|
253
|
+
def _average_mape(self, df_regional_metrics):
|
234
254
|
cols = [
|
235
255
|
"Country",
|
236
|
-
"
|
256
|
+
"Region",
|
257
|
+
"% of total Area (ha)",
|
237
258
|
"Model",
|
238
|
-
"
|
259
|
+
"Crop",
|
239
260
|
"Stage Name",
|
261
|
+
"Stage Range",
|
240
262
|
]
|
263
|
+
return (
|
264
|
+
df_regional_metrics.groupby(cols)["Mean Absolute Percentage Error"]
|
265
|
+
.mean()
|
266
|
+
.reset_index()
|
267
|
+
)
|
268
|
+
|
269
|
+
def _store_results(
|
270
|
+
self, df_metrics, df_regional_metrics, df_regional_metrics_by_year
|
271
|
+
):
|
272
|
+
# Create an index based on specific columns
|
241
273
|
df_metrics.index = df_metrics.apply(
|
242
|
-
lambda row: "_".join(
|
274
|
+
lambda row: "_".join(
|
275
|
+
[
|
276
|
+
str(row[col])
|
277
|
+
for col in [
|
278
|
+
"Country",
|
279
|
+
"Crop",
|
280
|
+
"Model",
|
281
|
+
"Harvest Year",
|
282
|
+
"Stage Name",
|
283
|
+
]
|
284
|
+
]
|
285
|
+
),
|
286
|
+
axis=1,
|
243
287
|
)
|
244
288
|
df_metrics.index.set_names(["Index"], inplace=True)
|
245
289
|
|
246
|
-
cols = [
|
247
|
-
"Country",
|
248
|
-
"Region",
|
249
|
-
"Model",
|
250
|
-
"Crop",
|
251
|
-
"Stage Name",
|
252
|
-
]
|
253
290
|
df_regional_metrics.index = df_regional_metrics.apply(
|
254
|
-
lambda row: "_".join(
|
291
|
+
lambda row: "_".join(
|
292
|
+
[
|
293
|
+
str(row[col])
|
294
|
+
for col in ["Country", "Region", "Model", "Crop", "Stage Name"]
|
295
|
+
]
|
296
|
+
),
|
297
|
+
axis=1,
|
255
298
|
)
|
256
299
|
df_regional_metrics.index.set_names(["Index"], inplace=True)
|
257
300
|
|
301
|
+
df_regional_metrics_by_year.index = df_regional_metrics_by_year.apply(
|
302
|
+
lambda row: "_".join(
|
303
|
+
[
|
304
|
+
str(row[col])
|
305
|
+
for col in [
|
306
|
+
"Country",
|
307
|
+
"Region",
|
308
|
+
"Model",
|
309
|
+
"Crop",
|
310
|
+
"Stage Name",
|
311
|
+
"Harvest Year",
|
312
|
+
]
|
313
|
+
]
|
314
|
+
),
|
315
|
+
axis=1,
|
316
|
+
)
|
317
|
+
df_regional_metrics_by_year.index.set_names(["Index"], inplace=True)
|
318
|
+
|
258
319
|
# Format with 3 places after the decimal point
|
259
320
|
df_metrics = df_metrics.round(3)
|
260
321
|
df_regional_metrics = df_regional_metrics.round(3)
|
322
|
+
df_regional_metrics_by_year = df_regional_metrics_by_year.round(3)
|
261
323
|
|
262
324
|
# Store results in database
|
263
|
-
|
264
|
-
|
265
|
-
|
325
|
+
with sqlite3.connect(self.db_path) as con:
|
326
|
+
utils.to_db(self.db_path, "country_metrics", df_metrics)
|
327
|
+
utils.to_db(self.db_path, "regional_metrics", df_regional_metrics)
|
328
|
+
utils.to_db(
|
329
|
+
self.db_path, "regional_metrics_by_year", df_regional_metrics_by_year
|
330
|
+
)
|
266
331
|
|
267
|
-
|
268
|
-
|
332
|
+
con.commit()
|
333
|
+
|
334
|
+
def _compute_national_yield(self, df):
|
335
|
+
# Compute observed and predicted national yield by multiplying Yield (tn per ha) by Area (ha)
|
336
|
+
observed = "Observed Yield (tn per ha)"
|
337
|
+
predicted = "Predicted Yield (tn per ha)"
|
338
|
+
area_ha = "Area (ha)"
|
339
|
+
|
340
|
+
df.loc[:, observed] = df[observed] * df[area_ha]
|
341
|
+
df.loc[:, predicted] = df[predicted] * df[area_ha]
|
342
|
+
|
343
|
+
# Group by Country and Harvest Year, then sum the National Yield and Area
|
344
|
+
df_national_yield = (
|
345
|
+
df.groupby(["Country", "Harvest Year"])
|
346
|
+
.agg({observed: "sum", predicted: "sum", area_ha: "sum"})
|
347
|
+
.reset_index()
|
348
|
+
)
|
349
|
+
|
350
|
+
# Compute observed and predicted yield per ha for each Harvest Year
|
351
|
+
df_national_yield[observed] = (
|
352
|
+
df_national_yield[observed] / df_national_yield[area_ha]
|
353
|
+
)
|
354
|
+
df_national_yield[predicted] = (
|
355
|
+
df_national_yield[predicted] / df_national_yield[area_ha]
|
356
|
+
)
|
357
|
+
|
358
|
+
return df_national_yield
|
359
|
+
|
360
|
+
def _plot_national_yield(self, df_national_yield, use_different_colors=True):
|
361
|
+
from sklearn.metrics import (
|
362
|
+
mean_squared_error,
|
363
|
+
r2_score,
|
364
|
+
mean_absolute_percentage_error,
|
365
|
+
)
|
366
|
+
|
367
|
+
x = df_national_yield["Harvest Year"]
|
368
|
+
y_observed = df_national_yield["Observed Yield (tn per ha)"]
|
369
|
+
y_predicted = df_national_yield["Predicted Yield (tn per ha)"]
|
370
|
+
|
371
|
+
with plt.style.context("science"):
|
372
|
+
plt.figure(figsize=(10, 6))
|
373
|
+
|
374
|
+
import palettable as pal
|
375
|
+
|
376
|
+
colors = pal.tableau.Tableau_20.mpl_colors
|
377
|
+
colors = colors[: len(x)]
|
378
|
+
|
379
|
+
# Add dashed gray grid lines with alpha=0.5
|
380
|
+
plt.grid(True, linestyle="--", alpha=0.5)
|
381
|
+
|
382
|
+
for i in range(len(x)):
|
383
|
+
plt.scatter(y_observed[i], y_predicted[i], color=colors[i], label=x[i])
|
384
|
+
|
385
|
+
# X and Y-axis range from 0 to the maximum observed/predicted yield * 1.1
|
386
|
+
max_yield = max(y_observed.max(), y_predicted.max()) * 1.25
|
387
|
+
plt.xlim(0, max_yield)
|
388
|
+
plt.ylim(0, max_yield)
|
389
|
+
|
390
|
+
# Add a line diagonally representing 1:1
|
391
|
+
plt.plot([0, max_yield], [0, max_yield], color="gray", linestyle="--")
|
392
|
+
|
393
|
+
# Calculate metrics
|
394
|
+
rmse = np.sqrt(mean_squared_error(y_observed, y_predicted))
|
395
|
+
mape = mean_absolute_percentage_error(y_observed, y_predicted)
|
396
|
+
r2 = r2_score(y_observed, y_predicted)
|
397
|
+
|
398
|
+
# Annotate metrics
|
399
|
+
textstr = "\n".join(
|
400
|
+
(
|
401
|
+
f"RMSE: {rmse:.2f} tn/ha",
|
402
|
+
f"MAPE: {mape:.2%}",
|
403
|
+
f"R²: {r2:.2f}",
|
404
|
+
)
|
405
|
+
)
|
406
|
+
plt.gca().annotate(
|
407
|
+
textstr,
|
408
|
+
xy=(0.05, 0.95),
|
409
|
+
xycoords="axes fraction",
|
410
|
+
fontsize=12,
|
411
|
+
verticalalignment="top",
|
412
|
+
)
|
413
|
+
|
414
|
+
plt.xlabel("Observed Yield (tn/ha)")
|
415
|
+
plt.ylabel("Predicted Yield (tn/ha)")
|
416
|
+
# Place legend outside the plot to the right without a border
|
417
|
+
plt.legend(
|
418
|
+
title="Year",
|
419
|
+
bbox_to_anchor=(1.05, 1),
|
420
|
+
loc="upper left",
|
421
|
+
edgecolor="none",
|
422
|
+
)
|
423
|
+
plt.tight_layout()
|
424
|
+
|
425
|
+
fname = f"scatter_{self.country}_{self.crop}.png"
|
426
|
+
plt.savefig(self.dir_analysis / fname, dpi=250)
|
427
|
+
plt.close()
|
269
428
|
|
270
429
|
def get_historic_production(self):
|
271
430
|
# Read in historic production data
|
@@ -342,148 +501,6 @@ class Geoanalysis:
|
|
342
501
|
|
343
502
|
return self.df_analysis
|
344
503
|
|
345
|
-
def map_regional(self):
|
346
|
-
con = sqlite3.connect(self.db_path)
|
347
|
-
|
348
|
-
# Read from database, where country and crop match
|
349
|
-
query = "SELECT * FROM country_metrics"
|
350
|
-
df_country = pd.read_sql_query(query, con)
|
351
|
-
query = "SELECT * FROM regional_metrics"
|
352
|
-
df_regional = pd.read_sql_query(query, con)
|
353
|
-
|
354
|
-
# Plot a histogram of the MAPE, different color for each country
|
355
|
-
# Plotting the histograms with KDE for each country
|
356
|
-
df_regional["Country"] = (
|
357
|
-
df_regional["Country"].str.replace("_", " ").str.title()
|
358
|
-
)
|
359
|
-
df_regional["Model"] = df_regional["Model"].str.title()
|
360
|
-
|
361
|
-
# Plotting the histogram with a smaller bin size for greater detail
|
362
|
-
# Plotting the KDE for each country, ensuring each step works
|
363
|
-
models = df_regional["Model"].unique()
|
364
|
-
for model in models:
|
365
|
-
df_model = df_regional[df_regional["Model"] == model]
|
366
|
-
|
367
|
-
# HACK: Drop rows where '% of total Area (ha)' is less than 1% and Mean Absolute Percentage Error is > 50%
|
368
|
-
# or where the Mean Absolute Percentage Error is greater than 50% if the '% of total Area (ha)' is greater than 1%
|
369
|
-
df_tmp = df_model[
|
370
|
-
(df_model["% of total Area (ha)"] < 1)
|
371
|
-
& (df_model["Mean Absolute Percentage Error"] > 50)
|
372
|
-
& (df_model["Country"].isin(["Angola", "United Republic Of Tanzania"]))
|
373
|
-
]
|
374
|
-
|
375
|
-
# Remove df_tmp from df_model
|
376
|
-
df_model = df_model.drop(df_tmp.index)
|
377
|
-
# Plot the histogram of MAPE
|
378
|
-
# Create bins for '% of total Area (ha)' and 'MAPE'
|
379
|
-
bin_edges = np.linspace(0, df_model["% of total Area (ha)"].max() + 1, 5 + 1)
|
380
|
-
|
381
|
-
df_model["Area Bins"] = pd.cut(
|
382
|
-
df_model["% of total Area (ha)"],
|
383
|
-
bins=bin_edges,
|
384
|
-
precision=0,
|
385
|
-
)
|
386
|
-
df_model["MAPE Bins"] = pd.cut(
|
387
|
-
df_model["Mean Absolute Percentage Error"],
|
388
|
-
bins=5, # [0, 5, 10, 15, 20, 25, 30, 50, max(df_model["Mean Absolute Percentage Error"])],
|
389
|
-
right=False,
|
390
|
-
precision=1,
|
391
|
-
)
|
392
|
-
|
393
|
-
# Count occurrences of MAPE values for each area bin
|
394
|
-
area_mape_counts = (
|
395
|
-
df_model.groupby(["Area Bins", "MAPE Bins"])
|
396
|
-
.size()
|
397
|
-
.unstack(fill_value=0)
|
398
|
-
)
|
399
|
-
|
400
|
-
# Create the heatmap
|
401
|
-
plt.figure(figsize=(10, 8))
|
402
|
-
ax = sns.heatmap(
|
403
|
-
area_mape_counts,
|
404
|
-
annot=True,
|
405
|
-
square=True,
|
406
|
-
cmap=pal.scientific.sequential.Bamako_20_r.mpl_colormap,
|
407
|
-
fmt="d",
|
408
|
-
)
|
409
|
-
# Do not color or annotate cells with 0
|
410
|
-
for text in ax.texts:
|
411
|
-
if text.get_text() == "0":
|
412
|
-
text.set_text("")
|
413
|
-
text.set_color("white")
|
414
|
-
|
415
|
-
# plt.title("Heatmap of MAPE Bins vs % Total Area Bins")
|
416
|
-
plt.ylabel("% of Total Area (ha) Bins")
|
417
|
-
plt.xlabel("MAPE Bins")
|
418
|
-
|
419
|
-
# Adjust y-axis labels to horizontal
|
420
|
-
ax.set_yticklabels(ax.get_yticklabels(), rotation=0)
|
421
|
-
|
422
|
-
# Invert y-axis to have the highest bin at the top
|
423
|
-
ax.invert_yaxis()
|
424
|
-
plt.savefig(self.dir_analysis / f"heatmap_{model}.png", dpi=250)
|
425
|
-
plt.close()
|
426
|
-
|
427
|
-
# Plot the KDE of MAPE
|
428
|
-
plt.figure(figsize=(12, 8))
|
429
|
-
for label, group_data in df_model.groupby("Country"):
|
430
|
-
sns.kdeplot(
|
431
|
-
group_data["Mean Absolute Percentage Error"],
|
432
|
-
label=label,
|
433
|
-
clip=(0, None),
|
434
|
-
# bins=len(group_data),
|
435
|
-
# kde=True,
|
436
|
-
)
|
437
|
-
|
438
|
-
# Add minor ticks on the x-axis
|
439
|
-
plt.minorticks_on()
|
440
|
-
|
441
|
-
# Setting the title and labels
|
442
|
-
plt.title(
|
443
|
-
f"Kernel Density Estimation of Mean Absolute Percentage Error by Country - {model}"
|
444
|
-
)
|
445
|
-
plt.xlabel("Mean Absolute Percentage Error (%)")
|
446
|
-
plt.ylabel("Density")
|
447
|
-
plt.legend(title="Country", title_fontsize="13")
|
448
|
-
plt.savefig(self.dir_analysis / f"mape_histogram_{model}.png", dpi=250)
|
449
|
-
plt.close()
|
450
|
-
|
451
|
-
# Map MAPE at regional level
|
452
|
-
df_model["Country Region"] = (
|
453
|
-
df_model["Country"].str.lower().str.replace("_", " ")
|
454
|
-
+ " "
|
455
|
-
+ df_model["Region"].str.lower()
|
456
|
-
)
|
457
|
-
|
458
|
-
fname = f"mape_{self.crop}_{model}.png"
|
459
|
-
col = "Mean Absolute Percentage Error"
|
460
|
-
countries = df_model["Country"].unique().tolist()
|
461
|
-
# make it title case and replace _ with space
|
462
|
-
countries = [country.title().replace("_", " ") for country in countries]
|
463
|
-
countries = ["Malawi"]
|
464
|
-
df_model = df_model[df_model["Country"].isin(countries)]
|
465
|
-
self.dg = self.dg[self.dg["ADM0_NAME"].isin(countries)]
|
466
|
-
plot.plot_df_shpfile(
|
467
|
-
self.dg, # dataframe containing adm1 name and polygon
|
468
|
-
df_model, # dataframe containing information that will be mapped
|
469
|
-
merge_col="Country Region", # Column on which to merge
|
470
|
-
name_country=countries,
|
471
|
-
name_col=col, # Which column to plot
|
472
|
-
dir_out=self.dir_analysis, # Output directory
|
473
|
-
fname=fname, # Output file name
|
474
|
-
label=f"MAPE (%)",
|
475
|
-
vmin=df_model[col].min(),
|
476
|
-
vmax=df_model[col].max(),
|
477
|
-
cmap=pal.scientific.sequential.Bamako_20_r,
|
478
|
-
series="sequential",
|
479
|
-
show_bg=False,
|
480
|
-
annotate_regions=False,
|
481
|
-
loc_legend="lower left",
|
482
|
-
)
|
483
|
-
|
484
|
-
con.commit()
|
485
|
-
con.close()
|
486
|
-
|
487
504
|
def map(self, df_plot):
|
488
505
|
# df_plot = self.df_analysis.copy()
|
489
506
|
models = df_plot["Model"].unique()
|
@@ -491,10 +508,6 @@ class Geoanalysis:
|
|
491
508
|
for model in models:
|
492
509
|
df_model = df_plot[df_plot["Model"] == model]
|
493
510
|
|
494
|
-
countries = ["malawi"]
|
495
|
-
df_model = df_model[df_model["Country"].isin(countries)]
|
496
|
-
self.dg = self.dg[self.dg["ADM0_NAME"].isin(["Malawi", "malawi"])]
|
497
|
-
|
498
511
|
countries = df_model["Country"].unique().tolist()
|
499
512
|
if len(countries) > 1:
|
500
513
|
self.dir_plot = self.dir_analysis
|
@@ -509,7 +522,6 @@ class Geoanalysis:
|
|
509
522
|
+ df_model["Region"].str.lower().str.replace("_", " ")
|
510
523
|
)
|
511
524
|
|
512
|
-
|
513
525
|
# Change Harvest year to type int
|
514
526
|
df_model["Harvest Year"] = df_model["Harvest Year"].astype(int)
|
515
527
|
annotate_region_column = (
|
@@ -525,8 +537,8 @@ class Geoanalysis:
|
|
525
537
|
df_time_period = df_harvest_year[
|
526
538
|
df_harvest_year["Stage Name"] == time_period
|
527
539
|
]
|
528
|
-
|
529
|
-
|
540
|
+
#
|
541
|
+
# """ % of total area """
|
530
542
|
if idx == 0:
|
531
543
|
fname = f"{self.country}_{self.crop}_perc_area.png"
|
532
544
|
col = "% of total Area (ha)"
|
@@ -548,15 +560,16 @@ class Geoanalysis:
|
|
548
560
|
annotate_region_column=annotate_region_column,
|
549
561
|
loc_legend="lower left",
|
550
562
|
)
|
551
|
-
|
552
|
-
|
563
|
+
#
|
564
|
+
# # """ Unique regions """
|
553
565
|
fname = f"{self.country}_{self.crop}_region_ID.png"
|
554
566
|
col = "Region_ID"
|
555
567
|
df_model[col] = df_model[col].astype(int) + 1
|
556
568
|
if len(df_model["Region_ID"].unique() > 1):
|
557
569
|
# Create a dictionary with each region assigned a unique integer identifier and name
|
558
570
|
dict_region = {
|
559
|
-
int(key): key
|
571
|
+
int(key): key
|
572
|
+
for key in df_time_period["Region_ID"].unique()
|
560
573
|
}
|
561
574
|
plot.plot_df_shpfile(
|
562
575
|
self.dg, # dataframe containing adm1 name and polygon
|
@@ -579,7 +592,7 @@ class Geoanalysis:
|
|
579
592
|
annotate_region_column=annotate_region_column,
|
580
593
|
loc_legend="lower left",
|
581
594
|
)
|
582
|
-
|
595
|
+
# breakpoint()
|
583
596
|
|
584
597
|
# """ Anomaly """
|
585
598
|
# fname = (
|
@@ -690,10 +703,10 @@ class Geoanalysis:
|
|
690
703
|
|
691
704
|
def execute(self):
|
692
705
|
self.query()
|
693
|
-
|
706
|
+
df = self.preprocess()
|
694
707
|
self.analyze()
|
695
708
|
|
696
|
-
return
|
709
|
+
return df
|
697
710
|
|
698
711
|
def get_config_data(self):
|
699
712
|
try:
|
@@ -737,7 +750,6 @@ class Geoanalysis:
|
|
737
750
|
|
738
751
|
"""
|
739
752
|
self.dict_config = {}
|
740
|
-
self.get_config_data()
|
741
753
|
|
742
754
|
self.observed = "Observed Yield (tn per ha)"
|
743
755
|
self.predicted = "Predicted Yield (tn per ha)"
|
@@ -798,12 +810,232 @@ class Geoanalysis:
|
|
798
810
|
self.dg["ADM0_NAME"] + " " + self.dg["ADM2_NAME"]
|
799
811
|
)
|
800
812
|
# Make it lower case
|
801
|
-
self.dg["Country Region"] =
|
813
|
+
self.dg["Country Region"] = (
|
814
|
+
self.dg["Country Region"].str.lower().replace("_", " ")
|
815
|
+
)
|
816
|
+
|
817
|
+
|
818
|
+
@dataclass
|
819
|
+
class RegionalMapper(Geoanalysis):
|
820
|
+
path_config_files: List[Path] = field(default_factory=list)
|
821
|
+
logger: log = None
|
822
|
+
parser: ConfigParser = field(default_factory=ConfigParser)
|
823
|
+
|
824
|
+
def __post_init__(self):
|
825
|
+
# Call the parent class constructor
|
826
|
+
super().__post_init__()
|
827
|
+
self.get_config_data()
|
828
|
+
self.setup()
|
829
|
+
|
830
|
+
def map_regional(self):
|
831
|
+
"""Main function to read data and generate plots."""
|
832
|
+
self.read_data()
|
833
|
+
|
834
|
+
self.clean_data()
|
835
|
+
self.plot_heatmap()
|
836
|
+
self.plot_kde()
|
837
|
+
self.plot_mape_map()
|
838
|
+
self.plot_mape_by_year()
|
839
|
+
|
840
|
+
def read_data(self):
|
841
|
+
"""Read data from the database."""
|
842
|
+
con = sqlite3.connect(self.db_path)
|
843
|
+
|
844
|
+
query = "SELECT * FROM regional_metrics"
|
845
|
+
self.df_regional = pd.read_sql_query(query, con)
|
846
|
+
query = "SELECT * FROM regional_metrics_by_year"
|
847
|
+
self.df_regional_by_year = pd.read_sql_query(query, con)
|
848
|
+
|
849
|
+
con.close()
|
850
|
+
|
851
|
+
def clean_data(self):
|
852
|
+
"""Clean and format the data."""
|
853
|
+
self.df_regional["Country"] = (
|
854
|
+
self.df_regional["Country"].str.replace("_", " ").str.title()
|
855
|
+
)
|
856
|
+
self.df_regional["Model"] = self.df_regional["Model"].str.title()
|
857
|
+
|
858
|
+
def plot_heatmap(self):
|
859
|
+
"""Generate heatmaps of MAPE bins vs. % total area bins."""
|
860
|
+
models = self.df_regional["Model"].unique()
|
861
|
+
for model in models:
|
862
|
+
df_model = self.df_regional[self.df_regional["Model"] == model]
|
863
|
+
|
864
|
+
# HACK: Drop rows where '% of total Area (ha)' is less than 1% and Mean Absolute Percentage Error is > 50%
|
865
|
+
# or where the Mean Absolute Percentage Error is greater than 50% if the '% of total Area (ha)' is greater than 1%
|
866
|
+
df_tmp = df_model[
|
867
|
+
(df_model["% of total Area (ha)"] < 0.5)
|
868
|
+
& (df_model["Mean Absolute Percentage Error"] > 100)
|
869
|
+
]
|
870
|
+
|
871
|
+
df_model = df_model.drop(df_tmp.index)
|
872
|
+
bin_edges = np.linspace(0, df_model["% of total Area (ha)"].max() + 1, 6)
|
873
|
+
df_model["Area Bins"] = pd.cut(
|
874
|
+
df_model["% of total Area (ha)"], bins=bin_edges, precision=0
|
875
|
+
)
|
876
|
+
df_model["MAPE Bins"] = pd.cut(
|
877
|
+
df_model["Mean Absolute Percentage Error"],
|
878
|
+
bins=5,
|
879
|
+
right=False,
|
880
|
+
precision=1,
|
881
|
+
)
|
882
|
+
area_mape_counts = (
|
883
|
+
df_model.groupby(["Area Bins", "MAPE Bins"])
|
884
|
+
.size()
|
885
|
+
.unstack(fill_value=0)
|
886
|
+
)
|
887
|
+
self._plot_heatmap(area_mape_counts, model)
|
888
|
+
|
889
|
+
def _plot_heatmap(self, area_mape_counts, model):
|
890
|
+
"""
|
891
|
+
Plot heatmap helper function
|
892
|
+
Args:
|
893
|
+
area_mape_counts:
|
894
|
+
model:
|
895
|
+
|
896
|
+
Returns:
|
897
|
+
|
898
|
+
"""
|
899
|
+
plt.figure(figsize=(10, 8))
|
900
|
+
|
901
|
+
ax = sns.heatmap(
|
902
|
+
area_mape_counts,
|
903
|
+
annot=True,
|
904
|
+
square=True,
|
905
|
+
cmap=pal.scientific.sequential.Bamako_20_r.mpl_colormap,
|
906
|
+
fmt="d",
|
907
|
+
)
|
908
|
+
for text in ax.texts:
|
909
|
+
if text.get_text() == "0":
|
910
|
+
text.set_text("")
|
911
|
+
text.set_color("white")
|
912
|
+
plt.ylabel("% of Total Area (ha) Bins")
|
913
|
+
plt.xlabel("MAPE Bins")
|
914
|
+
ax.set_yticklabels(ax.get_yticklabels(), rotation=0)
|
915
|
+
ax.invert_yaxis()
|
916
|
+
|
917
|
+
plt.tight_layout()
|
918
|
+
plt.savefig(self.dir_analysis / f"heatmap_{model}.png", dpi=250)
|
919
|
+
plt.close()
|
920
|
+
|
921
|
+
def plot_kde(self):
|
922
|
+
"""Generate KDE plots of MAPE for each country."""
|
923
|
+
models = self.df_regional["Model"].unique()
|
924
|
+
|
925
|
+
for model in models:
|
926
|
+
df_model = self.df_regional[self.df_regional["Model"] == model]
|
927
|
+
|
928
|
+
# HACK: Drop rows where '% of total Area (ha)' is less than 1% and Mean Absolute Percentage Error is > 50%
|
929
|
+
# or where the Mean Absolute Percentage Error is greater than 50% if the '% of total Area (ha)' is greater than 1%
|
930
|
+
df_tmp = df_model[
|
931
|
+
(df_model["% of total Area (ha)"] < 0.5)
|
932
|
+
& (df_model["Mean Absolute Percentage Error"] > 100)
|
933
|
+
]
|
934
|
+
|
935
|
+
df_model = df_model.drop(df_tmp.index)
|
936
|
+
|
937
|
+
with plt.style.context("science"):
|
938
|
+
plt.figure(figsize=(12, 8))
|
939
|
+
for label, group_data in df_model.groupby("Country"):
|
940
|
+
sns.histplot(
|
941
|
+
group_data["Mean Absolute Percentage Error"],
|
942
|
+
label=label,
|
943
|
+
# clip=(0, None),
|
944
|
+
)
|
945
|
+
|
946
|
+
# Plot a dashed gray line at x=20
|
947
|
+
plt.axvline(x=20, color="gray", linestyle="--")
|
948
|
+
|
949
|
+
plt.minorticks_on()
|
950
|
+
plt.xlabel("Mean Absolute Percentage Error (%)")
|
951
|
+
plt.ylabel("Frequency")
|
952
|
+
plt.legend(title="Country", title_fontsize="13")
|
953
|
+
|
954
|
+
plt.tight_layout()
|
955
|
+
plt.savefig(self.dir_analysis / f"mape_histogram_{model}.png", dpi=250)
|
956
|
+
plt.close()
|
957
|
+
|
958
|
+
def plot_mape_map(self):
|
959
|
+
"""Plot the map of MAPE."""
|
960
|
+
self.df_regional["Country Region"] = (
|
961
|
+
self.df_regional["Country"].str.lower().str.replace("_", " ")
|
962
|
+
+ " "
|
963
|
+
+ self.df_regional["Region"].str.lower()
|
964
|
+
)
|
965
|
+
models = self.df_regional["Model"].unique()
|
966
|
+
|
967
|
+
for model in models:
|
968
|
+
df_model = self.df_regional[self.df_regional["Model"] == model]
|
969
|
+
|
970
|
+
# HACK: Drop rows where '% of total Area (ha)' is less than 1% and Mean Absolute Percentage Error is > 50%
|
971
|
+
# or where the Mean Absolute Percentage Error is greater than 50% if the '% of total Area (ha)' is greater than 1%
|
972
|
+
df_tmp = df_model[
|
973
|
+
(df_model["% of total Area (ha)"] < 0.5)
|
974
|
+
& (df_model["Mean Absolute Percentage Error"] > 100)
|
975
|
+
]
|
976
|
+
|
977
|
+
df_model = df_model.drop(df_tmp.index)
|
978
|
+
|
979
|
+
fname = f"mape_{self.crop}_{df_model['Model'].iloc[0]}.png"
|
980
|
+
col = "Mean Absolute Percentage Error"
|
981
|
+
countries = df_model["Country"].unique().tolist()
|
982
|
+
countries = [country.title().replace("_", " ") for country in countries]
|
983
|
+
df = df_model[df_model["Country"].isin(countries)]
|
984
|
+
self.dg = self.dg[self.dg["ADM0_NAME"].isin(countries)]
|
985
|
+
|
986
|
+
plot.plot_df_shpfile(
|
987
|
+
self.dg,
|
988
|
+
df,
|
989
|
+
merge_col="Country Region",
|
990
|
+
name_country=countries,
|
991
|
+
name_col=col,
|
992
|
+
dir_out=self.dir_analysis,
|
993
|
+
fname=fname,
|
994
|
+
label="MAPE (%)",
|
995
|
+
vmin=df[col].min(),
|
996
|
+
vmax=df[col].max(),
|
997
|
+
cmap=pal.scientific.sequential.Bamako_20_r,
|
998
|
+
series="sequential",
|
999
|
+
show_bg=False,
|
1000
|
+
annotate_regions=False,
|
1001
|
+
loc_legend="lower left",
|
1002
|
+
)
|
1003
|
+
|
1004
|
+
def plot_mape_by_year(self):
|
1005
|
+
"""Compute MAPE by year and plot using a bar chart."""
|
1006
|
+
# Compute the Mean Absolute Percentage Error (MAPE) by year
|
1007
|
+
mape_by_year = (
|
1008
|
+
self.df_regional_by_year.groupby("Harvest Year")[
|
1009
|
+
"Mean Absolute Percentage Error"
|
1010
|
+
]
|
1011
|
+
.mean()
|
1012
|
+
.reset_index()
|
1013
|
+
)
|
1014
|
+
|
1015
|
+
# Plot MAPE by year
|
1016
|
+
with plt.style.context("science"):
|
1017
|
+
plt.figure(figsize=(10, 6))
|
1018
|
+
sns.barplot(
|
1019
|
+
x="Harvest Year", y="Mean Absolute Percentage Error", data=mape_by_year
|
1020
|
+
)
|
1021
|
+
# Draw a dashed gray line at y=20
|
1022
|
+
plt.axhline(y=20, color="gray", linestyle="--")
|
1023
|
+
|
1024
|
+
plt.title("Mean Absolute Percentage Error by Year")
|
1025
|
+
plt.xlabel("Year")
|
1026
|
+
plt.ylabel("Mean Absolute Percentage Error (%)")
|
1027
|
+
plt.xticks(rotation=0)
|
1028
|
+
|
1029
|
+
plt.tight_layout()
|
1030
|
+
plt.savefig(self.dir_analysis / "mape_by_year.png", dpi=250)
|
1031
|
+
plt.close()
|
802
1032
|
|
803
1033
|
|
804
1034
|
def run(path_config_files=[Path("../config/geocif.txt")]):
|
805
1035
|
logger, parser = log.setup_logger_parser(path_config_files)
|
1036
|
+
|
806
1037
|
obj = Geoanalysis(path_config_files, logger, parser)
|
1038
|
+
obj.get_config_data()
|
807
1039
|
obj.setup()
|
808
1040
|
|
809
1041
|
""" Loop over each country, crop, model combination in dict_config """
|
@@ -826,11 +1058,14 @@ def run(path_config_files=[Path("../config/geocif.txt")]):
|
|
826
1058
|
df_tmp = obj.execute()
|
827
1059
|
frames.append(df_tmp)
|
828
1060
|
|
829
|
-
|
1061
|
+
df = pd.concat(frames)
|
1062
|
+
|
1063
|
+
""" Map regional error metrics """
|
1064
|
+
mapper = RegionalMapper(path_config_files, logger, parser)
|
1065
|
+
mapper.map_regional()
|
830
1066
|
|
831
|
-
|
832
|
-
|
833
|
-
obj.map(dk)
|
1067
|
+
""" For each country, plot yields, conditions, anomalies, etc. """
|
1068
|
+
obj.map(df)
|
834
1069
|
|
835
1070
|
|
836
1071
|
if __name__ == "__main__":
|
@@ -108,6 +108,7 @@ class Geocif:
|
|
108
108
|
Config file: ML
|
109
109
|
====================================================================
|
110
110
|
"""
|
111
|
+
self.use_ceis = ast.literal_eval(self.parser.get("ML", "use_ceis"))
|
111
112
|
self.model_type = self.parser.get("ML", "model_type")
|
112
113
|
self.fraction_simulate = self.parser.getint("ML", "fraction_simulate")
|
113
114
|
self.analogous_year_yield_as_feature = self.parser.getboolean(
|
@@ -149,7 +150,7 @@ class Geocif:
|
|
149
150
|
"""
|
150
151
|
# If ML model is run for individual region or cluster, then Region_ID is the same for each region
|
151
152
|
# or cluster and therefore redundant for the ML model
|
152
|
-
#if self.cluster_strategy in ["individual", "auto_detect"]:
|
153
|
+
# if self.cluster_strategy in ["individual", "auto_detect"]:
|
153
154
|
# self.cat_features.remove("Region_ID")
|
154
155
|
|
155
156
|
self.fixed_columns: list = [
|
@@ -223,7 +224,11 @@ class Geocif:
|
|
223
224
|
""" Update model to include conformal estimates """
|
224
225
|
X_train = df_region[self.selected_features + self.cat_features]
|
225
226
|
dir_output = (
|
226
|
-
self.dir_analysis
|
227
|
+
self.dir_analysis
|
228
|
+
/ self.country
|
229
|
+
/ self.crop
|
230
|
+
/ self.model_name
|
231
|
+
/ str(self.forecast_season)
|
227
232
|
)
|
228
233
|
|
229
234
|
region_id = df_region["Region_ID"].unique()[0]
|
@@ -275,7 +280,12 @@ class Geocif:
|
|
275
280
|
clusters_train = df_region["Region"]
|
276
281
|
clusters_train.reset_index(drop=True, inplace=True)
|
277
282
|
|
278
|
-
self.model.fit(
|
283
|
+
self.model.fit(
|
284
|
+
X_train,
|
285
|
+
Z_train,
|
286
|
+
clusters_train.astype("object"),
|
287
|
+
y_train.values,
|
288
|
+
)
|
279
289
|
elif self.model_name == "linear":
|
280
290
|
self.model.fit(X_train_scaled, y_train)
|
281
291
|
elif self.model_name == "gam":
|
@@ -327,7 +337,9 @@ class Geocif:
|
|
327
337
|
clusters_test = df_region["Region"]
|
328
338
|
clusters_test.reset_index(drop=True, inplace=True)
|
329
339
|
|
330
|
-
y_pred = self.model.predict(
|
340
|
+
y_pred = self.model.predict(
|
341
|
+
X_test, Z_test, clusters_test.astype("object")
|
342
|
+
)
|
331
343
|
best_hyperparameters = self.model.fe_model.get_params().copy()
|
332
344
|
else:
|
333
345
|
y_pred = self.model.predict(X_test)
|
@@ -609,6 +621,15 @@ class Geocif:
|
|
609
621
|
model = self.model
|
610
622
|
output.store(self.db_path, experiment_id, df, model, self.model_name)
|
611
623
|
|
624
|
+
def get_cei_column_names(self, df):
|
625
|
+
all_cei_columns = [
|
626
|
+
col
|
627
|
+
for col in df.columns
|
628
|
+
if col not in self.fixed_columns + [self.target] + self.statistics_columns
|
629
|
+
]
|
630
|
+
|
631
|
+
return all_cei_columns
|
632
|
+
|
612
633
|
def create_ml_dataframe(self, df):
|
613
634
|
"""
|
614
635
|
Create ML ready dataframe
|
@@ -650,23 +671,22 @@ class Geocif:
|
|
650
671
|
# Flatten the multi-index columns
|
651
672
|
df.columns = [f"{i}_{j}" if j != "" else f"{i}" for i, j in df.columns]
|
652
673
|
|
674
|
+
# Get all the columns apart from the fixed columns, target column and stats columns
|
675
|
+
all_cei_columns = self.get_cei_column_names(df)
|
676
|
+
parts = all_cei_columns[-1].split("_")
|
677
|
+
cei = parts[0] if parts[1].isdigit() else "_".join(parts[:2])
|
678
|
+
|
653
679
|
# HACK: Get feature name with GD4 in it to extract first and last stage id and name
|
654
|
-
|
655
|
-
# Select the longest string in
|
656
|
-
|
657
|
-
self.stage_info = stages.get_stage_information_dict(
|
680
|
+
cei_column = df[df.columns[df.columns.str.contains(cei)]].columns
|
681
|
+
# Select the longest string in cei_column
|
682
|
+
cei_col = max(cei_column, key=len)
|
683
|
+
self.stage_info = stages.get_stage_information_dict(cei_col, self.method)
|
658
684
|
|
659
685
|
# Change column name
|
660
686
|
# e.g. 'vDTR_7_6_5_4_3_2_1_37_36_35_34_33_32_31' to 'vDTR Mar 1-Oct 27'
|
661
687
|
df = stages.update_feature_names(df, self.method)
|
662
688
|
|
663
|
-
|
664
|
-
all_cei_columns = [
|
665
|
-
col
|
666
|
-
for col in df.columns
|
667
|
-
if col not in self.fixed_columns + [self.target] + self.statistics_columns
|
668
|
-
]
|
669
|
-
|
689
|
+
all_cei_columns = self.get_cei_column_names(df)
|
670
690
|
# Fill in any missing values with 0
|
671
691
|
df.loc[:, all_cei_columns].fillna(0, inplace=True)
|
672
692
|
|
@@ -720,10 +740,20 @@ class Geocif:
|
|
720
740
|
mask = self.df_results["Stage_ID"].isin(_stages)
|
721
741
|
df = self.df_results[mask]
|
722
742
|
|
743
|
+
""" Select which CEI categories to use for ML """
|
744
|
+
if "all" in self.use_ceis:
|
745
|
+
pass
|
746
|
+
else:
|
747
|
+
df = df[df["Type"].isin(self.use_ceis)]
|
748
|
+
|
723
749
|
""" Convert this dataframe into an ML ready format and save to disk """
|
724
750
|
df = self.create_ml_dataframe(df)
|
725
751
|
dir_output = (
|
726
|
-
self.dir_analysis
|
752
|
+
self.dir_analysis
|
753
|
+
/ self.country
|
754
|
+
/ self.crop
|
755
|
+
/ self.model_name
|
756
|
+
/ str(self.forecast_season)
|
727
757
|
)
|
728
758
|
os.makedirs(dir_output, exist_ok=True)
|
729
759
|
df.to_csv(
|
@@ -772,7 +802,7 @@ class Geocif:
|
|
772
802
|
dict_kwargs["combined_dict"] = self.combined_dict
|
773
803
|
|
774
804
|
if self.spatial_autocorrelation:
|
775
|
-
|
805
|
+
sa.compute_spatial_autocorrelation(self.df_results, **dict_kwargs)
|
776
806
|
|
777
807
|
if self.correlation_plots:
|
778
808
|
self.logger.info(f"Correlation plot for {self.country} {self.crop}")
|
@@ -162,14 +162,16 @@ class cei_runner(base.BaseGeo):
|
|
162
162
|
# Only keep those entries in combinations where the third elemt is
|
163
163
|
# mozambique, south_africa, angola or dem_people's_rep_of_korea
|
164
164
|
# This is done to test the code for these countries
|
165
|
-
combinations = [
|
166
|
-
|
167
|
-
|
168
|
-
|
169
|
-
|
170
|
-
|
171
|
-
|
172
|
-
|
165
|
+
combinations = [
|
166
|
+
i
|
167
|
+
for i in combinations
|
168
|
+
if "angola_maize" in i[3] or "lesotho_maize" in i[3] or
|
169
|
+
# "namibia" in i[2] or
|
170
|
+
# "united_republic_of_tanzania" in i[2] or
|
171
|
+
"zambia_maize" in i[3] or "zimbabwe_maize" in i[3] or
|
172
|
+
# "south_africa" in i[2] or
|
173
|
+
"mozambique_maize" in i[3]
|
174
|
+
]
|
173
175
|
# "malawi" in i[2]]
|
174
176
|
|
175
177
|
if self.do_parallel:
|
@@ -97,8 +97,10 @@ def create_weights_for_year(dg_country, regions_with_data):
|
|
97
97
|
]
|
98
98
|
if no_neighbors:
|
99
99
|
dg = dg.drop(index=no_neighbors[0]).reset_index(drop=True)
|
100
|
-
|
101
|
-
|
100
|
+
try:
|
101
|
+
wt = weights.Queen.from_dataframe(dg[["Country Region", "geometry"]])
|
102
|
+
except:
|
103
|
+
breakpoint()
|
102
104
|
return wt, dg
|
103
105
|
|
104
106
|
|
@@ -125,13 +127,15 @@ def compute_morans_i(merged_df):
|
|
125
127
|
regions_with_data = year_data["Country Region"].unique()
|
126
128
|
year_data = year_data[year_data["Country Region"].isin(regions_with_data)]
|
127
129
|
|
128
|
-
y = year_data[
|
130
|
+
y = year_data[
|
131
|
+
["Country Region", "Region", "Yield (tn per ha)"]
|
132
|
+
].drop_duplicates()
|
129
133
|
dg_country = year_data[["Country Region", "geometry"]].drop_duplicates()
|
130
134
|
|
131
|
-
|
132
|
-
|
133
|
-
y = y[y["Country Region"].isin(x["Country Region"])]
|
135
|
+
w, x = create_weights_for_year(dg_country, regions_with_data)
|
136
|
+
y = y[y["Country Region"].isin(x["Country Region"])]
|
134
137
|
|
138
|
+
if len(y) > 1:
|
135
139
|
try:
|
136
140
|
mi = esda.Moran(y["Yield (tn per ha)"].values, w, permutations=999)
|
137
141
|
except:
|
@@ -332,7 +332,7 @@ def plot_df_shpfile(
|
|
332
332
|
cb.ax.set_title(
|
333
333
|
label, fontsize=8, fontweight="semibold", fontfamily="Arial"
|
334
334
|
)
|
335
|
-
cb.ax.set_xticklabels(ticks, fontsize=
|
335
|
+
cb.ax.set_xticklabels(ticks, fontsize=5, fontfamily="Arial")
|
336
336
|
|
337
337
|
# Use BoundaryNorm to create discrete levels
|
338
338
|
# sm = plt.cm.ScalarMappable(cmap=cmap.mpl_colormap, norm=norm)
|
@@ -394,9 +394,9 @@ def plot_df_shpfile(
|
|
394
394
|
_name_country, buffer=1.0
|
395
395
|
) # left, right, bottom, top
|
396
396
|
# Hack: Add space to the top for adding title
|
397
|
-
extent[3] = extent[3] +
|
397
|
+
extent[3] = extent[3] + 2
|
398
398
|
# Add some space to the bottom for adding legend and colorbar
|
399
|
-
extent[2] = extent[2] -
|
399
|
+
extent[2] = extent[2] - 3
|
400
400
|
ax.set_extent(extent)
|
401
401
|
elif name_country == "world":
|
402
402
|
ax.add_feature(
|
@@ -419,14 +419,12 @@ def plot_df_shpfile(
|
|
419
419
|
# ax.tick_params(bottom=False, labelbottom=False, left=False, labelleft=False)
|
420
420
|
# ax.axis("off")
|
421
421
|
|
422
|
-
plt.tight_layout()
|
423
422
|
# cbar.ax.tick_params(labelsize=8)
|
424
423
|
# if series == "sequential":
|
425
424
|
# cbar.ax.tick_params(size=2, width=0.5, which="both")
|
426
425
|
# cbar.outline.set_visible(False)
|
427
426
|
# plt.tight_layout()
|
428
427
|
try:
|
429
|
-
print(fname)
|
430
428
|
plt.savefig(dir_out / fname, dpi=350, bbox_inches="tight")
|
431
429
|
plt.close(fig)
|
432
430
|
except:
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|