geocif 0.1.32__tar.gz → 0.1.34__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {geocif-0.1.32/geocif.egg-info → geocif-0.1.34}/PKG-INFO +1 -1
- {geocif-0.1.32 → geocif-0.1.34}/geocif/analysis.py +5 -5
- {geocif-0.1.32 → geocif-0.1.34}/geocif/cei/indices.py +11 -3
- {geocif-0.1.32 → geocif-0.1.34}/geocif/geocif.py +44 -3
- {geocif-0.1.32 → geocif-0.1.34}/geocif/indices_runner.py +5 -4
- geocif-0.1.34/geocif/indices_runner_v2.py +208 -0
- {geocif-0.1.32 → geocif-0.1.34}/geocif/ml/correlations.py +3 -0
- geocif-0.1.34/geocif/ml/correlations_backup.py +412 -0
- {geocif-0.1.32 → geocif-0.1.34}/geocif/ml/spatial_autocorrelation.py +6 -7
- {geocif-0.1.32 → geocif-0.1.34}/geocif/ml/stages.py +6 -3
- {geocif-0.1.32 → geocif-0.1.34}/geocif/ml/trainers.py +34 -0
- {geocif-0.1.32 → geocif-0.1.34}/geocif/playground/misc.py +72 -2
- {geocif-0.1.32 → geocif-0.1.34/geocif.egg-info}/PKG-INFO +1 -1
- {geocif-0.1.32 → geocif-0.1.34}/geocif.egg-info/SOURCES.txt +2 -0
- {geocif-0.1.32 → geocif-0.1.34}/setup.py +1 -1
- {geocif-0.1.32 → geocif-0.1.34}/LICENSE +0 -0
- {geocif-0.1.32 → geocif-0.1.34}/MANIFEST.in +0 -0
- {geocif-0.1.32 → geocif-0.1.34}/README.md +0 -0
- {geocif-0.1.32 → geocif-0.1.34}/geocif/__init__.py +0 -0
- {geocif-0.1.32 → geocif-0.1.34}/geocif/agmet/__init__.py +0 -0
- {geocif-0.1.32 → geocif-0.1.34}/geocif/agmet/geoagmet.py +0 -0
- {geocif-0.1.32 → geocif-0.1.34}/geocif/agmet/plot.py +0 -0
- {geocif-0.1.32 → geocif-0.1.34}/geocif/agmet/utils.py +0 -0
- {geocif-0.1.32 → geocif-0.1.34}/geocif/backup/__init__.py +0 -0
- {geocif-0.1.32 → geocif-0.1.34}/geocif/backup/constants.py +0 -0
- {geocif-0.1.32 → geocif-0.1.34}/geocif/backup/features.py +0 -0
- {geocif-0.1.32 → geocif-0.1.34}/geocif/backup/geo.py +0 -0
- {geocif-0.1.32 → geocif-0.1.34}/geocif/backup/geocif.py +0 -0
- {geocif-0.1.32 → geocif-0.1.34}/geocif/backup/metadata.py +0 -0
- {geocif-0.1.32 → geocif-0.1.34}/geocif/backup/models.py +0 -0
- {geocif-0.1.32 → geocif-0.1.34}/geocif/cei/__init__.py +0 -0
- {geocif-0.1.32 → geocif-0.1.34}/geocif/cei/definitions.py +0 -0
- {geocif-0.1.32 → geocif-0.1.34}/geocif/logger.py +0 -0
- {geocif-0.1.32 → geocif-0.1.34}/geocif/ml/__init__.py +0 -0
- {geocif-0.1.32 → geocif-0.1.34}/geocif/ml/embedding.py +0 -0
- {geocif-0.1.32 → geocif-0.1.34}/geocif/ml/feature_engineering.py +0 -0
- {geocif-0.1.32 → geocif-0.1.34}/geocif/ml/feature_selection.py +0 -0
- {geocif-0.1.32 → geocif-0.1.34}/geocif/ml/outliers.py +0 -0
- {geocif-0.1.32 → geocif-0.1.34}/geocif/ml/outlook.py +0 -0
- {geocif-0.1.32 → geocif-0.1.34}/geocif/ml/output.py +0 -0
- {geocif-0.1.32 → geocif-0.1.34}/geocif/ml/stats.py +0 -0
- {geocif-0.1.32 → geocif-0.1.34}/geocif/ml/trend.py +0 -0
- {geocif-0.1.32 → geocif-0.1.34}/geocif/ml/xai.py +0 -0
- {geocif-0.1.32 → geocif-0.1.34}/geocif/playground/__init__.py +0 -0
- {geocif-0.1.32 → geocif-0.1.34}/geocif/playground/automl.py +0 -0
- {geocif-0.1.32 → geocif-0.1.34}/geocif/utils.py +0 -0
- {geocif-0.1.32 → geocif-0.1.34}/geocif/viz/__init__.py +0 -0
- {geocif-0.1.32 → geocif-0.1.34}/geocif/viz/plot.py +0 -0
- {geocif-0.1.32 → geocif-0.1.34}/geocif.egg-info/dependency_links.txt +0 -0
- {geocif-0.1.32 → geocif-0.1.34}/geocif.egg-info/not-zip-safe +0 -0
- {geocif-0.1.32 → geocif-0.1.34}/geocif.egg-info/top_level.txt +0 -0
- {geocif-0.1.32 → geocif-0.1.34}/requirements.txt +0 -0
- {geocif-0.1.32 → geocif-0.1.34}/setup.cfg +0 -0
- {geocif-0.1.32 → geocif-0.1.34}/tests/test_geocif.py +0 -0
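
At a glance, 0.1.34 adds two new modules (`geocif/indices_runner_v2.py`, a reworked CEI runner, and `geocif/ml/correlations_backup.py`, a snapshot of the correlations module), wires experimental support for the `geospaNN` spatial neural-network model into `geocif/geocif.py` and `geocif/ml/trainers.py`, starts injecting region-centroid `lat`/`lon` columns as model features, and hardens stage handling in `geocif/cei/indices.py`. It also ships several `breakpoint()` calls in `geocif/analysis.py` and `geocif/geocif.py`, noted below.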
```diff
--- geocif-0.1.32/geocif/analysis.py
+++ geocif-0.1.34/geocif/analysis.py
@@ -162,8 +162,8 @@ class Geoanalysis:
             return pd.DataFrame(), pd.DataFrame()
 
         df_metrics = self._compute_metrics(df)
-        df_metrics = self._process_metrics(df_metrics)
-        self._plot_metrics(df_metrics)
+        #df_metrics = self._process_metrics(df_metrics)
+        #self._plot_metrics(df_metrics)
 
         df_regional_metrics_by_year = self._compute_regional_metrics(
             df, by="Harvest Year"
@@ -172,9 +172,9 @@ class Geoanalysis:
             df_regional_metrics_by_year
         )
         df_regional_metrics = self._average_mape(df_regional_metrics_by_year)
-
+        breakpoint()
         self._store_results(
-
+            None, df_regional_metrics, df_regional_metrics_by_year
         )
 
         df_national_yield = self._compute_national_yield(df)
@@ -195,7 +195,7 @@ class Geoanalysis:
             .apply(self.annual_metrics)
             .reset_index()
         )
-
+        breakpoint()
         return df_metrics.pivot_table(
             index=["Country", "Model", "Harvest Year", "Stage Name", "Stage Range"],
             columns="level_5",
```
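
Note on the three `breakpoint()` calls introduced above: they sit on the main metrics path, so `analysis.py` in 0.1.34 drops into the interactive debugger whenever these lines run. They look like debugging aids that shipped by accident. Per PEP 553 the built-in can be neutralized without touching the code; a minimal sketch using only standard Python:

```python
import os

# PYTHONBREAKPOINT is consulted by sys.breakpointhook() on every
# breakpoint() call; "0" turns the calls into no-ops (PEP 553).
os.environ["PYTHONBREAKPOINT"] = "0"

breakpoint()  # now a no-op instead of entering pdb
print("execution continues past the debugger hook")
```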
```diff
--- geocif-0.1.32/geocif/cei/indices.py
+++ geocif-0.1.34/geocif/cei/indices.py
@@ -465,7 +465,7 @@ class CEIs:
 
         extended_stages_list = []
         if self.method in ["phenological_stages", "fraction_season", "full_season"]:
-            extended_stages_list =
+            extended_stages_list = stages
         elif self.method in ["dekad_r", "biweekly_r", "monthly_r"]:
             # reverse stages
             stages = stages[::-1]
@@ -566,10 +566,10 @@ class CEIs:
 
         """
         if self.method in ["phenological_stages", "fraction_season"]:
-            mask = df_harvest_year_region[col].isin(stages)
+            mask = df_harvest_year_region[col].isin([stages])
             df_time_period = df_harvest_year_region[mask]
 
-            mask = df_all_years[col].isin(stages)
+            mask = df_all_years[col].isin([stages])
             df_base_period = df_all_years[mask]
         elif self.method in [
             "dekad",
@@ -605,6 +605,10 @@ class CEIs:
         Returns:
 
         """
+        # If stage is not a list then convert it to a list
+        if not isinstance(stage, list):
+            stage = [stage]
+
         columns = [
             "Description",
             "CEI",
@@ -721,6 +725,10 @@ class CEIs:
         :param index_details:
         :return:
         """
+        # If stage is not a list then convert it to a list
+        if not isinstance(stage, list):
+            stage = [stage]
+
         df = df[df["bounds"] == 1]
         # Exclude lat, lon, time, bounds and time_bounds columns
         df = df.drop(columns=["lat", "lon", "time", "bounds", "time_bounds"])
```
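
The `.isin(stages)` to `.isin([stages])` change and the two `isinstance` guards above address the same issue: `stage`/`stages` can arrive as a single string, and `pandas.Series.isin` rejects bare strings outright. A small illustration of the failure mode and the normalization idiom (plain pandas, not geocif code):

```python
import pandas as pd

s = pd.Series(["flowering", "harvest"])

stage = "harvest"  # a scalar where a list was expected
# s.isin(stage) raises:
# TypeError: only list-like objects are allowed to be passed to isin(),
# you passed a [str]

if not isinstance(stage, list):  # the guard added in 0.1.34
    stage = [stage]

print(s.isin(stage))  # 0: False, 1: True
```

One caveat: `.isin([stages])` is only correct when `stages` is a single label; if a list can still reach that line, wrapping it would stop its elements from matching.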
```diff
--- geocif-0.1.32/geocif/geocif.py
+++ geocif-0.1.34/geocif/geocif.py
@@ -222,6 +222,10 @@ class Geocif:
         self.logger.info(f"Selected features: {self.selected_features}")
 
         """ Update model to include conformal estimates """
+        if "lat" not in self.selected_features:
+            self.selected_features.append("lat")
+        if "lon" not in self.selected_features:
+            self.selected_features.append("lon")
         X_train = df_region[self.selected_features + self.cat_features]
         dir_output = (
             self.dir_analysis
@@ -275,6 +279,12 @@ class Geocif:
                 verbose=False,
                 # callbacks=[TQDMCallback(self.best_hyperparams["iterations"])],
             )
+        elif self.model_name == "geospaNN":
+            self.model.fit(
+                X_train,
+                y_train,
+                # callbacks=[TQDMCallback(self.best_hyperparams["iterations"])],
+            )
         elif self.model_name == "merf":
             Z_train = np.ones((len(X_train), 1))
             clusters_train = df_region["Region"]
@@ -341,6 +351,25 @@ class Geocif:
                 X_test, Z_test, clusters_test.astype("object")
             )
             best_hyperparameters = self.model.fe_model.get_params().copy()
+        elif self.model_name == "geospaNN":
+            import torch
+            import geospaNN
+
+            # Remove any categorical features
+            X_test = X_test.drop(columns=self.cat_features)
+            X = torch.from_numpy(X_test.to_numpy()).float()
+            coord = torch.from_numpy(self.df_test[['lon', 'lat']].to_numpy()).float()
+
+            p = X.shape[1]
+            n = X.shape[0]
+            nn = 5
+
+            data = geospaNN.make_graph(X, Y, coord, nn)
+
+            # remove categorical features from df_train
+            data_train = df_region[self.selected_features + self.cat_features + [self.target]]
+            w_train = data_train.y - self.estimate(data_train.x)
+
         else:
             y_pred = self.model.predict(X_test)
             best_hyperparameters = self.model.get_params().copy()
```
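
As released, the `geospaNN` prediction branch above looks unfinished: it appears to reference a `Y` that the surrounding code does not define, builds `data` and `w_train` without consuming them, and, unlike the sibling branches, never assigns `y_pred`, so selecting `model_name == "geospaNN"` would most likely fail at predict time. The training-side counterpart in `geocif/ml/trainers.py` (later in this diff) is more complete.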
```diff
--- geocif-0.1.32/geocif/geocif.py
+++ geocif-0.1.34/geocif/geocif.py
@@ -458,9 +487,10 @@ class Geocif:
             "Harvest Year",
             "Stage Name",
         ]
-        df.index = df.apply(
-            lambda row: "_".join([str(row[col]) for col in index_columns]), axis=1
-        )
+        try:
+            df.index = df.apply(lambda row: "_".join([str(row[col]) for col in index_columns]), axis=1)
+        except Exception as e:
+            breakpoint()
 
         # name the index level
         df.index.set_names(["Index"], inplace=True)
@@ -527,6 +557,10 @@ class Geocif:
         if self.use_outlook_as_feature:
             self.feature_names.append("FCST")
 
+        # Add lat and lon to feature names
+        self.feature_names.append("lat")
+        self.feature_names.append("lon")
+
         self.selected_features = []
 
     def loop_ml(self, stages, dict_selected_features, dict_best_cei):
@@ -782,6 +816,13 @@ class Geocif:
             how="outer",
         )
 
+        # Add a lat and lon column to self.dg_country
+        self.dg_country["lat"] = self.dg_country.centroid.y
+        self.dg_country["lon"] = self.dg_country.centroid.x
+
+        # Add lat and lon columns to df by merging on Country Region column
+        df = df.merge(self.dg_country[["Country Region", "lat", "lon"]].drop_duplicates(), on="Country Region", how="left")
+
         dict_kwargs = {}
         dict_kwargs["all_stages"] = self.all_stages
         dict_kwargs["target_col"] = self.target
```
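
A caveat on the centroid-derived features above: `GeoDataFrame.centroid` computes planar centroids, and GeoPandas warns that the result is likely incorrect when the geometries are in a geographic (degree-based) CRS. A common workaround, sketched generically here (the EPSG codes are illustrative, not taken from geocif):

```python
import geopandas as gpd

def centroid_lat_lon(gdf: gpd.GeoDataFrame) -> gpd.GeoDataFrame:
    """Compute centroids in an equal-area projection, then convert
    back to degrees so lat/lon are geographically meaningful."""
    centroids = gdf.geometry.to_crs(epsg=6933).centroid.to_crs(epsg=4326)
    out = gdf.copy()
    out["lat"] = centroids.y
    out["lon"] = centroids.x
    return out
```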
```diff
--- geocif-0.1.32/geocif/indices_runner.py
+++ geocif-0.1.34/geocif/indices_runner.py
@@ -165,11 +165,12 @@ class cei_runner(base.BaseGeo):
         combinations = [
             i
             for i in combinations
-            if "angola_maize" in i[3] or
-
-            #
+            if "angola_maize" in i[3] or
+            "lesotho_maize" in i[3] or
+            # "namibia_" in i[2] or
+            "united_republic_of_tanzania_maize" in i[3] or
             "zambia_maize" in i[3] or "zimbabwe_maize" in i[3] or
-
+            "south_africa_maize" in i[3] or
             "mozambique_maize" in i[3]
         ]
         # "malawi" in i[2]]
```
```diff
--- /dev/null
+++ geocif-0.1.34/geocif/indices_runner_v2.py
@@ -0,0 +1,208 @@
+import itertools
+import warnings
+from multiprocessing import Pool, cpu_count
+from pathlib import Path
+
+import arrow as ar
+import pandas as pd
+from tqdm import tqdm
+
+warnings.filterwarnings("ignore")
+
+from .cei import indices
+from geoprepare import base
+
+
+def remove_duplicates(lst):
+    """
+
+    :param lst:
+    :return:
+    """
+    return list(set([i for i in lst]))
+
+
+def get_admin_zone(country, dg_shp):
+    admin_zone = "admin_1"
+    country = country.title().replace(" ", "_")
+
+    # Read in shapefile
+    dg_country = dg_shp[dg_shp["ADMIN0"] == country]
+
+    # Is the ADMIN2 column all None? If so, return admin_1 else return admin_2
+    if dg_country.empty:
+        admin_zone = "admin_1"
+    elif not dg_country["ADMIN2"].isna().all():
+        admin_zone = "admin_2"
+
+    return admin_zone
+
+
+class cei_runner(base.BaseGeo):
+    def __init__(self, path_config_file):
+        super().__init__(path_config_file)
+
+        # Parse configuration files
+        self.parse_config()
+
+        self.dir_input = Path(self.parser.get("PATHS", "dir_input"))
+        self.base_dir = Path(self.parser.get("PATHS", "dir_crop_inputs"))
+        self.do_parallel = self.parser.getboolean("DEFAULT", "do_parallel")
+
+    def collect_files(self):
+        """
+        1. Collect all the files which contain EO information
+        2. Exclude files from the `processed` directory if it is already in
+        processed_include_fall directory
+        3. Create a dataframe that contains the following columns:
+        - directory: name of directory where file is located
+        - path: full path to file
+        - filename: name of file
+        :return: Return the dataframe created above
+        """
+        import geopandas as gp
+
+        dg_shp = gp.read_file(
+            self.dir_input
+            / "Global_Datasets"
+            / "Regions"
+            / "Shps"
+            / "adm_shapefile.shp",
+            engine="pyogrio",
+        )
+
+        # Collect all the files which contain EO information
+        df_files = pd.DataFrame(columns=["directory", "path", "filename", "admin_zone"])
+        for filepath in self.base_dir.rglob("*.csv"):
+            country = filepath.parents[0].name
+
+            admin_zone = get_admin_zone(country, dg_shp)
+
+            # If country is not in cc.COUNTRIES then skip
+            # HACK: Skip korea for now, as it is giving errors
+            if country == "republic_of_korea":
+                continue
+
+            # Get name of directory one level up
+            process_type = filepath.parents[1].name
+
+            # Get name of file
+            filename = filepath.name
+
+            # Add to dataframe
+            df_files.loc[len(df_files)] = [process_type, filepath, filename, admin_zone]
+
+        # Exclude those rows where directory is processed and file is already in
+        # processed_include_fall directory
+        no_fall = df_files["directory"] == "processed"
+        include_fall = df_files[df_files["directory"] == "processed_include_fall"][
+            "filename"
+        ]
+
+        df_files = df_files[~(no_fall & (df_files["filename"].isin(include_fall)))]
+
+        return df_files
+
+    def process_combinations(self, df, method):
+        """
+        Create a list of tuples of the following:
+        - directory: name of directory where file is located
+        - path: full path to file
+        - filename: name of file
+        - method: whether to compute indices for phenological stages or not
+        This tuple will be used as input to the `process` function
+        :param df:
+        :param method:
+        :return:
+        """
+        combinations = []
+
+        for index, row in tqdm(df.iterrows()):
+            combinations.extend(
+                list(
+                    itertools.product([row[0]], [row[1]], [row[2]], [row[3]], [method])
+                )
+            )
+
+        combinations = remove_duplicates(combinations)
+
+        return combinations
+
+    def main(self, method):
+        """
+
+        :param method:
+        :return:
+        """
+        # Create a dataframe of the files to be analyzed
+        df_files = self.collect_files()
+
+        combinations = self.process_combinations(df_files, method)
+
+        # Add an element to the tuple to indicate the season
+        # Last element is redo flag which is True if the analysis is to be redone
+        # and False otherwise. Analysis is always redone for the current year
+        # and last year whether file exists or not
+        combinations = [
+            (
+                self.parser,
+                status,
+                path,
+                filename,
+                admin_zone,
+                category,
+                year,
+                "ndvi",
+                False,  # redo
+            )
+            for year in range(2001, ar.utcnow().year + 1)
+            for status, path, filename, admin_zone, category in combinations
+        ]
+
+        # Only keep those entries in combinations where the third elemt is
+        # mozambique, south_africa, angola or dem_people's_rep_of_korea
+        # This is done to test the code for these countries
+        #combinations = [
+        #    i
+        #    for i in combinations
+        #    if "ethiopia_maize_s1" in i[3]
+        #]
+        # "malawi" in i[2]]
+
+        if True:
+            num_cpu = int(cpu_count() * 0.8)
+            with Pool(num_cpu) as p:
+                for i, _ in enumerate(p.imap_unordered(indices.process, combinations)):
+                    pass
+        else:
+            # Use the code below if you want to test without parallelization or
+            # if you want to debug by using pdb
+            pbar = tqdm(combinations)
+            for i, val in enumerate(pbar):
+                pbar.set_description(
+                    f"Main loop {combinations[i][2]} {combinations[i][5]}"
+                )
+                indices.process(val)
+
+
+def run(path_config_files=[]):
+    """
+
+    Args:
+        path_config_files:
+
+    Returns:
+
+    """
+    """ Check dictionary keys to have no spaces"""
+    indices.validate_index_definitions()
+
+    for method in [
+        "phenological_stages",  # "dekad_r" # "dekad_r"
+    ]:  # , "full_season", "phenological_stages", "fraction_season"]:
+        obj = cei_runner(path_config_files)
+        obj.main(method)
+
+
+if __name__ == "__main__":
+    run()
```
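
The `main` loop above drains `Pool.imap_unordered` with a bare `for ... pass`, which blocks until all work finishes but reports no progress. A generic sketch of the same pattern with a progress bar (standard library plus tqdm; `work` is a stand-in for `indices.process`):

```python
from multiprocessing import Pool, cpu_count

from tqdm import tqdm

def work(item):
    # Stand-in worker; must live at module level so multiprocessing
    # can pickle it for the child processes.
    return item * item

if __name__ == "__main__":
    items = list(range(100))
    num_cpu = max(1, int(cpu_count() * 0.8))
    with Pool(num_cpu) as pool:
        # imap_unordered yields results as workers finish, so the bar
        # advances in real time instead of jumping at the end.
        for _ in tqdm(pool.imap_unordered(work, items), total=len(items)):
            pass
```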
```diff
--- geocif-0.1.32/geocif/ml/correlations.py
+++ geocif-0.1.34/geocif/ml/correlations.py
@@ -260,6 +260,9 @@ def all_correlated_feature_by_time(df, **kwargs):
         ):
             df_corr = _all_correlated_feature_by_time(group, **kwargs)
 
+            # Remove columns with more than 50% NaN values
+            df_corr = df_corr.dropna(thresh=len(df_corr) / 2, axis=1)
+
             if not df_corr.empty:
                 df_tmp = df_corr[df_corr.columns[(df_corr.mean() > 0.1)]]
                 dict_selected_features[region_id] = df_tmp.columns
```
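
For readers of the hunk above: `dropna(thresh=..., axis=1)` keeps a column only if it has at least `thresh` non-NaN values, so `len(df_corr) / 2` implements the "more than 50% NaN" rule in the comment (the documented type of `thresh` is int, so `len(df_corr) // 2` would match the docs more closely). A minimal demonstration in plain pandas:

```python
import numpy as np
import pandas as pd

df = pd.DataFrame(
    {
        "mostly_nan": [1.0, np.nan, np.nan, np.nan],  # 1 non-NaN value
        "mostly_ok": [1.0, 2.0, 3.0, np.nan],         # 3 non-NaN values
    }
)

# Keep columns with at least len(df) // 2 == 2 non-NaN values.
print(df.dropna(thresh=len(df) // 2, axis=1).columns.tolist())  # ['mostly_ok']
```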
```diff
--- /dev/null
+++ geocif-0.1.34/geocif/ml/correlations_backup.py
@@ -0,0 +1,412 @@
+import os
+
+import matplotlib.pyplot as plt
+import palettable as pal
+import pandas as pd
+import seaborn as sns
+from tqdm import tqdm
+
+from geocif import utils
+from geocif.ml import embedding
+from geocif.ml import stages
+
+
+def most_correlated_feature_by_time(df_train, simulation_stages, target_col):
+    """
+
+    Args:
+        df_train:
+        simulation_stages:
+        target_col:
+
+    Returns:
+
+    """
+    frames = []
+
+    stages = [simulation_stages[: idx + 1] for idx in range(len(simulation_stages))]
+
+    # Only select columns that have been observed till the current stage
+    for stage in tqdm(stages, leave=False, desc="Compute most correlated feature"):
+        current_feature_set = [
+            col for col in df_train.columns if col.endswith(f"_{stage[-1]}")
+        ]
+
+        # Get the most correlated feature for each region
+        top_feature_by_region, counter = embedding.get_top_correlated_features(
+            df_train[current_feature_set + ["Region"]],
+            df_train[target_col],
+        )
+
+        # Create a dataframe with the most common top feature and number of occurrences over timestep
+        _feature = counter.most_common(1)[0][0]
+        # Loop through top_feature_by_region and find the average score for _feature
+        # Calculate the average score for 'DTR_36'
+        _feature_scores = [
+            value[1][0]
+            for key, value in top_feature_by_region.items()
+            if _feature in value[0]
+        ]
+        average_score = sum(_feature_scores) / len(_feature_scores)
+        _feature = utils.remove_last_part(_feature)
+
+        df = pd.DataFrame(
+            {
+                "Stage": [stage[-1]],
+                "Date": [utils.dict_growth_stages[stage[-1]]],
+                "Feature with Highest Correlation": [counter.most_common(1)[0][0]],
+                "Feature Category": [_feature],
+                "Score": [average_score],
+                # "Type": [ci.dict_indices[_feature][0]],
+                "Number of Occurrences": [counter.most_common(1)[0][1]],
+                # "Current Feature Set": [current_feature_set],
+            }
+        )
+        frames.append(df)
+
+    df_most_corr_feature_by_time = pd.concat(frames)
+
+
+def plot_feature_corr_by_time(df, **kwargs):
+    country = kwargs.get("country")
+    crop = kwargs.get("crop")
+    dir_output = kwargs.get("dir_output")
+    forecast_season = kwargs.get("forecast_season")
+    national_correlation = kwargs.get("national_correlation")
+    group_by = kwargs.get("groupby")
+
+    # Setup the figure and gridspec
+    fig = plt.figure(figsize=(10, 5))
+    gs = fig.add_gridspec(
+        3, 2, height_ratios=[6, 5, 1], width_ratios=[5, 1.5], hspace=0.6, wspace=0.0
+    )
+
+    # Assign subplots
+    ax_heatmap = fig.add_subplot(gs[0:2, 0])
+    ax_map = fig.add_subplot(gs[0, 1])
+    cbar_ax = fig.add_subplot(gs[2, 0])
+    ax4 = fig.add_subplot(gs[2, 1])
+
+    # Transpose and reverse the columns of the dataframe
+    #breakpoint()
+    ## Only select foll. columns:
+
+    df = df[
+        [
+            "TG",
+            "TG10p",
+            "DTR",
+            "vDTR",
+            "R99p",
+            "RX5day",
+            "MEAN_ESI4WK",
+        ]
+    ]
+    df_transpose = df.T
+    df = df_transpose[df_transpose.columns[::-1]]
+
+    # Split column names and only use value before space
+    df.columns = df.columns.str.split(" ").str[0]
+    # In row names, replace ESI4WK by ES
+    df.index = df.index.str.replace("MEAN_ESI4WK", "ZScore_ES")
+    df.index = df.index.str.replace("R99p", "MEAN_SM")
+    df.index = df.index.str.replace("RX5day", "AUC_SM")
+    # Remove the last row
+    # Select the first, third and fifth column
+    df = df[["Dec", "Feb", "Apr"]]
+    # Rename Dec to Planting - Early Vegetative
+    # Rename Feb to Early Vegetative - Senescence
+    # Rename Apr to Senescence - Harvest
+    df.columns = ["Planting - Early Vegetative", "Early Vegetative - Senescence", "Senescence - Harvest"]
+    ax_heatmap = sns.heatmap(
+        df,
+        ax=ax_heatmap,
+        annot=True,
+        cmap=pal.cartocolors.diverging.Earth_5.get_mpl_colormap(),
+        fmt=".2f",
+        square=False,
+        linewidths=0.5,
+        linecolor="white",
+        cbar_ax=cbar_ax,
+        cbar_kws={"orientation": "horizontal"},  # , "shrink": 0.5},
+        annot_kws={"size": 6},
+        xticklabels=True,
+        yticklabels=True,
+    )
+    ax_heatmap.tick_params(left=False, bottom=False)
+
+    # Plot the map using GeoPandas
+    dg_country = kwargs.get("dg_country")
+
+    ax_map = dg_country.plot(
+        ax=ax_map,
+        color="white",
+        edgecolor="black",
+        linewidth=1.0,
+        facecolor=None,
+        legend=False,
+    )
+
+    if not national_correlation:
+        id = kwargs["region_id"]
+        dg_region = dg_country[dg_country[group_by] == id]
+        ax_map = dg_region.plot(
+            ax=ax_map, color="blue", edgecolor="blue", linewidth=1.0, legend=False
+        )
+        # Set title with color blue
+        ax_map.set_title(f"Region: {id}", color="blue")
+
+    # No colorbar for the map
+    ax_map.axis("off")
+    # Remove borders
+    ax_map.spines["top"].set_visible(False)
+    ax_map.spines["right"].set_visible(False)
+    ax_map.spines["bottom"].set_visible(False)
+    ax_map.spines["left"].set_visible(False)
+    # ax4 should not be visible
+    ax4.axis("off")
+
+    # Add colorbar label
+    # cbar_ax.set_xlabel("Correlation Coefficient", labelpad=3, size="small")
+    cbar_ax.set_title("Correlation Coefficient", loc="left", size="small")
+    ax_heatmap.set_xticklabels(
+        ax_heatmap.get_xticklabels(), size="x-small", rotation=0, fontsize=7
+    )
+    ax_heatmap.set_yticklabels(ax_heatmap.get_yticklabels(), size="x-small", fontsize=7)
+    ax_heatmap.set_xlabel("")
+    ax_heatmap.set_ylabel(" ")
+    # Reduce font size of ticks of colorbar
+    cbar_ax.tick_params(axis="both", which="major", labelsize=6)
+
+    _country = country.title().replace("_", " ")
+    _crop = crop.title().replace("_", " ")
+    if not national_correlation:
+        fname = f"{country}_{crop}_{id}_corr_feature_by_time.png"
+    else:
+        fname = f"{country}_{crop}_corr_feature_by_time.png"
+    ax_heatmap.set_title(f"{_country}\n{_crop}")
+
+    # plt.tight_layout()
+    os.makedirs(dir_output, exist_ok=True)
+    plt.savefig(dir_output / fname, dpi=250)
+    plt.close()
+
+
+def _all_correlated_feature_by_time(df, **kwargs):
+    """
+
+    Args:
+        df:
+        **kwargs:
+
+    Returns:
+
+    """
+    frames = []
+    all_stages = kwargs.get("all_stages")
+    target_col = kwargs.get("target_col")
+    method = kwargs.get("method")
+
+    longest_stage = max(all_stages, key=len)
+
+    # Split the original string into a list of its parts
+    longest_stage = longest_stage.split("_")
+
+    # Generate the list of strings as described by the user, removing one element from the start each time
+    stages_features = ["_".join(longest_stage[i:]) for i in range(len(longest_stage))]
+
+    # Drop columns with no yield information
+    df = df.dropna(subset=[target_col])
+
+    # Only select columns that have been observed till the current stage
+    pbar = tqdm(stages_features, total=len(stages_features), leave=False)
+    for stage in pbar:
+        pbar.set_description(f"Calculating correlations")
+        pbar.update()
+
+        stage_name = stages.get_stage_information_dict(f"GD4_{stage}", method)[
+            "Stage Name"
+        ]
+        # starting_stage = stage_name.split("-")[0]
+        current_feature_set = [col for col in df.columns if stage_name in col]
+
+        # Get the most correlated feature for each region
+        df_tmp = embedding.get_all_features_correlation(
+            df[current_feature_set + ["Region"]], df[target_col], method
+        )
+
+        frames.append(df_tmp)
+
+    df_results = pd.concat(frames)
+    if not df_results.empty:
+        # Exclude Region column
+        df_results = df_results.drop(columns="Region")
+        # Groupby Dekad and compute mean of all columns apart from Region
+        df_results = df_results.groupby(method).mean()
+
+        all_stage_names = []
+        for stage in stages_features:
+            _tmp = stages.get_stage_information_dict(f"GD4_{stage}", method)[
+                "Stage Name"
+            ]
+            all_stage_names.append(_tmp)
+
+        df_results = df_results.reindex(all_stage_names)
+
+        # Drop rows with all NaN values
+        df_results = df_results.dropna(how="all")
+
+        # Split the index based on - and only keep the first element
+        df_results.index = df_results.index.str.split("-").str[0]
+
+        return df_results
+    else:
+        return pd.DataFrame()
+
+
+def all_correlated_feature_by_time(df, **kwargs):
+    """
+
+    Args:
+        df:
+        **kwargs:
+
+    Returns:
+
+    """
+    THRESHOLD = 0.1
+    national_correlation = kwargs.get("national_correlation")
+    group_by = kwargs.get("groupby")
+    combined_dict = kwargs.get("combined_dict")
+
+    dict_selected_features = {}
+    dict_best_cei = {}
+
+    if not national_correlation:
+        groups = df.groupby(group_by)
+        for region_id, group in tqdm(
+            groups, desc=f"Compute all correlated feature by {group_by}", leave=False
+        ):
+            df_corr = _all_correlated_feature_by_time(group, **kwargs)
+
+            # Remove columns with more than 50% NaN values
+            df_corr = df_corr.dropna(thresh=len(df_corr) / 2, axis=1)
+
+            if not df_corr.empty:
+                df_tmp = df_corr[df_corr.columns[(abs(df_corr.mean()) > THRESHOLD)]]
+                # Add the columns to dict_selected_features along with the absolute mean value
+                absolute_medians = df_tmp.abs().median()
+
+                # Create a DataFrame to display the column names and their absolute median values
+                absolute_median_df = absolute_medians.reset_index()
+                absolute_median_df.columns = ['CEI', 'Median']
+
+                # Add the CEI and Median value to dict_selected_features
+                dict_selected_features[region_id] = absolute_median_df
+
+                df_tmp2 = (
+                    df_tmp.median(axis=0)
+                    .abs()
+                    .sort_values(ascending=False)
+                    .reset_index()
+                )
+                df_tmp2.columns = ["Metric", "Value"]
+                # Add another column based on Type of Metric
+                for idx, row in df_tmp2.iterrows():
+                    df_tmp2.loc[idx, "Type"] = combined_dict[row[0]][0]
+
+                # Compute median of each CEI and sort the dataframe based on the absolute value of the median
+                dict_best_cei[region_id] = (
+                    df_tmp2.groupby("Type")
+                    .max()
+                    .reset_index()
+                    .sort_values("Value", ascending=False)["Metric"]
+                    .values
+                )
+
+                kwargs["region_id"] = region_id
+                plot_feature_corr_by_time(df_tmp, **kwargs)
+                # For each element in dict_best_cei, add the type of the cei
+            else:
+                # HACK
+                df_corr = _all_correlated_feature_by_time(df, **kwargs)
+
+                df_tmp = df_corr[df_corr.columns[(abs(df_corr.mean()) > THRESHOLD)]]
+                # Add the columns to dict_selected_features along with the absolute mean value
+                absolute_medians = df_tmp.abs().median()
+
+                # Create a DataFrame to display the column names and their absolute median values
+                absolute_median_df = absolute_medians.reset_index()
+                absolute_median_df.columns = ['CEI', 'Median']
+
+                # Add the CEI and Median value to dict_selected_features
+                dict_selected_features[region_id] = absolute_median_df
+                dict_best_cei[region_id] = {}
+    else:
+        df_corr = _all_correlated_feature_by_time(df, **kwargs)
+        df_tmp = df_corr[df_corr.columns[(abs(df_corr.mean()) > THRESHOLD)]]
+        # Add the columns to dict_selected_features along with the absolute mean value
+        absolute_medians = df_tmp.abs().median()
+
+        # Create a DataFrame to display the column names and their absolute median values
+        absolute_median_df = absolute_medians.reset_index()
+        absolute_median_df.columns = ['CEI', 'Median']
+
+        # Add the CEI and Median value to dict_selected_features
+        dict_selected_features[0] = absolute_median_df
+
+        plot_feature_corr_by_time(df_corr, **kwargs)
+
+    return dict_selected_features, dict_best_cei
+
+
+def feature_correlation_by_time(**kwargs):
+    raise NotImplementedError()
+
+    frames = []
+    simulation_stages = kwargs.get("simulation_stages")
+    df_train = kwargs.get("df_train")
+    target_col = kwargs.get("target_col")
+
+    stages = [simulation_stages[: idx + 1] for idx in range(len(simulation_stages))]
+
+    # Only select columns that have been observed till the current stage
+    for stage in tqdm(stages, leave=False, desc="Compute feature correlation by time"):
+        current_feature_set = [
+            col for col in df_train.columns if col.endswith(f"_{stage[-1]}")
+        ]
+
+        # Get the most correlated feature for each region
+        top_feature_by_region, counter = embedding.compute_feature_correlations(
+            df_train[current_feature_set + ["Region"]],
+            df_train[target_col],
+            "all",
+        )
+
+        # Create a dataframe with the most common top feature and number of occurrences over timestep
+        _feature = counter.most_common(1)[0][0]
+        # Loop through top_feature_by_region and find the average score for _feature
+        # Calculate the average score for 'DTR_36'
+        _feature_scores = [
+            value[1][0]
+            for key, value in top_feature_by_region.items()
+            if _feature in value[0]
+        ]
+        average_score = sum(_feature_scores) / len(_feature_scores)
+        _feature = utils.remove_last_part(_feature)
+
+        df = pd.DataFrame(
+            {
+                "Stage": [stage[-1]],
+                "Date": [utils.dict_growth_stages[stage[-1]]],
+                "Feature with Highest Correlation": [counter.most_common(1)[0][0]],
+                "Feature Category": [_feature],
+                "Score": [average_score],
+                # "Type": [ci.dict_indices[_feature][0]],
+                "Number of Occurrences": [counter.most_common(1)[0][1]],
+                # "Current Feature Set": [current_feature_set],
+            }
+        )
+        frames.append(df)
+
+    df_corr_feature_by_time = pd.concat(frames)
```
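
Two details stand out in this backup module as released: `most_correlated_feature_by_time` builds `df_most_corr_feature_by_time` on its final line but never returns it, so callers receive `None`, and `feature_correlation_by_time` raises `NotImplementedError` on entry, leaving everything after it unreachable. Since the file closely mirrors `geocif/ml/correlations.py`, it reads as a snapshot kept for reference rather than code on an active import path.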
```diff
--- geocif-0.1.32/geocif/ml/spatial_autocorrelation.py
+++ geocif-0.1.34/geocif/ml/spatial_autocorrelation.py
@@ -77,7 +77,7 @@ def create_base_weights(merged_df):
     return w_base, dg
 
 
-def create_weights_for_year(dg_country, regions_with_data):
+def create_weights_for_year(dg_country, regions_with_data, year):
     """
 
     Args:
@@ -97,10 +97,8 @@ def create_weights_for_year(dg_country, regions_with_data):
         ]
         if no_neighbors:
             dg = dg.drop(index=no_neighbors[0]).reset_index(drop=True)
-
-
-    except:
-        breakpoint()
+    wt = weights.Queen.from_dataframe(dg[["Country Region", "geometry"]])
+
     return wt, dg
 
 
@@ -125,6 +123,8 @@ def compute_morans_i(merged_df):
    for year in tqdm(years, desc="Compute Moran's I"):
         year_data = merged_df[merged_df["Harvest Year"] == year]
         regions_with_data = year_data["Country Region"].unique()
+        if len(regions_with_data) < 3:
+            continue
         year_data = year_data[year_data["Country Region"].isin(regions_with_data)]
 
         y = year_data[
@@ -132,9 +132,8 @@ def compute_morans_i(merged_df):
         ].drop_duplicates()
         dg_country = year_data[["Country Region", "geometry"]].drop_duplicates()
 
-        w, x = create_weights_for_year(dg_country, regions_with_data)
+        w, x = create_weights_for_year(dg_country, regions_with_data, year)
         y = y[y["Country Region"].isin(x["Country Region"])]
-
         if len(y) > 1:
             try:
                 mi = esda.Moran(y["Yield (tn per ha)"].values, w, permutations=999)
```
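
The hunks above skip years with fewer than three regions before building Queen-contiguity weights and computing Moran's I. For context, a self-contained sketch of the same esda/libpysal pattern on synthetic polygons (the grid and the values are illustrative only):

```python
import geopandas as gpd
import numpy as np
from esda.moran import Moran
from libpysal import weights
from shapely.geometry import box

# A 4x4 grid of unit squares stands in for admin regions.
polys = [box(i, j, i + 1, j + 1) for i in range(4) for j in range(4)]
gdf = gpd.GeoDataFrame(geometry=polys)
y = np.random.default_rng(0).normal(size=len(gdf))

# Queen contiguity: polygons sharing an edge or a corner are neighbors.
w = weights.Queen.from_dataframe(gdf)

# permutations=999 yields a pseudo p-value by random relabeling.
mi = Moran(y, w, permutations=999)
print(f"Moran's I = {mi.I:.3f}, p_sim = {mi.p_sim:.3f}")
```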
```diff
--- geocif-0.1.32/geocif/ml/stages.py
+++ geocif-0.1.34/geocif/ml/stages.py
@@ -144,10 +144,13 @@ def select_stages_for_ml(stages_features, method="latest", n=100):
 
     selected_stages = []
     if method == "latest":
+        # Find the longest array in the list of arrays
+        selected_stages = [max(stages_features, key=len)]
+
         # Only select those arrays in the list of arrays that are starting with latest_stage
-        for stage in stages_features:
-            if stage[0] == latest_stage[0]:
-                selected_stages.append(stage)
+        # for stage in stages_features:
+        #     if stage[0] == latest_stage[0]:
+        #         selected_stages.append(stage)
     elif method == "fraction":
         # Filter arrays with exactly 2 elements
         two_element_arrays = []
```
```diff
--- geocif-0.1.32/geocif/ml/trainers.py
+++ geocif-0.1.34/geocif/ml/trainers.py
@@ -289,6 +289,40 @@ def auto_train(
         model = LinearGAM(n_splines=25, spline_order=3).gridsearch(
             X_train.values, y_train.values, lam=np.logspace(-3, 3, 11)
         )
+    elif model_name == "geospaNN":
+        import torch
+        import geospaNN
+
+        # Remove any categorical features
+        X_train = X_train.drop(columns=cat_features)
+        X = torch.from_numpy(X_train.to_numpy()).float()
+        Y = torch.from_numpy(y_train.to_numpy().reshape(-1)).float()
+
+        coord = torch.from_numpy(df_train[['lon', 'lat']].to_numpy()).float()
+
+        p = X.shape[1]
+        n = X.shape[0]
+        nn = 5
+
+        data = geospaNN.make_graph(X, Y, coord, nn)
+
+        mlp = torch.nn.Sequential(
+            torch.nn.Linear(p, 50),
+            torch.nn.ReLU(),
+            torch.nn.Linear(50, 20),
+            torch.nn.ReLU(),
+            torch.nn.Linear(20, 10),
+            torch.nn.ReLU(),
+            torch.nn.Linear(10, 1),
+        )
+
+        # Split data
+        data_train, data_val, data_test = geospaNN.split_data(X, Y, coord, neighbor_size=nn, test_proportion=0.1)
+        theta0 = geospaNN.theta_update(torch.tensor([1, 1.5, 0.01]), mlp(data_train.x).squeeze() - data_train.y, data_train.pos, neighbor_size=5)
+        model = geospaNN.nngls(p=p, neighbor_size=nn, coord_dimensions=2, mlp=mlp, theta=torch.tensor(theta0))
+        nngls_model = geospaNN.nngls_train(model, lr=0.01, min_delta=0.001)
+        # Log training process
+        training_log = nngls_model.train(data_train, data_val, data_test, Update_init=10, Update_step=10)
     elif model_name == "xgboost":
         raise NotImplementedError
     else:
```
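
Since this is the most complete geospaNN integration in the release, here is the same call sequence assembled into a self-contained sketch on synthetic data. It is reconstructed from the hunk above rather than from geospaNN's documentation, so treat the signatures (`make_graph`, `split_data`, `theta_update`, `nngls`, `nngls_train`) as version-dependent assumptions:

```python
import torch
import geospaNN  # the package this release integrates; API may vary

torch.manual_seed(0)

n, p, nn = 500, 5, 5               # samples, features, nearest neighbors
X = torch.randn(n, p)
coord = torch.rand(n, 2) * 10      # 2-D spatial coordinates
Y = X @ torch.randn(p) + 0.1 * torch.randn(n)

# Nearest-neighbor graph linking each sample to its nn closest sites.
data = geospaNN.make_graph(X, Y, coord, nn)

# Same architecture as the hunk above: p -> 50 -> 20 -> 10 -> 1.
mlp = torch.nn.Sequential(
    torch.nn.Linear(p, 50), torch.nn.ReLU(),
    torch.nn.Linear(50, 20), torch.nn.ReLU(),
    torch.nn.Linear(20, 10), torch.nn.ReLU(),
    torch.nn.Linear(10, 1),
)

data_train, data_val, data_test = geospaNN.split_data(
    X, Y, coord, neighbor_size=nn, test_proportion=0.1
)

# Initial spatial covariance parameters from the raw MLP residuals.
theta0 = geospaNN.theta_update(
    torch.tensor([1, 1.5, 0.01]),
    mlp(data_train.x).squeeze() - data_train.y,
    data_train.pos,
    neighbor_size=nn,
)
model = geospaNN.nngls(
    p=p, neighbor_size=nn, coord_dimensions=2, mlp=mlp, theta=torch.tensor(theta0)
)
trainer = geospaNN.nngls_train(model, lr=0.01, min_delta=0.001)
training_log = trainer.train(
    data_train, data_val, data_test, Update_init=10, Update_step=10
)
```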
```diff
--- geocif-0.1.32/geocif/playground/misc.py
+++ geocif-0.1.34/geocif/playground/misc.py
@@ -1,6 +1,76 @@
-import
+import geopandas as gpd
+import pygmt
 import matplotlib.pyplot as plt
-
+from matplotlib.lines import Line2D
+import matplotlib.patches as mpatches
+import os
+filtered_shapefile_path = r"D:\Users\ritvik\projects\GEOGLAM\Input\Global_Datasets\Regions\Shps\filtered_shapefile5.shp"
+
+if not os.path.isfile(filtered_shapefile_path):
+
+    # Load the shapefile using GeoPandas
+    shapefile_path = r"D:\Users\ritvik\projects\GEOGLAM\Input\Global_Datasets\Regions\Shps\adm_shapefile.shp"
+    gdf = gpd.read_file(shapefile_path, engine="pyogrio")
+
+    # Only keep one row per ADMIN0
+    gdf = gdf.drop_duplicates(subset="ADMIN0")
+
+    sh2_path = r"D:\Users\ritvik\projects\GEOGLAM\Input\Global_Datasets\Regions\Shps\Level_1.shp"
+    gdf2 = gpd.read_file(sh2_path, engine="pyogrio")
+
+    # Subset gdf2 to USA, Pakistan and Afghanistan
+    gdf2 = gdf2[gdf2["ADM0_NAME"].isin(["United States of America"])]
+
+    # Exclude Alska and Hawaii from the USA
+    gdf2 = gdf2[~gdf2["ADM1_NAME"].isin(["Alaska", "Hawaii"])]
+
+    # Now combine all the states into one polygon
+    gdf2 = gdf2.dissolve(by="ADM0_NAME")
+    gdf2 = gdf2.reset_index()
+
+    # Rename ADM0_NAME to ADMIN0 for consistency
+    gdf2.rename(columns={"ADM0_NAME": "ADMIN0"}, inplace=True)
+
+    # Only keep ADMIN0 and geometry columns in gdf and gdf2
+    gdf = gdf[["ADMIN0", "geometry"]]
+    gdf2 = gdf2[["ADMIN0", "geometry"]]
+
+    # Merge gdf and gdf2
+    import pandas as pd
+    gdf = pd.concat([gdf, gdf2], ignore_index=True)
+
+    # Save the filtered shapefile as a temporary file
+
+    gdf.to_file(filtered_shapefile_path)
+else:
+    gdf = gpd.read_file(filtered_shapefile_path, engine="pyogrio")
+
+# Create the global map with highlighted countries
+fig = pygmt.Figure()
+
+# Define the region of interest and projection
+# fig.basemap(region="g", projection="R12c/20", frame=True)
+fig.basemap(region=[-135, 60, -35, 53], projection="Q12c", frame=True)
+
+# Use the coast function to draw land and water
+fig.coast(land="lightgray", water="lightcyan")
+
+# Highlight the countries using the filtered shapefile
+fig.plot(data=filtered_shapefile_path, pen="0.35p,black")
+
+# Add hatches to Pakistan and Afghanistan
+gdf_filled = gdf[gdf["ADMIN0"].isin(["Pakistan", "Afghanistan"])]
+for _, row in gdf_filled.iterrows():
+    fill_gdf = gpd.GeoDataFrame([row], columns=gdf.columns)
+    with pygmt.helpers.GMTTempFile() as tmpfile:
+        fill_gdf.to_file(tmpfile.name, driver="GeoJSON")
+        fig.plot(data=tmpfile.name, pen="0.35p,black", fill="black@50+h")
+
+# Save the figure
+fig.savefig("global_choropleth_highlighted_v1.png", dpi=1000)
+
+# Show the figure
+fig.show()
 
 import matplotlib.pyplot as plt
 import cartopy.crs as ccrs
```
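
The temp-GeoJSON round trip in the loop above is one way to hand individual geometries to GMT. Recent pygmt releases can also take a GeoDataFrame directly as `data`, which would remove the temp-file dance; a hedged sketch (verify against the pinned pygmt version, since geopandas support and fill-pattern syntax vary across releases):

```python
import geopandas as gpd
import pygmt

gdf = gpd.read_file("adm_shapefile.shp")  # illustrative path
fill = gdf[gdf["ADMIN0"].isin(["Pakistan", "Afghanistan"])]

fig = pygmt.Figure()
fig.basemap(region=[-135, 60, -35, 53], projection="Q12c", frame=True)
fig.coast(land="lightgray", water="lightcyan")
fig.plot(data=gdf, pen="0.35p,black")                    # GeoDataFrame passed directly
fig.plot(data=fill, pen="0.35p,black", fill="black@50")  # @50 = 50% transparency
fig.savefig("highlighted.png", dpi=300)
```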
```diff
--- geocif-0.1.32/geocif.egg-info/SOURCES.txt
+++ geocif-0.1.34/geocif.egg-info/SOURCES.txt
@@ -8,6 +8,7 @@ geocif/__init__.py
 geocif/analysis.py
 geocif/geocif.py
 geocif/indices_runner.py
+geocif/indices_runner_v2.py
 geocif/logger.py
 geocif/utils.py
 geocif.egg-info/PKG-INFO
@@ -31,6 +32,7 @@ geocif/cei/definitions.py
 geocif/cei/indices.py
 geocif/ml/__init__.py
 geocif/ml/correlations.py
+geocif/ml/correlations_backup.py
 geocif/ml/embedding.py
 geocif/ml/feature_engineering.py
 geocif/ml/feature_selection.py
```