geocif 0.1.48__tar.gz → 0.1.49__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {geocif-0.1.48/geocif.egg-info → geocif-0.1.49}/PKG-INFO +1 -1
- {geocif-0.1.48 → geocif-0.1.49}/geocif/geocif.py +63 -19
- geocif-0.1.49/geocif/ml/aa.py +28 -0
- {geocif-0.1.48 → geocif-0.1.49}/geocif/ml/feature_selection.py +10 -0
- {geocif-0.1.48 → geocif-0.1.49}/geocif/ml/output.py +13 -13
- {geocif-0.1.48 → geocif-0.1.49}/geocif/ml/stats.py +7 -1
- {geocif-0.1.48 → geocif-0.1.49}/geocif/ml/trainers.py +27 -22
- {geocif-0.1.48 → geocif-0.1.49}/geocif/ml/trend.py +32 -11
- geocif-0.1.49/geocif/viz/misc.py +55 -0
- {geocif-0.1.48 → geocif-0.1.49/geocif.egg-info}/PKG-INFO +1 -1
- {geocif-0.1.48 → geocif-0.1.49}/geocif.egg-info/SOURCES.txt +2 -0
- {geocif-0.1.48 → geocif-0.1.49}/setup.py +1 -1
- {geocif-0.1.48 → geocif-0.1.49}/LICENSE +0 -0
- {geocif-0.1.48 → geocif-0.1.49}/MANIFEST.in +0 -0
- {geocif-0.1.48 → geocif-0.1.49}/README.md +0 -0
- {geocif-0.1.48 → geocif-0.1.49}/geocif/__init__.py +0 -0
- {geocif-0.1.48 → geocif-0.1.49}/geocif/agmet/__init__.py +0 -0
- {geocif-0.1.48 → geocif-0.1.49}/geocif/agmet/geoagmet.py +0 -0
- {geocif-0.1.48 → geocif-0.1.49}/geocif/agmet/plot.py +0 -0
- {geocif-0.1.48 → geocif-0.1.49}/geocif/agmet/utils.py +0 -0
- {geocif-0.1.48 → geocif-0.1.49}/geocif/analysis.py +0 -0
- {geocif-0.1.48 → geocif-0.1.49}/geocif/backup/__init__.py +0 -0
- {geocif-0.1.48 → geocif-0.1.49}/geocif/backup/constants.py +0 -0
- {geocif-0.1.48 → geocif-0.1.49}/geocif/backup/features.py +0 -0
- {geocif-0.1.48 → geocif-0.1.49}/geocif/backup/geo.py +0 -0
- {geocif-0.1.48 → geocif-0.1.49}/geocif/backup/geocif.py +0 -0
- {geocif-0.1.48 → geocif-0.1.49}/geocif/backup/metadata.py +0 -0
- {geocif-0.1.48 → geocif-0.1.49}/geocif/backup/models.py +0 -0
- {geocif-0.1.48 → geocif-0.1.49}/geocif/cei/__init__.py +0 -0
- {geocif-0.1.48 → geocif-0.1.49}/geocif/cei/definitions.py +0 -0
- {geocif-0.1.48 → geocif-0.1.49}/geocif/cei/indices.py +0 -0
- {geocif-0.1.48 → geocif-0.1.49}/geocif/experiments.py +0 -0
- {geocif-0.1.48 → geocif-0.1.49}/geocif/indices_runner.py +0 -0
- {geocif-0.1.48 → geocif-0.1.49}/geocif/indices_runner_v2.py +0 -0
- {geocif-0.1.48 → geocif-0.1.49}/geocif/logger.py +0 -0
- {geocif-0.1.48 → geocif-0.1.49}/geocif/ml/__init__.py +0 -0
- {geocif-0.1.48 → geocif-0.1.49}/geocif/ml/correlations.py +0 -0
- {geocif-0.1.48 → geocif-0.1.49}/geocif/ml/embedding.py +0 -0
- {geocif-0.1.48 → geocif-0.1.49}/geocif/ml/feature_engineering.py +0 -0
- {geocif-0.1.48 → geocif-0.1.49}/geocif/ml/misc.py +0 -0
- {geocif-0.1.48 → geocif-0.1.49}/geocif/ml/outliers.py +0 -0
- {geocif-0.1.48 → geocif-0.1.49}/geocif/ml/outlook.py +0 -0
- {geocif-0.1.48 → geocif-0.1.49}/geocif/ml/spatial_autocorrelation.py +0 -0
- {geocif-0.1.48 → geocif-0.1.49}/geocif/ml/stages.py +0 -0
- {geocif-0.1.48 → geocif-0.1.49}/geocif/ml/xai.py +0 -0
- {geocif-0.1.48 → geocif-0.1.49}/geocif/playground/__init__.py +0 -0
- {geocif-0.1.48 → geocif-0.1.49}/geocif/playground/automl.py +0 -0
- {geocif-0.1.48 → geocif-0.1.49}/geocif/playground/misc.py +0 -0
- {geocif-0.1.48 → geocif-0.1.49}/geocif/utils.py +0 -0
- {geocif-0.1.48 → geocif-0.1.49}/geocif/viz/__init__.py +0 -0
- {geocif-0.1.48 → geocif-0.1.49}/geocif/viz/plot.py +0 -0
- {geocif-0.1.48 → geocif-0.1.49}/geocif.egg-info/dependency_links.txt +0 -0
- {geocif-0.1.48 → geocif-0.1.49}/geocif.egg-info/not-zip-safe +0 -0
- {geocif-0.1.48 → geocif-0.1.49}/geocif.egg-info/top_level.txt +0 -0
- {geocif-0.1.48 → geocif-0.1.49}/requirements.txt +0 -0
- {geocif-0.1.48 → geocif-0.1.49}/setup.cfg +0 -0
- {geocif-0.1.48 → geocif-0.1.49}/tests/test_geocif.py +0 -0
{geocif-0.1.48 → geocif-0.1.49}/geocif/geocif.py

@@ -198,9 +198,6 @@ class Geocif:
 
         self.db_path = self.dir_db / self.db_forecasts
 
-        # Store config file in database
-        output.config_to_db(self.db_path, self.parser, self.today)
-
         # self.pickle_file = self.base_dir / self.parser.get("outlook", "pickle_file")
         # obj_pickle = outlook.Outlook(self.pickle_file)
         # self.df_outlook = obj_pickle.read_outlook_file()
@@ -221,6 +218,9 @@ class Geocif:
             f"Detrended {self.target}" if self.check_yield_trend else self.target
         )
 
+        # Drop rows where target_col is NaN
+        df_region = df_region.dropna(subset=[target_col])
+
         X_train = df_region[self.feature_names]
         # Drop any columns with NaNs
         X_train = X_train.dropna(axis=1, how="any")
@@ -280,7 +280,7 @@ class Geocif:
                 X_train_scaled,
                 y_train,
                 feature_names=self.selected_features,
-                target_col=
+                target_col=target_col,
                 optimize=self.optimize,
                 fraction_loocv=self.fraction_loocv,
                 cat_features=self.cat_features,
@@ -302,8 +302,13 @@ class Geocif:
                 verbose=False,
                 # callbacks=[TQDMCallback(self.best_hyperparams["iterations"])],
             )
-        elif self.model_name
+        elif self.model_name in ["oblique"]:
             self.model.fit(X_train, y_train)
+        elif self.model_name == "ydf":
+            # Combine X_train and y_train
+            df_train = pd.concat([X_train, y_train], axis=1)
+
+            self.model = self.model.train(df_train)
         elif self.model_name == "geospaNN":
             self.model.fit(
                 X_train,
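Note: the new "ydf" branch reflects that Yggdrasil Decision Forests learners consume a single DataFrame containing the label column, rather than sklearn's fit(X, y) split. A minimal sketch of that convention on toy data (column names are hypothetical, not from this package):

```python
import pandas as pd
import ydf

# Features and the label live in one DataFrame, which is what ydf
# learners expect (unlike sklearn's fit(X, y) convention).
df_train = pd.DataFrame({
    "f1": [1.0, 2.0, 3.0, 4.0],
    "f2": [0.1, 0.2, 0.3, 0.4],
    "Yield": [2.5, 3.1, 3.9, 4.6],  # hypothetical target column
})
learner = ydf.GradientBoostedTreesLearner(label="Yield", task=ydf.Task.REGRESSION)
model = learner.train(df_train)  # train() returns the fitted model
print(model.predict(df_train))   # one prediction per row
```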
@@ -335,9 +340,16 @@ class Geocif:
         ]:
             from sklearn.preprocessing import StandardScaler, LabelEncoder
 
+            if self.model_name == "cumulative_1":
+                num_columns = 1
+            elif self.model_name == "cumulative_2":
+                num_columns = 2
+            elif self.model_name == "cumulative_3":
+                num_columns = 3
+
             # Standardize the numeric features
             scaler = StandardScaler()
-            X_numeric = X_train.iloc[:, :
+            X_numeric = X_train.iloc[:, :num_columns]
             X_scaled_numeric = pd.DataFrame(
                 scaler.fit_transform(X_numeric),
                 columns=X_numeric.columns,
@@ -409,9 +421,16 @@ class Geocif:
         elif self.model_name in ["cumulative_1", "cumulative_2", "cumulative_3"]:
             from sklearn.preprocessing import StandardScaler, LabelEncoder
 
+            if self.model_name == "cumulative_1":
+                num_columns = 1
+            elif self.model_name == "cumulative_2":
+                num_columns = 2
+            elif self.model_name == "cumulative_3":
+                num_columns = 3
+
             # Standardize the numeric features
             scaler = StandardScaler()
-            X_numeric = X_test.iloc[:, :
+            X_numeric = X_test.iloc[:, :num_columns]
             try:
                 X_scaled_numeric = pd.DataFrame(
                     scaler.fit_transform(X_numeric),
@@ -455,7 +474,9 @@ class Geocif:
                 self.selected_features + self.cat_features + [self.target]
             ]
             w_train = data_train.y - self.estimate(data_train.x)
-
+        elif self.model_name == "ydf":
+            y_pred = self.model.evaluate(X_test)
+            best_hyperparameters = {}
         else:
             y_pred = self.model.predict(X_test)
             best_hyperparameters = self.model.get_params().copy()
@@ -468,8 +489,8 @@ class Geocif:
 
                 obj_trend = trend.DetrendedData(
                     df_tmp[f"Detrended {self.target}"],
-                    df_tmp["
-                    df_tmp["
+                    df_tmp["Detrended Model"],
+                    df_tmp["Detrended Model Type"],
                 )
 
                 # Retrend the predicted yield
@@ -477,6 +498,8 @@ class Geocif:
                     obj_trend, df_region.iloc[idx][["Harvest Year"]]
                 )[0]
 
+                df_region.loc[idx, "Detrended Model Type"] = obj_trend.model_type.unique()[0]
+
         # Create a dataframe with forecast results
         shp = len(X_test)
         experiment_id = f"{self.country}_{self.crop}"
@@ -530,7 +553,6 @@ class Geocif:
 
         if self.check_yield_trend:
             df.loc[:, "Detrended Model Type"] = df_region["Detrended Model Type"].values
-            df.loc[:, "Detrended Model"] = df_region["Detrended Model"].values
 
         if self.last_year_yield_as_feature:
             # Add last year yield to dataframe
@@ -729,7 +751,7 @@ class Geocif:
             + ["Region_ID"]
         )
         if self.check_yield_trend:
-            common_columns += ["Detrended Model Type", "Detrended Model"]
+            common_columns += [f"Detrended {self.target}", "Detrended Model Type", "Detrended Model"]
 
         if self.last_year_yield_as_feature:
             common_columns += [f"Last Year {self.target}"]
@@ -738,11 +760,15 @@ class Geocif:
         # Filter dataframe based on region and self.feature_names
         df_region_train = self.df_train[mask_train]
         df_region_train = df_region_train[self.fixed_columns + common_columns]
+        df_region_train.reset_index(drop=True, inplace=True)
         self.train(df_region_train, scaler)
 
         """ Predict """
+        if self.check_yield_trend:
+            common_columns = common_columns[:-3]
         df_region_test = self.df_test[mask_test]
         df_region_test = df_region_test[self.fixed_columns + common_columns]
+        df_region_test.reset_index(drop=True, inplace=True)
         experiment_id, df = self.predict(df_region_test, scaler)
         # df.reset_index(inplace=True)
 
@@ -897,12 +923,12 @@ class Geocif:
 
         if self.lag_yield_as_feature:
             df = fe.compute_lag_yield(
-                df, self.all_seasons_with_yield, self.number_lag_years
+                df, self.all_seasons_with_yield, self.number_lag_years, self.target
             )
 
         if self.analogous_year_yield_as_feature:
             df = fe.compute_analogous_yield(
-                df, self.all_seasons_with_yield, self.number_median_years
+                df, self.all_seasons_with_yield, self.number_median_years, self.target
             )
 
         # Create Region_ID column based on Region column category code
@@ -912,7 +938,7 @@ class Geocif:
         elif self.cluster_strategy == "individual":
             df["Region_ID"] = df["Region"].cat.codes
         elif self.cluster_strategy == "auto_detect":
-            clusters_assigned = fe.detect_clusters(df)
+            clusters_assigned = fe.detect_clusters(df, self.target)
             # Merge the cluster labels with the original DataFrame
             df = df.merge(clusters_assigned, on="Region")
 
@@ -1036,8 +1062,8 @@ class Geocif:
 
         """ Groupby Region column and compute detrended yield """
         self.df_train[f"Detrended {self.target}"] = np.NaN
-        self.df_train["
-        self.df_train["
+        self.df_train["Detrended Model"] = np.NaN
+        self.df_train["Detrended Model Type"] = np.NaN
         if self.check_yield_trend:
             group_by = ["Region"]
             groups = self.df_train.groupby(group_by)
@@ -1050,10 +1076,10 @@ class Geocif:
                     group.index, f"Detrended {self.target}"
                 ] = detrended_data.detrended_series
                 self.df_train.loc[
-                    group.index, "
+                    group.index, "Detrended Model"
                 ] = detrended_data.trend_model
                 self.df_train.loc[
-                    group.index, "
+                    group.index, "Detrended Model Type"
                 ] = detrended_data.model_type
 
         # 6. Exclude years without yields from df_train
@@ -1122,6 +1148,15 @@ class Geocif:
             self.cluster_strategy = "single"
             self.select_cei_by = "Index"
             self.use_cumulative_features = True
+        elif self.model_name in ["oblique", "ydf"]:
+            self.do_xai = False
+            self.estimate_ci = False
+            # Remove Region from cat_features as it is object type
+            self.cat_features = [col for col in self.cat_features if col != "Region"]
+            # if self.model_name == "ydf":
+            #     # HACK, for ydf model, target_col is Yield
+            #     self.df_results.rename(columns={self.target: "Yield"}, inplace=True)
+            #     self.target = "Yield"
         else:
             self.do_xai = self.parser.getboolean("ML", "do_xai")
             self.estimate_ci = self.parser.getboolean("ML", "estimate_ci")
@@ -1188,6 +1223,9 @@ class Geocif:
             self.dg["Country Region"] = (
                 self.dg["ADM0_NAME"] + " " + self.dg["ADM1_NAME"]
             )
+        elif self.country == "illinois":
+            self.dg["ADM0_NAME"] = "illinois"
+            self.dg["Country Region"] = self.dg["ADM0_NAME"] + " " + self.dg["NAME"]
         else:
             self.dg["Country Region"] = (
                 self.dg["ADM0_NAME"] + " " + self.dg["ADM2_NAME"]
@@ -1240,6 +1278,9 @@ class Geocif:
         # TODO ignore file with _2000 in its name
         all_files = [f for f in all_files if "_2000" not in f.name]
 
+        # Assert that all_files is not empty
+        assert all_files, f"No files found in {_dir_country} with {file_name}"
+
         self.df_results = pd.concat(
             (pd.read_csv(f) for f in all_files), ignore_index=True
         )
@@ -1293,6 +1334,9 @@ def loop_execute(inputs):
     obj = Geocif(logger=logger, parser=parser)
     obj.read_data(country, crop, season)
 
+    # Store config file in database
+    output.config_to_db(obj.db_path, obj.parser, obj.today)
+
     # Setup metadata and run ML code
     obj.setup(season, model)
     if obj.simulation_stages:
geocif-0.1.49/geocif/ml/aa.py

@@ -0,0 +1,28 @@
+import ydf
+import pandas as pd
+
+# Load dataset with Pandas
+ds_path = "https://raw.githubusercontent.com/google/yggdrasil-decision-forests/main/yggdrasil_decision_forests/test_data/dataset/"
+train_ds = pd.read_csv(ds_path + "adult_train.csv")
+test_ds = pd.read_csv(ds_path + "adult_test.csv")
+
+# Train a Gradient Boosted Trees model
+model = ydf.GradientBoostedTreesLearner(label="income").train(train_ds)
+
+# Look at a model (input features, training logs, structure, etc.)
+model.describe()
+
+# Evaluate a model (e.g. roc, accuracy, confusion matrix, confidence intervals)
+model.evaluate(test_ds)
+
+# Generate predictions
+model.predict(test_ds)
+
+# Analyse a model (e.g. partial dependence plot, variable importance)
+model.analyze(test_ds)
+
+# Benchmark the inference speed of a model
+model.benchmark(test_ds)
+
+# Save the model
+model.save("/tmp/my_model")
{geocif-0.1.48 → geocif-0.1.49}/geocif/ml/feature_selection.py

@@ -131,6 +131,16 @@ def select_features(X, y, method="RFE", min_features_to_select=3):
 
         X_filtered = selector.fit_transform(X, y)
         selected_features = X_filtered.columns.tolist()
+    elif method == "mrmr":
+        from mrmr import mrmr_regression
+
+        try:
+            selected_features = mrmr_regression(X=X, y=y, K=10)
+        except:
+            breakpoint()
+        # combine X and y into a dataframe
+        # df = pd.concat([X, y], axis=1)
+
     elif method == "RFECV":
         from sklearn.feature_selection import RFECV
         from sklearn.model_selection import KFold
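Note: for context on the new "mrmr" branch, mrmr_regression from the mrmr_selection package ranks features by minimum redundancy / maximum relevance and returns the top-K column names. A self-contained sketch on synthetic data (the data and column names here are illustrative only):

```python
import pandas as pd
from sklearn.datasets import make_regression
from mrmr import mrmr_regression  # pip install mrmr_selection

# Synthetic data purely to illustrate the call used above
X_arr, y_arr = make_regression(n_samples=200, n_features=20, random_state=0)
X = pd.DataFrame(X_arr, columns=[f"f{i}" for i in range(20)])
y = pd.Series(y_arr)

selected = mrmr_regression(X=X, y=y, K=10)
print(selected)  # list of 10 column names, most relevant first
```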
{geocif-0.1.48 → geocif-0.1.49}/geocif/ml/output.py

@@ -109,21 +109,21 @@ def store(db_path, experiment_id, df, model, model_name):
     except Exception as e:
         print(f"Error: {e}")
 
-    index_columns = ["Country", "Region", "Crop", "Harvest Year", "Stages"]
-    # Output model pickle as a blob to database
-    df_model = pd.DataFrame(
-        {
-            "Experiment_ID": [experiment_id],
-            "Model": [model_name],
-            "Model_Blob": [pickle.dumps(model)],
-        }
-    )
-    # df_model.index = df_model.apply(
-    #     lambda row: "_".join([str(row[col]) for col in index_columns]), axis=1
-    # )
-
     # name the index level
     try:
+        index_columns = ["Country", "Region", "Crop", "Harvest Year", "Stages"]
+        # Output model pickle as a blob to database
+        df_model = pd.DataFrame(
+            {
+                "Experiment_ID": [experiment_id],
+                "Model": [model_name],
+                "Model_Blob": [pickle.dumps(model)],
+            }
+        )
+        # df_model.index = df_model.apply(
+        #     lambda row: "_".join([str(row[col]) for col in index_columns]), axis=1
+        # )
+
         df_model.index.set_names(["Index"], inplace=True)
         utils.to_db(db_path, "models", df_model)
     except Exception as e:
{geocif-0.1.48 → geocif-0.1.49}/geocif/ml/stats.py

@@ -191,7 +191,12 @@ def add_statistics(
 
     """
    # First check if country and crop are in the admin_crop_production.csv file
-
+    if country == "Afghanistan":
+        fn = "afghanistan.csv"
+    elif country == "Illinois":
+        fn = "illinois.csv"
+    else:
+        fn = "adm_crop_production.csv"
     df_fewsnet = pd.read_csv(dir_stats / fn, low_memory=False)
 
     # HACK
@@ -206,6 +211,7 @@ def add_statistics(
         df = add_GEOGLAM_statistics(dir_stats, df, stats, method, admin_zone)
     else:
         group_by = ["Region", "Harvest Year"]
+
         groups = df.groupby(group_by)
 
         # Define processing for each group
{geocif-0.1.48 → geocif-0.1.49}/geocif/ml/trainers.py

@@ -264,8 +264,7 @@ def auto_train(
     if model_name in ["catboost", "merf"]:
         hyperparams = {
             "depth": 6,
-
-            "iterations": 5000,
+
             "subsample": 1.0,
             "random_strength": 0.5,
             "reg_lambda": 0.001,
@@ -283,18 +282,33 @@ def auto_train(
         regr = CatBoostRegressor(**hyperparams, cat_features=cat_features)
         model = MERF(regr, max_iterations=10)
     elif model_name == "oblique":
-
-
+        from treeple import ExtraObliqueRandomForestRegressor
+
+        # https://docs.neurodata.io/treeple/dev/modules/supervised_tree.html#oblique-trees
+        n_features = X_train.shape[1]
 
-
-
-
-
+        model = ExtraObliqueRandomForestRegressor(
+            n_estimators=1500,
+            max_depth=20,
+            max_features=n_features**2,
+            feature_combinations=n_features,
             n_jobs=-1,
-            verbose=2,
             random_state=42,
         )
-
+    elif model_name == "ydf":
+        import ydf
+        templates = ydf.GradientBoostedTreesLearner.hyperparameter_templates()
+
+        model = ydf.GradientBoostedTreesLearner(
+            label=target_col,
+            task=ydf.Task.REGRESSION,
+            growing_strategy='BEST_FIRST_GLOBAL',
+            categorical_algorithm='RANDOM',
+            split_axis='SPARSE_OBLIQUE',
+            sparse_oblique_normalization='MIN_MAX',
+            sparse_oblique_num_projections_exponent=2.0)
+
+        hyperparams = templates["benchmark_rank1v1"]
     elif model_name == "linear":
         from sklearn.linear_model import LassoCV
 
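Note: the ydf branch above looks up a named hyper-parameter template. A short sketch of inspecting that same mapping, assuming the current ydf API (printing only, no training):

```python
import ydf

# List the named hyper-parameter templates shipped with ydf; the branch
# above selects "benchmark_rank1v1" from this same mapping.
templates = ydf.GradientBoostedTreesLearner.hyperparameter_templates()
print(sorted(templates))               # available template names
print(templates["benchmark_rank1v1"])  # the values the diff selects
```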
@@ -308,24 +322,15 @@ def auto_train(
     elif model_name == "cumulative_1":
         from pygam import GAM, s, f, te
 
-
-        region_idx = X_train.columns.get_loc("Region")
-
-        model = GAM(s(0) + f(region_idx))
+        model = GAM(s(0) + f(1))
     elif model_name == "cumulative_2":
         from pygam import GAM, s, f, te
 
-
-        region_idx = X_train.columns.get_loc("Region")
-
-        model = GAM(s(0) + s(1) + te(0, 1) + f(region_idx))
+        model = GAM(s(0) + s(1) + te(0, 1) + f(2))
     elif model_name == "cumulative_3":
         from pygam import GAM, s, f, te
 
-
-        region_idx = X_train.columns.get_loc("Region")
-
-        model = GAM(s(0) + s(1) + s(2) + te(0, 1) + te(0, 2) + te(1, 2) + f(region_idx))
+        model = GAM(s(0) + s(1) + s(2) + te(0, 1) + te(0, 2) + te(1, 2) + f(3))
     elif model_name == "geospaNN":
         import torch
         import geospaNN
|
@@ -1,4 +1,5 @@
|
|
1
1
|
import numpy as np
|
2
|
+
import pandas as pd
|
2
3
|
from statsmodels.regression.linear_model import OLS
|
3
4
|
from statsmodels.tools.tools import add_constant
|
4
5
|
|
@@ -6,7 +7,7 @@ from statsmodels.tools.tools import add_constant
 class DetrendedData:
     """
     A class to store the detrended series, the model used for detrending,
-    and the type of model ('mean', 'linear', 'quadratic').
+    and the type of model ('mean', 'linear', 'quadratic', 'difference').
     """
 
     def __init__(self, detrended_series, trend_model, model_type):
@@ -15,14 +16,16 @@ class DetrendedData:
         self.model_type = model_type
 
 
-def detrend_dataframe(df, column_name="y"):
+def detrend_dataframe(df, column_name="y", model_type="best"):
     """
-    Removes the trend from the specified column of a DataFrame using the method
-    (mean, linear, quadratic) that results in the lowest AIC value.
+    Removes the trend from the specified column of a DataFrame using the specified method
+    (mean, linear, quadratic, difference) or the method that results in the lowest AIC value.
 
     Parameters:
     - df: pandas DataFrame containing the time series data.
     - column_name: string name of the column to detrend.
+    - model_type: string specifying which model to use for detrending ('mean', 'linear',
+      'quadratic', 'difference', or 'best' for automatic selection based on AIC).
 
     Returns:
     - DetrendedData object containing the detrended series, the statistical model,
@@ -41,16 +44,32 @@ def detrend_dataframe(df, column_name="y"):
     X_quad = add_constant(np.column_stack((df["t"], df["t"] ** 2)))
     quad_model = OLS(df[column_name], X_quad).fit()
 
-
-
+    # Differencing method
+    diff_series = df[column_name].diff().dropna()
+    diff_model = OLS(diff_series, np.ones(len(diff_series))).fit()
+
+    models = {
+        "mean": mean_model,
+        "linear": linear_model,
+        "quadratic": quad_model,
+        "difference": diff_model
+    }
+
+    if model_type == "best":
+        best_model_type = min(models, key=lambda x: models[x].aic)
+    else:
+        best_model_type = model_type
+
     best_model = models[best_model_type]
 
     if best_model_type == "mean":
         detrended = df[column_name] - mean_model.predict(np.ones(len(df)))
     elif best_model_type == "linear":
         detrended = df[column_name] - linear_model.predict(X_linear)
-
+    elif best_model_type == "quadratic":
         detrended = df[column_name] - quad_model.predict(X_quad)
+    else:  # difference
+        detrended = df[column_name].diff().dropna()
 
     return DetrendedData(detrended, best_model, best_model_type)
 
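Note: a short usage sketch of the extended signature, assuming the module path geocif/ml/trend.py from this package and a "t" time column as used by the OLS fits above (the series itself is made up):

```python
import numpy as np
import pandas as pd
from geocif.ml.trend import detrend_dataframe  # path per this package's layout

rng = np.random.default_rng(1)
df = pd.DataFrame({"t": np.arange(20)})
df["y"] = 1.0 + 0.05 * df["t"] + rng.normal(scale=0.02, size=20)

best = detrend_dataframe(df, column_name="y")  # lowest-AIC model, as before
forced = detrend_dataframe(df, column_name="y", model_type="difference")
print(best.model_type)                  # likely "linear" for this series
print(forced.detrended_series.head())   # first differences, one fewer row
```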
@@ -67,11 +86,10 @@ def compute_trend(detrended_data, future_time_points=None):
     Returns:
     - The retrended series as a pandas Series.
     """
-    # if future_time_points is not of type pandas dataframe then convert it to one
     future_time_points = np.array(future_time_points)
 
-    model_type = detrended_data.model_type[0]
-    model = detrended_data.trend_model[0]
+    model_type = detrended_data.model_type.unique()[0]
+    model = detrended_data.trend_model.unique()[0]
 
     if model_type == "mean":
         trend_component = model.predict(
@@ -80,11 +98,14 @@ def compute_trend(detrended_data, future_time_points=None):
     elif model_type == "linear":
         X_linear = add_constant(future_time_points, has_constant="add")
         trend_component = model.predict(X_linear)
-
+    elif model_type == "quadratic":
         X_quad = add_constant(
             np.column_stack((future_time_points, future_time_points**2)),
             has_constant="add",
         )
         trend_component = model.predict(X_quad)
+    else:  # difference
+        trend_component = pd.Series(np.nan, index=future_time_points)
+        trend_component.iloc[0] = model.params[0]  # Add mean of differenced series
 
     return trend_component
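Note: the new "difference" retrend branch fills only the first requested point with the mean year-over-year step (the intercept of the OLS-on-ones fit) and leaves later points NaN. A plain pandas/numpy sketch of that behavior, with a made-up step value standing in for model.params[0]:

```python
import numpy as np
import pandas as pd

mean_step = 0.12  # stands in for model.params[0], the mean differenced step
future_time_points = np.array([2021, 2022, 2023])

trend_component = pd.Series(np.nan, index=future_time_points)
trend_component.iloc[0] = mean_step  # only the first point gets the step
print(trend_component)
```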
geocif-0.1.49/geocif/viz/misc.py

@@ -0,0 +1,55 @@
+import pandas as pd
+import hvplot.pandas
+import panel as pn
+
+# Load the CSV file
+file_path = r'D:\Users\ritvik\projects\GEOGLAM\Output\fao\regional_cei_slope.csv'
+data = pd.read_csv(file_path)
+
+# Extract unique values for dropdowns
+countries = data['Country'].unique().tolist()
+
+# Create dropdown widgets
+country_dropdown = pn.widgets.Select(name='Country', options=countries)
+region_dropdown = pn.widgets.Select(name='Region', options=[])
+crop_dropdown = pn.widgets.Select(name='Crop', options=[])
+season_dropdown = pn.widgets.Select(name='Season', options=data['Season'].unique().tolist())
+
+
+# Function to update region and crop options based on selected country
+@pn.depends(country_dropdown.param.value, watch=True)
+def update_region_and_crop_options(country):
+    filtered_data = data[data['Country'] == country]
+    regions = filtered_data['Region'].unique().tolist()
+    crops = filtered_data['Crop'].unique().tolist()
+
+    region_dropdown.options = regions
+    crop_dropdown.options = crops
+
+
+# Function to filter data based on dropdown selections
+@pn.depends(country_dropdown.param.value, region_dropdown.param.value, crop_dropdown.param.value,
+            season_dropdown.param.value)
+def update_plot(country, region, crop, season):
+    filtered_data = data[(data['Country'] == country) &
+                         (data['Region'] == region) &
+                         (data['Crop'] == crop) &
+                         (data['Season'] == season)]
+
+    if not filtered_data.empty:
+        plot = filtered_data.hvplot.scatter(x='Slope', y='Intercept',
+                                            hover_cols=['Growth Stage', 'p-value', 'Index', 'Description'])
+        return plot
+    else:
+        return pn.pane.Markdown("No data available for the selected combination.")
+
+
+# Create the dashboard
+dashboard = pn.Column(
+    pn.Row(country_dropdown, region_dropdown, crop_dropdown, season_dropdown),
+    update_plot
+)
+
+# Save as html page
+dashboard.save('dashboard.html', embed=True)
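Note: the script above embeds the widget state into a static HTML file. For live callbacks, Panel can also serve the app; a minimal sketch of that path (the app here is a hypothetical stand-in, not the dashboard above):

```python
import panel as pn

# Minimal servable app: `panel serve app.py --show` renders anything
# marked servable in the script.
widget = pn.widgets.Select(name='Country', options=['a', 'b'])
app = pn.Column(widget)
app.servable()  # picked up by `panel serve`
# app.show()    # or launch a local Bokeh server directly from Python
```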
{geocif-0.1.48 → geocif-0.1.49}/geocif.egg-info/SOURCES.txt

@@ -32,6 +32,7 @@ geocif/cei/__init__.py
 geocif/cei/definitions.py
 geocif/cei/indices.py
 geocif/ml/__init__.py
+geocif/ml/aa.py
 geocif/ml/correlations.py
 geocif/ml/embedding.py
 geocif/ml/feature_engineering.py
@@ -50,5 +51,6 @@ geocif/playground/__init__.py
 geocif/playground/automl.py
 geocif/playground/misc.py
 geocif/viz/__init__.py
+geocif/viz/misc.py
 geocif/viz/plot.py
 tests/test_geocif.py