geocif 0.1.68__tar.gz → 0.1.70__tar.gz
This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
- {geocif-0.1.68/geocif.egg-info → geocif-0.1.70}/PKG-INFO +1 -1
- {geocif-0.1.68 → geocif-0.1.70}/geocif/cei/indices.py +1 -1
- {geocif-0.1.68 → geocif-0.1.70}/geocif/geocif.py +45 -22
- {geocif-0.1.68 → geocif-0.1.70}/geocif/geocif_runner.py +34 -34
- geocif-0.1.70/geocif/indices_runner_angola.py +212 -0
- {geocif-0.1.68 → geocif-0.1.70}/geocif/indices_runner_south_africa.py +2 -2
- {geocif-0.1.68 → geocif-0.1.70}/geocif/ml/correlations.py +10 -7
- {geocif-0.1.68 → geocif-0.1.70}/geocif/ml/embedding.py +11 -8
- {geocif-0.1.68 → geocif-0.1.70}/geocif/ml/feature_engineering.py +6 -5
- {geocif-0.1.68 → geocif-0.1.70}/geocif/ml/trainers.py +1 -1
- geocif-0.1.70/geocif/playground/wolayita_maize_mask.py +156 -0
- geocif-0.1.70/geocif/viz/gt.py +69 -0
- {geocif-0.1.68 → geocif-0.1.70}/geocif/viz/tmp.py +1 -1
- {geocif-0.1.68 → geocif-0.1.70/geocif.egg-info}/PKG-INFO +1 -1
- {geocif-0.1.68 → geocif-0.1.70}/geocif.egg-info/SOURCES.txt +2 -0
- {geocif-0.1.68 → geocif-0.1.70}/setup.py +1 -1
- geocif-0.1.68/geocif/indices_runner_angola.py +0 -212
- {geocif-0.1.68 → geocif-0.1.70}/LICENSE +0 -0
- {geocif-0.1.68 → geocif-0.1.70}/MANIFEST.in +0 -0
- {geocif-0.1.68 → geocif-0.1.70}/README.md +0 -0
- {geocif-0.1.68 → geocif-0.1.70}/geocif/__init__.py +0 -0
- {geocif-0.1.68 → geocif-0.1.70}/geocif/agmet/__init__.py +0 -0
- {geocif-0.1.68 → geocif-0.1.70}/geocif/agmet/geoagmet.py +0 -0
- {geocif-0.1.68 → geocif-0.1.70}/geocif/agmet/plot.py +0 -0
- {geocif-0.1.68 → geocif-0.1.70}/geocif/agmet/utils.py +0 -0
- {geocif-0.1.68 → geocif-0.1.70}/geocif/analysis.py +0 -0
- {geocif-0.1.68 → geocif-0.1.70}/geocif/backup/__init__.py +0 -0
- {geocif-0.1.68 → geocif-0.1.70}/geocif/backup/constants.py +0 -0
- {geocif-0.1.68 → geocif-0.1.70}/geocif/backup/features.py +0 -0
- {geocif-0.1.68 → geocif-0.1.70}/geocif/backup/geo.py +0 -0
- {geocif-0.1.68 → geocif-0.1.70}/geocif/backup/geocif.py +0 -0
- {geocif-0.1.68 → geocif-0.1.70}/geocif/backup/metadata.py +0 -0
- {geocif-0.1.68 → geocif-0.1.70}/geocif/backup/models.py +0 -0
- {geocif-0.1.68 → geocif-0.1.70}/geocif/cei/__init__.py +0 -0
- {geocif-0.1.68 → geocif-0.1.70}/geocif/cei/definitions.py +0 -0
- {geocif-0.1.68 → geocif-0.1.70}/geocif/experiments.py +0 -0
- {geocif-0.1.68 → geocif-0.1.70}/geocif/indices_runner.py +0 -0
- {geocif-0.1.68 → geocif-0.1.70}/geocif/indices_runner_madagascar.py +0 -0
- {geocif-0.1.68 → geocif-0.1.70}/geocif/indices_runner_malawi.py +0 -0
- {geocif-0.1.68 → geocif-0.1.70}/geocif/indices_runner_mozambique.py +0 -0
- {geocif-0.1.68 → geocif-0.1.70}/geocif/indices_runner_zambia.py +0 -0
- {geocif-0.1.68 → geocif-0.1.70}/geocif/indices_runner_zimbabwe.py +0 -0
- {geocif-0.1.68 → geocif-0.1.70}/geocif/logger.py +0 -0
- {geocif-0.1.68 → geocif-0.1.70}/geocif/ml/__init__.py +0 -0
- {geocif-0.1.68 → geocif-0.1.70}/geocif/ml/feature_selection.py +0 -0
- {geocif-0.1.68 → geocif-0.1.70}/geocif/ml/outliers.py +0 -0
- {geocif-0.1.68 → geocif-0.1.70}/geocif/ml/outlook.py +0 -0
- {geocif-0.1.68 → geocif-0.1.70}/geocif/ml/output.py +0 -0
- {geocif-0.1.68 → geocif-0.1.70}/geocif/ml/spatial_autocorrelation.py +0 -0
- {geocif-0.1.68 → geocif-0.1.70}/geocif/ml/stages.py +0 -0
- {geocif-0.1.68 → geocif-0.1.70}/geocif/ml/stats.py +0 -0
- {geocif-0.1.68 → geocif-0.1.70}/geocif/ml/trend.py +0 -0
- {geocif-0.1.68 → geocif-0.1.70}/geocif/ml/xai.py +0 -0
- {geocif-0.1.68 → geocif-0.1.70}/geocif/mm.py +0 -0
- {geocif-0.1.68 → geocif-0.1.70}/geocif/playground/__init__.py +0 -0
- {geocif-0.1.68 → geocif-0.1.70}/geocif/playground/aa.py +0 -0
- {geocif-0.1.68 → geocif-0.1.70}/geocif/playground/area.py +0 -0
- {geocif-0.1.68 → geocif-0.1.70}/geocif/playground/automl.py +0 -0
- {geocif-0.1.68 → geocif-0.1.70}/geocif/playground/download_esi.py +0 -0
- {geocif-0.1.68 → geocif-0.1.70}/geocif/playground/enso.py +0 -0
- {geocif-0.1.68 → geocif-0.1.70}/geocif/playground/eval.py +0 -0
- {geocif-0.1.68 → geocif-0.1.70}/geocif/playground/gamtest.py +0 -0
- {geocif-0.1.68 → geocif-0.1.70}/geocif/playground/gee_access.py +0 -0
- {geocif-0.1.68 → geocif-0.1.70}/geocif/playground/misc.py +0 -0
- {geocif-0.1.68 → geocif-0.1.70}/geocif/playground/play_xagg.py +0 -0
- {geocif-0.1.68 → geocif-0.1.70}/geocif/playground/reg.py +0 -0
- {geocif-0.1.68 → geocif-0.1.70}/geocif/playground/sustain.py +0 -0
- {geocif-0.1.68 → geocif-0.1.70}/geocif/playground/test_catboost.py +0 -0
- {geocif-0.1.68 → geocif-0.1.70}/geocif/playground/tmp.py +0 -0
- {geocif-0.1.68 → geocif-0.1.70}/geocif/playground/tmp2.py +0 -0
- {geocif-0.1.68 → geocif-0.1.70}/geocif/playground/tmp3.py +0 -0
- {geocif-0.1.68 → geocif-0.1.70}/geocif/playground/tmp4.py +0 -0
- {geocif-0.1.68 → geocif-0.1.70}/geocif/playground/tmp5.py +0 -0
- {geocif-0.1.68 → geocif-0.1.70}/geocif/risk/__init__.py +0 -0
- {geocif-0.1.68 → geocif-0.1.70}/geocif/risk/impact_assessment.py +0 -0
- {geocif-0.1.68 → geocif-0.1.70}/geocif/utils.py +0 -0
- {geocif-0.1.68 → geocif-0.1.70}/geocif/viz/__init__.py +0 -0
- {geocif-0.1.68 → geocif-0.1.70}/geocif/viz/plot.py +0 -0
- {geocif-0.1.68 → geocif-0.1.70}/geocif.egg-info/dependency_links.txt +0 -0
- {geocif-0.1.68 → geocif-0.1.70}/geocif.egg-info/not-zip-safe +0 -0
- {geocif-0.1.68 → geocif-0.1.70}/geocif.egg-info/top_level.txt +0 -0
- {geocif-0.1.68 → geocif-0.1.70}/requirements.txt +0 -0
- {geocif-0.1.68 → geocif-0.1.70}/setup.cfg +0 -0
- {geocif-0.1.68 → geocif-0.1.70}/tests/test_geocif.py +0 -0
geocif/cei/indices.py

@@ -675,7 +675,7 @@ class CEIs:
         elif "STD" in iname:
             val = np.nanstd(eo_vals)
         elif "AUC" in iname:
-            val = np.trapz(eo_vals
+            val = np.trapz(eo_vals)
         elif "H-INDEX" in iname:
             # Multiply by 10 for h-index to work
             val = utils.compute_h_index(eo_vals * 10)
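Note: the AUC index above integrates the EO time series with NumPy's trapezoidal rule. A minimal standalone sketch of that pattern (the example values are made up, not taken from the package):

    import numpy as np

    eo_vals = np.array([0.21, 0.35, 0.48, 0.52, 0.44])  # example EO time series
    auc = np.trapz(eo_vals)   # trapezoidal integration with unit spacing
    std = np.nanstd(eo_vals)  # the companion "STD" index from the same branch
    print(round(auc, 3), round(std, 3))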
geocif/geocif.py

@@ -243,11 +243,11 @@ class Geocif:
                 if any(cei in column for cei in self.use_ceis)
             ]
         else:
-
+            self.logger.info(f"Selecting features for {self.country} {self.crop}")
             selector, _, self.selected_features = fs.select_features(
                 X_train, y_train, method=self.feature_selection
             )
-
+            self.logger.info(f"Selected features: {self.selected_features}")

         """ Update model to include conformal estimates """
         if "lat" not in self.selected_features and self.include_lat_lon_as_feature:

@@ -306,7 +306,7 @@ class Geocif:
                 X_train,
                 y_train,
                 cat_features=self.cat_features,
-                verbose=
+                verbose=True,
             )
         elif self.model_name in ["ngboost", "oblique", "tabpfn"]:
             X_train = X_train.drop(

@@ -598,15 +598,15 @@ class Geocif:
                 df_region[f"Median {self.target}"].values, 3
             )

-
-
-
-
-
-
-
-
-
+            if f"Median {self.target} (2018-2022)" in df_region.columns:
+                df.loc[:, f"Median {self.target} (2018-2022)"] = np.around(
+                    df_region[f"Median {self.target} (2018-2022)"].values, 3
+                )
+
+            if f"Median {self.target} (2013-2017)" in df_region.columns:
+                df.loc[:, f"Median {self.target} (2013-2017)"] = np.around(
+                    df_region[f"Median {self.target} (2013-2017)"].values, 3
+                )

         if self.estimate_ci:
             if self.estimate_ci_for_all or self.forecast_season == self.today_year:

@@ -820,8 +820,8 @@ class Geocif:
             + self.statistics_columns
             + self.feature_names
             + [f"Median {self.target}"]
-
-
+            + [f"Median {self.target} (2018-2022)"]
+            + [f"Median {self.target} (2013-2017)"]
             + ["Region_ID"]
         )
         if self.check_yield_trend:

@@ -997,6 +997,25 @@ class Geocif:
         if self.use_single_time_period_as_feature:
             df = stages.select_single_time_period_features(df)

+        # If forecasting for current season, then exclude the latest month data as it will be partial
+        # and will confuse the model
+        if self.forecast_season == self.today_year:
+            current_month = ar.utcnow().month
+
+            # Identify columns where the second chunk equals the current month index
+            cols_to_drop = []
+            for col in df.columns:
+                if "_" in col:
+                    mon = stages.get_stage_information_dict(col, self.method)['Starting Stage']
+
+                    if mon == current_month:
+                        cols_to_drop.append(col)
+
+            # Drop those columns
+
+            df = df.drop(columns=cols_to_drop)
+
+            # Hack: If
         # Change column name
         # e.g. 'vDTR_7_6_5_4_3_2_1_37_36_35_34_33_32_31' to 'vDTR Mar 1-Oct 27'
         df = stages.update_feature_names(df, self.method)
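The block added at new lines 1000-1018 above drops, when forecasting the current season, every feature column whose starting stage is the current (and therefore partial) month. A minimal self-contained sketch of the same idea, assuming feature columns are named like 'vDTR_7_6_5' with the starting stage as the first number after the CEI name (the helper name and column convention here are illustrative, not the package's exact API):

    import arrow as ar
    import pandas as pd

    def drop_partial_month_features(df: pd.DataFrame) -> pd.DataFrame:
        # Assumed convention: '<CEI>_<starting stage>_<older stages...>',
        # where stages are month indices (e.g. 'vDTR_7_6_5').
        current_month = str(ar.utcnow().month)
        cols_to_drop = [
            col for col in df.columns
            if "_" in col and col.split("_")[1] == current_month
        ]
        return df.drop(columns=cols_to_drop)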
@@ -1011,13 +1030,13 @@ class Geocif:
             df, self.all_seasons_with_yield, self.number_median_years, self.target
         )

-
-
-
-
-
-
-
+        df = fe.compute_user_median_statistics(
+            df, range(2018, 2023)
+        )
+
+        df = fe.compute_user_median_statistics(
+            df, range(2013, 2018)
+        )

         if self.median_area_as_feature:
             df = fe.compute_median_statistics(
|
|
1343
1362
|
# e.g. _stages = ['13_12_11', '13_12_11_10', '13_12_11_10_9']
|
1344
1363
|
# then self.simulation_stages = [array([13, 12, 11]), array([13, 12, 11, 10]), array([13, 12, 11, 10, 9])]
|
1345
1364
|
# Drop stages in self.all_stages that do not have _ in them
|
1346
|
-
self.all_stages = [element for element in self.all_stages if "_" in element]
|
1365
|
+
# self.all_stages = [element for element in self.all_stages if "_" in element]
|
1366
|
+
|
1367
|
+
if self.forecast_season == self.today_year:
|
1368
|
+
current_month = ar.utcnow().month
|
1369
|
+
self.all_stages = [elem for elem in self.all_stages if not elem.startswith(str(current_month))]
|
1347
1370
|
|
1348
1371
|
self.simulation_stages = [
|
1349
1372
|
np.array([int(stage) for stage in s.split("_")]) for s in self.all_stages
|
geocif/geocif_runner.py

@@ -26,40 +26,40 @@ def loop_execute(inputs):
     Returns:

     """
-    from pycallgraph2 import Config, PyCallGraph, GlobbingFilter
-    from pycallgraph2.output import GraphvizOutput
-
-    graphviz = GraphvizOutput()
-    graphviz.output_file = "geocif_visualization.png"
-    plt.rcParams["figure.dpi"] = 600
-    config = Config(max_depth=5)
-    config.trace_filter = GlobbingFilter(
-
-
-
-    )
-
-    with PyCallGraph(output=graphviz, config=config):
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    # from pycallgraph2 import Config, PyCallGraph, GlobbingFilter
+    # from pycallgraph2.output import GraphvizOutput
+    #
+    # graphviz = GraphvizOutput()
+    # graphviz.output_file = "geocif_visualization.png"
+    # plt.rcParams["figure.dpi"] = 600
+    # config = Config(max_depth=5)
+    # config.trace_filter = GlobbingFilter(
+    #     exclude=[
+    #         "pycallgraph.*",
+    #     ]
+    # )
+    #
+    # with PyCallGraph(output=graphviz, config=config):
+    project_name, country, crop, season, model, logger, parser, index = inputs
+
+    logger.info("=====================================================")
+    logger.info(f"\tStarting GEOCIF: {country} {crop} {season} {model}")
+    logger.info("=====================================================")
+
+    obj = geocif.Geocif(logger=logger,
+                        parser=parser,
+                        project_name=project_name)
+    obj.read_data(country, crop, season)
+
+    # Store config file in database, only execute this for
+    # the first iteration of the loop
+    if index == 0:
+        output.config_to_db(obj.db_path, obj.parser, obj.today)
+
+    # Setup metadata and run ML code
+    obj.setup(season, model)
+    if obj.simulation_stages:
+        obj.execute()


 def gather_inputs(parser):
geocif/indices_runner_angola.py (new file)

@@ -0,0 +1,212 @@
+import itertools
+import warnings
+from multiprocessing import Pool, cpu_count
+from pathlib import Path
+
+import arrow as ar
+import pandas as pd
+from tqdm import tqdm
+
+warnings.filterwarnings("ignore")
+
+from .cei import indices
+from geoprepare import base
+
+country = "angola"
+
+def remove_duplicates(lst):
+    """
+
+    :param lst:
+    :return:
+    """
+    return list(set([i for i in lst]))
+
+
+def get_admin_zone(country, dg_shp):
+    admin_zone = "admin_1"
+    country = country.title().replace(" ", "_")
+
+    # Read in shapefile
+    dg_country = dg_shp[dg_shp["ADMIN0"] == country]
+
+    # Is the ADMIN2 column all None? If so, return admin_1 else return admin_2
+    if dg_country.empty:
+        admin_zone = "admin_1"
+    elif not dg_country["ADMIN2"].isna().all():
+        admin_zone = "admin_2"
+
+    return admin_zone
+
+
+class cei_runner(base.BaseGeo):
+    def __init__(self, path_config_file):
+        super().__init__(path_config_file)
+
+        # Parse configuration files
+        self.parse_config()
+
+        self.dir_input = Path(self.parser.get("PATHS", "dir_input"))
+        import platform
+        if platform.system() == "Linux":
+            self.base_dir = Path(
+                rf"/gpfs/data1/cmongp1/GEOGLAM/Output/countries/{country}"
+            )
+        else:
+            self.base_dir = Path(
+                rf"D:\Users\ritvik\projects\GEOGLAM\Output\countries\{country}"
+            ) # Path(self.parser.get("PATHS", "dir_crop_inputs"))
+        self.do_parallel = self.parser.getboolean("DEFAULT", "do_parallel")
+
+    def collect_files(self):
+        """
+        1. Collect all the files which contain EO information
+        2. Exclude files from the `processed` directory if it is already in
+        processed_include_fall directory
+        3. Create a dataframe that contains the following columns:
+        - directory: name of directory where file is located
+        - path: full path to file
+        - filename: name of file
+        :return: Return the dataframe created above
+        """
+        import geopandas as gp
+
+        dg_shp = gp.read_file(
+            self.dir_input
+            / "Global_Datasets"
+            / "Regions"
+            / "Shps"
+            / "adm_shapefile.shp",
+            engine="pyogrio",
+        )
+
+        # Collect all the files which contain EO information
+        df_files = pd.DataFrame(columns=["directory", "path", "filename", "admin_zone"])
+        for filepath in self.base_dir.rglob("*.csv"):
+            country = filepath.parents[0].name
+
+            admin_zone = get_admin_zone(country, dg_shp)
+
+            # If country is not in cc.COUNTRIES then skip
+            # HACK: Skip korea for now, as it is giving errors
+            if country == "republic_of_korea":
+                continue
+
+            # Get name of directory one level up
+            process_type = filepath.parents[1].name
+
+            # Get name of file
+            filename = filepath.name
+
+            # Add to dataframe
+            df_files.loc[len(df_files)] = [process_type, filepath, filename, admin_zone]
+
+        # Exclude those rows where directory is processed and file is already in
+        # processed_include_fall directory
+        no_fall = df_files["directory"] == "processed"
+        include_fall = df_files[df_files["directory"] == "processed_include_fall"][
+            "filename"
+        ]
+
+        df_files = df_files[~(no_fall & (df_files["filename"].isin(include_fall)))]
+
+        return df_files
+
+    def process_combinations(self, df, method):
+        """
+        Create a list of tuples of the following:
+        - directory: name of directory where file is located
+        - path: full path to file
+        - filename: name of file
+        - method: whether to compute indices for phenological stages or not
+        This tuple will be used as input to the `process` function
+        :param df:
+        :param method:
+        :return:
+        """
+        combinations = []
+
+        for index, row in tqdm(df.iterrows()):
+            combinations.extend(
+                list(
+                    itertools.product([row[0]], [row[1]], [row[2]], [row[3]], [method])
+                )
+            )
+
+        combinations = remove_duplicates(combinations)
+
+        return combinations
+
+    def main(self, method):
+        """
+
+        :param method:
+        :return:
+        """
+        # Create a dataframe of the files to be analyzed
+        df_files = self.collect_files()
+
+        combinations = self.process_combinations(df_files, method)
+
+        # Add an element to the tuple to indicate the season
+        # Last element is redo flag which is True if the analysis is to be redone
+        # and False otherwise. Analysis is always redone for the current year
+        # and last year whether file exists or not
+        combinations = [
+            (
+                self.parser,
+                status,
+                path,
+                filename,
+                admin_zone,
+                category,
+                year,
+                "ndvi",
+                False, # redo
+            )
+            for year in range(2001, ar.utcnow().year + 1)
+            for status, path, filename, admin_zone, category in combinations
+        ]
+
+        # Only keep those entries in combinations where the third elemt is
+        # mozambique, south_africa, angola or dem_people's_rep_of_korea
+        # This is done to test the code for these countries
+        combinations = [i for i in combinations if f"{country}_maize_s1" in i[3]]
+
+        if True:
+            num_cpu = int(cpu_count() * 0.9)
+            with Pool(num_cpu) as p:
+                for i, _ in enumerate(p.imap_unordered(indices.process, combinations)):
+                    pass
+        else:
+            # Use the code below if you want to test without parallelization or
+            # if you want to debug by using pdb
+            pbar = tqdm(combinations)
+            for i, val in enumerate(pbar):
+                pbar.set_description(
+                    f"Main loop {combinations[i][2]} {combinations[i][5]}"
+                )
+                indices.process(val)
+
+
+def run(path_config_files=[]):
+    """
+
+    Args:
+        path_config_files:
+
+    Returns:
+
+    """
+    """ Check dictionary keys to have no spaces"""
+    indices.validate_index_definitions()
+
+    for method in [
+        "monthly_r", # "dekad_r" # "dekad_r"
+    ]: # , "full_season", "phenological_stages", "fraction_season"]:
+        obj = cei_runner(path_config_files)
+        obj.main(method)
+
+
+if __name__ == "__main__":
+    run()
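The new Angola runner mirrors the existing per-country runners (Madagascar, Malawi, Mozambique, Zambia, Zimbabwe): it collects per-country EO CSVs, builds (file, year, method) combinations, and dispatches them to indices.process, optionally in parallel. A minimal usage sketch (the configuration path is a placeholder, not shipped with the package):

    from geocif import indices_runner_angola

    # path(s) to the geoprepare-style configuration file(s) expected by cei_runner
    indices_runner_angola.run(["/path/to/geocif_config.txt"])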
geocif/indices_runner_south_africa.py

@@ -162,7 +162,7 @@ class cei_runner(base.BaseGeo):
                 category,
                 year,
                 "ndvi",
-
+                True, # redo
             )
             for year in range(2001, ar.utcnow().year + 1)
             for status, path, filename, admin_zone, category in combinations

@@ -174,7 +174,7 @@ class cei_runner(base.BaseGeo):
         combinations = [i for i in combinations if f"{country}_maize_s1" in i[3]]

         if True:
-            num_cpu = int(cpu_count() * 0.
+            num_cpu = int(cpu_count() * 0.8)
             with Pool(num_cpu) as p:
                 for i, _ in enumerate(p.imap_unordered(indices.process, combinations)):
                     pass
geocif/ml/correlations.py

@@ -295,13 +295,16 @@ def all_correlated_feature_by_time(df, **kwargs):
             df_tmp2.loc[idx, "Type"] = combined_dict[row[0]][0]

         # Compute median of each CEI and sort the dataframe based on the absolute value of the median
-
-
-
-
-
-
-
+        try:
+            dict_best_cei[region_id] = (
+                df_tmp2.groupby("Type")
+                .max()
+                .reset_index()
+                .sort_values("Value", ascending=False)["Metric"]
+                .values
+            )
+        except:
+            breakpoint()

         kwargs["region_id"] = region_id
         _region_names = ", ".join([str(x) for x in group['Region'].unique()])
geocif/ml/embedding.py

@@ -25,29 +25,32 @@ def _compute_correlations(X, y):
     feature_correlations = {}

     for feature in X.columns:
-        # Ignore
+        # Ignore object or categorical type columns
         if X[feature].dtypes.name in ["object", "category"]:
             continue

         f_series = X[feature]

-        # Ignore NaN values in either y
-        mask = ~
-
-
+        # Ignore NaN values in either y or f_series
+        mask = ~(np.isnan(y) | np.isnan(f_series))
+        y_filtered = y[mask]
+        f_series_filtered = f_series[mask]

-
+        # Handle cases where std is zero
+        if np.std(f_series_filtered) == 0 or np.std(y_filtered) == 0:
             feature_correlations[feature] = np.nan
         else:
             try:
-                r = pearsonr(
+                r = pearsonr(y_filtered, f_series_filtered)[0]
                 feature_correlations[feature] = round(r, 3)
-            except:
+            except Exception as e:
+                print(f"Error computing correlation for {feature}: {e}")
                 feature_correlations[feature] = np.nan

     return feature_correlations


+
 def find_most_common_top_feature(top_feature_by_region):
     """
     Find the most common top feature and number of occurences
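The rewritten _compute_correlations now masks NaNs in both series and skips zero-variance features before calling SciPy's pearsonr. A minimal sketch of that guard in isolation (the function name is hypothetical; it assumes numeric pandas Series of equal length):

    import numpy as np
    from scipy.stats import pearsonr

    def safe_pearson(y, f_series):
        # Drop positions where either series is NaN
        mask = ~(np.isnan(y) | np.isnan(f_series))
        y_f, x_f = y[mask], f_series[mask]
        # A constant series (or too few points) has no defined correlation
        if len(x_f) < 2 or np.std(x_f) == 0 or np.std(y_f) == 0:
            return np.nan
        return round(pearsonr(y_f, x_f)[0], 3)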
geocif/ml/feature_engineering.py

@@ -39,10 +39,10 @@ def compute_last_year_yield(df, target_col="Yield (tn per ha)"):

    return df

-
def compute_closest_years(all_years, harvest_year, number_lag_years):
    """
-    Finds the years closest to a given harvest year,
+    Finds the historical years closest to a given harvest year,
+    excluding any future year (harvest_year itself and beyond).

    Args:
        all_years (array-like): List or array of all years to consider.

@@ -50,7 +50,8 @@ def compute_closest_years(all_years, harvest_year, number_lag_years):
        number_lag_years (int): Number of closest years to return.

    Returns:
-        list:
+        list: The historical years closest to the given harvest year.
+            Returns an empty list if no historical years exist.
    """
    # Exclude the harvest year before computation to simplify logic
    filtered_years = [year for year in all_years if year != harvest_year]
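The updated docstring pins down the behaviour: only years before the harvest year are eligible, and the result may be empty. A simplified, hypothetical re-implementation consistent with that description (not the package's exact code, which continues beyond the lines shown here):

    def closest_historical_years(all_years, harvest_year, number_lag_years):
        # Keep only years strictly before the harvest year, then take the nearest ones
        historical = [y for y in all_years if y < harvest_year]
        historical.sort(key=lambda y: harvest_year - y)
        return historical[:number_lag_years]

    # closest_historical_years([2015, 2016, 2018, 2021, 2023], 2021, 3) -> [2018, 2016, 2015]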
@@ -96,7 +97,7 @@ def compute_median_statistics(
        mask = (group["Harvest Year"].isin(closest_years)) & (
            group["Region"] == region
        )
-        median_yield = group.loc[mask, target_col].
+        median_yield = group.loc[mask, target_col].mean()
        df.loc[
            (df["Region"] == region) & (df["Harvest Year"] == harvest_year),
            f"Median {target_col}",

@@ -186,7 +187,7 @@ def compute_lag_yield(
    else:
        # Add median yield
        mask_group_median = group["Harvest Year"].isin(closest_years)
-        median_yield = group.loc[mask_group_median, target_col].
+        median_yield = group.loc[mask_group_median, target_col].mean()

        df.loc[mask_region, col] = median_yield
