geocif 0.1.67__tar.gz → 0.1.69__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {geocif-0.1.67/geocif.egg-info → geocif-0.1.69}/PKG-INFO +1 -1
- {geocif-0.1.67 → geocif-0.1.69}/geocif/cei/definitions.py +8 -8
- {geocif-0.1.67 → geocif-0.1.69}/geocif/geocif.py +12 -9
- {geocif-0.1.67 → geocif-0.1.69}/geocif/geocif_runner.py +0 -1
- geocif-0.1.69/geocif/indices_runner_angola.py +212 -0
- {geocif-0.1.67 → geocif-0.1.69}/geocif/ml/correlations.py +10 -7
- {geocif-0.1.67 → geocif-0.1.69}/geocif/ml/embedding.py +11 -8
- {geocif-0.1.67 → geocif-0.1.69}/geocif/ml/feature_engineering.py +6 -5
- {geocif-0.1.67 → geocif-0.1.69}/geocif/ml/feature_selection.py +15 -1
- {geocif-0.1.67 → geocif-0.1.69}/geocif/ml/stats.py +1 -1
- {geocif-0.1.67 → geocif-0.1.69}/geocif/ml/trainers.py +1 -1
- geocif-0.1.69/geocif/playground/wolayita_maize_mask.py +156 -0
- geocif-0.1.69/geocif/viz/gt.py +69 -0
- {geocif-0.1.67 → geocif-0.1.69}/geocif/viz/tmp.py +20 -7
- {geocif-0.1.67 → geocif-0.1.69/geocif.egg-info}/PKG-INFO +1 -1
- {geocif-0.1.67 → geocif-0.1.69}/geocif.egg-info/SOURCES.txt +2 -0
- {geocif-0.1.67 → geocif-0.1.69}/setup.py +1 -1
- geocif-0.1.67/geocif/indices_runner_angola.py +0 -212
- {geocif-0.1.67 → geocif-0.1.69}/LICENSE +0 -0
- {geocif-0.1.67 → geocif-0.1.69}/MANIFEST.in +0 -0
- {geocif-0.1.67 → geocif-0.1.69}/README.md +0 -0
- {geocif-0.1.67 → geocif-0.1.69}/geocif/__init__.py +0 -0
- {geocif-0.1.67 → geocif-0.1.69}/geocif/agmet/__init__.py +0 -0
- {geocif-0.1.67 → geocif-0.1.69}/geocif/agmet/geoagmet.py +0 -0
- {geocif-0.1.67 → geocif-0.1.69}/geocif/agmet/plot.py +0 -0
- {geocif-0.1.67 → geocif-0.1.69}/geocif/agmet/utils.py +0 -0
- {geocif-0.1.67 → geocif-0.1.69}/geocif/analysis.py +0 -0
- {geocif-0.1.67 → geocif-0.1.69}/geocif/backup/__init__.py +0 -0
- {geocif-0.1.67 → geocif-0.1.69}/geocif/backup/constants.py +0 -0
- {geocif-0.1.67 → geocif-0.1.69}/geocif/backup/features.py +0 -0
- {geocif-0.1.67 → geocif-0.1.69}/geocif/backup/geo.py +0 -0
- {geocif-0.1.67 → geocif-0.1.69}/geocif/backup/geocif.py +0 -0
- {geocif-0.1.67 → geocif-0.1.69}/geocif/backup/metadata.py +0 -0
- {geocif-0.1.67 → geocif-0.1.69}/geocif/backup/models.py +0 -0
- {geocif-0.1.67 → geocif-0.1.69}/geocif/cei/__init__.py +0 -0
- {geocif-0.1.67 → geocif-0.1.69}/geocif/cei/indices.py +0 -0
- {geocif-0.1.67 → geocif-0.1.69}/geocif/experiments.py +0 -0
- {geocif-0.1.67 → geocif-0.1.69}/geocif/indices_runner.py +0 -0
- {geocif-0.1.67 → geocif-0.1.69}/geocif/indices_runner_madagascar.py +0 -0
- {geocif-0.1.67 → geocif-0.1.69}/geocif/indices_runner_malawi.py +0 -0
- {geocif-0.1.67 → geocif-0.1.69}/geocif/indices_runner_mozambique.py +0 -0
- {geocif-0.1.67 → geocif-0.1.69}/geocif/indices_runner_south_africa.py +0 -0
- {geocif-0.1.67 → geocif-0.1.69}/geocif/indices_runner_zambia.py +0 -0
- {geocif-0.1.67 → geocif-0.1.69}/geocif/indices_runner_zimbabwe.py +0 -0
- {geocif-0.1.67 → geocif-0.1.69}/geocif/logger.py +0 -0
- {geocif-0.1.67 → geocif-0.1.69}/geocif/ml/__init__.py +0 -0
- {geocif-0.1.67 → geocif-0.1.69}/geocif/ml/outliers.py +0 -0
- {geocif-0.1.67 → geocif-0.1.69}/geocif/ml/outlook.py +0 -0
- {geocif-0.1.67 → geocif-0.1.69}/geocif/ml/output.py +0 -0
- {geocif-0.1.67 → geocif-0.1.69}/geocif/ml/spatial_autocorrelation.py +0 -0
- {geocif-0.1.67 → geocif-0.1.69}/geocif/ml/stages.py +0 -0
- {geocif-0.1.67 → geocif-0.1.69}/geocif/ml/trend.py +0 -0
- {geocif-0.1.67 → geocif-0.1.69}/geocif/ml/xai.py +0 -0
- {geocif-0.1.67 → geocif-0.1.69}/geocif/mm.py +0 -0
- {geocif-0.1.67 → geocif-0.1.69}/geocif/playground/__init__.py +0 -0
- {geocif-0.1.67 → geocif-0.1.69}/geocif/playground/aa.py +0 -0
- {geocif-0.1.67 → geocif-0.1.69}/geocif/playground/area.py +0 -0
- {geocif-0.1.67 → geocif-0.1.69}/geocif/playground/automl.py +0 -0
- {geocif-0.1.67 → geocif-0.1.69}/geocif/playground/download_esi.py +0 -0
- {geocif-0.1.67 → geocif-0.1.69}/geocif/playground/enso.py +0 -0
- {geocif-0.1.67 → geocif-0.1.69}/geocif/playground/eval.py +0 -0
- {geocif-0.1.67 → geocif-0.1.69}/geocif/playground/gamtest.py +0 -0
- {geocif-0.1.67 → geocif-0.1.69}/geocif/playground/gee_access.py +0 -0
- {geocif-0.1.67 → geocif-0.1.69}/geocif/playground/misc.py +0 -0
- {geocif-0.1.67 → geocif-0.1.69}/geocif/playground/play_xagg.py +0 -0
- {geocif-0.1.67 → geocif-0.1.69}/geocif/playground/reg.py +0 -0
- {geocif-0.1.67 → geocif-0.1.69}/geocif/playground/sustain.py +0 -0
- {geocif-0.1.67 → geocif-0.1.69}/geocif/playground/test_catboost.py +0 -0
- {geocif-0.1.67 → geocif-0.1.69}/geocif/playground/tmp.py +0 -0
- {geocif-0.1.67 → geocif-0.1.69}/geocif/playground/tmp2.py +0 -0
- {geocif-0.1.67 → geocif-0.1.69}/geocif/playground/tmp3.py +0 -0
- {geocif-0.1.67 → geocif-0.1.69}/geocif/playground/tmp4.py +0 -0
- {geocif-0.1.67 → geocif-0.1.69}/geocif/playground/tmp5.py +0 -0
- {geocif-0.1.67 → geocif-0.1.69}/geocif/risk/__init__.py +0 -0
- {geocif-0.1.67 → geocif-0.1.69}/geocif/risk/impact_assessment.py +0 -0
- {geocif-0.1.67 → geocif-0.1.69}/geocif/utils.py +0 -0
- {geocif-0.1.67 → geocif-0.1.69}/geocif/viz/__init__.py +0 -0
- {geocif-0.1.67 → geocif-0.1.69}/geocif/viz/plot.py +0 -0
- {geocif-0.1.67 → geocif-0.1.69}/geocif.egg-info/dependency_links.txt +0 -0
- {geocif-0.1.67 → geocif-0.1.69}/geocif.egg-info/not-zip-safe +0 -0
- {geocif-0.1.67 → geocif-0.1.69}/geocif.egg-info/top_level.txt +0 -0
- {geocif-0.1.67 → geocif-0.1.69}/requirements.txt +0 -0
- {geocif-0.1.67 → geocif-0.1.69}/setup.cfg +0 -0
- {geocif-0.1.67 → geocif-0.1.69}/tests/test_geocif.py +0 -0
```diff
@@ -1,11 +1,11 @@
 PHENOLOGICAL_STAGES = [1, 2, 3]
 dict_indices = {
     "GD4": ["Cold", "Growing degree days (sum of Tmean > 4 C)"],
-…
-…
+    "CFD": ["Cold", "Maximum number of consecutive frost days (Tmin < 0 C)"],
+    "FD": ["Cold", "Number of Frost Days (Tmin < 0C)"],
     "HD17": ["Cold", "Heating degree days (sum of Tmean < 17 C)"],
-…
-…
+    "ID": ["Cold", "Number of sharp Ice Days (Tmax < 0C)"],
+    "CSDI": ["Cold", "Cold-spell duration index"],
     "TG10p": ["Cold", "Percentage of days when Tmean < 10th percentile"],
     "TN10p": ["Cold", "Percentage of days when Tmin < 10th percentile"],
     "TXn": ["Cold", "Minimum daily maximum temperature"],
```
```diff
@@ -70,10 +70,10 @@ dict_indices = {
         "Compound",
         "Days with TG > 75th percentile of daily mean temperature and RR >75th percentile of daily precipitation sum",
     ],
-…
-…
-…
-…
+    "SD": ["Snow", "Mean of daily snow depth"],
+    "SD1": ["Snow", "Number of days with snow depth >= 1 cm"],
+    "SD5cm": ["Snow", "Number of days with snow depth >= 5 cm"],
+    "SD50cm": ["Snow", "Number of days with snow depth >= 50 cm"],
 }

 dict_ndvi = {
```
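Context for the two hunks above: each `dict_indices` entry maps a climate-extreme-index code to a `[category, description]` pair, so the added CFD/FD/ID/CSDI and SD* rows slot new cold and snow indices into the same lookup the rest of the package already uses. A minimal sketch of how such a mapping can be consumed; the grouping helper below is hypothetical, not part of geocif:

```python
dict_indices = {
    "FD": ["Cold", "Number of Frost Days (Tmin < 0C)"],
    "SD1": ["Snow", "Number of days with snow depth >= 1 cm"],
    "SD5cm": ["Snow", "Number of days with snow depth >= 5 cm"],
}

def indices_by_category(definitions):
    """Group index codes by their category label (illustrative helper only)."""
    grouped = {}
    for code, (category, description) in definitions.items():
        grouped.setdefault(category, []).append(code)
    return grouped

print(indices_by_category(dict_indices))
# {'Cold': ['FD'], 'Snow': ['SD1', 'SD5cm']}
```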
```diff
@@ -243,11 +243,11 @@ class Geocif:
                 if any(cei in column for cei in self.use_ceis)
             ]
         else:
-…
+            self.logger.info(f"Selecting features for {self.country} {self.crop}")
             selector, _, self.selected_features = fs.select_features(
                 X_train, y_train, method=self.feature_selection
             )
-…
+            self.logger.info(f"Selected features: {self.selected_features}")

         """ Update model to include conformal estimates """
         if "lat" not in self.selected_features and self.include_lat_lon_as_feature:
```
```diff
@@ -306,7 +306,7 @@ class Geocif:
                 X_train,
                 y_train,
                 cat_features=self.cat_features,
-                verbose=
+                verbose=True,
             )
         elif self.model_name in ["ngboost", "oblique", "tabpfn"]:
             X_train = X_train.drop(
```
```diff
@@ -598,9 +598,9 @@ class Geocif:
                 df_region[f"Median {self.target}"].values, 3
             )

-        if f"Median {self.target} (
-            df.loc[:, f"Median {self.target} (
-                df_region[f"Median {self.target} (
+        if f"Median {self.target} (2018-2022)" in df_region.columns:
+            df.loc[:, f"Median {self.target} (2018-2022)"] = np.around(
+                df_region[f"Median {self.target} (2018-2022)"].values, 3
             )

         if f"Median {self.target} (2013-2017)" in df_region.columns:
```
```diff
@@ -820,7 +820,7 @@ class Geocif:
             + self.statistics_columns
             + self.feature_names
             + [f"Median {self.target}"]
-            + [f"Median {self.target} (
+            + [f"Median {self.target} (2018-2022)"]
             + [f"Median {self.target} (2013-2017)"]
             + ["Region_ID"]
         )
```
```diff
@@ -1012,11 +1012,11 @@ class Geocif:
         )

         df = fe.compute_user_median_statistics(
-            df,
+            df, range(2018, 2023)
         )

         df = fe.compute_user_median_statistics(
-            df,
+            df, range(2013, 2018)
         )

         if self.median_area_as_feature:
```
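The hunk above passes explicit year windows, `range(2018, 2023)` and `range(2013, 2018)`, to `compute_user_median_statistics`, which lines up with the `Median {target} (2018-2022)` and `(2013-2017)` columns handled earlier in the diff. As a hedged, simplified stand-in for what such a helper computes (column names follow the diff; this is not the geocif implementation):

```python
import pandas as pd

def median_over_years(df, years, target_col="Yield (tn per ha)"):
    """Illustrative stand-in: per-Region median of target_col over the given Harvest Years."""
    window = df[df["Harvest Year"].isin(years)]
    medians = window.groupby("Region")[target_col].median()
    label = f"Median {target_col} ({min(years)}-{max(years)})"
    return df.assign(**{label: df["Region"].map(medians)})

df = pd.DataFrame({
    "Region": ["A", "A", "B", "B"],
    "Harvest Year": [2018, 2022, 2018, 2022],
    "Yield (tn per ha)": [1.0, 2.0, 3.0, 5.0],
})
print(median_over_years(df, range(2018, 2023)))
```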
```diff
@@ -1393,6 +1393,9 @@ class Geocif:
             self.dg["ADM0_NAME"].str.lower().str.replace(" ", "_") == self.country
         ]

+        # Drop any duplicates based on Country Region column
+        self.dg_country = self.dg_country.drop_duplicates(subset=["Country Region"])
+
     def read_data(self, country, crop, season):
         """

```
```diff
@@ -0,0 +1,212 @@
+import itertools
+import warnings
+from multiprocessing import Pool, cpu_count
+from pathlib import Path
+
+import arrow as ar
+import pandas as pd
+from tqdm import tqdm
+
+warnings.filterwarnings("ignore")
+
+from .cei import indices
+from geoprepare import base
+
+country = "angola"
+
+def remove_duplicates(lst):
+    """
+
+    :param lst:
+    :return:
+    """
+    return list(set([i for i in lst]))
+
+
+def get_admin_zone(country, dg_shp):
+    admin_zone = "admin_1"
+    country = country.title().replace(" ", "_")
+
+    # Read in shapefile
+    dg_country = dg_shp[dg_shp["ADMIN0"] == country]
+
+    # Is the ADMIN2 column all None? If so, return admin_1 else return admin_2
+    if dg_country.empty:
+        admin_zone = "admin_1"
+    elif not dg_country["ADMIN2"].isna().all():
+        admin_zone = "admin_2"
+
+    return admin_zone
+
+
+class cei_runner(base.BaseGeo):
+    def __init__(self, path_config_file):
+        super().__init__(path_config_file)
+
+        # Parse configuration files
+        self.parse_config()
+
+        self.dir_input = Path(self.parser.get("PATHS", "dir_input"))
+        import platform
+        if platform.system() == "Linux":
+            self.base_dir = Path(
+                rf"/gpfs/data1/cmongp1/GEOGLAM/Output/countries/{country}"
+            )
+        else:
+            self.base_dir = Path(
+                rf"D:\Users\ritvik\projects\GEOGLAM\Output\countries\{country}"
+            )  # Path(self.parser.get("PATHS", "dir_crop_inputs"))
+        self.do_parallel = self.parser.getboolean("DEFAULT", "do_parallel")
+
+    def collect_files(self):
+        """
+        1. Collect all the files which contain EO information
+        2. Exclude files from the `processed` directory if it is already in
+        processed_include_fall directory
+        3. Create a dataframe that contains the following columns:
+        - directory: name of directory where file is located
+        - path: full path to file
+        - filename: name of file
+        :return: Return the dataframe created above
+        """
+        import geopandas as gp
+
+        dg_shp = gp.read_file(
+            self.dir_input
+            / "Global_Datasets"
+            / "Regions"
+            / "Shps"
+            / "adm_shapefile.shp",
+            engine="pyogrio",
+        )
+
+        # Collect all the files which contain EO information
+        df_files = pd.DataFrame(columns=["directory", "path", "filename", "admin_zone"])
+        for filepath in self.base_dir.rglob("*.csv"):
+            country = filepath.parents[0].name
+
+            admin_zone = get_admin_zone(country, dg_shp)
+
+            # If country is not in cc.COUNTRIES then skip
+            # HACK: Skip korea for now, as it is giving errors
+            if country == "republic_of_korea":
+                continue
+
+            # Get name of directory one level up
+            process_type = filepath.parents[1].name
+
+            # Get name of file
+            filename = filepath.name
+
+            # Add to dataframe
+            df_files.loc[len(df_files)] = [process_type, filepath, filename, admin_zone]
+
+        # Exclude those rows where directory is processed and file is already in
+        # processed_include_fall directory
+        no_fall = df_files["directory"] == "processed"
+        include_fall = df_files[df_files["directory"] == "processed_include_fall"][
+            "filename"
+        ]
+
+        df_files = df_files[~(no_fall & (df_files["filename"].isin(include_fall)))]
+
+        return df_files
+
+    def process_combinations(self, df, method):
+        """
+        Create a list of tuples of the following:
+        - directory: name of directory where file is located
+        - path: full path to file
+        - filename: name of file
+        - method: whether to compute indices for phenological stages or not
+        This tuple will be used as input to the `process` function
+        :param df:
+        :param method:
+        :return:
+        """
+        combinations = []
+
+        for index, row in tqdm(df.iterrows()):
+            combinations.extend(
+                list(
+                    itertools.product([row[0]], [row[1]], [row[2]], [row[3]], [method])
+                )
+            )
+
+        combinations = remove_duplicates(combinations)
+
+        return combinations
+
+    def main(self, method):
+        """
+
+        :param method:
+        :return:
+        """
+        # Create a dataframe of the files to be analyzed
+        df_files = self.collect_files()
+
+        combinations = self.process_combinations(df_files, method)
+
+        # Add an element to the tuple to indicate the season
+        # Last element is redo flag which is True if the analysis is to be redone
+        # and False otherwise. Analysis is always redone for the current year
+        # and last year whether file exists or not
+        combinations = [
+            (
+                self.parser,
+                status,
+                path,
+                filename,
+                admin_zone,
+                category,
+                year,
+                "ndvi",
+                False,  # redo
+            )
+            for year in range(2001, ar.utcnow().year + 1)
+            for status, path, filename, admin_zone, category in combinations
+        ]
+
+        # Only keep those entries in combinations where the third elemt is
+        # mozambique, south_africa, angola or dem_people's_rep_of_korea
+        # This is done to test the code for these countries
+        combinations = [i for i in combinations if f"{country}_maize_s1" in i[3]]
+
+        if True:
+            num_cpu = int(cpu_count() * 0.9)
+            with Pool(num_cpu) as p:
+                for i, _ in enumerate(p.imap_unordered(indices.process, combinations)):
+                    pass
+        else:
+            # Use the code below if you want to test without parallelization or
+            # if you want to debug by using pdb
+            pbar = tqdm(combinations)
+            for i, val in enumerate(pbar):
+                pbar.set_description(
+                    f"Main loop {combinations[i][2]} {combinations[i][5]}"
+                )
+                indices.process(val)
+
+
+def run(path_config_files=[]):
+    """
+
+    Args:
+        path_config_files:
+
+    Returns:
+
+    """
+    """ Check dictionary keys to have no spaces"""
+    indices.validate_index_definitions()
+
+    for method in [
+        "monthly_r",  # "dekad_r" # "dekad_r"
+    ]:  # , "full_season", "phenological_stages", "fraction_season"]:
+        obj = cei_runner(path_config_files)
+        obj.main(method)
+
+
+if __name__ == "__main__":
+    run()
```
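The new file above is the 0.1.69 `indices_runner_angola.py`: it mirrors the other per-country `indices_runner_*` modules, subclassing `geoprepare.base.BaseGeo`, collecting the country's EO CSV files, building one work item per file-year combination, and fanning them out to `indices.process` via a multiprocessing pool. Notably, it now sets `country = "angola"` and filters for `angola_maize_s1`, whereas the file it replaces (shown deleted near the end of this diff) still carried `country = "ethiopia"` and a winter-wheat filter. A minimal sketch of how the module would be invoked, mirroring its own `__main__` block; the config path is a placeholder, not something shipped with the package:

```python
from geocif import indices_runner_angola

# Sketch only: run() expects geoprepare-style config file path(s); the name below is hypothetical.
indices_runner_angola.run(["path/to/geoprepare_config.txt"])
```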
```diff
@@ -295,13 +295,16 @@ def all_correlated_feature_by_time(df, **kwargs):
             df_tmp2.loc[idx, "Type"] = combined_dict[row[0]][0]

         # Compute median of each CEI and sort the dataframe based on the absolute value of the median
-…
-…
-…
-…
-…
-…
-…
+        try:
+            dict_best_cei[region_id] = (
+                df_tmp2.groupby("Type")
+                .max()
+                .reset_index()
+                .sort_values("Value", ascending=False)["Metric"]
+                .values
+            )
+        except:
+            breakpoint()

         kwargs["region_id"] = region_id
         _region_names = ", ".join([str(x) for x in group['Region'].unique()])
```
```diff
@@ -25,29 +25,32 @@ def _compute_correlations(X, y):
     feature_correlations = {}

     for feature in X.columns:
-        # Ignore
+        # Ignore object or categorical type columns
         if X[feature].dtypes.name in ["object", "category"]:
             continue

         f_series = X[feature]

-        # Ignore NaN values in either y
-        mask = ~
-…
-…
+        # Ignore NaN values in either y or f_series
+        mask = ~(np.isnan(y) | np.isnan(f_series))
+        y_filtered = y[mask]
+        f_series_filtered = f_series[mask]

-…
+        # Handle cases where std is zero
+        if np.std(f_series_filtered) == 0 or np.std(y_filtered) == 0:
             feature_correlations[feature] = np.nan
         else:
             try:
-                r = pearsonr(
+                r = pearsonr(y_filtered, f_series_filtered)[0]
                 feature_correlations[feature] = round(r, 3)
-            except:
+            except Exception as e:
+                print(f"Error computing correlation for {feature}: {e}")
                 feature_correlations[feature] = np.nan

     return feature_correlations


+
 def find_most_common_top_feature(top_feature_by_region):
     """
     Find the most common top feature and number of occurences
```
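The change above makes the intent of `_compute_correlations` explicit: positions where either the target or the feature is NaN are masked out before computing a Pearson correlation, and zero-variance series are skipped because the coefficient is undefined there. A standalone sketch of that pattern (illustrative only, not the geocif function itself):

```python
import numpy as np
from scipy.stats import pearsonr

def masked_pearson(y, f):
    """Pearson r between two arrays, ignoring positions where either value is NaN."""
    y = np.asarray(y, dtype=float)
    f = np.asarray(f, dtype=float)
    mask = ~(np.isnan(y) | np.isnan(f))
    y, f = y[mask], f[mask]
    if len(y) < 2 or np.std(y) == 0 or np.std(f) == 0:
        return np.nan  # correlation undefined for constant or near-empty series
    return round(pearsonr(y, f)[0], 3)

# Only the first three pairs are complete, and they are perfectly correlated -> 1.0
print(masked_pearson([1, 2, 3, np.nan], [2, 4, 6, 8]))
```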
```diff
@@ -39,10 +39,10 @@ def compute_last_year_yield(df, target_col="Yield (tn per ha)"):

     return df

-…
 def compute_closest_years(all_years, harvest_year, number_lag_years):
     """
-    Finds the years closest to a given harvest year,
+    Finds the historical years closest to a given harvest year,
+    excluding any future year (harvest_year itself and beyond).

     Args:
         all_years (array-like): List or array of all years to consider.
```
```diff
@@ -50,7 +50,8 @@ def compute_closest_years(all_years, harvest_year, number_lag_years):
         number_lag_years (int): Number of closest years to return.

     Returns:
-        list:
+        list: The historical years closest to the given harvest year.
+              Returns an empty list if no historical years exist.
     """
     # Exclude the harvest year before computation to simplify logic
     filtered_years = [year for year in all_years if year != harvest_year]
```
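As a concrete illustration of the behaviour the updated docstring describes, selecting the closest historical years amounts to dropping the harvest year and anything after it, then sorting the remainder by distance. This is a hedged sketch of that logic, not the package's exact implementation:

```python
def closest_historical_years(all_years, harvest_year, number_lag_years):
    """Return up to number_lag_years years strictly before harvest_year, nearest first."""
    historical = [y for y in all_years if y < harvest_year]
    historical.sort(key=lambda y: harvest_year - y)  # smallest gap to harvest_year first
    return historical[:number_lag_years]

print(closest_historical_years(range(2015, 2024), 2023, 3))  # [2022, 2021, 2020]
```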
```diff
@@ -96,7 +97,7 @@ def compute_median_statistics(
             mask = (group["Harvest Year"].isin(closest_years)) & (
                 group["Region"] == region
             )
-            median_yield = group.loc[mask, target_col].
+            median_yield = group.loc[mask, target_col].mean()
             df.loc[
                 (df["Region"] == region) & (df["Harvest Year"] == harvest_year),
                 f"Median {target_col}",
```
```diff
@@ -186,7 +187,7 @@ compute_lag_yield(
         else:
             # Add median yield
             mask_group_median = group["Harvest Year"].isin(closest_years)
-            median_yield = group.loc[mask_group_median, target_col].
+            median_yield = group.loc[mask_group_median, target_col].mean()

             df.loc[mask_region, col] = median_yield

```
```diff
@@ -188,10 +188,24 @@ def select_features(X, y, method="RFE", min_features_to_select=3, threshold_nan=

         # Get the selected feature names
         selected_features = X.columns[selected_features].tolist()
+    elif method == "lasso":
+        from sklearn.linear_model import LassoLarsCV
+        from sklearn.feature_selection import SelectFromModel
+
+        # Fit Lasso model (L1 regularization) to perform feature selection
+        lasso = LassoLarsCV(cv=5)
+        lasso.fit(X, y)
+
+        # Use SelectFromModel to remove features with zero coefficients
+        selector = SelectFromModel(lasso, prefit=True)
+
+        # Get the selected features
+        selected_features = X.columns[selector.get_support()].tolist()
+        print(selected_features)
     elif method == "BorutaPy":
         from boruta import BorutaPy

-        selector = BorutaPy(forest, n_estimators="auto", random_state=42)
+        selector = BorutaPy(forest, n_estimators="auto", random_state=42, verbose=0)
         selector.fit(X.values, y.values)
         selected_features_mask = selector.support_
         selected_features = X.columns[selected_features_mask].tolist()
```
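The new "lasso" branch relies on L1-regularized regression driving some coefficients exactly to zero, so `SelectFromModel` can keep only the features with non-zero weights. A self-contained sketch of that pattern on synthetic data (illustrative only; the feature names and data are made up, not geocif inputs):

```python
import numpy as np
import pandas as pd
from sklearn.linear_model import LassoLarsCV
from sklearn.feature_selection import SelectFromModel

rng = np.random.default_rng(0)
X = pd.DataFrame(rng.normal(size=(200, 5)), columns=[f"f{i}" for i in range(5)])
# Only f0 and f3 actually drive the target; the rest are noise.
y = 3.0 * X["f0"] - 2.0 * X["f3"] + rng.normal(scale=0.1, size=200)

lasso = LassoLarsCV(cv=5).fit(X, y)
selector = SelectFromModel(lasso, prefit=True)  # keeps features with non-zero coefficients
print(X.columns[selector.get_support()].tolist())  # typically ['f0', 'f3']
```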
```diff
@@ -203,7 +203,7 @@ def add_statistics(
         fn = "illinois.csv"
     elif country == "Ethiopia":
         # HACK
-        fn = "
+        fn = "adm_crop_production.csv"
     else:
         fn = "adm_crop_production.csv"
     df_fewsnet = pd.read_csv(dir_stats / fn, low_memory=False)
```
```diff
@@ -0,0 +1,156 @@
+import rasterio
+from rasterio.warp import calculate_default_transform, reproject, Resampling
+import numpy as np
+import matplotlib.pyplot as plt
+import math
+
+# Input / Output paths
+input_path = r"D:\Users\ritvik\projects\GEOGLAM\Input\Global_Datasets\Masks\wolayita_maize.tif"
+output_path = r"D:\Users\ritvik\projects\GEOGLAM\Input\Global_Datasets\Masks\wolayita_maize_5km_percentage.tif"
+
+import rasterio
+from rasterio.warp import calculate_default_transform, reproject, Resampling
+from math import ceil
+import numpy as np
+
+input_path = r"D:\Users\ritvik\projects\GEOGLAM\Input\Global_Datasets\Masks\wolayita_maize.tif"
+output_path = r"D:\Users\ritvik\projects\GEOGLAM\Input\Global_Datasets\Masks\wolayita_maize_5km_percentage.tif"
+
+with rasterio.open(input_path) as src:
+    # 1) If needed, assign correct CRS
+    # Example: if you know it's actually EPSG:32637 but isn't set
+    # src_crs = rasterio.crs.CRS.from_epsg(32637)
+    # else if it's already correct, do:
+    src_crs = src.crs
+
+    # 2) Decide your pixel size.
+    # If src_crs is lat/lon (EPSG:4326), use ~0.045 deg for ~5 km.
+    # If src_crs is UTM in meters, use 5000 for 5 km.
+    pixel_size = 0.045  # or 5000 if in meters
+
+    transform, width, height = calculate_default_transform(
+        src_crs,  # source crs
+        src_crs,  # target crs (same if you just want coarser in place)
+        src.width,
+        src.height,
+        *src.bounds,
+        resolution=pixel_size
+    )
+
+    # Prepare output fraction array
+    fraction_array = np.full((height, width), -9999, dtype=np.float32)
+
+    # Reproject with average -> fraction
+    reproject(
+        source=rasterio.band(src, 1),
+        destination=fraction_array,
+        src_transform=src.transform,
+        src_crs=src_crs,
+        dst_transform=transform,
+        dst_crs=src_crs,
+        resampling=Resampling.average,
+        dst_nodata=-9999
+    )
+
+    # Now fraction_array should have values in [0..1], with -9999 for nodata.
+    valid_mask = (fraction_array != -9999)
+
+    if not np.any(valid_mask):
+        print("No valid cells at all (everything is nodata). This indicates a bounding box or CRS mismatch.")
+    else:
+        frac_min = fraction_array[valid_mask].min()
+        frac_max = fraction_array[valid_mask].max()
+        print("Fraction min:", frac_min)
+        print("Fraction max:", frac_max)
+
+        # If both min and max are 0.0, it means there's truly no coverage or it's extremely small.
+        # Otherwise you might see something like 0.0, 0.01, 0.5, etc.
+
+        # Then let's see if maybe they're all below 0.005:
+        below_005 = (fraction_array[valid_mask] < 0.005).all()
+        print("All fractions < 0.5%?", below_005)
+
+breakpoint()
+with rasterio.open(input_path) as src:
+    # If src.crs is None but you KNOW it's EPSG:4326, assign it:
+    # src_crs = rasterio.crs.CRS.from_epsg(4326)
+    # Otherwise, just use what's in the file:
+    src_crs = src.crs
+
+    # Let's assume the file is already lat/lon (EPSG:4326).
+    # We'll define ~0.045° as "5 km" at the equator.
+    new_res = 0.045
+
+    # Calculate a new transform and new shape
+    # for coarser resolution in the SAME EPSG:4326.
+    transform, width, height = calculate_default_transform(
+        src_crs,  # src CRS
+        src_crs,  # dst CRS (same if you want to stay in lat/lon)
+        src.width,
+        src.height,
+        *src.bounds,
+        resolution=new_res  # sets pixel size to 0.045 degrees
+    )
+
+    # Read full data for histogram plotting
+    data_in = src.read(1, masked=True)
+    in_profile = src.profile.copy()
+
+# Plot input histogram (0 or 1)
+arr_in = data_in.compressed()
+plt.figure()
+plt.hist(arr_in, bins=[-0.5, 0.5, 1.5], edgecolor='black')
+plt.title("Input (0/1)")
+plt.xlabel("Value")
+plt.ylabel("Frequency")
+plt.show()
+
+# Prepare output array, float32 with sentinel -9999
+out_array = np.full((height, width), -9999, dtype=np.float32)
+
+with rasterio.open(input_path) as src:
+    reproject(
+        source=rasterio.band(src, 1),
+        destination=out_array,
+        src_transform=src.transform,
+        src_crs=src.crs,
+        dst_transform=transform,
+        dst_crs=src_crs,  # same
+        resampling=Resampling.average,
+        dst_nodata=-9999
+    )
+
+# Now out_array has fraction in [0..1]. Convert to % (0..100).
+breakpoint()
+mask_valid = (out_array != -9999)
+out_array[mask_valid] *= 100.0
+out_array[mask_valid] = np.rint(out_array[mask_valid])  # round
+out_array = out_array.astype(np.int32)
+
+# Update profile
+out_profile = in_profile.copy()
+out_profile.update({
+    'driver': 'GTiff',
+    'width': width,
+    'height': height,
+    'transform': transform,
+    'crs': src_crs,
+    'dtype': 'int32',
+    'nodata': -9999
+})
+
+# Write out
+with rasterio.open(output_path, 'w', **out_profile) as dst:
+    dst.write(out_array, 1)
+
+print("Wrote:", output_path)
+
+# Plot histogram of output (ignore -9999)
+out_data = np.where(out_array == -9999, np.nan, out_array)
+valid_data = out_data[~np.isnan(out_data)]
+plt.figure()
+plt.hist(valid_data, bins=50, edgecolor="black")
+plt.title("5km Percentage (0-100)")
+plt.xlabel("Percent cropped")
+plt.ylabel("Frequency")
+plt.show()
```
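The key idea in the new `wolayita_maize_mask.py` script is that averaging a 0/1 crop mask onto a coarser grid yields, in each coarse cell, the fraction of fine-resolution pixels that were cropped, which is then scaled to a 0-100 percentage. A compact sketch of the same idea using plain NumPy block averaging; the grid sizes are illustrative and must divide evenly here, whereas the real script lets rasterio handle arbitrary grids, reprojection, and nodata:

```python
import numpy as np

# 0/1 mask at "fine" resolution (8x8), aggregated to a 2x2 "coarse" grid.
fine = np.zeros((8, 8), dtype=np.uint8)
fine[:4, :4] = 1           # top-left quarter fully cropped

block = 4                  # 4x4 fine pixels per coarse cell
coarse_fraction = fine.reshape(2, block, 2, block).mean(axis=(1, 3))
coarse_percent = np.rint(coarse_fraction * 100).astype(np.int32)
print(coarse_percent)
# [[100   0]
#  [  0   0]]
```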
```diff
@@ -0,0 +1,69 @@
+from great_tables import GT, md, system_fonts
+import pandas as pd
+
+# Your data as a pandas DataFrame
+data = [
+    [2, "10<sup>th</sup>", "<=14; >14", 89.2, "2010 - 2021"],
+    [2, "25<sup>th</sup>", "<=18.7; >18.7", 82.2, "2010 - 2021"],
+    [2, "50<sup>th</sup>", "<=24.6; >24.6", 83.7, "2010 - 2021"],
+    [2, "75<sup>th</sup>", "<=31; >31", 88.3, "2010 - 2021"],
+    [2, "90<sup>th</sup>", "<=38.9; >38.9", 96.9, "2010 - 2021"],
+    [3, "33<sup>rd</sup>, 67<sup>th</sup>", "<=20.3; 20.3 - 29.6; >29.6", 60.5, "2010 - 2021"],
+    [4, "25<sup>th</sup>, 50<sup>th</sup>, 75<sup>th</sup>",
+     "<=18.7; 18.7-24.6; 24.6-31; >31", 64.4, "2010 - 2021"]
+]
+cols = ["Number of classes", "Percentile(s)", "Yield categories", "Accuracy (%)", "Years"]
+
+df = pd.DataFrame(data, columns=cols)
+
+# Create a Great Tables object
+gt_tbl = GT(data=df)
+
+# Example formatting, coloring, and styling
+gt_tbl = (gt_tbl
+    # Format the "Accuracy (%)" column to show one decimal place
+    .fmt_number(
+        columns=["Accuracy (%)"],
+        decimals=1
+    )
+    # Color-scale the "Accuracy (%)" column (optional)
+    #.data_color(
+    #    columns=["Accuracy (%)"],
+    #    palette=["tomato", "gold", "palegreen"],
+    #    domain=[50, 100]  # Range from the lowest to highest accuracy
+    #)
+    # Set column widths
+    .cols_width({
+        "Number classes": "60px",
+        "Percentile(s)": "140px",
+        "Yield categories": "220px",
+        "Accuracy (%)": "100px",
+        "Years": "90px"
+    })
+    # Add a table header/title
+    .tab_header(
+        title=md("**Accuracy of Model for Different Yield Categories**")
+    )
+    # Add a source note (optional)
+    # .tab_source_note(
+    #     md(
+    #         "**Source**: Internal records<br>"
+    #         "**Note**: Data from 2010-2021"
+    #     )
+    # )
+    # Customize general table options
+    .tab_options(
+        heading_background_color='antiquewhite',
+        column_labels_background_color='antiquewhite',
+        source_notes_background_color='antiquewhite',
+        table_background_color='snow',
+        table_font_names=system_fonts("humanist"),
+        data_row_padding='2px'
+    )
+    # Align all columns center except "Yield categories", which might be longer text
+    .cols_align(align="center")
+    .cols_align(align="left", columns=["Yield categories"])
+)
+
+# Display the table
+GT.save(gt_tbl, file="aa.png")
```
```diff
@@ -1,6 +1,4 @@
 import geopandas as gpd
-import pandas as pd
-import matplotlib.pyplot as plt
 import palettable as pal
 import matplotlib.colors as mcolors

```
```diff
@@ -9,7 +7,7 @@ import glob
 import os

 # 1. Specify the directory containing your .dta files:
-data_dir =
+data_dir = r"C:\Users\ritvik\Downloads\maize_yield (2)\maize_yield"

 # 2. Use glob to find all .dta files in that directory:
 dta_files = glob.glob(os.path.join(data_dir, "*.dta"))
```
```diff
@@ -20,6 +18,13 @@ dataframes = [pd.read_stata(f) for f in dta_files]
 # 4. Concatenate them all into one DataFrame (row-wise):
 merged_df = pd.concat(dataframes, ignore_index=True)

+# Replace null values in PROD98CQ with those in PROD columns
+merged_df['PROD98CQ'] = merged_df['PROD98CQ'].fillna(merged_df['PROD'])
+merged_df['YEAR'] = merged_df['YEAR'].fillna(merged_df['year'])
+
+# Drop rows where AREAH is 0
+merged_df = merged_df[merged_df['AREAH'] != 0]
+
 merged_df['ZONE'] = merged_df['ZONE'].astype(int)
 merged_df['DIST'] = merged_df['DIST'].astype(int)

```
```diff
@@ -36,7 +41,7 @@ merged_df['W_CODE'] = '7' + merged_df['W_CODE']
 merged_df['W_CODE'] = merged_df['W_CODE'].str.replace('.0', '')
 merged_df['W_CODE'] = merged_df['W_CODE'].astype(int)

-dg = gpd.read_file(r"
+dg = gpd.read_file(r"wolayita_dissolved.shp")
 dg = dg[['W_CODE', 'W_NAME']]

 # Merge the two dataframes on W_CODE
```
```diff
@@ -48,8 +53,8 @@ merged_df = merged_df.dropna(subset=['PROD98CQ', 'AREAH'])
 # Compte yield column
 merged_df['yield'] = merged_df['PROD98CQ'] / merged_df['AREAH']

-# create a new dataframe which computes average yield by W_NAME for each year
-df_avg_yield = merged_df.groupby(['W_NAME', 'YEAR'])['yield']
+# create a new dataframe which computes average yield by W_NAME for each year, do a weighted average using FWEIGHT column
+df_avg_yield = merged_df.groupby(['W_NAME', 'YEAR']).apply(lambda x: np.average(x['yield'], weights=x['FWEIGHT'])).reset_index(name='yield')

 # Change W_NAME column to title case
 df_avg_yield['W_NAME'] = df_avg_yield['W_NAME'].str.title()
```
```diff
@@ -64,7 +69,15 @@ df_avg_yield = df_avg_yield.pivot(index='W_NAME', columns='YEAR', values='yield'
 df_avg_yield.index.name = None
 df_avg_yield.columns.name = None

-df_avg_yield.to_csv('
+df_avg_yield.to_csv('wolayita_yields_v8.csv')
+breakpoint()
+# Compare wolayita_yields_v2.csv with wolayita_yields.csv
+# 1. Load the two CSV files
+df_v1 = pd.read_csv('wolayita_yields.csv')
+df_v2 = pd.read_csv('wolayita_yields_v2.csv')
+
+# 2. Check if the two DataFrames are equal
+print(df_v1.equals(df_v2))

 breakpoint()
 # 5. (Optional) Inspect the merged DataFrame
```
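The switch above from a plain groupby on `yield` to `np.average(..., weights=x['FWEIGHT'])` makes the zone-year yields survey-weight aware. A small self-contained sketch of that weighted-groupby pattern; the column names follow the diff, while the rows are made-up data:

```python
import numpy as np
import pandas as pd

df = pd.DataFrame({
    "W_NAME": ["A", "A", "B"],
    "YEAR": [2020, 2020, 2020],
    "yield": [1.0, 3.0, 2.0],
    "FWEIGHT": [1.0, 3.0, 2.0],
})

weighted = (
    df.groupby(["W_NAME", "YEAR"])
    .apply(lambda g: np.average(g["yield"], weights=g["FWEIGHT"]))
    .reset_index(name="yield")
)
print(weighted)
# Group A, 2020 -> (1*1 + 3*3) / (1 + 3) = 2.5; group B, 2020 -> 2.0
```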
```diff
@@ -72,9 +72,11 @@ geocif/playground/tmp2.py
 geocif/playground/tmp3.py
 geocif/playground/tmp4.py
 geocif/playground/tmp5.py
+geocif/playground/wolayita_maize_mask.py
 geocif/risk/__init__.py
 geocif/risk/impact_assessment.py
 geocif/viz/__init__.py
+geocif/viz/gt.py
 geocif/viz/plot.py
 geocif/viz/tmp.py
 tests/test_geocif.py
```
```diff
@@ -1,212 +0,0 @@
-import itertools
-import warnings
-from multiprocessing import Pool, cpu_count
-from pathlib import Path
-
-import arrow as ar
-import pandas as pd
-from tqdm import tqdm
-
-warnings.filterwarnings("ignore")
-
-from .cei import indices
-from geoprepare import base
-
-country = "ethiopia"
-
-def remove_duplicates(lst):
-    """
-
-    :param lst:
-    :return:
-    """
-    return list(set([i for i in lst]))
-
-
-def get_admin_zone(country, dg_shp):
-    admin_zone = "admin_1"
-    country = country.title().replace(" ", "_")
-
-    # Read in shapefile
-    dg_country = dg_shp[dg_shp["ADMIN0"] == country]
-
-    # Is the ADMIN2 column all None? If so, return admin_1 else return admin_2
-    if dg_country.empty:
-        admin_zone = "admin_1"
-    elif not dg_country["ADMIN2"].isna().all():
-        admin_zone = "admin_2"
-
-    return admin_zone
-
-
-class cei_runner(base.BaseGeo):
-    def __init__(self, path_config_file):
-        super().__init__(path_config_file)
-
-        # Parse configuration files
-        self.parse_config()
-
-        self.dir_input = Path(self.parser.get("PATHS", "dir_input"))
-        import platform
-        if platform.system() == "Linux":
-            self.base_dir = Path(
-                rf"/gpfs/data1/cmongp1/GEOGLAM/Output/countries/{country}"
-            )
-        else:
-            self.base_dir = Path(
-                rf"D:\Users\ritvik\projects\GEOGLAM\Output\countries\{country}"
-            )  # Path(self.parser.get("PATHS", "dir_crop_inputs"))
-        self.do_parallel = self.parser.getboolean("DEFAULT", "do_parallel")
-
-    def collect_files(self):
-        """
-        1. Collect all the files which contain EO information
-        2. Exclude files from the `processed` directory if it is already in
-        processed_include_fall directory
-        3. Create a dataframe that contains the following columns:
-        - directory: name of directory where file is located
-        - path: full path to file
-        - filename: name of file
-        :return: Return the dataframe created above
-        """
-        import geopandas as gp
-
-        dg_shp = gp.read_file(
-            self.dir_input
-            / "Global_Datasets"
-            / "Regions"
-            / "Shps"
-            / "adm_shapefile.shp",
-            engine="pyogrio",
-        )
-
-        # Collect all the files which contain EO information
-        df_files = pd.DataFrame(columns=["directory", "path", "filename", "admin_zone"])
-        for filepath in self.base_dir.rglob("*.csv"):
-            country = filepath.parents[0].name
-
-            admin_zone = get_admin_zone(country, dg_shp)
-
-            # If country is not in cc.COUNTRIES then skip
-            # HACK: Skip korea for now, as it is giving errors
-            if country == "republic_of_korea":
-                continue
-
-            # Get name of directory one level up
-            process_type = filepath.parents[1].name
-
-            # Get name of file
-            filename = filepath.name
-
-            # Add to dataframe
-            df_files.loc[len(df_files)] = [process_type, filepath, filename, admin_zone]
-
-        # Exclude those rows where directory is processed and file is already in
-        # processed_include_fall directory
-        no_fall = df_files["directory"] == "processed"
-        include_fall = df_files[df_files["directory"] == "processed_include_fall"][
-            "filename"
-        ]
-
-        df_files = df_files[~(no_fall & (df_files["filename"].isin(include_fall)))]
-
-        return df_files
-
-    def process_combinations(self, df, method):
-        """
-        Create a list of tuples of the following:
-        - directory: name of directory where file is located
-        - path: full path to file
-        - filename: name of file
-        - method: whether to compute indices for phenological stages or not
-        This tuple will be used as input to the `process` function
-        :param df:
-        :param method:
-        :return:
-        """
-        combinations = []
-
-        for index, row in tqdm(df.iterrows()):
-            combinations.extend(
-                list(
-                    itertools.product([row[0]], [row[1]], [row[2]], [row[3]], [method])
-                )
-            )
-
-        combinations = remove_duplicates(combinations)
-
-        return combinations
-
-    def main(self, method):
-        """
-
-        :param method:
-        :return:
-        """
-        # Create a dataframe of the files to be analyzed
-        df_files = self.collect_files()
-
-        combinations = self.process_combinations(df_files, method)
-
-        # Add an element to the tuple to indicate the season
-        # Last element is redo flag which is True if the analysis is to be redone
-        # and False otherwise. Analysis is always redone for the current year
-        # and last year whether file exists or not
-        combinations = [
-            (
-                self.parser,
-                status,
-                path,
-                filename,
-                admin_zone,
-                category,
-                year,
-                "ndvi",
-                False,  # redo
-            )
-            for year in range(2001, ar.utcnow().year + 1)
-            for status, path, filename, admin_zone, category in combinations
-        ]
-
-        # Only keep those entries in combinations where the third elemt is
-        # mozambique, south_africa, angola or dem_people's_rep_of_korea
-        # This is done to test the code for these countries
-        combinations = [i for i in combinations if f"{country}_winter_wheat_s1" in i[3]]
-
-        if True:
-            num_cpu = int(cpu_count() * 0.9)
-            with Pool(num_cpu) as p:
-                for i, _ in enumerate(p.imap_unordered(indices.process, combinations)):
-                    pass
-        else:
-            # Use the code below if you want to test without parallelization or
-            # if you want to debug by using pdb
-            pbar = tqdm(combinations)
-            for i, val in enumerate(pbar):
-                pbar.set_description(
-                    f"Main loop {combinations[i][2]} {combinations[i][5]}"
-                )
-                indices.process(val)
-
-
-def run(path_config_files=[]):
-    """
-
-    Args:
-        path_config_files:
-
-    Returns:
-
-    """
-    """ Check dictionary keys to have no spaces"""
-    indices.validate_index_definitions()
-
-    for method in [
-        "monthly_r",  # "dekad_r" # "dekad_r"
-    ]:  # , "full_season", "phenological_stages", "fraction_season"]:
-        obj = cei_runner(path_config_files)
-        obj.main(method)
-
-
-if __name__ == "__main__":
-    run()
```
|