glam-processing 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,654 @@
1
+ import os
2
+ import gzip
3
+ import shutil
4
+ import logging
5
+
6
+ from datetime import datetime, timedelta
7
+ import requests
8
+ import subprocess
9
+ from multiprocessing import Pool
10
+
11
+ from bs4 import BeautifulSoup
12
+
13
+ from tqdm import tqdm
14
+
15
+ import rasterio
16
+ from rasterio.io import MemoryFile
17
+ from rasterio.crs import CRS
18
+ from rasterio.merge import merge
19
+ from rasterio.rio.overview import get_maximum_overview_level
20
+ from rio_cogeo.cogeo import cog_translate
21
+ from rio_cogeo.cogeo import cog_validate
22
+ from rio_cogeo.profiles import cog_profiles
23
+
24
+ import rioxarray
25
+
26
+ import earthaccess
27
+ from earthaccess import Auth, DataCollections, DataGranules
28
+
29
+
30
+ from .earthdata import (
31
+ create_ndvi_geotiff,
32
+ create_ndwi_geotiff,
33
+ create_sds_geotiff,
34
+ SUPPORTED_DATASETS as EARTHDATA_DATASETS,
35
+ )
36
+
37
+ from . import exceptions
38
+
39
+
40
# Root logging configuration: every record is prefixed with a timestamp.
logging.basicConfig(format="%(asctime)s - %(message)s", datefmt="%d-%b-%y %H:%M:%S")
log = logging.getLogger(__name__)

# Dataset identifiers grouped by the provider that serves them.
# add more CLMS dataset id's as needed
CLMS_DATASETS = ["swi_12.5km_v3_10daily"]
UCSB_DATASETS = ["CHIRPS-2.0"]
SERVIR_DATASETS = ["esi/4WK", "esi/12WK"]

# Every dataset id accepted by the downloaders, Earthdata ids first.
SUPPORTED_DATASETS = [
    *EARTHDATA_DATASETS,
    *CLMS_DATASETS,
    *UCSB_DATASETS,
    *SERVIR_DATASETS,
]

# Index names accepted by the vegetation-index download helpers.
SUPPORTED_INDICIES = ["NDVI", "NDWI"]
58
+
59
+
60
class GlamDownloader:
    """Base downloader: validates the dataset id and builds cloud-optimized GeoTIFFs.

    Subclasses add the provider-specific query and download logic.
    """

    def __init__(self, dataset):
        # Assigned through the property setter, so unsupported ids fail here.
        self.dataset = dataset

    @property
    def dataset(self):
        """Identifier of the dataset this downloader works with."""
        return self._dataset

    @dataset.setter
    def dataset(self, value):
        if value not in SUPPORTED_DATASETS:
            raise exceptions.UnsupportedError(
                f"Dataset '{value}' not recognized or not supported."
            )
        self._dataset = value

    @staticmethod
    def supported_datasets():
        """Return the list of supported dataset identifiers."""
        return SUPPORTED_DATASETS

    @staticmethod
    def supported_indicies():
        """Return the list of supported index names."""
        return SUPPORTED_INDICIES

    @staticmethod
    def _swap_extension(path, new_extension):
        """Return ``path`` with its final extension replaced by ``new_extension``.

        Only the trailing extension is touched; ``str.replace`` would also
        rewrite matching text inside directory or file names.
        """
        root, _ = os.path.splitext(path)
        return root + new_extension

    def _cloud_optimize(self, dataset, out_file, nodata=False):
        """Write ``dataset`` to ``out_file`` as a deflate-compressed COG.

        Args:
            dataset: Path of the raster to read.
            out_file: Path of the COG to write.
            nodata: Nodata value to record in the output. ``False`` (the
                default) or ``None`` keeps the source value; ``0`` is honored.

        Returns:
            ``True`` once the translation has finished.
        """
        # The context manager closes the source handle even if translation fails.
        with rasterio.open(dataset) as raster:
            out_meta = raster.meta.copy()

            # Identity check rather than truthiness, so that 0 is a usable value.
            if nodata is not False and nodata is not None:
                out_meta["nodata"] = nodata

            out_meta.update(cog_profiles.get("deflate"))
            out_meta["BIGTIFF"] = "IF_SAFER"
            cog_translate(
                raster,
                out_file,
                out_meta,
                allow_intermediate_compression=True,
                quiet=False,
            )

        return True

    def _create_mosaic_cog_from_vrt(self, vrt_path):
        """Turn a VRT mosaic into a COG next to it and return the COG path.

        The VRT and the intermediate GeoTIFF are deleted along the way.

        Raises:
            subprocess.CalledProcessError: If ``gdal_translate`` exits non-zero.
        """
        temp_path = self._swap_extension(vrt_path, ".temp.tif")
        out_path = self._swap_extension(vrt_path, ".tif")
        log.info(temp_path)
        log.info(out_path)

        log.info("Creating global mosaic tiff.")
        mosaic_command = [
            "gdal_translate",
            "-of",
            "GTiff",
            "-co",
            "COMPRESS=DEFLATE",
            "-co",
            "BIGTIFF=IF_SAFER",
            vrt_path,
            temp_path,
        ]
        try:
            # Fail here with the GDAL exit status instead of later on a
            # missing or truncated intermediate file.
            subprocess.run(mosaic_command, check=True)
        finally:
            os.remove(vrt_path)

        log.info("Creating COG.")

        if self._cloud_optimize(temp_path, out_path):
            os.remove(temp_path)

        return out_path

    def _create_mosaic_cog_from_tifs(self, date_string, files, out_dir):
        """Mosaic ``files`` into one COG named after the dataset, variable and date.

        Args:
            date_string: Composite date formatted as ``YYYY-MM-DD``.
            files: Paths of the GeoTIFFs to mosaic; the variable name is taken
                from the second-to-last dot-separated part of the first path.
            out_dir: Directory that receives the mosaic.

        Returns:
            Path of the resulting COG.

        Raises:
            subprocess.CalledProcessError: If a GDAL command exits non-zero.
        """
        date = datetime.strptime(date_string, "%Y-%m-%d")
        year = date.year
        doy = date.strftime("%j")

        # get index or sds name
        variable = files[0].split(".")[-2]

        file_name = f"{self.dataset}.{variable}.{year}.{doy}.tif"
        out_path = os.path.join(out_dir, file_name)
        vrt_path = self._swap_extension(out_path, ".vrt")

        log.info("Creating mosaic VRT.")
        subprocess.run(["gdalbuildvrt", vrt_path, *files], check=True)

        return self._create_mosaic_cog_from_vrt(vrt_path)
156
+
157
+
158
class EarthDataDownloader(GlamDownloader):
    """Downloader for datasets served through NASA Earthdata via ``earthaccess``."""

    def __init__(self, dataset):
        super().__init__(dataset)
        # Assigned through the property setter, which logs in if necessary.
        self.auth = Auth()

    @property
    def auth(self):
        """The ``earthaccess`` auth object used by this downloader."""
        return self._auth

    @auth.setter
    def auth(self, value):
        if not value.authenticated:
            try:
                value.login()
            except Exception:
                # Fall back to an interactive login. ``Exception`` rather than a
                # bare except, so Ctrl-C and SystemExit still propagate.
                value.login(strategy="interactive", persist=True)
        self._auth = value

    @property
    def authenticated(self):
        """Whether the underlying auth object reports being authenticated."""
        return self.auth.authenticated

    @property
    def collection(self):
        """First cloud-hosted collection whose short name equals the dataset id."""
        return DataCollections().short_name(self.dataset).cloud_hosted(True).get(1)[0]

    def info(self):
        """Return the summary of the dataset's collection."""
        return self.collection.summary()

    def query_granules(self, start_date, end_date):
        """Return all granules of the collection within the temporal range."""
        log.info("Querying available granules")
        concept_id = self.collection.concept_id()
        query = DataGranules().concept_id(concept_id).temporal(start_date, end_date)
        return query.get_all()

    def query_composites(self, start_date, end_date):
        """Return the distinct composites covered by the granules in the range.

        Returns:
            A list of dicts with ``id`` (first two dot-separated parts of the
            granule native id), ``start_date`` and ``end_date`` (first ten
            characters of the granule's begin/end timestamps), in first-seen
            order.
        """
        granules = self.query_granules(start_date, end_date)
        composites = []
        seen = set()  # constant-time duplicate check instead of scanning the list
        for granule in tqdm(granules, desc="Getting available composite dates"):
            id_parts = granule["meta"]["native-id"].split(".")
            time_range = granule["umm"]["TemporalExtent"]["RangeDateTime"]
            key = (
                id_parts[0] + "." + id_parts[1],
                time_range["BeginningDateTime"][:10],
                time_range["EndingDateTime"][:10],
            )
            if key in seen:
                continue
            seen.add(key)
            composites.append(
                {"id": key[0], "start_date": key[1], "end_date": key[2]}
            )

        return composites

    def download_granules(self, start_date, end_date, out_dir):
        """Download every granule in the range, retrying until all are on disk.

        Returns:
            The list of downloaded file paths.
        """
        local_path = os.path.abspath(out_dir)
        granules = self.query_granules(start_date, end_date)
        granule_count = len(granules)

        while True:
            files = earthaccess.download(granules, local_path=local_path)
            # Count only entries that exist on disk. A ``None`` result or
            # ``None`` entries count as missing files instead of raising.
            downloaded = [
                file
                for file in (files or [])
                if file is not None and os.path.isfile(file)
            ]
            if len(downloaded) == granule_count:
                break
            log.info(
                f"{len(downloaded)} of {granule_count} files downloaded. Retrying..."
            )

        log.info(
            f"Successfilly downloaded {len(downloaded)} of {granule_count} files."
        )
        return downloaded

    def download_vi_granules(self, start_date, end_date, out_dir, vi="NDVI"):
        """Download granules and derive one vegetation-index GeoTIFF per granule.

        The downloaded granule files are deleted once the GeoTIFFs exist.

        Raises:
            exceptions.UnsupportedError: If ``vi`` is not a supported index.
        """
        if vi not in SUPPORTED_INDICIES:
            raise exceptions.UnsupportedError(
                f"Vegetation index '{vi}' not recognized or not supported."
            )

        vi_functions = {
            "NDVI": create_ndvi_geotiff,
            "NDWI": create_ndwi_geotiff,
        }
        create_vi = vi_functions[vi]

        out = os.path.abspath(out_dir)
        granule_files = self.download_granules(start_date, end_date, out)

        vi_files = [
            create_vi(file, out)
            for file in tqdm(granule_files, desc=f"Creating {vi} files")
        ]

        # Remove granule files after tiffs are created.
        for file in granule_files:
            os.remove(file)

        return vi_files

    def download_sds_granules(self, sds_name, start_date, end_date, out_dir):
        """Download granules and extract ``sds_name`` from each into a GeoTIFF.

        The downloaded granule files are deleted once the GeoTIFFs exist.
        """
        out = os.path.abspath(out_dir)
        granule_files = self.download_granules(start_date, end_date, out)

        sds_files = [
            create_sds_geotiff(file, self.dataset, sds_name, out)
            for file in tqdm(granule_files, desc=f"Creating {sds_name} files")
        ]

        # Remove granule files after tiffs are created.
        for file in granule_files:
            os.remove(file)

        return sds_files

    def download_vi_composites(self, start_date, end_date, out_dir, vi="NDVI"):
        """Build one vegetation-index mosaic COG per composite in the range.

        Returns:
            The list of mosaic paths, one per composite.
        """
        out = os.path.abspath(out_dir)
        composites = self.query_composites(start_date, end_date)

        output = []
        for composite in tqdm(composites, desc=f"Creating {vi} composites"):
            vi_files = self.download_vi_granules(
                composite["start_date"], composite["end_date"], out, vi=vi
            )
            vi_mosaic = self._create_mosaic_cog_from_tifs(
                composite["start_date"], vi_files, out
            )

            # Remove tiffs after mosaic creation.
            for file in vi_files:
                os.remove(file)

            output.append(vi_mosaic)

        return output

    def download_sds_composites(self, sds_name, start_date, end_date, out_dir):
        """Build one ``sds_name`` mosaic COG per composite in the range.

        Returns:
            The list of mosaic paths, one per composite.
        """
        out = os.path.abspath(out_dir)
        composites = self.query_composites(start_date, end_date)

        output = []
        for composite in tqdm(composites, desc=f"Creating {sds_name} composites"):
            sds_files = self.download_sds_granules(
                sds_name, composite["start_date"], composite["end_date"], out
            )
            sds_mosaic = self._create_mosaic_cog_from_tifs(
                composite["start_date"], sds_files, out
            )

            # Remove tiffs after mosaic creation.
            for file in sds_files:
                os.remove(file)

            output.append(sds_mosaic)

        return output
321
+
322
+
323
class CLMSDownloader(GlamDownloader):
    """Downloader for Copernicus Global Land Service (CLMS) NetCDF products."""

    def __init__(self, dataset):
        super().__init__(dataset)

        # Text manifest listing one download URL per line.
        self.manifest = f"https://globalland.vito.be/download/manifest/{self.dataset}_netcdf/manifest_clms_global_{self.dataset}_netcdf_latest.txt"

    def query_composites(self, start_date, end_date):
        """List manifest entries whose date lies within the inclusive range.

        Args:
            start_date: First date to include, formatted ``YYYY-MM-DD``.
            end_date: Last date to include, formatted ``YYYY-MM-DD``.

        Returns:
            A list of ``{"date": "YYYY-MM-DD", "url": ...}`` dicts. The date is
            read from the first eight characters of the URL's parent directory.
        """
        r = requests.get(self.manifest)
        # splitlines() also copes with CRLF manifests, where split("\n") would
        # leave a trailing "\r" and make the ".nc" suffix test fail.
        download_list = r.text.splitlines()

        # The bounds do not change per URL, so parse them once.
        start = datetime.strptime(start_date, "%Y-%m-%d")
        end = datetime.strptime(end_date, "%Y-%m-%d")

        composites = []
        for url in tqdm(download_list, desc=f"Querying available {self.dataset} files"):
            if not url.endswith(".nc"):
                continue

            datestring = url.split("/")[-2]
            date = datetime(
                int(datestring[:4]), int(datestring[4:6]), int(datestring[6:8])
            )

            if start <= date <= end:
                composites.append({"date": date.strftime("%Y-%m-%d"), "url": url})

        return composites

    @staticmethod
    def _download_file(url, dest):
        """Stream ``url`` to ``dest``; return whether the size check passed.

        The byte count on disk is compared with the ``Content-Length`` header
        when the server sends one. On mismatch the partial file is removed.
        """
        # stream=True keeps memory use to one chunk instead of the whole body.
        with requests.get(url, stream=True) as r:
            with open(dest, "wb") as fd:  # write data in chunks
                for chunk in r.iter_content(chunk_size=1024 * 1024):
                    fd.write(chunk)
            expected = r.headers.get("Content-Length")

        observed_size = int(os.stat(dest).st_size)
        if expected is not None and observed_size != int(expected):
            expected_size = int(expected)
            w = f"\nExpected file size:\t{expected_size} bytes\nObserved file size:\t{observed_size} bytes"
            log.warning(w)
            os.remove(dest)
            return False
        return True

    def download_composites(self, start_date, end_date, out_dir):
        """Download each composite in the range and convert it to a COG.

        For every composite the NetCDF file is downloaded, its ``SWI_010``
        variable is written to an intermediate GeoTIFF, and that GeoTIFF is
        cloud-optimized. Both intermediate files are then removed.

        Returns:
            The list of COG paths, or an empty tuple as soon as one download
            fails its size check.
        """
        composites = self.query_composites(start_date, end_date)

        completed = []

        for composite in tqdm(
            composites, desc=f"Downloading {self.dataset} composites"
        ):
            date = composite.get("date")
            url = composite.get("url")

            out = os.path.join(out_dir, f"{self.dataset}.{date}.tif")

            # Temporary NetCDF file; later to be converted to tiff. Only the
            # extension is swapped, so "tif" elsewhere in the path is untouched.
            file_nc = os.path.splitext(out)[0] + ".nc"

            # if checksum is failed, log and return empty
            if not self._download_file(url, file_nc):
                return ()

            # Select SWI layer for T-Value of 10
            temp = os.path.join(out_dir, f"{self.dataset}.{date}.temp.tif")

            # Use rioxarray to remove time dimension and create intermediate geotiff
            xds = rioxarray.open_rasterio(os.path.abspath(file_nc), decode_times=False)
            try:
                xds.squeeze()["SWI_010"].rio.to_raster(temp)
            finally:
                # Release the NetCDF handle before the file is deleted below.
                xds.close()

            optimized = self._cloud_optimize(temp, out, nodata=False)

            if optimized:
                os.remove(file_nc)
                os.remove(temp)
                completed.append(out)

        return completed
404
+
405
+
406
class UCSBDownloader(GlamDownloader):
    """Downloader for UCSB Climate Hazards Center dekadal GeoTIFFs."""

    def __init__(self, dataset):
        super().__init__(dataset)
        # Directory listings for final (gzipped) and preliminary products.
        self.index = f"https://data.chc.ucsb.edu/products/{dataset}/global_dekad/tifs/"
        self.prelim_index = (
            f"https://data.chc.ucsb.edu/products/{dataset}/prelim/global_dekad/tifs/"
        )

    @staticmethod
    def _dekad_bounds(year, month, dekad):
        """Return the (first day, last day) datetimes of a dekad.

        Dekads 1, 2 and 3 start on the 1st, 11th and 21st; any other value is
        used as the day of month directly. The third dekad runs to the end of
        the month; every other one spans ten days.
        """
        first_day = {1: 1, 2: 11, 3: 21}.get(dekad, dekad)
        start = datetime(year, month, first_day)
        if dekad == 3:
            # Day 28 plus four days always lands in the following month.
            next_month = (start.replace(day=28) + timedelta(days=4)).replace(day=1)
            end = next_month - timedelta(days=1)
        else:
            end = start + timedelta(days=9)
        return start, end

    def _query_index(self, index_url, suffix, start_date, end_date, desc):
        """Return hrefs from ``index_url`` ending in ``suffix`` that overlap the range.

        The three dot-separated fields preceding ``suffix`` are read as year,
        month and dekad number.
        """
        r = requests.get(index_url)
        index_links = BeautifulSoup(r.text, "html.parser").find_all("a")

        date_range_start = datetime.strptime(start_date, "%Y-%m-%d")
        date_range_end = datetime.strptime(end_date, "%Y-%m-%d")

        file_names = []
        for link in tqdm(index_links, desc=desc):
            # Anchors without an href are skipped rather than crashing.
            href = link.get("href") or ""
            if not href.endswith(suffix):
                continue

            file_parts = href[: -len(suffix)].split(".")
            dekad = int(file_parts[-1])
            month = int(file_parts[-2])
            year = int(file_parts[-3])

            composite_start, composite_end = self._dekad_bounds(year, month, dekad)

            # Interval overlap. This also matches a query range lying entirely
            # inside one dekad, which checking only the endpoints would miss.
            if composite_start <= date_range_end and composite_end >= date_range_start:
                file_names.append(href)

        return file_names

    def query_prelim_composites(self, start_date, end_date):
        """List preliminary ``.tif`` files whose dekad overlaps the date range."""
        return self._query_index(
            self.prelim_index,
            ".tif",
            start_date,
            end_date,
            f"Querying available preliminary {self.dataset} files",
        )

    def query_composites(self, start_date, end_date):
        """List final ``.tif.gz`` files whose dekad overlaps the date range."""
        return self._query_index(
            self.index,
            ".tif.gz",
            start_date,
            end_date,
            f"Querying available {self.dataset} files",
        )

    @staticmethod
    def _download_file(url, dest):
        """Stream ``url`` to ``dest``; return whether the size check passed.

        The byte count on disk is compared with the ``Content-Length`` header
        when the server sends one. On mismatch the partial file is removed.
        """
        # stream=True keeps memory use to one chunk instead of the whole body.
        with requests.get(url, stream=True) as r:
            with open(dest, "wb") as fd:  # write data in chunks
                for chunk in r.iter_content(chunk_size=1024 * 1024):
                    fd.write(chunk)
            expected = r.headers.get("Content-Length")

        observed_size = int(os.stat(dest).st_size)
        if expected is not None and observed_size != int(expected):
            expected_size = int(expected)
            w = f"WARNING:\nExpected file size:\t{expected_size} bytes\nObserved file size:\t{observed_size} bytes"
            log.warning(w)
            os.remove(dest)
            return False
        return True

    def download_composites(self, start_date, end_date, out_dir, prelim=True):
        """Download final (and optionally preliminary) composites as COGs.

        Args:
            start_date: First date of the range, formatted ``YYYY-MM-DD``.
            end_date: Last date of the range, formatted ``YYYY-MM-DD``.
            out_dir: Directory that receives the files.
            prelim: Also fetch preliminary files that have no final counterpart.

        Returns:
            The list of COG paths, or an empty tuple as soon as one download
            fails its size check.
        """
        composites = self.query_composites(start_date, end_date)

        completed = []

        for composite in tqdm(
            composites, desc=f"Downloading {self.dataset} composites"
        ):
            zipped_out = os.path.join(out_dir, composite)
            # Drop exactly the ".gz" suffix. str.strip(".gz") removes any of
            # those characters from BOTH ends and mangles paths such as "./out".
            unzipped_out = zipped_out[: -len(".gz")]

            # checksum failure; return empty tuple
            if not self._download_file(self.index + composite, zipped_out):
                return ()  # no files for you today, but we'll try again tomorrow!

            # use gzip to unzip file to final location
            with gzip.open(zipped_out) as fz, open(unzipped_out, "w+b") as fu:
                shutil.copyfileobj(fz, fu)
            os.remove(zipped_out)  # delete zipped version

            if self._cloud_optimize(unzipped_out, unzipped_out, -9999):
                completed.append(unzipped_out)

        if prelim:
            final_names = set(composites)
            prelim_composites = self.query_prelim_composites(start_date, end_date)
            for prelim_composite in tqdm(
                prelim_composites, desc=f"Downloading {self.dataset} prelim composites"
            ):
                # Skip preliminary files that already have a final version.
                if f"{prelim_composite}.gz" in final_names:
                    continue

                filename, ext = os.path.splitext(prelim_composite)
                out = os.path.join(out_dir, f"{filename}.prelim{ext}")

                # checksum failure; return empty tuple
                if not self._download_file(self.prelim_index + prelim_composite, out):
                    return ()  # no files for you today, but we'll try again tomorrow!

                if self._cloud_optimize(out, out, -9999):
                    completed.append(out)

        return completed
564
+
565
+
566
class SERVIRDownloader(GlamDownloader):
    """Downloader for GeoTIFFs listed under the SERVIR Global data index."""

    def __init__(self, dataset):
        super().__init__(dataset)

        # Base URL; per-year listings live directly beneath it.
        self.index = f"https://gis1.servirglobal.net/data/{dataset}/"

    def query_composites(self, start_date, end_date):
        """List ``.tif`` files whose date lies within the inclusive range.

        One listing is fetched per calendar year spanned by the range. The date
        is read from the last underscore-separated field of each file name,
        parsed as year plus day-of-year.

        Returns:
            A list of ``"<year>/<file name>"`` strings.
        """
        # The bounds do not change per link, so parse them once.
        range_start = datetime.strptime(start_date, "%Y-%m-%d")
        range_end = datetime.strptime(end_date, "%Y-%m-%d")

        y1 = int(start_date.split("-")[0])
        y2 = int(end_date.split("-")[0])

        file_names = []

        for year in tqdm(
            range(y1, y2 + 1), desc=f"Querying available {self.dataset} files"
        ):
            r = requests.get(self.index + str(year))
            links = BeautifulSoup(r.text, "html.parser").find_all("a")

            for link in links:
                if not link.text.endswith(".tif"):
                    continue

                file_name, _ = os.path.splitext(link.text)
                datestring = file_name.split("_")[-1]
                date = datetime.strptime(datestring, "%Y%j")

                if range_start <= date <= range_end:
                    file_names.append(str(year) + "/" + link.text)

        return file_names

    @staticmethod
    def _download_file(url, dest):
        """Stream ``url`` to ``dest``; return whether the size check passed.

        The byte count on disk is compared with the ``Content-Length`` header
        when the server sends one. On mismatch the partial file is removed.
        """
        # stream=True keeps memory use to one chunk instead of the whole body.
        with requests.get(url, stream=True) as r:
            with open(dest, "wb") as fd:  # write data in chunks
                for chunk in r.iter_content(chunk_size=1024 * 1024):
                    fd.write(chunk)
            expected = r.headers.get("Content-Length")

        observed_size = int(os.stat(dest).st_size)
        if expected is not None and observed_size != int(expected):
            expected_size = int(expected)
            w = f"WARNING:\nExpected file size:\t{expected_size} bytes\nObserved file size:\t{observed_size} bytes"
            log.warning(w)
            os.remove(dest)
            return False
        return True

    def download_composites(self, start_date, end_date, out_dir):
        """Download each composite in the range and cloud-optimize it in place.

        Returns:
            The list of COG paths, or an empty tuple as soon as one download
            fails its size check.
        """
        composites = self.query_composites(start_date, end_date)
        completed = []

        for composite in tqdm(
            composites, desc=f"Downloading {self.dataset} composites"
        ):
            # Composite entries look like "<year>/<file>"; keep the file name only.
            out = os.path.join(out_dir, composite.split("/")[-1])

            # checksum failure; return empty tuple
            if not self._download_file(self.index + composite, out):
                return ()  # no files for you today, but we'll try again tomorrow!

            if self._cloud_optimize(out, out, -9999):
                completed.append(out)

        return completed
632
+
633
+
634
class Downloader:
    """Facade that picks the provider-specific downloader for a dataset.

    Attributes that are not found on the facade itself are looked up on the
    wrapped downloader, available as ``instance``.
    """

    def __init__(self, dataset):
        """Resolve ``dataset`` (id or short name) and build its downloader.

        Raises:
            ValueError: If the dataset belongs to none of the known providers.
        """
        # add more short names as needed
        self.short_names = {"chirps": "CHIRPS-2.0", "swi": "swi_12.5km_v3_10daily"}
        dataset = self.short_names.get(dataset, dataset)
        self.dataset = dataset

        if dataset in EARTHDATA_DATASETS:
            self.instance = EarthDataDownloader(dataset)
        elif dataset in UCSB_DATASETS:
            self.instance = UCSBDownloader(dataset)
        elif dataset in CLMS_DATASETS:
            self.instance = CLMSDownloader(dataset)
        elif dataset in SERVIR_DATASETS:
            self.instance = SERVIRDownloader(dataset)
        else:
            raise ValueError(f"Dataset {dataset} not supported")

    def __getattr__(self, name):
        """Delegate attributes missing on the facade to the wrapped downloader.

        Raises:
            AttributeError: If no downloader has been attached yet, or if the
                wrapped downloader lacks the attribute.
        """
        # Read ``instance`` from __dict__ directly. Going through attribute
        # access would re-enter __getattr__ and recurse without end whenever
        # ``instance`` is not set yet (failed __init__, copy, unpickling).
        try:
            instance = self.__dict__["instance"]
        except KeyError:
            raise AttributeError(
                f"{type(self).__name__!r} object has no attribute {name!r}"
            ) from None
        # assume it is implemented by self.instance
        return getattr(instance, name)