oafuncs-0.0.90-py2.py3-none-any.whl → oafuncs-0.0.92-py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- oafuncs/data_store/OAFuncs.png +0 -0
- oafuncs/oa_data.py +9 -82
- oafuncs/oa_down/__init__.py +1 -0
- oafuncs/oa_down/hycom_3hourly.py +322 -174
- oafuncs/oa_down/idm.py +50 -0
- oafuncs/oa_down/literature.py +55 -30
- oafuncs/oa_file.py +58 -14
- oafuncs/oa_help.py +7 -1
- oafuncs/oa_nc.py +20 -18
- oafuncs/oa_tool/__init__.py +6 -6
- oafuncs/oa_tool/parallel.py +90 -0
- {oafuncs-0.0.90.dist-info → oafuncs-0.0.92.dist-info}/METADATA +12 -2
- oafuncs-0.0.92.dist-info/RECORD +28 -0
- {oafuncs-0.0.90.dist-info → oafuncs-0.0.92.dist-info}/WHEEL +1 -1
- oafuncs-0.0.90.dist-info/RECORD +0 -26
- {oafuncs-0.0.90.dist-info → oafuncs-0.0.92.dist-info}/LICENSE.txt +0 -0
- {oafuncs-0.0.90.dist-info → oafuncs-0.0.92.dist-info}/top_level.txt +0 -0
oafuncs/oa_down/hycom_3hourly.py
CHANGED
@@ -26,19 +26,24 @@ from threading import Lock
 import matplotlib.pyplot as plt
 import numpy as np
 import pandas as pd
+import xarray as xr
 import requests
 from rich import print
 from rich.progress import Progress
+import netCDF4 as nc

 from oafuncs.oa_down.user_agent import get_ua
 from oafuncs.oa_file import file_size, mean_size
+from oafuncs.oa_nc import check as check_nc
+from oafuncs.oa_nc import modify as modify_nc
+from oafuncs.oa_down.idm import downloader as idm_downloader

 warnings.filterwarnings("ignore", category=RuntimeWarning, message="Engine '.*' loading failed:.*")

 __all__ = ["draw_time_range", "download", "how_to_use", "get_time_list"]


-def get_initial_data():
+def _get_initial_data():
     global variable_info, data_info, var_group, single_var_group
     # ----------------------------------------------
     # variable
@@ -305,14 +310,14 @@ def get_time_list(time_s, time_e, delta, interval_type="hour"):
     return dt_list


-def transform_time(time_str):
+def _transform_time(time_str):
     # old_time = '2023080203'
     # time_new = '2023-08-02T03%3A00%3A00Z'
     time_new = f"{time_str[:4]}-{time_str[4:6]}-{time_str[6:8]}T{time_str[8:10]}%3A00%3A00Z"
     return time_new


-def get_query_dict(var, lon_min, lon_max, lat_min, lat_max, time_str_ymdh, time_str_end=None, mode="single_depth", depth=None, level_num=None):
+def _get_query_dict(var, lon_min, lon_max, lat_min, lat_max, time_str_ymdh, time_str_end=None, mode="single_depth", depth=None, level_num=None):
     query_dict = {
         "var": variable_info[var]["var_name"],
         "north": lat_max,
@@ -331,11 +336,11 @@ def get_query_dict(var, lon_min, lon_max, lat_min, lat_max, time_str_ymdh, time_
     }

     if time_str_end is not None:
-        query_dict["time_start"] = transform_time(time_str_ymdh)
-        query_dict["time_end"] = transform_time(time_str_end)
+        query_dict["time_start"] = _transform_time(time_str_ymdh)
+        query_dict["time_end"] = _transform_time(time_str_end)
         query_dict["timeStride"] = 1
     else:
-        query_dict["time"] = transform_time(time_str_ymdh)
+        query_dict["time"] = _transform_time(time_str_ymdh)

     def get_nearest_level_index(depth):
         level_depth = [0.0, 2.0, 4.0, 6.0, 8.0, 10.0, 12.0, 15.0, 20.0, 25.0, 30.0, 35.0, 40.0, 45.0, 50.0, 60.0, 70.0, 80.0, 90.0, 100.0, 125.0, 150.0, 200.0, 250.0, 300.0, 350.0, 400.0, 500.0, 600.0, 700.0, 800.0, 900.0, 1000.0, 1250.0, 1500.0, 2000.0, 2500.0, 3000.0, 4000.0, 5000]
@@ -360,7 +365,7 @@ def get_query_dict(var, lon_min, lon_max, lat_min, lat_max, time_str_ymdh, time_
     return query_dict


-def check_time_in_dataset_and_version(time_input, time_end=None):
+def _check_time_in_dataset_and_version(time_input, time_end=None):
     # Determine whether this is a single time point or a time range
     is_single_time = time_end is None

@@ -417,8 +422,8 @@ def check_time_in_dataset_and_version(time_input, time_end=None):
     if is_single_time:
         return True
     else:
-        base_url_s = get_base_url(d_list[0], v_list[0], "u", str(time_start))
-        base_url_e = get_base_url(d_list[0], v_list[0], "u", str(time_end))
+        base_url_s = _get_base_url(d_list[0], v_list[0], "u", str(time_start))
+        base_url_e = _get_base_url(d_list[0], v_list[0], "u", str(time_end))
         if base_url_s == base_url_e:
             return True
         else:
@@ -429,7 +434,7 @@ def check_time_in_dataset_and_version(time_input, time_end=None):
             return False


-def ensure_time_in_specific_dataset_and_version(dataset_name, version_name, time_input, time_end=None):
+def _ensure_time_in_specific_dataset_and_version(dataset_name, version_name, time_input, time_end=None):
     # Pad the time format according to its length
     if len(str(time_input)) == 8:
         time_input = str(time_input) + "00"
@@ -468,7 +473,7 @@ def ensure_time_in_specific_dataset_and_version(dataset_name, version_name, time
     return False


-def direct_choose_dataset_and_version(time_input, time_end=None):
+def _direct_choose_dataset_and_version(time_input, time_end=None):
     # Assume data_info is a dict holding dataset and version information
     # Example structure: data_info['hourly']['dataset'][dataset_name]['version'][version_name]['time_range']

@@ -507,7 +512,7 @@ def direct_choose_dataset_and_version(time_input, time_end=None):
     return dataset_name_out, version_name_out


-def get_base_url(dataset_name, version_name, var, ymdh_str):
+def _get_base_url(dataset_name, version_name, var, ymdh_str):
     year_str = int(ymdh_str[:4])
     url_dict = data_info["hourly"]["dataset"][dataset_name]["version"][version_name]["url"]
     classification_method = data_info["hourly"]["dataset"][dataset_name]["version"][version_name]["classification"]
@@ -548,160 +553,272 @@ def get_base_url(dataset_name, version_name, var, ymdh_str):
     return base_url


-def get_submit_url(dataset_name, version_name, var, ymdh_str, query_dict):
-    base_url = get_base_url(dataset_name, version_name, var, ymdh_str)
+def _get_submit_url(dataset_name, version_name, var, ymdh_str, query_dict):
+    base_url = _get_base_url(dataset_name, version_name, var, ymdh_str)
     if isinstance(query_dict["var"], str):
         query_dict["var"] = [query_dict["var"]]
     target_url = base_url + "&".join(f"var={var}" for var in query_dict["var"]) + "&" + "&".join(f"{key}={value}" for key, value in query_dict.items() if key != "var")
     return target_url


-def clear_existing_file(file_full_path):
+def _clear_existing_file(file_full_path):
     if os.path.exists(file_full_path):
         os.remove(file_full_path)
         print(f"{file_full_path} has been removed")


-def check_existing_file(file_full_path, avg_size):
+def _check_existing_file(file_full_path, avg_size):
     if os.path.exists(file_full_path):
         print(f"[bold #FFA54F]{file_full_path} exists")
         fsize = file_size(file_full_path)
-
-
-
-        #
+        delta_size_ratio = (fsize - avg_size) / avg_size
+        if abs(delta_size_ratio) > 0.025:
+            if check_nc(file_full_path):
+                # print(f"File size is abnormal but can be opened normally, file size: {fsize:.2f} KB")
+                if not _check_ftime(file_full_path,if_print=True):
+                    return False
+                else:
+                    return True
+            else:
+                print(f"File size is abnormal and cannot be opened, {file_full_path}: {fsize:.2f} KB")
+                return False
+        else:
+            if not _check_ftime(file_full_path,if_print=True):
                 return False
             else:
                 return True
-        if fsize < 5:
-            print(f"[bold #FFA54F]{file_full_path} ({fsize:.2f} KB) may be incomplete")
-            # clear_existing_file(file_full_path)
-            return False
-        else:
-            return True
     else:
-        # print(f'{file_full_path} does not exist')
         return False


-def download_file(target_url, store_path, file_name, check=False):
-    # Check if the file exists
-    fname = Path(store_path) / file_name
-    file_name_split = file_name.split("_")
-    file_name_split = file_name_split[:-1]
-    # same_file = f"{file_name_split[0]}_{file_name_split[1]}*nc"
-    same_file = "_".join(file_name_split) + "*nc"
-
+def _get_mean_size30(store_path, same_file):
     if same_file not in fsize_dict.keys():
-
-
+        # print(f'Same file name: {same_file}')
+        fsize_dict[same_file] = {"size": 0, "count": 0}

     if fsize_dict[same_file]["count"] < 30 or fsize_dict[same_file]["size"] == 0:
         # Take the minimum size over the first 30 files; after that it is treated as representative of all files and no longer updated, to save time
         fsize_mean = mean_size(store_path, same_file, max_num=30)
-        set_min_size = fsize_mean * 0.
+        set_min_size = fsize_mean * 0.95
         fsize_dict[same_file]["size"] = set_min_size
         fsize_dict[same_file]["count"] += 1
     else:
         set_min_size = fsize_dict[same_file]["size"]
-
-    if check_existing_file(fname, set_min_size):
-        count_dict["skip"] += 1
-        return
-    clear_existing_file(fname)
+    return set_min_size

-    # -----------------------------------------------
-    print(f"[bold #f0f6d0]Requesting {file_name}...")
-    # Create a session
-    s = requests.Session()
-    download_success = False
-    request_times = 0

-
-
-
+def _get_mean_size_move(same_file, current_file):
+    # Acquire the lock
+    with fsize_dict_lock:  # global lock: only one thread may access at a time
+        # Initialize the dict entry if the file is not in it yet
+        if same_file not in fsize_dict.keys():
+            fsize_dict[same_file] = {"size_list": [], "mean_size": 1.0}

-    #
-
-
+        tolerance_ratio = 0.025  # tolerated threshold ratio
+        current_file_size = file_size(current_file)
+
+        # If the list is not empty, compute the mean; otherwise keep it at 1
+        if fsize_dict[same_file]["size_list"]:
+            fsize_dict[same_file]["mean_size"] = sum(fsize_dict[same_file]["size_list"]) / len(fsize_dict[same_file]["size_list"])
+            fsize_dict[same_file]["mean_size"] = max(fsize_dict[same_file]["mean_size"], 1.0)
+        else:
+            fsize_dict[same_file]["mean_size"] = 1.0
+
+        size_difference_ratio = (current_file_size - fsize_dict[same_file]["mean_size"]) / fsize_dict[same_file]["mean_size"]
+
+        if abs(size_difference_ratio) > tolerance_ratio:
+            if check_nc(current_file):
+                # print(f"File size is abnormal but can be opened normally, file size: {current_file_size:.2f} KB")
+                # The file opens normally but its size is abnormal; keep the current file size
+                fsize_dict[same_file]["size_list"] = [current_file_size]
+                fsize_dict[same_file]["mean_size"] = current_file_size
+            else:
+                _clear_existing_file(current_file)
+                print(f"File size is abnormal, may need to be downloaded again, file size: {current_file_size:.2f} KB")
+        else:
+            # Append the current file size to the list and update the count
+            fsize_dict[same_file]["size_list"].append(current_file_size)

-
-
+        # Return the adjusted mean; as used here, this is the mean from before the new value was appended
+        return fsize_dict[same_file]["mean_size"]

-    # Count the number of times in each string
-    num_times_str = len(times_in_str)

-
-
-
-
+def _check_ftime(nc_file, tname="time", if_print=False):
+    if not os.path.exists(nc_file):
+        return False
+    nc_file = str(nc_file)
+    try:
+        ds = xr.open_dataset(nc_file)
+        real_time = ds[tname].values[0]
+        ds.close()
+        real_time = str(real_time)[:13]
+        real_time = real_time.replace("-", "").replace("T", "")
+        # -----------------------------------------------------
+        f_time = re.findall(r"\d{10}", nc_file)[0]
+        if real_time == f_time:
+            return True
         else:
-
-
-
-
-
-
-
-
-
-
-    #
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+            if if_print:
+                print(f"[bold #daff5c]File time error, file/real time: [bold blue]{f_time}/{real_time}")
+            return False
+    except Exception as e:
+        if if_print:
+            print(f"[bold #daff5c]File time check failed, {nc_file}: {e}")
+        return False
+
+
+def _correct_time(nc_file):
+    # Open the NC file
+    dataset = nc.Dataset(nc_file)
+
+    # Read the time units
+    time_units = dataset.variables["time"].units
+
+    # Close the file
+    dataset.close()
+
+    # Parse the time-units string to get the time origin
+    origin_str = time_units.split("since")[1].strip()
+    origin_datetime = datetime.datetime.strptime(origin_str, "%Y-%m-%d %H:%M:%S")
+
+    # Extract the date string from the file name
+    given_date_str = re.findall(r"\d{10}", str(nc_file))[0]
+
+    # Convert the extracted date string to a datetime object
+    given_datetime = datetime.datetime.strptime(given_date_str, "%Y%m%d%H")
+
+    # Compute the difference between the given date and the time origin (in hours)
+    time_difference = (given_datetime - origin_datetime).total_seconds()
+    if "hours" in time_units:
+        time_difference /= 3600
+    elif "days" in time_units:
+        time_difference /= 3600 * 24
+
+    # Modify the time variable in the NC file
+    modify_nc(nc_file, "time", None, time_difference)
+
+
+
+def _download_file(target_url, store_path, file_name, check=False):
+    # Check if the file exists
+    fname = Path(store_path) / file_name
+    file_name_split = file_name.split("_")
+    file_name_split = file_name_split[:-1]
+    # same_file = f"{file_name_split[0]}_{file_name_split[1]}*nc"
+    same_file = "_".join(file_name_split) + "*nc"
+
+    if check:
+        if same_file not in fsize_dict.keys():  # check the first file on its own, since there is no size to compare against yet
+            check_nc(fname,if_delete=True)
+
+        # set_min_size = _get_mean_size30(store_path, same_file)  # original scheme: averaged over 30 files only; cannot adapt if sizes change
+        get_mean_size = _get_mean_size_move(same_file, fname)
+
+        if _check_existing_file(fname, get_mean_size):
+            count_dict["skip"] += 1
+            return
+        _clear_existing_file(fname)
+
+    if not use_idm:
+        # -----------------------------------------------
+        print(f"[bold #f0f6d0]Requesting {file_name} ...")
+        # Create a session
+        s = requests.Session()
+        download_success = False
+        request_times = 0
+
+        def calculate_wait_time(time_str, target_url):
+            # Regex matching times in YYYYMMDDHH format
+            time_pattern = r"\d{10}"
+
+            # Two example strings
+            # str1 = 'HYCOM_water_u_2018010100-2018010112.nc'
+            # str2 = 'HYCOM_water_u_2018010100.nc'
+
+            # Find the times using the regex
+            times_in_str = re.findall(time_pattern, time_str)
+
+            # Count the number of times in the string
+            num_times_str = len(times_in_str)
+
+            if num_times_str > 1:
+                delta_t = datetime.datetime.strptime(times_in_str[1], "%Y%m%d%H") - datetime.datetime.strptime(times_in_str[0], "%Y%m%d%H")
+                delta_t = delta_t.total_seconds() / 3600
+                delta_t = delta_t / 3 + 1
+            else:
+                delta_t = 1
+            # Wait at most 5 minutes per element: not too short, or the request may fail; not too long, or time is wasted
+            num_var = int(target_url.count("var="))
+            if num_var <= 0:
+                num_var = 1
+            return int(delta_t * 5 * 60 * num_var)
+
+        max_timeout = calculate_wait_time(file_name, target_url)
+        print(f"[bold #912dbc]Max timeout: {max_timeout} seconds")
+
+        # print(f'Download_start_time: {datetime.datetime.now()}')
+        download_time_s = datetime.datetime.now()
+        order_list = ["1st", "2nd", "3rd", "4th", "5th", "6th", "7th", "8th", "9th", "10th"]
+        while not download_success:
+            if request_times >= 10:
+                # print(f'Download failed, already retried {request_times} times\nYou may skip it for now and retry later')
+                print(f"[bold #ffe5c0]Download failed after {request_times} times\nYou can skip it and try again later")
+                count_dict["fail"] += 1
+                break
+            if request_times > 0:
+                # print(f'\rRetrying, attempt {request_times}', end="")
+                print(f"[bold #ffe5c0]Retrying the {order_list[request_times-1]} time...")
+            # Try to download the file
+            try:
+                headers = {"User-Agent": get_ua()}
+                """ response = s.get(target_url, headers=headers, timeout=random.randint(5, max_timeout))
+                response.raise_for_status()  # raises HTTPError if the response status is not 200
+
+                # Save the file
+                with open(filename, 'wb') as f:
+                    f.write(response.content) """
+
+                response = s.get(target_url, headers=headers, stream=True, timeout=random.randint(5, max_timeout))  # enable streaming
+                response.raise_for_status()  # raises HTTPError if the response status is not 200
+                # Save the file
+                with open(fname, "wb") as f:
+                    print(f"[bold #96cbd7]Downloading {file_name} ...")
+                    for chunk in response.iter_content(chunk_size=1024):
+                        if chunk:
+                            f.write(chunk)
+
+                f.close()
+
+                if not _check_ftime(fname):
+                    _correct_time(fname)
+
+                # print(f'\rFile {fname} downloaded successfully', end="")
+                if os.path.exists(fname):
+                    download_success = True
+                    download_time_e = datetime.datetime.now()
+                    download_delta = download_time_e - download_time_s
+                    print(f"[#3dfc40]File [bold #dfff73]{fname} [#3dfc40]has been downloaded successfully, Time: [#39cbdd]{download_delta}")
+                    count_dict["success"] += 1
+                    # print(f'Download_end_time: {datetime.datetime.now()}')
+
+            except requests.exceptions.HTTPError as errh:
+                print(f"Http Error: {errh}")
+            except requests.exceptions.ConnectionError as errc:
+                print(f"Error Connecting: {errc}")
+            except requests.exceptions.Timeout as errt:
+                print(f"Timeout Error: {errt}")
+            except requests.exceptions.RequestException as err:
+                print(f"OOps: Something Else: {err}")
+
+            time.sleep(3)
+            request_times += 1
+    else:
+        idm_downloader(target_url, store_path, file_name, given_idm_engine)
+        idm_download_list.append(fname)
+        print(f"[bold #3dfc40]File [bold #dfff73]{fname} [#3dfc40]has been submit to IDM for downloading")


+def _check_hour_is_valid(ymdh_str):
     # hour should be 00, 03, 06, 09, 12, 15, 18, 21
     hh = int(str(ymdh_str[-2:]))
     if hh in [0, 3, 6, 9, 12, 15, 18, 21]:
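Note on the retry budget in the hunk above: `calculate_wait_time` scales the per-request timeout with the time span encoded in the file name (span in hours / 3 + 1, times 5 minutes) and with the number of `var=` parameters in the query URL. A self-contained sketch with made-up inputs (the file name follows the package's naming pattern; the URL is not a real endpoint):

    import datetime
    import re

    def calculate_wait_time(time_str, target_url):
        # Same arithmetic as the helper added in this hunk.
        times_in_str = re.findall(r"\d{10}", time_str)
        if len(times_in_str) > 1:
            delta = datetime.datetime.strptime(times_in_str[1], "%Y%m%d%H") - datetime.datetime.strptime(times_in_str[0], "%Y%m%d%H")
            delta_t = delta.total_seconds() / 3600 / 3 + 1
        else:
            delta_t = 1
        num_var = max(int(target_url.count("var=")), 1)
        return int(delta_t * 5 * 60 * num_var)

    # Span of 21 hours -> delta_t = 8; two variables -> 8 * 300 * 2 = 4800 seconds.
    print(calculate_wait_time(
        "HYCOM_water_u_2018010100-2018010121.nc",
        "https://example.invalid/ncss?var=water_u&var=water_v&north=45",
    ))  # 4800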
@@ -710,9 +827,9 @@ def check_hour_is_valid(ymdh_str):
         return False


-def check_dataset_version(dataset_name, version_name, download_time, download_time_end=None):
+def _check_dataset_version(dataset_name, version_name, download_time, download_time_end=None):
     if dataset_name is not None and version_name is not None:
-        just_ensure = ensure_time_in_specific_dataset_and_version(dataset_name, version_name, download_time, download_time_end)
+        just_ensure = _ensure_time_in_specific_dataset_and_version(dataset_name, version_name, download_time, download_time_end)
         if just_ensure:
             return dataset_name, version_name
         else:
@@ -725,7 +842,7 @@ def check_dataset_version(dataset_name, version_name, download_time, download_ti
         download_time_str = download_time_str + "00"

     # Check whether the hour is valid (if needed)
-    if download_time_end is None and not check_hour_is_valid(download_time_str):
+    if download_time_end is None and not _check_hour_is_valid(download_time_str):
         print("Please ensure the hour is 00, 03, 06, 09, 12, 15, 18, 21")
         raise ValueError("The hour is invalid")

@@ -733,18 +850,18 @@ def check_dataset_version(dataset_name, version_name, download_time, download_ti
     if download_time_end is not None:
         if len(str(download_time_end)) == 8:
             download_time_end = str(download_time_end) + "21"
-        have_data = check_time_in_dataset_and_version(download_time_str, download_time_end)
+        have_data = _check_time_in_dataset_and_version(download_time_str, download_time_end)
         if have_data:
-            return direct_choose_dataset_and_version(download_time_str, download_time_end)
+            return _direct_choose_dataset_and_version(download_time_str, download_time_end)
     else:
-        have_data = check_time_in_dataset_and_version(download_time_str)
+        have_data = _check_time_in_dataset_and_version(download_time_str)
         if have_data:
-            return direct_choose_dataset_and_version(download_time_str)
+            return _direct_choose_dataset_and_version(download_time_str)

     return None, None


-def get_submit_url_var(var, depth, level_num, lon_min, lon_max, lat_min, lat_max, dataset_name, version_name, download_time, download_time_end=None):
+def _get_submit_url_var(var, depth, level_num, lon_min, lon_max, lat_min, lat_max, dataset_name, version_name, download_time, download_time_end=None):
     # year_str = str(download_time)[:4]
     ymdh_str = str(download_time)
     if depth is not None and level_num is not None:
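Note on the time padding used by `_check_dataset_version` above: an 8-digit start time is padded with "00" and an 8-digit end time with "21", so a bare date pair expands to the first and last 3-hourly cycles of those days. A small illustration (hypothetical helper, not part of the package):

    def pad_times(download_time, download_time_end=None):
        # Mirrors the padding convention shown in the hunk above.
        t0 = str(download_time)
        if len(t0) == 8:
            t0 += "00"  # date only -> first cycle of the day
        t1 = None
        if download_time_end is not None:
            t1 = str(download_time_end)
            if len(t1) == 8:
                t1 += "21"  # date only -> last 3-hourly cycle of the day
        return t0, t1

    print(pad_times(20240101, 20240102))  # ('2024010100', '2024010221')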
@@ -760,19 +877,19 @@ def get_submit_url_var(var, depth, level_num, lon_min, lon_max, lat_min, lat_max
     else:
         # print("Full depth or full level data will be downloaded...")
         which_mode = "full"
-    query_dict = get_query_dict(var, lon_min, lon_max, lat_min, lat_max, download_time, download_time_end, which_mode, depth, level_num)
-    submit_url = get_submit_url(dataset_name, version_name, var, ymdh_str, query_dict)
+    query_dict = _get_query_dict(var, lon_min, lon_max, lat_min, lat_max, download_time, download_time_end, which_mode, depth, level_num)
+    submit_url = _get_submit_url(dataset_name, version_name, var, ymdh_str, query_dict)
     return submit_url


-def prepare_url_to_download(var, lon_min=0, lon_max=359.92, lat_min=-80, lat_max=90, download_time="2024083100", download_time_end=None, depth=None, level_num=None, store_path=None, dataset_name=None, version_name=None, check=False):
+def _prepare_url_to_download(var, lon_min=0, lon_max=359.92, lat_min=-80, lat_max=90, download_time="2024083100", download_time_end=None, depth=None, level_num=None, store_path=None, dataset_name=None, version_name=None, check=False):
     print("[bold #ecdbfe]-" * 160)
     download_time = str(download_time)
     if download_time_end is not None:
         download_time_end = str(download_time_end)
-        dataset_name, version_name = check_dataset_version(dataset_name, version_name, download_time, download_time_end)
+        dataset_name, version_name = _check_dataset_version(dataset_name, version_name, download_time, download_time_end)
     else:
-        dataset_name, version_name = check_dataset_version(dataset_name, version_name, download_time)
+        dataset_name, version_name = _check_dataset_version(dataset_name, version_name, download_time)
     if dataset_name is None and version_name is None:
         count_dict["no_data"] += 1
         if download_time_end is not None:
@@ -787,11 +904,11 @@ def prepare_url_to_download(var, lon_min=0, lon_max=359.92, lat_min=-80, lat_max
     if isinstance(var, list):
         if len(var) == 1:
             var = var[0]
-            submit_url = get_submit_url_var(var, depth, level_num, lon_min, lon_max, lat_min, lat_max, dataset_name, version_name, download_time, download_time_end)
+            submit_url = _get_submit_url_var(var, depth, level_num, lon_min, lon_max, lat_min, lat_max, dataset_name, version_name, download_time, download_time_end)
             file_name = f"HYCOM_{variable_info[var]['var_name']}_{download_time}.nc"
             if download_time_end is not None:
                 file_name = f"HYCOM_{variable_info[var]['var_name']}_{download_time}-{download_time_end}.nc"  # do not use an underscore between the two times here, otherwise later lookups for files of the same variable will break
-            download_file(submit_url, store_path, file_name, check)
+            _download_file(submit_url, store_path, file_name, check)
         else:
             if download_time < "2024081012":
                 varlist = [_ for _ in var]
@@ -804,7 +921,7 @@ def prepare_url_to_download(var, lon_min=0, lon_max=359.92, lat_min=-80, lat_max
                     continue

                 var = current_group[0]
-                submit_url = get_submit_url_var(var, depth, level_num, lon_min, lon_max, lat_min, lat_max, dataset_name, version_name, download_time, download_time_end)
+                submit_url = _get_submit_url_var(var, depth, level_num, lon_min, lon_max, lat_min, lat_max, dataset_name, version_name, download_time, download_time_end)
                 file_name = f"HYCOM_{variable_info[var]['var_name']}_{download_time}.nc"
                 old_str = f'var={variable_info[var]["var_name"]}'
                 new_str = f'var={variable_info[var]["var_name"]}'
@@ -816,17 +933,17 @@ def prepare_url_to_download(var, lon_min=0, lon_max=359.92, lat_min=-80, lat_max
                 file_name = f"HYCOM_{key}_{download_time}.nc"
                 if download_time_end is not None:
                     file_name = f"HYCOM_{key}_{download_time}-{download_time_end}.nc"  # do not use an underscore between the two times here, otherwise later lookups for files of the same variable will break
-                download_file(submit_url, store_path, file_name, check)
+                _download_file(submit_url, store_path, file_name, check)
         else:
             for v in var:
-                submit_url = get_submit_url_var(v, depth, level_num, lon_min, lon_max, lat_min, lat_max, dataset_name, version_name, download_time, download_time_end)
+                submit_url = _get_submit_url_var(v, depth, level_num, lon_min, lon_max, lat_min, lat_max, dataset_name, version_name, download_time, download_time_end)
                 file_name = f"HYCOM_{variable_info[v]['var_name']}_{download_time}.nc"
                 if download_time_end is not None:
                     file_name = f"HYCOM_{variable_info[v]['var_name']}_{download_time}-{download_time_end}.nc"
-                download_file(submit_url, store_path, file_name, check)
+                _download_file(submit_url, store_path, file_name, check)


-def convert_full_name_to_short_name(full_name):
+def _convert_full_name_to_short_name(full_name):
     for var, info in variable_info.items():
         if full_name == info["var_name"] or full_name == info["standard_name"] or full_name == var:
             return var
@@ -836,7 +953,7 @@ def convert_full_name_to_short_name(full_name):
     return False


-def download_task(var, time_str, time_str_end, lon_min, lon_max, lat_min, lat_max, depth, level, store_path, dataset_name, version_name, check):
+def _download_task(var, time_str, time_str_end, lon_min, lon_max, lat_min, lat_max, depth, level, store_path, dataset_name, version_name, check):
     """
     # Parallel download task
     # This function exists for parallel downloading and is required; calling direct_download directly in parallel causes problems
@@ -847,10 +964,10 @@ def download_task(var, time_str, time_str_end, lon_min, lon_max, lat_min, lat_ma
     Therefore, even when multiple tasks run at the same time, the data will not get mixed up between them.
     """

-    prepare_url_to_download(var, lon_min, lon_max, lat_min, lat_max, time_str, time_str_end, depth, level, store_path, dataset_name, version_name, check)
+    _prepare_url_to_download(var, lon_min, lon_max, lat_min, lat_max, time_str, time_str_end, depth, level, store_path, dataset_name, version_name, check)


-def done_callback(future, progress, task, total, counter_lock):
+def _done_callback(future, progress, task, total, counter_lock):
     """
     # Callback function for parallel download tasks
     # This function exists for parallel downloading and is required; calling direct_download directly in parallel causes problems
@@ -866,7 +983,7 @@ def done_callback(future, progress, task, total, counter_lock):
         progress.update(task, advance=1, description=f"[cyan]Downloading... {parallel_counter}/{total}")


-def download_hourly_func(var, time_s, time_e, lon_min=0, lon_max=359.92, lat_min=-80, lat_max=90, depth=None, level=None, store_path=None, dataset_name=None, version_name=None, num_workers=None, check=False, ftimes=1):
+def _download_hourly_func(var, time_s, time_e, lon_min=0, lon_max=359.92, lat_min=-80, lat_max=90, depth=None, level=None, store_path=None, dataset_name=None, version_name=None, num_workers=None, check=False, ftimes=1):
     """
     Description:
         Download the data of single time or a series of time
@@ -895,7 +1012,7 @@ def download_hourly_func(var, time_s, time_e, lon_min=0, lon_max=359.92, lat_min
     parallel_counter = 0
     counter_lock = Lock()  # create a lock for a thread-safe counter
     if ymdh_time_s == ymdh_time_e:
-        prepare_url_to_download(var, lon_min, lon_max, lat_min, lat_max, ymdh_time_s, None, depth, level, store_path, dataset_name, version_name, check)
+        _prepare_url_to_download(var, lon_min, lon_max, lat_min, lat_max, ymdh_time_s, None, depth, level, store_path, dataset_name, version_name, check)
     elif int(ymdh_time_s) < int(ymdh_time_e):
         print("Downloading a series of files...")
         time_list = get_time_list(ymdh_time_s, ymdh_time_e, 3, "hour")
@@ -905,16 +1022,16 @@ def download_hourly_func(var, time_s, time_e, lon_min=0, lon_max=359.92, lat_min
         if num_workers is None or num_workers <= 1:
             # Serial mode
             for i, time_str in enumerate(time_list):
-                prepare_url_to_download(var, lon_min, lon_max, lat_min, lat_max, time_str, None, depth, level, store_path, dataset_name, version_name, check)
+                _prepare_url_to_download(var, lon_min, lon_max, lat_min, lat_max, time_str, None, depth, level, store_path, dataset_name, version_name, check)
                 progress.update(task, advance=1, description=f"[cyan]Downloading... {i+1}/{len(time_list)}")
         else:
             # Parallel mode
             with ThreadPoolExecutor(max_workers=num_workers) as executor:
-                futures = [executor.submit(download_task, var, time_str, None, lon_min, lon_max, lat_min, lat_max, depth, level, store_path, dataset_name, version_name, check) for time_str in time_list]
+                futures = [executor.submit(_download_task, var, time_str, None, lon_min, lon_max, lat_min, lat_max, depth, level, store_path, dataset_name, version_name, check) for time_str in time_list]
                 """ for i, future in enumerate(futures):
                     future.add_done_callback(lambda _: progress.update(task, advance=1, description=f"[cyan]Downloading... {i+1}/{len(time_list)}")) """
                 for feature in as_completed(futures):
-                    done_callback(feature, progress, task, len(time_list), counter_lock)
+                    _done_callback(feature, progress, task, len(time_list), counter_lock)
     else:
         new_time_list = get_time_list(ymdh_time_s, ymdh_time_e, 3 * ftimes, "hour")
         total_num = len(new_time_list)
@@ -923,21 +1040,21 @@ def download_hourly_func(var, time_s, time_e, lon_min=0, lon_max=359.92, lat_min
             for i, time_str in enumerate(new_time_list):
                 time_str_end_index = int(min(len(time_list) - 1, int(i * ftimes + ftimes - 1)))
                 time_str_end = time_list[time_str_end_index]
-                prepare_url_to_download(var, lon_min, lon_max, lat_min, lat_max, time_str, time_str_end, depth, level, store_path, dataset_name, version_name, check)
+                _prepare_url_to_download(var, lon_min, lon_max, lat_min, lat_max, time_str, time_str_end, depth, level, store_path, dataset_name, version_name, check)
                 progress.update(task, advance=1, description=f"[cyan]Downloading... {i+1}/{total_num}")
         else:
             # Parallel mode
             with ThreadPoolExecutor(max_workers=num_workers) as executor:
-                futures = [executor.submit(download_task, var, new_time_list[i], time_list[int(min(len(time_list) - 1, int(i * ftimes + ftimes - 1)))], lon_min, lon_max, lat_min, lat_max, depth, level, store_path, dataset_name, version_name, check) for i in range(total_num)]
+                futures = [executor.submit(_download_task, var, new_time_list[i], time_list[int(min(len(time_list) - 1, int(i * ftimes + ftimes - 1)))], lon_min, lon_max, lat_min, lat_max, depth, level, store_path, dataset_name, version_name, check) for i in range(total_num)]
                 """ for i, future in enumerate(futures):
                     future.add_done_callback(lambda _: progress.update(task, advance=1, description=f"[cyan]Downloading... {i+1}/{total_num}")) """
                 for feature in as_completed(futures):
-                    done_callback(feature, progress, task, len(time_list), counter_lock)
+                    _done_callback(feature, progress, task, len(time_list), counter_lock)
     else:
         print("Please ensure the time_s is no more than time_e")


-def download(var, time_s, time_e=None, lon_min=0, lon_max=359.92, lat_min=-80, lat_max=90, depth=None, level=None, store_path=None, dataset_name=None, version_name=None, num_workers=None, check=False, ftimes=1):
+def download(var, time_s, time_e=None, lon_min=0, lon_max=359.92, lat_min=-80, lat_max=90, depth=None, level=None, store_path=None, dataset_name=None, version_name=None, num_workers=None, check=False, ftimes=1, idm_engine=None):
     """
     Description:
         Download the data of single time or a series of time
@@ -958,11 +1075,12 @@ def download(var, time_s, time_e=None, lon_min=0, lon_max=359.92, lat_min=-80, l
         num_workers: int, the number of workers, default is None, if not set, the number of workers will be 1; suggest not to set the number of workers too large
         check: bool, whether to check the existing file, default is False, if set to True, the existing file will be checked and not downloaded again; else, the existing file will be covered
         ftimes: int, the number of time in one file, default is 1, if set to 1, the data of single time will be downloaded; the maximum is 8, if set to 8, the data of 8 times will be downloaded in one file
+        idm_engine: str, the IDM engine, default is None, if set, the IDM will be used to download the data; example: "D:\\Programs\\Internet Download Manager\\IDMan.exe"

     Returns:
         None
     """
-    get_initial_data()
+    _get_initial_data()

     # Print information and resolve dataset and version names
     if dataset_name is None and version_name is None:
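For orientation, a minimal call against the new `download` signature might look as follows. The dates, region, and paths are illustrative only (the box mirrors the `__main__` example at the bottom of this file); leaving `idm_engine=None` keeps the built-in requests-based downloader, while pointing it at an IDMan.exe path hands the URLs to IDM as documented above:

    from pathlib import Path

    from oafuncs.oa_down.hycom_3hourly import download

    download(
        var="water_u",                      # short name, var_name, or standard_name forms are accepted
        time_s="2024010100",
        time_e="2024010121",
        lon_min=105, lon_max=130, lat_min=15, lat_max=45,
        store_path=Path(r"G:\Data\HYCOM\3hourly"),
        num_workers=1,
        check=True,
        ftimes=1,
        idm_engine=None,                    # or r"D:\Programs\Internet Download Manager\IDMan.exe"
    )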
@@ -980,11 +1098,11 @@ def download(var, time_s, time_e=None, lon_min=0, lon_max=359.92, lat_min=-80, l

     if isinstance(var, list):
         if len(var) == 1:
-            var = convert_full_name_to_short_name(var[0])
+            var = _convert_full_name_to_short_name(var[0])
         else:
-            var = [convert_full_name_to_short_name(v) for v in var]
+            var = [_convert_full_name_to_short_name(v) for v in var]
     elif isinstance(var, str):
-        var = convert_full_name_to_short_name(var)
+        var = _convert_full_name_to_short_name(var)
     else:
         raise ValueError("The var is invalid")
     if var is False:
@@ -1005,8 +1123,8 @@ def download(var, time_s, time_e=None, lon_min=0, lon_max=359.92, lat_min=-80, l
     os.makedirs(str(store_path), exist_ok=True)

     if num_workers is not None:
-        num_workers = max(min(num_workers, 10), 1)
-
+        num_workers = max(min(num_workers, 10), 1)  # no hard cap for now; a few more threads can be opened for re-checking
+        # num_workers = int(max(num_workers, 1))
     time_s = str(time_s)
     if len(time_s) == 8:
         time_s += "00"
@@ -1025,8 +1143,37 @@ def download(var, time_s, time_e=None, lon_min=0, lon_max=359.92, lat_min=-80, l

     global fsize_dict
     fsize_dict = {}
-
-
+
+    global fsize_dict_lock
+    fsize_dict_lock = Lock()
+
+    global use_idm, given_idm_engine, idm_download_list
+    if idm_engine is not None:
+        use_idm = True
+        given_idm_engine = idm_engine
+        idm_download_list = []
+    else:
+        use_idm = False
+
+
+    _download_hourly_func(var, time_s, time_e, lon_min, lon_max, lat_min, lat_max, depth, level, store_path, dataset_name, version_name, num_workers, check, ftimes)
+
+    if idm_download_list:
+        for f in idm_download_list:
+            wait_success = 0
+            success = False
+            while not success:
+                if check_nc(f):
+                    _correct_time(f)
+                    success = True
+                    count_dict["success"] += 1
+                else:
+                    wait_success += 1
+                    time.sleep(3)
+                    if wait_success >= 20:
+                        success = True
+                        # print(f'{f} download failed')
+                        count_dict["fail"] += 1

     count_dict["total"] = count_dict["success"] + count_dict["fail"] + count_dict["skip"] + count_dict["no_data"]

@@ -1094,9 +1241,9 @@ def how_to_use():


 if __name__ == "__main__":
-    time_s, time_e = "
+    time_s, time_e = "2018010800", "2024083121"
     merge_name = f"{time_s}_{time_e}"  # merged file name
-    root_path = r"G:\Data\HYCOM\
+    root_path = r"G:\Data\HYCOM\3hourly"
     location_dict = {"west": 105, "east": 130, "south": 15, "north": 45}
     download_dict = {
         "water_u": {"simple_name": "u", "download": 1},
@@ -1116,10 +1263,11 @@ if __name__ == "__main__":
     # if you wanna download all depth or level, set both False
     depth = None  # or 0-5000 meters
     level = None  # or 1-40 levels
-    num_workers =
+    num_workers = 1

     check = True
     ftimes = 1
+    idm_engine = r"D:\Programs\Internet Download Manager\IDMan.exe"

     download_switch, single_var = True, False
     combine_switch = False
@@ -1130,9 +1278,9 @@ if __name__ == "__main__":
     if download_switch:
         if single_var:
             for var_name in var_list:
-                download(var=var_name, time_s=time_s, time_e=time_e, store_path=Path(root_path), lon_min=location_dict["west"], lon_max=location_dict["east"], lat_min=location_dict["south"], lat_max=location_dict["north"], num_workers=num_workers, check=check, depth=depth, level=level, ftimes=ftimes)
+                download(var=var_name, time_s=time_s, time_e=time_e, store_path=Path(root_path), lon_min=location_dict["west"], lon_max=location_dict["east"], lat_min=location_dict["south"], lat_max=location_dict["north"], num_workers=num_workers, check=check, depth=depth, level=level, ftimes=ftimes, idm_engine=idm_engine)
         else:
-            download(var=var_list, time_s=time_s, time_e=time_e, store_path=Path(root_path), lon_min=location_dict["west"], lon_max=location_dict["east"], lat_min=location_dict["south"], lat_max=location_dict["north"], num_workers=num_workers, check=check, depth=depth, level=level, ftimes=ftimes)
+            download(var=var_list, time_s=time_s, time_e=time_e, store_path=Path(root_path), lon_min=location_dict["west"], lon_max=location_dict["east"], lat_min=location_dict["south"], lat_max=location_dict["north"], num_workers=num_workers, check=check, depth=depth, level=level, ftimes=ftimes, idm_engine=idm_engine)

     """ if combine_switch or copy_switch:
         time_list = get_time_list(time_s, time_e, 3, 'hour')