oafuncs 0.0.90__py2.py3-none-any.whl → 0.0.92__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -26,19 +26,24 @@ from threading import Lock
  import matplotlib.pyplot as plt
  import numpy as np
  import pandas as pd
+ import xarray as xr
  import requests
  from rich import print
  from rich.progress import Progress
+ import netCDF4 as nc

  from oafuncs.oa_down.user_agent import get_ua
  from oafuncs.oa_file import file_size, mean_size
+ from oafuncs.oa_nc import check as check_nc
+ from oafuncs.oa_nc import modify as modify_nc
+ from oafuncs.oa_down.idm import downloader as idm_downloader

  warnings.filterwarnings("ignore", category=RuntimeWarning, message="Engine '.*' loading failed:.*")

  __all__ = ["draw_time_range", "download", "how_to_use", "get_time_list"]


- def get_initial_data():
+ def _get_initial_data():
  global variable_info, data_info, var_group, single_var_group
  # ----------------------------------------------
  # variable
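
The new imports wire in NetCDF handling (xarray and netCDF4) plus two helpers from the package itself: check_nc/modify_nc for validating and patching downloaded files, and idm_downloader for handing transfers to Internet Download Manager. As a rough sketch of the kind of probe a validity check like check_nc can perform (the real oafuncs.oa_nc.check may differ in detail; delete_if_bad here loosely mirrors its if_delete flag):

    # Hypothetical stand-in for a NetCDF validity probe, not the package API.
    import os
    import netCDF4 as nc

    def nc_is_readable(path, delete_if_bad=False):
        try:
            with nc.Dataset(path) as ds:
                return len(ds.variables) > 0  # opened cleanly and holds data
        except Exception:
            if delete_if_bad and os.path.exists(path):
                os.remove(path)  # drop the corrupt file so it can be re-fetched
            return False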
@@ -305,14 +310,14 @@ def get_time_list(time_s, time_e, delta, interval_type="hour"):
  return dt_list


- def transform_time(time_str):
+ def _transform_time(time_str):
  # old_time = '2023080203'
  # time_new = '2023-08-02T03%3A00%3A00Z'
  time_new = f"{time_str[:4]}-{time_str[4:6]}-{time_str[6:8]}T{time_str[8:10]}%3A00%3A00Z"
  return time_new


- def get_query_dict(var, lon_min, lon_max, lat_min, lat_max, time_str_ymdh, time_str_end=None, mode="single_depth", depth=None, level_num=None):
+ def _get_query_dict(var, lon_min, lon_max, lat_min, lat_max, time_str_ymdh, time_str_end=None, mode="single_depth", depth=None, level_num=None):
  query_dict = {
  "var": variable_info[var]["var_name"],
  "north": lat_max,
@@ -331,11 +336,11 @@ def get_query_dict(var, lon_min, lon_max, lat_min, lat_max, time_str_ymdh, time_
  }

  if time_str_end is not None:
- query_dict["time_start"] = transform_time(time_str_ymdh)
- query_dict["time_end"] = transform_time(time_str_end)
+ query_dict["time_start"] = _transform_time(time_str_ymdh)
+ query_dict["time_end"] = _transform_time(time_str_end)
  query_dict["timeStride"] = 1
  else:
- query_dict["time"] = transform_time(time_str_ymdh)
+ query_dict["time"] = _transform_time(time_str_ymdh)

  def get_nearest_level_index(depth):
  level_depth = [0.0, 2.0, 4.0, 6.0, 8.0, 10.0, 12.0, 15.0, 20.0, 25.0, 30.0, 35.0, 40.0, 45.0, 50.0, 60.0, 70.0, 80.0, 90.0, 100.0, 125.0, 150.0, 200.0, 250.0, 300.0, 350.0, 400.0, 500.0, 600.0, 700.0, 800.0, 900.0, 1000.0, 1250.0, 1500.0, 2000.0, 2500.0, 3000.0, 4000.0, 5000]
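
_transform_time, renamed above and called by _get_query_dict, produces the timestamp format the query string expects; the %3A runs are simply URL-encoded colons. A self-contained equivalent using only the standard library:

    # "2023080203" -> "2023-08-02T03%3A00%3A00Z" (colons percent-encoded).
    from urllib.parse import quote

    def transform_time_sketch(ymdh: str) -> str:
        iso = f"{ymdh[:4]}-{ymdh[4:6]}-{ymdh[6:8]}T{ymdh[8:10]}:00:00Z"
        return quote(iso)

    assert transform_time_sketch("2023080203") == "2023-08-02T03%3A00%3A00Z"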
@@ -360,7 +365,7 @@ def get_query_dict(var, lon_min, lon_max, lat_min, lat_max, time_str_ymdh, time_
  return query_dict


- def check_time_in_dataset_and_version(time_input, time_end=None):
+ def _check_time_in_dataset_and_version(time_input, time_end=None):
  # Decide whether we are handling a single time point or a time range
  is_single_time = time_end is None

@@ -417,8 +422,8 @@ def check_time_in_dataset_and_version(time_input, time_end=None):
  if is_single_time:
  return True
  else:
- base_url_s = get_base_url(d_list[0], v_list[0], "u", str(time_start))
- base_url_e = get_base_url(d_list[0], v_list[0], "u", str(time_end))
+ base_url_s = _get_base_url(d_list[0], v_list[0], "u", str(time_start))
+ base_url_e = _get_base_url(d_list[0], v_list[0], "u", str(time_end))
  if base_url_s == base_url_e:
  return True
  else:
@@ -429,7 +434,7 @@ def check_time_in_dataset_and_version(time_input, time_end=None):
  return False


- def ensure_time_in_specific_dataset_and_version(dataset_name, version_name, time_input, time_end=None):
+ def _ensure_time_in_specific_dataset_and_version(dataset_name, version_name, time_input, time_end=None):
  # Pad the time string to the full format based on its length
  if len(str(time_input)) == 8:
  time_input = str(time_input) + "00"
@@ -468,7 +473,7 @@ def ensure_time_in_specific_dataset_and_version(dataset_name, version_name, time
  return False


- def direct_choose_dataset_and_version(time_input, time_end=None):
+ def _direct_choose_dataset_and_version(time_input, time_end=None):
  # Assume data_info is a dict that holds the dataset and version information
  # Example structure: data_info['hourly']['dataset'][dataset_name]['version'][version_name]['time_range']

@@ -507,7 +512,7 @@ def direct_choose_dataset_and_version(time_input, time_end=None):
  return dataset_name_out, version_name_out


- def get_base_url(dataset_name, version_name, var, ymdh_str):
+ def _get_base_url(dataset_name, version_name, var, ymdh_str):
  year_str = int(ymdh_str[:4])
  url_dict = data_info["hourly"]["dataset"][dataset_name]["version"][version_name]["url"]
  classification_method = data_info["hourly"]["dataset"][dataset_name]["version"][version_name]["classification"]
@@ -548,160 +553,272 @@ def get_base_url(dataset_name, version_name, var, ymdh_str):
  return base_url


- def get_submit_url(dataset_name, version_name, var, ymdh_str, query_dict):
- base_url = get_base_url(dataset_name, version_name, var, ymdh_str)
+ def _get_submit_url(dataset_name, version_name, var, ymdh_str, query_dict):
+ base_url = _get_base_url(dataset_name, version_name, var, ymdh_str)
  if isinstance(query_dict["var"], str):
  query_dict["var"] = [query_dict["var"]]
  target_url = base_url + "&".join(f"var={var}" for var in query_dict["var"]) + "&" + "&".join(f"{key}={value}" for key, value in query_dict.items() if key != "var")
  return target_url


- def clear_existing_file(file_full_path):
+ def _clear_existing_file(file_full_path):
  if os.path.exists(file_full_path):
  os.remove(file_full_path)
  print(f"{file_full_path} has been removed")


- def check_existing_file(file_full_path, min_size):
+ def _check_existing_file(file_full_path, avg_size):
  if os.path.exists(file_full_path):
  print(f"[bold #FFA54F]{file_full_path} exists")
  fsize = file_size(file_full_path)
- if min_size:
- if fsize < min_size:
- print(f"[bold #FFA54F]{file_full_path} ({fsize:.2f} KB) may be incomplete")
- # clear_existing_file(file_full_path)
+ delta_size_ratio = (fsize - avg_size) / avg_size
+ if abs(delta_size_ratio) > 0.025:
+ if check_nc(file_full_path):
+ # print(f"File size is abnormal but can be opened normally, file size: {fsize:.2f} KB")
+ if not _check_ftime(file_full_path,if_print=True):
+ return False
+ else:
+ return True
+ else:
+ print(f"File size is abnormal and cannot be opened, {file_full_path}: {fsize:.2f} KB")
+ return False
+ else:
+ if not _check_ftime(file_full_path,if_print=True):
  return False
  else:
  return True
- if fsize < 5:
- print(f"[bold #FFA54F]{file_full_path} ({fsize:.2f} KB) may be incomplete")
- # clear_existing_file(file_full_path)
- return False
- else:
- return True
  else:
- # print(f'{file_full_path} does not exist')
  return False
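
The rewritten _check_existing_file drops the old fixed minimum-size threshold in favor of a relative tolerance against an average size: a deviation above 2.5% triggers deeper checks (does the file still open as NetCDF, and does its first time step match the filename?). A minimal sketch of the size rule alone, with hypothetical names:

    # Accept the size outright only within +/-2.5% of the running average.
    def size_is_plausible(actual_kb: float, avg_kb: float, tolerance: float = 0.025) -> bool:
        return abs(actual_kb - avg_kb) / avg_kb <= tolerance

    assert size_is_plausible(102.0, 100.0)      # +2.0%: accepted outright
    assert not size_is_plausible(90.0, 100.0)   # -10.0%: needs the NetCDF checks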


- def download_file(target_url, store_path, file_name, check=False):
- # Check if the file exists
- fname = Path(store_path) / file_name
- file_name_split = file_name.split("_")
- file_name_split = file_name_split[:-1]
- # same_file = f"{file_name_split[0]}_{file_name_split[1]}*nc"
- same_file = "_".join(file_name_split) + "*nc"
-
+ def _get_mean_size30(store_path, same_file):
  if same_file not in fsize_dict.keys():
- # print(f'Same file name: {same_file}')
- fsize_dict[same_file] = {"size": 0, "count": 0}
+ # print(f'Same file name: {same_file}')
+ fsize_dict[same_file] = {"size": 0, "count": 0}

  if fsize_dict[same_file]["count"] < 30 or fsize_dict[same_file]["size"] == 0:
  # Update the size estimate over the first 30 files; afterwards it is treated as representative of all files and no longer updated, to save time
  fsize_mean = mean_size(store_path, same_file, max_num=30)
- set_min_size = fsize_mean * 0.8
+ set_min_size = fsize_mean * 0.95
  fsize_dict[same_file]["size"] = set_min_size
  fsize_dict[same_file]["count"] += 1
  else:
  set_min_size = fsize_dict[same_file]["size"]
- if check:
- if check_existing_file(fname, set_min_size):
- count_dict["skip"] += 1
- return
- clear_existing_file(fname)
+ return set_min_size

- # -----------------------------------------------
- print(f"[bold #f0f6d0]Requesting {file_name}...")
- # Create a session
- s = requests.Session()
- download_success = False
- request_times = 0

- def calculate_wait_time(time_str, target_url):
- # Regular expression matching YYYYMMDDHH-format times
- time_pattern = r"\d{10}"
+ def _get_mean_size_move(same_file, current_file):
+ # Acquire the lock
+ with fsize_dict_lock: # global lock: only one thread may access at a time
+ # Initialize the dict entry if the file is not in it yet
+ if same_file not in fsize_dict.keys():
+ fsize_dict[same_file] = {"size_list": [], "mean_size": 1.0}

- # Two example strings
- # str1 = 'HYCOM_water_u_2018010100-2018010112.nc'
- # str2 = 'HYCOM_water_u_2018010100.nc'
+ tolerance_ratio = 0.025 # tolerated deviation ratio
+ current_file_size = file_size(current_file)
+
+ # If the list is not empty, compute the mean; otherwise keep it at 1
+ if fsize_dict[same_file]["size_list"]:
+ fsize_dict[same_file]["mean_size"] = sum(fsize_dict[same_file]["size_list"]) / len(fsize_dict[same_file]["size_list"])
+ fsize_dict[same_file]["mean_size"] = max(fsize_dict[same_file]["mean_size"], 1.0)
+ else:
+ fsize_dict[same_file]["mean_size"] = 1.0
+
+ size_difference_ratio = (current_file_size - fsize_dict[same_file]["mean_size"]) / fsize_dict[same_file]["mean_size"]
+
+ if abs(size_difference_ratio) > tolerance_ratio:
+ if check_nc(current_file):
+ # print(f"File size is abnormal but can be opened normally, file size: {current_file_size:.2f} KB")
+ # The file opens fine but its size is abnormal; keep the current file size
+ fsize_dict[same_file]["size_list"] = [current_file_size]
+ fsize_dict[same_file]["mean_size"] = current_file_size
+ else:
+ _clear_existing_file(current_file)
+ print(f"File size is abnormal, may need to be downloaded again, file size: {current_file_size:.2f} KB")
+ else:
+ # Append the current file size to the list and update the count
+ fsize_dict[same_file]["size_list"].append(current_file_size)

- # Find the times with the regular expression
- times_in_str = re.findall(time_pattern, time_str)
+ # Return the adjusted mean; as required here, this is the mean from before the new value was appended
+ return fsize_dict[same_file]["mean_size"]
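
_get_mean_size_move supersedes the 30-sample scheme of _get_mean_size30: it keeps a moving average per file pattern and guards the shared fsize_dict with a global lock so parallel download threads cannot interleave updates. The concurrency pattern in isolation (illustrative names, not the package API):

    # One process-wide lock serializes updates to the shared size statistics.
    from threading import Lock

    size_stats = {}          # file pattern -> list of observed sizes (KB)
    size_stats_lock = Lock()

    def update_mean(pattern: str, size_kb: float) -> float:
        with size_stats_lock:
            sizes = size_stats.setdefault(pattern, [])
            mean = sum(sizes) / len(sizes) if sizes else 1.0  # 1.0 until data exists
            sizes.append(size_kb)
            return mean  # like the original, the mean from before this sample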

- # Count the number of times found in each string
- num_times_str = len(times_in_str)

- if num_times_str > 1:
- delta_t = datetime.datetime.strptime(times_in_str[1], "%Y%m%d%H") - datetime.datetime.strptime(times_in_str[0], "%Y%m%d%H")
- delta_t = delta_t.total_seconds() / 3600
- delta_t = delta_t / 3 + 1
+ def _check_ftime(nc_file, tname="time", if_print=False):
+ if not os.path.exists(nc_file):
+ return False
+ nc_file = str(nc_file)
+ try:
+ ds = xr.open_dataset(nc_file)
+ real_time = ds[tname].values[0]
+ ds.close()
+ real_time = str(real_time)[:13]
+ real_time = real_time.replace("-", "").replace("T", "")
+ # -----------------------------------------------------
+ f_time = re.findall(r"\d{10}", nc_file)[0]
+ if real_time == f_time:
+ return True
  else:
- delta_t = 1
- # Wait at most 5 minutes per variable: too short and the request may fail; too long and time is wasted
- num_var = int(target_url.count("var="))
- if num_var <= 0:
- num_var = 1
- return int(delta_t * 5 * 60 * num_var)
-
- max_timeout = calculate_wait_time(file_name, target_url)
- print(f"[bold #912dbc]Max timeout: {max_timeout} seconds")
-
- # print(f'Download_start_time: {datetime.datetime.now()}')
- download_time_s = datetime.datetime.now()
- order_list = ["1st", "2nd", "3rd", "4th", "5th", "6th", "7th", "8th", "9th", "10th"]
- while not download_success:
- if request_times >= 10:
- # print(f'Download failed after {request_times} retries; skip it for now and try again later')
- print(f"[bold #ffe5c0]Download failed after {request_times} times\nYou can skip it and try again later")
- count_dict["fail"] += 1
- break
- if request_times > 0:
- # print(f'\rRetrying, attempt {request_times}', end="")
- print(f"[bold #ffe5c0]Retrying the {order_list[request_times-1]} time...")
- # Try to download the file
- try:
- headers = {"User-Agent": get_ua()}
- """ response = s.get(target_url, headers=headers, timeout=random.randint(5, max_timeout))
- response.raise_for_status() # raises HTTPError if the response status is not 200
-
- # Save the file
- with open(filename, 'wb') as f:
- f.write(response.content) """
-
- response = s.get(target_url, headers=headers, stream=True, timeout=random.randint(5, max_timeout)) # enable streaming
- response.raise_for_status() # raises HTTPError if the response status is not 200
- # Save the file
- with open(fname, "wb") as f:
- print(f"[bold #96cbd7]Downloading {file_name}...")
- for chunk in response.iter_content(chunk_size=1024):
- if chunk:
- f.write(chunk)
-
- f.close()
-
- # print(f'\rFile {fname} downloaded successfully', end="")
- if os.path.exists(fname):
- download_success = True
- download_time_e = datetime.datetime.now()
- download_delta = download_time_e - download_time_s
- print(f"[#3dfc40]File [bold #dfff73]{fname} [#3dfc40]has been downloaded successfully, Time: [#39cbdd]{download_delta}")
- count_dict["success"] += 1
- # print(f'Download_end_time: {datetime.datetime.now()}')
-
- except requests.exceptions.HTTPError as errh:
- print(f"Http Error: {errh}")
- except requests.exceptions.ConnectionError as errc:
- print(f"Error Connecting: {errc}")
- except requests.exceptions.Timeout as errt:
- print(f"Timeout Error: {errt}")
- except requests.exceptions.RequestException as err:
- print(f"OOps: Something Else: {err}")
-
- time.sleep(3)
- request_times += 1
-
-
- def check_hour_is_valid(ymdh_str):
+ if if_print:
+ print(f"[bold #daff5c]File time error, file/real time: [bold blue]{f_time}/{real_time}")
+ return False
+ except Exception as e:
+ if if_print:
+ print(f"[bold #daff5c]File time check failed, {nc_file}: {e}")
+ return False
+
+
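
_check_ftime covers a failure mode the size check cannot: a file that downloaded completely but holds the wrong time slice. It reads the first time value with xarray and compares it against the YYYYMMDDHH stamp embedded in the file name. Roughly, assuming a decodable time coordinate and a stamped name such as HYCOM_water_u_2018010100.nc:

    import re
    import xarray as xr

    def file_time_matches(nc_path: str) -> bool:
        stamp = re.findall(r"\d{10}", nc_path)[0]   # time encoded in the name
        with xr.open_dataset(nc_path) as ds:
            first = str(ds["time"].values[0])[:13]  # e.g. '2018-01-01T00'
        return first.replace("-", "").replace("T", "") == stamp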
+ def _correct_time(nc_file):
+ # Open the NC file
+ dataset = nc.Dataset(nc_file)
+
+ # Read the time units
+ time_units = dataset.variables["time"].units
+
+ # Close the file
+ dataset.close()
+
+ # Parse the units string to get the time origin
+ origin_str = time_units.split("since")[1].strip()
+ origin_datetime = datetime.datetime.strptime(origin_str, "%Y-%m-%d %H:%M:%S")
+
+ # Extract the date string from the file name
+ given_date_str = re.findall(r"\d{10}", str(nc_file))[0]
+
+ # Convert the extracted date string to a datetime object
+ given_datetime = datetime.datetime.strptime(given_date_str, "%Y%m%d%H")
+
+ # Compute the difference between the given date and the time origin (in hours)
+ time_difference = (given_datetime - origin_datetime).total_seconds()
+ if "hours" in time_units:
+ time_difference /= 3600
+ elif "days" in time_units:
+ time_difference /= 3600 * 24
+
+ # Rewrite the time variable in the NC file
+ modify_nc(nc_file, "time", None, time_difference)
+
+
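
_correct_time repairs files whose stored time disagrees with the filename: it parses the origin out of the units attribute and rewrites the time variable as the offset from that origin, in hours or days as the units dictate. The arithmetic, worked for one hypothetical case:

    # With units "hours since 2000-01-01 00:00:00" and a file stamped 2018010100,
    # the corrected time value is the whole-hour distance from the origin.
    import datetime

    origin = datetime.datetime.strptime("2000-01-01 00:00:00", "%Y-%m-%d %H:%M:%S")
    given = datetime.datetime.strptime("2018010100", "%Y%m%d%H")
    print((given - origin).total_seconds() / 3600)  # 157800.0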
+ def _download_file(target_url, store_path, file_name, check=False):
+ # Check if the file exists
+ fname = Path(store_path) / file_name
+ file_name_split = file_name.split("_")
+ file_name_split = file_name_split[:-1]
+ # same_file = f"{file_name_split[0]}_{file_name_split[1]}*nc"
+ same_file = "_".join(file_name_split) + "*nc"
+
+ if check:
+ if same_file not in fsize_dict.keys(): # check the first file on its own, since there is no size to compare against yet
+ check_nc(fname,if_delete=True)
+
+ # set_min_size = _get_mean_size30(store_path, same_file) # original scheme: average over the first 30 files only; cannot adapt if sizes change
+ get_mean_size = _get_mean_size_move(same_file, fname)
+
+ if _check_existing_file(fname, get_mean_size):
+ count_dict["skip"] += 1
+ return
+ _clear_existing_file(fname)
+
+ if not use_idm:
+ # -----------------------------------------------
+ print(f"[bold #f0f6d0]Requesting {file_name} ...")
+ # Create a session
+ s = requests.Session()
+ download_success = False
+ request_times = 0
+
+ def calculate_wait_time(time_str, target_url):
+ # Regular expression matching YYYYMMDDHH-format times
+ time_pattern = r"\d{10}"
+
+ # Two example strings
+ # str1 = 'HYCOM_water_u_2018010100-2018010112.nc'
+ # str2 = 'HYCOM_water_u_2018010100.nc'
+
+ # Find the times with the regular expression
+ times_in_str = re.findall(time_pattern, time_str)
+
+ # Count the number of times found in the string
+ num_times_str = len(times_in_str)
+
+ if num_times_str > 1:
+ delta_t = datetime.datetime.strptime(times_in_str[1], "%Y%m%d%H") - datetime.datetime.strptime(times_in_str[0], "%Y%m%d%H")
+ delta_t = delta_t.total_seconds() / 3600
+ delta_t = delta_t / 3 + 1
+ else:
+ delta_t = 1
+ # Wait at most 5 minutes per variable: too short and the request may fail; too long and time is wasted
+ num_var = int(target_url.count("var="))
+ if num_var <= 0:
+ num_var = 1
+ return int(delta_t * 5 * 60 * num_var)
+
+ max_timeout = calculate_wait_time(file_name, target_url)
+ print(f"[bold #912dbc]Max timeout: {max_timeout} seconds")
+
+ # print(f'Download_start_time: {datetime.datetime.now()}')
+ download_time_s = datetime.datetime.now()
+ order_list = ["1st", "2nd", "3rd", "4th", "5th", "6th", "7th", "8th", "9th", "10th"]
+ while not download_success:
+ if request_times >= 10:
+ # print(f'Download failed after {request_times} retries; skip it for now and try again later')
+ print(f"[bold #ffe5c0]Download failed after {request_times} times\nYou can skip it and try again later")
+ count_dict["fail"] += 1
+ break
+ if request_times > 0:
+ # print(f'\rRetrying, attempt {request_times}', end="")
+ print(f"[bold #ffe5c0]Retrying the {order_list[request_times-1]} time...")
+ # Try to download the file
+ try:
+ headers = {"User-Agent": get_ua()}
+ """ response = s.get(target_url, headers=headers, timeout=random.randint(5, max_timeout))
+ response.raise_for_status() # raises HTTPError if the response status is not 200
+
+ # Save the file
+ with open(filename, 'wb') as f:
+ f.write(response.content) """
+
+ response = s.get(target_url, headers=headers, stream=True, timeout=random.randint(5, max_timeout)) # enable streaming
+ response.raise_for_status() # raises HTTPError if the response status is not 200
+ # Save the file
+ with open(fname, "wb") as f:
+ print(f"[bold #96cbd7]Downloading {file_name} ...")
+ for chunk in response.iter_content(chunk_size=1024):
+ if chunk:
+ f.write(chunk)
+
+ f.close()
+
+ if not _check_ftime(fname):
+ _correct_time(fname)
+
+ # print(f'\rFile {fname} downloaded successfully', end="")
+ if os.path.exists(fname):
+ download_success = True
+ download_time_e = datetime.datetime.now()
+ download_delta = download_time_e - download_time_s
+ print(f"[#3dfc40]File [bold #dfff73]{fname} [#3dfc40]has been downloaded successfully, Time: [#39cbdd]{download_delta}")
+ count_dict["success"] += 1
+ # print(f'Download_end_time: {datetime.datetime.now()}')
+
+ except requests.exceptions.HTTPError as errh:
+ print(f"Http Error: {errh}")
+ except requests.exceptions.ConnectionError as errc:
+ print(f"Error Connecting: {errc}")
+ except requests.exceptions.Timeout as errt:
+ print(f"Timeout Error: {errt}")
+ except requests.exceptions.RequestException as err:
+ print(f"OOps: Something Else: {err}")
+
+ time.sleep(3)
+ request_times += 1
+ else:
+ idm_downloader(target_url, store_path, file_name, given_idm_engine)
+ idm_download_list.append(fname)
+ print(f"[bold #3dfc40]File [bold #dfff73]{fname} [#3dfc40]has been submit to IDM for downloading")
+
+
+ def _check_hour_is_valid(ymdh_str):
  # hour should be 00, 03, 06, 09, 12, 15, 18, 21
  hh = int(str(ymdh_str[-2:]))
  if hh in [0, 3, 6, 9, 12, 15, 18, 21]:
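
The requests branch keeps the calculate_wait_time heuristic: the timeout grows with the time span encoded in the file name (one unit per 3-hour step, plus one) and with the number of var= fields in the URL, at five minutes per unit per variable. The same logic as a standalone function, with one worked case:

    # A 12-hour file (2018010100-2018010112) gives delta_t = 12/3 + 1 = 5;
    # with two "var=" fields the cap is 5 * 5 * 60 * 2 = 3000 seconds.
    import datetime
    import re

    def max_timeout(file_name: str, target_url: str) -> int:
        stamps = re.findall(r"\d{10}", file_name)
        if len(stamps) > 1:
            span_h = (datetime.datetime.strptime(stamps[1], "%Y%m%d%H")
                      - datetime.datetime.strptime(stamps[0], "%Y%m%d%H")).total_seconds() / 3600
            delta_t = span_h / 3 + 1
        else:
            delta_t = 1
        num_var = max(target_url.count("var="), 1)
        return int(delta_t * 5 * 60 * num_var)

    assert max_timeout("HYCOM_water_u_2018010100-2018010112.nc", "?var=water_u&var=water_v") == 3000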
@@ -710,9 +827,9 @@ def check_hour_is_valid(ymdh_str):
  return False


- def check_dataset_version(dataset_name, version_name, download_time, download_time_end=None):
+ def _check_dataset_version(dataset_name, version_name, download_time, download_time_end=None):
  if dataset_name is not None and version_name is not None:
- just_ensure = ensure_time_in_specific_dataset_and_version(dataset_name, version_name, download_time, download_time_end)
+ just_ensure = _ensure_time_in_specific_dataset_and_version(dataset_name, version_name, download_time, download_time_end)
  if just_ensure:
  return dataset_name, version_name
  else:
@@ -725,7 +842,7 @@ def check_dataset_version(dataset_name, version_name, download_time, download_ti
  download_time_str = download_time_str + "00"

  # Check whether the hour is valid (if needed)
- if download_time_end is None and not check_hour_is_valid(download_time_str):
+ if download_time_end is None and not _check_hour_is_valid(download_time_str):
  print("Please ensure the hour is 00, 03, 06, 09, 12, 15, 18, 21")
  raise ValueError("The hour is invalid")

@@ -733,18 +850,18 @@ def check_dataset_version(dataset_name, version_name, download_time, download_ti
  if download_time_end is not None:
  if len(str(download_time_end)) == 8:
  download_time_end = str(download_time_end) + "21"
- have_data = check_time_in_dataset_and_version(download_time_str, download_time_end)
+ have_data = _check_time_in_dataset_and_version(download_time_str, download_time_end)
  if have_data:
- return direct_choose_dataset_and_version(download_time_str, download_time_end)
+ return _direct_choose_dataset_and_version(download_time_str, download_time_end)
  else:
- have_data = check_time_in_dataset_and_version(download_time_str)
+ have_data = _check_time_in_dataset_and_version(download_time_str)
  if have_data:
- return direct_choose_dataset_and_version(download_time_str)
+ return _direct_choose_dataset_and_version(download_time_str)

  return None, None


- def get_submit_url_var(var, depth, level_num, lon_min, lon_max, lat_min, lat_max, dataset_name, version_name, download_time, download_time_end=None):
+ def _get_submit_url_var(var, depth, level_num, lon_min, lon_max, lat_min, lat_max, dataset_name, version_name, download_time, download_time_end=None):
  # year_str = str(download_time)[:4]
  ymdh_str = str(download_time)
  if depth is not None and level_num is not None:
@@ -760,19 +877,19 @@ def get_submit_url_var(var, depth, level_num, lon_min, lon_max, lat_min, lat_max
  else:
  # print("Full depth or full level data will be downloaded...")
  which_mode = "full"
- query_dict = get_query_dict(var, lon_min, lon_max, lat_min, lat_max, download_time, download_time_end, which_mode, depth, level_num)
- submit_url = get_submit_url(dataset_name, version_name, var, ymdh_str, query_dict)
+ query_dict = _get_query_dict(var, lon_min, lon_max, lat_min, lat_max, download_time, download_time_end, which_mode, depth, level_num)
+ submit_url = _get_submit_url(dataset_name, version_name, var, ymdh_str, query_dict)
  return submit_url


- def prepare_url_to_download(var, lon_min=0, lon_max=359.92, lat_min=-80, lat_max=90, download_time="2024083100", download_time_end=None, depth=None, level_num=None, store_path=None, dataset_name=None, version_name=None, check=False):
+ def _prepare_url_to_download(var, lon_min=0, lon_max=359.92, lat_min=-80, lat_max=90, download_time="2024083100", download_time_end=None, depth=None, level_num=None, store_path=None, dataset_name=None, version_name=None, check=False):
  print("[bold #ecdbfe]-" * 160)
  download_time = str(download_time)
  if download_time_end is not None:
  download_time_end = str(download_time_end)
- dataset_name, version_name = check_dataset_version(dataset_name, version_name, download_time, download_time_end)
+ dataset_name, version_name = _check_dataset_version(dataset_name, version_name, download_time, download_time_end)
  else:
- dataset_name, version_name = check_dataset_version(dataset_name, version_name, download_time)
+ dataset_name, version_name = _check_dataset_version(dataset_name, version_name, download_time)
  if dataset_name is None and version_name is None:
  count_dict["no_data"] += 1
  if download_time_end is not None:
@@ -787,11 +904,11 @@ def prepare_url_to_download(var, lon_min=0, lon_max=359.92, lat_min=-80, lat_max
  if isinstance(var, list):
  if len(var) == 1:
  var = var[0]
- submit_url = get_submit_url_var(var, depth, level_num, lon_min, lon_max, lat_min, lat_max, dataset_name, version_name, download_time, download_time_end)
+ submit_url = _get_submit_url_var(var, depth, level_num, lon_min, lon_max, lat_min, lat_max, dataset_name, version_name, download_time, download_time_end)
  file_name = f"HYCOM_{variable_info[var]['var_name']}_{download_time}.nc"
  if download_time_end is not None:
  file_name = f"HYCOM_{variable_info[var]['var_name']}_{download_time}-{download_time_end}.nc" # the time part must not use underscores here, otherwise later lookups for files of the same variable will fail
- download_file(submit_url, store_path, file_name, check)
+ _download_file(submit_url, store_path, file_name, check)
  else:
  if download_time < "2024081012":
  varlist = [_ for _ in var]
@@ -804,7 +921,7 @@ def prepare_url_to_download(var, lon_min=0, lon_max=359.92, lat_min=-80, lat_max
  continue

  var = current_group[0]
- submit_url = get_submit_url_var(var, depth, level_num, lon_min, lon_max, lat_min, lat_max, dataset_name, version_name, download_time, download_time_end)
+ submit_url = _get_submit_url_var(var, depth, level_num, lon_min, lon_max, lat_min, lat_max, dataset_name, version_name, download_time, download_time_end)
  file_name = f"HYCOM_{variable_info[var]['var_name']}_{download_time}.nc"
  old_str = f'var={variable_info[var]["var_name"]}'
  new_str = f'var={variable_info[var]["var_name"]}'
@@ -816,17 +933,17 @@ def prepare_url_to_download(var, lon_min=0, lon_max=359.92, lat_min=-80, lat_max
  file_name = f"HYCOM_{key}_{download_time}.nc"
  if download_time_end is not None:
  file_name = f"HYCOM_{key}_{download_time}-{download_time_end}.nc" # the time part must not use underscores here, otherwise later lookups for files of the same variable will fail
- download_file(submit_url, store_path, file_name, check)
+ _download_file(submit_url, store_path, file_name, check)
  else:
  for v in var:
- submit_url = get_submit_url_var(v, depth, level_num, lon_min, lon_max, lat_min, lat_max, dataset_name, version_name, download_time, download_time_end)
+ submit_url = _get_submit_url_var(v, depth, level_num, lon_min, lon_max, lat_min, lat_max, dataset_name, version_name, download_time, download_time_end)
  file_name = f"HYCOM_{variable_info[v]['var_name']}_{download_time}.nc"
  if download_time_end is not None:
  file_name = f"HYCOM_{variable_info[v]['var_name']}_{download_time}-{download_time_end}.nc"
- download_file(submit_url, store_path, file_name, check)
+ _download_file(submit_url, store_path, file_name, check)


- def convert_full_name_to_short_name(full_name):
+ def _convert_full_name_to_short_name(full_name):
  for var, info in variable_info.items():
  if full_name == info["var_name"] or full_name == info["standard_name"] or full_name == var:
  return var
@@ -836,7 +953,7 @@ def convert_full_name_to_short_name(full_name):
  return False


- def download_task(var, time_str, time_str_end, lon_min, lon_max, lat_min, lat_max, depth, level, store_path, dataset_name, version_name, check):
+ def _download_task(var, time_str, time_str_end, lon_min, lon_max, lat_min, lat_max, depth, level, store_path, dataset_name, version_name, check):
  """
  # Parallel download task
  # This function exists for parallel downloading and is required; calling direct_download directly in parallel causes problems
@@ -847,10 +964,10 @@ def download_task(var, time_str, time_str_end, lon_min, lon_max, lat_min, lat_ma
  Therefore, even when multiple tasks run at the same time, the data does not get mixed up between them.
  """

- prepare_url_to_download(var, lon_min, lon_max, lat_min, lat_max, time_str, time_str_end, depth, level, store_path, dataset_name, version_name, check)
+ _prepare_url_to_download(var, lon_min, lon_max, lat_min, lat_max, time_str, time_str_end, depth, level, store_path, dataset_name, version_name, check)


- def done_callback(future, progress, task, total, counter_lock):
+ def _done_callback(future, progress, task, total, counter_lock):
  """
  # Callback function for parallel download tasks
  # This function exists for parallel downloading and is required; calling direct_download directly in parallel causes problems
@@ -866,7 +983,7 @@ def done_callback(future, progress, task, total, counter_lock):
  progress.update(task, advance=1, description=f"[cyan]Downloading... {parallel_counter}/{total}")


- def download_hourly_func(var, time_s, time_e, lon_min=0, lon_max=359.92, lat_min=-80, lat_max=90, depth=None, level=None, store_path=None, dataset_name=None, version_name=None, num_workers=None, check=False, ftimes=1):
+ def _download_hourly_func(var, time_s, time_e, lon_min=0, lon_max=359.92, lat_min=-80, lat_max=90, depth=None, level=None, store_path=None, dataset_name=None, version_name=None, num_workers=None, check=False, ftimes=1):
  """
  Description:
  Download the data of single time or a series of time
@@ -895,7 +1012,7 @@ def download_hourly_func(var, time_s, time_e, lon_min=0, lon_max=359.92, lat_min
  parallel_counter = 0
  counter_lock = Lock() # create a lock for a thread-safe counter
  if ymdh_time_s == ymdh_time_e:
- prepare_url_to_download(var, lon_min, lon_max, lat_min, lat_max, ymdh_time_s, None, depth, level, store_path, dataset_name, version_name, check)
+ _prepare_url_to_download(var, lon_min, lon_max, lat_min, lat_max, ymdh_time_s, None, depth, level, store_path, dataset_name, version_name, check)
  elif int(ymdh_time_s) < int(ymdh_time_e):
  print("Downloading a series of files...")
  time_list = get_time_list(ymdh_time_s, ymdh_time_e, 3, "hour")
@@ -905,16 +1022,16 @@ def download_hourly_func(var, time_s, time_e, lon_min=0, lon_max=359.92, lat_min
  if num_workers is None or num_workers <= 1:
  # Serial mode
  for i, time_str in enumerate(time_list):
- prepare_url_to_download(var, lon_min, lon_max, lat_min, lat_max, time_str, None, depth, level, store_path, dataset_name, version_name, check)
+ _prepare_url_to_download(var, lon_min, lon_max, lat_min, lat_max, time_str, None, depth, level, store_path, dataset_name, version_name, check)
  progress.update(task, advance=1, description=f"[cyan]Downloading... {i+1}/{len(time_list)}")
  else:
  # Parallel mode
  with ThreadPoolExecutor(max_workers=num_workers) as executor:
- futures = [executor.submit(download_task, var, time_str, None, lon_min, lon_max, lat_min, lat_max, depth, level, store_path, dataset_name, version_name, check) for time_str in time_list]
+ futures = [executor.submit(_download_task, var, time_str, None, lon_min, lon_max, lat_min, lat_max, depth, level, store_path, dataset_name, version_name, check) for time_str in time_list]
  """ for i, future in enumerate(futures):
  future.add_done_callback(lambda _: progress.update(task, advance=1, description=f"[cyan]Downloading... {i+1}/{len(time_list)}")) """
  for feature in as_completed(futures):
- done_callback(feature, progress, task, len(time_list), counter_lock)
+ _done_callback(feature, progress, task, len(time_list), counter_lock)
  else:
  new_time_list = get_time_list(ymdh_time_s, ymdh_time_e, 3 * ftimes, "hour")
  total_num = len(new_time_list)
@@ -923,21 +1040,21 @@ def download_hourly_func(var, time_s, time_e, lon_min=0, lon_max=359.92, lat_min
  for i, time_str in enumerate(new_time_list):
  time_str_end_index = int(min(len(time_list) - 1, int(i * ftimes + ftimes - 1)))
  time_str_end = time_list[time_str_end_index]
- prepare_url_to_download(var, lon_min, lon_max, lat_min, lat_max, time_str, time_str_end, depth, level, store_path, dataset_name, version_name, check)
+ _prepare_url_to_download(var, lon_min, lon_max, lat_min, lat_max, time_str, time_str_end, depth, level, store_path, dataset_name, version_name, check)
  progress.update(task, advance=1, description=f"[cyan]Downloading... {i+1}/{total_num}")
  else:
  # Parallel mode
  with ThreadPoolExecutor(max_workers=num_workers) as executor:
- futures = [executor.submit(download_task, var, new_time_list[i], time_list[int(min(len(time_list) - 1, int(i * ftimes + ftimes - 1)))], lon_min, lon_max, lat_min, lat_max, depth, level, store_path, dataset_name, version_name, check) for i in range(total_num)]
+ futures = [executor.submit(_download_task, var, new_time_list[i], time_list[int(min(len(time_list) - 1, int(i * ftimes + ftimes - 1)))], lon_min, lon_max, lat_min, lat_max, depth, level, store_path, dataset_name, version_name, check) for i in range(total_num)]
  """ for i, future in enumerate(futures):
  future.add_done_callback(lambda _: progress.update(task, advance=1, description=f"[cyan]Downloading... {i+1}/{total_num}")) """
  for feature in as_completed(futures):
- done_callback(feature, progress, task, len(time_list), counter_lock)
+ _done_callback(feature, progress, task, len(time_list), counter_lock)
  else:
  print("Please ensure the time_s is no more than time_e")


- def download(var, time_s, time_e=None, lon_min=0, lon_max=359.92, lat_min=-80, lat_max=90, depth=None, level=None, store_path=None, dataset_name=None, version_name=None, num_workers=None, check=False, ftimes=1):
+ def download(var, time_s, time_e=None, lon_min=0, lon_max=359.92, lat_min=-80, lat_max=90, depth=None, level=None, store_path=None, dataset_name=None, version_name=None, num_workers=None, check=False, ftimes=1, idm_engine=None):
  """
  Description:
  Download the data of single time or a series of time
@@ -958,11 +1075,12 @@ def download(var, time_s, time_e=None, lon_min=0, lon_max=359.92, lat_min=-80, l
  num_workers: int, the number of workers, default is None, if not set, the number of workers will be 1; suggest not to set the number of workers too large
  check: bool, whether to check the existing file, default is False, if set to True, the existing file will be checked and not downloaded again; else, the existing file will be covered
  ftimes: int, the number of time in one file, default is 1, if set to 1, the data of single time will be downloaded; the maximum is 8, if set to 8, the data of 8 times will be downloaded in one file
+ idm_engine: str, the IDM engine, default is None, if set, the IDM will be used to download the data; example: "D:\\Programs\\Internet Download Manager\\IDMan.exe"

  Returns:
  None
  """
- get_initial_data()
+ _get_initial_data()

  # Print information and handle the dataset and version names
  if dataset_name is None and version_name is None:
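
The new idm_engine parameter switches the transport: when it points at an IDMan.exe, every request is queued in IDM via idm_downloader instead of fetched with requests, and verification happens after the fact. A call mirroring the __main__ example at the bottom of this diff (the import path and time_e value here are illustrative, not confirmed by the diff):

    from pathlib import Path
    from oafuncs.oa_down.hycom_3hourly import download  # assumed module path

    download(
        var="water_u",
        time_s="2018010800",
        time_e="2018011000",                 # illustrative end time
        store_path=Path(r"G:\Data\HYCOM\3hourly"),
        lon_min=105, lon_max=130, lat_min=15, lat_max=45,
        num_workers=1,
        check=True,
        idm_engine=r"D:\Programs\Internet Download Manager\IDMan.exe",  # omit to use requests
    )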
@@ -980,11 +1098,11 @@ def download(var, time_s, time_e=None, lon_min=0, lon_max=359.92, lat_min=-80, l

  if isinstance(var, list):
  if len(var) == 1:
- var = convert_full_name_to_short_name(var[0])
+ var = _convert_full_name_to_short_name(var[0])
  else:
- var = [convert_full_name_to_short_name(v) for v in var]
+ var = [_convert_full_name_to_short_name(v) for v in var]
  elif isinstance(var, str):
- var = convert_full_name_to_short_name(var)
+ var = _convert_full_name_to_short_name(var)
  else:
  raise ValueError("The var is invalid")
  if var is False:
@@ -1005,8 +1123,8 @@ def download(var, time_s, time_e=None, lon_min=0, lon_max=359.92, lat_min=-80, l
  os.makedirs(str(store_path), exist_ok=True)

  if num_workers is not None:
- num_workers = max(min(num_workers, 10), 1)
-
+ num_workers = max(min(num_workers, 10), 1) # do not limit the maximum for now; more threads can be opened when re-checking
+ # num_workers = int(max(num_workers, 1))
  time_s = str(time_s)
  if len(time_s) == 8:
  time_s += "00"
@@ -1025,8 +1143,37 @@ def download(var, time_s, time_e=None, lon_min=0, lon_max=359.92, lat_min=-80, l

  global fsize_dict
  fsize_dict = {}
-
- download_hourly_func(var, time_s, time_e, lon_min, lon_max, lat_min, lat_max, depth, level, store_path, dataset_name, version_name, num_workers, check, ftimes)
+
+ global fsize_dict_lock
+ fsize_dict_lock = Lock()
+
+ global use_idm, given_idm_engine, idm_download_list
+ if idm_engine is not None:
+ use_idm = True
+ given_idm_engine = idm_engine
+ idm_download_list = []
+ else:
+ use_idm = False
+
+
+ _download_hourly_func(var, time_s, time_e, lon_min, lon_max, lat_min, lat_max, depth, level, store_path, dataset_name, version_name, num_workers, check, ftimes)
+
+ if idm_download_list:
+ for f in idm_download_list:
+ wait_success = 0
+ success = False
+ while not success:
+ if check_nc(f):
+ _correct_time(f)
+ success = True
+ count_dict["success"] += 1
+ else:
+ wait_success += 1
+ time.sleep(3)
+ if wait_success >= 20:
+ success = True
+ # print(f'{f} download failed')
+ count_dict["fail"] += 1

  count_dict["total"] = count_dict["success"] + count_dict["fail"] + count_dict["skip"] + count_dict["no_data"]
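
The block above is the reconciliation pass for IDM transfers: after all jobs are queued, each expected file is polled with check_nc, time-corrected via _correct_time once valid, and abandoned after 20 polls at 3-second intervals. The control flow in isolation (is_valid and fix_time stand in for check_nc and _correct_time):

    import time

    def reconcile(paths, is_valid, fix_time, polls=20, interval=3):
        ok, failed = 0, 0
        for p in paths:
            for _ in range(polls):
                if is_valid(p):
                    fix_time(p)
                    ok += 1
                    break
                time.sleep(interval)
            else:
                failed += 1  # never validated within polls * interval seconds
        return ok, failed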

@@ -1094,9 +1241,9 @@ def how_to_use():


  if __name__ == "__main__":
- time_s, time_e = "2024101012", "2024101018"
+ time_s, time_e = "2018010800", "2024083121"
  merge_name = f"{time_s}_{time_e}" # name of the merged file
- root_path = r"G:\Data\HYCOM\3hourly_test"
+ root_path = r"G:\Data\HYCOM\3hourly"
  location_dict = {"west": 105, "east": 130, "south": 15, "north": 45}
  download_dict = {
  "water_u": {"simple_name": "u", "download": 1},
@@ -1116,10 +1263,11 @@ if __name__ == "__main__":
  # if you wanna download all depth or level, set both False
  depth = None # or 0-5000 meters
  level = None # or 1-40 levels
- num_workers = 3
+ num_workers = 1

  check = True
  ftimes = 1
+ idm_engine = r"D:\Programs\Internet Download Manager\IDMan.exe"

  download_switch, single_var = True, False
  combine_switch = False
@@ -1130,9 +1278,9 @@ if __name__ == "__main__":
  if download_switch:
  if single_var:
  for var_name in var_list:
- download(var=var_name, time_s=time_s, time_e=time_e, store_path=Path(root_path), lon_min=location_dict["west"], lon_max=location_dict["east"], lat_min=location_dict["south"], lat_max=location_dict["north"], num_workers=num_workers, check=check, depth=depth, level=level, ftimes=ftimes)
+ download(var=var_name, time_s=time_s, time_e=time_e, store_path=Path(root_path), lon_min=location_dict["west"], lon_max=location_dict["east"], lat_min=location_dict["south"], lat_max=location_dict["north"], num_workers=num_workers, check=check, depth=depth, level=level, ftimes=ftimes, idm_engine=idm_engine)
  else:
- download(var=var_list, time_s=time_s, time_e=time_e, store_path=Path(root_path), lon_min=location_dict["west"], lon_max=location_dict["east"], lat_min=location_dict["south"], lat_max=location_dict["north"], num_workers=num_workers, check=check, depth=depth, level=level, ftimes=ftimes)
+ download(var=var_list, time_s=time_s, time_e=time_e, store_path=Path(root_path), lon_min=location_dict["west"], lon_max=location_dict["east"], lat_min=location_dict["south"], lat_max=location_dict["north"], num_workers=num_workers, check=check, depth=depth, level=level, ftimes=ftimes, idm_engine=idm_engine)

  """ if combine_switch or copy_switch:
  time_list = get_time_list(time_s, time_e, 3, 'hour')