oafuncs 0.0.97.16__py3-none-any.whl → 0.0.97.17__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -2,10 +2,10 @@
2
2
  # coding=utf-8
3
3
  """
4
4
  Author: Liu Kun && 16031215@qq.com
5
- Date: 2025-01-29 19:05:09
5
+ Date: 2025-04-07 10:51:09
6
6
  LastEditors: Liu Kun && 16031215@qq.com
7
- LastEditTime: 2025-01-29 19:05:10
8
- FilePath: \\Python\\My_Funcs\\OAFuncs\\oafuncs\\oa_down\\hycom_3hourly_20250129 copy.py
7
+ LastEditTime: 2025-04-07 10:51:09
8
+ FilePath: \\Python\\My_Funcs\\OAFuncs\\oafuncs\\oa_down\\hycom_3hourly copy.py
9
9
  Description:
10
10
  EditPlatform: vscode
11
11
  ComputerInfo: XPS 15 9510
@@ -34,7 +34,7 @@ from rich.progress import Progress
34
34
 
35
35
  from oafuncs.oa_down.idm import downloader as idm_downloader
36
36
  from oafuncs.oa_down.user_agent import get_ua
37
- from oafuncs.oa_file import file_size, mean_size
37
+ from oafuncs.oa_file import file_size
38
38
  from oafuncs.oa_nc import check as check_nc
39
39
  from oafuncs.oa_nc import modify as modify_nc
40
40
 
@@ -415,10 +415,12 @@ def _check_time_in_dataset_and_version(time_input, time_end=None):
415
415
  have_data = True
416
416
 
417
417
  # 输出结果
418
- print(f"[bold red]{time_input_str} is in the following dataset and version:")
418
+ if match_time is None:
419
+ print(f"[bold red]{time_input_str} is in the following dataset and version:")
419
420
  if have_data:
420
- for d, v, trange in zip(d_list, v_list, trange_list):
421
- print(f"[bold blue]{d} {v} {trange}")
421
+ if match_time is None:
422
+ for d, v, trange in zip(d_list, v_list, trange_list):
423
+ print(f"[bold blue]{d} {v} {trange}")
422
424
  if is_single_time:
423
425
  return True
424
426
  else:
@@ -501,12 +503,11 @@ def _direct_choose_dataset_and_version(time_input, time_end=None):
501
503
 
502
504
  # 检查时间是否在数据集版本的时间范围内
503
505
  if time_start >= time_s and time_end <= time_e:
504
- # print(f'[bold purple]dataset: {dataset_name}, version: {version_name} is chosen')
505
- # return dataset_name, version_name
506
506
  dataset_name_out, version_name_out = dataset_name, version_name
507
507
 
508
508
  if dataset_name_out is not None and version_name_out is not None:
509
- print(f"[bold purple]dataset: {dataset_name_out}, version: {version_name_out} is chosen")
509
+ if match_time is None:
510
+ print(f"[bold purple]dataset: {dataset_name_out}, version: {version_name_out} is chosen")
510
511
 
511
512
  # 如果没有找到匹配的数据集和版本,会返回 None
512
513
  return dataset_name_out, version_name_out
@@ -583,10 +584,9 @@ def _check_existing_file(file_full_path, avg_size):
583
584
  delta_size_ratio = (fsize - avg_size) / avg_size
584
585
  if abs(delta_size_ratio) > 0.025:
585
586
  if check_nc(file_full_path):
586
- # print(f"File size is abnormal but can be opened normally, file size: {fsize:.2f} KB")
587
587
  return True
588
588
  else:
589
- print(f"File size is abnormal and cannot be opened, {file_full_path}: {fsize:.2f} KB")
589
+ # print(f"File size is abnormal and cannot be opened, {file_full_path}: {fsize:.2f} KB")
590
590
  return False
591
591
  else:
592
592
  return True
@@ -594,33 +594,14 @@ def _check_existing_file(file_full_path, avg_size):
594
594
  return False
595
595
 
596
596
 
597
- def _get_mean_size30(store_path, same_file):
598
- if same_file not in fsize_dict.keys():
599
- # print(f'Same file name: {same_file}')
600
- fsize_dict[same_file] = {"size": 0, "count": 0}
601
-
602
- if fsize_dict[same_file]["count"] < 30 or fsize_dict[same_file]["size"] == 0:
603
- # 更新30次文件最小值,后续认为可以代表所有文件,不再更新占用时间
604
- fsize_mean = mean_size(store_path, same_file, max_num=30)
605
- set_min_size = fsize_mean * 0.95
606
- fsize_dict[same_file]["size"] = set_min_size
607
- fsize_dict[same_file]["count"] += 1
608
- else:
609
- set_min_size = fsize_dict[same_file]["size"]
610
- return set_min_size
611
-
612
-
613
597
  def _get_mean_size_move(same_file, current_file):
614
- # 获取锁
615
- with fsize_dict_lock: # 全局锁,确保同一时间只能有一个线程访问
616
- # 初始化字典中的值,如果文件不在字典中
598
+ with fsize_dict_lock:
617
599
  if same_file not in fsize_dict.keys():
618
600
  fsize_dict[same_file] = {"size_list": [], "mean_size": 1.0}
619
601
 
620
- tolerance_ratio = 0.025 # 容忍的阈值比例
602
+ tolerance_ratio = 0.025
621
603
  current_file_size = file_size(current_file)
622
604
 
623
- # 如果列表不为空,则计算平均值,否则保持为1
624
605
  if fsize_dict[same_file]["size_list"]:
625
606
  fsize_dict[same_file]["mean_size"] = sum(fsize_dict[same_file]["size_list"]) / len(fsize_dict[same_file]["size_list"])
626
607
  fsize_dict[same_file]["mean_size"] = max(fsize_dict[same_file]["mean_size"], 1.0)
@@ -630,19 +611,15 @@ def _get_mean_size_move(same_file, current_file):
630
611
  size_difference_ratio = (current_file_size - fsize_dict[same_file]["mean_size"]) / fsize_dict[same_file]["mean_size"]
631
612
 
632
613
  if abs(size_difference_ratio) > tolerance_ratio:
633
- if check_nc(current_file):
634
- # print(f"File size is abnormal but can be opened normally, file size: {current_file_size:.2f} KB")
635
- # 文件可以正常打开,但大小异常,保留当前文件大小
614
+ if check_nc(current_file,print_messages=False):
636
615
  fsize_dict[same_file]["size_list"] = [current_file_size]
637
616
  fsize_dict[same_file]["mean_size"] = current_file_size
638
617
  else:
639
618
  _clear_existing_file(current_file)
640
- print(f"File size is abnormal, may need to be downloaded again, file size: {current_file_size:.2f} KB")
619
+ # print(f"File size is abnormal, may need to be downloaded again, file size: {current_file_size:.2f} KB")
641
620
  else:
642
- # 添加当前文件大小到列表中,并更新计数
643
621
  fsize_dict[same_file]["size_list"].append(current_file_size)
644
622
 
645
- # 返回调整后的平均值,这里根据您的需求,返回的是添加新值之前的平均值
646
623
  return fsize_dict[same_file]["mean_size"]
647
624
 
648
625
 
@@ -656,7 +633,6 @@ def _check_ftime(nc_file, tname="time", if_print=False):
656
633
  ds.close()
657
634
  real_time = str(real_time)[:13]
658
635
  real_time = real_time.replace("-", "").replace("T", "")
659
- # -----------------------------------------------------
660
636
  f_time = re.findall(r"\d{10}", nc_file)[0]
661
637
  if real_time == f_time:
662
638
  return True
@@ -671,91 +647,66 @@ def _check_ftime(nc_file, tname="time", if_print=False):
671
647
 
672
648
 
673
649
  def _correct_time(nc_file):
674
- # 打开NC文件
675
650
  dataset = nc.Dataset(nc_file)
676
-
677
- # 读取时间单位
678
651
  time_units = dataset.variables["time"].units
679
-
680
- # 关闭文件
681
652
  dataset.close()
682
-
683
- # 解析时间单位字符串以获取时间原点
684
653
  origin_str = time_units.split("since")[1].strip()
685
654
  origin_datetime = datetime.datetime.strptime(origin_str, "%Y-%m-%d %H:%M:%S")
686
-
687
- # 从文件名中提取日期字符串
688
655
  given_date_str = re.findall(r"\d{10}", str(nc_file))[0]
689
-
690
- # 将提取的日期字符串转换为datetime对象
691
656
  given_datetime = datetime.datetime.strptime(given_date_str, "%Y%m%d%H")
692
-
693
- # 计算给定日期与时间原点之间的差值(以小时为单位)
694
657
  time_difference = (given_datetime - origin_datetime).total_seconds()
695
658
  if "hours" in time_units:
696
659
  time_difference /= 3600
697
660
  elif "days" in time_units:
698
661
  time_difference /= 3600 * 24
699
-
700
- # 修改NC文件中的时间变量
701
662
  modify_nc(nc_file, "time", None, time_difference)
702
663
 
703
664
 
704
- def _download_file(target_url, store_path, file_name, check=False):
705
- # Check if the file exists
665
+ def _download_file(target_url, store_path, file_name, cover=False):
706
666
  fname = Path(store_path) / file_name
707
667
  file_name_split = file_name.split("_")
708
668
  file_name_split = file_name_split[:-1]
709
- # same_file = f"{file_name_split[0]}_{file_name_split[1]}*nc"
710
669
  same_file = "_".join(file_name_split) + "*nc"
711
670
 
712
671
  if match_time is not None:
713
- if check_nc(fname):
672
+ if check_nc(fname, print_messages=False):
714
673
  if not _check_ftime(fname, if_print=True):
715
674
  if match_time:
716
675
  _correct_time(fname)
717
676
  count_dict["skip"] += 1
718
677
  else:
719
678
  _clear_existing_file(fname)
720
- # print(f"[bold #ffe5c0]File time error, {fname}")
721
679
  count_dict["no_data"] += 1
722
680
  else:
723
681
  count_dict["skip"] += 1
724
682
  print(f"[bold green]{file_name} is correct")
725
683
  return
726
684
 
727
- if check:
728
- if same_file not in fsize_dict.keys(): # 对第一个文件单独进行检查,因为没有大小可以对比
729
- check_nc(fname, delete_switch=True)
685
+ if not cover and os.path.exists(fname):
686
+ print(f"[bold #FFA54F]{fname} exists, skipping ...")
687
+ count_dict["skip"] += 1
688
+ return
689
+
690
+ if same_file not in fsize_dict.keys():
691
+ check_nc(fname, delete_if_invalid=True, print_messages=False)
730
692
 
731
- # set_min_size = _get_mean_size30(store_path, same_file) # 原方案,只30次取平均值;若遇变化,无法判断
732
- get_mean_size = _get_mean_size_move(same_file, fname)
693
+ get_mean_size = _get_mean_size_move(same_file, fname)
694
+
695
+ if _check_existing_file(fname, get_mean_size):
696
+ count_dict["skip"] += 1
697
+ return
733
698
 
734
- if _check_existing_file(fname, get_mean_size):
735
- count_dict["skip"] += 1
736
- return
737
699
  _clear_existing_file(fname)
738
700
 
739
701
  if not use_idm:
740
- # -----------------------------------------------
741
702
  print(f"[bold #f0f6d0]Requesting {file_name} ...")
742
- # 创建会话
743
703
  s = requests.Session()
744
704
  download_success = False
745
705
  request_times = 0
746
706
 
747
707
  def calculate_wait_time(time_str, target_url):
748
- # 定义正则表达式,匹配YYYYMMDDHH格式的时间
749
708
  time_pattern = r"\d{10}"
750
-
751
- # 定义两个字符串
752
- # str1 = 'HYCOM_water_u_2018010100-2018010112.nc'
753
- # str2 = 'HYCOM_water_u_2018010100.nc'
754
-
755
- # 使用正则表达式查找时间
756
709
  times_in_str = re.findall(time_pattern, time_str)
757
-
758
- # 计算每个字符串中的时间数量
759
710
  num_times_str = len(times_in_str)
760
711
 
761
712
  if num_times_str > 1:
@@ -764,7 +715,6 @@ def _download_file(target_url, store_path, file_name, check=False):
764
715
  delta_t = delta_t / 3 + 1
765
716
  else:
766
717
  delta_t = 1
767
- # 单个要素最多等待5分钟,不宜太短,太短可能请求失败;也不宜太长,太长可能会浪费时间
768
718
  num_var = int(target_url.count("var="))
769
719
  if num_var <= 0:
770
720
  num_var = 1
@@ -773,31 +723,19 @@ def _download_file(target_url, store_path, file_name, check=False):
773
723
  max_timeout = calculate_wait_time(file_name, target_url)
774
724
  print(f"[bold #912dbc]Max timeout: {max_timeout} seconds")
775
725
 
776
- # print(f'Download_start_time: {datetime.datetime.now()}')
777
726
  download_time_s = datetime.datetime.now()
778
727
  order_list = ["1st", "2nd", "3rd", "4th", "5th", "6th", "7th", "8th", "9th", "10th"]
779
728
  while not download_success:
780
729
  if request_times >= 10:
781
- # print(f'下载失败,已重试 {request_times} 次\n可先跳过,后续再试')
782
730
  print(f"[bold #ffe5c0]Download failed after {request_times} times\nYou can skip it and try again later")
783
731
  count_dict["fail"] += 1
784
732
  break
785
733
  if request_times > 0:
786
- # print(f'\r正在重试第 {request_times} 次', end="")
787
734
  print(f"[bold #ffe5c0]Retrying the {order_list[request_times - 1]} time...")
788
- # 尝试下载文件
789
735
  try:
790
736
  headers = {"User-Agent": get_ua()}
791
- """ response = s.get(target_url, headers=headers, timeout=random.randint(5, max_timeout))
792
- response.raise_for_status() # 如果请求返回的不是200,将抛出HTTPError异常
793
-
794
- # 保存文件
795
- with open(filename, 'wb') as f:
796
- f.write(response.content) """
797
-
798
- response = s.get(target_url, headers=headers, stream=True, timeout=random.randint(5, max_timeout)) # 启用流式传输
799
- response.raise_for_status() # 如果请求返回的不是200,将抛出HTTPError异常
800
- # 保存文件
737
+ response = s.get(target_url, headers=headers, stream=True, timeout=random.randint(5, max_timeout))
738
+ response.raise_for_status()
801
739
  with open(fname, "wb") as f:
802
740
  print(f"[bold #96cbd7]Downloading {file_name} ...")
803
741
  for chunk in response.iter_content(chunk_size=1024):
@@ -806,14 +744,12 @@ def _download_file(target_url, store_path, file_name, check=False):
806
744
 
807
745
  f.close()
808
746
 
809
- # print(f'\r文件 {fname} 下载成功', end="")
810
747
  if os.path.exists(fname):
811
748
  download_success = True
812
749
  download_time_e = datetime.datetime.now()
813
750
  download_delta = download_time_e - download_time_s
814
751
  print(f"[#3dfc40]File [bold #dfff73]{fname} [#3dfc40]has been downloaded successfully, Time: [#39cbdd]{download_delta}")
815
752
  count_dict["success"] += 1
816
- # print(f'Download_end_time: {datetime.datetime.now()}')
817
753
 
818
754
  except requests.exceptions.HTTPError as errh:
819
755
  print(f"Http Error: {errh}")
@@ -833,7 +769,6 @@ def _download_file(target_url, store_path, file_name, check=False):
833
769
 
834
770
 
835
771
  def _check_hour_is_valid(ymdh_str):
836
- # hour should be 00, 03, 06, 09, 12, 15, 18, 21
837
772
  hh = int(str(ymdh_str[-2:]))
838
773
  if hh in [0, 3, 6, 9, 12, 15, 18, 21]:
839
774
  return True
@@ -849,18 +784,15 @@ def _check_dataset_version(dataset_name, version_name, download_time, download_t
849
784
  else:
850
785
  return None, None
851
786
 
852
- # 确保下载时间是一个字符串
853
787
  download_time_str = str(download_time)
854
788
 
855
789
  if len(download_time_str) == 8:
856
790
  download_time_str = download_time_str + "00"
857
791
 
858
- # 检查小时是否有效(如果需要的话)
859
792
  if download_time_end is None and not _check_hour_is_valid(download_time_str):
860
793
  print("Please ensure the hour is 00, 03, 06, 09, 12, 15, 18, 21")
861
794
  raise ValueError("The hour is invalid")
862
795
 
863
- # 根据是否检查整个天来设置时间范围
864
796
  if download_time_end is not None:
865
797
  if len(str(download_time_end)) == 8:
866
798
  download_time_end = str(download_time_end) + "21"
@@ -876,7 +808,6 @@ def _check_dataset_version(dataset_name, version_name, download_time, download_t
876
808
 
877
809
 
878
810
  def _get_submit_url_var(var, depth, level_num, lon_min, lon_max, lat_min, lat_max, dataset_name, version_name, download_time, download_time_end=None):
879
- # year_str = str(download_time)[:4]
880
811
  ymdh_str = str(download_time)
881
812
  if depth is not None and level_num is not None:
882
813
  print("Please ensure the depth or level_num is None")
@@ -889,14 +820,13 @@ def _get_submit_url_var(var, depth, level_num, lon_min, lon_max, lat_min, lat_ma
889
820
  print(f"Data of single level ({level_num}) will be downloaded...")
890
821
  which_mode = "level"
891
822
  else:
892
- # print("Full depth or full level data will be downloaded...")
893
823
  which_mode = "full"
894
824
  query_dict = _get_query_dict(var, lon_min, lon_max, lat_min, lat_max, download_time, download_time_end, which_mode, depth, level_num)
895
825
  submit_url = _get_submit_url(dataset_name, version_name, var, ymdh_str, query_dict)
896
826
  return submit_url
897
827
 
898
828
 
899
- def _prepare_url_to_download(var, lon_min=0, lon_max=359.92, lat_min=-80, lat_max=90, download_time="2024083100", download_time_end=None, depth=None, level_num=None, store_path=None, dataset_name=None, version_name=None, check=False):
829
+ def _prepare_url_to_download(var, lon_min=0, lon_max=359.92, lat_min=-80, lat_max=90, download_time="2024083100", download_time_end=None, depth=None, level_num=None, store_path=None, dataset_name=None, version_name=None, cover=False):
900
830
  print("[bold #ecdbfe]-" * mark_len)
901
831
  download_time = str(download_time)
902
832
  if download_time_end is not None:
@@ -921,8 +851,8 @@ def _prepare_url_to_download(var, lon_min=0, lon_max=359.92, lat_min=-80, lat_ma
921
851
  submit_url = _get_submit_url_var(var, depth, level_num, lon_min, lon_max, lat_min, lat_max, dataset_name, version_name, download_time, download_time_end)
922
852
  file_name = f"HYCOM_{variable_info[var]['var_name']}_{download_time}.nc"
923
853
  if download_time_end is not None:
924
- file_name = f"HYCOM_{variable_info[var]['var_name']}_{download_time}-{download_time_end}.nc" # 这里时间不能用下划线,不然后续处理查找同一变量文件会出问题
925
- _download_file(submit_url, store_path, file_name, check)
854
+ file_name = f"HYCOM_{variable_info[var]['var_name']}_{download_time}-{download_time_end}.nc"
855
+ _download_file(submit_url, store_path, file_name, cover)
926
856
  else:
927
857
  if download_time < "2024081012":
928
858
  varlist = [_ for _ in var]
@@ -943,18 +873,17 @@ def _prepare_url_to_download(var, lon_min=0, lon_max=359.92, lat_min=-80, lat_ma
943
873
  for v in current_group[1:]:
944
874
  new_str = f"{new_str}&var={variable_info[v]['var_name']}"
945
875
  submit_url = submit_url.replace(old_str, new_str)
946
- # file_name = f'HYCOM_{'-'.join([variable_info[v]["var_name"] for v in current_group])}_{download_time}.nc'
947
876
  file_name = f"HYCOM_{key}_{download_time}.nc"
948
877
  if download_time_end is not None:
949
- file_name = f"HYCOM_{key}_{download_time}-{download_time_end}.nc" # 这里时间不能用下划线,不然后续处理查找同一变量文件会出问题
950
- _download_file(submit_url, store_path, file_name, check)
878
+ file_name = f"HYCOM_{key}_{download_time}-{download_time_end}.nc"
879
+ _download_file(submit_url, store_path, file_name, cover)
951
880
  else:
952
881
  for v in var:
953
882
  submit_url = _get_submit_url_var(v, depth, level_num, lon_min, lon_max, lat_min, lat_max, dataset_name, version_name, download_time, download_time_end)
954
883
  file_name = f"HYCOM_{variable_info[v]['var_name']}_{download_time}.nc"
955
884
  if download_time_end is not None:
956
885
  file_name = f"HYCOM_{variable_info[v]['var_name']}_{download_time}-{download_time_end}.nc"
957
- _download_file(submit_url, store_path, file_name, check)
886
+ _download_file(submit_url, store_path, file_name, cover)
958
887
 
959
888
 
960
889
  def _convert_full_name_to_short_name(full_name):
@@ -967,214 +896,184 @@ def _convert_full_name_to_short_name(full_name):
967
896
  return False
968
897
 
969
898
 
970
- def _download_task(var, time_str, time_str_end, lon_min, lon_max, lat_min, lat_max, depth, level, store_path, dataset_name, version_name, check):
971
- """
972
- # 并行下载任务
973
- # 这个函数是为了并行下载而设置的,是必须的,直接调用direct_download并行下载会出问题
974
-
975
- 任务封装:将每个任务需要的数据和操作封装在一个函数中,这样每个任务都是独立的,不会相互干扰。
976
- 本情况下,download_task函数的作用是将每个下载任务封装起来,包括它所需的所有参数。
977
- 这样,每个任务都是独立的,有自己的参数和数据,不会与其他任务共享或修改任何数据。
978
- 因此,即使多个任务同时执行,也不会出现数据交互错乱的问题。
979
- """
980
-
981
- _prepare_url_to_download(var, lon_min, lon_max, lat_min, lat_max, time_str, time_str_end, depth, level, store_path, dataset_name, version_name, check)
899
+ def _download_task(var, time_str, time_str_end, lon_min, lon_max, lat_min, lat_max, depth, level, store_path, dataset_name, version_name, cover):
900
+ _prepare_url_to_download(var, lon_min, lon_max, lat_min, lat_max, time_str, time_str_end, depth, level, store_path, dataset_name, version_name, cover)
982
901
 
983
902
 
984
903
  def _done_callback(future, progress, task, total, counter_lock):
985
- """
986
- # 并行下载任务的回调函数
987
- # 这个函数是为了并行下载而设置的,是必须的,直接调用direct_download并行下载会出问题
988
-
989
- 回调函数:当一个任务完成后,会调用这个函数,这样可以及时更新进度条,显示任务的完成情况。
990
- 本情况下,done_callback函数的作用是当一个任务完成后,更新进度条的进度,显示任务的完成情况。
991
- 这样,即使多个任务同时执行,也可以及时看到每个任务的完成情况,不会等到所有任务都完成才显示。
992
- """
993
-
994
904
  global parallel_counter
995
905
  with counter_lock:
996
906
  parallel_counter += 1
997
907
  progress.update(task, advance=1, description=f"[cyan]{bar_desc} {parallel_counter}/{total}")
998
908
 
999
909
 
1000
- def _download_hourly_func(var, time_s, time_e, lon_min=0, lon_max=359.92, lat_min=-80, lat_max=90, depth=None, level=None, store_path=None, dataset_name=None, version_name=None, num_workers=None, check=False, ftimes=1, interval_hour=3):
1001
- """
1002
- Description:
1003
- Download the data of single time or a series of time
1004
-
1005
- Parameters:
1006
- var: str, the variable name, such as 'u', 'v', 'temp', 'salt', 'ssh', 'u_b', 'v_b', 'temp_b', 'salt_b' or 'water_u', 'water_v', 'water_temp', 'salinity', 'surf_el', 'water_u_bottom', 'water_v_bottom', 'water_temp_bottom', 'salinity_bottom'
1007
- time_s: str, the start time, such as '2024110100' or '20241101', if add hour, the hour should be 00, 03, 06, 09, 12, 15, 18, 21
1008
- time_e: str, the end time, such as '2024110221' or '20241102', if add hour, the hour should be 00, 03, 06, 09, 12, 15, 18, 21
1009
- lon_min: float, the minimum longitude, default is 0
1010
- lon_max: float, the maximum longitude, default is 359.92
1011
- lat_min: float, the minimum latitude, default is -80
1012
- lat_max: float, the maximum latitude, default is 90
1013
- depth: float, the depth, default is None
1014
- level: int, the level number, default is None
1015
- store_path: str, the path to store the data, default is None
1016
- dataset_name: str, the dataset name, default is None, example: 'GLBv0.08', 'GLBu0.08', 'GLBy0.08'
1017
- version_name: str, the version name, default is None, example: '53.X', '56.3'
1018
- num_workers: int, the number of workers, default is None
1019
-
1020
- Returns:
1021
- None
1022
- """
910
+ def _download_hourly_func(var, time_s, time_e, lon_min=0, lon_max=359.92, lat_min=-80, lat_max=90, depth=None, level=None, store_path=None, dataset_name=None, version_name=None, num_workers=None, cover=False, interval_hour=3):
1023
911
  ymdh_time_s, ymdh_time_e = str(time_s), str(time_e)
1024
- if num_workers is not None and num_workers > 1: # 如果使用多线程下载,用于进度条显示
912
+ if num_workers is not None and num_workers > 1:
1025
913
  global parallel_counter
1026
914
  parallel_counter = 0
1027
- counter_lock = Lock() # 创建一个锁,线程安全的计数器
915
+ counter_lock = Lock()
1028
916
  if ymdh_time_s == ymdh_time_e:
1029
- _prepare_url_to_download(var, lon_min, lon_max, lat_min, lat_max, ymdh_time_s, None, depth, level, store_path, dataset_name, version_name, check)
917
+ _prepare_url_to_download(var, lon_min, lon_max, lat_min, lat_max, ymdh_time_s, None, depth, level, store_path, dataset_name, version_name, cover)
1030
918
  elif int(ymdh_time_s) < int(ymdh_time_e):
1031
- print("Downloading a series of files...")
919
+ if match_time is None:
920
+ print("*" * mark_len)
921
+ print("Downloading a series of files...")
1032
922
  time_list = _get_time_list(ymdh_time_s, ymdh_time_e, interval_hour, "hour")
1033
923
  with Progress() as progress:
1034
924
  task = progress.add_task(f"[cyan]{bar_desc}", total=len(time_list))
1035
- if ftimes == 1:
1036
- if num_workers is None or num_workers <= 1:
1037
- # 串行方式
1038
- for i, time_str in enumerate(time_list):
1039
- _prepare_url_to_download(var, lon_min, lon_max, lat_min, lat_max, time_str, None, depth, level, store_path, dataset_name, version_name, check)
1040
- progress.update(task, advance=1, description=f"[cyan]{bar_desc} {i + 1}/{len(time_list)}")
1041
- else:
1042
- # 并行方式
1043
- with ThreadPoolExecutor(max_workers=num_workers) as executor:
1044
- futures = [executor.submit(_download_task, var, time_str, None, lon_min, lon_max, lat_min, lat_max, depth, level, store_path, dataset_name, version_name, check) for time_str in time_list]
1045
- """ for i, future in enumerate(futures):
1046
- future.add_done_callback(lambda _: progress.update(task, advance=1, description=f"[cyan]{bar_desc} {i+1}/{len(time_list)}")) """
1047
- for feature in as_completed(futures):
1048
- _done_callback(feature, progress, task, len(time_list), counter_lock)
925
+ if num_workers is None or num_workers <= 1:
926
+ for i, time_str in enumerate(time_list):
927
+ _prepare_url_to_download(var, lon_min, lon_max, lat_min, lat_max, time_str, None, depth, level, store_path, dataset_name, version_name, cover)
928
+ progress.update(task, advance=1, description=f"[cyan]{bar_desc} {i + 1}/{len(time_list)}")
1049
929
  else:
1050
- # new_time_list = get_time_list(ymdh_time_s, ymdh_time_e, 3 * ftimes, "hour")
1051
- new_time_list = _get_time_list(ymdh_time_s, ymdh_time_e, interval_hour * ftimes, "hour")
1052
- total_num = len(new_time_list)
1053
- if num_workers is None or num_workers <= 1:
1054
- # 串行方式
1055
- for i, time_str in enumerate(new_time_list):
1056
- time_str_end_index = int(min(len(time_list) - 1, int(i * ftimes + ftimes - 1)))
1057
- time_str_end = time_list[time_str_end_index]
1058
- _prepare_url_to_download(var, lon_min, lon_max, lat_min, lat_max, time_str, time_str_end, depth, level, store_path, dataset_name, version_name, check)
1059
- progress.update(task, advance=1, description=f"[cyan]{bar_desc} {i + 1}/{total_num}")
1060
- else:
1061
- # 并行方式
1062
- with ThreadPoolExecutor(max_workers=num_workers) as executor:
1063
- futures = [executor.submit(_download_task, var, new_time_list[i], time_list[int(min(len(time_list) - 1, int(i * ftimes + ftimes - 1)))], lon_min, lon_max, lat_min, lat_max, depth, level, store_path, dataset_name, version_name, check) for i in range(total_num)]
1064
- """ for i, future in enumerate(futures):
1065
- future.add_done_callback(lambda _: progress.update(task, advance=1, description=f"[cyan]{bar_desc} {i+1}/{total_num}")) """
1066
- for feature in as_completed(futures):
1067
- _done_callback(feature, progress, task, len(time_list), counter_lock)
930
+ with ThreadPoolExecutor(max_workers=num_workers) as executor:
931
+ futures = [executor.submit(_download_task, var, time_str, None, lon_min, lon_max, lat_min, lat_max, depth, level, store_path, dataset_name, version_name, cover) for time_str in time_list]
932
+ for feature in as_completed(futures):
933
+ _done_callback(feature, progress, task, len(time_list), counter_lock)
1068
934
  else:
1069
935
  print("[bold red]Please ensure the time_s is no more than time_e")
1070
936
 
1071
937
 
1072
- def download(var, time_s, time_e=None, lon_min=0, lon_max=359.92, lat_min=-80, lat_max=90, depth=None, level=None, store_path=None, dataset_name=None, version_name=None, num_workers=None, check=False, ftimes=1, idm_engine=None, fill_time=None, interval_hour=3):
938
+ def download(
939
+ variables,
940
+ start_time,
941
+ end_time=None,
942
+ lon_min=0,
943
+ lon_max=359.92,
944
+ lat_min=-80,
945
+ lat_max=90,
946
+ depth=None,
947
+ level=None,
948
+ output_dir=None,
949
+ dataset=None,
950
+ version=None,
951
+ workers=None,
952
+ overwrite=False,
953
+ idm_path=None,
954
+ validate_time=None,
955
+ interval_hours=3,
956
+ ):
1073
957
  """
1074
- Description:
1075
- Download the data of single time or a series of time
958
+ Download data for a single time or a series of times.
1076
959
 
1077
960
  Parameters:
1078
- var: str or list, the variable name, such as 'u', 'v', 'temp', 'salt', 'ssh', 'u_b', 'v_b', 'temp_b', 'salt_b' or 'water_u', 'water_v', 'water_temp', 'salinity', 'surf_el', 'water_u_bottom', 'water_v_bottom', 'water_temp_bottom', 'salinity_bottom'
1079
- time_s: str, the start time, such as '2024110100' or '20241101', if add hour, the hour should be 00, 03, 06, 09, 12, 15, 18, 21
1080
- time_e: str, the end time, such as '2024110221' or '20241102', if add hour, the hour should be 00, 03, 06, 09, 12, 15, 18, 21; default is None, if not set, the data of single time will be downloaded; or same as time_s, the data of single time will be downloaded
1081
- lon_min: float, the minimum longitude, default is 0
1082
- lon_max: float, the maximum longitude, default is 359.92
1083
- lat_min: float, the minimum latitude, default is -80
1084
- lat_max: float, the maximum latitude, default is 90
1085
- depth: float, the depth, default is None, if you wanna get the data of single depth, you can set the depth, suggest to set the depth in [0, 5000]
1086
- level: int, the level number, default is None, if you wanna get the data of single level, you can set the level, suggest to set the level in [1, 40]
1087
- store_path: str, the path to store the data, default is None, if not set, the data will be stored in the current working directory
1088
- dataset_name: str, the dataset name, default is None, example: 'GLBv0.08', 'GLBu0.08', 'GLBy0.08', if not set, the dataset will be chosen according to the download_time
1089
- version_name: str, the version name, default is None, example: '53.X', '56.3', if not set, the version will be chosen according to the download_time
1090
- num_workers: int, the number of workers, default is None, if not set, the number of workers will be 1; suggest not to set the number of workers too large
1091
- check: bool, whether to check the existing file, default is False, if set to True, the existing file will be checked and not downloaded again; else, the existing file will be covered
1092
- ftimes: int, the number of time in one file, default is 1, if set to 1, the data of single time will be downloaded; the maximum is 8, if set to 8, the data of 8 times will be downloaded in one file
1093
- idm_engine: str, the IDM engine, default is None, if set, the IDM will be used to download the data; example: "D:\\Programs\\Internet Download Manager\\IDMan.exe"
1094
- fill_time: bool or None, the mode to fill the time, default is None. None: only download the data; True: modify the real time of data to the time in the file name; False: check the time in the file name and the real time of data, if not match, delete the file
1095
- interval_hour: int, the interval time to download the data, default is 3, if set, the interval time will be used to download the data; example: 3, 6, ...
961
+ variables (str or list): Variable names to download. Examples include:
962
+ 'u', 'v', 'temp', 'salt', 'ssh', 'u_b', 'v_b', 'temp_b', 'salt_b'
963
+ or their full names like 'water_u', 'water_v', etc.
964
+ start_time (str): Start time in the format 'YYYYMMDDHH' or 'YYYYMMDD'.
965
+ If hour is included, it must be one of [00, 03, 06, 09, 12, 15, 18, 21].
966
+ end_time (str, optional): End time in the format 'YYYYMMDDHH' or 'YYYYMMDD'.
967
+ If not provided, only data for the start_time will be downloaded.
968
+ lon_min (float, optional): Minimum longitude. Default is 0.
969
+ lon_max (float, optional): Maximum longitude. Default is 359.92.
970
+ lat_min (float, optional): Minimum latitude. Default is -80.
971
+ lat_max (float, optional): Maximum latitude. Default is 90.
972
+ depth (float, optional): Depth in meters. If specified, data for a single depth
973
+ will be downloaded. Suggested range: [0, 5000].
974
+ level (int, optional): Vertical level number. If specified, data for a single
975
+ level will be downloaded. Suggested range: [1, 40].
976
+ output_dir (str, optional): Directory to save downloaded files. If not provided,
977
+ files will be saved in the current working directory.
978
+ dataset (str, optional): Dataset name. Examples: 'GLBv0.08', 'GLBu0.08', etc.
979
+ If not provided, the dataset will be chosen based on the time range.
980
+ version (str, optional): Dataset version. Examples: '53.X', '56.3', etc.
981
+ If not provided, the version will be chosen based on the time range.
982
+ workers (int, optional): Number of parallel workers. Default is 1. Maximum is 10.
983
+ overwrite (bool, optional): Whether to overwrite existing files. Default is False.
984
+ idm_path (str, optional): Path to the Internet Download Manager (IDM) executable.
985
+ If provided, IDM will be used for downloading.
986
+ validate_time (bool, optional): Time validation mode. Default is None.
987
+ - None: Only download data.
988
+ - True: Modify the real time in the data to match the file name.
989
+ - False: Check if the real time matches the file name. If not, delete the file.
990
+ interval_hours (int, optional): Time interval in hours for downloading data.
991
+ Default is 3. Examples: 3, 6, etc.
1096
992
 
1097
993
  Returns:
1098
994
  None
995
+
996
+ Example:
997
+ >>> download(
998
+ variables='u',
999
+ start_time='2024083100',
1000
+ end_time='2024090100',
1001
+ lon_min=0,
1002
+ lon_max=359.92,
1003
+ lat_min=-80,
1004
+ lat_max=90,
1005
+ depth=None,
1006
+ level=None,
1007
+ output_dir=None,
1008
+ dataset=None,
1009
+ version=None,
1010
+ workers=4,
1011
+ overwrite=False,
1012
+ idm_path=None,
1013
+ validate_time=None,
1014
+ interval_hours=3,
1015
+ )
1099
1016
  """
1100
1017
  from oafuncs.oa_tool import pbar
1101
- from oafuncs.oa_cmap import get as get_cmap
1102
-
1018
+
1103
1019
  _get_initial_data()
1104
1020
 
1105
- # 打印信息并处理数据集和版本名称
1106
- if dataset_name is None and version_name is None:
1107
- print("The dataset_name and version_name are None, so the dataset and version will be chosen according to the download_time.\nIf there is more than one dataset and version in the time range, the first one will be chosen.")
1108
- print("If you wanna choose the dataset and version by yourself, please set the dataset_name and version_name together.")
1109
- elif dataset_name is None and version_name is not None:
1110
- print("Please ensure the dataset_name is not None")
1111
- print("If you do not add the dataset_name, both the dataset and version will be chosen according to the download_time.")
1112
- elif dataset_name is not None and version_name is None:
1113
- print("Please ensure the version_name is not None")
1114
- print("If you do not add the version_name, both the dataset and version will be chosen according to the download_time.")
1021
+ if dataset is None and version is None:
1022
+ if validate_time is None:
1023
+ print("Dataset and version will be chosen based on the time range.")
1024
+ print("If multiple datasets or versions exist, the latest one will be used.")
1025
+ elif dataset is None:
1026
+ print("Please provide a dataset name if specifying a version.")
1027
+ elif version is None:
1028
+ print("Please provide a version if specifying a dataset name.")
1115
1029
  else:
1116
- print("The dataset_name and version_name are both set by yourself.")
1117
- print("Please ensure the dataset_name and version_name are correct.")
1030
+ print("Using the specified dataset and version.")
1118
1031
 
1119
- if isinstance(var, list):
1120
- if len(var) == 1:
1121
- var = _convert_full_name_to_short_name(var[0])
1032
+ if isinstance(variables, list):
1033
+ if len(variables) == 1:
1034
+ variables = _convert_full_name_to_short_name(variables[0])
1122
1035
  else:
1123
- var = [_convert_full_name_to_short_name(v) for v in var]
1124
- elif isinstance(var, str):
1125
- var = _convert_full_name_to_short_name(var)
1036
+ variables = [_convert_full_name_to_short_name(v) for v in variables]
1037
+ elif isinstance(variables, str):
1038
+ variables = _convert_full_name_to_short_name(variables)
1126
1039
  else:
1127
- raise ValueError("The var is invalid")
1128
- if var is False:
1129
- raise ValueError("The var is invalid")
1130
- if lon_min < 0 or lon_min > 359.92 or lon_max < 0 or lon_max > 359.92 or lat_min < -80 or lat_min > 90 or lat_max < -80 or lat_max > 90:
1131
- print("Please ensure the lon_min, lon_max, lat_min, lat_max are in the range")
1132
- print("The range of lon_min, lon_max is 0~359.92")
1133
- print("The range of lat_min, lat_max is -80~90")
1134
- raise ValueError("The lon or lat is invalid")
1135
-
1136
- if ftimes != 1:
1137
- print("Please ensure the ftimes is in [1, 8]")
1138
- ftimes = max(min(ftimes, 8), 1)
1139
-
1140
- if store_path is None:
1141
- store_path = str(Path.cwd())
1040
+ raise ValueError("Invalid variable(s) provided.")
1041
+ if variables is False:
1042
+ raise ValueError("Invalid variable(s) provided.")
1043
+ if not (0 <= lon_min <= 359.92 and 0 <= lon_max <= 359.92 and -80 <= lat_min <= 90 and -80 <= lat_max <= 90):
1044
+ raise ValueError("Longitude or latitude values are out of range.")
1045
+
1046
+ if output_dir is None:
1047
+ output_dir = str(Path.cwd())
1142
1048
  else:
1143
- os.makedirs(str(store_path), exist_ok=True)
1144
-
1145
- if num_workers is not None:
1146
- num_workers = max(min(num_workers, 10), 1) # 暂时不限制最大值,再检查的时候可以多开一些线程
1147
- # num_workers = int(max(num_workers, 1))
1148
- time_s = str(time_s)
1149
- if len(time_s) == 8:
1150
- time_s += "00"
1151
- if time_e is None:
1152
- time_e = time_s[:]
1049
+ os.makedirs(output_dir, exist_ok=True)
1050
+
1051
+ if workers is not None:
1052
+ workers = max(min(workers, 10), 1)
1053
+ start_time = str(start_time)
1054
+ if len(start_time) == 8:
1055
+ start_time += "00"
1056
+ if end_time is None:
1057
+ end_time = start_time[:]
1153
1058
  else:
1154
- time_e = str(time_e)
1155
- if len(time_e) == 8:
1156
- time_e += "21"
1059
+ end_time = str(end_time)
1060
+ if len(end_time) == 8:
1061
+ end_time += "21"
1157
1062
 
1158
1063
  global count_dict
1159
1064
  count_dict = {"success": 0, "fail": 0, "skip": 0, "no_data": 0, "total": 0, "no_data_list": []}
1160
1065
 
1161
- """ global current_platform
1162
- current_platform = platform.system() """
1163
-
1164
1066
  global fsize_dict
1165
1067
  fsize_dict = {}
1166
1068
 
1167
1069
  global fsize_dict_lock
1168
1070
  fsize_dict_lock = Lock()
1169
1071
 
1170
- if fill_time is not None:
1171
- num_workers = 1
1172
-
1173
1072
  global use_idm, given_idm_engine, idm_download_list, bar_desc
1174
- if idm_engine is not None:
1073
+ if idm_path is not None:
1175
1074
  use_idm = True
1176
- num_workers = 1
1177
- given_idm_engine = idm_engine
1075
+ workers = 1
1076
+ given_idm_engine = idm_path
1178
1077
  idm_download_list = []
1179
1078
  bar_desc = "Submitting to IDM ..."
1180
1079
  else:
@@ -1182,51 +1081,46 @@ def download(var, time_s, time_e=None, lon_min=0, lon_max=359.92, lat_min=-80, l
1182
1081
  bar_desc = "Downloading ..."
1183
1082
 
1184
1083
  global match_time
1185
- match_time = fill_time
1084
+ match_time = validate_time
1186
1085
 
1187
1086
  global mark_len
1188
1087
  mark_len = 100
1189
1088
 
1190
- _download_hourly_func(var, time_s, time_e, lon_min, lon_max, lat_min, lat_max, depth, level, store_path, dataset_name, version_name, num_workers, check, ftimes, int(interval_hour))
1191
-
1192
- if idm_engine is not None:
1089
+ if validate_time is not None:
1090
+ workers = 1
1091
+ print('*' * mark_len)
1092
+ print("[bold red]Only checking the time of existing files.")
1093
+ bar_desc = "Checking time ..."
1094
+
1095
+ _download_hourly_func(
1096
+ variables,
1097
+ start_time,
1098
+ end_time,
1099
+ lon_min,
1100
+ lon_max,
1101
+ lat_min,
1102
+ lat_max,
1103
+ depth,
1104
+ level,
1105
+ output_dir,
1106
+ dataset,
1107
+ version,
1108
+ workers,
1109
+ overwrite,
1110
+ int(interval_hours),
1111
+ )
1112
+
1113
+ if idm_path is not None:
1193
1114
  print("[bold #ecdbfe]*" * mark_len)
1194
- str_info = "All files have been submitted to IDM for downloading"
1195
- str_info = str_info.center(mark_len, "*")
1196
- print(f"[bold #3dfc40]{str_info}")
1115
+ print(f"[bold #3dfc40]{'All files have been submitted to IDM for downloading'.center(mark_len, '*')}")
1197
1116
  print("[bold #ecdbfe]*" * mark_len)
1198
1117
  if idm_download_list:
1199
- """ file_download_time = 60 # 预设下载时间为1分钟
1200
- for f in pbar(idm_download_list,cmap='bwr',prefix='HYCOM: '):
1201
- file_download_start_time = time.time()
1202
- wait_success = 0
1203
- success = False
1204
- while not success:
1205
- if check_nc(f,print_switch=False):
1206
- count_dict["success"] += 1
1207
- success = True
1208
- # print(f"[bold #3dfc40]File [bold #dfff73]{f} [#3dfc40]has been downloaded successfully")
1209
- file_download_end_time = time.time()
1210
- file_download_time = file_download_end_time - file_download_start_time
1211
- file_download_time = int(file_download_time)
1212
- # print(f"[bold #3dfc40]Time: {file_download_time} seconds")
1213
- file_download_time = max(60, file_download_time) # 预设下载时间为1分钟起步
1214
- else:
1215
- wait_success += 1
1216
- # print(f"[bold #ffe5c0]Waiting {file_download_time} seconds to check the file {f}...")
1217
- time.sleep(file_download_time)
1218
- if wait_success >= 10:
1219
- success = True
1220
- # print(f'{f} download failed')
1221
- print(f"[bold #ffe5c0]Waiting for more than 10 times, skipping the file {f}...")
1222
- count_dict["fail"] += 1
1223
- # print("[bold #ecdbfe]-" * mark_len) """
1224
1118
  remain_list = idm_download_list.copy()
1225
- for f_count in pbar(range(len(idm_download_list)), cmap=get_cmap('diverging_1'), prefix="HYCOM: "):
1119
+ for _ in pbar(range(len(idm_download_list)), cmap="diverging_1", description="Downloading: "):
1226
1120
  success = False
1227
1121
  while not success:
1228
1122
  for f in remain_list:
1229
- if check_nc(f, print_switch=False):
1123
+ if check_nc(f, print_messages=False):
1230
1124
  count_dict["success"] += 1
1231
1125
  success = True
1232
1126
  remain_list.remove(f)
@@ -1237,12 +1131,9 @@ def download(var, time_s, time_e=None, lon_min=0, lon_max=359.92, lat_min=-80, l
1237
1131
  print(f"[bold #ff80ab]Total: {count_dict['total']}\nSuccess: {count_dict['success']}\nFail: {count_dict['fail']}\nSkip: {count_dict['skip']}\nNo data: {count_dict['no_data']}")
1238
1132
  print("[bold #ecdbfe]=" * mark_len)
1239
1133
  if count_dict["fail"] > 0:
1240
- print("[bold #be5528]Please try again to download the failed data later")
1134
+ print("[bold #be5528]Please try again to download the failed data later.")
1241
1135
  if count_dict["no_data"] > 0:
1242
- if count_dict["no_data"] == 1:
1243
- print(f"[bold #f90000]There is {count_dict['no_data']} data that does not exist in any dataset and version")
1244
- else:
1245
- print(f"[bold #f90000]These are {count_dict['no_data']} data that do not exist in any dataset and version")
1136
+ print(f"[bold #f90000]{count_dict['no_data']} data entries do not exist in any dataset or version.")
1246
1137
  for no_data in count_dict["no_data_list"]:
1247
1138
  print(f"[bold #d81b60]{no_data}")
1248
1139
  print("[bold #ecdbfe]=" * mark_len)
@@ -1265,24 +1156,22 @@ if __name__ == "__main__":
1265
1156
 
1266
1157
  single_var = False
1267
1158
 
1268
- # draw_time_range(pic_save_folder=r'I:\Delete')
1269
-
1270
1159
  options = {
1271
- "var": var_list,
1272
- "time_s": "2025010300",
1273
- "time_e": "2025010321",
1274
- "store_path": r"I:\Data\HYCOM\3hourly",
1160
+ "variables": var_list,
1161
+ "start_time": "2025010300",
1162
+ "end_time": "2025010309",
1163
+ "output_dir": r"I:\Data\HYCOM\3hourly_test",
1275
1164
  "lon_min": 105,
1276
1165
  "lon_max": 130,
1277
1166
  "lat_min": 15,
1278
1167
  "lat_max": 45,
1279
- "num_workers": 3,
1280
- "check": True,
1281
- "depth": None, # or 0-5000 meters
1282
- "level": None, # or 1-40 levels
1283
- "ftimes": 1,
1284
- # "idm_engine": r"D:\Programs\Internet Download Manager\IDMan.exe", # 查漏补缺不建议开启
1285
- "fill_time": None,
1168
+ "workers": 1,
1169
+ "overwrite": False,
1170
+ "depth": None,
1171
+ "level": None,
1172
+ "validate_time": True,
1173
+ "idm_path": r'D:\Programs\Internet Download Manager\IDMan.exe',
1174
+ "interval_hours": 3,
1286
1175
  }
1287
1176
 
1288
1177
  if single_var: