oafuncs 0.0.78__py2.py3-none-any.whl → 0.0.80__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39)
  1. oafuncs/__init__.py +15 -12
  2. oafuncs/oa_down/User_Agent-list.txt +13 -2837
  3. oafuncs/oa_down/hycom_3hourly.py +184 -63
  4. oafuncs/oa_down/test.py +52 -28
  5. oafuncs/oa_down/test_ua.py +151 -0
  6. oafuncs/oa_draw.py +45 -1
  7. oafuncs/oa_s/__init__.py +23 -0
  8. oafuncs/oa_s/oa_cmap.py +163 -0
  9. oafuncs/oa_s/oa_data.py +187 -0
  10. oafuncs/oa_s/oa_draw.py +451 -0
  11. oafuncs/oa_s/oa_file.py +332 -0
  12. oafuncs/oa_s/oa_help.py +39 -0
  13. oafuncs/oa_s/oa_nc.py +410 -0
  14. oafuncs/oa_s/oa_python.py +107 -0
  15. oafuncs - /321/205/320/231/320/277/321/206/320/254/320/274/__init__.py +26 -0
  16. oafuncs - /321/205/320/231/320/277/321/206/320/254/320/274/oa_cmap.py +163 -0
  17. oafuncs - /321/205/320/231/320/277/321/206/320/254/320/274/oa_data.py +187 -0
  18. oafuncs - /321/205/320/231/320/277/321/206/320/254/320/274/oa_down/__init__.py +20 -0
  19. oafuncs - /321/205/320/231/320/277/321/206/320/254/320/274/oa_down/hycom_3hourly.py +1176 -0
  20. oafuncs - /321/205/320/231/320/277/321/206/320/254/320/274/oa_down/literature.py +332 -0
  21. oafuncs - /321/205/320/231/320/277/321/206/320/254/320/274/oa_down/test_ua.py +151 -0
  22. oafuncs - /321/205/320/231/320/277/321/206/320/254/320/274/oa_draw.py +451 -0
  23. oafuncs - /321/205/320/231/320/277/321/206/320/254/320/274/oa_file.py +332 -0
  24. oafuncs - /321/205/320/231/320/277/321/206/320/254/320/274/oa_help.py +39 -0
  25. oafuncs - /321/205/320/231/320/277/321/206/320/254/320/274/oa_nc.py +410 -0
  26. oafuncs - /321/205/320/231/320/277/321/206/320/254/320/274/oa_python.py +107 -0
  27. oafuncs - /321/205/320/231/320/277/321/206/320/254/320/274/oa_sign/__init__.py +21 -0
  28. oafuncs - /321/205/320/231/320/277/321/206/320/254/320/274/oa_sign/meteorological.py +168 -0
  29. oafuncs - /321/205/320/231/320/277/321/206/320/254/320/274/oa_sign/ocean.py +158 -0
  30. oafuncs - /321/205/320/231/320/277/321/206/320/254/320/274/oa_sign/scientific.py +139 -0
  31. oafuncs - /321/205/320/231/320/277/321/206/320/254/320/274/oa_tool/__init__.py +18 -0
  32. oafuncs - /321/205/320/231/320/277/321/206/320/254/320/274/oa_tool/email.py +114 -0
  33. {oafuncs-0.0.78.dist-info → oafuncs-0.0.80.dist-info}/METADATA +1 -1
  34. oafuncs-0.0.80.dist-info/RECORD +51 -0
  35. oafuncs-0.0.80.dist-info/top_level.txt +2 -0
  36. oafuncs-0.0.78.dist-info/RECORD +0 -24
  37. oafuncs-0.0.78.dist-info/top_level.txt +0 -1
  38. {oafuncs-0.0.78.dist-info → oafuncs-0.0.80.dist-info}/LICENSE.txt +0 -0
  39. {oafuncs-0.0.78.dist-info → oafuncs-0.0.80.dist-info}/WHEEL +0 -0
oafuncs/oa_down/hycom_3hourly.py CHANGED
@@ -18,13 +18,15 @@ import os
  import random
  import time
  import warnings
- from concurrent.futures import ThreadPoolExecutor
+ from concurrent.futures import ThreadPoolExecutor, as_completed
  from pathlib import Path
+ from threading import Lock

  import matplotlib.pyplot as plt
  import numpy as np
  import pandas as pd
  import requests
+ from bs4 import BeautifulSoup
  from rich import print
  from rich.progress import Progress

@@ -53,17 +55,17 @@ data_info["hourly"]["dataset"]["GLBy0.08"]["version"] = {"93.0": {}}
  # Submitting a time beyond the valid range on the web page returns the dataset's actual time range, which is used to correct the ranges below
  # So far only the GLBv0.08 93.0 time range had been corrected, down to the hour
  # Hours for the other datasets default to starting at 00 and ending at 21
- data_info["hourly"]["dataset"]["GLBv0.08"]["version"]["53.X"]["time_range"] = {"time_start": "19940101", "time_end": "20151231"}
- data_info["hourly"]["dataset"]["GLBv0.08"]["version"]["56.3"]["time_range"] = {"time_start": "20140701", "time_end": "20160430"}
- data_info["hourly"]["dataset"]["GLBv0.08"]["version"]["57.2"]["time_range"] = {"time_start": "20160501", "time_end": "20170131"}
- data_info["hourly"]["dataset"]["GLBv0.08"]["version"]["92.8"]["time_range"] = {"time_start": "20170201", "time_end": "20170531"}
- data_info["hourly"]["dataset"]["GLBv0.08"]["version"]["57.7"]["time_range"] = {"time_start": "20170601", "time_end": "20170930"}
- data_info["hourly"]["dataset"]["GLBv0.08"]["version"]["92.9"]["time_range"] = {"time_start": "20171001", "time_end": "20171231"}
+ data_info["hourly"]["dataset"]["GLBv0.08"]["version"]["53.X"]["time_range"] = {"time_start": "1994010112", "time_end": "2015123109"}
+ data_info["hourly"]["dataset"]["GLBv0.08"]["version"]["56.3"]["time_range"] = {"time_start": "2014070112", "time_end": "2016093009"}
+ data_info["hourly"]["dataset"]["GLBv0.08"]["version"]["57.2"]["time_range"] = {"time_start": "2016050112", "time_end": "2017020109"}
+ data_info["hourly"]["dataset"]["GLBv0.08"]["version"]["92.8"]["time_range"] = {"time_start": "2017020112", "time_end": "2017060109"}
+ data_info["hourly"]["dataset"]["GLBv0.08"]["version"]["57.7"]["time_range"] = {"time_start": "2017060112", "time_end": "2017100109"}
+ data_info["hourly"]["dataset"]["GLBv0.08"]["version"]["92.9"]["time_range"] = {"time_start": "2017100112", "time_end": "2018032009"}
  data_info["hourly"]["dataset"]["GLBv0.08"]["version"]["93.0"]["time_range"] = {"time_start": "2018010112", "time_end": "2020021909"}
  # GLBu0.08
- data_info["hourly"]["dataset"]["GLBu0.08"]["version"]["93.0"]["time_range"] = {"time_start": "20180919", "time_end": "20181208"}
+ data_info["hourly"]["dataset"]["GLBu0.08"]["version"]["93.0"]["time_range"] = {"time_start": "2018091912", "time_end": "2018120909"}
  # GLBy0.08
- data_info["hourly"]["dataset"]["GLBy0.08"]["version"]["93.0"]["time_range"] = {"time_start": "20181204", "time_end": "20300904"}
+ data_info["hourly"]["dataset"]["GLBy0.08"]["version"]["93.0"]["time_range"] = {"time_start": "2018120412", "time_end": "20300904"}

  # variable
  variable_info = {
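The corrected ranges above are full YYYYMMDDHH strings rather than bare dates, so a requested timestamp can be validated by plain string comparison: zero-padded digit strings of equal length sort chronologically. A minimal sketch of such a check (the helper name and the day-only fallback are illustrative, following the 00-start/21-end default noted above):

    def in_time_range(ymdh: str, time_range: dict) -> bool:
        # Equal-length, zero-padded YYYYMMDDHH strings compare chronologically,
        # so no datetime parsing is needed.
        start, end = time_range["time_start"], time_range["time_end"]
        if len(start) == 8:  # day-only ranges default to hour 00 at the start...
            start += "00"
        if len(end) == 8:  # ...and hour 21 at the end
            end += "21"
        return start <= ymdh <= end

    # Against the GLBv0.08 93.0 range above:
    # in_time_range("2019060112", {"time_start": "2018010112", "time_end": "2020021909"})  # True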
@@ -139,10 +141,11 @@ data_info["hourly"]["dataset"]["GLBu0.08"]["version"]["93.0"]["url"] = url_930_u
  uv3z_930_y = {}
  ts3z_930_y = {}
  ssh_930_y = {}
- for y_930_y in range(2018, 2025):
+ for y_930_y in range(2018, 2030):
      uv3z_930_y[str(y_930_y)] = rf"https://ncss.hycom.org/thredds/ncss/GLBy0.08/expt_93.0/uv3z/{y_930_y}?"
      ts3z_930_y[str(y_930_y)] = rf"https://ncss.hycom.org/thredds/ncss/GLBy0.08/expt_93.0/ts3z/{y_930_y}?"
      ssh_930_y[str(y_930_y)] = rf"https://ncss.hycom.org/thredds/ncss/GLBy0.08/expt_93.0/ssh/{y_930_y}?"
+ # GLBy0.08 93.0 data time range in each year: year-01-01 12:00 to year+1-01-01 09:00
  url_930_y = {
      "uv3z": uv3z_930_y,
      "ts3z": ts3z_930_y,
@@ -370,7 +373,16 @@ def check_time_in_dataset_and_version(time_input, time_end=None):
      if have_data:
          for d, v, trange in zip(d_list, v_list, trange_list):
              print(f"[bold blue]{d} {v} {trange}")
-         return True
+         if is_single_time:
+             return True
+         else:
+             base_url_s = get_base_url(d_list[0], v_list[0], "u", str(time_start))
+             base_url_e = get_base_url(d_list[0], v_list[0], "u", str(time_end))
+             if base_url_s == base_url_e:
+                 return True
+             else:
+                 print(f"[bold red]{time_start} to {time_end} is in different datasets or versions, so you can't download them together")
+                 return False
      else:
          print(f"[bold red]{time_input_str} is not in any dataset and version")
          return False
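The new multi-time branch above probes get_base_url with an arbitrary variable ("u") at both endpoints: if the two timestamps resolve to the same base URL, one dataset/version/yearly file covers the whole range and a single request is safe. Condensed as a predicate (a sketch, assuming this module's get_base_url):

    def same_source(dataset, version, time_start, time_end):
        # Any variable works as a probe; equal base URLs mean both endpoints
        # fall in the same dataset, version, and yearly file.
        return get_base_url(dataset, version, "u", str(time_start)) == get_base_url(dataset, version, "u", str(time_end))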
@@ -454,7 +466,8 @@ def direct_choose_dataset_and_version(time_input, time_end=None):
      return dataset_name_out, version_name_out


- def get_base_url(dataset_name, version_name, var, year_str):
+ def get_base_url(dataset_name, version_name, var, ymdh_str):
+     year_str = int(ymdh_str[:4])
      url_dict = data_info["hourly"]["dataset"][dataset_name]["version"][version_name]["url"]
      classification_method = data_info["hourly"]["dataset"][dataset_name]["version"][version_name]["classification"]
      if classification_method == "year_different":
@@ -470,6 +483,12 @@ def get_base_url(dataset_name, version_name, var, year_str):
          if base_url is None:
              print("Please ensure the var is in [u,v,temp,salt,ssh,u_b,v_b,temp_b,salt_b]")
      elif classification_method == "var_year_different":
+         if dataset_name == "GLBy0.08" and version_name == "93.0":
+             mdh_str = ymdh_str[4:]
+             # GLBy0.08 93.0
+             # data time range in each year: year-01-01 12:00 to year+1-01-01 09:00
+             if mdh_str <= "010109":
+                 year_str = int(ymdh_str[:4]) - 1
          base_url = None
          for key, value in var_group.items():
              if var in value:
@@ -480,8 +499,8 @@ def get_base_url(dataset_name, version_name, var, year_str):
      return base_url


- def get_submit_url(dataset_name, version_name, var, year_str, query_dict):
-     base_url = get_base_url(dataset_name, version_name, var, year_str)
+ def get_submit_url(dataset_name, version_name, var, ymdh_str, query_dict):
+     base_url = get_base_url(dataset_name, version_name, var, ymdh_str)
      if isinstance(query_dict["var"], str):
          query_dict["var"] = [query_dict["var"]]
      target_url = base_url + "&".join(f"var={var}" for var in query_dict["var"]) + "&" + "&".join(f"{key}={value}" for key, value in query_dict.items() if key != "var")
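The GLBy0.08 93.0 special case above exists because each yearly file actually spans year-01-01 12:00 through year+1-01-01 09:00, so timestamps whose month-day-hour suffix is at or before 010109 belong to the previous year's file. Isolated as a standalone helper (illustrative name, same logic as the hunk):

    def resolve_glby_year(ymdh_str: str) -> int:
        # Yearly GLBy0.08/93.0 files run from Jan 1 12:00 of their year
        # to Jan 1 09:00 of the next, so early-January hours map back a year.
        year = int(ymdh_str[:4])
        if ymdh_str[4:] <= "010109":
            year -= 1
        return year

    # resolve_glby_year("2023010106")  # -> 2022, still in the 2022 file
    # resolve_glby_year("2023010112")  # -> 2023, first record of the 2023 file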
@@ -494,10 +513,37 @@ def clear_existing_file(file_full_path):
          print(f"{file_full_path} has been removed")


+ def _get_file_size(file_path, unit="KB"):
+     # Check whether the file exists
+     if not os.path.exists(file_path):
+         return "File does not exist"
+
+     # Get the file size in bytes
+     file_size = os.path.getsize(file_path)
+
+     # Unit conversion table
+     unit_dict = {"PB": 1024**5, "TB": 1024**4, "GB": 1024**3, "MB": 1024**2, "KB": 1024}
+
+     # Validate the requested unit
+     if unit not in unit_dict:
+         return "Invalid unit; please choose one of PB, TB, GB, MB, KB"
+
+     # Convert the file size to the requested unit
+     converted_size = file_size / unit_dict[unit]
+
+     return converted_size
+
+
  def check_existing_file(file_full_path):
      if os.path.exists(file_full_path):
          print(f"[bold #FFA54F]{file_full_path} exists")
-         return True
+         fsize = _get_file_size(file_full_path)
+         if fsize < 5:
+             print(f"[bold #FFA54F]{file_full_path} may be incomplete\nFile size: {fsize:.2f} KB")
+             # clear_existing_file(file_full_path)
+             return False
+         else:
+             return True
      else:
          # print(f'{file_full_path} does not exist')
          return False
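One caveat in the pair of helpers above: _get_file_size returns a message string when the file is missing or the unit is invalid, while check_existing_file compares the result numerically (fsize < 5). The string path is unreachable here because os.path.exists is checked first, but a numeric sentinel keeps the contract safe for other callers. A hedged sketch of that variant (an assumption, not the packaged code):

    def get_file_size_kb(file_path):
        # Return -1.0 instead of a message string so every caller can compare numerically.
        if not os.path.exists(file_path):
            return -1.0
        return os.path.getsize(file_path) / 1024

    # Mirroring the threshold above: files under ~5 KB are treated as truncated downloads.
    fsize = get_file_size_kb("HYCOM_u_2023010100.nc")
    incomplete = 0 <= fsize < 5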
@@ -512,10 +558,13 @@ def get_ua():
      # Strip newlines and blank lines
      ua_list = [line.strip() for line in ua_list if line.strip()]

+     # if current_platform == 'Linux':
+     #     ua_list = [line for line in ua_list if 'Linux' in line]
+
      return random.choice(ua_list)


- def get_proxy():
+ def get_proxy_file():
      # Absolute path of the current script
      script_dir = os.path.dirname(os.path.abspath(__file__))
      # Build the absolute path to ip.txt
@@ -531,15 +580,63 @@ def get_proxy():
      return proxies


- def dlownload_file(target_url, store_path, file_name, check=False):
+ def scrape_and_categorize_proxies(choose_protocol="http"):
+     url = "https://topproxylinks.com/"
+     # Send an HTTP request to fetch the page
+     response = requests.get(url)
+     # Parse the page with BeautifulSoup
+     soup = BeautifulSoup(response.text, "html.parser")
+
+     # Initialize a dict to hold the proxies of each protocol
+     proxies_dict = {"http": [], "socks4": [], "socks5": []}
+
+     # Find all rows in the table
+     tbody = soup.find("tbody")
+
+     if tbody:
+         for row in tbody.find_all("tr"):
+             # Extract the protocol, proxy and country cells
+             cells = row.find_all("td")
+             protocol = cells[0].text.strip().lower()
+             proxy = cells[1].text.strip()
+
+             # Store each proxy under its protocol
+             if protocol in proxies_dict:
+                 proxies_dict[protocol].append(proxy)
+
+     if choose_protocol in proxies_dict:
+         proxies_list = proxies_dict[choose_protocol]
+     else:
+         proxies_list = proxies_dict["http"]
+
+     return proxies_list
+
+
+ def get_proxy():
+     ip_list = scrape_and_categorize_proxies(choose_protocol="http")
+     choose_ip = random.choice(ip_list)
+     proxies = {"http": f"http://{choose_ip}", "https": f"http://{choose_ip}"}
+     print(f"Using proxy: {proxies}")
+     return proxies
+
+
+ def download_file(target_url, store_path, file_name, check=False):
+     # Check whether the file already exists
+     fname = Path(store_path) / file_name
+     if check:
+         if check_existing_file(fname):
+             count_dict["skip"] += 1
+             return
+         clear_existing_file(fname)
+
+     # -----------------------------------------------
      print(f"[bold #f0f6d0]Requesting {file_name}...")
      # Create a session
      s = requests.Session()
      download_success = False
      request_times = 0
-     filename = Path(store_path) / file_name

-     def calculate_wait_time(time_str):
+     def calculate_wait_time(time_str, target_url):
          import re

          # Regex matching times in the YYYYMMDDHH format
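scrape_and_categorize_proxies above assumes the request succeeds and that every table row has at least two cells; if topproxylinks.com is down or changes its markup, cells[0] raises IndexError, and an empty result would later crash random.choice in get_proxy. A defensive sketch of the same scrape (same site, added timeout and guards; an assumption, not the packaged code):

    def scrape_proxies_safe(choose_protocol="http", timeout=10):
        proxies = []
        try:
            response = requests.get("https://topproxylinks.com/", timeout=timeout)
            response.raise_for_status()
        except requests.RequestException:
            return proxies  # caller must cope with an empty list
        soup = BeautifulSoup(response.text, "html.parser")
        tbody = soup.find("tbody")
        for row in tbody.find_all("tr") if tbody else []:
            cells = row.find_all("td")
            if len(cells) >= 2 and cells[0].text.strip().lower() == choose_protocol:
                proxies.append(cells[1].text.strip())
        return proxies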
@@ -561,16 +658,15 @@ def dlownload_file(target_url, store_path, file_name, check=False):
              delta_t = delta_t / 3 + 1
          else:
              delta_t = 1
+         # Wait at most 5 minutes per variable: too short and the request may fail, too long and time is wasted
+         num_var = int(target_url.count("var="))
+         if num_var <= 0:
+             num_var = 1
+         return int(delta_t * 5 * 60 * num_var)

-         return int(delta_t * 180)
-
-     max_timeout = calculate_wait_time(file_name)
+     max_timeout = calculate_wait_time(file_name, target_url)
+     print(f"[bold #912dbc]Max timeout: {max_timeout} seconds")

-     if check:
-         if check_existing_file(filename):
-             count_dict['skip'] += 1
-             return
-         clear_existing_file(filename)
-
      # print(f'Download_start_time: {datetime.datetime.now()}')
      download_time_s = datetime.datetime.now()
      order_list = ["1st", "2nd", "3rd", "4th", "5th", "6th", "7th", "8th", "9th", "10th"]
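The reworked timeout above scales with both the time span parsed out of the file name (delta_t, which falls back to 1 for a single-timestamp name) and the number of var= fields in the request URL, at 5 minutes per variable. A worked example under those rules (values illustrative):

    # Single-timestamp file name -> delta_t = 1, per the fallback branch above.
    delta_t = 1
    url = "...&var=water_u&var=water_v&north=45"
    num_var = max(1, url.count("var="))            # 2 variables requested
    max_timeout = int(delta_t * 5 * 60 * num_var)  # 600 seconds
    # The request itself then uses timeout=random.randint(5, max_timeout).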
@@ -593,11 +689,15 @@ def dlownload_file(target_url, store_path, file_name, check=False):
                  with open(filename, 'wb') as f:
                      f.write(response.content) """

-             response = s.get(target_url, headers=headers, stream=True, timeout=random.randint(5, max_timeout))  # enable streaming
+             if find_proxy:
+                 proxies = get_proxy()
+                 response = s.get(target_url, headers=headers, proxies=proxies, stream=True, timeout=random.randint(5, max_timeout))
+             else:
+                 response = s.get(target_url, headers=headers, stream=True, timeout=random.randint(5, max_timeout))  # enable streaming
              response.raise_for_status()  # raise HTTPError if the response status is not 200

              # Save the file
-             with open(filename, "wb") as f:
+             with open(fname, "wb") as f:
                  print(f"[bold #96cbd7]Downloading {file_name}...")
                  for chunk in response.iter_content(chunk_size=1024):
                      if chunk:
@@ -605,12 +705,12 @@

              f.close()

-             # print(f'\rFile {filename} downloaded successfully', end="")
-             if os.path.exists(filename):
+             # print(f'\rFile {fname} downloaded successfully', end="")
+             if os.path.exists(fname):
                  download_success = True
                  download_time_e = datetime.datetime.now()
                  download_delta = download_time_e - download_time_s
-                 print(f"[#3dfc40]File [bold #dfff73]{filename} [#3dfc40]has been downloaded successfully, Time: [#39cbdd]{download_delta}")
+                 print(f"[#3dfc40]File [bold #dfff73]{fname} [#3dfc40]has been downloaded successfully, Time: [#39cbdd]{download_delta}")
                  count_dict["success"] += 1
                  # print(f'Download_end_time: {datetime.datetime.now()}')

@@ -671,7 +771,8 @@ def check_dataset_version(dataset_name, version_name, download_time, download_ti


  def get_submit_url_var(var, depth, level_num, lon_min, lon_max, lat_min, lat_max, dataset_name, version_name, download_time, download_time_end=None):
-     year_str = str(download_time)[:4]
+     # year_str = str(download_time)[:4]
+     ymdh_str = str(download_time)
      if depth is not None and level_num is not None:
          print("Please ensure the depth or level_num is None")
          print("Progress will use the depth")
@@ -683,10 +784,10 @@ def get_submit_url_var(var, depth, level_num, lon_min, lon_max, lat_min, lat_max
          print(f"Data of single level ({level_num}) will be downloaded...")
          which_mode = "level"
      else:
-         print("Full depth or full level data will be downloaded...")
+         # print("Full depth or full level data will be downloaded...")
          which_mode = "full"
      query_dict = get_query_dict(var, lon_min, lon_max, lat_min, lat_max, download_time, download_time_end, which_mode, depth, level_num)
-     submit_url = get_submit_url(dataset_name, version_name, var, year_str, query_dict)
+     submit_url = get_submit_url(dataset_name, version_name, var, ymdh_str, query_dict)
      return submit_url


@@ -716,7 +817,7 @@ def prepare_url_to_download(var, lon_min=0, lon_max=359.92, lat_min=-80, lat_max
          file_name = f"HYCOM_{variable_info[var]['var_name']}_{download_time}.nc"
          if download_time_end is not None:
              file_name = f"HYCOM_{variable_info[var]['var_name']}_{download_time}_{download_time_end}.nc"
-         dlownload_file(submit_url, store_path, file_name, check)
+         download_file(submit_url, store_path, file_name, check)
      else:
          varlist = [_ for _ in var]
          for key, value in var_group.items():
@@ -740,7 +841,7 @@ def prepare_url_to_download(var, lon_min=0, lon_max=359.92, lat_min=-80, lat_max
              file_name = f"HYCOM_{key}_{download_time}.nc"
              if download_time_end is not None:
                  file_name = f"HYCOM_{key}_{download_time}_{download_time_end}.nc"
-             dlownload_file(submit_url, store_path, file_name, check)
+             download_file(submit_url, store_path, file_name, check)


  def convert_full_name_to_short_name(full_name):
@@ -767,6 +868,22 @@ def download_task(var, time_str, time_str_end, lon_min, lon_max, lat_min, lat_ma
      prepare_url_to_download(var, lon_min, lon_max, lat_min, lat_max, time_str, time_str_end, depth, level, store_path, dataset_name, version_name, check)


+ def done_callback(future, progress, task, total, counter_lock):
+     """
+     Callback for the parallel download tasks.
+     It exists for the sake of parallel downloading and is required: calling direct_download directly in parallel causes problems.
+
+     When a task finishes, this callback fires and updates the progress bar right away, showing the completion of each
+     task as it happens. Even with many tasks running at once, progress is visible per task instead of appearing only
+     after all tasks have finished.
+     """
+
+     global parallel_counter
+     with counter_lock:
+         parallel_counter += 1
+         progress.update(task, advance=1, description=f"[cyan]Downloading... {parallel_counter}/{total}")
+
+
  def download_hourly_func(var, time_s, time_e, lon_min=0, lon_max=359.92, lat_min=-80, lat_max=90, depth=None, level=None, store_path=None, dataset_name=None, version_name=None, num_workers=None, check=False, ftimes=1):
      """
      Description:
791
908
  None
792
909
  """
793
910
  ymdh_time_s, ymdh_time_e = str(time_s), str(time_e)
911
+ if num_workers is not None and num_workers > 1: # 如果使用多线程下载,用于进度条显示
912
+ global parallel_counter
913
+ parallel_counter = 0
914
+ counter_lock = Lock() # 创建一个锁,线程安全的计数器
794
915
  if ymdh_time_s == ymdh_time_e:
795
916
  prepare_url_to_download(var, lon_min, lon_max, lat_min, lat_max, ymdh_time_s, None, depth, level, store_path, dataset_name, version_name)
796
917
  elif int(ymdh_time_s) < int(ymdh_time_e):
@@ -808,8 +929,10 @@ def download_hourly_func(var, time_s, time_e, lon_min=0, lon_max=359.92, lat_min
              # Parallel mode
              with ThreadPoolExecutor(max_workers=num_workers) as executor:
                  futures = [executor.submit(download_task, var, time_str, None, lon_min, lon_max, lat_min, lat_max, depth, level, store_path, dataset_name, version_name, check) for time_str in time_list]
-                 for i, future in enumerate(futures):
-                     future.add_done_callback(lambda _: progress.update(task, advance=1, description=f"[cyan]Downloading... {i+1}/{len(time_list)}"))
+                 """ for i, future in enumerate(futures):
+                     future.add_done_callback(lambda _: progress.update(task, advance=1, description=f"[cyan]Downloading... {i+1}/{len(time_list)}")) """
+                 for feature in as_completed(futures):
+                     done_callback(feature, progress, task, len(time_list), counter_lock)
      else:
          new_time_list = get_time_list(ymdh_time_s, ymdh_time_e, 3 * ftimes, "hour")
          total_num = len(new_time_list)
@@ -824,8 +947,10 @@
              # Parallel mode
              with ThreadPoolExecutor(max_workers=num_workers) as executor:
                  futures = [executor.submit(download_task, var, new_time_list[i], time_list[int(min(len(time_list) - 1, int(i * ftimes + ftimes - 1)))], lon_min, lon_max, lat_min, lat_max, depth, level, store_path, dataset_name, version_name, check) for i in range(total_num)]
-                 for i, future in enumerate(futures):
-                     future.add_done_callback(lambda _: progress.update(task, advance=1, description=f"[cyan]Downloading... {i+1}/{total_num}"))
+                 """ for i, future in enumerate(futures):
+                     future.add_done_callback(lambda _: progress.update(task, advance=1, description=f"[cyan]Downloading... {i+1}/{total_num}")) """
+                 for feature in as_completed(futures):
+                     done_callback(feature, progress, task, len(time_list), counter_lock)
      else:
          print("Please ensure the time_s is no more than time_e")

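The commented-out loops preserved above show why the rewrite was needed: a lambda created in a loop closes over the variable i by reference, so most callbacks fire after i has already advanced and the displayed count jumps or sticks at the last index; add_done_callback also runs in whichever worker thread finishes, with no lock around the shared progress bar. (Note, too, that the second branch still passes len(time_list) as the total where total_num appears intended when ftimes > 1.) The closure pitfall in two lines:

    callbacks = [lambda: print(i + 1) for i in range(3)]
    for cb in callbacks:
        cb()  # prints 3, 3, 3 -- every lambda sees the final value of i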
@@ -907,31 +1032,30 @@ def download(var, time_s, time_e=None, lon_min=0, lon_max=359.92, lat_min=-80, l
      time_e = str(time_e)
      if len(time_e) == 8:
          time_e += "21"
-
+
      global count_dict
-     count_dict = {
-         'success': 0,
-         'fail': 0,
-         'skip': 0,
-         'no_data': 0,
-         'total': 0,
-         'no_data_list': []
-     }
+     count_dict = {"success": 0, "fail": 0, "skip": 0, "no_data": 0, "total": 0, "no_data_list": []}
+
+     """ global current_platform
+     current_platform = platform.system() """
+
+     global find_proxy
+     find_proxy = False

      download_hourly_func(var, time_s, time_e, lon_min, lon_max, lat_min, lat_max, depth, level, store_path, dataset_name, version_name, num_workers, check, ftimes)
-
+
      count_dict["total"] = count_dict["success"] + count_dict["fail"] + count_dict["skip"] + count_dict["no_data"]
-
+
      print("[bold #ecdbfe]-" * 160)
      print(f"[bold #ff80ab]Total: {count_dict['total']}\nSuccess: {count_dict['success']}\nFail: {count_dict['fail']}\nSkip: {count_dict['skip']}")
-     if count_dict['fail'] > 0:
+     if count_dict["fail"] > 0:
          print("[bold #be5528]Please try again to download the failed data later")
-     if count_dict['no_data'] > 0:
-         if count_dict['no_data'] == 1:
+     if count_dict["no_data"] > 0:
+         if count_dict["no_data"] == 1:
              print(f"[bold #f90000]There is {count_dict['no_data']} data that does not exist in any dataset and version")
          else:
              print(f"[bold #f90000]These are {count_dict['no_data']} data that do not exist in any dataset and version")
-         for no_data in count_dict['no_data_list']:
+         for no_data in count_dict["no_data_list"]:
              print(f"[bold #d81b60]{no_data}")
      print("[bold #ecdbfe]-" * 160)

@@ -987,8 +1111,8 @@ def how_to_use():

  if __name__ == "__main__":
      # help(hycom3h.download)
-     time_s, time_e = "2018070100", "2019123121"
-     merge_name = "2018_2024"
+     time_s, time_e = "2023010100", "2023123121"
+     merge_name = f"{time_s}_{time_e}"  # name of the merged file
      root_path = r"G:\Data\HYCOM\3hourly"
      location_dict = {"west": 105, "east": 130, "south": 15, "north": 45}
      download_dict = {
@@ -1003,16 +1127,13 @@
          "salinity_bottom": {"simple_name": "salt_b", "download": 0},
      }

-     var_list = []
-     for var_name in download_dict.keys():
-         if download_dict[var_name]["download"] == 1:
-             var_list.append(var_name)
+     var_list = [var_name for var_name in download_dict.keys() if download_dict[var_name]["download"]]

      # set depth or level, only one can be True
      # if you wanna download all depth or level, set both False
      depth = None  # or 0-5000 meters
      level = None  # or 1-40 levels
-     num_workers = 1
+     num_workers = 3

      check = True
      ftimes = 1
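Tying the __main__ configuration together, a typical call into the public entry point looks like the sketch below. Keyword names follow download_hourly_func's signature shown earlier; the variable spelling accepted by download (full vs. short name via convert_full_name_to_short_name) and the omitted merge step are assumptions, so treat the values as illustrative:

    from oafuncs.oa_down import hycom_3hourly as hycom3h

    location_dict = {"west": 105, "east": 130, "south": 15, "north": 45}
    hycom3h.download(
        var="u",                      # illustrative; see convert_full_name_to_short_name
        time_s="2023010100",
        time_e="2023123121",
        lon_min=location_dict["west"],
        lon_max=location_dict["east"],
        lat_min=location_dict["south"],
        lat_max=location_dict["north"],
        store_path=r"G:\Data\HYCOM\3hourly",
        num_workers=3,                # >1 takes the threaded path with the new callback
        check=True,                   # skip files that already exist and look complete
    )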
oafuncs/oa_down/test.py CHANGED
@@ -1,45 +1,45 @@
  #!/usr/bin/env python
  # coding=utf-8
- '''
+ """
  Author: Liu Kun && 16031215@qq.com
  Date: 2024-12-01 19:32:25
  LastEditors: Liu Kun && 16031215@qq.com
- LastEditTime: 2024-12-01 19:50:32
+ LastEditTime: 2024-12-10 11:16:36
  FilePath: \\Python\\My_Funcs\\OAFuncs\\oafuncs\\oa_down\\test.py
- Description:
+ Description:
  EditPlatform: vscode
  ComputerInfo: XPS 15 9510
  SystemInfo: Windows 11
  Python Version: 3.12
- '''
+ """

  import os
  import random
+ import re

- txtfile = r'E:\Code\Python\My_Funcs\OAFuncs\oafuncs\oa_down\User_Agent-list.txt'

- with open(txtfile, 'r') as f:
-     lines = f.readlines()
-     # Strip newlines and blank lines
-     lines = [line.strip() for line in lines if line.strip()]
-     new_line = []
-     for i in range(len(lines)):
-         if '/' in lines[i]:
-             new_line.append(lines[i])
-         else:
-             print(lines[i])
+ def is_valid_user_agent(user_agent):
+     # A simple regular expression to check the format of a User Agent
+     # It checks whether the User Agent contains the common browser-information fields
+     pattern = re.compile(
+         r"^(?:(?:Mozilla|Opera|Chrome|Safari|Edg|OPR)/[\d.]+)"
+         r"(?:\s(?:\(.*?\)))?"
+         r"(?:\s(?:Gecko|AppleWebKit|KHTML, like Gecko|Version|Edge|OPR)/[\d.]+)?"
+         r"(?:\s.*?(?:rv:|Version/|Ubuntu|Macintosh|Windows|X11|Linux|CrOS|FreeBSD|OpenBSD|NetBSD|iPhone|iPad|iPod|Android|BlackBerry|BB10|Mobile|Symbian|Windows Phone|IEMobile|Opera Mini|Opera Mobi|UCBrowser|MQQBrowser|baiduboxapp|baidubrowser|Safari|Firefox|MSIE|Trident|Edge|EdgA|Chrome|CriOS|Vivaldi|Sleipnir|Midori|ELinks|Lynx|w3m|Arora|Epiphany|Konqueror|Dillo|Netscape|SeaMonkey|K-Meleon|Camino|Iceape|Galeon|GranParadiso|Iceweasel|Firefox|Fennec|Conkeror|PaleMoon|Uzbl|QupZilla|Otter|Waterfox|Basilisk|Cyberfox|PaleMoon|GNU IceCat|GNU IceWeasel|IceCat|IceWeasel|Seamonkey|Iceape|Firefox|Epiphany|Web|Safari|Android|Mobile|BlackBerry|BB10|Tablet|Silk|Kindle|FxiOS|Focus|SamsungBrowser|browser|AppleWebKit|Puffin|DuckDuckGo|YaBrowser|Yandex|Amigo|NokiaBrowser|OviBrowser|OneBrowser|Chrome|Firefox|Safari|OPR|Coast|Mercury|Silk|Skyfire|IEMobile|Bolt|Jasmine|NativeHost|Crosswalk|TizenBrowser|SailfishBrowser|SamsungBrowser|Silk-Accelerated|UCBrowser|Quark|XiaoMi|OnePlus|Vivo|Oppo|Realme|Meizu|Lenovo|Huawei|ZTE|Alcatel|Sony|Nokia|LG|HTC|Asus|Acer|Motorola|Samsung)/[\d.]+)?$"
+     )

- newtxtfile = r'E:\Code\Python\My_Funcs\OAFuncs\oafuncs\oa_down\ua_list_new.txt'
- """ with open(newtxtfile, 'w') as f:
-     for line in new_line:
-         f.write(line + '\n') """
+     # Match the User Agent string against the regex
+     if pattern.match(user_agent):
+         return True
+     else:
+         return False


  def get_ua():
      current_dir = os.path.dirname(os.path.abspath(__file__))
-     ua_file_txt = os.path.join(current_dir, 'User_Agent-list.txt')
+     ua_file_txt = os.path.join(current_dir, "User_Agent-list.txt")

-     with open(ua_file_txt, 'r') as f:
+     with open(ua_file_txt, "r") as f:
          ua_list = f.readlines()
          # Strip newlines and blank lines
          ua_list = [line.strip() for line in ua_list if line.strip()]
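is_valid_user_agent is deliberately coarse: the regex requires a leading Product/version token from a fixed set, then an optional parenthesized platform section and optional engine/browser tokens, so browser-shaped strings pass while bare device identifiers fail. For example:

    is_valid_user_agent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36")  # True
    is_valid_user_agent("NOKIA5700/UCWEB7.0.2.37/28/999")  # False: no recognized leading token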
@@ -47,9 +47,6 @@ def get_ua():
      return random.choice(ua_list)


- print(get_ua())
-
-
  def get_ua_org():
      ua_list = [
          "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 OPR/26.0.1656.60",
@@ -115,13 +112,40 @@ def get_ua_org():
          "NOKIA5700/UCWEB7.0.2.37/28/999",
          "Openwave/UCWEB7.0.2.37/28/999",
          "Openwave/UCWEB7.0.2.37/28/999",
-
      ]
-     with open(newtxtfile, 'w') as f:
+     with open(newtxtfile, "w") as f:
          for line in ua_list:
-             f.write(line + '\n')
+             f.write(line + "\n")
      # print(f'Using User-Agent: {ua}')
      ua = random.choice(ua_list)
      return ua

-     # get_ua_org()
+
+ # get_ua_org()
+
+ if __name__ == "__main__":
+     txtfile = r"E:\Code\Python\My_Funcs\OAFuncs\oafuncs\oa_down\User_Agent-list.txt"
+
+     with open(txtfile, "r") as f:
+         lines = f.readlines()
+         # Strip newlines and blank lines
+         lines = [line.strip() for line in lines if line.strip()]
+     """ new_line = []
+     for i in range(len(lines)):
+         if '/' in lines[i]:
+             new_line.append(lines[i])
+         else:
+             print(lines[i]) """
+
+     new_line = []
+     for line in lines:
+         if is_valid_user_agent(line):
+             # print(line)
+             new_line.append(line)
+         else:
+             print(f"Invalid User-Agent: {line}")
+
+     newtxtfile = r"E:\Code\Python\My_Funcs\OAFuncs\oafuncs\oa_down\ua_list_new.txt"
+     with open(newtxtfile, "w") as f:
+         for line in new_line:
+             f.write(line + "\n")