oafuncs 0.0.78__py2.py3-none-any.whl → 0.0.80__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- oafuncs/__init__.py +15 -12
- oafuncs/oa_down/User_Agent-list.txt +13 -2837
- oafuncs/oa_down/hycom_3hourly.py +184 -63
- oafuncs/oa_down/test.py +52 -28
- oafuncs/oa_down/test_ua.py +151 -0
- oafuncs/oa_draw.py +45 -1
- oafuncs/oa_s/__init__.py +23 -0
- oafuncs/oa_s/oa_cmap.py +163 -0
- oafuncs/oa_s/oa_data.py +187 -0
- oafuncs/oa_s/oa_draw.py +451 -0
- oafuncs/oa_s/oa_file.py +332 -0
- oafuncs/oa_s/oa_help.py +39 -0
- oafuncs/oa_s/oa_nc.py +410 -0
- oafuncs/oa_s/oa_python.py +107 -0
- "oafuncs - /321/205/320/231/320/277/321/206/320/254/320/274/__init__.py" +26 -0
- "oafuncs - /321/205/320/231/320/277/321/206/320/254/320/274/oa_cmap.py" +163 -0
- "oafuncs - /321/205/320/231/320/277/321/206/320/254/320/274/oa_data.py" +187 -0
- "oafuncs - /321/205/320/231/320/277/321/206/320/254/320/274/oa_down/__init__.py" +20 -0
- "oafuncs - /321/205/320/231/320/277/321/206/320/254/320/274/oa_down/hycom_3hourly.py" +1176 -0
- "oafuncs - /321/205/320/231/320/277/321/206/320/254/320/274/oa_down/literature.py" +332 -0
- "oafuncs - /321/205/320/231/320/277/321/206/320/254/320/274/oa_down/test_ua.py" +151 -0
- "oafuncs - /321/205/320/231/320/277/321/206/320/254/320/274/oa_draw.py" +451 -0
- "oafuncs - /321/205/320/231/320/277/321/206/320/254/320/274/oa_file.py" +332 -0
- "oafuncs - /321/205/320/231/320/277/321/206/320/254/320/274/oa_help.py" +39 -0
- "oafuncs - /321/205/320/231/320/277/321/206/320/254/320/274/oa_nc.py" +410 -0
- "oafuncs - /321/205/320/231/320/277/321/206/320/254/320/274/oa_python.py" +107 -0
- "oafuncs - /321/205/320/231/320/277/321/206/320/254/320/274/oa_sign/__init__.py" +21 -0
- "oafuncs - /321/205/320/231/320/277/321/206/320/254/320/274/oa_sign/meteorological.py" +168 -0
- "oafuncs - /321/205/320/231/320/277/321/206/320/254/320/274/oa_sign/ocean.py" +158 -0
- "oafuncs - /321/205/320/231/320/277/321/206/320/254/320/274/oa_sign/scientific.py" +139 -0
- "oafuncs - /321/205/320/231/320/277/321/206/320/254/320/274/oa_tool/__init__.py" +18 -0
- "oafuncs - /321/205/320/231/320/277/321/206/320/254/320/274/oa_tool/email.py" +114 -0
- {oafuncs-0.0.78.dist-info → oafuncs-0.0.80.dist-info}/METADATA +1 -1
- oafuncs-0.0.80.dist-info/RECORD +51 -0
- oafuncs-0.0.80.dist-info/top_level.txt +2 -0
- oafuncs-0.0.78.dist-info/RECORD +0 -24
- oafuncs-0.0.78.dist-info/top_level.txt +0 -1
- {oafuncs-0.0.78.dist-info → oafuncs-0.0.80.dist-info}/LICENSE.txt +0 -0
- {oafuncs-0.0.78.dist-info → oafuncs-0.0.80.dist-info}/WHEEL +0 -0
oafuncs/oa_down/hycom_3hourly.py
CHANGED
@@ -18,13 +18,15 @@ import os
 import random
 import time
 import warnings
-from concurrent.futures import ThreadPoolExecutor
+from concurrent.futures import ThreadPoolExecutor, as_completed
 from pathlib import Path
+from threading import Lock
 
 import matplotlib.pyplot as plt
 import numpy as np
 import pandas as pd
 import requests
+from bs4 import BeautifulSoup
 from rich import print
 from rich.progress import Progress
 
@@ -53,17 +55,17 @@ data_info["hourly"]["dataset"]["GLBy0.08"]["version"] = {"93.0": {}}
 # Submitting an out-of-range time on the website returns the dataset's actual time range, which corrects the ranges below
 # Only the GLBv0.08 93.0 time range has been corrected so far, down to the hour
 # Other datasets' hours default to starting at 00 and ending at 21 for now
-data_info["hourly"]["dataset"]["GLBv0.08"]["version"]["53.X"]["time_range"] = {"time_start": "
-data_info["hourly"]["dataset"]["GLBv0.08"]["version"]["56.3"]["time_range"] = {"time_start": "
-data_info["hourly"]["dataset"]["GLBv0.08"]["version"]["57.2"]["time_range"] = {"time_start": "
-data_info["hourly"]["dataset"]["GLBv0.08"]["version"]["92.8"]["time_range"] = {"time_start": "
-data_info["hourly"]["dataset"]["GLBv0.08"]["version"]["57.7"]["time_range"] = {"time_start": "
-data_info["hourly"]["dataset"]["GLBv0.08"]["version"]["92.9"]["time_range"] = {"time_start": "
+data_info["hourly"]["dataset"]["GLBv0.08"]["version"]["53.X"]["time_range"] = {"time_start": "1994010112", "time_end": "2015123109"}
+data_info["hourly"]["dataset"]["GLBv0.08"]["version"]["56.3"]["time_range"] = {"time_start": "2014070112", "time_end": "2016093009"}
+data_info["hourly"]["dataset"]["GLBv0.08"]["version"]["57.2"]["time_range"] = {"time_start": "2016050112", "time_end": "2017020109"}
+data_info["hourly"]["dataset"]["GLBv0.08"]["version"]["92.8"]["time_range"] = {"time_start": "2017020112", "time_end": "2017060109"}
+data_info["hourly"]["dataset"]["GLBv0.08"]["version"]["57.7"]["time_range"] = {"time_start": "2017060112", "time_end": "2017100109"}
+data_info["hourly"]["dataset"]["GLBv0.08"]["version"]["92.9"]["time_range"] = {"time_start": "2017100112", "time_end": "2018032009"}
 data_info["hourly"]["dataset"]["GLBv0.08"]["version"]["93.0"]["time_range"] = {"time_start": "2018010112", "time_end": "2020021909"}
 # GLBu0.08
-data_info["hourly"]["dataset"]["GLBu0.08"]["version"]["93.0"]["time_range"] = {"time_start": "
+data_info["hourly"]["dataset"]["GLBu0.08"]["version"]["93.0"]["time_range"] = {"time_start": "2018091912", "time_end": "2018120909"}
 # GLBy0.08
-data_info["hourly"]["dataset"]["GLBy0.08"]["version"]["93.0"]["time_range"] = {"time_start": "
+data_info["hourly"]["dataset"]["GLBy0.08"]["version"]["93.0"]["time_range"] = {"time_start": "2018120412", "time_end": "20300904"}
 
 # variable
 variable_info = {
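A note on the corrected values above: they are fixed-width, zero-padded `YYYYMMDDHH` stamps, so plain string comparison is enough to test whether a requested time falls inside a dataset's range. A minimal sketch (not part of the package; the helper name is hypothetical):

```python
# Hypothetical sketch: range membership for YYYYMMDDHH stamps via string comparison,
# which is valid because the format is fixed-width and zero-padded.
time_range = {"time_start": "1994010112", "time_end": "2015123109"}  # GLBv0.08 53.X

def in_range(ymdh: str, trange: dict) -> bool:
    return trange["time_start"] <= ymdh <= trange["time_end"]

print(in_range("2000051500", time_range))  # True
print(in_range("2016010100", time_range))  # False
```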
@@ -139,10 +141,11 @@ data_info["hourly"]["dataset"]["GLBu0.08"]["version"]["93.0"]["url"] = url_930_u
 uv3z_930_y = {}
 ts3z_930_y = {}
 ssh_930_y = {}
-for y_930_y in range(2018,
+for y_930_y in range(2018, 2030):
     uv3z_930_y[str(y_930_y)] = rf"https://ncss.hycom.org/thredds/ncss/GLBy0.08/expt_93.0/uv3z/{y_930_y}?"
     ts3z_930_y[str(y_930_y)] = rf"https://ncss.hycom.org/thredds/ncss/GLBy0.08/expt_93.0/ts3z/{y_930_y}?"
     ssh_930_y[str(y_930_y)] = rf"https://ncss.hycom.org/thredds/ncss/GLBy0.08/expt_93.0/ssh/{y_930_y}?"
+# GLBy0.08 93.0 data time range in each year: year-01-01 12:00 to year+1-01-01 09:00
 url_930_y = {
     "uv3z": uv3z_930_y,
     "ts3z": ts3z_930_y,
@@ -370,7 +373,16 @@ def check_time_in_dataset_and_version(time_input, time_end=None):
     if have_data:
         for d, v, trange in zip(d_list, v_list, trange_list):
             print(f"[bold blue]{d} {v} {trange}")
-
+        if is_single_time:
+            return True
+        else:
+            base_url_s = get_base_url(d_list[0], v_list[0], "u", str(time_start))
+            base_url_e = get_base_url(d_list[0], v_list[0], "u", str(time_end))
+            if base_url_s == base_url_e:
+                return True
+            else:
+                print(f"[bold red]{time_start} to {time_end} is in different datasets or versions, so you can't download them together")
+                return False
     else:
         print(f"[bold red]{time_input_str} is not in any dataset and version")
         return False
@@ -454,7 +466,8 @@ def direct_choose_dataset_and_version(time_input, time_end=None):
     return dataset_name_out, version_name_out
 
 
-def get_base_url(dataset_name, version_name, var,
+def get_base_url(dataset_name, version_name, var, ymdh_str):
+    year_str = int(ymdh_str[:4])
     url_dict = data_info["hourly"]["dataset"][dataset_name]["version"][version_name]["url"]
     classification_method = data_info["hourly"]["dataset"][dataset_name]["version"][version_name]["classification"]
     if classification_method == "year_different":
@@ -470,6 +483,12 @@ def get_base_url(dataset_name, version_name, var, year_str):
         if base_url is None:
             print("Please ensure the var is in [u,v,temp,salt,ssh,u_b,v_b,temp_b,salt_b]")
     elif classification_method == "var_year_different":
+        if dataset_name == "GLBy0.08" and version_name == "93.0":
+            mdh_str = ymdh_str[4:]
+            # GLBy0.08 93.0
+            # data time range in each year: year-01-01 12:00 to year+1-01-01 09:00
+            if mdh_str <= "010109":
+                year_str = int(ymdh_str[:4]) - 1
         base_url = None
         for key, value in var_group.items():
             if var in value:
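The new branch above handles the GLBy0.08 93.0 file layout, where each yearly file runs from Jan 1 12:00 of its year through Jan 1 09:00 of the next year, so early-January stamps still belong to the previous year's file. A standalone sketch of the same rule (hypothetical helper, for illustration only):

```python
# Hypothetical reimplementation of the year-rollover rule, not the package's own function.
def file_year(ymdh_str: str) -> int:
    year = int(ymdh_str[:4])
    # A stamp at or before MM-DD-HH "010109" falls in the previous year's file.
    if ymdh_str[4:] <= "010109":
        year -= 1
    return year

print(file_year("2019010106"))  # 2018: still covered by the 2018 file
print(file_year("2019010112"))  # 2019: first record of the 2019 file
```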
@@ -480,8 +499,8 @@ def get_base_url(dataset_name, version_name, var, year_str):
     return base_url
 
 
-def get_submit_url(dataset_name, version_name, var,
-    base_url = get_base_url(dataset_name, version_name, var,
+def get_submit_url(dataset_name, version_name, var, ymdh_str, query_dict):
+    base_url = get_base_url(dataset_name, version_name, var, ymdh_str)
     if isinstance(query_dict["var"], str):
         query_dict["var"] = [query_dict["var"]]
     target_url = base_url + "&".join(f"var={var}" for var in query_dict["var"]) + "&" + "&".join(f"{key}={value}" for key, value in query_dict.items() if key != "var")
@@ -494,10 +513,37 @@ def clear_existing_file(file_full_path):
         print(f"{file_full_path} has been removed")
 
 
+def _get_file_size(file_path, unit="KB"):
+    # Check whether the file exists
+    if not os.path.exists(file_path):
+        return "文件不存在"
+
+    # Get the file size (in bytes)
+    file_size = os.path.getsize(file_path)
+
+    # Unit conversion table
+    unit_dict = {"PB": 1024**5, "TB": 1024**4, "GB": 1024**3, "MB": 1024**2, "KB": 1024}
+
+    # Validate the requested unit
+    if unit not in unit_dict:
+        return "单位不合法,请选择PB、TB、GB、MB、KB中的一个"
+
+    # Convert the file size to the requested unit
+    converted_size = file_size / unit_dict[unit]
+
+    return converted_size
+
+
 def check_existing_file(file_full_path):
     if os.path.exists(file_full_path):
         print(f"[bold #FFA54F]{file_full_path} exists")
-
+        fsize = _get_file_size(file_full_path)
+        if fsize < 5:
+            print(f"[bold #FFA54F]{file_full_path} may be incomplete\nFile size: {fsize:.2f} KB")
+            # clear_existing_file(file_full_path)
+            return False
+        else:
+            return True
     else:
         # print(f'{file_full_path} does not exist')
         return False
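With this change, `check_existing_file` treats any existing file under 5 KB as a likely failed download (typically an error page saved in place of the NetCDF data) and reports it as missing so it will be fetched again. Note that `_get_file_size` returns a string rather than a number when the file is absent, so the `fsize < 5` comparison relies on the enclosing `os.path.exists` check. A minimal sketch of the same heuristic (helper name hypothetical):

```python
import os

# Sketch only, assuming the 5 KB threshold used by check_existing_file;
# a real 3-hourly HYCOM NetCDF file should be far larger than this.
def looks_complete(path: str, min_kb: float = 5.0) -> bool:
    if not os.path.exists(path):
        return False
    size_kb = os.path.getsize(path) / 1024
    # Tiny files are usually an HTML error page saved instead of the data.
    return size_kb >= min_kb
```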
@@ -512,10 +558,13 @@ def get_ua():
     # Strip newlines and blank lines
     ua_list = [line.strip() for line in ua_list if line.strip()]
 
+    # if current_platform == 'Linux':
+    #     ua_list = [line for line in ua_list if 'Linux' in line]
+
     return random.choice(ua_list)
 
 
-def
+def get_proxy_file():
     # Get the absolute path of the current script
     script_dir = os.path.dirname(os.path.abspath(__file__))
     # Build the absolute path of ip.txt
@@ -531,15 +580,63 @@ def get_proxy():
     return proxies
 
 
-def
+def scrape_and_categorize_proxies(choose_protocol="http"):
+    url = "https://topproxylinks.com/"
+    # Send an HTTP request to fetch the page content
+    response = requests.get(url)
+    # Parse the page with BeautifulSoup
+    soup = BeautifulSoup(response.text, "html.parser")
+
+    # Initialize a dict to store proxies by protocol
+    proxies_dict = {"http": [], "socks4": [], "socks5": []}
+
+    # Find all rows in the table
+    tbody = soup.find("tbody")
+
+    if tbody:
+        for row in tbody.find_all("tr"):
+            # Extract the protocol, proxy, and country cells
+            cells = row.find_all("td")
+            protocol = cells[0].text.strip().lower()
+            proxy = cells[1].text.strip()
+
+            # Store the proxy under its protocol
+            if protocol in proxies_dict:
+                proxies_dict[protocol].append(proxy)
+
+    if choose_protocol in proxies_dict:
+        proxies_list = proxies_dict[choose_protocol]
+    else:
+        proxies_list = proxies_dict["http"]
+
+    return proxies_list
+
+
+def get_proxy():
+    ip_list = scrape_and_categorize_proxies(choose_protocol="http")
+    choose_ip = random.choice(ip_list)
+    proxies = {"http": f"http://{choose_ip}", "https": f"http://{choose_ip}"}
+    print(f"Using proxy: {proxies}")
+    return proxies
+
+
+def download_file(target_url, store_path, file_name, check=False):
+    # Check if the file exists
+    fname = Path(store_path) / file_name
+    if check:
+        if check_existing_file(fname):
+            count_dict["skip"] += 1
+            return
+    clear_existing_file(fname)
+
+    # -----------------------------------------------
     print(f"[bold #f0f6d0]Requesting {file_name}...")
     # Create a session
     s = requests.Session()
     download_success = False
     request_times = 0
-    filename = Path(store_path) / file_name
 
-    def calculate_wait_time(time_str):
+    def calculate_wait_time(time_str, target_url):
         import re
 
         # Define a regex matching times in YYYYMMDDHH format
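`get_proxy` returns the mapping shape that `requests` expects for its `proxies` argument, reusing the same HTTP proxy for both schemes. A sketch of how such a mapping is consumed (the address below is hypothetical, and free scraped proxies are frequently dead, so real code should expect failures):

```python
import requests

# Hypothetical proxy address; get_proxy() would supply a scraped one.
proxies = {"http": "http://203.0.113.10:8080", "https": "http://203.0.113.10:8080"}

try:
    resp = requests.get("https://ncss.hycom.org/", proxies=proxies, timeout=30)
    resp.raise_for_status()
except requests.RequestException as e:
    # Fall back to a direct request, or pick another proxy from the list.
    print(f"Proxy request failed: {e}")
```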
@@ -561,16 +658,15 @@ def dlownload_file(target_url, store_path, file_name, check=False):
             delta_t = delta_t / 3 + 1
         else:
             delta_t = 1
+        # Wait at most 5 minutes per variable: too short and the request may fail, too long wastes time
+        num_var = int(target_url.count("var="))
+        if num_var <= 0:
+            num_var = 1
+        return int(delta_t * 5 * 60 * num_var)
 
-
-
-    max_timeout = calculate_wait_time(file_name)
+    max_timeout = calculate_wait_time(file_name, target_url)
+    print(f"[bold #912dbc]Max timeout: {max_timeout} seconds")
 
-    if check:
-        if check_existing_file(filename):
-            count_dict['skip'] += 1
-            return
-    clear_existing_file(filename)
     # print(f'Download_start_time: {datetime.datetime.now()}')
     download_time_s = datetime.datetime.now()
     order_list = ["1st", "2nd", "3rd", "4th", "5th", "6th", "7th", "8th", "9th", "10th"]
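The timeout now scales with both the requested time span and the number of `var=` parameters in the submitted URL, at roughly five minutes per variable per unit of `delta_t`. A standalone rewrite of just that arithmetic (assuming `delta_t` has already been derived from the file name, as in the diff):

```python
# Illustrative sketch of the new timeout rule; names are local to this snippet.
def max_timeout(delta_t: float, target_url: str) -> int:
    # delta_t: number of 3-hourly steps plus one, as computed upstream
    num_var = target_url.count("var=") or 1  # at least one variable
    return int(delta_t * 5 * 60 * num_var)   # 5 minutes per step per variable

url = "https://ncss.hycom.org/...?var=water_u&var=water_v&north=45"
print(max_timeout(1, url))  # two variables, single step -> 600 seconds
```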
@@ -593,11 +689,15 @@ def dlownload_file(target_url, store_path, file_name, check=False):
                 with open(filename, 'wb') as f:
                     f.write(response.content) """
 
-
+            if find_proxy:
+                proxies = get_proxy()
+                response = s.get(target_url, headers=headers, proxies=proxies, stream=True, timeout=random.randint(5, max_timeout))
+            else:
+                response = s.get(target_url, headers=headers, stream=True, timeout=random.randint(5, max_timeout))  # enable streaming
             response.raise_for_status()  # raise an HTTPError if the response status is not 200
 
             # Save the file
-            with open(
+            with open(fname, "wb") as f:
                 print(f"[bold #96cbd7]Downloading {file_name}...")
                 for chunk in response.iter_content(chunk_size=1024):
                     if chunk:
@@ -605,12 +705,12 @@ def dlownload_file(target_url, store_path, file_name, check=False):
 
             f.close()
 
-            # print(f'\rFile {
-            if os.path.exists(
+            # print(f'\rFile {fname} downloaded successfully', end="")
+            if os.path.exists(fname):
                 download_success = True
                 download_time_e = datetime.datetime.now()
                 download_delta = download_time_e - download_time_s
-                print(f"[#3dfc40]File [bold #dfff73]{
+                print(f"[#3dfc40]File [bold #dfff73]{fname} [#3dfc40]has been downloaded successfully, Time: [#39cbdd]{download_delta}")
                 count_dict["success"] += 1
                 # print(f'Download_end_time: {datetime.datetime.now()}')
 
@@ -671,7 +771,8 @@ def check_dataset_version(dataset_name, version_name, download_time, download_ti
 
 
 def get_submit_url_var(var, depth, level_num, lon_min, lon_max, lat_min, lat_max, dataset_name, version_name, download_time, download_time_end=None):
-    year_str = str(download_time)[:4]
+    # year_str = str(download_time)[:4]
+    ymdh_str = str(download_time)
     if depth is not None and level_num is not None:
         print("Please ensure the depth or level_num is None")
         print("Progress will use the depth")
@@ -683,10 +784,10 @@ def get_submit_url_var(var, depth, level_num, lon_min, lon_max, lat_min, lat_max
         print(f"Data of single level ({level_num}) will be downloaded...")
         which_mode = "level"
     else:
-        print("Full depth or full level data will be downloaded...")
+        # print("Full depth or full level data will be downloaded...")
         which_mode = "full"
     query_dict = get_query_dict(var, lon_min, lon_max, lat_min, lat_max, download_time, download_time_end, which_mode, depth, level_num)
-    submit_url = get_submit_url(dataset_name, version_name, var,
+    submit_url = get_submit_url(dataset_name, version_name, var, ymdh_str, query_dict)
     return submit_url
 
 
@@ -716,7 +817,7 @@ def prepare_url_to_download(var, lon_min=0, lon_max=359.92, lat_min=-80, lat_max
         file_name = f"HYCOM_{variable_info[var]['var_name']}_{download_time}.nc"
         if download_time_end is not None:
             file_name = f"HYCOM_{variable_info[var]['var_name']}_{download_time}_{download_time_end}.nc"
-
+        download_file(submit_url, store_path, file_name, check)
     else:
         varlist = [_ for _ in var]
         for key, value in var_group.items():
@@ -740,7 +841,7 @@ def prepare_url_to_download(var, lon_min=0, lon_max=359.92, lat_min=-80, lat_max
             file_name = f"HYCOM_{key}_{download_time}.nc"
             if download_time_end is not None:
                 file_name = f"HYCOM_{key}_{download_time}_{download_time_end}.nc"
-
+            download_file(submit_url, store_path, file_name, check)
 
 
 def convert_full_name_to_short_name(full_name):
@@ -767,6 +868,22 @@ def download_task(var, time_str, time_str_end, lon_min, lon_max, lat_min, lat_ma
     prepare_url_to_download(var, lon_min, lon_max, lat_min, lat_max, time_str, time_str_end, depth, level, store_path, dataset_name, version_name, check)
 
 
+def done_callback(future, progress, task, total, counter_lock):
+    """
+    # Callback for the parallel download tasks
+    # Required for parallel downloading; calling direct_download directly in parallel causes problems
+
+    Callback: invoked when a task finishes, so the progress bar can be updated promptly.
+    Here, done_callback advances the progress bar each time a task completes,
+    so per-task completion is visible while tasks run, instead of only appearing after all tasks finish.
+    """
+
+    global parallel_counter
+    with counter_lock:
+        parallel_counter += 1
+        progress.update(task, advance=1, description=f"[cyan]Downloading... {parallel_counter}/{total}")
+
+
 def download_hourly_func(var, time_s, time_e, lon_min=0, lon_max=359.92, lat_min=-80, lat_max=90, depth=None, level=None, store_path=None, dataset_name=None, version_name=None, num_workers=None, check=False, ftimes=1):
     """
     Description:
|
|
791
908
|
None
|
792
909
|
"""
|
793
910
|
ymdh_time_s, ymdh_time_e = str(time_s), str(time_e)
|
911
|
+
if num_workers is not None and num_workers > 1: # 如果使用多线程下载,用于进度条显示
|
912
|
+
global parallel_counter
|
913
|
+
parallel_counter = 0
|
914
|
+
counter_lock = Lock() # 创建一个锁,线程安全的计数器
|
794
915
|
if ymdh_time_s == ymdh_time_e:
|
795
916
|
prepare_url_to_download(var, lon_min, lon_max, lat_min, lat_max, ymdh_time_s, None, depth, level, store_path, dataset_name, version_name)
|
796
917
|
elif int(ymdh_time_s) < int(ymdh_time_e):
|
@@ -808,8 +929,10 @@ def download_hourly_func(var, time_s, time_e, lon_min=0, lon_max=359.92, lat_min
             # Parallel mode
             with ThreadPoolExecutor(max_workers=num_workers) as executor:
                 futures = [executor.submit(download_task, var, time_str, None, lon_min, lon_max, lat_min, lat_max, depth, level, store_path, dataset_name, version_name, check) for time_str in time_list]
-                for i, future in enumerate(futures):
-                    future.add_done_callback(lambda _: progress.update(task, advance=1, description=f"[cyan]Downloading... {i+1}/{len(time_list)}"))
+                """ for i, future in enumerate(futures):
+                    future.add_done_callback(lambda _: progress.update(task, advance=1, description=f"[cyan]Downloading... {i+1}/{len(time_list)}")) """
+                for feature in as_completed(futures):
+                    done_callback(feature, progress, task, len(time_list), counter_lock)
         else:
             new_time_list = get_time_list(ymdh_time_s, ymdh_time_e, 3 * ftimes, "hour")
             total_num = len(new_time_list)
@@ -824,8 +947,10 @@ def download_hourly_func(var, time_s, time_e, lon_min=0, lon_max=359.92, lat_min
             # Parallel mode
             with ThreadPoolExecutor(max_workers=num_workers) as executor:
                 futures = [executor.submit(download_task, var, new_time_list[i], time_list[int(min(len(time_list) - 1, int(i * ftimes + ftimes - 1)))], lon_min, lon_max, lat_min, lat_max, depth, level, store_path, dataset_name, version_name, check) for i in range(total_num)]
-                for i, future in enumerate(futures):
-                    future.add_done_callback(lambda _: progress.update(task, advance=1, description=f"[cyan]Downloading... {i+1}/{total_num}"))
+                """ for i, future in enumerate(futures):
+                    future.add_done_callback(lambda _: progress.update(task, advance=1, description=f"[cyan]Downloading... {i+1}/{total_num}")) """
+                for feature in as_completed(futures):
+                    done_callback(feature, progress, task, len(time_list), counter_lock)
     else:
         print("Please ensure the time_s is no more than time_e")
 
@@ -907,31 +1032,30 @@ def download(var, time_s, time_e=None, lon_min=0, lon_max=359.92, lat_min=-80, l
         time_e = str(time_e)
         if len(time_e) == 8:
             time_e += "21"
-
+
     global count_dict
-    count_dict = {
-
-
-
-
-
-
-    }
+    count_dict = {"success": 0, "fail": 0, "skip": 0, "no_data": 0, "total": 0, "no_data_list": []}
+
+    """ global current_platform
+    current_platform = platform.system() """
+
+    global find_proxy
+    find_proxy = False
 
     download_hourly_func(var, time_s, time_e, lon_min, lon_max, lat_min, lat_max, depth, level, store_path, dataset_name, version_name, num_workers, check, ftimes)
-
+
     count_dict["total"] = count_dict["success"] + count_dict["fail"] + count_dict["skip"] + count_dict["no_data"]
-
+
     print("[bold #ecdbfe]-" * 160)
     print(f"[bold #ff80ab]Total: {count_dict['total']}\nSuccess: {count_dict['success']}\nFail: {count_dict['fail']}\nSkip: {count_dict['skip']}")
-    if count_dict[
+    if count_dict["fail"] > 0:
         print("[bold #be5528]Please try again to download the failed data later")
-    if count_dict[
-        if count_dict[
+    if count_dict["no_data"] > 0:
+        if count_dict["no_data"] == 1:
             print(f"[bold #f90000]There is {count_dict['no_data']} data that does not exist in any dataset and version")
         else:
             print(f"[bold #f90000]These are {count_dict['no_data']} data that do not exist in any dataset and version")
-        for no_data in count_dict[
+        for no_data in count_dict["no_data_list"]:
             print(f"[bold #d81b60]{no_data}")
     print("[bold #ecdbfe]-" * 160)
 
@@ -987,8 +1111,8 @@ def how_to_use():
 
 if __name__ == "__main__":
     # help(hycom3h.download)
-    time_s, time_e = "
-    merge_name = "
+    time_s, time_e = "2023010100", "2023123121"
+    merge_name = f"{time_s}_{time_e}"  # name of the merged file
     root_path = r"G:\Data\HYCOM\3hourly"
     location_dict = {"west": 105, "east": 130, "south": 15, "north": 45}
     download_dict = {
@@ -1003,16 +1127,13 @@ if __name__ == "__main__":
         "salinity_bottom": {"simple_name": "salt_b", "download": 0},
     }
 
-    var_list = []
-    for var_name in download_dict.keys():
-        if download_dict[var_name]["download"] == 1:
-            var_list.append(var_name)
+    var_list = [var_name for var_name in download_dict.keys() if download_dict[var_name]["download"]]
 
     # set depth or level, only one can be True
    # if you wanna download all depth or level, set both False
     depth = None  # or 0-5000 meters
     level = None  # or 1-40 levels
-    num_workers =
+    num_workers = 3
 
     check = True
     ftimes = 1
oafuncs/oa_down/test.py
CHANGED
@@ -1,45 +1,45 @@
 #!/usr/bin/env python
 # coding=utf-8
-
+"""
 Author: Liu Kun && 16031215@qq.com
 Date: 2024-12-01 19:32:25
 LastEditors: Liu Kun && 16031215@qq.com
-LastEditTime: 2024-12-
+LastEditTime: 2024-12-10 11:16:36
 FilePath: \\Python\\My_Funcs\\OAFuncs\\oafuncs\\oa_down\\test.py
-Description:
+Description: 
 EditPlatform: vscode
 ComputerInfo: XPS 15 9510
 SystemInfo: Windows 11
 Python Version: 3.12
-
+"""
 
 import os
 import random
+import re
 
-txtfile = r'E:\Code\Python\My_Funcs\OAFuncs\oafuncs\oa_down\User_Agent-list.txt'
 
-
-
-#
-
-
-
-
-
-print(lines[i])
+def is_valid_user_agent(user_agent):
+    # A simple regular expression to check the format of a User Agent
+    # It checks whether the User Agent contains the common browser-info patterns
+    pattern = re.compile(
+        r"^(?:(?:Mozilla|Opera|Chrome|Safari|Edg|OPR)/[\d.]+)"
+        r"(?:\s(?:\(.*?\)))?"
+        r"(?:\s(?:Gecko|AppleWebKit|KHTML, like Gecko|Version|Edge|OPR)/[\d.]+)?"
+        r"(?:\s.*?(?:rv:|Version/|Ubuntu|Macintosh|Windows|X11|Linux|CrOS|FreeBSD|OpenBSD|NetBSD|iPhone|iPad|iPod|Android|BlackBerry|BB10|Mobile|Symbian|Windows Phone|IEMobile|Opera Mini|Opera Mobi|UCBrowser|MQQBrowser|baiduboxapp|baidubrowser|Safari|Firefox|MSIE|Trident|Edge|EdgA|Chrome|CriOS|Vivaldi|Sleipnir|Midori|ELinks|Lynx|w3m|Arora|Epiphany|Konqueror|Dillo|Netscape|SeaMonkey|K-Meleon|Camino|Iceape|Galeon|GranParadiso|Iceweasel|Firefox|Fennec|Conkeror|PaleMoon|Uzbl|QupZilla|Otter|Waterfox|Basilisk|Cyberfox|PaleMoon|GNU IceCat|GNU IceWeasel|IceCat|IceWeasel|Seamonkey|Iceape|Firefox|Epiphany|Web|Safari|Android|Mobile|BlackBerry|BB10|Tablet|Silk|Kindle|FxiOS|Focus|SamsungBrowser|browser|AppleWebKit|Puffin|DuckDuckGo|YaBrowser|Yandex|Amigo|NokiaBrowser|OviBrowser|OneBrowser|Chrome|Firefox|Safari|OPR|Coast|Mercury|Silk|Skyfire|IEMobile|Bolt|Jasmine|NativeHost|Crosswalk|TizenBrowser|SailfishBrowser|SamsungBrowser|Silk-Accelerated|UCBrowser|Quark|XiaoMi|OnePlus|Vivo|Oppo|Realme|Meizu|Lenovo|Huawei|ZTE|Alcatel|Sony|Nokia|LG|HTC|Asus|Acer|Motorola|Samsung)/[\d.]+)?$"
+    )
 
-
-
-
-
+    # Match the User Agent string against the regex
+    if pattern.match(user_agent):
+        return True
+    else:
+        return False
 
 
 def get_ua():
     current_dir = os.path.dirname(os.path.abspath(__file__))
-    ua_file_txt = os.path.join(current_dir,
+    ua_file_txt = os.path.join(current_dir, "User_Agent-list.txt")
 
-    with open(ua_file_txt,
+    with open(ua_file_txt, "r") as f:
         ua_list = f.readlines()
     # Strip newlines and blank lines
     ua_list = [line.strip() for line in ua_list if line.strip()]
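`is_valid_user_agent` accepts a string only if it starts with a known product token such as `Mozilla/5.0` and, optionally, ends with one of the recognized engine or browser tokens followed by a version. A quick check, assuming the function is defined as in the diff above:

```python
ok = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36"
print(is_valid_user_agent(ok))                  # True: Mozilla/... prefix, Safari/537.36 suffix
print(is_valid_user_agent("not a user agent"))  # False: no product/version prefix
```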
@@ -47,9 +47,6 @@ def get_ua():
     return random.choice(ua_list)
 
 
-print(get_ua())
-
-
 def get_ua_org():
     ua_list = [
         "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 OPR/26.0.1656.60",
@@ -115,13 +112,40 @@ def get_ua_org():
         "NOKIA5700/UCWEB7.0.2.37/28/999",
         "Openwave/UCWEB7.0.2.37/28/999",
         "Openwave/UCWEB7.0.2.37/28/999",
-
     ]
-    with open(newtxtfile,
+    with open(newtxtfile, "w") as f:
         for line in ua_list:
-            f.write(line +
+            f.write(line + "\n")
     # print(f'Using User-Agent: {ua}')
     ua = random.choice(ua_list)
     return ua
 
-
+
+# get_ua_org()
+
+if __name__ == "__main__":
+    txtfile = r"E:\Code\Python\My_Funcs\OAFuncs\oafuncs\oa_down\User_Agent-list.txt"
+
+    with open(txtfile, "r") as f:
+        lines = f.readlines()
+    # Strip newlines and blank lines
+    lines = [line.strip() for line in lines if line.strip()]
+    """ new_line = []
+    for i in range(len(lines)):
+        if '/' in lines[i]:
+            new_line.append(lines[i])
+        else:
+            print(lines[i]) """
+
+    new_line = []
+    for line in lines:
+        if is_valid_user_agent(line):
+            # print(line)
+            new_line.append(line)
+        else:
+            print(f"Invalid User-Agent: {line}")
+
+    newtxtfile = r"E:\Code\Python\My_Funcs\OAFuncs\oafuncs\oa_down\ua_list_new.txt"
+    with open(newtxtfile, "w") as f:
+        for line in new_line:
+            f.write(line + "\n")