oafuncs 0.0.89__py2.py3-none-any.whl → 0.0.91__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- oafuncs/data_store/OAFuncs.png +0 -0
- oafuncs/oa_data.py +9 -82
- oafuncs/oa_down/__init__.py +1 -0
- oafuncs/oa_down/hycom_3hourly.py +315 -401
- oafuncs/oa_down/idm.py +50 -0
- oafuncs/oa_down/literature.py +53 -29
- oafuncs/oa_down/user_agent.py +0 -3
- oafuncs/oa_file.py +80 -21
- oafuncs/oa_help.py +8 -1
- oafuncs/oa_nc.py +20 -18
- oafuncs/oa_tool/__init__.py +6 -6
- oafuncs/oa_tool/parallel.py +90 -0
- {oafuncs-0.0.89.dist-info → oafuncs-0.0.91.dist-info}/METADATA +1 -1
- oafuncs-0.0.91.dist-info/RECORD +28 -0
- {oafuncs-0.0.89.dist-info → oafuncs-0.0.91.dist-info}/WHEEL +1 -1
- oafuncs-0.0.89.dist-info/RECORD +0 -26
- {oafuncs-0.0.89.dist-info → oafuncs-0.0.91.dist-info}/LICENSE.txt +0 -0
- {oafuncs-0.0.89.dist-info → oafuncs-0.0.91.dist-info}/top_level.txt +0 -0
oafuncs/oa_down/hycom_3hourly.py
CHANGED
@@ -2,9 +2,9 @@
 # coding=utf-8
 """
 Author: Liu Kun && 16031215@qq.com
-Date: 2024-11-
+Date: 2024-11-02 11:07:49
 LastEditors: Liu Kun && 16031215@qq.com
-LastEditTime:
+LastEditTime: 2025-01-07 16:31:36
 FilePath: \\Python\\My_Funcs\\OAFuncs\\oafuncs\\oa_down\\hycom_3hourly.py
 Description:
 EditPlatform: vscode
@@ -16,182 +16,189 @@ Python Version: 3.12
 import datetime
 import os
 import random
+import re
 import time
 import warnings
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from pathlib import Path
 from threading import Lock
-import re

 import matplotlib.pyplot as plt
 import numpy as np
 import pandas as pd
 import requests
-from bs4 import BeautifulSoup
 from rich import print
 from rich.progress import Progress
-
+
+from oafuncs.oa_down.user_agent import get_ua
+from oafuncs.oa_file import file_size, mean_size
+from oafuncs.oa_nc import check as check_nc

 warnings.filterwarnings("ignore", category=RuntimeWarning, message="Engine '.*' loading failed:.*")

 __all__ = ["draw_time_range", "download", "how_to_use", "get_time_list"]

+
+def _get_initial_data():
+    global variable_info, data_info, var_group, single_var_group
+    # ----------------------------------------------
+    # variable
+    variable_info = {
+        "u": {"var_name": "water_u", "standard_name": "eastward_sea_water_velocity"},
+        "v": {"var_name": "water_v", "standard_name": "northward_sea_water_velocity"},
+        "temp": {"var_name": "water_temp", "standard_name": "sea_water_potential_temperature"},
+        "salt": {"var_name": "salinity", "standard_name": "sea_water_salinity"},
+        "ssh": {"var_name": "surf_el", "standard_name": "sea_surface_elevation"},
+        "u_b": {"var_name": "water_u_bottom", "standard_name": "eastward_sea_water_velocity_at_sea_floor"},
+        "v_b": {"var_name": "water_v_bottom", "standard_name": "northward_sea_water_velocity_at_sea_floor"},
+        "temp_b": {"var_name": "water_temp_bottom", "standard_name": "sea_water_potential_temperature_at_sea_floor"},
+        "salt_b": {"var_name": "salinity_bottom", "standard_name": "sea_water_salinity_at_sea_floor"},
+    }
+    # ----------------------------------------------
+    # time resolution
+    data_info = {"yearly": {}, "monthly": {}, "daily": {}, "hourly": {}}
+
+    # hourly data
+    # dataset: GLBv0.08, GLBu0.08, GLBy0.08
+    data_info["hourly"]["dataset"] = {"GLBv0.08": {}, "GLBu0.08": {}, "GLBy0.08": {}, "ESPC_D": {}}
+
+    # version
+    # version of GLBv0.08: 53.X, 56.3, 57.2, 92.8, 57.7, 92.9, 93.0
+    data_info["hourly"]["dataset"]["GLBv0.08"]["version"] = {"53.X": {}, "56.3": {}, "57.2": {}, "92.8": {}, "57.7": {}, "92.9": {}, "93.0": {}}
+    # version of GLBu0.08: 93.0
+    data_info["hourly"]["dataset"]["GLBu0.08"]["version"] = {"93.0": {}}
+    # version of GLBy0.08: 93.0
+    data_info["hourly"]["dataset"]["GLBy0.08"]["version"] = {"93.0": {}}
+    # version of ESPC_D: V02
+    data_info["hourly"]["dataset"]["ESPC_D"]["version"] = {"V02": {}}
+
+    # info details
+    # time range
+    # GLBv0.08
+    # 在网页上提交超过范围的时间,会返回该数据集实际时间范围,从而纠正下面的时间范围
+    data_info["hourly"]["dataset"]["GLBv0.08"]["version"]["53.X"]["time_range"] = {"time_start": "1994010112", "time_end": "2015123109"}
+    data_info["hourly"]["dataset"]["GLBv0.08"]["version"]["56.3"]["time_range"] = {"time_start": "2014070112", "time_end": "2016093009"}
+    data_info["hourly"]["dataset"]["GLBv0.08"]["version"]["57.2"]["time_range"] = {"time_start": "2016050112", "time_end": "2017020109"}
+    data_info["hourly"]["dataset"]["GLBv0.08"]["version"]["92.8"]["time_range"] = {"time_start": "2017020112", "time_end": "2017060109"}
+    data_info["hourly"]["dataset"]["GLBv0.08"]["version"]["57.7"]["time_range"] = {"time_start": "2017060112", "time_end": "2017100109"}
+    data_info["hourly"]["dataset"]["GLBv0.08"]["version"]["92.9"]["time_range"] = {"time_start": "2017100112", "time_end": "2018032009"}
+    data_info["hourly"]["dataset"]["GLBv0.08"]["version"]["93.0"]["time_range"] = {"time_start": "2018010112", "time_end": "2020021909"}
+    # GLBu0.08
+    data_info["hourly"]["dataset"]["GLBu0.08"]["version"]["93.0"]["time_range"] = {"time_start": "2018091912", "time_end": "2018120909"}
+    # GLBy0.08
+    data_info["hourly"]["dataset"]["GLBy0.08"]["version"]["93.0"]["time_range"] = {"time_start": "2018120412", "time_end": "2024090509"}
+    # ESPC-D
+    data_info["hourly"]["dataset"]["ESPC_D"]["version"]["V02"]["time_range"] = {"time_start": "2024081012", "time_end": "2030010100"}
+
+    # classification method
+    # year_different: the data of different years is stored in different files
+    # same_path: the data of different years is stored in the same file
+    # var_different: the data of different variables is stored in different files
+    # var_year_different: the data of different variables and years is stored in different files
+    data_info["hourly"]["dataset"]["GLBv0.08"]["version"]["53.X"]["classification"] = "year_different"
+    data_info["hourly"]["dataset"]["GLBv0.08"]["version"]["56.3"]["classification"] = "same_path"
+    data_info["hourly"]["dataset"]["GLBv0.08"]["version"]["57.2"]["classification"] = "same_path"
+    data_info["hourly"]["dataset"]["GLBv0.08"]["version"]["92.8"]["classification"] = "var_different"
+    data_info["hourly"]["dataset"]["GLBv0.08"]["version"]["57.7"]["classification"] = "same_path"
+    data_info["hourly"]["dataset"]["GLBv0.08"]["version"]["92.9"]["classification"] = "var_different"
+    data_info["hourly"]["dataset"]["GLBv0.08"]["version"]["93.0"]["classification"] = "var_different"
+    data_info["hourly"]["dataset"]["GLBu0.08"]["version"]["93.0"]["classification"] = "var_different"
+    data_info["hourly"]["dataset"]["GLBy0.08"]["version"]["93.0"]["classification"] = "var_year_different"
+    data_info["hourly"]["dataset"]["ESPC_D"]["version"]["V02"]["classification"] = "single_var_year_different"
+
+    # download info
+    # base url
+    # GLBv0.08 53.X
+    url_53x = {}
+    for y_53x in range(1994, 2016):
+        # r'https://ncss.hycom.org/thredds/ncss/GLBv0.08/expt_53.X/data/2013?'
+        url_53x[str(y_53x)] = rf"https://ncss.hycom.org/thredds/ncss/GLBv0.08/expt_53.X/data/{y_53x}?"
+    data_info["hourly"]["dataset"]["GLBv0.08"]["version"]["53.X"]["url"] = url_53x
+    # GLBv0.08 56.3
+    data_info["hourly"]["dataset"]["GLBv0.08"]["version"]["56.3"]["url"] = r"https://ncss.hycom.org/thredds/ncss/GLBv0.08/expt_56.3?"
+    # GLBv0.08 57.2
+    data_info["hourly"]["dataset"]["GLBv0.08"]["version"]["57.2"]["url"] = r"https://ncss.hycom.org/thredds/ncss/GLBv0.08/expt_57.2?"
+    # GLBv0.08 92.8
+    url_928 = {
+        "uv3z": r"https://ncss.hycom.org/thredds/ncss/GLBv0.08/expt_92.8/uv3z?",
+        "ts3z": r"https://ncss.hycom.org/thredds/ncss/GLBv0.08/expt_92.8/ts3z?",
+        "ssh": r"https://ncss.hycom.org/thredds/ncss/GLBv0.08/expt_92.8/ssh?",
+    }
+    data_info["hourly"]["dataset"]["GLBv0.08"]["version"]["92.8"]["url"] = url_928
+    # GLBv0.08 57.7
+    data_info["hourly"]["dataset"]["GLBv0.08"]["version"]["57.7"]["url"] = r"https://ncss.hycom.org/thredds/ncss/GLBv0.08/expt_57.7?"
+    # GLBv0.08 92.9
+    url_929 = {
+        "uv3z": r"https://ncss.hycom.org/thredds/ncss/GLBv0.08/expt_92.9/uv3z?",
+        "ts3z": r"https://ncss.hycom.org/thredds/ncss/GLBv0.08/expt_92.9/ts3z?",
+        "ssh": r"https://ncss.hycom.org/thredds/ncss/GLBv0.08/expt_92.9/ssh?",
+    }
+    data_info["hourly"]["dataset"]["GLBv0.08"]["version"]["92.9"]["url"] = url_929
+    # GLBv0.08 93.0
+    url_930_v = {
+        "uv3z": r"https://ncss.hycom.org/thredds/ncss/GLBv0.08/expt_93.0/uv3z?",
+        "ts3z": r"https://ncss.hycom.org/thredds/ncss/GLBv0.08/expt_93.0/ts3z?",
+        "ssh": r"https://ncss.hycom.org/thredds/ncss/GLBv0.08/expt_93.0/ssh?",
+    }
+    data_info["hourly"]["dataset"]["GLBv0.08"]["version"]["93.0"]["url"] = url_930_v
+    # GLBu0.08 93.0
+    url_930_u = {
+        "uv3z": r"https://ncss.hycom.org/thredds/ncss/GLBu0.08/expt_93.0/uv3z?",
+        "ts3z": r"https://ncss.hycom.org/thredds/ncss/GLBu0.08/expt_93.0/ts3z?",
+        "ssh": r"https://ncss.hycom.org/thredds/ncss/GLBu0.08/expt_93.0/ssh?",
+    }
+    data_info["hourly"]["dataset"]["GLBu0.08"]["version"]["93.0"]["url"] = url_930_u
+    # GLBy0.08 93.0
+    uv3z_930_y = {}
+    ts3z_930_y = {}
+    ssh_930_y = {}
+    for y_930_y in range(2018, 2025):
+        uv3z_930_y[str(y_930_y)] = rf"https://ncss.hycom.org/thredds/ncss/GLBy0.08/expt_93.0/uv3z/{y_930_y}?"
+        ts3z_930_y[str(y_930_y)] = rf"https://ncss.hycom.org/thredds/ncss/GLBy0.08/expt_93.0/ts3z/{y_930_y}?"
+        ssh_930_y[str(y_930_y)] = rf"https://ncss.hycom.org/thredds/ncss/GLBy0.08/expt_93.0/ssh/{y_930_y}?"
+    # GLBy0.08 93.0 data time range in each year: year-01-01 12:00 to year+1-01-01 09:00
+    url_930_y = {
+        "uv3z": uv3z_930_y,
+        "ts3z": ts3z_930_y,
+        "ssh": ssh_930_y,
+    }
+    data_info["hourly"]["dataset"]["GLBy0.08"]["version"]["93.0"]["url"] = url_930_y
+    # ESPC-D-V02
+    u3z_espc_d_v02_y = {}
+    v3z_espc_d_v02_y = {}
+    t3z_espc_d_v02_y = {}
+    s3z_espc_d_v02_y = {}
+    ssh_espc_d_v02_y = {}
+    for y_espc_d_v02 in range(2024, 2030):
+        u3z_espc_d_v02_y[str(y_espc_d_v02)] = rf"https://ncss.hycom.org/thredds/ncss/ESPC-D-V02/u3z/{y_espc_d_v02}?"
+        v3z_espc_d_v02_y[str(y_espc_d_v02)] = rf"https://ncss.hycom.org/thredds/ncss/ESPC-D-V02/v3z/{y_espc_d_v02}?"
+        t3z_espc_d_v02_y[str(y_espc_d_v02)] = rf"https://ncss.hycom.org/thredds/ncss/ESPC-D-V02/t3z/{y_espc_d_v02}?"
+        s3z_espc_d_v02_y[str(y_espc_d_v02)] = rf"https://ncss.hycom.org/thredds/ncss/ESPC-D-V02/s3z/{y_espc_d_v02}?"
+        ssh_espc_d_v02_y[str(y_espc_d_v02)] = rf"https://ncss.hycom.org/thredds/ncss/ESPC-D-V02/ssh/{y_espc_d_v02}?"
+    url_espc_d_v02_y = {
+        "u3z": u3z_espc_d_v02_y,
+        "v3z": v3z_espc_d_v02_y,
+        "t3z": t3z_espc_d_v02_y,
+        "s3z": s3z_espc_d_v02_y,
+        "ssh": ssh_espc_d_v02_y,
+    }
+    data_info["hourly"]["dataset"]["ESPC_D"]["version"]["V02"]["url"] = url_espc_d_v02_y
+    # ----------------------------------------------
+    var_group = {
+        "uv3z": ["u", "v", "u_b", "v_b"],
+        "ts3z": ["temp", "salt", "temp_b", "salt_b"],
+        "ssh": ["ssh"],
+    }
+    # ----------------------------------------------
+    single_var_group = {
+        "u3z": ["u"],
+        "v3z": ["v"],
+        "t3z": ["temp"],
+        "s3z": ["salt"],
+        "ssh": ["ssh"],
+    }
+
+    return variable_info, data_info, var_group, single_var_group


 def draw_time_range(pic_save_folder=None):
@@ -299,14 +306,14 @@ def get_time_list(time_s, time_e, delta, interval_type="hour"):
     return dt_list


-def
+def _transform_time(time_str):
     # old_time = '2023080203'
     # time_new = '2023-08-02T03%3A00%3A00Z'
     time_new = f"{time_str[:4]}-{time_str[4:6]}-{time_str[6:8]}T{time_str[8:10]}%3A00%3A00Z"
     return time_new


-def
+def _get_query_dict(var, lon_min, lon_max, lat_min, lat_max, time_str_ymdh, time_str_end=None, mode="single_depth", depth=None, level_num=None):
     query_dict = {
         "var": variable_info[var]["var_name"],
         "north": lat_max,
@@ -325,11 +332,11 @@ def get_query_dict(var, lon_min, lon_max, lat_min, lat_max, time_str_ymdh, time_
     }

     if time_str_end is not None:
-        query_dict["time_start"] =
-        query_dict["time_end"] =
+        query_dict["time_start"] = _transform_time(time_str_ymdh)
+        query_dict["time_end"] = _transform_time(time_str_end)
         query_dict["timeStride"] = 1
     else:
-        query_dict["time"] =
+        query_dict["time"] = _transform_time(time_str_ymdh)

     def get_nearest_level_index(depth):
         level_depth = [0.0, 2.0, 4.0, 6.0, 8.0, 10.0, 12.0, 15.0, 20.0, 25.0, 30.0, 35.0, 40.0, 45.0, 50.0, 60.0, 70.0, 80.0, 90.0, 100.0, 125.0, 150.0, 200.0, 250.0, 300.0, 350.0, 400.0, 500.0, 600.0, 700.0, 800.0, 900.0, 1000.0, 1250.0, 1500.0, 2000.0, 2500.0, 3000.0, 4000.0, 5000]
@@ -354,7 +361,7 @@ def get_query_dict(var, lon_min, lon_max, lat_min, lat_max, time_str_ymdh, time_
     return query_dict


-def
+def _check_time_in_dataset_and_version(time_input, time_end=None):
     # 判断是处理单个时间点还是时间范围
     is_single_time = time_end is None

@@ -411,8 +418,8 @@ def check_time_in_dataset_and_version(time_input, time_end=None):
     if is_single_time:
         return True
     else:
-        base_url_s =
-        base_url_e =
+        base_url_s = _get_base_url(d_list[0], v_list[0], "u", str(time_start))
+        base_url_e = _get_base_url(d_list[0], v_list[0], "u", str(time_end))
         if base_url_s == base_url_e:
             return True
         else:
@@ -423,7 +430,7 @@ def check_time_in_dataset_and_version(time_input, time_end=None):
     return False


-def
+def _ensure_time_in_specific_dataset_and_version(dataset_name, version_name, time_input, time_end=None):
     # 根据时间长度补全时间格式
     if len(str(time_input)) == 8:
         time_input = str(time_input) + "00"
@@ -462,7 +469,7 @@ def ensure_time_in_specific_dataset_and_version(dataset_name, version_name, time
     return False


-def
+def _direct_choose_dataset_and_version(time_input, time_end=None):
     # 假设 data_info 是一个字典,包含了数据集和版本的信息
     # 示例结构:data_info['hourly']['dataset'][dataset_name]['version'][version_name]['time_range']

@@ -501,7 +508,7 @@ def direct_choose_dataset_and_version(time_input, time_end=None):
     return dataset_name_out, version_name_out


-def
+def _get_base_url(dataset_name, version_name, var, ymdh_str):
     year_str = int(ymdh_str[:4])
     url_dict = data_info["hourly"]["dataset"][dataset_name]["version"][version_name]["url"]
     classification_method = data_info["hourly"]["dataset"][dataset_name]["version"][version_name]["classification"]
@@ -542,199 +549,109 @@ def get_base_url(dataset_name, version_name, var, ymdh_str):
     return base_url


-def
-    base_url =
+def _get_submit_url(dataset_name, version_name, var, ymdh_str, query_dict):
+    base_url = _get_base_url(dataset_name, version_name, var, ymdh_str)
     if isinstance(query_dict["var"], str):
         query_dict["var"] = [query_dict["var"]]
     target_url = base_url + "&".join(f"var={var}" for var in query_dict["var"]) + "&" + "&".join(f"{key}={value}" for key, value in query_dict.items() if key != "var")
     return target_url


-def
+def _clear_existing_file(file_full_path):
     if os.path.exists(file_full_path):
         os.remove(file_full_path)
         print(f"{file_full_path} has been removed")


-def
-    """
-    description:
-    param {*} parent_path: The parent path where the files are located
-    param {*} fname: The file name pattern to search for
-    param {*} mode: 'path' to return the full path of the files, 'file' to return only the file names
-    return {*} A list of file paths or file names if files are found, None otherwise
-    """
-
-    def natural_sort_key(s):
-        """生成一个用于自然排序的键"""
-        return [int(text) if text.isdigit() else text.lower() for text in re.split("([0-9]+)", s)]
-
-    # 将parent_path和fname结合成完整的搜索路径
-    search_pattern = os.path.join(str(parent_path), fname)
-
-    # 使用glob模块查找所有匹配的文件
-    matched_files = glob.glob(search_pattern)
-
-    # 如果没有找到任何文件,则返回False
-    if not matched_files:
-        return None
-
-    # 在find_files函数中替换natsorted调用
-    matched_files = sorted(matched_files, key=natural_sort_key)
-
-    # 根据mode参数决定返回的内容
-    if mode == "file":
-        # 只返回文件名
-        result = [os.path.basename(file) for file in matched_files]
-    else:  # 默认为'path'
-        # 返回文件的绝对路径
-        result = [os.path.abspath(file) for file in matched_files]
-
-    return result
-
-
-def file_size(file_path, unit="KB"):
-    # 检查文件是否存在
-    if not os.path.exists(file_path):
-        return "文件不存在"
-
-    # 获取文件大小(字节)
-    file_size = os.path.getsize(file_path)
-
-    # 单位转换字典
-    unit_dict = {"PB": 1024**5, "TB": 1024**4, "GB": 1024**3, "MB": 1024**2, "KB": 1024}
-
-    # 检查传入的单位是否合法
-    if unit not in unit_dict:
-        return "单位不合法,请选择PB、TB、GB、MB、KB中的一个"
-
-    # 转换文件大小到指定单位
-    converted_size = file_size / unit_dict[unit]
-
-    return converted_size
-
-
-# ** 计算文件夹下指定相关文件的平均大小
-def mean_size(parent_path, fname):
-    flist = find_file(parent_path, fname)
-    if flist:
-        size_list = [file_size(f) for f in flist if file_size(f) != 0]
-    else:
-        size_list = []
-    if size_list:
-        min_size, max_size = min(size_list), max(size_list)
-        mean_size = sum(size_list) / len(size_list)
-    else:
-        mean_size, min_size, max_size = 0, 0, 0
-    return mean_size, min_size, max_size
-
-
-def check_existing_file(file_full_path, min_size):
+def _check_existing_file(file_full_path, avg_size):
     if os.path.exists(file_full_path):
         print(f"[bold #FFA54F]{file_full_path} exists")
         fsize = file_size(file_full_path)
-        return False
-    else:
+        delta_size_ratio = (fsize - avg_size) / avg_size
+        if abs(delta_size_ratio) > 0.025:
+            if check_nc(file_full_path):
+                # print(f"File size is abnormal but can be opened normally, file size: {fsize:.2f} KB")
                 return True
-        return False
+            else:
+                print(f"File size is abnormal and cannot be opened, {file_full_path}: {fsize:.2f} KB")
+                return False
         else:
             return True
     else:
-        # print(f'{file_full_path} does not exist')
         return False


-def
-    with open(ua_file_txt, "r") as f:
-        ua_list = f.readlines()
-    # 去掉换行符和空行
-    ua_list = [line.strip() for line in ua_list if line.strip()]
-
-    # if current_platform == 'Linux':
-    #     ua_list = [line for line in ua_list if 'Linux' in line]
-
-    return random.choice(ua_list)
+def _get_mean_size30(store_path, same_file):
+    if same_file not in fsize_dict.keys():
+        # print(f'Same file name: {same_file}')
+        fsize_dict[same_file] = {"size": 0, "count": 0}

-            ip_list.append(ip.strip())
-    choose_ip = random.choice(ip_list)
-    proxies = {"http": "http://" + choose_ip, "https": "https://" + choose_ip}
-    # print(f'Using proxy: {proxies}')
-    return proxies
-
-
-def scrape_and_categorize_proxies(choose_protocol="http"):
-    url = "https://topproxylinks.com/"
-    # 发送HTTP请求获取网页内容
-    response = requests.get(url)
-    # 使用BeautifulSoup解析网页
-    soup = BeautifulSoup(response.text, "html.parser")
-
-    # 初始化字典来存储不同协议的代理
-    proxies_dict = {"http": [], "socks4": [], "socks5": []}
+    if fsize_dict[same_file]["count"] < 30 or fsize_dict[same_file]["size"] == 0:
+        # 更新30次文件最小值,后续认为可以代表所有文件,不再更新占用时间
+        fsize_mean = mean_size(store_path, same_file, max_num=30)
+        set_min_size = fsize_mean * 0.95
+        fsize_dict[same_file]["size"] = set_min_size
+        fsize_dict[same_file]["count"] += 1
+    else:
+        set_min_size = fsize_dict[same_file]["size"]
+    return set_min_size

-    # 查找表格中的所有行
-    tbody = soup.find("tbody")

+def _get_mean_size_move(same_file, current_file):
+    # 获取锁
+    with fsize_dict_lock:  # 全局锁,确保同一时间只能有一个线程访问
+        # 初始化字典中的值,如果文件不在字典中
+        if same_file not in fsize_dict.keys():
+            fsize_dict[same_file] = {"size_list": [], "mean_size": 1.0}

-            proxies_dict[protocol].append(proxy)
+        tolerance_ratio = 0.025  # 容忍的阈值比例
+        current_file_size = file_size(current_file)

+        # 如果列表不为空,则计算平均值,否则保持为1
+        if fsize_dict[same_file]["size_list"]:
+            fsize_dict[same_file]["mean_size"] = sum(fsize_dict[same_file]["size_list"]) / len(fsize_dict[same_file]["size_list"])
+            fsize_dict[same_file]["mean_size"] = max(fsize_dict[same_file]["mean_size"], 1.0)
+        else:
+            fsize_dict[same_file]["mean_size"] = 1.0

+        size_difference_ratio = (current_file_size - fsize_dict[same_file]["mean_size"]) / fsize_dict[same_file]["mean_size"]

+        if abs(size_difference_ratio) > tolerance_ratio:
+            if check_nc(current_file):
+                # print(f"File size is abnormal but can be opened normally, file size: {current_file_size:.2f} KB")
+                # 文件可以正常打开,但大小异常,保留当前文件大小
+                fsize_dict[same_file]["size_list"] = [current_file_size]
+                fsize_dict[same_file]["mean_size"] = current_file_size
+            else:
+                _clear_existing_file(current_file)
+                print(f"File size is abnormal, may need to be downloaded again, file size: {current_file_size:.2f} KB")
+        else:
+            # 添加当前文件大小到列表中,并更新计数
+            fsize_dict[same_file]["size_list"].append(current_file_size)

-    choose_ip = random.choice(ip_list)
-    proxies = {"http": f"http://{choose_ip}", "https": f"http://{choose_ip}"}
-    print(f"Using proxy: {proxies}")
-    return proxies
+        # 返回调整后的平均值,这里根据您的需求,返回的是添加新值之前的平均值
+        return fsize_dict[same_file]["mean_size"]


-def
+def _download_file(target_url, store_path, file_name, check=False):
     # Check if the file exists
     fname = Path(store_path) / file_name
     file_name_split = file_name.split("_")
+    file_name_split = file_name_split[:-1]
+    # same_file = f"{file_name_split[0]}_{file_name_split[1]}*nc"
+    same_file = "_".join(file_name_split) + "*nc"
+
     if check:
-        if
+        if same_file not in fsize_dict.keys():  # 对第一个文件单独进行检查,因为没有大小可以对比
+            check_nc(fname,if_delete=True)
+
+        # set_min_size = _get_mean_size30(store_path, same_file) # 原方案,只30次取平均值;若遇变化,无法判断
+        get_mean_size = _get_mean_size_move(same_file, fname)
+
+        if _check_existing_file(fname, get_mean_size):
             count_dict["skip"] += 1
             return
-
+        _clear_existing_file(fname)

     # -----------------------------------------------
     print(f"[bold #f0f6d0]Requesting {file_name}...")
@@ -744,13 +661,11 @@ def download_file(target_url, store_path, file_name, check=False):
     request_times = 0

     def calculate_wait_time(time_str, target_url):
-        import re
-
         # 定义正则表达式,匹配YYYYMMDDHH格式的时间
         time_pattern = r"\d{10}"

         # 定义两个字符串
-        # str1 = '
+        # str1 = 'HYCOM_water_u_2018010100-2018010112.nc'
         # str2 = 'HYCOM_water_u_2018010100.nc'

         # 使用正则表达式查找时间
@@ -796,13 +711,8 @@ def download_file(target_url, store_path, file_name, check=False):
         with open(filename, 'wb') as f:
             f.write(response.content) """

-                proxies = get_proxy()
-                response = s.get(target_url, headers=headers, proxies=proxies, stream=True, timeout=random.randint(5, max_timeout))
-            else:
-                response = s.get(target_url, headers=headers, stream=True, timeout=random.randint(5, max_timeout))  # 启用流式传输
+            response = s.get(target_url, headers=headers, stream=True, timeout=random.randint(5, max_timeout))  # 启用流式传输
             response.raise_for_status()  # 如果请求返回的不是200,将抛出HTTPError异常
-
             # 保存文件
             with open(fname, "wb") as f:
                 print(f"[bold #96cbd7]Downloading {file_name}...")
@@ -834,7 +744,7 @@ def download_file(target_url, store_path, file_name, check=False):
             request_times += 1


-def
+def _check_hour_is_valid(ymdh_str):
     # hour should be 00, 03, 06, 09, 12, 15, 18, 21
     hh = int(str(ymdh_str[-2:]))
     if hh in [0, 3, 6, 9, 12, 15, 18, 21]:
@@ -843,9 +753,9 @@ def check_hour_is_valid(ymdh_str):
         return False


-def
+def _check_dataset_version(dataset_name, version_name, download_time, download_time_end=None):
     if dataset_name is not None and version_name is not None:
-        just_ensure =
+        just_ensure = _ensure_time_in_specific_dataset_and_version(dataset_name, version_name, download_time, download_time_end)
         if just_ensure:
             return dataset_name, version_name
         else:
@@ -858,7 +768,7 @@ def check_dataset_version(dataset_name, version_name, download_time, download_ti
         download_time_str = download_time_str + "00"

     # 检查小时是否有效(如果需要的话)
-    if download_time_end is None and not
+    if download_time_end is None and not _check_hour_is_valid(download_time_str):
         print("Please ensure the hour is 00, 03, 06, 09, 12, 15, 18, 21")
         raise ValueError("The hour is invalid")

@@ -866,18 +776,18 @@
     if download_time_end is not None:
         if len(str(download_time_end)) == 8:
             download_time_end = str(download_time_end) + "21"
-        have_data =
+        have_data = _check_time_in_dataset_and_version(download_time_str, download_time_end)
         if have_data:
-            return
+            return _direct_choose_dataset_and_version(download_time_str, download_time_end)
     else:
-        have_data =
+        have_data = _check_time_in_dataset_and_version(download_time_str)
         if have_data:
-            return
+            return _direct_choose_dataset_and_version(download_time_str)

     return None, None


-def
+def _get_submit_url_var(var, depth, level_num, lon_min, lon_max, lat_min, lat_max, dataset_name, version_name, download_time, download_time_end=None):
     # year_str = str(download_time)[:4]
     ymdh_str = str(download_time)
     if depth is not None and level_num is not None:
@@ -893,19 +803,19 @@ def get_submit_url_var(var, depth, level_num, lon_min, lon_max, lat_min, lat_max
     else:
         # print("Full depth or full level data will be downloaded...")
         which_mode = "full"
-    query_dict =
-    submit_url =
+    query_dict = _get_query_dict(var, lon_min, lon_max, lat_min, lat_max, download_time, download_time_end, which_mode, depth, level_num)
+    submit_url = _get_submit_url(dataset_name, version_name, var, ymdh_str, query_dict)
     return submit_url


-def
+def _prepare_url_to_download(var, lon_min=0, lon_max=359.92, lat_min=-80, lat_max=90, download_time="2024083100", download_time_end=None, depth=None, level_num=None, store_path=None, dataset_name=None, version_name=None, check=False):
     print("[bold #ecdbfe]-" * 160)
     download_time = str(download_time)
     if download_time_end is not None:
         download_time_end = str(download_time_end)
-        dataset_name, version_name =
+        dataset_name, version_name = _check_dataset_version(dataset_name, version_name, download_time, download_time_end)
     else:
-        dataset_name, version_name =
+        dataset_name, version_name = _check_dataset_version(dataset_name, version_name, download_time)
     if dataset_name is None and version_name is None:
         count_dict["no_data"] += 1
         if download_time_end is not None:
@@ -920,11 +830,11 @@ def prepare_url_to_download(var, lon_min=0, lon_max=359.92, lat_min=-80, lat_max
     if isinstance(var, list):
         if len(var) == 1:
             var = var[0]
-            submit_url =
+            submit_url = _get_submit_url_var(var, depth, level_num, lon_min, lon_max, lat_min, lat_max, dataset_name, version_name, download_time, download_time_end)
             file_name = f"HYCOM_{variable_info[var]['var_name']}_{download_time}.nc"
             if download_time_end is not None:
-                file_name = f"HYCOM_{variable_info[var]['var_name']}_{download_time}
+                file_name = f"HYCOM_{variable_info[var]['var_name']}_{download_time}-{download_time_end}.nc"  # 这里时间不能用下划线,不然后续处理查找同一变量文件会出问题
+            _download_file(submit_url, store_path, file_name, check)
         else:
             if download_time < "2024081012":
                 varlist = [_ for _ in var]
@@ -937,7 +847,7 @@ def prepare_url_to_download(var, lon_min=0, lon_max=359.92, lat_min=-80, lat_max
                         continue

                     var = current_group[0]
-                    submit_url =
+                    submit_url = _get_submit_url_var(var, depth, level_num, lon_min, lon_max, lat_min, lat_max, dataset_name, version_name, download_time, download_time_end)
                     file_name = f"HYCOM_{variable_info[var]['var_name']}_{download_time}.nc"
                     old_str = f'var={variable_info[var]["var_name"]}'
                     new_str = f'var={variable_info[var]["var_name"]}'
@@ -948,18 +858,18 @@ def prepare_url_to_download(var, lon_min=0, lon_max=359.92, lat_min=-80, lat_max
                     # file_name = f'HYCOM_{'-'.join([variable_info[v]["var_name"] for v in current_group])}_{download_time}.nc'
                     file_name = f"HYCOM_{key}_{download_time}.nc"
                     if download_time_end is not None:
-                        file_name = f"HYCOM_{key}_{download_time}
+                        file_name = f"HYCOM_{key}_{download_time}-{download_time_end}.nc"  # 这里时间不能用下划线,不然后续处理查找同一变量文件会出问题
+                    _download_file(submit_url, store_path, file_name, check)
             else:
                 for v in var:
-                    submit_url =
+                    submit_url = _get_submit_url_var(v, depth, level_num, lon_min, lon_max, lat_min, lat_max, dataset_name, version_name, download_time, download_time_end)
                     file_name = f"HYCOM_{variable_info[v]['var_name']}_{download_time}.nc"
                     if download_time_end is not None:
-                        file_name = f"HYCOM_{variable_info[v]['var_name']}_{download_time}
+                        file_name = f"HYCOM_{variable_info[v]['var_name']}_{download_time}-{download_time_end}.nc"
+                    _download_file(submit_url, store_path, file_name, check)


-def
+def _convert_full_name_to_short_name(full_name):
     for var, info in variable_info.items():
         if full_name == info["var_name"] or full_name == info["standard_name"] or full_name == var:
             return var
@@ -969,7 +879,7 @@ def convert_full_name_to_short_name(full_name):
     return False


-def
+def _download_task(var, time_str, time_str_end, lon_min, lon_max, lat_min, lat_max, depth, level, store_path, dataset_name, version_name, check):
     """
     # 并行下载任务
     # 这个函数是为了并行下载而设置的,是必须的,直接调用direct_download并行下载会出问题
@@ -980,10 +890,10 @@ def download_task(var, time_str, time_str_end, lon_min, lon_max, lat_min, lat_ma
     因此,即使多个任务同时执行,也不会出现数据交互错乱的问题。
     """

-
+    _prepare_url_to_download(var, lon_min, lon_max, lat_min, lat_max, time_str, time_str_end, depth, level, store_path, dataset_name, version_name, check)


-def
+def _done_callback(future, progress, task, total, counter_lock):
     """
     # 并行下载任务的回调函数
     # 这个函数是为了并行下载而设置的,是必须的,直接调用direct_download并行下载会出问题
@@ -999,7 +909,7 @@ def done_callback(future, progress, task, total, counter_lock):
         progress.update(task, advance=1, description=f"[cyan]Downloading... {parallel_counter}/{total}")


-def
+def _download_hourly_func(var, time_s, time_e, lon_min=0, lon_max=359.92, lat_min=-80, lat_max=90, depth=None, level=None, store_path=None, dataset_name=None, version_name=None, num_workers=None, check=False, ftimes=1):
     """
     Description:
     Download the data of single time or a series of time
@@ -1028,7 +938,7 @@ def download_hourly_func(var, time_s, time_e, lon_min=0, lon_max=359.92, lat_min
     parallel_counter = 0
     counter_lock = Lock()  # 创建一个锁,线程安全的计数器
     if ymdh_time_s == ymdh_time_e:
-
+        _prepare_url_to_download(var, lon_min, lon_max, lat_min, lat_max, ymdh_time_s, None, depth, level, store_path, dataset_name, version_name, check)
     elif int(ymdh_time_s) < int(ymdh_time_e):
         print("Downloading a series of files...")
         time_list = get_time_list(ymdh_time_s, ymdh_time_e, 3, "hour")
@@ -1038,16 +948,16 @@ def download_hourly_func(var, time_s, time_e, lon_min=0, lon_max=359.92, lat_min
             if num_workers is None or num_workers <= 1:
                 # 串行方式
                 for i, time_str in enumerate(time_list):
-
+                    _prepare_url_to_download(var, lon_min, lon_max, lat_min, lat_max, time_str, None, depth, level, store_path, dataset_name, version_name, check)
                     progress.update(task, advance=1, description=f"[cyan]Downloading... {i+1}/{len(time_list)}")
             else:
                 # 并行方式
                 with ThreadPoolExecutor(max_workers=num_workers) as executor:
-                    futures = [executor.submit(
+                    futures = [executor.submit(_download_task, var, time_str, None, lon_min, lon_max, lat_min, lat_max, depth, level, store_path, dataset_name, version_name, check) for time_str in time_list]
                     """ for i, future in enumerate(futures):
                         future.add_done_callback(lambda _: progress.update(task, advance=1, description=f"[cyan]Downloading... {i+1}/{len(time_list)}")) """
                     for feature in as_completed(futures):
-
+                        _done_callback(feature, progress, task, len(time_list), counter_lock)
         else:
             new_time_list = get_time_list(ymdh_time_s, ymdh_time_e, 3 * ftimes, "hour")
             total_num = len(new_time_list)
@@ -1056,16 +966,16 @@ def download_hourly_func(var, time_s, time_e, lon_min=0, lon_max=359.92, lat_min
                 for i, time_str in enumerate(new_time_list):
                     time_str_end_index = int(min(len(time_list) - 1, int(i * ftimes + ftimes - 1)))
                     time_str_end = time_list[time_str_end_index]
-
+                    _prepare_url_to_download(var, lon_min, lon_max, lat_min, lat_max, time_str, time_str_end, depth, level, store_path, dataset_name, version_name, check)
                     progress.update(task, advance=1, description=f"[cyan]Downloading... {i+1}/{total_num}")
             else:
                 # 并行方式
                 with ThreadPoolExecutor(max_workers=num_workers) as executor:
-                    futures = [executor.submit(
+                    futures = [executor.submit(_download_task, var, new_time_list[i], time_list[int(min(len(time_list) - 1, int(i * ftimes + ftimes - 1)))], lon_min, lon_max, lat_min, lat_max, depth, level, store_path, dataset_name, version_name, check) for i in range(total_num)]
                     """ for i, future in enumerate(futures):
                         future.add_done_callback(lambda _: progress.update(task, advance=1, description=f"[cyan]Downloading... {i+1}/{total_num}")) """
                     for feature in as_completed(futures):
-
+                        _done_callback(feature, progress, task, len(time_list), counter_lock)
         else:
             print("Please ensure the time_s is no more than time_e")

@@ -1073,28 +983,30 @@ def download_hourly_func(var, time_s, time_e, lon_min=0, lon_max=359.92, lat_min
 def download(var, time_s, time_e=None, lon_min=0, lon_max=359.92, lat_min=-80, lat_max=90, depth=None, level=None, store_path=None, dataset_name=None, version_name=None, num_workers=None, check=False, ftimes=1):
     """
     Description:
+        Download the data of single time or a series of time

     Parameters:
+        var: str or list, the variable name, such as 'u', 'v', 'temp', 'salt', 'ssh', 'u_b', 'v_b', 'temp_b', 'salt_b' or 'water_u', 'water_v', 'water_temp', 'salinity', 'surf_el', 'water_u_bottom', 'water_v_bottom', 'water_temp_bottom', 'salinity_bottom'
+        time_s: str, the start time, such as '2024110100' or '20241101', if add hour, the hour should be 00, 03, 06, 09, 12, 15, 18, 21
+        time_e: str, the end time, such as '2024110221' or '20241102', if add hour, the hour should be 00, 03, 06, 09, 12, 15, 18, 21; default is None, if not set, the data of single time will be downloaded; or same as time_s, the data of single time will be downloaded
+        lon_min: float, the minimum longitude, default is 0
+        lon_max: float, the maximum longitude, default is 359.92
+        lat_min: float, the minimum latitude, default is -80
+        lat_max: float, the maximum latitude, default is 90
+        depth: float, the depth, default is None, if you wanna get the data of single depth, you can set the depth, suggest to set the depth in [0, 5000]
+        level: int, the level number, default is None, if you wanna get the data of single level, you can set the level, suggest to set the level in [1, 40]
+        store_path: str, the path to store the data, default is None, if not set, the data will be stored in the current working directory
+        dataset_name: str, the dataset name, default is None, example: 'GLBv0.08', 'GLBu0.08', 'GLBy0.08', if not set, the dataset will be chosen according to the download_time
+        version_name: str, the version name, default is None, example: '53.X', '56.3', if not set, the version will be chosen according to the download_time
+        num_workers: int, the number of workers, default is None, if not set, the number of workers will be 1; suggest not to set the number of workers too large
+        check: bool, whether to check the existing file, default is False, if set to True, the existing file will be checked and not downloaded again; else, the existing file will be covered
+        ftimes: int, the number of time in one file, default is 1, if set to 1, the data of single time will be downloaded; the maximum is 8, if set to 8, the data of 8 times will be downloaded in one file

     Returns:
+        None
     """
+    _get_initial_data()
+
     # 打印信息并处理数据集和版本名称
     if dataset_name is None and version_name is None:
         print("The dataset_name and version_name are None, so the dataset and version will be chosen according to the download_time.\nIf there is more than one dataset and version in the time range, the first one will be chosen.")
@@ -1111,11 +1023,11 @@ def download(var, time_s, time_e=None, lon_min=0, lon_max=359.92, lat_min=-80, l

     if isinstance(var, list):
         if len(var) == 1:
-            var =
+            var = _convert_full_name_to_short_name(var[0])
         else:
-            var = [
+            var = [_convert_full_name_to_short_name(v) for v in var]
     elif isinstance(var, str):
-        var =
+        var = _convert_full_name_to_short_name(var)
     else:
         raise ValueError("The var is invalid")
     if var is False:
@@ -1136,8 +1048,8 @@ def download(var, time_s, time_e=None, lon_min=0, lon_max=359.92, lat_min=-80, l
         os.makedirs(str(store_path), exist_ok=True)

     if num_workers is not None:
-        num_workers = max(min(num_workers, 10), 1)
+        num_workers = max(min(num_workers, 10), 1)  # 暂时不限制最大值,再检查的时候可以多开一些线程
+        # num_workers = int(max(num_workers, 1))
     time_s = str(time_s)
     if len(time_s) == 8:
         time_s += "00"
@@ -1154,10 +1066,13 @@ def download(var, time_s, time_e=None, lon_min=0, lon_max=359.92, lat_min=-80, l
     """ global current_platform
     current_platform = platform.system() """

-    global
+    global fsize_dict
+    fsize_dict = {}
+
+    global fsize_dict_lock
+    fsize_dict_lock = Lock()

-
+    _download_hourly_func(var, time_s, time_e, lon_min, lon_max, lat_min, lat_max, depth, level, store_path, dataset_name, version_name, num_workers, check, ftimes)

     count_dict["total"] = count_dict["success"] + count_dict["fail"] + count_dict["skip"] + count_dict["no_data"]

@@ -1225,8 +1140,7 @@ def how_to_use():


 if __name__ == "__main__":
-
-    time_s, time_e = "2024081012", "2024081115"
+    time_s, time_e = "2024101012", "2024101018"
     merge_name = f"{time_s}_{time_e}"  # 合并后的文件名
     root_path = r"G:\Data\HYCOM\3hourly_test"
     location_dict = {"west": 105, "east": 130, "south": 15, "north": 45}
@@ -1256,7 +1170,7 @@ if __name__ == "__main__":
     download_switch, single_var = True, False
     combine_switch = False
     copy_switch, copy_dir = False, r"G:\Data\HYCOM\3hourly"
-
+
     # draw_time_range(pic_save_folder=r'I:\Delete')

     if download_switch:
@@ -1288,6 +1202,6 @@ if __name__ == "__main__":
         merge_path_name = Path(root_path)/f'HYCOM_{fname}_{merge_name}.nc'
         if combine_switch:
             # 这里的var_name必须是官方变量名,不能再是简写了
-
+            merge(file_list, var_name, 'time', merge_path_name)
         if copy_switch:
             copy_file(merge_path_name, copy_dir) """
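
For orientation, the reworked download() entry point documented in the diff above can be driven roughly as follows. This is a minimal usage sketch based only on the signature and docstring visible in this diff; the variables, time window, region, worker count, and output directory are illustrative values chosen here, not package defaults.

# Minimal usage sketch for oafuncs.oa_down.hycom_3hourly.download (illustrative values).
from oafuncs.oa_down import hycom_3hourly

# Times are YYYYMMDDHH strings; the hour must be one of 00, 03, 06, 09, 12, 15, 18, 21.
hycom_3hourly.download(
    var=["u", "v", "ssh"],        # short names; full names such as "water_u" are also accepted
    time_s="2024101000",          # start of the requested series
    time_e="2024101021",          # end of the series (omit for a single time step)
    lon_min=105, lon_max=130,     # longitude window
    lat_min=15, lat_max=45,       # latitude window
    store_path=r"./hycom_data",   # hypothetical output directory
    num_workers=3,                # parallel downloads; the code above caps this at 10
    check=True,                   # skip existing files that pass the size/NetCDF check
)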