oafuncs 0.0.90__py2.py3-none-any.whl → 0.0.91__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
oafuncs/data_store/OAFuncs.png CHANGED
Binary file (contents not shown; per the RECORD entries below, the image grew from 3,261,697 to 3,332,020 bytes)
oafuncs/oa_data.py CHANGED
@@ -15,13 +15,14 @@ Python Version: 3.11
 
 import itertools
 import multiprocessing as mp
-from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor
+from concurrent.futures import ThreadPoolExecutor
 
 import numpy as np
-from scipy.interpolate import griddata
 from rich import print
+from scipy.interpolate import griddata
 
-__all__ = ["interp_2d","ParallelExecutor"]
+
+__all__ = ["interp_2d"]
 
 
 def interp_2d(target_x, target_y, origin_x, origin_y, data, method="linear", parallel=True):
@@ -91,70 +92,7 @@ def interp_2d(target_x, target_y, origin_x, origin_y, data, method="linear", parallel=True):
     return np.array(interpolated_data)
 
 
-class ParallelExecutor:
-    """
-    A general-purpose parallel execution class supporting both multiprocessing and multithreading.
-
-    Usage:
-    1. Choose a mode when creating an instance:
-       - mode="process" uses multiprocessing (suited to CPU-bound tasks).
-       - mode="thread" uses multithreading (suited to IO-bound tasks).
-
-    2. Call the run method:
-       - func: the function to execute in parallel.
-       - param_list: the parameter list; each element is a tuple of arguments passed to func.
-
-    Examples:
-    # Example 1: compute squares
-    def compute_square(x):
-        return x * x
-
-    params = [(i,) for i in range(10)]
-    executor = ParallelExecutor(mode="process", max_workers=4)
-    results = executor.run(compute_square, params)
-    print("Results:", results)
-
-    # Example 2: sum two numbers
-    def compute_sum(a, b):
-        return a + b
-
-    params = [(1, 2), (3, 4), (5, 6)]
-    executor = ParallelExecutor(mode="thread", max_workers=2)
-    results = executor.run(compute_sum, params)
-    print("Results:", results)
-
-    Parameters:
-    mode (str): parallel mode; "process" for multiprocessing, "thread" for multithreading.
-    max_workers (int): maximum number of parallel workers; defaults to CPU count minus 2.
-    """
-
-    def __init__(self, mode="process", max_workers=mp.cpu_count() - 2):
-        self.mode = mode
-        self.max_workers = max_workers
-        self.executor = ProcessPoolExecutor if mode == "process" else ThreadPoolExecutor
-
-    def run(self, func, param_list):
-        """
-        Run the given function in parallel, keeping the results in the same order as the input parameters.
 
-        Parameters:
-        func (callable): the function to execute in parallel.
-        param_list (list): the parameter list; each element is a tuple of arguments passed to func.
-
-        Returns:
-        results (list): results in input order.
-        """
-        results = [None] * len(param_list)  # Preallocate the results list
-
-        with self.executor(max_workers=self.max_workers) as executor:
-            # Submit the tasks and record their indices
-            future_to_index = {executor.submit(func, *params): idx for idx, params in enumerate(param_list)}
-
-            for future in future_to_index:
-                idx = future_to_index[future]  # Original index
-                results[idx] = future.result()  # Store the result at the matching position
-
-        return results
 
 
 # ---------------------------------------------------------------------------------- not used below ----------------------------------------------------------------------------------
@@ -203,7 +141,7 @@ def interp_2d_20241213(target_x, target_y, origin_x, origin_y, data, method="lin
         for i in range(dims[0]):
             dt = griddata(origin_points, np.ravel(data[i, :, :]), target_points, method=method)
             interpolated_data.append(np.reshape(dt, target_y.shape))
-            print(f"Interpolating {i+1}/{dims[0]}...")
+            print(f"Interpolating {i + 1}/{dims[0]}...")
         interpolated_data = np.array(interpolated_data)
     elif len_dims == 4:
         interpolated_data = []
@@ -212,7 +150,7 @@ def interp_2d_20241213(target_x, target_y, origin_x, origin_y, data, method="lin
             for j in range(dims[1]):
                 dt = griddata(origin_points, np.ravel(data[i, j, :, :]), target_points, method=method)
                 interpolated_data[i].append(np.reshape(dt, target_y.shape))
-                print(f"\rInterpolating {i*dims[1]+j+1}/{dims[0]*dims[1]}...", end="")
+                print(f"\rInterpolating {i * dims[1] + j + 1}/{dims[0] * dims[1]}...", end="")
         print("\n")
         interpolated_data = np.array(interpolated_data)
 
@@ -270,7 +208,7 @@ def interp_2d_parallel_20241213(target_x, target_y, origin_x, origin_y, data, me
 
     # Interpolate using multiple threads
     with ThreadPoolExecutor(max_workers=mp.cpu_count() - 2) as executor:
-        print(f"Using {mp.cpu_count()-2} threads...")
+        print(f"Using {mp.cpu_count() - 2} threads...")
         if len_dims == 2:
             interpolated_data = list(executor.map(interp_single2d, [target_y], [target_x], [origin_y], [origin_x], [data], [method]))
         elif len_dims == 3:
@@ -296,23 +234,12 @@ def interp_2d_parallel_20241213(target_x, target_y, origin_x, origin_y, data, me
     return interpolated_data
 
 
-def _test_sum(a,b):
-    return a+b
+def _test_sum(a, b):
+    return a + b
 
 
 if __name__ == "__main__":
-    # Parameter list: each element is a tuple
-    params_list = [(1, 2), (3, 4), (5, 6), (7, 8), (9, 10)]
-
-    # Create the parallel executor
-    executor = ParallelExecutor()
-
-    # Run in parallel
-    results = executor.run(_test_sum, params_list)
 
-    # Verify the result order
-    print("Params:", params_list)
-    print("Results:", results)
     pass
     """ import time
 
oafuncs/oa_down/__init__.py CHANGED
@@ -19,3 +19,4 @@ Python Version: 3.11
 from .hycom_3hourly import *
 from .literature import *
 from .user_agent import *
+from .idm import *
oafuncs/oa_down/hycom_3hourly.py CHANGED
@@ -32,13 +32,14 @@ from rich.progress import Progress
 
 from oafuncs.oa_down.user_agent import get_ua
 from oafuncs.oa_file import file_size, mean_size
+from oafuncs.oa_nc import check as check_nc
 
 warnings.filterwarnings("ignore", category=RuntimeWarning, message="Engine '.*' loading failed:.*")
 
 __all__ = ["draw_time_range", "download", "how_to_use", "get_time_list"]
 
 
-def get_initial_data():
+def _get_initial_data():
     global variable_info, data_info, var_group, single_var_group
     # ----------------------------------------------
     # variable
@@ -305,14 +306,14 @@ def get_time_list(time_s, time_e, delta, interval_type="hour"):
     return dt_list
 
 
-def transform_time(time_str):
+def _transform_time(time_str):
     # old_time = '2023080203'
     # time_new = '2023-08-02T03%3A00%3A00Z'
     time_new = f"{time_str[:4]}-{time_str[4:6]}-{time_str[6:8]}T{time_str[8:10]}%3A00%3A00Z"
     return time_new
 
 
-def get_query_dict(var, lon_min, lon_max, lat_min, lat_max, time_str_ymdh, time_str_end=None, mode="single_depth", depth=None, level_num=None):
+def _get_query_dict(var, lon_min, lon_max, lat_min, lat_max, time_str_ymdh, time_str_end=None, mode="single_depth", depth=None, level_num=None):
     query_dict = {
         "var": variable_info[var]["var_name"],
         "north": lat_max,
@@ -331,11 +332,11 @@ def get_query_dict(var, lon_min, lon_max, lat_min, lat_max, time_str_ymdh, time_str_end=None, mode="single_depth", depth=None, level_num=None):
     }
 
     if time_str_end is not None:
-        query_dict["time_start"] = transform_time(time_str_ymdh)
-        query_dict["time_end"] = transform_time(time_str_end)
+        query_dict["time_start"] = _transform_time(time_str_ymdh)
+        query_dict["time_end"] = _transform_time(time_str_end)
         query_dict["timeStride"] = 1
     else:
-        query_dict["time"] = transform_time(time_str_ymdh)
+        query_dict["time"] = _transform_time(time_str_ymdh)
 
     def get_nearest_level_index(depth):
         level_depth = [0.0, 2.0, 4.0, 6.0, 8.0, 10.0, 12.0, 15.0, 20.0, 25.0, 30.0, 35.0, 40.0, 45.0, 50.0, 60.0, 70.0, 80.0, 90.0, 100.0, 125.0, 150.0, 200.0, 250.0, 300.0, 350.0, 400.0, 500.0, 600.0, 700.0, 800.0, 900.0, 1000.0, 1250.0, 1500.0, 2000.0, 2500.0, 3000.0, 4000.0, 5000]
@@ -360,7 +361,7 @@ def get_query_dict(var, lon_min, lon_max, lat_min, lat_max, time_str_ymdh, time_str_end=None, mode="single_depth", depth=None, level_num=None):
     return query_dict
 
 
-def check_time_in_dataset_and_version(time_input, time_end=None):
+def _check_time_in_dataset_and_version(time_input, time_end=None):
     # Decide whether we are handling a single time point or a time range
     is_single_time = time_end is None
 
@@ -417,8 +418,8 @@ def check_time_in_dataset_and_version(time_input, time_end=None):
     if is_single_time:
         return True
     else:
-        base_url_s = get_base_url(d_list[0], v_list[0], "u", str(time_start))
-        base_url_e = get_base_url(d_list[0], v_list[0], "u", str(time_end))
+        base_url_s = _get_base_url(d_list[0], v_list[0], "u", str(time_start))
+        base_url_e = _get_base_url(d_list[0], v_list[0], "u", str(time_end))
         if base_url_s == base_url_e:
             return True
         else:
@@ -429,7 +430,7 @@ def check_time_in_dataset_and_version(time_input, time_end=None):
     return False
 
 
-def ensure_time_in_specific_dataset_and_version(dataset_name, version_name, time_input, time_end=None):
+def _ensure_time_in_specific_dataset_and_version(dataset_name, version_name, time_input, time_end=None):
     # Pad the time format according to its length
     if len(str(time_input)) == 8:
         time_input = str(time_input) + "00"
@@ -468,7 +469,7 @@ def ensure_time_in_specific_dataset_and_version(dataset_name, version_name, time_input, time_end=None):
     return False
 
 
-def direct_choose_dataset_and_version(time_input, time_end=None):
+def _direct_choose_dataset_and_version(time_input, time_end=None):
    # Assume data_info is a dict holding dataset and version information
    # Example structure: data_info['hourly']['dataset'][dataset_name]['version'][version_name]['time_range']
 
@@ -507,7 +508,7 @@ def direct_choose_dataset_and_version(time_input, time_end=None):
     return dataset_name_out, version_name_out
 
 
-def get_base_url(dataset_name, version_name, var, ymdh_str):
+def _get_base_url(dataset_name, version_name, var, ymdh_str):
     year_str = int(ymdh_str[:4])
     url_dict = data_info["hourly"]["dataset"][dataset_name]["version"][version_name]["url"]
     classification_method = data_info["hourly"]["dataset"][dataset_name]["version"][version_name]["classification"]
@@ -548,67 +549,109 @@ def get_base_url(dataset_name, version_name, var, ymdh_str):
     return base_url
 
 
-def get_submit_url(dataset_name, version_name, var, ymdh_str, query_dict):
-    base_url = get_base_url(dataset_name, version_name, var, ymdh_str)
+def _get_submit_url(dataset_name, version_name, var, ymdh_str, query_dict):
+    base_url = _get_base_url(dataset_name, version_name, var, ymdh_str)
     if isinstance(query_dict["var"], str):
         query_dict["var"] = [query_dict["var"]]
     target_url = base_url + "&".join(f"var={var}" for var in query_dict["var"]) + "&" + "&".join(f"{key}={value}" for key, value in query_dict.items() if key != "var")
     return target_url
 
 
-def clear_existing_file(file_full_path):
+def _clear_existing_file(file_full_path):
     if os.path.exists(file_full_path):
         os.remove(file_full_path)
         print(f"{file_full_path} has been removed")
 
 
-def check_existing_file(file_full_path, min_size):
+def _check_existing_file(file_full_path, avg_size):
     if os.path.exists(file_full_path):
         print(f"[bold #FFA54F]{file_full_path} exists")
         fsize = file_size(file_full_path)
-        if min_size:
-            if fsize < min_size:
-                print(f"[bold #FFA54F]{file_full_path} ({fsize:.2f} KB) may be incomplete")
-                # clear_existing_file(file_full_path)
-                return False
-            else:
+        delta_size_ratio = (fsize - avg_size) / avg_size
+        if abs(delta_size_ratio) > 0.025:
+            if check_nc(file_full_path):
+                # print(f"File size is abnormal but can be opened normally, file size: {fsize:.2f} KB")
                 return True
-        if fsize < 5:
-            print(f"[bold #FFA54F]{file_full_path} ({fsize:.2f} KB) may be incomplete")
-            # clear_existing_file(file_full_path)
-            return False
+            else:
+                print(f"File size is abnormal and cannot be opened, {file_full_path}: {fsize:.2f} KB")
+                return False
         else:
             return True
     else:
-        # print(f'{file_full_path} does not exist')
         return False
 
 
-def download_file(target_url, store_path, file_name, check=False):
-    # Check if the file exists
-    fname = Path(store_path) / file_name
-    file_name_split = file_name.split("_")
-    file_name_split = file_name_split[:-1]
-    # same_file = f"{file_name_split[0]}_{file_name_split[1]}*nc"
-    same_file = "_".join(file_name_split) + "*nc"
-
+def _get_mean_size30(store_path, same_file):
     if same_file not in fsize_dict.keys():
-        # print(f'Same file name: {same_file}')
-        fsize_dict[same_file] = {"size": 0, "count": 0}
+        # print(f'Same file name: {same_file}')
+        fsize_dict[same_file] = {"size": 0, "count": 0}
 
     if fsize_dict[same_file]["count"] < 30 or fsize_dict[same_file]["size"] == 0:
         # Update the minimum size over the first 30 files; after that it is taken as representative of all files and no longer updated, to save time
         fsize_mean = mean_size(store_path, same_file, max_num=30)
-        set_min_size = fsize_mean * 0.8
+        set_min_size = fsize_mean * 0.95
         fsize_dict[same_file]["size"] = set_min_size
         fsize_dict[same_file]["count"] += 1
     else:
         set_min_size = fsize_dict[same_file]["size"]
+    return set_min_size
+
+
+def _get_mean_size_move(same_file, current_file):
+    # Acquire the lock
+    with fsize_dict_lock:  # Global lock: only one thread may access fsize_dict at a time
+        # Initialize the dict entry if the file is not in it yet
+        if same_file not in fsize_dict.keys():
+            fsize_dict[same_file] = {"size_list": [], "mean_size": 1.0}
+
+        tolerance_ratio = 0.025  # Tolerated deviation ratio
+        current_file_size = file_size(current_file)
+
+        # If the list is not empty, compute the mean; otherwise keep it at 1
+        if fsize_dict[same_file]["size_list"]:
+            fsize_dict[same_file]["mean_size"] = sum(fsize_dict[same_file]["size_list"]) / len(fsize_dict[same_file]["size_list"])
+            fsize_dict[same_file]["mean_size"] = max(fsize_dict[same_file]["mean_size"], 1.0)
+        else:
+            fsize_dict[same_file]["mean_size"] = 1.0
+
+        size_difference_ratio = (current_file_size - fsize_dict[same_file]["mean_size"]) / fsize_dict[same_file]["mean_size"]
+
+        if abs(size_difference_ratio) > tolerance_ratio:
+            if check_nc(current_file):
+                # print(f"File size is abnormal but can be opened normally, file size: {current_file_size:.2f} KB")
+                # The file opens fine even though its size is unusual, so keep the current file size
+                fsize_dict[same_file]["size_list"] = [current_file_size]
+                fsize_dict[same_file]["mean_size"] = current_file_size
+            else:
+                _clear_existing_file(current_file)
+                print(f"File size is abnormal, may need to be downloaded again, file size: {current_file_size:.2f} KB")
+        else:
+            # Append the current file size to the list and update the count
+            fsize_dict[same_file]["size_list"].append(current_file_size)
+
+        # Return the adjusted mean; by design this is the mean from before the new value was added
+        return fsize_dict[same_file]["mean_size"]
+
+
+def _download_file(target_url, store_path, file_name, check=False):
+    # Check if the file exists
+    fname = Path(store_path) / file_name
+    file_name_split = file_name.split("_")
+    file_name_split = file_name_split[:-1]
+    # same_file = f"{file_name_split[0]}_{file_name_split[1]}*nc"
+    same_file = "_".join(file_name_split) + "*nc"
+
     if check:
-        if check_existing_file(fname, set_min_size):
+        if same_file not in fsize_dict.keys():  # Check the first file on its own, since there is no size to compare against yet
+            check_nc(fname, if_delete=True)
+
+        # set_min_size = _get_mean_size30(store_path, same_file)  # Original scheme: average over the first 30 files only; if sizes change it cannot adapt
+        get_mean_size = _get_mean_size_move(same_file, fname)
+
+        if _check_existing_file(fname, get_mean_size):
             count_dict["skip"] += 1
             return
-    clear_existing_file(fname)
+    _clear_existing_file(fname)
 
     # -----------------------------------------------
     print(f"[bold #f0f6d0]Requesting {file_name}...")
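The replacement scheme above is worth spelling out: instead of flagging any file below 80% (now 95%) of a one-off 30-file mean, _get_mean_size_move keeps a running mean of sibling file sizes under a global lock, flags any file deviating from that mean by more than 2.5%, and only deletes a flagged file if check_nc cannot open it. A standalone sketch of the tolerance logic (hypothetical names; the real code shares fsize_dict across threads):

    # Sketch of the sliding-mean size check, without the locking and dict bookkeeping.
    def is_size_suspect(size_kb, size_history, tolerance_ratio=0.025):
        # Mean of previously seen sizes, floored at 1.0 as in _get_mean_size_move
        mean_size = max(sum(size_history) / len(size_history), 1.0) if size_history else 1.0
        return abs(size_kb - mean_size) / mean_size > tolerance_ratio

    history = [1024.0, 1030.2, 1018.7]
    print(is_size_suspect(1025.0, history))  # False: within 2.5% of the mean
    print(is_size_suspect(512.0, history))   # True: likely truncated, so check_nc decides its fate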
@@ -701,7 +744,7 @@ def download_file(target_url, store_path, file_name, check=False):
             request_times += 1
 
 
-def check_hour_is_valid(ymdh_str):
+def _check_hour_is_valid(ymdh_str):
     # hour should be 00, 03, 06, 09, 12, 15, 18, 21
     hh = int(str(ymdh_str[-2:]))
     if hh in [0, 3, 6, 9, 12, 15, 18, 21]:
@@ -710,9 +753,9 @@ def check_hour_is_valid(ymdh_str):
         return False
 
 
-def check_dataset_version(dataset_name, version_name, download_time, download_time_end=None):
+def _check_dataset_version(dataset_name, version_name, download_time, download_time_end=None):
     if dataset_name is not None and version_name is not None:
-        just_ensure = ensure_time_in_specific_dataset_and_version(dataset_name, version_name, download_time, download_time_end)
+        just_ensure = _ensure_time_in_specific_dataset_and_version(dataset_name, version_name, download_time, download_time_end)
         if just_ensure:
             return dataset_name, version_name
         else:
@@ -725,7 +768,7 @@ def check_dataset_version(dataset_name, version_name, download_time, download_time_end=None):
         download_time_str = download_time_str + "00"
 
     # Validate the hour if needed
-    if download_time_end is None and not check_hour_is_valid(download_time_str):
+    if download_time_end is None and not _check_hour_is_valid(download_time_str):
         print("Please ensure the hour is 00, 03, 06, 09, 12, 15, 18, 21")
         raise ValueError("The hour is invalid")
 
@@ -733,18 +776,18 @@ def check_dataset_version(dataset_name, version_name, download_time, download_time_end=None):
     if download_time_end is not None:
         if len(str(download_time_end)) == 8:
             download_time_end = str(download_time_end) + "21"
-        have_data = check_time_in_dataset_and_version(download_time_str, download_time_end)
+        have_data = _check_time_in_dataset_and_version(download_time_str, download_time_end)
         if have_data:
-            return direct_choose_dataset_and_version(download_time_str, download_time_end)
+            return _direct_choose_dataset_and_version(download_time_str, download_time_end)
     else:
-        have_data = check_time_in_dataset_and_version(download_time_str)
+        have_data = _check_time_in_dataset_and_version(download_time_str)
         if have_data:
-            return direct_choose_dataset_and_version(download_time_str)
+            return _direct_choose_dataset_and_version(download_time_str)
 
     return None, None
 
 
-def get_submit_url_var(var, depth, level_num, lon_min, lon_max, lat_min, lat_max, dataset_name, version_name, download_time, download_time_end=None):
+def _get_submit_url_var(var, depth, level_num, lon_min, lon_max, lat_min, lat_max, dataset_name, version_name, download_time, download_time_end=None):
     # year_str = str(download_time)[:4]
     ymdh_str = str(download_time)
     if depth is not None and level_num is not None:
@@ -760,19 +803,19 @@ def get_submit_url_var(var, depth, level_num, lon_min, lon_max, lat_min, lat_max, dataset_name, version_name, download_time, download_time_end=None):
     else:
         # print("Full depth or full level data will be downloaded...")
         which_mode = "full"
-    query_dict = get_query_dict(var, lon_min, lon_max, lat_min, lat_max, download_time, download_time_end, which_mode, depth, level_num)
-    submit_url = get_submit_url(dataset_name, version_name, var, ymdh_str, query_dict)
+    query_dict = _get_query_dict(var, lon_min, lon_max, lat_min, lat_max, download_time, download_time_end, which_mode, depth, level_num)
+    submit_url = _get_submit_url(dataset_name, version_name, var, ymdh_str, query_dict)
     return submit_url
 
 
-def prepare_url_to_download(var, lon_min=0, lon_max=359.92, lat_min=-80, lat_max=90, download_time="2024083100", download_time_end=None, depth=None, level_num=None, store_path=None, dataset_name=None, version_name=None, check=False):
+def _prepare_url_to_download(var, lon_min=0, lon_max=359.92, lat_min=-80, lat_max=90, download_time="2024083100", download_time_end=None, depth=None, level_num=None, store_path=None, dataset_name=None, version_name=None, check=False):
     print("[bold #ecdbfe]-" * 160)
     download_time = str(download_time)
     if download_time_end is not None:
         download_time_end = str(download_time_end)
-        dataset_name, version_name = check_dataset_version(dataset_name, version_name, download_time, download_time_end)
+        dataset_name, version_name = _check_dataset_version(dataset_name, version_name, download_time, download_time_end)
     else:
-        dataset_name, version_name = check_dataset_version(dataset_name, version_name, download_time)
+        dataset_name, version_name = _check_dataset_version(dataset_name, version_name, download_time)
     if dataset_name is None and version_name is None:
         count_dict["no_data"] += 1
         if download_time_end is not None:
@@ -787,11 +830,11 @@ def prepare_url_to_download(var, lon_min=0, lon_max=359.92, lat_min=-80, lat_max=90, download_time="2024083100", download_time_end=None, depth=None, level_num=None, store_path=None, dataset_name=None, version_name=None, check=False):
     if isinstance(var, list):
         if len(var) == 1:
             var = var[0]
-            submit_url = get_submit_url_var(var, depth, level_num, lon_min, lon_max, lat_min, lat_max, dataset_name, version_name, download_time, download_time_end)
+            submit_url = _get_submit_url_var(var, depth, level_num, lon_min, lon_max, lat_min, lat_max, dataset_name, version_name, download_time, download_time_end)
             file_name = f"HYCOM_{variable_info[var]['var_name']}_{download_time}.nc"
             if download_time_end is not None:
                 file_name = f"HYCOM_{variable_info[var]['var_name']}_{download_time}-{download_time_end}.nc"  # No underscore in the time here, or later lookups for files of the same variable will break
-            download_file(submit_url, store_path, file_name, check)
+            _download_file(submit_url, store_path, file_name, check)
         else:
             if download_time < "2024081012":
                 varlist = [_ for _ in var]
@@ -804,7 +847,7 @@ def prepare_url_to_download(var, lon_min=0, lon_max=359.92, lat_min=-80, lat_max=90, download_time="2024083100", download_time_end=None, depth=None, level_num=None, store_path=None, dataset_name=None, version_name=None, check=False):
                         continue
 
                     var = current_group[0]
-                    submit_url = get_submit_url_var(var, depth, level_num, lon_min, lon_max, lat_min, lat_max, dataset_name, version_name, download_time, download_time_end)
+                    submit_url = _get_submit_url_var(var, depth, level_num, lon_min, lon_max, lat_min, lat_max, dataset_name, version_name, download_time, download_time_end)
                     file_name = f"HYCOM_{variable_info[var]['var_name']}_{download_time}.nc"
                     old_str = f'var={variable_info[var]["var_name"]}'
                     new_str = f'var={variable_info[var]["var_name"]}'
@@ -816,17 +859,17 @@ def prepare_url_to_download(var, lon_min=0, lon_max=359.92, lat_min=-80, lat_max=90, download_time="2024083100", download_time_end=None, depth=None, level_num=None, store_path=None, dataset_name=None, version_name=None, check=False):
                     file_name = f"HYCOM_{key}_{download_time}.nc"
                     if download_time_end is not None:
                         file_name = f"HYCOM_{key}_{download_time}-{download_time_end}.nc"  # No underscore in the time here, or later lookups for files of the same variable will break
-                    download_file(submit_url, store_path, file_name, check)
+                    _download_file(submit_url, store_path, file_name, check)
             else:
                 for v in var:
-                    submit_url = get_submit_url_var(v, depth, level_num, lon_min, lon_max, lat_min, lat_max, dataset_name, version_name, download_time, download_time_end)
+                    submit_url = _get_submit_url_var(v, depth, level_num, lon_min, lon_max, lat_min, lat_max, dataset_name, version_name, download_time, download_time_end)
                     file_name = f"HYCOM_{variable_info[v]['var_name']}_{download_time}.nc"
                     if download_time_end is not None:
                         file_name = f"HYCOM_{variable_info[v]['var_name']}_{download_time}-{download_time_end}.nc"
-                    download_file(submit_url, store_path, file_name, check)
+                    _download_file(submit_url, store_path, file_name, check)
 
 
-def convert_full_name_to_short_name(full_name):
+def _convert_full_name_to_short_name(full_name):
     for var, info in variable_info.items():
         if full_name == info["var_name"] or full_name == info["standard_name"] or full_name == var:
             return var
@@ -836,7 +879,7 @@ def convert_full_name_to_short_name(full_name):
     return False
 
 
-def download_task(var, time_str, time_str_end, lon_min, lon_max, lat_min, lat_max, depth, level, store_path, dataset_name, version_name, check):
+def _download_task(var, time_str, time_str_end, lon_min, lon_max, lat_min, lat_max, depth, level, store_path, dataset_name, version_name, check):
     """
     # Parallel download task
     # This function exists for parallel downloading and is required; calling direct_download in parallel directly causes problems
@@ -847,10 +890,10 @@ def download_task(var, time_str, time_str_end, lon_min, lon_max, lat_min, lat_max, depth, level, store_path, dataset_name, version_name, check):
    Therefore, even if multiple tasks run at the same time, the data will not get mixed up.
    """
 
-    prepare_url_to_download(var, lon_min, lon_max, lat_min, lat_max, time_str, time_str_end, depth, level, store_path, dataset_name, version_name, check)
+    _prepare_url_to_download(var, lon_min, lon_max, lat_min, lat_max, time_str, time_str_end, depth, level, store_path, dataset_name, version_name, check)
 
 
-def done_callback(future, progress, task, total, counter_lock):
+def _done_callback(future, progress, task, total, counter_lock):
     """
     # Callback function for the parallel download task
     # This function exists for parallel downloading and is required; calling direct_download in parallel directly causes problems
@@ -866,7 +909,7 @@ def done_callback(future, progress, task, total, counter_lock):
         progress.update(task, advance=1, description=f"[cyan]Downloading... {parallel_counter}/{total}")
 
 
-def download_hourly_func(var, time_s, time_e, lon_min=0, lon_max=359.92, lat_min=-80, lat_max=90, depth=None, level=None, store_path=None, dataset_name=None, version_name=None, num_workers=None, check=False, ftimes=1):
+def _download_hourly_func(var, time_s, time_e, lon_min=0, lon_max=359.92, lat_min=-80, lat_max=90, depth=None, level=None, store_path=None, dataset_name=None, version_name=None, num_workers=None, check=False, ftimes=1):
     """
     Description:
         Download the data of single time or a series of time
@@ -895,7 +938,7 @@ def download_hourly_func(var, time_s, time_e, lon_min=0, lon_max=359.92, lat_min=-80, lat_max=90, depth=None, level=None, store_path=None, dataset_name=None, version_name=None, num_workers=None, check=False, ftimes=1):
     parallel_counter = 0
     counter_lock = Lock()  # Create a lock for a thread-safe counter
     if ymdh_time_s == ymdh_time_e:
-        prepare_url_to_download(var, lon_min, lon_max, lat_min, lat_max, ymdh_time_s, None, depth, level, store_path, dataset_name, version_name, check)
+        _prepare_url_to_download(var, lon_min, lon_max, lat_min, lat_max, ymdh_time_s, None, depth, level, store_path, dataset_name, version_name, check)
     elif int(ymdh_time_s) < int(ymdh_time_e):
         print("Downloading a series of files...")
         time_list = get_time_list(ymdh_time_s, ymdh_time_e, 3, "hour")
@@ -905,16 +948,16 @@ def download_hourly_func(var, time_s, time_e, lon_min=0, lon_max=359.92, lat_min=-80, lat_max=90, depth=None, level=None, store_path=None, dataset_name=None, version_name=None, num_workers=None, check=False, ftimes=1):
             if num_workers is None or num_workers <= 1:
                 # Serial
                 for i, time_str in enumerate(time_list):
-                    prepare_url_to_download(var, lon_min, lon_max, lat_min, lat_max, time_str, None, depth, level, store_path, dataset_name, version_name, check)
+                    _prepare_url_to_download(var, lon_min, lon_max, lat_min, lat_max, time_str, None, depth, level, store_path, dataset_name, version_name, check)
                     progress.update(task, advance=1, description=f"[cyan]Downloading... {i+1}/{len(time_list)}")
             else:
                 # Parallel
                 with ThreadPoolExecutor(max_workers=num_workers) as executor:
-                    futures = [executor.submit(download_task, var, time_str, None, lon_min, lon_max, lat_min, lat_max, depth, level, store_path, dataset_name, version_name, check) for time_str in time_list]
+                    futures = [executor.submit(_download_task, var, time_str, None, lon_min, lon_max, lat_min, lat_max, depth, level, store_path, dataset_name, version_name, check) for time_str in time_list]
                     """ for i, future in enumerate(futures):
                         future.add_done_callback(lambda _: progress.update(task, advance=1, description=f"[cyan]Downloading... {i+1}/{len(time_list)}")) """
                     for feature in as_completed(futures):
-                        done_callback(feature, progress, task, len(time_list), counter_lock)
+                        _done_callback(feature, progress, task, len(time_list), counter_lock)
         else:
             new_time_list = get_time_list(ymdh_time_s, ymdh_time_e, 3 * ftimes, "hour")
             total_num = len(new_time_list)
@@ -923,16 +966,16 @@ def download_hourly_func(var, time_s, time_e, lon_min=0, lon_max=359.92, lat_min=-80, lat_max=90, depth=None, level=None, store_path=None, dataset_name=None, version_name=None, num_workers=None, check=False, ftimes=1):
                 for i, time_str in enumerate(new_time_list):
                     time_str_end_index = int(min(len(time_list) - 1, int(i * ftimes + ftimes - 1)))
                     time_str_end = time_list[time_str_end_index]
-                    prepare_url_to_download(var, lon_min, lon_max, lat_min, lat_max, time_str, time_str_end, depth, level, store_path, dataset_name, version_name, check)
+                    _prepare_url_to_download(var, lon_min, lon_max, lat_min, lat_max, time_str, time_str_end, depth, level, store_path, dataset_name, version_name, check)
                     progress.update(task, advance=1, description=f"[cyan]Downloading... {i+1}/{total_num}")
             else:
                 # Parallel
                 with ThreadPoolExecutor(max_workers=num_workers) as executor:
-                    futures = [executor.submit(download_task, var, new_time_list[i], time_list[int(min(len(time_list) - 1, int(i * ftimes + ftimes - 1)))], lon_min, lon_max, lat_min, lat_max, depth, level, store_path, dataset_name, version_name, check) for i in range(total_num)]
+                    futures = [executor.submit(_download_task, var, new_time_list[i], time_list[int(min(len(time_list) - 1, int(i * ftimes + ftimes - 1)))], lon_min, lon_max, lat_min, lat_max, depth, level, store_path, dataset_name, version_name, check) for i in range(total_num)]
                     """ for i, future in enumerate(futures):
                         future.add_done_callback(lambda _: progress.update(task, advance=1, description=f"[cyan]Downloading... {i+1}/{total_num}")) """
                     for feature in as_completed(futures):
-                        done_callback(feature, progress, task, len(time_list), counter_lock)
+                        _done_callback(feature, progress, task, len(time_list), counter_lock)
     else:
         print("Please ensure the time_s is no more than time_e")
 
@@ -962,7 +1005,7 @@ def download(var, time_s, time_e=None, lon_min=0, lon_max=359.92, lat_min=-80, l
     Returns:
         None
     """
-    get_initial_data()
+    _get_initial_data()
 
     # Print info and resolve the dataset and version names
     if dataset_name is None and version_name is None:
@@ -980,11 +1023,11 @@ def download(var, time_s, time_e=None, lon_min=0, lon_max=359.92, lat_min=-80, l
 
     if isinstance(var, list):
         if len(var) == 1:
-            var = convert_full_name_to_short_name(var[0])
+            var = _convert_full_name_to_short_name(var[0])
         else:
-            var = [convert_full_name_to_short_name(v) for v in var]
+            var = [_convert_full_name_to_short_name(v) for v in var]
     elif isinstance(var, str):
-        var = convert_full_name_to_short_name(var)
+        var = _convert_full_name_to_short_name(var)
     else:
         raise ValueError("The var is invalid")
     if var is False:
@@ -1005,8 +1048,8 @@ def download(var, time_s, time_e=None, lon_min=0, lon_max=359.92, lat_min=-80, l
     os.makedirs(str(store_path), exist_ok=True)
 
     if num_workers is not None:
-        num_workers = max(min(num_workers, 10), 1)
-
+        num_workers = max(min(num_workers, 10), 1)  # Do not cap the maximum for now; a few more threads can be opened for re-checking
+        # num_workers = int(max(num_workers, 1))
     time_s = str(time_s)
     if len(time_s) == 8:
         time_s += "00"
@@ -1025,8 +1068,11 @@ def download(var, time_s, time_e=None, lon_min=0, lon_max=359.92, lat_min=-80, l
 
     global fsize_dict
     fsize_dict = {}
+
+    global fsize_dict_lock
+    fsize_dict_lock = Lock()
 
-    download_hourly_func(var, time_s, time_e, lon_min, lon_max, lat_min, lat_max, depth, level, store_path, dataset_name, version_name, num_workers, check, ftimes)
+    _download_hourly_func(var, time_s, time_e, lon_min, lon_max, lat_min, lat_max, depth, level, store_path, dataset_name, version_name, num_workers, check, ftimes)
 
     count_dict["total"] = count_dict["success"] + count_dict["fail"] + count_dict["skip"] + count_dict["no_data"]
 
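With every helper now underscore-prefixed, download() is effectively the single public entry point of hycom_3hourly. An illustrative call sketched from the signature visible in the hunk headers (parameter values below are placeholders, not taken from the diff):

    # Illustrative only; paths and times are placeholders.
    from oafuncs.oa_down.hycom_3hourly import download

    download(
        var="u",               # short variable name; full names go through _convert_full_name_to_short_name
        time_s="2024083100",   # yyyymmddhh; the hour must be 00, 03, ..., 21
        time_e="2024083121",
        store_path=r"F:\Data\HYCOM",
        num_workers=4,         # clamped to the 1..10 range seen above
        check=True,            # enables the sliding-mean size check per file
    )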
oafuncs/oa_down/idm.py ADDED
@@ -0,0 +1,50 @@
+#!/usr/bin/env python
+# coding=utf-8
+"""
+Author: Liu Kun && 16031215@qq.com
+Date: 2025-01-11 16:19:12
+LastEditors: Liu Kun && 16031215@qq.com
+LastEditTime: 2025-01-11 16:25:47
+FilePath: \\Python\\My_Funcs\\OAFuncs\\oafuncs\\oa_down\\idm.py
+Description:
+EditPlatform: vscode
+ComputerInfo: XPS 15 9510
+SystemInfo: Windows 11
+Python Version: 3.12
+"""
+
+import datetime
+import os
+from subprocess import call
+
+from rich import print
+
+__all__ = ["downloader"]
+
+
+def downloader(task_url, folder_path, file_name, idm_engine=r"D:\Programs\Internet Download Manager\IDMan.exe"):
+    """
+    Description:
+        Use IDM to download files.
+    Parameter:
+        task_url: str
+            The download link of the file.
+        folder_path: str
+            The path of the folder where the file is saved.
+        file_name: str
+            The name of the file to be saved.
+        idm_engine: str
+            The path of the IDM engine. Note: "IDMan.exe"
+    Return:
+        None
+    Example:
+        downloader("https://www.test.com/data.nc", r"E:\Data", "test.nc", r"D:\Programs\Internet Download Manager\IDMan.exe")
+    """
+    os.makedirs(folder_path, exist_ok=True)
+    # Add the task to the queue
+    call([idm_engine, "/d", task_url, "/p", folder_path, "/f", file_name, "/a"])
+    # Start the task queue
+    call([idm_engine, "/s"])
+    # print(f"IDM downloader: the download task for {file_name} has been added to the queue...")
+    print("[purple]-" * 50 + f"\n{datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n" + "[purple]-" * 50)
+    print(f"[green]IDM Downloader: {file_name} download task has been added to the queue...[/green]")
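The switches passed to call() are IDM's documented command-line interface: /d gives the download URL, /p the local folder, /f the local file name, /a adds the task to the queue without confirmation, and /s starts the queue. Since each downloader() call both queues the task and starts the queue, a batch download is just a loop (URLs and paths below are placeholders):

    # Illustrative batch use of the new downloader(); nothing here is from the diff.
    from oafuncs.oa_down.idm import downloader

    urls = [
        "https://www.test.com/data_2024083100.nc",
        "https://www.test.com/data_2024083103.nc",
    ]
    for url in urls:
        downloader(url, r"E:\Data", url.rsplit("/", 1)[-1])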
oafuncs/oa_down/literature.py CHANGED
@@ -64,7 +64,21 @@ class _Downloader:
             r"https://sci-hub.se",
             r"https://sci-hub.ren",
             r"https://sci-hub.st",
-            r"https://sci-hub.ru",
+            r"https://sci-hub.ru",  # The most reliable of these sites
+            # ------------------------------------- The sites below have not been verified
+            r"https://sci-hub.wf",
+            r"https://sci-hub.yt",
+            r"https://sci-hub.ee",
+            r"https://sci-hub.cat",
+            r"https://sci-hub.in",
+            r"https://www.pismin.com",
+            r"https://sci-hub.vkif.top",
+            r"https://www.bothonce.com",
+            r"https://sci-hub.et-fine.com",
+            r"https://sci-hub.hkvisa.net",
+            # r"https://sci-hub.3800808.com",  # This one can only save files manually
+            r"https://sci-hub.zidianzhan.net",
+            r"https://sci-hub.usualwant.com",
         ]
         self.base_url = None
         self.url = None
@@ -86,33 +100,37 @@ class _Downloader:
         self.try_times = 0
 
     def get_pdf_url(self):
-        print("[bold #E6E6FA]-" * 100)
+        print("[bold #E6E6FA]-" * 120)
         print(f"DOI: {self.doi}")
         print(f"Requesting: {self.url}...")
-        response = requests.get(self.url, headers=self.headers)
-        if response.status_code == 200:
-            self.cookies = response.cookies
-            text = response.text.replace("\\", "")
-            # text = text.replace(' ', '')  # It is important to remove the space
-            # print(text)
-            pattern = re.compile(r'onclick = "location.href=\'(.*?\.pdf\?download=true)\'"')
-            match = pattern.search(text)
-            if match:
-                got_url = match.group(1)
-                if r"http" not in got_url:
-                    if got_url[:2] == "//":
-                        self.pdf_url = "https:" + got_url
+        try:
+            response = requests.get(self.url, headers=self.headers)
+            if response.status_code == 200:
+                self.cookies = response.cookies
+                text = response.text.replace("\\", "")
+                # text = text.replace(' ', '')  # It is important to remove the space
+                # print(text)
+                pattern = re.compile(r'onclick = "location.href=\'(.*?\.pdf\?download=true)\'"')
+                match = pattern.search(text)
+                if match:
+                    got_url = match.group(1)
+                    if r"http" not in got_url:
+                        if got_url[:2] == "//":
+                            self.pdf_url = "https:" + got_url
+                        else:
+                            self.pdf_url = self.base_url + got_url
                     else:
-                        self.pdf_url = self.base_url + got_url
+                        self.pdf_url = got_url
+                    print(f"URL: {self.pdf_url}")
                 else:
-                    self.pdf_url = got_url
-                print(f"URL: {self.pdf_url}")
+                    print(f"[bold #AFEEEE]The website {self.url_list[self.url_index]} do not inlcude the PDF file.")
+                    self.try_times = self.try_times_each_url_max + 1
             else:
+                print(f"Failed to retrieve the webpage. Status code: {response.status_code}")
                 print(f"[bold #AFEEEE]The website {self.url_list[self.url_index]} do not inlcude the PDF file.")
                 self.try_times = self.try_times_each_url_max + 1
-        else:
-            print(f"Failed to retrieve the webpage. Status code: {response.status_code}")
-            print(f"[bold #AFEEEE]The website {self.url_list[self.url_index]} do not inlcude the PDF file.")
+        except Exception as e:
+            print(f"Failed to retrieve the webpage. Error: {e}")
             self.try_times = self.try_times_each_url_max + 1
 
     def url_iterate(self):
@@ -129,6 +147,12 @@ class _Downloader:
             # break
 
     def write_wrong_record(self):
+        # First read the contents of the txt; if the DOI is already recorded, do not write it again
+        if self.wrong_record_file.exists():
+            with open(self.wrong_record_file, "r") as f:
+                lines = f.readlines()
+            if self.doi in lines:
+                return
         with open(self.wrong_record_file, "a") as f:
             f.write(self.doi + "\n")
 
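One caveat in this new dedup check: readlines() keeps the trailing newline on each element, so `self.doi in lines` compares 'doi' against 'doi\n' and will normally never match. A sketch of the same intent with the newlines stripped (assuming the surrounding _Downloader attributes):

    # Sketch: strip newlines so the membership test can actually match.
    def write_wrong_record(self):
        if self.wrong_record_file.exists():
            with open(self.wrong_record_file, "r") as f:
                recorded = {line.strip() for line in f}
            if self.doi in recorded:
                return
        with open(self.wrong_record_file, "a") as f:
            f.write(self.doi + "\n")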
@@ -140,7 +164,7 @@ class _Downloader:
                 os.remove(self.fpath)
                 print(f"[bold yellow]The PDF file {self.fpath} is only {fsize:.2f} KB. It will be deleted and retry.")
             else:
-                print("[bold #E6E6FA]-" * 100)
+                print("[bold #E6E6FA]-" * 120)
                 print(f"[bold purple]The PDF file {self.fpath} already exists.")
                 return
         self.url_index = 0
@@ -230,11 +254,11 @@ def download5doi(store_path=None, doi_list=None, txt_file=None, excel_file=None,
 
     Example:
         download5doi(doi_list='10.3389/feart.2021.698876')
-        download5doi(store_path=r'I:\Delete\ref_pdf', doi_list='10.3389/feart.2021.698876')
-        download5doi(store_path=r'I:\Delete\ref_pdf', doi_list=['10.3389/feart.2021.698876', '10.3389/feart.2021.698876'])
-        download5doi(store_path=r'I:\Delete\ref_pdf', txt_file=r'I:\Delete\ref_pdf\wrong_record.txt')
-        download5doi(store_path=r'I:\Delete\ref_pdf', excel_file=r'I:\Delete\ref_pdf\wrong_record.xlsx')
-        download5doi(store_path=r'I:\Delete\ref_pdf', excel_file=r'I:\Delete\ref_pdf\wrong_record.xlsx', col_name='DOI')
+        download5doi(store_path='I:\\Delete\\ref_pdf', doi_list='10.3389/feart.2021.698876')
+        download5doi(store_path='I:\\Delete\\ref_pdf', doi_list=['10.3389/feart.2021.698876', '10.3389/feart.2021.698876'])
+        download5doi(store_path='I:\\Delete\\ref_pdf', txt_file='I:\\Delete\\ref_pdf\\wrong_record.txt')
+        download5doi(store_path='I:\\Delete\\ref_pdf', excel_file='I:\\Delete\\ref_pdf\\wrong_record.xlsx')
+        download5doi(store_path='I:\\Delete\\ref_pdf', excel_file='I:\\Delete\\ref_pdf\\wrong_record.xlsx', col_name='DOI')
     """
     if not store_path:
         store_path = Path.cwd()
@@ -257,7 +281,7 @@ def download5doi(store_path=None, doi_list=None, txt_file=None, excel_file=None,
 
 
 if __name__ == "__main__":
-    store_path = r"I:\Delete\ref_pdf"
-    excel_file = r"I:\Delete\Ref_DA_ROMS\savedrecs.xls"
+    store_path = r"F:\AAA-Delete\DOI_Reference\pdf"
+    excel_file = r"F:\AAA-Delete\DOI_Reference\savedrecs.xls"
     # download5doi(store_path, doi_list='10.1007/s00382-022-06260-x')
     download5doi(store_path, excel_file=excel_file)
oafuncs/oa_file.py CHANGED
@@ -19,7 +19,7 @@ import re
 import shutil
 from rich import print
 
-__all__ = ["find_file", "link_file", "copy_file", "rename_file", "make_folder", "clear_folder", "remove_empty_folder", "remove", "file_size"]
+__all__ = ["find_file", "link_file", "copy_file", "rename_file", "make_folder", "clear_folder", "remove_empty_folder", "remove", "file_size", "mean_size", "make_dir"]
 
 
 # ** Find files; wildcards supported
@@ -191,7 +191,7 @@ def rename_file(directory, old_str, new_str):
 
 
 # ** Create a subfolder (optionally cleared first)
-def make_folder(rootpath=None, folder_name=None, clear=0) -> str:
+def make_folder(rootpath=None, folder_name=None, clear=False) -> str:
     """
     # Description: create a subfolder (optionally cleared first)
     # Usage example
@@ -210,6 +210,26 @@ def make_folder(rootpath=None, folder_name=None, clear=0) -> str:
     return folder_path
 
 
+# ** Create a directory path
+def make_dir(directory):
+    """
+    Description:
+        Create a directory if it does not exist
+
+    Parameters:
+        directory: The directory path to create
+
+    Returns:
+        None
+
+    Example:
+        make_dir(r"E:\Data\2024\09\17\var1")
+    """
+    directory = str(directory)
+    os.makedirs(directory, exist_ok=True)
+    print(f"Created directory: {directory}")
+
+
 # ** Clear a folder
 def clear_folder(folder_path):
     """
@@ -270,27 +290,45 @@ def remove_empty_folder(path, print_info=1):
 # ** Delete matching files; wildcards may be used
 def remove(pattern):
     """
-    # 描述:删除相关文件,可使用通配符
+    Delete files or directories that match the given wildcard pattern.
+
+    Parameters:
+        pattern : str
+            File path or string containing wildcards. For example:
+            - r'E:\Code\Python\Model\WRF\Radar2\bzip2-radar-0*'
+            - 'bzip2-radar-0*' (assuming you are already in the target directory)
+
+    Usage examples:
         remove(r'E:\Code\Python\Model\WRF\Radar2\bzip2-radar-0*')
-    # or
+        or
         os.chdir(r'E:\Code\Python\Model\WRF\Radar2')
         remove('bzip2-radar-0*')
-    param {*} pattern # 文件路径或通配符
+
+    last updated: 2025-01-10 11:49:13
     """
-    # 使用glob.glob来获取所有匹配的文件
-    # 可以使用通配符*来匹配所有文件
     pattern = str(pattern)
+
+    # Use glob.glob to get all matching files or directories
    file_list = glob.glob(pattern)
+
+    if not file_list:
+        print(f"No files or directories found matching '{pattern}'.")
+        return
+
    for file_path in file_list:
        if os.path.exists(file_path):
            try:
-                shutil.rmtree(file_path)
-                print(f"成功删除文件: {file_path}")
+                if os.path.isdir(file_path):
+                    shutil.rmtree(file_path)
+                    print(f"Successfully deleted directory: {file_path}")
+                else:
+                    os.remove(file_path)
+                    print(f"Successfully deleted file: {file_path}")
            except Exception as e:
-                print(f"删除文件失败: {file_path}")
-                print(e)
+                print(f"Deletion failed: {file_path}")
+                print(f"Error message: {e}")
        else:
-            print(f"文件不存在: {file_path}")
+            print(f"File or directory does not exist: {file_path}")
 
 
 # ** Get file size
@@ -307,7 +345,10 @@ def file_size(file_path, unit="KB"):
     """
     # Check whether the file exists
     if not os.path.exists(file_path):
-        return "文件不存在"
+        # return "文件不存在"
+        # print(f"文件不存在: {file_path}\n返回0.0")
+        print(f'File does not exist: {file_path}\nReturn 0.0')
+        return 0.0
 
     # Get the file size (bytes)
     file_size = os.path.getsize(file_path)
@@ -317,7 +358,10 @@ def file_size(file_path, unit="KB"):
 
     # Check that the given unit is valid
     if unit not in unit_dict:
-        return "单位不合法,请选择PB、TB、GB、MB、KB中的一个"
+        # return "单位不合法,请选择PB、TB、GB、MB、KB中的一个"
+        # print("单位不合法,请选择PB、TB、GB、MB、KB中的一个\n返回0.0")
+        print("Invalid unit, please choose one of PB, TB, GB, MB, KB\nReturn 0.0")
+        return 0.0
 
     # Convert the file size to the specified unit
     converted_size = file_size / unit_dict[unit]
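Returning 0.0 instead of a Chinese error string is a small but real API change: callers can now feed the result straight into arithmetic, which is exactly what the new size checks in hycom_3hourly rely on. A sketch of the difference (the path is a placeholder):

    from oafuncs.oa_file import file_size

    size = file_size(r"E:\Data\missing.nc")  # prints a notice and returns 0.0
    # In 0.0.90 this returned the string "文件不存在", so the comparison below
    # raised a TypeError; in 0.0.91 it is a plain float comparison.
    if size < 5:
        print("File is missing or suspiciously small.")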
oafuncs/oa_help.py CHANGED
@@ -116,12 +116,18 @@ def log():
     log()
     """
     print("更新日志:")
+    print(
+        """
+        2025-01-15
+        1. Improved the DOI literature-download function: added more download sources and better exception handling
+        """
+    )
     print(
         """
         2025-01-07
         1. Tested the minimum supported Python version: 3.9
         2. Improved the documentation of some functions
-        3. Improved the hycom_3hourly module: update the file size only 30 times, to avoid recomputing it every time and speed up downloads.
+        3. Improved the hycom_3hourly module: use a sliding check to judge whether a file is intact
         """
     )
     print(
oafuncs/oa_nc.py CHANGED
@@ -20,7 +20,7 @@ import numpy as np
 import xarray as xr
 from rich import print
 
-__all__ = ["get_var", "extract", "save", "merge", "modify", "rename", "check_file", "convert_longitude", "isel"]
+__all__ = ["get_var", "extract", "save", "merge", "modify", "rename", "check", "convert_longitude", "isel"]
 
 
 def get_var(file, *vars):
@@ -222,8 +222,10 @@ def merge(file_list, var_name=None, dim_name=None, target_filename=None):
     merged_data = {}
 
     # Iterate over the file list
+    print('Reading file ...')
     for i, file in enumerate(file_list):
-        print(f"\rReading file {i + 1}/{len(file_list)}...", end="")
+        # Progress is now reported via the track description
+        # print(f"\rReading file {i + 1}/{len(file_list)}...", end="")
         ds = xr.open_dataset(file)
         for var_name in var_names:
             var = ds[var_name]
@@ -239,17 +241,17 @@ def merge(file_list, var_name=None, dim_name=None, target_filename=None):
             merged_data[var_name] = var
         ds.close()
 
-    print("\nMerging data...")
+    print("\nMerging data ...")
     for var_name in merged_data:
         if isinstance(merged_data[var_name], list):
             merged_data[var_name] = xr.concat(merged_data[var_name], dim=dim_name)
 
     merged_data = xr.Dataset(merged_data)
 
-    print("Writing data to file...")
+    print("Writing data to file ...")
     if os.path.exists(target_filename):
         print("Warning: The target file already exists.")
-        print("Removing existing file...")
+        print("Removing existing file ...")
         os.remove(target_filename)
     merged_data.to_netcdf(target_filename)
     print(f'File "{target_filename}" has been created.')
@@ -370,38 +372,38 @@ def rename(ncfile_path, old_name, new_name):
         print(f"An error occurred: {e}")
 
 
-def check_file(ncfile, if_delete=False):
-    '''
+def check(ncfile, if_delete=False):
+    """
     Description:
-        Check if the NetCDF file is corrupted.
-
+        Check if the NetCDF file is corrupted using xarray.
+
     Parameters:
         ncfile (str): The path to the NetCDF file.
         if_delete (bool): Whether to delete the file if it is corrupted, default is False.
-
+
     Returns:
         bool: True if the file is not corrupted, False otherwise.
-    '''
+    """
     if not os.path.exists(ncfile):
         return False
 
     try:
-        with nc.Dataset(ncfile, "r") as f:
-            # Make sure f is used; here we check the number of variables in the file
-            if len(f.variables) > 0:
+        with xr.open_dataset(ncfile) as ds:
+            if len(ds.variables) > 0:
                 return True
             else:
-                # With no variables, the file can be considered corrupted
-                raise ValueError("File is empty or corrupted.")
+                print(f"File {ncfile} is empty or corrupted.")
+                if if_delete:
+                    os.remove(ncfile)
+                    print(f"File {ncfile} has been deleted.")
+                return False
     except OSError as e:
-        # Catch the OSError that may occur when opening the file
         print(f"An error occurred while opening the file: {e}")
         if if_delete:
             os.remove(ncfile)
             print(f"File {ncfile} has been deleted.")
         return False
     except Exception as e:
-        # Catch other possible exceptions
         print(f"An unexpected error occurred: {e}")
         if if_delete:
             os.remove(ncfile)
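This renamed check() is what hycom_3hourly imports as check_nc above; switching from netCDF4 to xarray also means it exercises the same reader the rest of the package uses. A minimal usage sketch (the path is a placeholder):

    from oafuncs.oa_nc import check

    if not check(r"E:\Data\HYCOM_u_2024083100.nc", if_delete=True):
        print("File was corrupted (and has been deleted); download it again.")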
oafuncs/oa_tool/__init__.py CHANGED
@@ -1,18 +1,18 @@
 #!/usr/bin/env python
 # coding=utf-8
-'''
+"""
 Author: Liu Kun && 16031215@qq.com
 Date: 2024-11-21 09:48:00
 LastEditors: Liu Kun && 16031215@qq.com
-LastEditTime: 2024-11-21 10:18:33
-FilePath: \\Python\\My_Funcs\\OAFuncs\\OAFuncs\\oa_tool\\__init__.py
-Description:
+LastEditTime: 2025-01-11 20:09:09
+FilePath: \\Python\\My_Funcs\\OAFuncs\\oafuncs\\oa_tool\\__init__.py
+Description:
 EditPlatform: vscode
 ComputerInfo: XPS 15 9510
 SystemInfo: Windows 11
 Python Version: 3.12
-'''
-
+"""
 
 # Note: this causes OAFuncs to import all functions directly, which does not match the modular design
 from .email import *
+from .parallel import *
oafuncs/oa_tool/parallel.py ADDED
@@ -0,0 +1,90 @@
+import logging
+import multiprocessing as mp
+from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor, as_completed
+
+logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
+
+__all__ = ["ParallelExecutor"]
+
+
+class ParallelExecutor:
+    """
+    A class for parallel execution of tasks using threads or processes.
+
+    If mode is "process", the tasks are executed in separate processes.
+    If mode is "thread", the tasks are executed in separate threads.
+
+    Parameters:
+        mode (str): The execution mode. Supported values are "process" and "thread".
+            process ~ Must use top function to run, can't use in jupyter notebook
+            thread ~ Function can not be top function, can use in jupyter notebook
+        max_workers (int): The maximum number of workers to use. Defaults to CPU count - 1.
+
+    Note:!!!
+        If Jupyter notebook is used, the mode should be "thread" to avoid hanging issues.
+    """
+
+    def __init__(self, mode="process", max_workers=None):
+        if mode not in {"process", "thread"}:
+            raise ValueError("Invalid mode. Supported values are 'process' and 'thread'.")
+        # process: Must use top function to run, can't use in jupyter notebook
+        # thread: Can use in jupyter notebook
+        self.mode = mode
+        self.max_workers = max_workers or max(1, mp.cpu_count() - 1)
+        self.executor_class = ProcessPoolExecutor if mode == "process" else ThreadPoolExecutor
+
+    def run(self, func, param_list):
+        """
+        Run a function in parallel using the specified executor.
+
+        Args:
+            func (callable): The function to execute.
+            param_list (list): A list of parameter tuples to pass to the function.
+
+        Returns:
+            list: Results of the function execution.
+        """
+        if not callable(func):
+            raise ValueError("func must be callable.")
+        if not isinstance(param_list, list) or not all(isinstance(p, tuple) for p in param_list):
+            raise ValueError("param_list must be a list of tuples.")
+
+        results = [None] * len(param_list)
+        logging.info("Starting parallel execution in %s mode with %d workers.", self.mode, self.max_workers)
+
+        with self.executor_class(max_workers=self.max_workers) as executor:
+            future_to_index = {executor.submit(func, *params): idx for idx, params in enumerate(param_list)}
+
+            for future in as_completed(future_to_index):
+                idx = future_to_index[future]
+                try:
+                    results[idx] = future.result()
+                except Exception as e:
+                    logging.error("Task %d failed with error: %s", idx, e)
+                    results[idx] = e
+
+        logging.info("Parallel execution completed.")
+        return results
+
+
+def _compute_square(x):
+    return x * x
+
+
+def _example():
+    def _compute_sum(a, b):
+        return a + b
+
+    executor1 = ParallelExecutor(mode="process", max_workers=4)
+    params1 = [(i,) for i in range(10)]
+    results1 = executor1.run(_compute_square, params1)
+    print("Results (compute_square):", results1)
+
+    executor2 = ParallelExecutor(mode="thread", max_workers=2)
+    params2 = [(1, 2), (3, 4), (5, 6)]
+    results2 = executor2.run(_compute_sum, params2)
+    print("Results (compute_sum):", results2)
+
+
+if __name__ == "__main__":
+    _example()
+    # The decorator is not required either; running this directly is fine. Just avoid using it in .ipynb; run it from a standalone .py file instead
+    # Or, in Jupyter, use thread rather than process, because process mode will hang Jupyter
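The split between the module-level _compute_square and the nested _compute_sum in the example above is deliberate: ProcessPoolExecutor pickles the callable to ship it to worker processes, and only module-level functions are picklable, which is why the nested function is paired with thread mode only. A sketch of the restriction (independent of oafuncs):

    from concurrent.futures import ProcessPoolExecutor

    def top_level(x):       # picklable: workers can import it by name
        return x + 1

    def main():
        def nested(x):      # not picklable: submitting it to a process pool fails
            return x + 1
        with ProcessPoolExecutor() as ex:
            print(ex.submit(top_level, 1).result())  # works
            # ex.submit(nested, 1).result()          # raises a pickling error

    if __name__ == "__main__":
        main()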
oafuncs-0.0.91.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: oafuncs
-Version: 0.0.90
+Version: 0.0.91
 Summary: Oceanic and Atmospheric Functions
 Home-page: https://github.com/Industry-Pays/OAFuncs
 Author: Kun Liu
oafuncs-0.0.91.dist-info/RECORD ADDED
@@ -0,0 +1,28 @@
+oafuncs/__init__.py,sha256=glcIlhQ9xSK4WtL58dq7Od2S3JPqsuEyhUQ-VWO8hOc,1426
+oafuncs/oa_cmap.py,sha256=azVg9QR_IlG9lXCCXXVs1LS1kFci8yjxDmb_VA_TdTQ,7408
+oafuncs/oa_data.py,sha256=nENfszcOaALRse70fWFKi2vKH35EhRSCr65oIAKHiS8,12774
+oafuncs/oa_draw.py,sha256=QypQp4vJIrbAyFddEVxd9K9Q4d85PRYqYQi9xDUmSZw,11150
+oafuncs/oa_file.py,sha256=FVffpW3p6C8l1zrDrNr9aQeuCrA1qt4u4YssSwcTkkE,14106
+oafuncs/oa_help.py,sha256=loyzTbjU_0VpSIBvAEUA_tqxG8MVsO0xFE_2hgQ3zMw,4188
+oafuncs/oa_nc.py,sha256=CVZlv2EIehdgzrf1MHXYOUFcNkdOnmE1GYQYLldzrk0,17499
+oafuncs/oa_python.py,sha256=Q-6UGGw_dJff7Ef8i87fsLPoGeHV5jBzfb-7HP4THR0,4018
+oafuncs/data_store/OAFuncs.png,sha256=Cc0TDi9H5mWBporXYw9K0bUWC0oSsI-Qj3FGAXUtGKM,3332020
+oafuncs/oa_down/User_Agent-list.txt,sha256=pazxSip8_lphEBOPHG902zmIBUg8sBKXgmqp_g6j_E4,661062
+oafuncs/oa_down/__init__.py,sha256=kRX5eTUCbAiz3zTaQM1501paOYS_3fizDN4Pa0mtNUA,585
+oafuncs/oa_down/hycom_3hourly.py,sha256=ZJpsx2D_x-C1Z4R1Wwr2vzUuT6iNPTZVDxusCG_q330,62113
+oafuncs/oa_down/idm.py,sha256=lOiDQ5i5JPhj5ca3uDM9dw5DnHtj1EyJ17owhy7luLg,1666
+oafuncs/oa_down/literature.py,sha256=n9pvL_N7pk-MZHHNIqc8OUYK_9ycASjDq0-D0wLSZ3s,11329
+oafuncs/oa_down/test_ua.py,sha256=0IQq3NjqfNr7KkyjS_U-a4mYu-r-E7gzawwo4IfEa6Y,10851
+oafuncs/oa_down/user_agent.py,sha256=TsPcAxFmMTYAEHRFjurI1bQBJfDhcA70MdHoUPwQmks,785
+oafuncs/oa_sign/__init__.py,sha256=QKqTFrJDFK40C5uvk48GlRRbGFzO40rgkYwu6dYxatM,563
+oafuncs/oa_sign/meteorological.py,sha256=mLbupsZSq427HTfVbZMvIlFzDHwSzQAbK3X19o8anFY,6525
+oafuncs/oa_sign/ocean.py,sha256=xrW-rWD7xBWsB5PuCyEwQ1Q_RDKq2KCLz-LOONHgldU,5932
+oafuncs/oa_sign/scientific.py,sha256=a4JxOBgm9vzNZKpJ_GQIQf7cokkraV5nh23HGbmTYKw,5064
+oafuncs/oa_tool/__init__.py,sha256=bNTy9abznDhg3k_Irx0YieXl37r-oDRMtTAxf57Stzs,487
+oafuncs/oa_tool/email.py,sha256=4lJxV_KUzhxgLYfVwYTqp0qxRugD7fvsZkXDe5WkUKo,3052
+oafuncs/oa_tool/parallel.py,sha256=kYbiIFDB7EoxasmXGSomaEDVUsg9Rfvdgbw93lBOY7o,3770
+oafuncs-0.0.91.dist-info/LICENSE.txt,sha256=rMtLpVg8sKiSlwClfR9w_Dd_5WubTQgoOzE2PDFxzs4,1074
+oafuncs-0.0.91.dist-info/METADATA,sha256=KT2rJ-ZeMPNYiqfyN5tRDdvf7eP5DB5NnesLZVsFG5A,3321
+oafuncs-0.0.91.dist-info/WHEEL,sha256=M1ikteR9eetPNvm1LyQ3rpXxNYuGd90oakQO1a-ohSk,109
+oafuncs-0.0.91.dist-info/top_level.txt,sha256=bgC35QkXbN4EmPHEveg_xGIZ5i9NNPYWqtJqaKqTPsQ,8
+oafuncs-0.0.91.dist-info/RECORD,,
oafuncs-0.0.91.dist-info/WHEEL CHANGED
@@ -1,5 +1,5 @@
 Wheel-Version: 1.0
-Generator: setuptools (75.6.0)
+Generator: setuptools (75.7.0)
 Root-Is-Purelib: true
 Tag: py2-none-any
 Tag: py3-none-any
oafuncs-0.0.90.dist-info/RECORD DELETED
@@ -1,26 +0,0 @@
-oafuncs/__init__.py,sha256=glcIlhQ9xSK4WtL58dq7Od2S3JPqsuEyhUQ-VWO8hOc,1426
-oafuncs/oa_cmap.py,sha256=azVg9QR_IlG9lXCCXXVs1LS1kFci8yjxDmb_VA_TdTQ,7408
-oafuncs/oa_data.py,sha256=21HC_7GVFAtU9AMYKGSSzY9J6_0Ju-5n8dJKwOOx5HI,15641
-oafuncs/oa_draw.py,sha256=QypQp4vJIrbAyFddEVxd9K9Q4d85PRYqYQi9xDUmSZw,11150
-oafuncs/oa_file.py,sha256=9b2uXTOqJqds5IhEqA_702G-qzyCZiguGY5JcT9CZ78,12728
-oafuncs/oa_help.py,sha256=42xvmv6BSTyrKfQtW0bvedyv6ElhFJLMblq5jhziuB4,4076
-oafuncs/oa_nc.py,sha256=m_80xWzoyY2niupfpTSvej1D_k4WvTnDYlnlYbIfqGI,17525
-oafuncs/oa_python.py,sha256=Q-6UGGw_dJff7Ef8i87fsLPoGeHV5jBzfb-7HP4THR0,4018
-oafuncs/data_store/OAFuncs.png,sha256=HZORbnBSRX0MZSLTGAZAPK24RBUTmihguMeG9YiU_So,3261697
-oafuncs/oa_down/User_Agent-list.txt,sha256=pazxSip8_lphEBOPHG902zmIBUg8sBKXgmqp_g6j_E4,661062
-oafuncs/oa_down/__init__.py,sha256=pKPqxD0z09NEXWCemuemfgTct7Kcu3APPJqqB1FPXRM,565
-oafuncs/oa_down/hycom_3hourly.py,sha256=Bt4MjcshhAyDckfFvdqxjNvzU7JuBVYCwvY8b1OPbPw,59501
-oafuncs/oa_down/literature.py,sha256=Txv1YGSG-Z7m4o7FGHvXOR40EFxYozMsyM0-gy5CMEg,10086
-oafuncs/oa_down/test_ua.py,sha256=0IQq3NjqfNr7KkyjS_U-a4mYu-r-E7gzawwo4IfEa6Y,10851
-oafuncs/oa_down/user_agent.py,sha256=TsPcAxFmMTYAEHRFjurI1bQBJfDhcA70MdHoUPwQmks,785
-oafuncs/oa_sign/__init__.py,sha256=QKqTFrJDFK40C5uvk48GlRRbGFzO40rgkYwu6dYxatM,563
-oafuncs/oa_sign/meteorological.py,sha256=mLbupsZSq427HTfVbZMvIlFzDHwSzQAbK3X19o8anFY,6525
-oafuncs/oa_sign/ocean.py,sha256=xrW-rWD7xBWsB5PuCyEwQ1Q_RDKq2KCLz-LOONHgldU,5932
-oafuncs/oa_sign/scientific.py,sha256=a4JxOBgm9vzNZKpJ_GQIQf7cokkraV5nh23HGbmTYKw,5064
-oafuncs/oa_tool/__init__.py,sha256=IKOlqpWlb4cMDCtq2VKR_RTxQHDNqR_vfqqsOsp_lKQ,466
-oafuncs/oa_tool/email.py,sha256=4lJxV_KUzhxgLYfVwYTqp0qxRugD7fvsZkXDe5WkUKo,3052
-oafuncs-0.0.90.dist-info/LICENSE.txt,sha256=rMtLpVg8sKiSlwClfR9w_Dd_5WubTQgoOzE2PDFxzs4,1074
-oafuncs-0.0.90.dist-info/METADATA,sha256=s3X6lHw6yv20rd2528K-5cOk7zcYRGSIGYEg4SeIqqI,3321
-oafuncs-0.0.90.dist-info/WHEEL,sha256=pxeNX5JdtCe58PUSYP9upmc7jdRPgvT0Gm9kb1SHlVw,109
-oafuncs-0.0.90.dist-info/top_level.txt,sha256=bgC35QkXbN4EmPHEveg_xGIZ5i9NNPYWqtJqaKqTPsQ,8
-oafuncs-0.0.90.dist-info/RECORD,,