kevin-toolbox-dev 1.4.7__py3-none-any.whl → 1.4.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kevin_toolbox/__init__.py +2 -2
- kevin_toolbox/{developing → computer_science/algorithm}/decorator/__init__.py +2 -1
- kevin_toolbox/computer_science/algorithm/decorator/retry.py +62 -0
- kevin_toolbox/computer_science/algorithm/registration/__init__.py +1 -0
- kevin_toolbox/computer_science/algorithm/registration/serializer_for_registry_execution.py +82 -0
- kevin_toolbox/data_flow/core/cache/cache_manager_for_iterator.py +1 -1
- kevin_toolbox/data_flow/file/json_/write_json.py +2 -1
- kevin_toolbox/env_info/check_version_and_update.py +0 -1
- kevin_toolbox/env_info/variable_/env_vars_parser.py +17 -2
- kevin_toolbox/nested_dict_list/copy_.py +4 -2
- kevin_toolbox/nested_dict_list/get_nodes.py +4 -2
- kevin_toolbox/nested_dict_list/serializer/variable.py +14 -2
- kevin_toolbox/nested_dict_list/serializer/write.py +2 -0
- kevin_toolbox/nested_dict_list/traverse.py +75 -21
- kevin_toolbox/nested_dict_list/value_parser/replace_identical_with_reference.py +1 -4
- kevin_toolbox/network/__init__.py +10 -0
- kevin_toolbox/network/download_file.py +120 -0
- kevin_toolbox/network/fetch_content.py +55 -0
- kevin_toolbox/network/fetch_metadata.py +64 -0
- kevin_toolbox/network/get_response.py +50 -0
- kevin_toolbox/network/variable.py +6 -0
- kevin_toolbox/patches/for_logging/build_logger.py +1 -1
- kevin_toolbox/patches/for_matplotlib/color/convert_format.py +0 -2
- kevin_toolbox/patches/for_matplotlib/common_charts/__init__.py +45 -0
- kevin_toolbox/patches/for_matplotlib/common_charts/plot_bars.py +63 -22
- kevin_toolbox/patches/for_matplotlib/common_charts/plot_confusion_matrix.py +67 -20
- kevin_toolbox/patches/for_matplotlib/common_charts/plot_distribution.py +66 -17
- kevin_toolbox/patches/for_matplotlib/common_charts/plot_from_record.py +21 -0
- kevin_toolbox/patches/for_matplotlib/common_charts/plot_lines.py +59 -19
- kevin_toolbox/patches/for_matplotlib/common_charts/plot_scatters.py +61 -12
- kevin_toolbox/patches/for_matplotlib/common_charts/plot_scatters_matrix.py +57 -14
- kevin_toolbox/patches/for_matplotlib/common_charts/utils/__init__.py +3 -0
- kevin_toolbox/patches/for_matplotlib/common_charts/utils/get_output_path.py +15 -0
- kevin_toolbox/patches/for_matplotlib/common_charts/utils/save_plot.py +12 -0
- kevin_toolbox/patches/for_matplotlib/common_charts/utils/save_record.py +34 -0
- kevin_toolbox/patches/for_matplotlib/variable.py +20 -0
- kevin_toolbox/patches/for_numpy/linalg/softmax.py +4 -1
- kevin_toolbox_dev-1.4.9.dist-info/METADATA +75 -0
- {kevin_toolbox_dev-1.4.7.dist-info → kevin_toolbox_dev-1.4.9.dist-info}/RECORD +42 -28
- kevin_toolbox_dev-1.4.7.dist-info/METADATA +0 -69
- /kevin_toolbox/{developing → computer_science/algorithm}/decorator/restore_original_work_path.py +0 -0
- {kevin_toolbox_dev-1.4.7.dist-info → kevin_toolbox_dev-1.4.9.dist-info}/WHEEL +0 -0
- {kevin_toolbox_dev-1.4.7.dist-info → kevin_toolbox_dev-1.4.9.dist-info}/top_level.txt +0 -0
kevin_toolbox/__init__.py
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
__version__ = "1.4.
|
1
|
+
__version__ = "1.4.9"
|
2
2
|
|
3
3
|
|
4
4
|
import os
|
@@ -12,5 +12,5 @@ os.system(
|
|
12
12
|
os.system(
|
13
13
|
f'python {os.path.split(__file__)[0]}/env_info/check_validity_and_uninstall.py '
|
14
14
|
f'--package_name kevin-toolbox-dev '
|
15
|
-
f'--expiration_timestamp
|
15
|
+
f'--expiration_timestamp 1758638079 --verbose 0'
|
16
16
|
)
|
@@ -1 +1,2 @@
|
|
1
|
-
from .restore_original_work_path import restore_original_work_path
|
1
|
+
from .restore_original_work_path import restore_original_work_path
|
2
|
+
from .retry import retry
|
@@ -0,0 +1,62 @@
|
|
1
|
+
import time
|
2
|
+
import functools
|
3
|
+
from kevin_toolbox.patches.for_logging import build_logger
|
4
|
+
|
5
|
+
default_logger = build_logger(
|
6
|
+
name=":retry",
|
7
|
+
handler_ls=[
|
8
|
+
dict(target=None, level="INFO", formatter="%(name)s - %(levelname)s - %(message)s"),
|
9
|
+
]
|
10
|
+
)
|
11
|
+
|
12
|
+
|
13
|
+
def retry(retries=3, delay=0.5, exceptions=(Exception,), logger=None):
    """
    Decorator factory that calls the wrapped function again when it raises.

    Parameters:
        retries:    <int> maximum number of calls made to the wrapped function
                        (the first call counts as attempt 1).
                        Defaults to 3. Values below 1 are treated as 1, so the
                        wrapped function is always called at least once.
        delay:      <int/float> seconds to sleep between two consecutive attempts.
                        Defaults to 0.5. No sleep happens after the final failed attempt.
        exceptions: <exception class / tuple / list of classes> exception types that
                        trigger another attempt. Any other exception propagates immediately.
                        Defaults to (Exception,).
        logger:     object exposing ``info(msg)``, or the string "default" to use the
                        module-level ``default_logger``, or None to disable logging.
                        Defaults to None.

    Returns:
        A decorator. The decorated function returns whatever the wrapped function
        returns on its first successful attempt.

    Raises:
        The exception raised by the last attempt when every attempt fails.

    Example:
        @retry(retries=5, delay=2)
        def func():
            ...
    """
    logger = default_logger if logger == "default" else logger
    # `except` accepts only a class or a tuple of classes; normalise other containers.
    if isinstance(exceptions, (list, set, frozenset)):
        exceptions = tuple(exceptions)
    # Guarantee at least one call, otherwise the loop below would never run.
    max_attempts = max(1, retries)

    def decorator(func):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            for attempt in range(1, max_attempts + 1):
                try:
                    return func(*args, **kwargs)
                except exceptions as e:
                    b_last_attempt = attempt >= max_attempts
                    if logger is not None:
                        msg = f"第 {attempt} 次调用 {func.__name__} 失败\n\t异常:{e}"
                        if not b_last_attempt:
                            # Only announce a wait when another attempt really follows.
                            msg += f"\n\t等待 {delay} 秒后重试..."
                        logger.info(msg)
                    if b_last_attempt:
                        # Out of attempts: re-raise right away instead of sleeping first.
                        raise
                    time.sleep(delay)

        return wrapper

    return decorator
|
50
|
+
|
51
|
+
|
52
|
+
if __name__ == '__main__':
|
53
|
+
@retry(retries=2, delay=0.3, logger="default")
|
54
|
+
def func_(*args, **kwargs):
|
55
|
+
if args or kwargs:
|
56
|
+
return args, kwargs
|
57
|
+
else:
|
58
|
+
raise ValueError("no paras")
|
59
|
+
|
60
|
+
|
61
|
+
print(func_(123))
|
62
|
+
func_()
|
@@ -0,0 +1,82 @@
|
|
1
|
+
from kevin_toolbox.nested_dict_list import serializer
|
2
|
+
from kevin_toolbox.computer_science.data_structure import Executor
|
3
|
+
from kevin_toolbox.computer_science.algorithm.registration import Registry
|
4
|
+
|
5
|
+
|
6
|
+
class Serializer_for_Registry_Execution:
    """
    Serialize and deserialize an execution built from a member of a Registry.

    For a Registry that holds callable members, this recorder stores which member
    is to be called and with which arguments, so the call can be persisted and
    later rebuilt as an executor.

    Workflow:
        recover() ---> executor ---> run to get result
          ^
          |
        record(...) ---> self.record_s ---> save()
                            ^                 |
                            |                 v
                          load()  <---   record_file
    """

    def __init__(self):
        # Holds the recorded call description (a dict once something is recorded).
        self.record_s = None

    def record(self, _name=None, _registry=None, *args, **kwargs):
        """
        Store the member name, its registry and the call arguments in record_s.

        Returns whatever record_paras() returns (this instance).
        """
        named = self.record_name(_name, _registry)
        return named.record_paras(*args, **kwargs)

    def record_name(self, _name, _registry):
        """
        Remember which registry member is to be executed.

        Asserts that _registry is a Registry and that the member called _name is callable.
        Returns self to allow chaining.
        """
        assert isinstance(_registry, (Registry,))
        member = _registry.get(name=_name, default=None)
        assert callable(member)
        if not self.record_s:
            self.record_s = dict()
        self.record_s["name"] = _name
        self.record_s["registry_uid"] = _registry.uid
        return self

    def record_paras(self, *args, **kwargs):
        """
        Remember the positional and keyword arguments of the call.

        Returns self to allow chaining.
        """
        if not self.record_s:
            self.record_s = dict()
        self.record_s["args"] = args
        self.record_s["kwargs"] = kwargs
        return self

    def save(self, output_dir=None, b_pack_into_tar=False, b_allow_overwrite=False, **kwargs):
        """
        Persist record_s to disk through serializer.write().

        Parameters:
            output_dir:             forwarded to serializer.write().
            b_pack_into_tar:        forwarded to serializer.write().
            b_allow_overwrite:      forwarded to serializer.write().
            Any further keyword arguments (for example settings) are forwarded
            unchanged; see serializer.write() for their meaning.

        Returns:
            The value returned by serializer.write().
        """
        assert self.record_s is not None
        return serializer.write(
            var=self.record_s,
            output_dir=output_dir,
            b_pack_into_tar=b_pack_into_tar,
            b_allow_overwrite=b_allow_overwrite,
            **kwargs
        )

    def load(self, input_path):
        """
        Replace record_s with the content read from input_path.

        Returns self to allow chaining.
        """
        loaded = serializer.read(input_path=input_path)
        self.record_s = loaded
        return self

    def recover(self, record_s=None):
        """
        Build and return an Executor from a record.

        Uses the given record_s when it is truthy, otherwise self.record_s.
        The callable is looked up by name in the Registry identified by the
        recorded uid.
        """
        if not record_s:
            record_s = self.record_s
        assert record_s is not None

        registry = Registry(uid=record_s["registry_uid"])
        func = registry.get(name=record_s["name"], default=None)
        assert callable(func)
        return Executor(func=func, args=record_s["args"], kwargs=record_s["kwargs"])
|
80
|
+
|
81
|
+
|
82
|
+
execution_serializer = Serializer_for_Registry_Execution()
|
@@ -1,7 +1,7 @@
|
|
1
1
|
import os
|
2
2
|
import time
|
3
3
|
import importlib.util
|
4
|
-
from kevin_toolbox.
|
4
|
+
from kevin_toolbox.computer_science.algorithm.decorator import restore_original_work_path
|
5
5
|
from kevin_toolbox.computer_science.algorithm.cache_manager import Cache_Manager
|
6
6
|
|
7
7
|
if importlib.util.find_spec("cPickle") is not None:
|
@@ -56,7 +56,8 @@ def write_json(content, file_path, converters=None, b_use_suggested_converter=Fa
|
|
56
56
|
content = traverse(var=[copy.deepcopy(content)],
|
57
57
|
match_cond=lambda _, __, ___: True, action_mode="replace",
|
58
58
|
converter=lambda _, x: converter(x),
|
59
|
-
b_traverse_matched_element=True
|
59
|
+
b_traverse_matched_element=True,
|
60
|
+
b_skip_repeated_non_leaf_node=True)[0]
|
60
61
|
|
61
62
|
content = json.dumps(content, **output_format)
|
62
63
|
|
@@ -16,7 +16,6 @@ def check_version_and_update(package_name, cur_version=None, available_versions=
|
|
16
16
|
ex = subprocess.Popen(f'pip list | grep "{package_name} "', shell=True, stdout=subprocess.PIPE)
|
17
17
|
out, _ = ex.communicate()
|
18
18
|
out = out.decode().strip()
|
19
|
-
# breakpoint()
|
20
19
|
cur_version = out.split(package_name)[-1].strip()
|
21
20
|
|
22
21
|
# try to read available versions
|
@@ -26,10 +26,24 @@ class Env_Vars_Parser:
|
|
26
26
|
def __call__(self, *args, **kwargs):
|
27
27
|
return self.parse(*args, **kwargs)
|
28
28
|
|
29
|
-
def parse(self, text):
|
29
|
+
def parse(self, text, **kwargs):
    """
    Interpret `text` and substitute the references it contains.

    Parameters:
        text:       the text to interpret; forwarded unchanged to the private __parse().
        default:    (optional, keyword only) fallback value.
                        When given, it is returned if interpretation raises an exception.
                        When omitted, the exception propagates to the caller.
    """
    if "default" not in kwargs:
        return self.__parse(text=text)
    try:
        return self.__parse(text=text)
    except Exception:
        # Deliberately not a bare `except:`, which would also trap
        # KeyboardInterrupt and SystemExit and hide them behind the default.
        return kwargs["default"]
|
45
|
+
|
46
|
+
def __parse(self, text):
|
33
47
|
temp_ls = []
|
34
48
|
for it in self.split_string(text=text):
|
35
49
|
if isinstance(it, str):
|
@@ -85,4 +99,5 @@ class Env_Vars_Parser:
|
|
85
99
|
|
86
100
|
if __name__ == '__main__':
|
87
101
|
env_vars_parser = Env_Vars_Parser()
|
88
|
-
print(env_vars_parser.split_string("666/123${:VAR}/afasf/${/xxx.../xxx.json:111:222}336"))
|
102
|
+
# print(env_vars_parser.split_string("666/123${:VAR}/afasf/${/xxx.../xxx.json:111:222}336"))
|
103
|
+
print(env_vars_parser.parse("${KVT_PATCHES:for_matplotlib:common_charts:font_settings:for_non-windows-platform}"))
|
@@ -123,7 +123,8 @@ def _copy_structure(var, b_keep_internal_references):
|
|
123
123
|
|
124
124
|
return traverse(var=[var], match_cond=lambda _, __, value: isinstance(value, (list, dict,)),
|
125
125
|
action_mode="replace", converter=converter,
|
126
|
-
traversal_mode="dfs_pre_order", b_traverse_matched_element=True
|
126
|
+
traversal_mode="dfs_pre_order", b_traverse_matched_element=True,
|
127
|
+
b_skip_repeated_non_leaf_node=True)[0]
|
127
128
|
|
128
129
|
|
129
130
|
def _copy_nodes(var, b_keep_internal_references):
|
@@ -149,7 +150,8 @@ def _copy_nodes(var, b_keep_internal_references):
|
|
149
150
|
|
150
151
|
return traverse(var=[var], match_cond=lambda _, __, value: not isinstance(value, (list, dict,)),
|
151
152
|
action_mode="replace", converter=converter,
|
152
|
-
traversal_mode="dfs_pre_order", b_traverse_matched_element=True
|
153
|
+
traversal_mode="dfs_pre_order", b_traverse_matched_element=True,
|
154
|
+
b_skip_repeated_non_leaf_node=True)[0]
|
153
155
|
|
154
156
|
|
155
157
|
if __name__ == '__main__':
|
@@ -2,7 +2,7 @@ from kevin_toolbox.nested_dict_list import traverse, get_value
|
|
2
2
|
from kevin_toolbox.nested_dict_list.name_handler import parse_name
|
3
3
|
|
4
4
|
|
5
|
-
def get_nodes(var, level=-1, b_strict=True):
|
5
|
+
def get_nodes(var, level=-1, b_strict=True, **kwargs):
|
6
6
|
"""
|
7
7
|
获取嵌套字典列表 var 中所有叶节点
|
8
8
|
以列表 [(name,value), ...] 形式返回,其中名字 name 的解释方式参考 name_handler.parse_name() 介绍
|
@@ -23,6 +23,8 @@ def get_nodes(var, level=-1, b_strict=True):
|
|
23
23
|
默认为 True,不添加。
|
24
24
|
"""
|
25
25
|
assert isinstance(level, (int,))
|
26
|
+
kwargs.setdefault("b_skip_repeated_non_leaf_node", False)
|
27
|
+
|
26
28
|
if level == 0:
|
27
29
|
return [("", var)]
|
28
30
|
|
@@ -38,7 +40,7 @@ def get_nodes(var, level=-1, b_strict=True):
|
|
38
40
|
res_empty.add(idx + "@None") # 添加哨兵,表示空节点,并不会被实际解释
|
39
41
|
return False
|
40
42
|
|
41
|
-
traverse(var=var, match_cond=func, action_mode="skip", b_use_name_as_idx=True)
|
43
|
+
traverse(var=var, match_cond=func, action_mode="skip", b_use_name_as_idx=True, **kwargs)
|
42
44
|
|
43
45
|
if level != -1:
|
44
46
|
names = set()
|
@@ -3,6 +3,18 @@ from kevin_toolbox.computer_science.algorithm.registration import Registry
|
|
3
3
|
|
4
4
|
SERIALIZER_BACKEND = Registry(uid="SERIALIZER_BACKEND")
|
5
5
|
|
6
|
+
# 导入时的默认过滤规则
|
7
|
+
ignore_s = [
|
8
|
+
{
|
9
|
+
"func": lambda _, __, path: os.path.basename(path) in ["temp", "test", "__pycache__",
|
10
|
+
"_old_version"],
|
11
|
+
"scope": ["root", "dirs"]
|
12
|
+
},
|
13
|
+
]
|
14
|
+
|
6
15
|
# 从 kevin_toolbox/nested_dict_list/serializer/backends 下收集被注册的 backend
|
7
|
-
SERIALIZER_BACKEND.collect_from_paths(
|
8
|
-
|
16
|
+
SERIALIZER_BACKEND.collect_from_paths(
|
17
|
+
path_ls=[os.path.join(os.path.dirname(__file__), "backends"), ],
|
18
|
+
ignore_s=ignore_s,
|
19
|
+
b_execute_now=False
|
20
|
+
)
|
@@ -250,6 +250,8 @@ def write(var, output_dir, settings=None, traversal_mode=Traversal_Mode.BFS, b_p
|
|
250
250
|
if not os.path.exists(tgt_path):
|
251
251
|
for_os.copy(src=src_path, dst=tgt_path, remove_dst_if_exists=True)
|
252
252
|
|
253
|
+
return tgt_path
|
254
|
+
|
253
255
|
|
254
256
|
def _judge_processed_or_not(processed_s, name):
|
255
257
|
"""
|
@@ -15,7 +15,8 @@ class Traversal_Mode(Enum):
|
|
15
15
|
|
16
16
|
|
17
17
|
def traverse(var, match_cond, action_mode="remove", converter=None,
|
18
|
-
b_use_name_as_idx=False, traversal_mode="dfs_pre_order", b_traverse_matched_element=False
|
18
|
+
b_use_name_as_idx=False, traversal_mode="dfs_pre_order", b_traverse_matched_element=False,
|
19
|
+
b_skip_repeated_non_leaf_node=None, cond_for_repeated_leaf_to_skip=None, **kwargs):
|
19
20
|
"""
|
20
21
|
遍历 var 找到符合 match_cond 的元素,将其按照 action_mode 指定的操作进行处理
|
21
22
|
|
@@ -48,46 +49,86 @@ def traverse(var, match_cond, action_mode="remove", converter=None,
|
|
48
49
|
默认为 "dfs_pre_order"
|
49
50
|
b_use_name_as_idx: <boolean> 对于 match_cond/converter 中的 idx 参数,是传入整体的 name 还是父节点的 index 或 key。
|
50
51
|
默认为 False
|
51
|
-
b_traverse_matched_element
|
52
|
+
b_traverse_matched_element: <boolean> 对于匹配上的元素,经过处理后,是否继续遍历该元素的内容
|
52
53
|
默认为 False
|
54
|
+
b_skip_repeated_non_leaf_node: <boolean> 是否跳过重复的非叶节点。
|
55
|
+
何为重复?
|
56
|
+
在内存中的id相同。
|
57
|
+
默认为 None,此时将根据 action_mode 的来决定:
|
58
|
+
- 对于会对节点进行修改的模式,比如 "remove" 和 "replace",将设为 True,以避免预期外的重复转换和替换。
|
59
|
+
- 对于不会修改节点内容的模式,比如 "skip",将设为 False。
|
60
|
+
cond_for_repeated_leaf_to_skip: <list/tuple of callable> 在叶节点位置上,遇到满足其中某个条件的重复的元素时需要跳过。
|
61
|
+
要求函数接受 叶节点的值,并返回一个 boolean,表示是否匹配成功。
|
62
|
+
默认为 None
|
53
63
|
"""
|
54
64
|
assert callable(match_cond)
|
55
65
|
action_mode = Action_Mode(action_mode)
|
56
66
|
if action_mode is Action_Mode.REPLACE:
|
57
67
|
assert callable(converter)
|
58
68
|
traversal_mode = Traversal_Mode(traversal_mode)
|
69
|
+
if b_skip_repeated_non_leaf_node is None:
|
70
|
+
if action_mode is Action_Mode.SKIP:
|
71
|
+
b_skip_repeated_non_leaf_node = False
|
72
|
+
else: # action_mode in (Action_Mode.REMOVE, Action_Mode.REPLACE)
|
73
|
+
b_skip_repeated_non_leaf_node = True
|
74
|
+
cond_for_repeated_leaf_to_skip = [] if cond_for_repeated_leaf_to_skip is None else cond_for_repeated_leaf_to_skip
|
75
|
+
|
76
|
+
passed_node_ids = {"leaf": set(), "non_leaf": set()}
|
59
77
|
|
60
78
|
if traversal_mode is Traversal_Mode.BFS:
|
61
|
-
return _bfs(var, match_cond, action_mode, converter, b_use_name_as_idx, b_traverse_matched_element
|
79
|
+
return _bfs(var, match_cond, action_mode, converter, b_use_name_as_idx, b_traverse_matched_element,
|
80
|
+
b_skip_repeated_non_leaf_node=b_skip_repeated_non_leaf_node,
|
81
|
+
cond_for_repeated_leaf_to_skip=cond_for_repeated_leaf_to_skip,
|
82
|
+
passed_node_ids=passed_node_ids)
|
62
83
|
else:
|
63
84
|
return _dfs(var, match_cond, action_mode, converter, b_use_name_as_idx, traversal_mode,
|
64
|
-
b_traverse_matched_element, ""
|
85
|
+
b_traverse_matched_element, pre_name="",
|
86
|
+
b_skip_repeated_non_leaf_node=b_skip_repeated_non_leaf_node,
|
87
|
+
cond_for_repeated_leaf_to_skip=cond_for_repeated_leaf_to_skip,
|
88
|
+
passed_node_ids=passed_node_ids)
|
65
89
|
|
66
90
|
|
67
|
-
def _bfs(var, match_cond, action_mode, converter, b_use_name_as_idx, b_traverse_matched_element
|
91
|
+
def _bfs(var, match_cond, action_mode, converter, b_use_name_as_idx, b_traverse_matched_element,
         b_skip_repeated_non_leaf_node, cond_for_repeated_leaf_to_skip, passed_node_ids):
    """
    Breadth-first traversal of `var`, handing every child element to _deal().

    Parameters:
        var:                                the structure to traverse; returned at the end.
        match_cond, converter, action_mode: forwarded unchanged to _deal().
        b_use_name_as_idx:                  forwarded to _gen_idx().
        b_traverse_matched_element:         when False, a matched child is not queued
                                                for further traversal.
        b_skip_repeated_non_leaf_node:      when True, a list/dict whose id() was already
                                                seen is not expanded again.
        cond_for_repeated_leaf_to_skip:     forwarded unchanged to _deal().
        passed_node_ids:                    dict with the keys "leaf" and "non_leaf", each a
                                                set of id() values already visited.

    Returns:
        `var` (the same object that was passed in).
    """
    from collections import deque  # local import keeps this block self-contained

    # deque gives O(1) pops from the front; list.pop(0) shifts the whole list each time.
    queue = deque([("", var)])

    while queue:
        pre_name, it = queue.popleft()
        if isinstance(it, (list, dict)):
            # Containers already expanded once are skipped on request.
            if b_skip_repeated_non_leaf_node:
                if id(it) in passed_node_ids["non_leaf"]:
                    continue
                passed_node_ids["non_leaf"].add(id(it))
            #
            keys = list(range(len(it)) if isinstance(it, list) else it.keys())
            keys.reverse()  # reversed order makes popping elements from a list safe
            # NOTE(review): _gen_idx is not visible here; it is assumed to return one idx per key.
            idx_ls = _gen_idx(it, keys, b_use_name_as_idx, pre_name)

            # match & process
            for k, idx in zip(keys, idx_ls):
                b_matched, b_popped, b_skip = _deal(it, k, idx, match_cond, converter, action_mode,
                                                    cond_for_repeated_leaf_to_skip, passed_node_ids)
                if b_skip or b_popped or (b_matched and not b_traverse_matched_element):
                    continue
                # enqueue the child for a later round
                queue.append((idx, it[k]))

    return var
|
86
119
|
|
87
120
|
|
88
121
|
def _dfs(var, match_cond, action_mode, converter,
|
89
|
-
b_use_name_as_idx, traversal_mode, b_traverse_matched_element, pre_name
|
122
|
+
b_use_name_as_idx, traversal_mode, b_traverse_matched_element, pre_name,
|
123
|
+
b_skip_repeated_non_leaf_node, cond_for_repeated_leaf_to_skip, passed_node_ids):
|
90
124
|
if isinstance(var, (list, dict)):
|
125
|
+
#
|
126
|
+
if b_skip_repeated_non_leaf_node:
|
127
|
+
if id(var) in passed_node_ids["non_leaf"]:
|
128
|
+
return var
|
129
|
+
else:
|
130
|
+
passed_node_ids["non_leaf"].add(id(var))
|
131
|
+
#
|
91
132
|
keys = list(range(len(var)) if isinstance(var, list) else var.keys())
|
92
133
|
keys.reverse() # 反过来便于 列表 弹出元素
|
93
134
|
idx_ls = _gen_idx(var, keys, b_use_name_as_idx, pre_name)
|
@@ -98,29 +139,42 @@ def _dfs(var, match_cond, action_mode, converter,
|
|
98
139
|
# 匹配&处理
|
99
140
|
deal_res_ls = []
|
100
141
|
for k, idx in zip(keys, idx_ls):
|
101
|
-
deal_res_ls.append(_deal(var, k, idx, match_cond, converter, action_mode
|
142
|
+
deal_res_ls.append(_deal(var, k, idx, match_cond, converter, action_mode,
|
143
|
+
cond_for_repeated_leaf_to_skip, passed_node_ids))
|
102
144
|
# 递归遍历
|
103
|
-
for (b_matched, b_popped), k, idx in zip(deal_res_ls, keys, idx_ls):
|
104
|
-
if b_popped or (b_matched and not b_traverse_matched_element):
|
145
|
+
for (b_matched, b_popped, b_skip), k, idx in zip(deal_res_ls, keys, idx_ls):
|
146
|
+
if b_skip or b_popped or (b_matched and not b_traverse_matched_element):
|
105
147
|
continue
|
106
148
|
var[k] = _dfs(var[k], match_cond, action_mode, converter, b_use_name_as_idx, traversal_mode,
|
107
|
-
b_traverse_matched_element, idx
|
149
|
+
b_traverse_matched_element, idx,
|
150
|
+
b_skip_repeated_non_leaf_node, cond_for_repeated_leaf_to_skip, passed_node_ids)
|
108
151
|
else:
|
109
152
|
# 后序
|
110
153
|
# 递归遍历
|
111
154
|
for k, idx in zip(keys, idx_ls):
|
112
155
|
var[k] = _dfs(var[k], match_cond, action_mode, converter, b_use_name_as_idx, traversal_mode,
|
113
|
-
b_traverse_matched_element, idx
|
156
|
+
b_traverse_matched_element, idx,
|
157
|
+
b_skip_repeated_non_leaf_node, cond_for_repeated_leaf_to_skip, passed_node_ids)
|
114
158
|
# 匹配&处理
|
115
159
|
for k, idx in zip(keys, idx_ls):
|
116
|
-
_deal(var, k, idx, match_cond, converter, action_mode
|
160
|
+
_deal(var, k, idx, match_cond, converter, action_mode,
|
161
|
+
cond_for_repeated_leaf_to_skip, passed_node_ids)
|
117
162
|
else:
|
118
163
|
pass
|
119
164
|
return var
|
120
165
|
|
121
166
|
|
122
|
-
def _deal(var, k, idx, match_cond, converter, action_mode
|
167
|
+
def _deal(var, k, idx, match_cond, converter, action_mode,
|
168
|
+
cond_for_repeated_leaf_to_skip, passed_node_ids):
|
123
169
|
"""处理节点"""
|
170
|
+
b_skip = False
|
171
|
+
|
172
|
+
if cond_for_repeated_leaf_to_skip and not isinstance(var[k], (dict, list,)) and any(
|
173
|
+
[i(var[k]) for i in cond_for_repeated_leaf_to_skip]):
|
174
|
+
if id(var[k]) in passed_node_ids["leaf"]:
|
175
|
+
return None, None, True
|
176
|
+
else:
|
177
|
+
passed_node_ids["leaf"].add(id(var[k]))
|
124
178
|
# 匹配
|
125
179
|
b_matched = match_cond(type(var), idx, var[k])
|
126
180
|
b_popped = False
|
@@ -133,7 +187,7 @@ def _deal(var, k, idx, match_cond, converter, action_mode):
|
|
133
187
|
var[k] = converter(idx, var[k])
|
134
188
|
else:
|
135
189
|
pass
|
136
|
-
return b_matched, b_popped
|
190
|
+
return b_matched, b_popped, b_skip
|
137
191
|
|
138
192
|
|
139
193
|
def _gen_idx(var, keys, b_use_name_as_idx, pre_name):
|
@@ -70,10 +70,7 @@ def _forward(var, flag, match_cond):
|
|
70
70
|
# 任选其一进行保留,其余改为引用
|
71
71
|
keep_name = unprocessed_name_set.pop()
|
72
72
|
for name in unprocessed_name_set:
|
73
|
-
|
74
|
-
var = set_value(var=var, name=name, value=f'<{flag}>{{{keep_name}}}', b_force=False)
|
75
|
-
except:
|
76
|
-
breakpoint()
|
73
|
+
var = set_value(var=var, name=name, value=f'<{flag}>{{{keep_name}}}', b_force=False)
|
77
74
|
processed_name_set.update(unprocessed_name_set)
|
78
75
|
|
79
76
|
# 将叶节点中,未被处理过,且是 str,且以 flag 开头的字符串,添加多一个 flag,以示区分
|
@@ -0,0 +1,10 @@
|
|
1
|
+
import urllib3
|
2
|
+
from urllib3.exceptions import InsecureRequestWarning
|
3
|
+
|
4
|
+
# 禁用不安全请求警告(例如当 verify=False 时)
|
5
|
+
urllib3.disable_warnings(InsecureRequestWarning)
|
6
|
+
|
7
|
+
from .get_response import get_response
|
8
|
+
from .fetch_metadata import fetch_metadata
|
9
|
+
from .fetch_content import fetch_content
|
10
|
+
from .download_file import download_file
|
@@ -0,0 +1,120 @@
|
|
1
|
+
import os
|
2
|
+
import time
|
3
|
+
from tqdm import tqdm
|
4
|
+
from kevin_toolbox.network import fetch_metadata, fetch_content, get_response
|
5
|
+
from kevin_toolbox.patches.for_os.path import replace_illegal_chars
|
6
|
+
from kevin_toolbox.patches import for_os
|
7
|
+
from kevin_toolbox.nested_dict_list import get_hash
|
8
|
+
|
9
|
+
default_option_func_s = {
|
10
|
+
"hash_name": lambda url, _, option_s: {"hash_name": get_hash(option_s["name"], length=12)},
|
11
|
+
"hash_url": lambda url, _, option_s: {"hash_url": get_hash(url, length=12)},
|
12
|
+
"timestamp": lambda *arg, **kwargs: {"timestamp": f'{time.time()}'},
|
13
|
+
"legalized_name": lambda url, _, option_s: {
|
14
|
+
"legalized_name": replace_illegal_chars(file_name=option_s["name"], b_is_path=False)}
|
15
|
+
}
|
16
|
+
|
17
|
+
|
18
|
+
def download_file(
        output_dir, url=None, response=None, chunk_size=1024 * 10,
        file_name=None, file_name_format="{legalized_name:.100}{suffix}", format_option_generate_func_ls=None,
        b_allow_overwrite=False, b_display_progress=False, **kwargs
):
    """
    Download a file into output_dir and return the path it was written to.

    Features:
        1. The file name can be generated automatically from the metadata of the response.
        2. Retries (NOTE(review): presumably handled inside get_response(); not visible here).
        3. TODO: resuming interrupted downloads (not implemented).

    Parameters:
        output_dir:         <path> directory the file is saved to ("~" is expanded; created if missing).
        url:                <str> address to download from.
        response:           response object to read the content from.
                                Only one of url / response is required; passing response is recommended.
        chunk_size:         <int> size of the chunks requested from fetch_content().
                                Defaults to 1024 * 10.
        file_name:          <str> name of the saved file.
                                Defaults to None, in which case the name is built from file_name_format.
        file_name_format:   <str> naming pattern, structured as '{<part_0>}...{<part_1>}...'.
                                Each {} is filled from the available options:
                                    - "name"            file name without suffix.
                                    - "suffix"          file suffix.
                                    - "timestamp"       time of the download.
                                    - "hash_name"       hash of the file name.
                                    - "legalized_name"  file name with illegal characters replaced.
                                    - "hash_url"        hash of the url.
                                Notes:
                                    - "name" may contain characters such as ":" or "/" and can therefore
                                      be invalid as a file name; "legalized_name" is the safer choice.
                                    - "hash_name" and "hash_url" have a very small but non-zero chance
                                      of collisions.
                                    - Recommended: combine "legalized_name", "suffix" and "timestamp".
                                Advanced:
                                    1. The pattern follows str.format() syntax, so {name:.10} limits
                                       that part to 10 characters.
                                    2. Extra options can be supplied through format_option_generate_func_ls.
                                Defaults to '{legalized_name:.100}{suffix}'.
        format_option_generate_func_ls: <list/tuple of callable> functions whose results add to or
                                override the options used for file_name_format. Each is called as
                                func(url, response, option_s) and must return a dict of options.
                                Defaults to None.
        b_allow_overwrite:  <boolean> whether an existing target file may be replaced.
        b_display_progress: <boolean> show a progress bar (only when the content length is known).
        **kwargs:           forwarded to get_response() when the response has to be fetched from url.

    Returns:
        The full path of the downloaded file.

    Raises:
        AssertionError:     neither url nor response was given, or no response could be obtained.
        FileExistsError:    the target exists and b_allow_overwrite is False.
        Errors raised while fetching or writing the content propagate to the caller.
    """
    assert url is not None or response is not None
    if url is not None:
        # NOTE(review): `or` relies on the truthiness of the response object - confirm that a
        # falsy response is really meant to trigger a new request.
        response = response or get_response(url=url, **kwargs)
    assert response is not None
    output_dir = os.path.expanduser(output_dir)

    # Work out the file name from the metadata when the caller did not fix it.
    metadata_s = fetch_metadata(url=url, response=response, default_name="", default_suffix="")
    if file_name is None:
        option_s = metadata_s.copy()
        for k, func in default_option_func_s.items():
            # Only compute the default options that the pattern actually mentions.
            if k in file_name_format:
                option_s.update(func(url, response, option_s))
        if isinstance(format_option_generate_func_ls, (list, tuple,)):
            for func in format_option_generate_func_ls:
                assert callable(func)
                option_s.update(func(url, response, option_s))
        file_name = file_name_format.format(**option_s)

    os.makedirs(output_dir, exist_ok=True)

    file_path = os.path.join(output_dir, file_name)
    if os.path.exists(file_path):
        if b_allow_overwrite:
            for_os.remove(path=file_path, ignore_errors=True)
        else:
            raise FileExistsError(f"target {file_path} already exists")

    if metadata_s["content_length"] and b_display_progress:
        pbar = tqdm(total=metadata_s["content_length"], unit="B", unit_scale=True, desc="下载进度")
    else:
        pbar = None

    try:
        with open(file_path, "wb") as f:
            for chunk in fetch_content(response=response, chunk_size=chunk_size):
                if chunk:
                    f.write(chunk)
                    if pbar is not None:
                        pbar.update(len(chunk))
    finally:
        # Always release the progress bar, even when the transfer fails half-way.
        if pbar is not None:
            pbar.close()

    return file_path
|
113
|
+
|
114
|
+
|
115
|
+
# 示例用法
|
116
|
+
if __name__ == "__main__":
|
117
|
+
url_ = "https://i.pinimg.com/736x/28/6a/b1/286ab1eb816dc59a1c72374c75645d80.jpg"
|
118
|
+
output_dir = r'./temp/123'
|
119
|
+
downloaded_file = download_file(url=url_, output_dir=output_dir, file_name="233.jpg", b_allow_overwrite=True,
|
120
|
+
b_display_progress=True, chunk_size=100)
|