kevin-toolbox-dev 1.3.1__py3-none-any.whl → 1.3.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kevin_toolbox/__init__.py +2 -2
- kevin_toolbox/computer_science/algorithm/cache_manager/__init__.py +1 -0
- kevin_toolbox/computer_science/algorithm/cache_manager/cache/__init__.py +2 -0
- kevin_toolbox/computer_science/algorithm/cache_manager/cache/cache_base.py +89 -0
- kevin_toolbox/computer_science/algorithm/cache_manager/cache/memo_cache.py +36 -0
- kevin_toolbox/computer_science/algorithm/cache_manager/cache_manager.py +218 -0
- kevin_toolbox/computer_science/algorithm/cache_manager/strategy/__init__.py +5 -0
- kevin_toolbox/computer_science/algorithm/cache_manager/strategy/fifo_strategy.py +21 -0
- kevin_toolbox/computer_science/algorithm/cache_manager/strategy/lfu_strategy.py +80 -0
- kevin_toolbox/computer_science/algorithm/cache_manager/strategy/lru_strategy.py +43 -0
- kevin_toolbox/computer_science/algorithm/cache_manager/strategy/lst_strategy.py +26 -0
- kevin_toolbox/computer_science/algorithm/cache_manager/strategy/strategy_base.py +45 -0
- kevin_toolbox/computer_science/algorithm/cache_manager/test/__init__.py +0 -0
- kevin_toolbox/computer_science/algorithm/cache_manager/test/test_cache_builder.py +37 -0
- kevin_toolbox/computer_science/algorithm/cache_manager/test/test_cache_manager.py +197 -0
- kevin_toolbox/computer_science/algorithm/cache_manager/test/test_cache_strategy.py +129 -0
- kevin_toolbox/computer_science/algorithm/cache_manager/variable.py +28 -0
- kevin_toolbox/computer_science/algorithm/registration/registry.py +38 -16
- kevin_toolbox/data_flow/core/cache/__init__.py +1 -1
- kevin_toolbox/data_flow/core/cache/cache_manager_for_iterator.py +36 -168
- kevin_toolbox/data_flow/core/cache/test/__init__.py +0 -0
- kevin_toolbox/data_flow/core/cache/test/test_cache_manager_for_iterator.py +34 -0
- kevin_toolbox/data_flow/core/reader/file_iterative_reader.py +44 -9
- kevin_toolbox/data_flow/core/reader/unified_reader.py +2 -2
- kevin_toolbox/data_flow/core/reader/unified_reader_base.py +4 -5
- kevin_toolbox/data_flow/file/json_/converter/__init__.py +2 -2
- kevin_toolbox/data_flow/file/json_/converter/escape_tuple_and_set.py +23 -0
- kevin_toolbox/data_flow/file/json_/converter/{unescape_tuple.py → unescape_tuple_and_set.py} +7 -5
- kevin_toolbox/data_flow/file/json_/read_json.py +3 -3
- kevin_toolbox/data_flow/file/json_/write_json.py +3 -3
- kevin_toolbox/data_flow/file/kevin_notation/kevin_notation_reader.py +6 -5
- kevin_toolbox/data_flow/file/kevin_notation/read.py +4 -2
- kevin_toolbox/data_flow/file/kevin_notation/test/test_kevin_notation.py +15 -3
- kevin_toolbox/data_flow/file/markdown/generate_table.py +2 -2
- kevin_toolbox/math/utils/__init__.py +1 -1
- kevin_toolbox/math/utils/{spilt_integer_most_evenly.py → split_integer_most_evenly.py} +2 -2
- kevin_toolbox/nested_dict_list/get_nodes.py +9 -4
- kevin_toolbox/nested_dict_list/name_handler/build_name.py +1 -1
- kevin_toolbox/nested_dict_list/name_handler/parse_name.py +1 -1
- kevin_toolbox/nested_dict_list/set_default.py +44 -28
- kevin_toolbox/patches/for_matplotlib/__init__.py +1 -0
- kevin_toolbox/patches/for_matplotlib/generate_color_list.py +33 -0
- kevin_toolbox/patches/for_numpy/linalg/__init__.py +1 -0
- kevin_toolbox/patches/for_numpy/linalg/entropy.py +26 -0
- kevin_toolbox/patches/for_numpy/random/__init__.py +3 -0
- kevin_toolbox/patches/for_numpy/random/get_rng.py +64 -0
- kevin_toolbox/patches/for_numpy/random/truncated_multivariate_normal.py +129 -0
- kevin_toolbox/patches/for_numpy/random/truncated_normal.py +89 -0
- kevin_toolbox/patches/for_numpy/random/variable.py +10 -0
- kevin_toolbox/patches/for_optuna/serialize/for_study/dump.py +10 -2
- kevin_toolbox_dev-1.3.3.dist-info/METADATA +75 -0
- {kevin_toolbox_dev-1.3.1.dist-info → kevin_toolbox_dev-1.3.3.dist-info}/RECORD +54 -29
- kevin_toolbox/data_flow/file/json_/converter/escape_tuple.py +0 -20
- kevin_toolbox_dev-1.3.1.dist-info/METADATA +0 -91
- {kevin_toolbox_dev-1.3.1.dist-info → kevin_toolbox_dev-1.3.3.dist-info}/WHEEL +0 -0
- {kevin_toolbox_dev-1.3.1.dist-info → kevin_toolbox_dev-1.3.3.dist-info}/top_level.txt +0 -0
@@ -2,6 +2,7 @@ import os
|
|
2
2
|
import time
|
3
3
|
import importlib.util
|
4
4
|
from kevin_toolbox.developing.decorator import restore_original_work_path
|
5
|
+
from kevin_toolbox.computer_science.algorithm.cache_manager import Cache_Manager
|
5
6
|
|
6
7
|
if importlib.util.find_spec("cPickle") is not None:
|
7
8
|
# 如果安装有 cPickle,可以更快处理串行化
|
@@ -27,29 +28,10 @@ class Cache_Manager_for_Iterator:
|
|
27
28
|
file_dict: 二进制文件的文件名与序号的对应关系表
|
28
29
|
例如:{ 0: "0.pkl", 1: "1.pkl", 2: "2.pkl", ...}
|
29
30
|
其中 index 0 对应于文件 ./temp/cache_name/0.pkl
|
30
|
-
|
31
|
+
然后在进行读取时,将先到基于内存的缓存 memo_cache_manager 中寻找是否已经有需要的 chunk 分块,如果没有则到前面 file_dict 中读取,同时更新 memo_cache_manager
|
31
32
|
相关变量:
|
32
|
-
|
33
|
-
例如:{ 1: chunk_var_1, 3: chunk_var_3 }
|
33
|
+
memo_cache_manager: 基于内存的缓存,由 Cache_Manager 构建,指定有更新策略等
|
34
34
|
其中 key 是 chunk 分块的 index,value 是对应的保存在内存中的变量
|
35
|
-
cache_metadata: 缓存的属性数据
|
36
|
-
包含以下字段,各字段将自动更新
|
37
|
-
例如:{ 1: { "last_time": xxx, # 最近读取时间
|
38
|
-
"initial_time": xxx, # 最初读取时间
|
39
|
-
"counts": xxx, # 读取次数
|
40
|
-
},
|
41
|
-
... }
|
42
|
-
cache_update_strategy:缓存更新策略
|
43
|
-
是一个函数,该函数的输入是 cache_metadata ,输出是需要删除的缓存的序号
|
44
|
-
触发:
|
45
|
-
在每次出现 cache_dict 无法命中,导致有新的 cache 添加到 cache_dict 时,将会采用该策略进行更新
|
46
|
-
现有策略:
|
47
|
-
drop_min_counts: 去除读取次数最小的
|
48
|
-
drop_min_last_time: 去除最近没有读取的
|
49
|
-
drop_min_survival_time: 去除生存时间最短的,生成时间 survival_time:=last_time-initial_time
|
50
|
-
默认使用 drop_min_last_time
|
51
|
-
(cache_dict 的大小受 cache_update_strategy 中的 cache_size 限制,当大小超过限制时,
|
52
|
-
将根据 cache_update_strategy 去除优先级较低的部分来更新缓存)
|
53
35
|
|
54
36
|
支持以下几种方式来:
|
55
37
|
以迭代器的形式进行顺序读取
|
@@ -60,44 +42,43 @@ class Cache_Manager_for_Iterator:
|
|
60
42
|
"""
|
61
43
|
设定关键参数
|
62
44
|
参数:
|
63
|
-
iterator:
|
64
|
-
folder_path:
|
65
|
-
|
45
|
+
iterator: 迭代器/生成器
|
46
|
+
folder_path: <path> 构建基于磁盘的缓存时,保存二进制文件的路径
|
47
|
+
paras_for_memo_cache: <dict> 构建基于内存的缓存的参数
|
66
48
|
其他参数:
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
49
|
+
b_strict_mode: <boolean> 禁止同时设置 iterator 和给定一个非空的 folder_path
|
50
|
+
默认为 True 开启,此时同时设置将报错。
|
51
|
+
当设置为 False 时,同时设置将以 folder_path 中的二进制文件为准
|
52
|
+
b_del_cache_when_exit: <boolean> 退出时删除生成的缓存二进制文件
|
53
|
+
只有在设置了 iterator 的前提下,才会触发。
|
54
|
+
(对于非本实例生成的文件,比如只给定了非空的 folder_path,不做删除。)
|
55
|
+
默认为 True 开启。
|
74
56
|
"""
|
75
57
|
|
76
58
|
# 默认参数
|
77
59
|
paras = {
|
78
60
|
"iterator": None,
|
79
61
|
"folder_path": None,
|
80
|
-
"
|
81
|
-
|
82
|
-
"
|
83
|
-
"
|
62
|
+
"paras_for_memo_cache": dict(upper_bound=20, refactor_size=0.7, strategy=":by_last_time:LRU",
|
63
|
+
cache=":in_memory:Memo"),
|
64
|
+
"b_strict_mode": True,
|
65
|
+
"b_del_cache_when_exit": True,
|
84
66
|
}
|
85
67
|
|
86
68
|
# 获取参数
|
87
69
|
paras.update(kwargs)
|
88
70
|
|
89
71
|
# 校验参数
|
90
|
-
#
|
91
|
-
|
92
|
-
|
93
|
-
cache_size_upper_bound=10)
|
72
|
+
# paras_for_memo_cache
|
73
|
+
assert isinstance(paras["paras_for_memo_cache"], (dict,))
|
74
|
+
paras["paras_for_memo_cache"]["strategy"] = ":by_last_time:LRU"
|
94
75
|
# 同时非空
|
95
76
|
b_folder_not_empty = isinstance(paras["folder_path"], (str,)) and paras[
|
96
77
|
"folder_path"] is not None and os.path.exists(paras["folder_path"]) and len(
|
97
78
|
os.listdir(paras["folder_path"])) > 0
|
98
79
|
if paras["iterator"] is not None and b_folder_not_empty:
|
99
80
|
# iterator 非空,folder_path 非空
|
100
|
-
if paras["
|
81
|
+
if paras["b_strict_mode"]:
|
101
82
|
# 不能同时设置
|
102
83
|
raise Exception(f"Error: folder_path and iterator cannot be set at the same time\n"
|
103
84
|
f"iterator {paras['iterator']} is given when "
|
@@ -123,13 +104,12 @@ class Cache_Manager_for_Iterator:
|
|
123
104
|
# 尝试直接根据已有文件构建 file_dict
|
124
105
|
file_dict = self.find_chuck_files(paras["folder_path"])
|
125
106
|
|
107
|
+
# 构建基于内存的缓存
|
108
|
+
self.memo_cache_manager = Cache_Manager(**paras["paras_for_memo_cache"])
|
109
|
+
|
126
110
|
self.file_dict = file_dict
|
127
111
|
self.paras = paras
|
128
112
|
|
129
|
-
# 初始化基于内存的缓存
|
130
|
-
self.cache_dict = dict()
|
131
|
-
self.cache_metadata = dict()
|
132
|
-
|
133
113
|
# 记录最后读取的index
|
134
114
|
self.index = -1
|
135
115
|
|
@@ -181,49 +161,19 @@ class Cache_Manager_for_Iterator:
|
|
181
161
|
chunk = pickle.load(f)
|
182
162
|
return chunk
|
183
163
|
|
184
|
-
# ------------------------------------ 基于内存的缓存 ------------------------------------ #
|
185
|
-
|
186
|
-
def __read_from_cache(self, index):
|
187
|
-
"""
|
188
|
-
从内存中读取
|
189
|
-
"""
|
190
|
-
chunk = self.cache_dict[index]
|
191
|
-
# 更新缓存属性
|
192
|
-
self.cache_metadata[index]["counts"] += 1
|
193
|
-
self.cache_metadata[index]["last_time"] = time.time()
|
194
|
-
return chunk
|
195
|
-
|
196
|
-
def __add_to_cache(self, index, chunk):
|
197
|
-
"""
|
198
|
-
添加到内存中
|
199
|
-
"""
|
200
|
-
# 更新缓存
|
201
|
-
self.cache_dict[index] = chunk
|
202
|
-
# 更新缓存属性
|
203
|
-
self.cache_metadata[index] = {
|
204
|
-
"last_time": time.time(), # 最近读取时间
|
205
|
-
"initial_time": time.time(), # 最初读取时间
|
206
|
-
"counts": 1, # 读取次数
|
207
|
-
}
|
208
|
-
# 依据策略去除优先级较低的缓存
|
209
|
-
drop_ls = self.paras["cache_update_strategy"](self.cache_metadata)
|
210
|
-
for i in drop_ls:
|
211
|
-
self.cache_dict.pop(i)
|
212
|
-
self.cache_metadata.pop(i)
|
213
|
-
|
214
164
|
# ------------------------------------ 读取 ------------------------------------ #
|
215
165
|
|
216
166
|
def read(self, index):
|
217
167
|
assert 0 <= index < len(self), \
|
218
168
|
KeyError(f"Error: index {index} not in [0, {len(self)})")
|
219
|
-
if
|
169
|
+
if self.memo_cache_manager.has(key=index):
|
220
170
|
# 直接从内存中读取
|
221
|
-
chunk = self.
|
171
|
+
chunk = self.memo_cache_manager.get(key=index)
|
222
172
|
else:
|
223
173
|
# 到磁盘读取
|
224
174
|
chunk = self.__read_from_files(index)
|
225
175
|
# 添加到缓存
|
226
|
-
self.
|
176
|
+
self.memo_cache_manager.add(key=index, value=chunk)
|
227
177
|
self.index = index
|
228
178
|
return chunk
|
229
179
|
|
@@ -246,8 +196,8 @@ class Cache_Manager_for_Iterator:
|
|
246
196
|
return len(self.file_dict)
|
247
197
|
|
248
198
|
def __del__(self):
|
249
|
-
if self.paras["iterator"] is not None and self.paras["
|
250
|
-
# 在
|
199
|
+
if self.paras["iterator"] is not None and self.paras["b_del_cache_when_exit"] and self.paras["b_strict_mode"]:
|
200
|
+
# 在 b_strict_mode 开启,且 iterator 非空的情况下 self.file_dict 中的二进制文件一定是根据 iterator 生成的
|
251
201
|
# 删除文件
|
252
202
|
pwd_bak = os.getcwd()
|
253
203
|
os.chdir(self.paras["folder_path"])
|
@@ -259,97 +209,15 @@ class Cache_Manager_for_Iterator:
|
|
259
209
|
os.removedirs(self.paras["folder_path"])
|
260
210
|
|
261
211
|
|
262
|
-
class Strategies:
|
263
|
-
"""
|
264
|
-
现有策略:
|
265
|
-
drop_min_counts: 去除读取次数最小的
|
266
|
-
drop_min_last_time: 去除最近没有读取的
|
267
|
-
drop_min_survival_time: 去除生存时间最短的,生成时间 survival_time:=last_time-initial_time
|
268
|
-
"""
|
269
|
-
|
270
|
-
@staticmethod
|
271
|
-
def drop_min_counts(cache_metadata, cache_size_upper_bound, cache_size_after_drop=None):
|
272
|
-
"""
|
273
|
-
去除读取次数最小的
|
274
|
-
参数:
|
275
|
-
cache_metadata: 缓存的属性数据
|
276
|
-
包含以下字段,各字段将自动更新
|
277
|
-
例如:{ 1: { "last_time": xxx, # 最近读取时间
|
278
|
-
"initial_time": xxx, # 最初读取时间
|
279
|
-
"counts": xxx, # 读取次数
|
280
|
-
},
|
281
|
-
... }
|
282
|
-
cache_size_upper_bound: 当 cache_metadata 的大小超过该值时触发更新
|
283
|
-
cache_size_after_drop: 更新后 cache_metadata 的目标大小
|
284
|
-
默认为 cache_size_upper_bound
|
285
|
-
"""
|
286
|
-
if cache_size_upper_bound >= len(cache_metadata):
|
287
|
-
return []
|
288
|
-
|
289
|
-
cache_size_after_drop = cache_size_upper_bound if cache_size_after_drop is None else cache_size_after_drop
|
290
|
-
# (这里其实可以用最大堆来优化,但是我懒啊)
|
291
|
-
drop_ls = [i for i, j in sorted(cache_metadata.items(), key=lambda x: x[1]["counts"])[:-cache_size_after_drop]]
|
292
|
-
return drop_ls
|
293
|
-
|
294
|
-
@staticmethod
|
295
|
-
def drop_min_last_time(cache_metadata, cache_size_upper_bound, cache_size_after_drop=None):
|
296
|
-
"""
|
297
|
-
去除最近没有读取的
|
298
|
-
"""
|
299
|
-
if cache_size_upper_bound >= len(cache_metadata):
|
300
|
-
return []
|
301
|
-
|
302
|
-
cache_size_after_drop = cache_size_upper_bound if cache_size_after_drop is None else cache_size_after_drop
|
303
|
-
drop_ls = [i for i, j in
|
304
|
-
sorted(cache_metadata.items(), key=lambda x: x[1]["last_time"])[:-cache_size_after_drop]]
|
305
|
-
return drop_ls
|
306
|
-
|
307
|
-
@staticmethod
|
308
|
-
def drop_min_survival_time(cache_metadata, cache_size_upper_bound, cache_size_after_drop=None):
|
309
|
-
"""
|
310
|
-
去除生存时间最短的,生成时间 survival_time:=last_time-initial_time
|
311
|
-
"""
|
312
|
-
if cache_size_upper_bound >= len(cache_metadata):
|
313
|
-
return []
|
314
|
-
|
315
|
-
cache_size_after_drop = cache_size_upper_bound if cache_size_after_drop is None else cache_size_after_drop
|
316
|
-
drop_ls = [i for i, j in sorted(cache_metadata.items(), key=lambda x: x[1]["last_time"] - x[1]["initial_time"])[
|
317
|
-
:-cache_size_after_drop]]
|
318
|
-
return drop_ls
|
319
|
-
|
320
|
-
|
321
212
|
if __name__ == '__main__':
|
322
|
-
"测试 Strategies"
|
323
|
-
_cache_metadata = {
|
324
|
-
1: {
|
325
|
-
"last_time": 123,
|
326
|
-
"initial_time": 56,
|
327
|
-
"counts": 2,
|
328
|
-
},
|
329
|
-
2: {
|
330
|
-
"last_time": 110,
|
331
|
-
"initial_time": 0,
|
332
|
-
"counts": 4,
|
333
|
-
},
|
334
|
-
3: {
|
335
|
-
"last_time": 126,
|
336
|
-
"initial_time": 121,
|
337
|
-
"counts": 1,
|
338
|
-
},
|
339
|
-
}
|
340
|
-
|
341
|
-
print(Strategies.drop_min_counts(_cache_metadata, 2))
|
342
|
-
print(Strategies.drop_min_last_time(_cache_metadata, 1))
|
343
|
-
print(Strategies.drop_min_survival_time(_cache_metadata, 1))
|
344
|
-
|
345
213
|
"测试 Cache_Manager_for_Iterator"
|
346
|
-
cache_manager = Cache_Manager_for_Iterator(
|
347
|
-
|
348
|
-
|
349
|
-
|
350
|
-
|
214
|
+
cache_manager = Cache_Manager_for_Iterator(
|
215
|
+
iterator=range(10),
|
216
|
+
b_del_cache_when_exit=True,
|
217
|
+
paras_for_memo_cache=dict(upper_bound=3, refactor_size=3, strategy=":by_last_time:LRU")
|
218
|
+
)
|
351
219
|
print(cache_manager.file_dict)
|
352
|
-
print(cache_manager.
|
220
|
+
print(cache_manager.memo_cache_manager.metadata_s)
|
353
221
|
for i in range(3):
|
354
222
|
print(cache_manager.read(0))
|
355
223
|
for i in range(2):
|
@@ -358,4 +226,4 @@ if __name__ == '__main__':
|
|
358
226
|
print(cache_manager.read(6))
|
359
227
|
for i in range(4):
|
360
228
|
print(cache_manager.read(9))
|
361
|
-
print(cache_manager.
|
229
|
+
print(cache_manager.memo_cache_manager.metadata_s)
|
File without changes
|
@@ -0,0 +1,34 @@
|
|
1
|
+
import pytest
|
2
|
+
import time
|
3
|
+
from kevin_toolbox.patches.for_test import check_consistency
|
4
|
+
from kevin_toolbox.data_flow.core.cache import Cache_Manager_for_Iterator
|
5
|
+
|
6
|
+
|
7
|
+
def test_cache_manager_for_iterator():
|
8
|
+
print("test Cache_Manager_for_Iterator")
|
9
|
+
|
10
|
+
cache_manager = Cache_Manager_for_Iterator(
|
11
|
+
iterator=range(10),
|
12
|
+
b_del_cache_when_exit=True,
|
13
|
+
paras_for_memo_cache=dict(upper_bound=3, refactor_size=3, strategy=":by_last_time:LRU")
|
14
|
+
)
|
15
|
+
|
16
|
+
check_consistency(
|
17
|
+
cache_manager.file_dict,
|
18
|
+
{i: f'{i}.pkl' for i in range(10)}
|
19
|
+
)
|
20
|
+
check_consistency(
|
21
|
+
cache_manager.memo_cache_manager.metadata_s,
|
22
|
+
dict()
|
23
|
+
)
|
24
|
+
key_to_counts = {0: 3, 3: 2, 6: 1, 9: 4}
|
25
|
+
for key, counts in key_to_counts.items():
|
26
|
+
for _ in range(counts):
|
27
|
+
time.sleep(0.05)
|
28
|
+
check_consistency(cache_manager.read(key), key)
|
29
|
+
# counts
|
30
|
+
for key, counts in key_to_counts.items():
|
31
|
+
if key == 0:
|
32
|
+
assert key not in cache_manager.memo_cache_manager.metadata_s
|
33
|
+
else:
|
34
|
+
check_consistency(cache_manager.memo_cache_manager.metadata_s[key]["counts"], counts - 1)
|
@@ -1,4 +1,5 @@
|
|
1
1
|
import os
|
2
|
+
import copy
|
2
3
|
|
3
4
|
|
4
5
|
class File_Iterative_Reader:
|
@@ -12,11 +13,14 @@ class File_Iterative_Reader:
|
|
12
13
|
设定关键参数
|
13
14
|
必要参数:
|
14
15
|
file_path: 文件路径
|
16
|
+
file_obj: 文件对象
|
17
|
+
注意!!以上两个参数指定其一即可,同时指定时候,以后者为准。
|
15
18
|
读取模式相关参数:
|
16
19
|
paras_for_open: open() 函数的补充参数
|
17
20
|
mode: 读取模式,默认为 "lines"
|
18
21
|
"lines": 按行数计算批次大小
|
19
22
|
"bytes": 按字节数计算
|
23
|
+
注意!!以上两个参数在指定了 file_obj 参数后将失效。
|
20
24
|
chunk_size: 批次大小
|
21
25
|
默认为 1k
|
22
26
|
当为-1时,读取整个文件
|
@@ -51,6 +55,7 @@ class File_Iterative_Reader:
|
|
51
55
|
paras = {
|
52
56
|
# 必要参数
|
53
57
|
"file_path": None,
|
58
|
+
"file_obj": None,
|
54
59
|
# 读取模式相关参数
|
55
60
|
"paras_for_open": dict(mode="r", encoding='utf-8'),
|
56
61
|
"mode": "lines",
|
@@ -74,16 +79,21 @@ class File_Iterative_Reader:
|
|
74
79
|
assert mode in ["lines", "bytes"]
|
75
80
|
paras["chunk_size"] = int(paras["chunk_size"])
|
76
81
|
paras["loop_num"] = int(paras["loop_num"]) - 1
|
77
|
-
#
|
78
|
-
file_path = paras["file_path"]
|
79
|
-
assert isinstance(file_path, (str,)) and os.path.exists(file_path), \
|
80
|
-
Exception(f"Error: file {file_path} not exists!")
|
81
|
-
#
|
82
|
-
paras_for_open = paras["paras_for_open"]
|
83
|
-
assert isinstance(paras_for_open, (dict,))
|
84
82
|
|
85
83
|
# 获取文件对象
|
86
|
-
|
84
|
+
if paras["file_obj"] is None:
|
85
|
+
assert isinstance(paras["file_path"], (str,)) and os.path.isfile(paras["file_path"]), \
|
86
|
+
Exception(f'Error: file {paras["file_path"]} not exists!')
|
87
|
+
#
|
88
|
+
assert isinstance(paras["paras_for_open"], (dict,))
|
89
|
+
self.file = open(paras["file_path"], **paras["paras_for_open"])
|
90
|
+
else:
|
91
|
+
# 拷贝对象,防止修改外部对象
|
92
|
+
try:
|
93
|
+
self.file = copy.deepcopy(paras["file_obj"])
|
94
|
+
except:
|
95
|
+
self.file = open(paras["file_obj"].name, mode=paras["file_obj"].mode)
|
96
|
+
|
87
97
|
# 选择相应模式
|
88
98
|
self.__read_func = {"lines": self.__read_lines, "bytes": self.__read_bytes}[mode]
|
89
99
|
self.__jump_func = {"lines": self.__jump_lines, "bytes": self.__jump_bytes}[mode]
|
@@ -225,9 +235,34 @@ class File_Iterative_Reader:
|
|
225
235
|
if __name__ == "__main__":
|
226
236
|
import numpy as np
|
227
237
|
|
228
|
-
|
238
|
+
print("使用 file_path")
|
239
|
+
reader = File_Iterative_Reader(file_path="test/test_data/test_data.txt", chunk_size=2, drop=True, loop_num=2,
|
229
240
|
pre_jump_size=3, convert_func=lambda x: np.array(x))
|
230
241
|
for i in reader:
|
231
242
|
print(i)
|
232
243
|
|
233
244
|
del reader
|
245
|
+
|
246
|
+
print("使用 file_obj")
|
247
|
+
reader = File_Iterative_Reader(
|
248
|
+
file_obj=open("test/test_data/test_data.txt", "r"), chunk_size=2, drop=True, loop_num=2,
|
249
|
+
pre_jump_size=3, convert_func=lambda x: np.array(x))
|
250
|
+
for i in reader:
|
251
|
+
print(i)
|
252
|
+
|
253
|
+
del reader
|
254
|
+
|
255
|
+
print("从字符串构建文件对象作为 file_obj")
|
256
|
+
from io import StringIO
|
257
|
+
|
258
|
+
file_obj = StringIO(initial_value=open("test/test_data/test_data.txt", "r").read())
|
259
|
+
reader = File_Iterative_Reader(
|
260
|
+
file_obj=file_obj, chunk_size=2, drop=True, loop_num=2,
|
261
|
+
pre_jump_size=3, convert_func=lambda x: np.array(x))
|
262
|
+
for i in reader:
|
263
|
+
print(i)
|
264
|
+
|
265
|
+
print("证明不会修改外部对象")
|
266
|
+
print(file_obj.read())
|
267
|
+
|
268
|
+
del reader
|
@@ -49,7 +49,7 @@ if __name__ == '__main__':
|
|
49
49
|
print(reader.read([3, 3]).shape)
|
50
50
|
print(reader.find(1))
|
51
51
|
|
52
|
-
reader = UReader(file_path="test_data.txt", chunk_size=2, folder_path="./temp/233")
|
52
|
+
reader = UReader(file_path="test/test_data/test_data.txt", chunk_size=2, folder_path="./temp/233")
|
53
53
|
|
54
54
|
print(reader.read(2, 7))
|
55
55
|
# del reader
|
@@ -67,7 +67,7 @@ if __name__ == '__main__':
|
|
67
67
|
|
68
68
|
print(reader.find('data/6/horse_race_pan/2132020102319002000161_43_4.bmp'))
|
69
69
|
|
70
|
-
reader = Reader_for_files(file_path="test_data.txt", chunk_size=2, pre_jump_size=2, jump_size=2)
|
70
|
+
reader = Reader_for_files(file_path="test/test_data/test_data.txt", chunk_size=2, pre_jump_size=2, jump_size=2)
|
71
71
|
|
72
72
|
for i in reader:
|
73
73
|
print(2333, i)
|
@@ -2,7 +2,7 @@ from abc import ABC, abstractmethod
|
|
2
2
|
import os
|
3
3
|
import numpy as np
|
4
4
|
from kevin_toolbox.data_flow.core.reader import File_Iterative_Reader
|
5
|
-
from kevin_toolbox.data_flow.core.cache import Cache_Manager_for_Iterator
|
5
|
+
from kevin_toolbox.data_flow.core.cache import Cache_Manager_for_Iterator
|
6
6
|
|
7
7
|
|
8
8
|
class Unified_Reader_Base(ABC):
|
@@ -126,10 +126,9 @@ class Unified_Reader_Base(ABC):
|
|
126
126
|
"""
|
127
127
|
默认使用 Cache_Manager_for_Iterator
|
128
128
|
"""
|
129
|
-
manager = Cache_Manager_for_Iterator(
|
130
|
-
|
131
|
-
|
132
|
-
cache_size_upper_bound=3))
|
129
|
+
manager = Cache_Manager_for_Iterator(
|
130
|
+
iterator=iterator, folder_path=folder_path,
|
131
|
+
paras_for_memo_cache=dict(upper_bound=20, refactor_size=0.7, strategy=":by_last_time:LRU"))
|
133
132
|
return manager
|
134
133
|
|
135
134
|
def read(self, *args, **kwargs):
|
@@ -2,6 +2,6 @@ from .convert_dict_key_to_number import convert_dict_key_to_number
|
|
2
2
|
from .convert_ndarray_to_list import convert_ndarray_to_list
|
3
3
|
from .escape_non_str_dict_key import escape_non_str_dict_key
|
4
4
|
from .unescape_non_str_dict_key import unescape_non_str_dict_key
|
5
|
-
from .
|
6
|
-
from .
|
5
|
+
from .escape_tuple_and_set import escape_tuple_and_set
|
6
|
+
from .unescape_tuple_and_set import unescape_tuple_and_set
|
7
7
|
from .integrate import integrate
|
@@ -0,0 +1,23 @@
|
|
1
|
+
def escape_tuple_and_set(x):
|
2
|
+
"""
|
3
|
+
将 tuple 和 set 进行转义
|
4
|
+
转义: x ==> f"<eval>{x}"
|
5
|
+
反转义: f"<eval>{x}" ==> x
|
6
|
+
|
7
|
+
为什么要进行转义?
|
8
|
+
由于 json 中会将 tuple 作为 list 进行保存,同时也无法保存 set,因此在保存过程中会丢失相应信息。
|
9
|
+
"""
|
10
|
+
if isinstance(x, (tuple, set,)) or (isinstance(x, (str,)) and x.startswith("<eval>")):
|
11
|
+
return f'<eval>{x}'
|
12
|
+
else:
|
13
|
+
return x
|
14
|
+
|
15
|
+
|
16
|
+
if __name__ == '__main__':
|
17
|
+
print(escape_tuple_and_set((1, 2, "\'1\'")))
|
18
|
+
# <eval>(1, 2, "'1'")
|
19
|
+
print(escape_tuple_and_set({"1", (1, 2, 3), 233}))
|
20
|
+
# <eval>{'1', 233, (1, 2, 3)}
|
21
|
+
|
22
|
+
print(escape_tuple_and_set("<eval>233"))
|
23
|
+
# <eval><eval>233
|
kevin_toolbox/data_flow/file/json_/converter/{unescape_tuple.py → unescape_tuple_and_set.py}
RENAMED
@@ -1,11 +1,11 @@
|
|
1
|
-
def
|
1
|
+
def unescape_tuple_and_set(x):
|
2
2
|
"""
|
3
|
-
将 tuple 进行反转义
|
3
|
+
将 tuple 和 set 进行反转义
|
4
4
|
转义: x ==> f"<eval>{x}"
|
5
5
|
反转义: f"<eval>{x}" ==> x
|
6
6
|
|
7
7
|
为什么要进行转义?
|
8
|
-
由于 json 中会将 tuple 作为 list
|
8
|
+
由于 json 中会将 tuple 作为 list 进行保存,同时也无法保存 set,因此在保存过程中会丢失相应信息。
|
9
9
|
"""
|
10
10
|
if isinstance(x, str) and x.startswith("<eval>"):
|
11
11
|
x = x[6:]
|
@@ -17,7 +17,9 @@ def unescape_tuple(x):
|
|
17
17
|
|
18
18
|
|
19
19
|
if __name__ == '__main__':
|
20
|
-
print(
|
20
|
+
print(unescape_tuple_and_set("<eval>(1, 2, \"'1'\")"))
|
21
21
|
# (1, 2, "\'1\'")
|
22
|
-
print(
|
22
|
+
print(unescape_tuple_and_set("<eval>{'1', 233, (1, 2, 3)}"))
|
23
|
+
# {'1', 233, (1, 2, 3)}
|
24
|
+
print(unescape_tuple_and_set("<eval><eval>233"))
|
23
25
|
# "<eval>233"
|
@@ -1,6 +1,6 @@
|
|
1
1
|
import os
|
2
2
|
import json
|
3
|
-
from kevin_toolbox.data_flow.file.json_.converter import integrate,
|
3
|
+
from kevin_toolbox.data_flow.file.json_.converter import integrate, unescape_tuple_and_set, unescape_non_str_dict_key
|
4
4
|
from kevin_toolbox.nested_dict_list import traverse
|
5
5
|
|
6
6
|
|
@@ -14,14 +14,14 @@ def read_json(file_path, converters=None, b_use_suggested_converter=False):
|
|
14
14
|
转换器 converter 应该是一个形如 def(x): ... ; return x 的函数,具体可以参考
|
15
15
|
json_.converter 中已实现的转换器
|
16
16
|
b_use_suggested_converter: <boolean> 是否使用建议的转换器
|
17
|
-
建议使用 unescape/escape_non_str_dict_key 和 unescape/
|
17
|
+
建议使用 unescape/escape_non_str_dict_key 和 unescape/escape_tuple_and_set 这两对转换器,
|
18
18
|
可以避免因 json 的读取/写入而丢失部分信息。
|
19
19
|
默认为 False。
|
20
20
|
注意:当 converters 非 None,此参数失效,以 converters 中的具体设置为准
|
21
21
|
"""
|
22
22
|
assert os.path.isfile(file_path), f'file {file_path} not found'
|
23
23
|
if converters is None and b_use_suggested_converter:
|
24
|
-
converters = [
|
24
|
+
converters = [unescape_tuple_and_set, unescape_non_str_dict_key]
|
25
25
|
|
26
26
|
with open(file_path, 'r') as f:
|
27
27
|
content = json.load(f)
|
@@ -1,7 +1,7 @@
|
|
1
1
|
import os
|
2
2
|
import json
|
3
3
|
import copy
|
4
|
-
from kevin_toolbox.data_flow.file.json_.converter import integrate,
|
4
|
+
from kevin_toolbox.data_flow.file.json_.converter import integrate, escape_tuple_and_set, escape_non_str_dict_key
|
5
5
|
from kevin_toolbox.nested_dict_list import traverse
|
6
6
|
|
7
7
|
|
@@ -18,7 +18,7 @@ def write_json(content, file_path, sort_keys=False, converters=None, b_use_sugge
|
|
18
18
|
转换器 converter 应该是一个形如 def(x): ... ; return x 的函数,具体可以参考
|
19
19
|
json_.converter 中已实现的转换器
|
20
20
|
b_use_suggested_converter: <boolean> 是否使用建议的转换器
|
21
|
-
建议使用 unescape/escape_non_str_dict_key 和 unescape/
|
21
|
+
建议使用 unescape/escape_non_str_dict_key 和 unescape/escape_tuple_and_set 这两对转换器,
|
22
22
|
可以避免因 json 的读取/写入而丢失部分信息。
|
23
23
|
默认为 False。
|
24
24
|
注意:当 converters 非 None,此参数失效,以 converters 中的具体设置为准
|
@@ -26,7 +26,7 @@ def write_json(content, file_path, sort_keys=False, converters=None, b_use_sugge
|
|
26
26
|
assert isinstance(file_path, (str, type(None)))
|
27
27
|
|
28
28
|
if converters is None and b_use_suggested_converter:
|
29
|
-
converters = [
|
29
|
+
converters = [escape_tuple_and_set, escape_non_str_dict_key]
|
30
30
|
|
31
31
|
if converters is not None:
|
32
32
|
converter = integrate(converters)
|
@@ -15,6 +15,8 @@ class Kevin_Notation_Reader:
|
|
15
15
|
|
16
16
|
必要参数:
|
17
17
|
file_path: <string> 文件路径
|
18
|
+
file_obj: <file object> 文件对象
|
19
|
+
以上参数2选一,具体参见 File_Iterative_Reader
|
18
20
|
读取相关参数:
|
19
21
|
chunk_size: <integer> 每次读取多少行数据
|
20
22
|
beg: <integer> 开始读取的位置
|
@@ -28,6 +30,7 @@ class Kevin_Notation_Reader:
|
|
28
30
|
paras = {
|
29
31
|
# 必要参数
|
30
32
|
"file_path": None,
|
33
|
+
"file_obj": None,
|
31
34
|
# 读取相关参数
|
32
35
|
"chunk_size": 100,
|
33
36
|
"beg": 0,
|
@@ -38,9 +41,7 @@ class Kevin_Notation_Reader:
|
|
38
41
|
paras.update(kwargs)
|
39
42
|
|
40
43
|
# 校验参数
|
41
|
-
|
42
|
-
f'file not exists :{paras["file_path"]}'
|
43
|
-
#
|
44
|
+
# file_path 和 file_obj 交给 File_Iterative_Reader 校验
|
44
45
|
assert isinstance(paras["chunk_size"], (int,)) and (paras["chunk_size"] > 0 or paras["chunk_size"] == -1)
|
45
46
|
assert isinstance(paras["beg"], (int,)) and paras["beg"] >= 0
|
46
47
|
assert isinstance(paras["converter"], (Converter, dict,))
|
@@ -48,7 +49,7 @@ class Kevin_Notation_Reader:
|
|
48
49
|
self.paras = paras
|
49
50
|
|
50
51
|
# 读取开头
|
51
|
-
self.reader = File_Iterative_Reader(file_path=self.paras["file_path"],
|
52
|
+
self.reader = File_Iterative_Reader(file_path=self.paras["file_path"], file_obj=self.paras["file_obj"],
|
52
53
|
pre_jump_size=self.paras["beg"],
|
53
54
|
filter_=lambda x: x != "\n" and not x.startswith("//"), # 去除注释
|
54
55
|
map_func=lambda x: x.rsplit("\n", 1)[0].split("//", 1)[0],
|
@@ -64,7 +65,7 @@ class Kevin_Notation_Reader:
|
|
64
65
|
del self.reader
|
65
66
|
|
66
67
|
# 读取内容
|
67
|
-
self.reader = File_Iterative_Reader(file_path=self.paras["file_path"],
|
68
|
+
self.reader = File_Iterative_Reader(file_path=self.paras["file_path"], file_obj=self.paras["file_obj"],
|
68
69
|
pre_jump_size=self.paras["beg"] + offset,
|
69
70
|
filter_=lambda x: x != "\n" and not x.startswith("//"), # 去除注释
|
70
71
|
map_func=lambda x: x.rsplit("\n", 1)[0].split("//", 1)[0],
|
@@ -1,11 +1,13 @@
|
|
1
1
|
from kevin_toolbox.data_flow.file import kevin_notation
|
2
2
|
|
3
3
|
|
4
|
-
def read(file_path):
|
4
|
+
def read(file_path=None, file_obj=None):
|
5
5
|
"""
|
6
6
|
读取整个文件的快捷接口
|
7
7
|
"""
|
8
|
-
|
8
|
+
assert file_path is not None or file_obj is not None
|
9
|
+
|
10
|
+
with kevin_notation.Reader(file_path=file_path, chunk_size=-1, file_obj=file_obj) as reader:
|
9
11
|
# metadata
|
10
12
|
metadata = reader.metadata
|
11
13
|
# content
|