kevin-toolbox-dev 1.3.1__py3-none-any.whl → 1.3.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (56)
  1. kevin_toolbox/__init__.py +2 -2
  2. kevin_toolbox/computer_science/algorithm/cache_manager/__init__.py +1 -0
  3. kevin_toolbox/computer_science/algorithm/cache_manager/cache/__init__.py +2 -0
  4. kevin_toolbox/computer_science/algorithm/cache_manager/cache/cache_base.py +89 -0
  5. kevin_toolbox/computer_science/algorithm/cache_manager/cache/memo_cache.py +36 -0
  6. kevin_toolbox/computer_science/algorithm/cache_manager/cache_manager.py +218 -0
  7. kevin_toolbox/computer_science/algorithm/cache_manager/strategy/__init__.py +5 -0
  8. kevin_toolbox/computer_science/algorithm/cache_manager/strategy/fifo_strategy.py +21 -0
  9. kevin_toolbox/computer_science/algorithm/cache_manager/strategy/lfu_strategy.py +80 -0
  10. kevin_toolbox/computer_science/algorithm/cache_manager/strategy/lru_strategy.py +43 -0
  11. kevin_toolbox/computer_science/algorithm/cache_manager/strategy/lst_strategy.py +26 -0
  12. kevin_toolbox/computer_science/algorithm/cache_manager/strategy/strategy_base.py +45 -0
  13. kevin_toolbox/computer_science/algorithm/cache_manager/test/__init__.py +0 -0
  14. kevin_toolbox/computer_science/algorithm/cache_manager/test/test_cache_builder.py +37 -0
  15. kevin_toolbox/computer_science/algorithm/cache_manager/test/test_cache_manager.py +197 -0
  16. kevin_toolbox/computer_science/algorithm/cache_manager/test/test_cache_strategy.py +129 -0
  17. kevin_toolbox/computer_science/algorithm/cache_manager/variable.py +28 -0
  18. kevin_toolbox/computer_science/algorithm/registration/registry.py +38 -16
  19. kevin_toolbox/data_flow/core/cache/__init__.py +1 -1
  20. kevin_toolbox/data_flow/core/cache/cache_manager_for_iterator.py +36 -168
  21. kevin_toolbox/data_flow/core/cache/test/__init__.py +0 -0
  22. kevin_toolbox/data_flow/core/cache/test/test_cache_manager_for_iterator.py +34 -0
  23. kevin_toolbox/data_flow/core/reader/file_iterative_reader.py +44 -9
  24. kevin_toolbox/data_flow/core/reader/unified_reader.py +2 -2
  25. kevin_toolbox/data_flow/core/reader/unified_reader_base.py +4 -5
  26. kevin_toolbox/data_flow/file/json_/converter/__init__.py +2 -2
  27. kevin_toolbox/data_flow/file/json_/converter/escape_tuple_and_set.py +23 -0
  28. kevin_toolbox/data_flow/file/json_/converter/{unescape_tuple.py → unescape_tuple_and_set.py} +7 -5
  29. kevin_toolbox/data_flow/file/json_/read_json.py +3 -3
  30. kevin_toolbox/data_flow/file/json_/write_json.py +3 -3
  31. kevin_toolbox/data_flow/file/kevin_notation/kevin_notation_reader.py +6 -5
  32. kevin_toolbox/data_flow/file/kevin_notation/read.py +4 -2
  33. kevin_toolbox/data_flow/file/kevin_notation/test/test_kevin_notation.py +15 -3
  34. kevin_toolbox/data_flow/file/markdown/generate_table.py +2 -2
  35. kevin_toolbox/math/utils/__init__.py +1 -1
  36. kevin_toolbox/math/utils/{spilt_integer_most_evenly.py → split_integer_most_evenly.py} +2 -2
  37. kevin_toolbox/nested_dict_list/get_nodes.py +9 -4
  38. kevin_toolbox/nested_dict_list/name_handler/build_name.py +1 -1
  39. kevin_toolbox/nested_dict_list/name_handler/parse_name.py +1 -1
  40. kevin_toolbox/nested_dict_list/set_default.py +44 -28
  41. kevin_toolbox/patches/for_matplotlib/__init__.py +1 -0
  42. kevin_toolbox/patches/for_matplotlib/generate_color_list.py +33 -0
  43. kevin_toolbox/patches/for_numpy/linalg/__init__.py +1 -0
  44. kevin_toolbox/patches/for_numpy/linalg/entropy.py +26 -0
  45. kevin_toolbox/patches/for_numpy/random/__init__.py +3 -0
  46. kevin_toolbox/patches/for_numpy/random/get_rng.py +64 -0
  47. kevin_toolbox/patches/for_numpy/random/truncated_multivariate_normal.py +129 -0
  48. kevin_toolbox/patches/for_numpy/random/truncated_normal.py +89 -0
  49. kevin_toolbox/patches/for_numpy/random/variable.py +10 -0
  50. kevin_toolbox/patches/for_optuna/serialize/for_study/dump.py +10 -2
  51. kevin_toolbox_dev-1.3.3.dist-info/METADATA +75 -0
  52. {kevin_toolbox_dev-1.3.1.dist-info → kevin_toolbox_dev-1.3.3.dist-info}/RECORD +54 -29
  53. kevin_toolbox/data_flow/file/json_/converter/escape_tuple.py +0 -20
  54. kevin_toolbox_dev-1.3.1.dist-info/METADATA +0 -91
  55. {kevin_toolbox_dev-1.3.1.dist-info → kevin_toolbox_dev-1.3.3.dist-info}/WHEEL +0 -0
  56. {kevin_toolbox_dev-1.3.1.dist-info → kevin_toolbox_dev-1.3.3.dist-info}/top_level.txt +0 -0
@@ -2,6 +2,7 @@ import os
2
2
  import time
3
3
  import importlib.util
4
4
  from kevin_toolbox.developing.decorator import restore_original_work_path
5
+ from kevin_toolbox.computer_science.algorithm.cache_manager import Cache_Manager
5
6
 
6
7
  if importlib.util.find_spec("cPickle") is not None:
7
8
  # 如果安装有 cPickle,可以更快处理串行化
@@ -27,29 +28,10 @@ class Cache_Manager_for_Iterator:
27
28
  file_dict: 二进制文件的文件名与序号的对应关系表
28
29
  例如:{ 0: "0.pkl", 1: "1.pkl", 2: "2.pkl", ...}
29
30
  其中 index 0 对应于文件 ./temp/cache_name/0.pkl
30
- 然后在进行读取时,将先到 cache 中寻找是否已经有需要的 chunk 分块,如果没有则到前面 file_dict 中读取,同时更新 cache
31
+ 然后在进行读取时,将先到基于内存的缓存 memo_cache_manager 中寻找是否已经有需要的 chunk 分块,如果没有则到前面 file_dict 中读取,同时更新 memo_cache_manager
31
32
  相关变量:
32
- cache_dict 缓存
33
- 例如:{ 1: chunk_var_1, 3: chunk_var_3 }
33
+ memo_cache_manager 基于内存的缓存,由 Cache_Manager 构建,指定有更新策略等
34
34
  其中 key 是 chunk 分块的 index,value 是对应的保存在内存中的变量
35
- cache_metadata: 缓存的属性数据
36
- 包含以下字段,各字段将自动更新
37
- 例如:{ 1: { "last_time": xxx, # 最近读取时间
38
- "initial_time": xxx, # 最初读取时间
39
- "counts": xxx, # 读取次数
40
- },
41
- ... }
42
- cache_update_strategy:缓存更新策略
43
- 是一个函数,该函数的输入是 cache_metadata ,输出是需要删除的缓存的序号
44
- 触发:
45
- 在每次出现 cache_dict 无法命中,导致有新的 cache 添加到 cache_dict 时,将会采用该策略进行更新
46
- 现有策略:
47
- drop_min_counts: 去除读取次数最小的
48
- drop_min_last_time: 去除最近没有读取的
49
- drop_min_survival_time: 去除生存时间最短的,生成时间 survival_time:=last_time-initial_time
50
- 默认使用 drop_min_last_time
51
- (cache_dict 的大小受 cache_update_strategy 中的 cache_size 限制,当大小超过限制时,
52
- 将根据 cache_update_strategy 去除优先级较低的部分来更新缓存)
53
35
 
54
36
  支持以下几种方式来:
55
37
  以迭代器的形式进行顺序读取
@@ -60,44 +42,43 @@ class Cache_Manager_for_Iterator:
60
42
  """
61
43
  设定关键参数
62
44
  参数:
63
- iterator: 迭代器/生成器
64
- folder_path: 保存二进制文件的路径
65
- cache_update_strategy:缓存更新策略
45
+ iterator: 迭代器/生成器
46
+ folder_path: <path> 构建基于磁盘的缓存时,保存二进制文件的路径
47
+ paras_for_memo_cache: <dict> 构建基于内存的缓存的参数
66
48
  其他参数:
67
- strict_mode: 禁止同时设置 iterator 和给定一个非空的 folder_path
68
- 默认为 True 开启,此时同时设置将报错。
69
- 当设置为 False 时,同时设置将以 folder_path 中的二进制文件为准
70
- del_cache_when_exit: 退出时删除生成的缓存二进制文件
71
- 只有在设置了 iterator 的前提下,才会触发。
72
- (对于非本实例生成的文件,比如只给定了非空的 folder_path,不做删除。)
73
- 默认为 True 开启。
49
+ b_strict_mode: <boolean> 禁止同时设置 iterator 和给定一个非空的 folder_path
50
+ 默认为 True 开启,此时同时设置将报错。
51
+ 当设置为 False 时,同时设置将以 folder_path 中的二进制文件为准
52
+ b_del_cache_when_exit: <boolean> 退出时删除生成的缓存二进制文件
53
+ 只有在设置了 iterator 的前提下,才会触发。
54
+ (对于非本实例生成的文件,比如只给定了非空的 folder_path,不做删除。)
55
+ 默认为 True 开启。
74
56
  """
75
57
 
76
58
  # 默认参数
77
59
  paras = {
78
60
  "iterator": None,
79
61
  "folder_path": None,
80
- "cache_update_strategy": None,
81
- #
82
- "strict_mode": True,
83
- "del_cache_when_exit": True,
62
+ "paras_for_memo_cache": dict(upper_bound=20, refactor_size=0.7, strategy=":by_last_time:LRU",
63
+ cache=":in_memory:Memo"),
64
+ "b_strict_mode": True,
65
+ "b_del_cache_when_exit": True,
84
66
  }
85
67
 
86
68
  # 获取参数
87
69
  paras.update(kwargs)
88
70
 
89
71
  # 校验参数
90
- # cache_update_strategy
91
- if paras["cache_update_strategy"] is None:
92
- paras["cache_update_strategy"] = lambda x: Strategies.drop_min_last_time(cache_metadata=x,
93
- cache_size_upper_bound=10)
72
+ # paras_for_memo_cache
73
+ assert isinstance(paras["paras_for_memo_cache"], (dict,))
74
+ paras["paras_for_memo_cache"]["strategy"] = ":by_last_time:LRU"
94
75
  # 同时非空
95
76
  b_folder_not_empty = isinstance(paras["folder_path"], (str,)) and paras[
96
77
  "folder_path"] is not None and os.path.exists(paras["folder_path"]) and len(
97
78
  os.listdir(paras["folder_path"])) > 0
98
79
  if paras["iterator"] is not None and b_folder_not_empty:
99
80
  # iterator 非空,folder_path 非空
100
- if paras["strict_mode"]:
81
+ if paras["b_strict_mode"]:
101
82
  # 不能同时设置
102
83
  raise Exception(f"Error: folder_path and iterator cannot be set at the same time\n"
103
84
  f"iterator {paras['iterator']} is given when "
@@ -123,13 +104,12 @@ class Cache_Manager_for_Iterator:
123
104
  # 尝试直接根据已有文件构建 file_dict
124
105
  file_dict = self.find_chuck_files(paras["folder_path"])
125
106
 
107
+ # 构建基于内存的缓存
108
+ self.memo_cache_manager = Cache_Manager(**paras["paras_for_memo_cache"])
109
+
126
110
  self.file_dict = file_dict
127
111
  self.paras = paras
128
112
 
129
- # 初始化基于内存的缓存
130
- self.cache_dict = dict()
131
- self.cache_metadata = dict()
132
-
133
113
  # 记录最后读取的index
134
114
  self.index = -1
135
115
 
@@ -181,49 +161,19 @@ class Cache_Manager_for_Iterator:
181
161
  chunk = pickle.load(f)
182
162
  return chunk
183
163
 
184
- # ------------------------------------ 基于内存的缓存 ------------------------------------ #
185
-
186
- def __read_from_cache(self, index):
187
- """
188
- 从内存中读取
189
- """
190
- chunk = self.cache_dict[index]
191
- # 更新缓存属性
192
- self.cache_metadata[index]["counts"] += 1
193
- self.cache_metadata[index]["last_time"] = time.time()
194
- return chunk
195
-
196
- def __add_to_cache(self, index, chunk):
197
- """
198
- 添加到内存中
199
- """
200
- # 更新缓存
201
- self.cache_dict[index] = chunk
202
- # 更新缓存属性
203
- self.cache_metadata[index] = {
204
- "last_time": time.time(), # 最近读取时间
205
- "initial_time": time.time(), # 最初读取时间
206
- "counts": 1, # 读取次数
207
- }
208
- # 依据策略去除优先级较低的缓存
209
- drop_ls = self.paras["cache_update_strategy"](self.cache_metadata)
210
- for i in drop_ls:
211
- self.cache_dict.pop(i)
212
- self.cache_metadata.pop(i)
213
-
214
164
  # ------------------------------------ 读取 ------------------------------------ #
215
165
 
216
166
  def read(self, index):
217
167
  assert 0 <= index < len(self), \
218
168
  KeyError(f"Error: index {index} not in [0, {len(self)})")
219
- if index in self.cache_dict:
169
+ if self.memo_cache_manager.has(key=index):
220
170
  # 直接从内存中读取
221
- chunk = self.__read_from_cache(index)
171
+ chunk = self.memo_cache_manager.get(key=index)
222
172
  else:
223
173
  # 到磁盘读取
224
174
  chunk = self.__read_from_files(index)
225
175
  # 添加到缓存
226
- self.__add_to_cache(index, chunk)
176
+ self.memo_cache_manager.add(key=index, value=chunk)
227
177
  self.index = index
228
178
  return chunk
229
179
 
@@ -246,8 +196,8 @@ class Cache_Manager_for_Iterator:
246
196
  return len(self.file_dict)
247
197
 
248
198
  def __del__(self):
249
- if self.paras["iterator"] is not None and self.paras["del_cache_when_exit"] and self.paras["strict_mode"]:
250
- # 在 strict_mode 开启,且 iterator 非空的情况下 self.file_dict 中的二进制文件一定是根据 iterator 生成的
199
+ if self.paras["iterator"] is not None and self.paras["b_del_cache_when_exit"] and self.paras["b_strict_mode"]:
200
+ # 在 b_strict_mode 开启,且 iterator 非空的情况下 self.file_dict 中的二进制文件一定是根据 iterator 生成的
251
201
  # 删除文件
252
202
  pwd_bak = os.getcwd()
253
203
  os.chdir(self.paras["folder_path"])
@@ -259,97 +209,15 @@ class Cache_Manager_for_Iterator:
259
209
  os.removedirs(self.paras["folder_path"])
260
210
 
261
211
 
262
- class Strategies:
263
- """
264
- 现有策略:
265
- drop_min_counts: 去除读取次数最小的
266
- drop_min_last_time: 去除最近没有读取的
267
- drop_min_survival_time: 去除生存时间最短的,生成时间 survival_time:=last_time-initial_time
268
- """
269
-
270
- @staticmethod
271
- def drop_min_counts(cache_metadata, cache_size_upper_bound, cache_size_after_drop=None):
272
- """
273
- 去除读取次数最小的
274
- 参数:
275
- cache_metadata: 缓存的属性数据
276
- 包含以下字段,各字段将自动更新
277
- 例如:{ 1: { "last_time": xxx, # 最近读取时间
278
- "initial_time": xxx, # 最初读取时间
279
- "counts": xxx, # 读取次数
280
- },
281
- ... }
282
- cache_size_upper_bound: 当 cache_metadata 的大小超过该值时触发更新
283
- cache_size_after_drop: 更新后 cache_metadata 的目标大小
284
- 默认为 cache_size_upper_bound
285
- """
286
- if cache_size_upper_bound >= len(cache_metadata):
287
- return []
288
-
289
- cache_size_after_drop = cache_size_upper_bound if cache_size_after_drop is None else cache_size_after_drop
290
- # (这里其实可以用最大堆来优化,但是我懒啊)
291
- drop_ls = [i for i, j in sorted(cache_metadata.items(), key=lambda x: x[1]["counts"])[:-cache_size_after_drop]]
292
- return drop_ls
293
-
294
- @staticmethod
295
- def drop_min_last_time(cache_metadata, cache_size_upper_bound, cache_size_after_drop=None):
296
- """
297
- 去除最近没有读取的
298
- """
299
- if cache_size_upper_bound >= len(cache_metadata):
300
- return []
301
-
302
- cache_size_after_drop = cache_size_upper_bound if cache_size_after_drop is None else cache_size_after_drop
303
- drop_ls = [i for i, j in
304
- sorted(cache_metadata.items(), key=lambda x: x[1]["last_time"])[:-cache_size_after_drop]]
305
- return drop_ls
306
-
307
- @staticmethod
308
- def drop_min_survival_time(cache_metadata, cache_size_upper_bound, cache_size_after_drop=None):
309
- """
310
- 去除生存时间最短的,生成时间 survival_time:=last_time-initial_time
311
- """
312
- if cache_size_upper_bound >= len(cache_metadata):
313
- return []
314
-
315
- cache_size_after_drop = cache_size_upper_bound if cache_size_after_drop is None else cache_size_after_drop
316
- drop_ls = [i for i, j in sorted(cache_metadata.items(), key=lambda x: x[1]["last_time"] - x[1]["initial_time"])[
317
- :-cache_size_after_drop]]
318
- return drop_ls
319
-
320
-
321
212
  if __name__ == '__main__':
322
- "测试 Strategies"
323
- _cache_metadata = {
324
- 1: {
325
- "last_time": 123,
326
- "initial_time": 56,
327
- "counts": 2,
328
- },
329
- 2: {
330
- "last_time": 110,
331
- "initial_time": 0,
332
- "counts": 4,
333
- },
334
- 3: {
335
- "last_time": 126,
336
- "initial_time": 121,
337
- "counts": 1,
338
- },
339
- }
340
-
341
- print(Strategies.drop_min_counts(_cache_metadata, 2))
342
- print(Strategies.drop_min_last_time(_cache_metadata, 1))
343
- print(Strategies.drop_min_survival_time(_cache_metadata, 1))
344
-
345
213
  "测试 Cache_Manager_for_Iterator"
346
- cache_manager = Cache_Manager_for_Iterator(iterator=range(10),
347
- del_cache_when_exit=True,
348
- cache_update_strategy=lambda x: Strategies.drop_min_last_time(
349
- cache_metadata=x,
350
- cache_size_upper_bound=3))
214
+ cache_manager = Cache_Manager_for_Iterator(
215
+ iterator=range(10),
216
+ b_del_cache_when_exit=True,
217
+ paras_for_memo_cache=dict(upper_bound=3, refactor_size=3, strategy=":by_last_time:LRU")
218
+ )
351
219
  print(cache_manager.file_dict)
352
- print(cache_manager.cache_metadata)
220
+ print(cache_manager.memo_cache_manager.metadata_s)
353
221
  for i in range(3):
354
222
  print(cache_manager.read(0))
355
223
  for i in range(2):
@@ -358,4 +226,4 @@ if __name__ == '__main__':
358
226
  print(cache_manager.read(6))
359
227
  for i in range(4):
360
228
  print(cache_manager.read(9))
361
- print(cache_manager.cache_metadata)
229
+ print(cache_manager.memo_cache_manager.metadata_s)
File without changes
@@ -0,0 +1,34 @@
1
+ import pytest
2
+ import time
3
+ from kevin_toolbox.patches.for_test import check_consistency
4
+ from kevin_toolbox.data_flow.core.cache import Cache_Manager_for_Iterator
5
+
6
+
7
+ def test_cache_manager_for_iterator():
8
+ print("test Cache_Manager_for_Iterator")
9
+
10
+ cache_manager = Cache_Manager_for_Iterator(
11
+ iterator=range(10),
12
+ b_del_cache_when_exit=True,
13
+ paras_for_memo_cache=dict(upper_bound=3, refactor_size=3, strategy=":by_last_time:LRU")
14
+ )
15
+
16
+ check_consistency(
17
+ cache_manager.file_dict,
18
+ {i: f'{i}.pkl' for i in range(10)}
19
+ )
20
+ check_consistency(
21
+ cache_manager.memo_cache_manager.metadata_s,
22
+ dict()
23
+ )
24
+ key_to_counts = {0: 3, 3: 2, 6: 1, 9: 4}
25
+ for key, counts in key_to_counts.items():
26
+ for _ in range(counts):
27
+ time.sleep(0.05)
28
+ check_consistency(cache_manager.read(key), key)
29
+ # counts
30
+ for key, counts in key_to_counts.items():
31
+ if key == 0:
32
+ assert key not in cache_manager.memo_cache_manager.metadata_s
33
+ else:
34
+ check_consistency(cache_manager.memo_cache_manager.metadata_s[key]["counts"], counts - 1)
@@ -1,4 +1,5 @@
1
1
  import os
2
+ import copy
2
3
 
3
4
 
4
5
  class File_Iterative_Reader:
@@ -12,11 +13,14 @@ class File_Iterative_Reader:
12
13
  设定关键参数
13
14
  必要参数:
14
15
  file_path: 文件路径
16
+ file_obj: 文件对象
17
+ 注意!!以上两个参数指定其一即可,同时指定时候,以后者为准。
15
18
  读取模式相关参数:
16
19
  paras_for_open: open() 函数的补充参数
17
20
  mode: 读取模式,默认为 "lines"
18
21
  "lines": 按行数计算批次大小
19
22
  "bytes": 按字节数计算
23
+ 注意!!以上两个参数在指定了 file_obj 参数后将失效。
20
24
  chunk_size: 批次大小
21
25
  默认为 1k
22
26
  当为-1时,读取整个文件
@@ -51,6 +55,7 @@ class File_Iterative_Reader:
51
55
  paras = {
52
56
  # 必要参数
53
57
  "file_path": None,
58
+ "file_obj": None,
54
59
  # 读取模式相关参数
55
60
  "paras_for_open": dict(mode="r", encoding='utf-8'),
56
61
  "mode": "lines",
@@ -74,16 +79,21 @@ class File_Iterative_Reader:
74
79
  assert mode in ["lines", "bytes"]
75
80
  paras["chunk_size"] = int(paras["chunk_size"])
76
81
  paras["loop_num"] = int(paras["loop_num"]) - 1
77
- #
78
- file_path = paras["file_path"]
79
- assert isinstance(file_path, (str,)) and os.path.exists(file_path), \
80
- Exception(f"Error: file {file_path} not exists!")
81
- #
82
- paras_for_open = paras["paras_for_open"]
83
- assert isinstance(paras_for_open, (dict,))
84
82
 
85
83
  # 获取文件对象
86
- self.file = open(file_path, **paras_for_open)
84
+ if paras["file_obj"] is None:
85
+ assert isinstance(paras["file_path"], (str,)) and os.path.isfile(paras["file_path"]), \
86
+ Exception(f'Error: file {paras["file_path"]} not exists!')
87
+ #
88
+ assert isinstance(paras["paras_for_open"], (dict,))
89
+ self.file = open(paras["file_path"], **paras["paras_for_open"])
90
+ else:
91
+ # 拷贝对象,防止修改外部对象
92
+ try:
93
+ self.file = copy.deepcopy(paras["file_obj"])
94
+ except:
95
+ self.file = open(paras["file_obj"].name, mode=paras["file_obj"].mode)
96
+
87
97
  # 选择相应模式
88
98
  self.__read_func = {"lines": self.__read_lines, "bytes": self.__read_bytes}[mode]
89
99
  self.__jump_func = {"lines": self.__jump_lines, "bytes": self.__jump_bytes}[mode]
@@ -225,9 +235,34 @@ class File_Iterative_Reader:
225
235
  if __name__ == "__main__":
226
236
  import numpy as np
227
237
 
228
- reader = File_Iterative_Reader(file_path="developing/test_data.txt", chunk_size=2, drop=True, loop_num=2,
238
+ print("使用 file_path")
239
+ reader = File_Iterative_Reader(file_path="test/test_data/test_data.txt", chunk_size=2, drop=True, loop_num=2,
229
240
  pre_jump_size=3, convert_func=lambda x: np.array(x))
230
241
  for i in reader:
231
242
  print(i)
232
243
 
233
244
  del reader
245
+
246
+ print("使用 file_obj")
247
+ reader = File_Iterative_Reader(
248
+ file_obj=open("test/test_data/test_data.txt", "r"), chunk_size=2, drop=True, loop_num=2,
249
+ pre_jump_size=3, convert_func=lambda x: np.array(x))
250
+ for i in reader:
251
+ print(i)
252
+
253
+ del reader
254
+
255
+ print("从字符串构建文件对象作为 file_obj")
256
+ from io import StringIO
257
+
258
+ file_obj = StringIO(initial_value=open("test/test_data/test_data.txt", "r").read())
259
+ reader = File_Iterative_Reader(
260
+ file_obj=file_obj, chunk_size=2, drop=True, loop_num=2,
261
+ pre_jump_size=3, convert_func=lambda x: np.array(x))
262
+ for i in reader:
263
+ print(i)
264
+
265
+ print("证明不会修改外部对象")
266
+ print(file_obj.read())
267
+
268
+ del reader
@@ -49,7 +49,7 @@ if __name__ == '__main__':
49
49
  print(reader.read([3, 3]).shape)
50
50
  print(reader.find(1))
51
51
 
52
- reader = UReader(file_path="test_data.txt", chunk_size=2, folder_path="./temp/233")
52
+ reader = UReader(file_path="test/test_data/test_data.txt", chunk_size=2, folder_path="./temp/233")
53
53
 
54
54
  print(reader.read(2, 7))
55
55
  # del reader
@@ -67,7 +67,7 @@ if __name__ == '__main__':
67
67
 
68
68
  print(reader.find('data/6/horse_race_pan/2132020102319002000161_43_4.bmp'))
69
69
 
70
- reader = Reader_for_files(file_path="test_data.txt", chunk_size=2, pre_jump_size=2, jump_size=2)
70
+ reader = Reader_for_files(file_path="test/test_data/test_data.txt", chunk_size=2, pre_jump_size=2, jump_size=2)
71
71
 
72
72
  for i in reader:
73
73
  print(2333, i)
@@ -2,7 +2,7 @@ from abc import ABC, abstractmethod
2
2
  import os
3
3
  import numpy as np
4
4
  from kevin_toolbox.data_flow.core.reader import File_Iterative_Reader
5
- from kevin_toolbox.data_flow.core.cache import Cache_Manager_for_Iterator, Strategies
5
+ from kevin_toolbox.data_flow.core.cache import Cache_Manager_for_Iterator
6
6
 
7
7
 
8
8
  class Unified_Reader_Base(ABC):
@@ -126,10 +126,9 @@ class Unified_Reader_Base(ABC):
126
126
  """
127
127
  默认使用 Cache_Manager_for_Iterator
128
128
  """
129
- manager = Cache_Manager_for_Iterator(iterator=iterator, folder_path=folder_path,
130
- cache_update_strategy=lambda x: Strategies.drop_min_last_time(
131
- cache_metadata=x,
132
- cache_size_upper_bound=3))
129
+ manager = Cache_Manager_for_Iterator(
130
+ iterator=iterator, folder_path=folder_path,
131
+ paras_for_memo_cache=dict(upper_bound=20, refactor_size=0.7, strategy=":by_last_time:LRU"))
133
132
  return manager
134
133
 
135
134
  def read(self, *args, **kwargs):
@@ -2,6 +2,6 @@ from .convert_dict_key_to_number import convert_dict_key_to_number
2
2
  from .convert_ndarray_to_list import convert_ndarray_to_list
3
3
  from .escape_non_str_dict_key import escape_non_str_dict_key
4
4
  from .unescape_non_str_dict_key import unescape_non_str_dict_key
5
- from .escape_tuple import escape_tuple
6
- from .unescape_tuple import unescape_tuple
5
+ from .escape_tuple_and_set import escape_tuple_and_set
6
+ from .unescape_tuple_and_set import unescape_tuple_and_set
7
7
  from .integrate import integrate
@@ -0,0 +1,23 @@
1
+ def escape_tuple_and_set(x):
2
+ """
3
+ 将 tuple 和 set 进行转义
4
+ 转义: x ==> f"<eval>{x}"
5
+ 反转义: f"<eval>{x}" ==> x
6
+
7
+ 为什么要进行转义?
8
+ 由于 json 中会将 tuple 作为 list 进行保存,同时也无法保存 set,因此在保存过程中会丢失相应信息。
9
+ """
10
+ if isinstance(x, (tuple, set,)) or (isinstance(x, (str,)) and x.startswith("<eval>")):
11
+ return f'<eval>{x}'
12
+ else:
13
+ return x
14
+
15
+
16
+ if __name__ == '__main__':
17
+ print(escape_tuple_and_set((1, 2, "\'1\'")))
18
+ # <eval>(1, 2, "'1'")
19
+ print(escape_tuple_and_set({"1", (1, 2, 3), 233}))
20
+ # <eval>{'1', 233, (1, 2, 3)}
21
+
22
+ print(escape_tuple_and_set("<eval>233"))
23
+ # <eval><eval>233
@@ -1,11 +1,11 @@
1
- def unescape_tuple(x):
1
+ def unescape_tuple_and_set(x):
2
2
  """
3
- 将 tuple 进行反转义
3
+ 将 tuple 和 set 进行反转义
4
4
  转义: x ==> f"<eval>{x}"
5
5
  反转义: f"<eval>{x}" ==> x
6
6
 
7
7
  为什么要进行转义?
8
- 由于 json 中会将 tuple 作为 list 进行保存,因此在保存过程中会丢失相应信息。
8
+ 由于 json 中会将 tuple 作为 list 进行保存,同时也无法保存 set,因此在保存过程中会丢失相应信息。
9
9
  """
10
10
  if isinstance(x, str) and x.startswith("<eval>"):
11
11
  x = x[6:]
@@ -17,7 +17,9 @@ def unescape_tuple(x):
17
17
 
18
18
 
19
19
  if __name__ == '__main__':
20
- print(unescape_tuple("<eval>(1, 2, \"'1'\")"))
20
+ print(unescape_tuple_and_set("<eval>(1, 2, \"'1'\")"))
21
21
  # (1, 2, "\'1\'")
22
- print(unescape_tuple("<eval><eval>233"))
22
+ print(unescape_tuple_and_set("<eval>{'1', 233, (1, 2, 3)}"))
23
+ # {'1', 233, (1, 2, 3)}
24
+ print(unescape_tuple_and_set("<eval><eval>233"))
23
25
  # "<eval>233"
@@ -1,6 +1,6 @@
1
1
  import os
2
2
  import json
3
- from kevin_toolbox.data_flow.file.json_.converter import integrate, unescape_tuple, unescape_non_str_dict_key
3
+ from kevin_toolbox.data_flow.file.json_.converter import integrate, unescape_tuple_and_set, unescape_non_str_dict_key
4
4
  from kevin_toolbox.nested_dict_list import traverse
5
5
 
6
6
 
@@ -14,14 +14,14 @@ def read_json(file_path, converters=None, b_use_suggested_converter=False):
14
14
  转换器 converter 应该是一个形如 def(x): ... ; return x 的函数,具体可以参考
15
15
  json_.converter 中已实现的转换器
16
16
  b_use_suggested_converter: <boolean> 是否使用建议的转换器
17
- 建议使用 unescape/escape_non_str_dict_key 和 unescape/escape_tuple 这两对转换器,
17
+ 建议使用 unescape/escape_non_str_dict_key 和 unescape/escape_tuple_and_set 这两对转换器,
18
18
  可以避免因 json 的读取/写入而丢失部分信息。
19
19
  默认为 False。
20
20
  注意:当 converters 非 None,此参数失效,以 converters 中的具体设置为准
21
21
  """
22
22
  assert os.path.isfile(file_path), f'file {file_path} not found'
23
23
  if converters is None and b_use_suggested_converter:
24
- converters = [unescape_tuple, unescape_non_str_dict_key]
24
+ converters = [unescape_tuple_and_set, unescape_non_str_dict_key]
25
25
 
26
26
  with open(file_path, 'r') as f:
27
27
  content = json.load(f)
@@ -1,7 +1,7 @@
1
1
  import os
2
2
  import json
3
3
  import copy
4
- from kevin_toolbox.data_flow.file.json_.converter import integrate, escape_tuple, escape_non_str_dict_key
4
+ from kevin_toolbox.data_flow.file.json_.converter import integrate, escape_tuple_and_set, escape_non_str_dict_key
5
5
  from kevin_toolbox.nested_dict_list import traverse
6
6
 
7
7
 
@@ -18,7 +18,7 @@ def write_json(content, file_path, sort_keys=False, converters=None, b_use_sugge
18
18
  转换器 converter 应该是一个形如 def(x): ... ; return x 的函数,具体可以参考
19
19
  json_.converter 中已实现的转换器
20
20
  b_use_suggested_converter: <boolean> 是否使用建议的转换器
21
- 建议使用 unescape/escape_non_str_dict_key 和 unescape/escape_tuple 这两对转换器,
21
+ 建议使用 unescape/escape_non_str_dict_key 和 unescape/escape_tuple_and_set 这两对转换器,
22
22
  可以避免因 json 的读取/写入而丢失部分信息。
23
23
  默认为 False。
24
24
  注意:当 converters 非 None,此参数失效,以 converters 中的具体设置为准
@@ -26,7 +26,7 @@ def write_json(content, file_path, sort_keys=False, converters=None, b_use_sugge
26
26
  assert isinstance(file_path, (str, type(None)))
27
27
 
28
28
  if converters is None and b_use_suggested_converter:
29
- converters = [escape_tuple, escape_non_str_dict_key]
29
+ converters = [escape_tuple_and_set, escape_non_str_dict_key]
30
30
 
31
31
  if converters is not None:
32
32
  converter = integrate(converters)
@@ -15,6 +15,8 @@ class Kevin_Notation_Reader:
15
15
 
16
16
  必要参数:
17
17
  file_path: <string> 文件路径
18
+ file_obj: <file object> 文件对象
19
+ 以上参数2选一,具体参见 File_Iterative_Reader
18
20
  读取相关参数:
19
21
  chunk_size: <integer> 每次读取多少行数据
20
22
  beg: <integer> 开始读取的位置
@@ -28,6 +30,7 @@ class Kevin_Notation_Reader:
28
30
  paras = {
29
31
  # 必要参数
30
32
  "file_path": None,
33
+ "file_obj": None,
31
34
  # 读取相关参数
32
35
  "chunk_size": 100,
33
36
  "beg": 0,
@@ -38,9 +41,7 @@ class Kevin_Notation_Reader:
38
41
  paras.update(kwargs)
39
42
 
40
43
  # 校验参数
41
- assert isinstance(paras["file_path"], (str,)) and os.path.isfile(paras["file_path"]), \
42
- f'file not exists :{paras["file_path"]}'
43
- #
44
+ # file_path file_obj 交给 File_Iterative_Reader 校验
44
45
  assert isinstance(paras["chunk_size"], (int,)) and (paras["chunk_size"] > 0 or paras["chunk_size"] == -1)
45
46
  assert isinstance(paras["beg"], (int,)) and paras["beg"] >= 0
46
47
  assert isinstance(paras["converter"], (Converter, dict,))
@@ -48,7 +49,7 @@ class Kevin_Notation_Reader:
48
49
  self.paras = paras
49
50
 
50
51
  # 读取开头
51
- self.reader = File_Iterative_Reader(file_path=self.paras["file_path"],
52
+ self.reader = File_Iterative_Reader(file_path=self.paras["file_path"], file_obj=self.paras["file_obj"],
52
53
  pre_jump_size=self.paras["beg"],
53
54
  filter_=lambda x: x != "\n" and not x.startswith("//"), # 去除注释
54
55
  map_func=lambda x: x.rsplit("\n", 1)[0].split("//", 1)[0],
@@ -64,7 +65,7 @@ class Kevin_Notation_Reader:
64
65
  del self.reader
65
66
 
66
67
  # 读取内容
67
- self.reader = File_Iterative_Reader(file_path=self.paras["file_path"],
68
+ self.reader = File_Iterative_Reader(file_path=self.paras["file_path"], file_obj=self.paras["file_obj"],
68
69
  pre_jump_size=self.paras["beg"] + offset,
69
70
  filter_=lambda x: x != "\n" and not x.startswith("//"), # 去除注释
70
71
  map_func=lambda x: x.rsplit("\n", 1)[0].split("//", 1)[0],
@@ -1,11 +1,13 @@
1
1
  from kevin_toolbox.data_flow.file import kevin_notation
2
2
 
3
3
 
4
- def read(file_path):
4
+ def read(file_path=None, file_obj=None):
5
5
  """
6
6
  读取整个文件的快捷接口
7
7
  """
8
- with kevin_notation.Reader(file_path=file_path, chunk_size=-1) as reader:
8
+ assert file_path is not None or file_obj is not None
9
+
10
+ with kevin_notation.Reader(file_path=file_path, chunk_size=-1, file_obj=file_obj) as reader:
9
11
  # metadata
10
12
  metadata = reader.metadata
11
13
  # content