PyPI - kevin-toolbox-dev - Versions diffs - 1.3.1__py3-none-any.whl → 1.3.3__py3-none-any.whl - Mend

kevin-toolbox-dev 1.3.1__py3-none-any.whl → 1.3.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (56)

kevin_toolbox/data_flow/core/cache/cache_manager_for_iterator.py CHANGED Viewed

@@ -2,6 +2,7 @@ import os
 import time
 import importlib.util
 from kevin_toolbox.developing.decorator import restore_original_work_path
+from kevin_toolbox.computer_science.algorithm.cache_manager import Cache_Manager
 if importlib.util.find_spec("cPickle") is not None:
     # 如果安装有 cPickle，可以更快处理串行化
@@ -27,29 +28,10 @@ class Cache_Manager_for_Iterator:
                     file_dict:      二进制文件的文件名与序号的对应关系表
                                         例如：{ 0: "0.pkl", 1: "1.pkl", 2: "2.pkl", ...}
                                         其中 index 0 对应于文件 ./temp/cache_name/0.pkl
-            然后在进行读取时，将先到 cache 中寻找是否已经有需要的 chunk 分块，如果没有则到前面 file_dict 中读取，同时更新 cache
+            然后在进行读取时，将先到基于内存的缓存 memo_cache_manager 中寻找是否已经有需要的 chunk 分块，如果没有则到前面 file_dict 中读取，同时更新 memo_cache_manager
                 相关变量：
-                    cache_dict：     缓存
-                                        例如：{ 1: chunk_var_1, 3: chunk_var_3 }
+                    memo_cache_manager： 基于内存的缓存，由 Cache_Manager 构建，指定有更新策略等
                                         其中 key 是 chunk 分块的 index，value 是对应的保存在内存中的变量
-                    cache_metadata:  缓存的属性数据
-                                        包含以下字段，各字段将自动更新
-                                        例如：{ 1: {  "last_time": xxx,      # 最近读取时间
-                                                     "initial_time": xxx,   # 最初读取时间
-                                                     "counts": xxx,         # 读取次数
-                                                     },
-                                               ... }
-                    cache_update_strategy：缓存更新策略
-                                        是一个函数，该函数的输入是 cache_metadata ，输出是需要删除的缓存的序号
-                                        触发：
-                                            在每次出现 cache_dict 无法命中，导致有新的 cache 添加到 cache_dict 时，将会采用该策略进行更新
-                                        现有策略：
-                                            drop_min_counts:    去除读取次数最小的
-                                            drop_min_last_time:    去除最近没有读取的
-                                            drop_min_survival_time:    去除生存时间最短的，生成时间 survival_time:=last_time-initial_time
-                                        默认使用 drop_min_last_time
-                    （cache_dict 的大小受 cache_update_strategy 中的 cache_size 限制，当大小超过限制时，
-                    将根据 cache_update_strategy 去除优先级较低的部分来更新缓存）
         支持以下几种方式来：
             以迭代器的形式进行顺序读取
@@ -60,44 +42,43 @@ class Cache_Manager_for_Iterator:
         """
             设定关键参数
             参数：
-                iterator:       迭代器/生成器
-                folder_path:    保存二进制文件的路径
-                cache_update_strategy：缓存更新策略
+                iterator:               迭代器/生成器
+                folder_path:            <path> 构建基于磁盘的缓存时，保存二进制文件的路径
+                paras_for_memo_cache：   <dict> 构建基于内存的缓存的参数
             其他参数：
-                strict_mode:    禁止同时设置 iterator 和给定一个非空的 folder_path
-                                    默认为 True 开启，此时同时设置将报错。
-                                    当设置为 False 时，同时设置将以 folder_path 中的二进制文件为准
-                del_cache_when_exit:    退出时删除生成的缓存二进制文件
-                                    只有在设置了 iterator 的前提下，才会触发。
-                                    （对于非本实例生成的文件，比如只给定了非空的 folder_path，不做删除。）
-                                    默认为 True 开启。
+                b_strict_mode:          <boolean> 禁止同时设置 iterator 和给定一个非空的 folder_path
+                                            默认为 True 开启，此时同时设置将报错。
+                                            当设置为 False 时，同时设置将以 folder_path 中的二进制文件为准
+                b_del_cache_when_exit:  <boolean> 退出时删除生成的缓存二进制文件
+                                            只有在设置了 iterator 的前提下，才会触发。
+                                            （对于非本实例生成的文件，比如只给定了非空的 folder_path，不做删除。）
+                                            默认为 True 开启。
         """
         # 默认参数
         paras = {
             "iterator": None,
             "folder_path": None,
-            "cache_update_strategy": None,
-            #
-            "strict_mode": True,
-            "del_cache_when_exit": True,
+            "paras_for_memo_cache": dict(upper_bound=20, refactor_size=0.7, strategy=":by_last_time:LRU",
+                                         cache=":in_memory:Memo"),
+            "b_strict_mode": True,
+            "b_del_cache_when_exit": True,
         }
         # 获取参数
         paras.update(kwargs)
         # 校验参数
-        # cache_update_strategy
-        if paras["cache_update_strategy"] is None:
-            paras["cache_update_strategy"] = lambda x: Strategies.drop_min_last_time(cache_metadata=x,
-                                                                                     cache_size_upper_bound=10)
+        # paras_for_memo_cache
+        assert isinstance(paras["paras_for_memo_cache"], (dict,))
+        paras["paras_for_memo_cache"]["strategy"] = ":by_last_time:LRU"
         # 同时非空
         b_folder_not_empty = isinstance(paras["folder_path"], (str,)) and paras[
             "folder_path"] is not None and os.path.exists(paras["folder_path"]) and len(
             os.listdir(paras["folder_path"])) > 0
         if paras["iterator"] is not None and b_folder_not_empty:
             # iterator 非空，folder_path 非空
-            if paras["strict_mode"]:
+            if paras["b_strict_mode"]:
                 # 不能同时设置
                 raise Exception(f"Error: folder_path and iterator cannot be set at the same time\n"
                                 f"iterator {paras['iterator']} is given when "
@@ -123,13 +104,12 @@ class Cache_Manager_for_Iterator:
             # 尝试直接根据已有文件构建 file_dict
             file_dict = self.find_chuck_files(paras["folder_path"])
+        # 构建基于内存的缓存
+        self.memo_cache_manager = Cache_Manager(**paras["paras_for_memo_cache"])
         self.file_dict = file_dict
         self.paras = paras
-        # 初始化基于内存的缓存
-        self.cache_dict = dict()
-        self.cache_metadata = dict()
         # 记录最后读取的index
         self.index = -1
@@ -181,49 +161,19 @@ class Cache_Manager_for_Iterator:
             chunk = pickle.load(f)
         return chunk
-    # ------------------------------------ 基于内存的缓存 ------------------------------------ #
-    def __read_from_cache(self, index):
-        """
-            从内存中读取
-        """
-        chunk = self.cache_dict[index]
-        # 更新缓存属性
-        self.cache_metadata[index]["counts"] += 1
-        self.cache_metadata[index]["last_time"] = time.time()
-        return chunk
-    def __add_to_cache(self, index, chunk):
-        """
-            添加到内存中
-        """
-        # 更新缓存
-        self.cache_dict[index] = chunk
-        # 更新缓存属性
-        self.cache_metadata[index] = {
-            "last_time": time.time(),  # 最近读取时间
-            "initial_time": time.time(),  # 最初读取时间
-            "counts": 1,  # 读取次数
-        }
-        # 依据策略去除优先级较低的缓存
-        drop_ls = self.paras["cache_update_strategy"](self.cache_metadata)
-        for i in drop_ls:
-            self.cache_dict.pop(i)
-            self.cache_metadata.pop(i)
     # ------------------------------------ 读取 ------------------------------------ #
     def read(self, index):
         assert 0 <= index < len(self), \
             KeyError(f"Error: index {index} not in [0, {len(self)})")
-        if index in self.cache_dict:
+        if self.memo_cache_manager.has(key=index):
             # 直接从内存中读取
-            chunk = self.__read_from_cache(index)
+            chunk = self.memo_cache_manager.get(key=index)
         else:
             # 到磁盘读取
             chunk = self.__read_from_files(index)
             # 添加到缓存
-            self.__add_to_cache(index, chunk)
+            self.memo_cache_manager.add(key=index, value=chunk)
         self.index = index
         return chunk
@@ -246,8 +196,8 @@ class Cache_Manager_for_Iterator:
         return len(self.file_dict)
     def __del__(self):
-        if self.paras["iterator"] is not None and self.paras["del_cache_when_exit"] and self.paras["strict_mode"]:
-            # 在 strict_mode 开启，且 iterator 非空的情况下 self.file_dict 中的二进制文件一定是根据 iterator 生成的
+        if self.paras["iterator"] is not None and self.paras["b_del_cache_when_exit"] and self.paras["b_strict_mode"]:
+            # 在 b_strict_mode 开启，且 iterator 非空的情况下 self.file_dict 中的二进制文件一定是根据 iterator 生成的
             # 删除文件
             pwd_bak = os.getcwd()
             os.chdir(self.paras["folder_path"])
@@ -259,97 +209,15 @@ class Cache_Manager_for_Iterator:
                 os.removedirs(self.paras["folder_path"])
-class Strategies:
-    """
-        现有策略：
-                drop_min_counts:    去除读取次数最小的
-                drop_min_last_time:    去除最近没有读取的
-                drop_min_survival_time:    去除生存时间最短的，生成时间 survival_time:=last_time-initial_time
-    """
-    @staticmethod
-    def drop_min_counts(cache_metadata, cache_size_upper_bound, cache_size_after_drop=None):
-        """
-            去除读取次数最小的
-            参数：
-                cache_metadata:  缓存的属性数据
-                                    包含以下字段，各字段将自动更新
-                                    例如：{ 1: {  "last_time": xxx,      # 最近读取时间
-                                                 "initial_time": xxx,   # 最初读取时间
-                                                 "counts": xxx,         # 读取次数
-                                                 },
-                                           ... }
-                cache_size_upper_bound： 当 cache_metadata 的大小超过该值时触发更新
-                cache_size_after_drop：  更新后 cache_metadata 的目标大小
-                                    默认为 cache_size_upper_bound
-        """
-        if cache_size_upper_bound >= len(cache_metadata):
-            return []
-        cache_size_after_drop = cache_size_upper_bound if cache_size_after_drop is None else cache_size_after_drop
-        # （这里其实可以用最大堆来优化，但是我懒啊）
-        drop_ls = [i for i, j in sorted(cache_metadata.items(), key=lambda x: x[1]["counts"])[:-cache_size_after_drop]]
-        return drop_ls
-    @staticmethod
-    def drop_min_last_time(cache_metadata, cache_size_upper_bound, cache_size_after_drop=None):
-        """
-            去除最近没有读取的
-        """
-        if cache_size_upper_bound >= len(cache_metadata):
-            return []
-        cache_size_after_drop = cache_size_upper_bound if cache_size_after_drop is None else cache_size_after_drop
-        drop_ls = [i for i, j in
-                   sorted(cache_metadata.items(), key=lambda x: x[1]["last_time"])[:-cache_size_after_drop]]
-        return drop_ls
-    @staticmethod
-    def drop_min_survival_time(cache_metadata, cache_size_upper_bound, cache_size_after_drop=None):
-        """
-            去除生存时间最短的，生成时间 survival_time:=last_time-initial_time
-        """
-        if cache_size_upper_bound >= len(cache_metadata):
-            return []
-        cache_size_after_drop = cache_size_upper_bound if cache_size_after_drop is None else cache_size_after_drop
-        drop_ls = [i for i, j in sorted(cache_metadata.items(), key=lambda x: x[1]["last_time"] - x[1]["initial_time"])[
-                                 :-cache_size_after_drop]]
-        return drop_ls
 if __name__ == '__main__':
-    "测试 Strategies"
-    _cache_metadata = {
-        1: {
-            "last_time": 123,
-            "initial_time": 56,
-            "counts": 2,
-        },
-        2: {
-            "last_time": 110,
-            "initial_time": 0,
-            "counts": 4,
-        },
-        3: {
-            "last_time": 126,
-            "initial_time": 121,
-            "counts": 1,
-        },
-    }
-    print(Strategies.drop_min_counts(_cache_metadata, 2))
-    print(Strategies.drop_min_last_time(_cache_metadata, 1))
-    print(Strategies.drop_min_survival_time(_cache_metadata, 1))
     "测试 Cache_Manager_for_Iterator"
-    cache_manager = Cache_Manager_for_Iterator(iterator=range(10),
-                                               del_cache_when_exit=True,
-                                               cache_update_strategy=lambda x: Strategies.drop_min_last_time(
-                                                   cache_metadata=x,
-                                                   cache_size_upper_bound=3))
+    cache_manager = Cache_Manager_for_Iterator(
+        iterator=range(10),
+        b_del_cache_when_exit=True,
+        paras_for_memo_cache=dict(upper_bound=3, refactor_size=3, strategy=":by_last_time:LRU")
+    )
     print(cache_manager.file_dict)
-    print(cache_manager.cache_metadata)
+    print(cache_manager.memo_cache_manager.metadata_s)
     for i in range(3):
         print(cache_manager.read(0))
     for i in range(2):
@@ -358,4 +226,4 @@ if __name__ == '__main__':
         print(cache_manager.read(6))
     for i in range(4):
         print(cache_manager.read(9))
-    print(cache_manager.cache_metadata)
+    print(cache_manager.memo_cache_manager.metadata_s)

kevin_toolbox/data_flow/core/cache/test/__init__.py ADDED Viewed

File without changes

kevin_toolbox/data_flow/core/cache/test/test_cache_manager_for_iterator.py ADDED Viewed

@@ -0,0 +1,34 @@
+import pytest
+import time
+from kevin_toolbox.patches.for_test import check_consistency
+from kevin_toolbox.data_flow.core.cache import Cache_Manager_for_Iterator
+def test_cache_manager_for_iterator():
+    print("test Cache_Manager_for_Iterator")
+    cache_manager = Cache_Manager_for_Iterator(
+        iterator=range(10),
+        b_del_cache_when_exit=True,
+        paras_for_memo_cache=dict(upper_bound=3, refactor_size=3, strategy=":by_last_time:LRU")
+    )
+    check_consistency(
+        cache_manager.file_dict,
+        {i: f'{i}.pkl' for i in range(10)}
+    )
+    check_consistency(
+        cache_manager.memo_cache_manager.metadata_s,
+        dict()
+    )
+    key_to_counts = {0: 3, 3: 2, 6: 1, 9: 4}
+    for key, counts in key_to_counts.items():
+        for _ in range(counts):
+            time.sleep(0.05)
+            check_consistency(cache_manager.read(key), key)
+    # counts
+    for key, counts in key_to_counts.items():
+        if key == 0:
+            assert key not in cache_manager.memo_cache_manager.metadata_s
+        else:
+            check_consistency(cache_manager.memo_cache_manager.metadata_s[key]["counts"], counts - 1)

kevin_toolbox/data_flow/core/reader/file_iterative_reader.py CHANGED Viewed

@@ -1,4 +1,5 @@
 import os
+import copy
 class File_Iterative_Reader:
@@ -12,11 +13,14 @@ class File_Iterative_Reader:
             设定关键参数
             必要参数：
                 file_path:  文件路径
+                file_obj:   文件对象
+                    注意！！以上两个参数指定其一即可，同时指定时候，以后者为准。
             读取模式相关参数：
                 paras_for_open:     open() 函数的补充参数
                 mode:       读取模式，默认为 "lines"
                                 "lines"：  按行数计算批次大小
                                 "bytes"：  按字节数计算
+                    注意！！以上两个参数在指定了 file_obj 参数后将失效。
                 chunk_size: 批次大小
                                 默认为 1k
                                 当为-1时，读取整个文件
@@ -51,6 +55,7 @@ class File_Iterative_Reader:
         paras = {
             # 必要参数
             "file_path": None,
+            "file_obj": None,
             # 读取模式相关参数
             "paras_for_open": dict(mode="r", encoding='utf-8'),
             "mode": "lines",
@@ -74,16 +79,21 @@ class File_Iterative_Reader:
         assert mode in ["lines", "bytes"]
         paras["chunk_size"] = int(paras["chunk_size"])
         paras["loop_num"] = int(paras["loop_num"]) - 1
-        #
-        file_path = paras["file_path"]
-        assert isinstance(file_path, (str,)) and os.path.exists(file_path), \
-            Exception(f"Error: file {file_path} not exists!")
-        #
-        paras_for_open = paras["paras_for_open"]
-        assert isinstance(paras_for_open, (dict,))
         # 获取文件对象
-        self.file = open(file_path, **paras_for_open)
+        if paras["file_obj"] is None:
+            assert isinstance(paras["file_path"], (str,)) and os.path.isfile(paras["file_path"]), \
+                Exception(f'Error: file {paras["file_path"]} not exists!')
+            #
+            assert isinstance(paras["paras_for_open"], (dict,))
+            self.file = open(paras["file_path"], **paras["paras_for_open"])
+        else:
+            # 拷贝对象，防止修改外部对象
+            try:
+                self.file = copy.deepcopy(paras["file_obj"])
+            except:
+                self.file = open(paras["file_obj"].name, mode=paras["file_obj"].mode)
         # 选择相应模式
         self.__read_func = {"lines": self.__read_lines, "bytes": self.__read_bytes}[mode]
         self.__jump_func = {"lines": self.__jump_lines, "bytes": self.__jump_bytes}[mode]
@@ -225,9 +235,34 @@ class File_Iterative_Reader:
 if __name__ == "__main__":
     import numpy as np
-    reader = File_Iterative_Reader(file_path="developing/test_data.txt", chunk_size=2, drop=True, loop_num=2,
+    print("使用 file_path")
+    reader = File_Iterative_Reader(file_path="test/test_data/test_data.txt", chunk_size=2, drop=True, loop_num=2,
                                    pre_jump_size=3, convert_func=lambda x: np.array(x))
     for i in reader:
         print(i)
     del reader
+    print("使用 file_obj")
+    reader = File_Iterative_Reader(
+        file_obj=open("test/test_data/test_data.txt", "r"), chunk_size=2, drop=True, loop_num=2,
+        pre_jump_size=3, convert_func=lambda x: np.array(x))
+    for i in reader:
+        print(i)
+    del reader
+    print("从字符串构建文件对象作为 file_obj")
+    from io import StringIO
+    file_obj = StringIO(initial_value=open("test/test_data/test_data.txt", "r").read())
+    reader = File_Iterative_Reader(
+        file_obj=file_obj, chunk_size=2, drop=True, loop_num=2,
+        pre_jump_size=3, convert_func=lambda x: np.array(x))
+    for i in reader:
+        print(i)
+    print("证明不会修改外部对象")
+    print(file_obj.read())
+    del reader

kevin_toolbox/data_flow/core/reader/unified_reader.py CHANGED Viewed

@@ -49,7 +49,7 @@ if __name__ == '__main__':
     print(reader.read([3, 3]).shape)
     print(reader.find(1))
-    reader = UReader(file_path="test_data.txt", chunk_size=2, folder_path="./temp/233")
+    reader = UReader(file_path="test/test_data/test_data.txt", chunk_size=2, folder_path="./temp/233")
     print(reader.read(2, 7))
     # del reader
@@ -67,7 +67,7 @@ if __name__ == '__main__':
     print(reader.find('data/6/horse_race_pan/2132020102319002000161_43_4.bmp'))
-    reader = Reader_for_files(file_path="test_data.txt", chunk_size=2, pre_jump_size=2, jump_size=2)
+    reader = Reader_for_files(file_path="test/test_data/test_data.txt", chunk_size=2, pre_jump_size=2, jump_size=2)
     for i in reader:
         print(2333, i)

kevin_toolbox/data_flow/core/reader/unified_reader_base.py CHANGED Viewed

@@ -2,7 +2,7 @@ from abc import ABC, abstractmethod
 import os
 import numpy as np
 from kevin_toolbox.data_flow.core.reader import File_Iterative_Reader
-from kevin_toolbox.data_flow.core.cache import Cache_Manager_for_Iterator, Strategies
+from kevin_toolbox.data_flow.core.cache import Cache_Manager_for_Iterator
 class Unified_Reader_Base(ABC):
@@ -126,10 +126,9 @@ class Unified_Reader_Base(ABC):
         """
             默认使用 Cache_Manager_for_Iterator
         """
-        manager = Cache_Manager_for_Iterator(iterator=iterator, folder_path=folder_path,
-                                             cache_update_strategy=lambda x: Strategies.drop_min_last_time(
-                                                 cache_metadata=x,
-                                                 cache_size_upper_bound=3))
+        manager = Cache_Manager_for_Iterator(
+            iterator=iterator, folder_path=folder_path,
+            paras_for_memo_cache=dict(upper_bound=20, refactor_size=0.7, strategy=":by_last_time:LRU"))
         return manager
     def read(self, *args, **kwargs):

kevin_toolbox/data_flow/file/json_/converter/__init__.py CHANGED Viewed

@@ -2,6 +2,6 @@ from .convert_dict_key_to_number import convert_dict_key_to_number
 from .convert_ndarray_to_list import convert_ndarray_to_list
 from .escape_non_str_dict_key import escape_non_str_dict_key
 from .unescape_non_str_dict_key import unescape_non_str_dict_key
-from .escape_tuple import escape_tuple
-from .unescape_tuple import unescape_tuple
+from .escape_tuple_and_set import escape_tuple_and_set
+from .unescape_tuple_and_set import unescape_tuple_and_set
 from .integrate import integrate

kevin_toolbox/data_flow/file/json_/converter/escape_tuple_and_set.py ADDED Viewed

@@ -0,0 +1,23 @@
+def escape_tuple_and_set(x):
+    """
+        将 tuple 和 set 进行转义
+            转义：     x ==> f"<eval>{x}"
+            反转义：   f"<eval>{x}" ==> x
+        为什么要进行转义？
+            由于 json 中会将 tuple 作为 list 进行保存，同时也无法保存 set，因此在保存过程中会丢失相应信息。
+    """
+    if isinstance(x, (tuple, set,)) or (isinstance(x, (str,)) and x.startswith("<eval>")):
+        return f'<eval>{x}'
+    else:
+        return x
+if __name__ == '__main__':
+    print(escape_tuple_and_set((1, 2, "\'1\'")))
+    # <eval>(1, 2, "'1'")
+    print(escape_tuple_and_set({"1", (1, 2, 3), 233}))
+    # <eval>{'1', 233, (1, 2, 3)}
+    print(escape_tuple_and_set("<eval>233"))
+    # <eval><eval>233

kevin_toolbox/data_flow/file/json_/converter/{unescape_tuple.py → unescape_tuple_and_set.py} RENAMED Viewed

@@ -1,11 +1,11 @@
-def unescape_tuple(x):
+def unescape_tuple_and_set(x):
     """
-        将 tuple 进行反转义
+        将 tuple 和 set 进行反转义
             转义：     x ==> f"<eval>{x}"
             反转义：   f"<eval>{x}" ==> x
         为什么要进行转义？
-            由于 json 中会将 tuple 作为 list 进行保存，因此在保存过程中会丢失相应信息。
+            由于 json 中会将 tuple 作为 list 进行保存，同时也无法保存 set，因此在保存过程中会丢失相应信息。
     """
     if isinstance(x, str) and x.startswith("<eval>"):
         x = x[6:]
@@ -17,7 +17,9 @@ def unescape_tuple(x):
 if __name__ == '__main__':
-    print(unescape_tuple("<eval>(1, 2, \"'1'\")"))
+    print(unescape_tuple_and_set("<eval>(1, 2, \"'1'\")"))
     # (1, 2, "\'1\'")
-    print(unescape_tuple("<eval><eval>233"))
+    print(unescape_tuple_and_set("<eval>{'1', 233, (1, 2, 3)}"))
+    # {'1', 233, (1, 2, 3)}
+    print(unescape_tuple_and_set("<eval><eval>233"))
     # "<eval>233"

kevin_toolbox/data_flow/file/json_/read_json.py CHANGED Viewed

@@ -1,6 +1,6 @@
 import os
 import json
-from kevin_toolbox.data_flow.file.json_.converter import integrate, unescape_tuple, unescape_non_str_dict_key
+from kevin_toolbox.data_flow.file.json_.converter import integrate, unescape_tuple_and_set, unescape_non_str_dict_key
 from kevin_toolbox.nested_dict_list import traverse
@@ -14,14 +14,14 @@ def read_json(file_path, converters=None, b_use_suggested_converter=False):
                                             转换器 converter 应该是一个形如 def(x): ... ; return x 的函数，具体可以参考
                                             json_.converter 中已实现的转换器
             b_use_suggested_converter:  <boolean> 是否使用建议的转换器
-                                            建议使用 unescape/escape_non_str_dict_key 和 unescape/escape_tuple 这两对转换器，
+                                            建议使用 unescape/escape_non_str_dict_key 和 unescape/escape_tuple_and_set 这两对转换器，
                                             可以避免因 json 的读取/写入而丢失部分信息。
                                             默认为 False。
                     注意：当 converters 非 None，此参数失效，以 converters 中的具体设置为准
     """
     assert os.path.isfile(file_path), f'file {file_path} not found'
     if converters is None and b_use_suggested_converter:
-        converters = [unescape_tuple, unescape_non_str_dict_key]
+        converters = [unescape_tuple_and_set, unescape_non_str_dict_key]
     with open(file_path, 'r') as f:
         content = json.load(f)

kevin_toolbox/data_flow/file/json_/write_json.py CHANGED Viewed

@@ -1,7 +1,7 @@
 import os
 import json
 import copy
-from kevin_toolbox.data_flow.file.json_.converter import integrate, escape_tuple, escape_non_str_dict_key
+from kevin_toolbox.data_flow.file.json_.converter import integrate, escape_tuple_and_set, escape_non_str_dict_key
 from kevin_toolbox.nested_dict_list import traverse
@@ -18,7 +18,7 @@ def write_json(content, file_path, sort_keys=False, converters=None, b_use_sugge
                                             转换器 converter 应该是一个形如 def(x): ... ; return x 的函数，具体可以参考
                                             json_.converter 中已实现的转换器
             b_use_suggested_converter:  <boolean> 是否使用建议的转换器
-                                            建议使用 unescape/escape_non_str_dict_key 和 unescape/escape_tuple 这两对转换器，
+                                            建议使用 unescape/escape_non_str_dict_key 和 unescape/escape_tuple_and_set 这两对转换器，
                                             可以避免因 json 的读取/写入而丢失部分信息。
                                             默认为 False。
                     注意：当 converters 非 None，此参数失效，以 converters 中的具体设置为准
@@ -26,7 +26,7 @@ def write_json(content, file_path, sort_keys=False, converters=None, b_use_sugge
     assert isinstance(file_path, (str, type(None)))
     if converters is None and b_use_suggested_converter:
-        converters = [escape_tuple, escape_non_str_dict_key]
+        converters = [escape_tuple_and_set, escape_non_str_dict_key]
     if converters is not None:
         converter = integrate(converters)

kevin_toolbox/data_flow/file/kevin_notation/kevin_notation_reader.py CHANGED Viewed

@@ -15,6 +15,8 @@ class Kevin_Notation_Reader:
             必要参数：
                 file_path:          <string> 文件路径
+                file_obj:           <file object> 文件对象
+                    以上参数2选一，具体参见 File_Iterative_Reader
             读取相关参数：
                 chunk_size:         <integer> 每次读取多少行数据
                 beg：                <integer> 开始读取的位置
@@ -28,6 +30,7 @@ class Kevin_Notation_Reader:
         paras = {
             # 必要参数
             "file_path": None,
+            "file_obj": None,
             # 读取相关参数
             "chunk_size": 100,
             "beg": 0,
@@ -38,9 +41,7 @@ class Kevin_Notation_Reader:
         paras.update(kwargs)
         # 校验参数
-        assert isinstance(paras["file_path"], (str,)) and os.path.isfile(paras["file_path"]), \
-            f'file not exists :{paras["file_path"]}'
-        #
+        #   file_path 和 file_obj 交给 File_Iterative_Reader 校验
         assert isinstance(paras["chunk_size"], (int,)) and (paras["chunk_size"] > 0 or paras["chunk_size"] == -1)
         assert isinstance(paras["beg"], (int,)) and paras["beg"] >= 0
         assert isinstance(paras["converter"], (Converter, dict,))
@@ -48,7 +49,7 @@ class Kevin_Notation_Reader:
         self.paras = paras
         # 读取开头
-        self.reader = File_Iterative_Reader(file_path=self.paras["file_path"],
+        self.reader = File_Iterative_Reader(file_path=self.paras["file_path"], file_obj=self.paras["file_obj"],
                                             pre_jump_size=self.paras["beg"],
                                             filter_=lambda x: x != "\n" and not x.startswith("//"),  # 去除注释
                                             map_func=lambda x: x.rsplit("\n", 1)[0].split("//", 1)[0],
@@ -64,7 +65,7 @@ class Kevin_Notation_Reader:
         del self.reader
         # 读取内容
-        self.reader = File_Iterative_Reader(file_path=self.paras["file_path"],
+        self.reader = File_Iterative_Reader(file_path=self.paras["file_path"], file_obj=self.paras["file_obj"],
                                             pre_jump_size=self.paras["beg"] + offset,
                                             filter_=lambda x: x != "\n" and not x.startswith("//"),  # 去除注释
                                             map_func=lambda x: x.rsplit("\n", 1)[0].split("//", 1)[0],

kevin_toolbox/data_flow/file/kevin_notation/read.py CHANGED Viewed

@@ -1,11 +1,13 @@
 from kevin_toolbox.data_flow.file import kevin_notation
-def read(file_path):
+def read(file_path=None, file_obj=None):
     """
         读取整个文件的快捷接口
     """
-    with kevin_notation.Reader(file_path=file_path, chunk_size=-1) as reader:
+    assert file_path is not None or file_obj is not None
+    with kevin_notation.Reader(file_path=file_path, chunk_size=-1, file_obj=file_obj) as reader:
         # metadata
         metadata = reader.metadata
         # content

kevin-toolbox-dev 1.3.1__py3-none-any.whl → 1.3.3__py3-none-any.whl

kevin-toolbox-dev 1.3.1__py3-none-any.whl → 1.3.3__py3-none-any.whl