nlpertools 1.0.4__py3-none-any.whl → 1.0.6.dev0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nlpertools/__init__.py +24 -11
- nlpertools/algo/__init__.py +0 -0
- nlpertools/algo/ac.py +18 -0
- nlpertools/algo/bit_ops.py +28 -0
- nlpertools/algo/kmp.py +94 -0
- nlpertools/algo/num_ops.py +12 -0
- nlpertools/algo/template.py +116 -0
- nlpertools/algo/union.py +13 -0
- nlpertools/data_client.py +387 -0
- nlpertools/data_structure/__init__.py +0 -0
- nlpertools/data_structure/base_structure.py +109 -0
- nlpertools/dataprocess.py +611 -3
- nlpertools/default_db_config.yml +41 -0
- nlpertools/io/__init__.py +3 -3
- nlpertools/io/dir.py +54 -47
- nlpertools/io/file.py +277 -205
- nlpertools/ml.py +483 -317
- nlpertools/monitor/__init__.py +0 -0
- nlpertools/monitor/gpu.py +18 -0
- nlpertools/monitor/memory.py +24 -0
- nlpertools/movie.py +36 -0
- nlpertools/nlpertools_config.yml +1 -0
- nlpertools/{openApi.py → open_api.py} +65 -62
- nlpertools/other.py +364 -188
- nlpertools/pic.py +288 -0
- nlpertools/plugin.py +43 -34
- nlpertools/reminder.py +98 -15
- nlpertools/template/__init__.py +0 -0
- nlpertools/utils/__init__.py +3 -0
- nlpertools/utils/lazy.py +727 -0
- nlpertools/utils/log_util.py +20 -0
- nlpertools/utils/package.py +89 -0
- nlpertools/utils/package_v1.py +94 -0
- nlpertools/utils/package_v2.py +117 -0
- nlpertools/utils_for_nlpertools.py +93 -0
- nlpertools/vector_index_demo.py +108 -0
- nlpertools/wrapper.py +161 -0
- {nlpertools-1.0.4.dist-info → nlpertools-1.0.6.dev0.dist-info}/LICENSE +200 -200
- nlpertools-1.0.6.dev0.dist-info/METADATA +111 -0
- nlpertools-1.0.6.dev0.dist-info/RECORD +43 -0
- {nlpertools-1.0.4.dist-info → nlpertools-1.0.6.dev0.dist-info}/WHEEL +1 -1
- nlpertools-1.0.6.dev0.dist-info/top_level.txt +2 -0
- nlpertools_helper/__init__.py +10 -0
- nlpertools-1.0.4.dist-info/METADATA +0 -42
- nlpertools-1.0.4.dist-info/RECORD +0 -15
- nlpertools-1.0.4.dist-info/top_level.txt +0 -1
    
        nlpertools/io/dir.py
    CHANGED
    
    | @@ -1,47 +1,54 @@ | |
| 1 | 
            -
            #!/usr/bin/python3.8
         | 
| 2 | 
            -
            # -*- coding: utf-8 -*-
         | 
| 3 | 
            -
            # @Author  : youshu.Ji
         | 
| 4 | 
            -
            import  | 
| 5 | 
            -
            import  | 
| 6 | 
            -
             | 
| 7 | 
            -
             | 
| 8 | 
            -
             | 
| 9 | 
            -
             | 
| 10 | 
            -
             | 
| 11 | 
            -
             | 
| 12 | 
            -
             | 
| 13 | 
            -
             | 
| 14 | 
            -
             | 
| 15 | 
            -
                 | 
| 16 | 
            -
                 | 
| 17 | 
            -
                : | 
| 18 | 
            -
                 | 
| 19 | 
            -
                '' | 
| 20 | 
            -
                 | 
| 21 | 
            -
                filename | 
| 22 | 
            -
             | 
| 23 | 
            -
             | 
| 24 | 
            -
             | 
| 25 | 
            -
            # | 
| 26 | 
            -
             | 
| 27 | 
            -
                 | 
| 28 | 
            -
             | 
| 29 | 
            -
             | 
| 30 | 
            -
             | 
| 31 | 
            -
             | 
| 32 | 
            -
             | 
| 33 | 
            -
             | 
| 34 | 
            -
             | 
| 35 | 
            -
             | 
| 36 | 
            -
                filenames = os.listdir( | 
| 37 | 
            -
                 | 
| 38 | 
            -
             | 
| 39 | 
            -
             | 
| 40 | 
            -
             | 
| 41 | 
            -
             | 
| 42 | 
            -
             | 
| 43 | 
            -
             | 
| 44 | 
            -
             | 
| 45 | 
            -
                 | 
| 46 | 
            -
                 | 
| 47 | 
            -
             | 
| 1 | 
            +
            #!/usr/bin/python3.8
         | 
| 2 | 
            +
            # -*- coding: utf-8 -*-
         | 
| 3 | 
            +
            # @Author  : youshu.Ji
         | 
| 4 | 
            +
            import os
         | 
| 5 | 
            +
            from pathlib import Path
         | 
| 6 | 
            +
             | 
| 7 | 
            +
             | 
| 8 | 
            +
            # dir ----------------------------------------------------------------------
         | 
def j_mkdir(name):
    """Create directory ``name`` including missing parents; an existing directory is not an error."""
    os.makedirs(name, exist_ok=True)
| 11 | 
            +
             | 
| 12 | 
            +
             | 
def get_filename(path) -> str:
    """
    Return the last component (the file name) of a path.
    :param path: a path such as r'***/**/***.txt'
    :return: everything after the final path separator; empty when the path ends with a separator
    """
    _, tail = os.path.split(path)
    return tail
| 22 | 
            +
             | 
| 23 | 
            +
             | 
def j_listdir(dir_name, including_dir=True):
    """
    Generator over the entries of ``dir_name`` as returned by os.listdir.
    :param dir_name: directory to list
    :param including_dir: when true each entry is joined onto ``dir_name``; otherwise the bare entry name is yielded
    """
    for entry in os.listdir(dir_name):
        yield os.path.join(dir_name, entry) if including_dir else entry
| 32 | 
            +
             | 
| 33 | 
            +
             | 
# Merge files. TODO: not implemented yet.
def imgrate_files(path):
    """
    Placeholder for a future file-merging helper.

    For now it only lists ``path`` (so an invalid directory still raises) and returns None.
    """
    os.listdir(path)  # result is not used yet; the call is kept so bad paths fail as before
    return None
| 38 | 
            +
             | 
| 39 | 
            +
             | 
def case_sensitive_path_exists(path: str, relative_path=False):
    """
    https://juejin.cn/post/7316725867086692391
    Check if the path exists in a case-sensitive manner.

    :param path: path to test
    :param relative_path: when true, ``path`` is joined onto the current working directory first
    :return: True only if the path exists and its resolved form is string-equal to the path itself
    """
    # Build a Path object from the input.
    candidate = Path.cwd() / path if relative_path else Path(path)
    if candidate.exists():
        # Per the original note, resolve() is expected to give the name as actually stored on the system.
        return str(candidate.resolve()) == str(candidate)
    return False
    
        nlpertools/io/file.py
    CHANGED
    
    | @@ -1,205 +1,277 @@ | |
| 1 | 
            -
            #!/usr/bin/python3.8
         | 
| 2 | 
            -
            # -*- coding: utf-8 -*-
         | 
| 3 | 
            -
            # @Author  : youshu.Ji
         | 
| 4 | 
            -
            import codecs
         | 
| 5 | 
            -
            import  | 
| 6 | 
            -
            import  | 
| 7 | 
            -
            import  | 
| 8 | 
            -
            import time
         | 
| 9 | 
            -
             | 
| 10 | 
            -
             | 
| 11 | 
            -
             | 
| 12 | 
            -
             | 
| 13 | 
            -
             | 
| 14 | 
            -
             | 
| 15 | 
            -
             | 
| 16 | 
            -
             | 
| 17 | 
            -
             | 
| 18 | 
            -
             | 
| 19 | 
            -
             | 
| 20 | 
            -
             | 
| 21 | 
            -
             | 
| 22 | 
            -
             | 
| 23 | 
            -
             | 
| 24 | 
            -
             | 
| 25 | 
            -
             | 
| 26 | 
            -
             | 
| 27 | 
            -
             | 
| 28 | 
            -
                     | 
| 29 | 
            -
             | 
| 30 | 
            -
             | 
| 31 | 
            -
             | 
| 32 | 
            -
             | 
| 33 | 
            -
             | 
| 34 | 
            -
             | 
| 35 | 
            -
             | 
| 36 | 
            -
             | 
| 37 | 
            -
             | 
| 38 | 
            -
             | 
| 39 | 
            -
                 | 
| 40 | 
            -
             | 
| 41 | 
            -
             | 
| 42 | 
            -
             | 
| 43 | 
            -
             | 
| 44 | 
            -
             | 
| 45 | 
            -
             | 
| 46 | 
            -
             | 
| 47 | 
            -
             | 
| 48 | 
            -
             | 
| 49 | 
            -
             | 
| 50 | 
            -
             | 
| 51 | 
            -
             | 
| 52 | 
            -
             | 
| 53 | 
            -
             | 
| 54 | 
            -
             | 
| 55 | 
            -
             | 
| 56 | 
            -
             | 
| 57 | 
            -
             | 
| 58 | 
            -
             | 
| 59 | 
            -
             | 
| 60 | 
            -
             | 
| 61 | 
            -
             | 
| 62 | 
            -
             | 
| 63 | 
            -
             | 
| 64 | 
            -
                     | 
| 65 | 
            -
             | 
| 66 | 
            -
             | 
| 67 | 
            -
             | 
| 68 | 
            -
             | 
| 69 | 
            -
             | 
| 70 | 
            -
             | 
| 71 | 
            -
             | 
| 72 | 
            -
             | 
| 73 | 
            -
             | 
| 74 | 
            -
             | 
| 75 | 
            -
             | 
| 76 | 
            -
             | 
| 77 | 
            -
                 | 
| 78 | 
            -
             | 
| 79 | 
            -
             | 
| 80 | 
            -
             | 
| 81 | 
            -
             | 
| 82 | 
            -
             | 
| 83 | 
            -
                 | 
| 84 | 
            -
             | 
| 85 | 
            -
             | 
| 86 | 
            -
             | 
| 87 | 
            -
             | 
| 88 | 
            -
             | 
| 89 | 
            -
             | 
| 90 | 
            -
             | 
| 91 | 
            -
             | 
| 92 | 
            -
            def  | 
| 93 | 
            -
                with codecs.open(path, ' | 
| 94 | 
            -
                     | 
| 95 | 
            -
             | 
| 96 | 
            -
             | 
| 97 | 
            -
             | 
| 98 | 
            -
             | 
| 99 | 
            -
             | 
| 100 | 
            -
             | 
| 101 | 
            -
             | 
| 102 | 
            -
             | 
| 103 | 
            -
             | 
| 104 | 
            -
             | 
| 105 | 
            -
             | 
| 106 | 
            -
             | 
| 107 | 
            -
             | 
| 108 | 
            -
             | 
| 109 | 
            -
             | 
| 110 | 
            -
             | 
| 111 | 
            -
             | 
| 112 | 
            -
             | 
| 113 | 
            -
            #  | 
| 114 | 
            -
            def  | 
| 115 | 
            -
                with codecs.open(path,  | 
| 116 | 
            -
                     | 
| 117 | 
            -
             | 
| 118 | 
            -
             | 
| 119 | 
            -
             | 
| 120 | 
            -
             | 
| 121 | 
            -
             | 
| 122 | 
            -
             | 
| 123 | 
            -
             | 
| 124 | 
            -
             | 
| 125 | 
            -
             | 
| 126 | 
            -
             | 
| 127 | 
            -
             | 
| 128 | 
            -
             | 
| 129 | 
            -
             | 
| 130 | 
            -
             | 
| 131 | 
            -
             | 
| 132 | 
            -
             | 
| 133 | 
            -
             | 
| 134 | 
            -
             | 
| 135 | 
            -
             | 
| 136 | 
            -
             | 
| 137 | 
            -
             | 
| 138 | 
            -
             | 
| 139 | 
            -
             | 
| 140 | 
            -
             | 
| 141 | 
            -
             | 
| 142 | 
            -
             | 
| 143 | 
            -
             | 
| 144 | 
            -
             | 
| 145 | 
            -
             | 
| 146 | 
            -
             | 
| 147 | 
            -
             | 
| 148 | 
            -
             | 
| 149 | 
            -
             | 
| 150 | 
            -
             | 
| 151 | 
            -
             | 
| 152 | 
            -
             | 
| 153 | 
            -
             | 
| 154 | 
            -
             | 
| 155 | 
            -
             | 
| 156 | 
            -
            def  | 
| 157 | 
            -
                 | 
| 158 | 
            -
                     | 
| 159 | 
            -
             | 
| 160 | 
            -
             | 
| 161 | 
            -
             | 
| 162 | 
            -
             | 
| 163 | 
            -
             | 
| 164 | 
            -
             | 
| 165 | 
            -
             | 
| 166 | 
            -
             | 
| 167 | 
            -
                         | 
| 168 | 
            -
             | 
| 169 | 
            -
             | 
| 170 | 
            -
             | 
| 171 | 
            -
             | 
| 172 | 
            -
                             | 
| 173 | 
            -
             | 
| 174 | 
            -
             | 
| 175 | 
            -
             | 
| 176 | 
            -
             | 
| 177 | 
            -
             | 
| 178 | 
            -
             | 
| 179 | 
            -
             | 
| 180 | 
            -
             | 
| 181 | 
            -
             | 
| 182 | 
            -
             | 
| 183 | 
            -
             | 
| 184 | 
            -
             | 
| 185 | 
            -
            def  | 
| 186 | 
            -
                 | 
| 187 | 
            -
                     | 
| 188 | 
            -
             | 
| 189 | 
            -
                         | 
| 190 | 
            -
             | 
| 191 | 
            -
             | 
| 192 | 
            -
                         | 
| 193 | 
            -
                             | 
| 194 | 
            -
             | 
| 195 | 
            -
             | 
| 196 | 
            -
             | 
| 197 | 
            -
             | 
| 198 | 
            -
             | 
| 199 | 
            -
             | 
| 200 | 
            -
             | 
| 201 | 
            -
             | 
| 202 | 
            -
             | 
| 203 | 
            -
             | 
| 204 | 
            -
             | 
| 205 | 
            -
             | 
| 1 | 
            +
            #!/usr/bin/python3.8
         | 
| 2 | 
            +
            # -*- coding: utf-8 -*-
         | 
| 3 | 
            +
            # @Author  : youshu.Ji
         | 
| 4 | 
            +
            import codecs
         | 
| 5 | 
            +
            import json
         | 
| 6 | 
            +
            import pickle
         | 
| 7 | 
            +
            import random
         | 
| 8 | 
            +
            import time
         | 
| 9 | 
            +
            from itertools import (takewhile, repeat)
         | 
| 10 | 
            +
            import pandas as pd
         | 
| 11 | 
            +
            # import omegaconf
         | 
| 12 | 
            +
            # import yaml
         | 
| 13 | 
            +
            from ..utils.package import *
         | 
| 14 | 
            +
             | 
| 15 | 
            +
            LARGE_FILE_THRESHOLD = 1e5  # line count above which readtxt_list_all_strip wraps its loop in tqdm
         | 
| 16 | 
            +
             | 
| 17 | 
            +
             | 
def read_yaml(path, omega=False):
    """
    Load a YAML file.
    :param path: path of the YAML file
    :param omega: when true, delegate to omegaconf.OmegaConf.load and return its result
    :return: the object produced by yaml.load with FullLoader (or the OmegaConf result)
    """
    if omega:
        return omegaconf.OmegaConf.load(path)
    # Open inside a context manager so the handle is closed; it used to be leaked.
    # NOTE(review): FullLoader is applied as before; only use on trusted files.
    with codecs.open(path) as stream:
        return yaml.load(stream, Loader=yaml.FullLoader)
| 22 | 
            +
             | 
| 23 | 
            +
             | 
def _merge_file(filelist, save_filename, shuffle=False):
    """
    Concatenate the stripped lines of every file in ``filelist`` and write them to ``save_filename``.
    :param filelist: iterable of input paths, read in order with readtxt_list_all_strip
    :param save_filename: output path, written with writetxt_w_list
    :param shuffle: when true, the combined lines are shuffled in place before writing
    """
    merged = []
    for source in filelist:
        merged += readtxt_list_all_strip(source)
    if shuffle:
        random.shuffle(merged)
    writetxt_w_list(merged, save_filename)
| 32 | 
            +
             | 
| 33 | 
            +
             | 
| 34 | 
            +
            # file's io ----------------------------------------------------------------------
         | 
def iter_count(file_name):
    """
    Count the newline characters in a UTF-8 text file, reading it in 1 MiB chunks.

    Original note: the fastest line counter the author knew of; not benchmarked against ``wc -l``.
    author: unknown
    """
    chunk_size = 1024 * 1024
    total = 0
    with codecs.open(file_name, 'r', 'utf-8') as f:
        # read() returns '' at end of file, which terminates the iterator.
        for chunk in iter(lambda: f.read(chunk_size), ''):
            total += chunk.count('\n')
    return total
| 44 | 
            +
             | 
| 45 | 
            +
             | 
| 46 | 
            +
            # Functions that still need a progress bar:
         | 
| 47 | 
            +
            """
         | 
| 48 | 
            +
            readtxt_list_all_strip
         | 
| 49 | 
            +
            save_to_json
         | 
| 50 | 
            +
            load_from_json
         | 
| 51 | 
            +
            """
         | 
| 52 | 
            +
             | 
| 53 | 
            +
             | 
# Read a text file in one go and return its lines as a list, line endings removed.
def readtxt_list_all_strip(path, encoding='utf-8'):
    """
    Read every line of ``path`` and strip newline / carriage-return characters from both ends.
    :param path: file to read
    :param encoding: codec passed to codecs.open
    :return: list of stripped lines
    """
    with codecs.open(path, 'r', encoding) as r:
        raw_lines = r.readlines()
    # The progress-bar decision uses the lines already in memory. The former
    # iter_count() pre-pass read the file a second time and always decoded it
    # as UTF-8, so a non-UTF-8 file failed before ``encoding`` was ever applied.
    if len(raw_lines) > LARGE_FILE_THRESHOLD:
        raw_lines = tqdm(raw_lines, total=len(raw_lines))
    return [line.strip('\n').strip("\r") for line in raw_lines]
| 67 | 
            +
             | 
| 68 | 
            +
             | 
# Read a text file one line at a time and return the lines as a list.
def readtxt_list_each(path):
    """Return the lines of the UTF-8 file ``path`` (line endings kept), fetched via repeated readline()."""
    with codecs.open(path, 'r', 'utf-8') as r:
        # readline() returns '' at end of file, which ends the iteration.
        collected = list(iter(r.readline, ''))
    return collected
| 78 | 
            +
             | 
| 79 | 
            +
             | 
def readtxt_list_each_strip(path):
    """
    Generator version: yield each line of the UTF-8 file ``path`` with newline and
    carriage-return characters stripped from both ends.
    """
    with codecs.open(path, 'r', 'utf-8') as r:
        # readline() returns '' at end of file, which ends the iteration.
        for raw in iter(r.readline, ''):
            yield raw.strip("\n").strip("\r")
| 89 | 
            +
             | 
| 90 | 
            +
             | 
# Read a text file in one go and return its lines as a list.
def readtxt_list_all(path):
    """Return all lines of the UTF-8 file ``path`` via readlines(); line endings are kept."""
    with codecs.open(path, 'r', 'utf-8') as reader:
        return reader.readlines()
| 96 | 
            +
             | 
| 97 | 
            +
             | 
# Read a file as bytes and return it as a single string.
def readtxt_byte(path, encoding="utf-8"):
    """
    Read ``path`` in binary mode, decode it with ``encoding`` and remove every carriage return.
    :return: the decoded text without '\\r' characters
    """
    with codecs.open(path, 'rb') as r:
        text = r.read().decode(encoding)
    return text.replace('\r', '')
| 104 | 
            +
             | 
| 105 | 
            +
             | 
# Read a text file and return it as a single string.
def readtxt_string(path, encoding="utf-8"):
    """
    Read the whole of ``path`` using ``encoding`` and remove every carriage return.
    :return: the file content without '\\r' characters
    """
    with codecs.open(path, 'r', encoding) as r:
        text = r.read()
    return text.replace('\r', '')
| 111 | 
            +
             | 
| 112 | 
            +
             | 
# Write text to a file, overwriting by default.
def writetxt_w(txt, path, r='w'):
    """
    Write ``txt`` to ``path`` as UTF-8 using writelines().
    :param txt: a string or an iterable of strings
    :param path: destination file
    :param r: file mode handed to codecs.open ('w' by default)
    """
    with codecs.open(path, r, 'utf-8') as sink:
        sink.writelines(txt)
| 117 | 
            +
             | 
| 118 | 
            +
             | 
# Write text to a file, appending.
def writetxt_a(txt, path):
    """Append ``txt`` (a string or an iterable of strings) to ``path`` as UTF-8 using writelines()."""
    with codecs.open(path, 'a', 'utf-8') as sink:
        sink.writelines(txt)
| 123 | 
            +
             | 
| 124 | 
            +
             | 
def writetxt(txt, path, encoding="utf-8"):
    """Overwrite ``path`` with the string ``txt`` encoded using ``encoding``."""
    with codecs.open(path, 'w', encoding) as sink:
        sink.write(txt)
| 128 | 
            +
             | 
| 129 | 
            +
             | 
def writetxt_wb(txt, path):
    """Overwrite ``path`` in binary mode with ``txt`` (no encoding is applied by this function)."""
    with codecs.open(path, 'wb') as sink:
        sink.write(txt)
| 133 | 
            +
             | 
| 134 | 
            +
             | 
# Write a list to a file, overwriting.
def writetxt_w_list(list, path, num_lf=1):
    """
    Overwrite ``path`` (UTF-8) with each item of ``list`` followed by ``num_lf`` newline characters.
    :param list: iterable of strings
    :param path: destination file
    :param num_lf: number of '\\n' written after every item
    """
    separator = "\n" * num_lf
    with codecs.open(path, 'w', "utf-8") as sink:
        for item in list:
            sink.write(item)
            sink.write(separator)
| 141 | 
            +
             | 
| 142 | 
            +
             | 
# Write a list to a file, appending.
def writetxt_a_list(list, path, num_lf=2):
    """
    Append each item of ``list`` to ``path`` (UTF-8), each followed by ``num_lf`` newline characters.
    :param list: iterable of strings
    :param path: destination file
    :param num_lf: number of '\\n' written after every item (2 by default)
    """
    separator = "\n" * num_lf
    with codecs.open(path, 'a', "utf-8") as sink:
        for item in list:
            sink.write(item)
            sink.write(separator)
| 149 | 
            +
             | 
| 150 | 
            +
             | 
def save_to_json(content, path):
    """Serialize ``content`` as JSON into ``path`` (UTF-8, non-ASCII kept verbatim, indent of 1)."""
    with codecs.open(path, "w", "utf-8") as sink:
        json.dump(content, sink, indent=1, ensure_ascii=False)
| 154 | 
            +
             | 
| 155 | 
            +
             | 
def load_from_json(path):
    """Parse the UTF-8 JSON file at ``path`` and return the resulting object."""
    with codecs.open(path, "r", "utf-8") as source:
        return json.load(source)
| 160 | 
            +
             | 
| 161 | 
            +
             | 
# Read a text file into a single string, trying several encodings (gb2312 as the last resort).
def readtxt_string_all_encoding(path):
    """
    Read ``path`` as one string, trying utf-8-sig, utf-8 and big5 in that order.

    If none of them decodes the file, the path is printed and the file is read
    as gb2312 with undecodable bytes ignored.
    :param path: file to read
    :return: the decoded file content
    """
    # Only decoding failures trigger a fallback. The former bare ``except:``
    # clauses also hid real errors, including the ``r.reacd()`` typo that made
    # the plain utf-8 attempt always fail.
    for encoding in ("utf-8-sig", "utf-8", "big5"):
        try:
            with codecs.open(path, 'rb', encoding) as r:
                return r.read()
        except UnicodeError:
            continue
    print(path)
    with codecs.open(path, 'rb', "gb2312", errors='ignore') as r:
        return r.read()
| 183 | 
            +
             | 
| 184 | 
            +
             | 
def readtxt_list_all_encoding(path):
    """
    Read ``path`` as a list of lines, trying utf-8-sig, utf-8 and big5 in that order.

    If none of them decodes the file, it is read as gb2312 with undecodable bytes ignored.
    :param path: file to read
    :return: list of lines as produced by readlines() (line endings kept)
    """
    # Only decoding failures trigger a fallback; the former bare ``except:``
    # clauses swallowed every exception type.
    for encoding in ("utf-8-sig", "utf-8", "big5"):
        try:
            with codecs.open(path, 'rb', encoding) as r:
                return r.readlines()
        except UnicodeError:
            continue
    with codecs.open(path, 'rb', "gb2312", errors='ignore') as r:
        return r.readlines()
| 204 | 
            +
             | 
| 205 | 
            +
             | 
# JSON Lines: one JSON document per line.
def save_to_jsonl(corpus, path):
    """
    Write every item of ``corpus`` to ``path`` (UTF-8) as one JSON document per line.
    :param corpus: iterable of JSON-serializable objects
    :param path: destination file, overwritten
    """
    with open(path, 'w', encoding='utf-8') as sink:
        for record in corpus:
            serialized = json.dumps(record, ensure_ascii=False)
            sink.write(serialized)
            sink.write('\n')
| 212 | 
            +
             | 
| 213 | 
            +
             | 
# JSON Lines: one JSON document per line.
def load_from_jsonl(path):
    """
    Load a UTF-8 JSON Lines file.
    :param path: file containing one JSON document per line
    :return: list of the parsed documents, in file order
    :raises json.JSONDecodeError: if a non-blank line is not valid JSON
    """
    # The large-file and small-file branches were identical, so the extra
    # iter_count() pass over the file only cost time and has been dropped.
    corpus = []
    with open(path, 'r', encoding='utf-8') as rd:
        for line in rd:
            # Skip blank lines (e.g. a trailing empty line) instead of failing in json.loads.
            if line.strip():
                corpus.append(json.loads(line))
    return corpus
| 237 | 
            +
             | 
| 238 | 
            +
             | 
def pickle_save(data, path):
    """Pickle ``data`` into the file at ``path``, overwriting it."""
    with open(path, 'wb') as sink:
        pickle.dump(data, sink)
| 242 | 
            +
             | 
| 243 | 
            +
             | 
def pickle_load(path):
    """Unpickle and return the object stored in the file at ``path``."""
    with open(path, 'rb') as source:
        return pickle.load(source)
| 248 | 
            +
             | 
| 249 | 
            +
             | 
def save_to_csv(df, save_path, index_flag=False):
    """
    Write ``df`` to ``save_path`` as CSV preceded by a UTF-8 byte-order mark.
    :param df: object exposing ``to_csv`` (a pandas DataFrame)
    :param save_path: destination file, overwritten
    :param index_flag: forwarded to ``to_csv`` as ``index``
    """
    # Step 1: start a fresh file that contains only the BOM
    # (presumably so spreadsheet software recognises the encoding).
    with open(save_path, 'wb+') as bom_sink:
        bom_sink.write(codecs.BOM_UTF8)
    # Step 2: append the CSV text after the BOM.
    df.to_csv(save_path, index=index_flag, mode='a')
| 254 | 
            +
             | 
| 255 | 
            +
             | 
def save_to_mongo():
    # Stub: not a real implementation.
    """
    Example placeholder; does nothing and returns None.
    """
    return None
| 263 | 
            +
             | 
def load_from_mongo():
    """Placeholder; does nothing and returns None."""
    return None
| 266 | 
            +
             | 
| 267 | 
            +
             | 
def unmerge_cells_df(df) -> pd.DataFrame:
    """
    Fill missing cells in every column with the closest value above them.

    The frame is modified in place and also returned.
    :param df: DataFrame whose missing cells (as judged by pd.isna) should be filled
    :return: the same DataFrame object
    """
    for column in df.columns:
        values = []
        for cell in df[column]:
            if pd.isna(cell) and values:
                values.append(values[-1])
            else:
                # A missing cell at the top of a column has nothing above it to
                # copy, so it stays as is; this used to raise IndexError.
                values.append(cell)
        df[column] = values
    return df