nlpertools 1.0.4__py3-none-any.whl → 1.0.6.dev0__py3-none-any.whl

Sign up to get free protection for your applications and to get access to all the features.
Files changed (46) hide show
  1. nlpertools/__init__.py +24 -11
  2. nlpertools/algo/__init__.py +0 -0
  3. nlpertools/algo/ac.py +18 -0
  4. nlpertools/algo/bit_ops.py +28 -0
  5. nlpertools/algo/kmp.py +94 -0
  6. nlpertools/algo/num_ops.py +12 -0
  7. nlpertools/algo/template.py +116 -0
  8. nlpertools/algo/union.py +13 -0
  9. nlpertools/data_client.py +387 -0
  10. nlpertools/data_structure/__init__.py +0 -0
  11. nlpertools/data_structure/base_structure.py +109 -0
  12. nlpertools/dataprocess.py +611 -3
  13. nlpertools/default_db_config.yml +41 -0
  14. nlpertools/io/__init__.py +3 -3
  15. nlpertools/io/dir.py +54 -47
  16. nlpertools/io/file.py +277 -205
  17. nlpertools/ml.py +483 -317
  18. nlpertools/monitor/__init__.py +0 -0
  19. nlpertools/monitor/gpu.py +18 -0
  20. nlpertools/monitor/memory.py +24 -0
  21. nlpertools/movie.py +36 -0
  22. nlpertools/nlpertools_config.yml +1 -0
  23. nlpertools/{openApi.py → open_api.py} +65 -62
  24. nlpertools/other.py +364 -188
  25. nlpertools/pic.py +288 -0
  26. nlpertools/plugin.py +43 -34
  27. nlpertools/reminder.py +98 -15
  28. nlpertools/template/__init__.py +0 -0
  29. nlpertools/utils/__init__.py +3 -0
  30. nlpertools/utils/lazy.py +727 -0
  31. nlpertools/utils/log_util.py +20 -0
  32. nlpertools/utils/package.py +89 -0
  33. nlpertools/utils/package_v1.py +94 -0
  34. nlpertools/utils/package_v2.py +117 -0
  35. nlpertools/utils_for_nlpertools.py +93 -0
  36. nlpertools/vector_index_demo.py +108 -0
  37. nlpertools/wrapper.py +161 -0
  38. {nlpertools-1.0.4.dist-info → nlpertools-1.0.6.dev0.dist-info}/LICENSE +200 -200
  39. nlpertools-1.0.6.dev0.dist-info/METADATA +111 -0
  40. nlpertools-1.0.6.dev0.dist-info/RECORD +43 -0
  41. {nlpertools-1.0.4.dist-info → nlpertools-1.0.6.dev0.dist-info}/WHEEL +1 -1
  42. nlpertools-1.0.6.dev0.dist-info/top_level.txt +2 -0
  43. nlpertools_helper/__init__.py +10 -0
  44. nlpertools-1.0.4.dist-info/METADATA +0 -42
  45. nlpertools-1.0.4.dist-info/RECORD +0 -15
  46. nlpertools-1.0.4.dist-info/top_level.txt +0 -1
nlpertools/io/dir.py CHANGED
@@ -1,47 +1,54 @@
1
- #!/usr/bin/python3.8
2
- # -*- coding: utf-8 -*-
3
- # @Author : youshu.Ji
4
- import codecs
5
- import os
6
- import json
7
- import pickle
8
- import time
9
- # dir ----------------------------------------------------------------------
10
- def j_mkdir(name):
11
- os.makedirs(name, exist_ok=True)
12
-
13
-
14
- def get_filename(path):
15
- '''
16
- 返回路径最后的文件名
17
- :param path:
18
- :return:
19
- '''
20
- # path = r'***/**/***.txt'
21
- filename = os.path.split(path)[-1]
22
- return filename
23
-
24
-
25
- # TODO 还没写
26
- def walk():
27
- paths = os.walk(r'F:\**\**\**\***')
28
- for root, dir, files in paths:
29
- for name in files:
30
- if name == '***.**':
31
- os.remove(os.path.join(root, name))
32
-
33
-
34
- def j_listdir(dir_name, including_dir=True):
35
- # yield
36
- filenames = os.listdir(dir_name)
37
- for filename in filenames:
38
- if including_dir:
39
- yield os.path.join(dir_name, filename)
40
- else:
41
- yield filename
42
-
43
- # 合并文件 TODO 还没写
44
- def imgrate_files(path):
45
- filenames = os.listdir(path)
46
- return None
47
-
1
+ #!/usr/bin/python3.8
2
+ # -*- coding: utf-8 -*-
3
+ # @Author : youshu.Ji
4
+ import os
5
+ from pathlib import Path
6
+
7
+
8
+ # dir ----------------------------------------------------------------------
9
def j_mkdir(name):
    """Create directory *name* (including parents); no error if it already exists."""
    Path(name).mkdir(parents=True, exist_ok=True)
11
+
12
+
13
def get_filename(path) -> str:
    """
    Return the final component (file name) of a path.
    :param path: a file-system path string, e.g. '***/**/***.txt'
    :return: the last path component
    """
    return os.path.basename(path)
22
+
23
+
24
def j_listdir(dir_name, including_dir=True):
    """
    Yield the entries of *dir_name* one by one.
    :param including_dir: if True, yield paths joined with *dir_name*;
                          otherwise yield bare entry names.
    """
    for entry in os.listdir(dir_name):
        yield os.path.join(dir_name, entry) if including_dir else entry
32
+
33
+
34
+ # 合并文件 TODO 还没写
35
def imgrate_files(path):
    """Placeholder for merging the files under *path*; currently a no-op (TODO)."""
    os.listdir(path)
    return None
38
+
39
+
40
def case_sensitive_path_exists(path: str, relative_path=False):
    """
    https://juejin.cn/post/7316725867086692391
    Check if the path exists in a case-sensitive manner.

    Useful on case-insensitive filesystems where Path.exists() matches
    regardless of letter case.

    NOTE(review): when relative_path is False but a relative *path* is
    passed, resolve() returns an absolute path, so the string comparison
    below returns False even when the case is correct — confirm callers
    always pass absolute paths in that case. resolve() also follows
    symlinks, which would likewise make the comparison fail; verify that
    is acceptable.
    """
    # Build a Path object
    if relative_path:
        path = Path.cwd() / path
    else:
        path = Path(path)
    if not path.exists():
        return False
    # resolved_path is the file's actual on-disk name (real letter case)
    resolved_path = path.resolve()
    return str(resolved_path) == str(path)
nlpertools/io/file.py CHANGED
@@ -1,205 +1,277 @@
1
- #!/usr/bin/python3.8
2
- # -*- coding: utf-8 -*-
3
- # @Author : youshu.Ji
4
- import codecs
5
- import os
6
- import json
7
- import pickle
8
- import time
9
-
10
- def _merge_file(filelist, save_filename, shuffle=False):
11
- contents = []
12
- for file in filelist:
13
- content = nlpertools.readtxt_list_all_strip(file)
14
- contents.extend(content)
15
- if shuffle:
16
- random.shuffle(contents)
17
- nlpertools.writetxt_w_list(contents, save_filename)
18
-
19
-
20
- # file's io ----------------------------------------------------------------------
21
- # 读txt文件 一次全读完 返回list 去换行
22
- def readtxt_list_all_strip(path, encoding='utf-8'):
23
- lines = []
24
- with codecs.open(path, 'r', encoding) as r:
25
- for line in r.readlines():
26
- line = line.strip('\n').strip("\r")
27
- lines.append(line)
28
- return lines
29
-
30
-
31
- # 读txt 一次读一行 最后返回list
32
- def readtxt_list_each(path):
33
- lines = []
34
- with codecs.open(path, 'r', 'utf-8') as r:
35
- line = r.readline()
36
- while line:
37
- lines.append(line)
38
- line = r.readline()
39
- return lines
40
-
41
-
42
- # 读txt 一次读一行 最后返回list 去换行
43
- def readtxt_list_each_strip(path):
44
- lines = []
45
- with codecs.open(path, 'r', 'utf-8') as r:
46
- line = r.readline()
47
- while line:
48
- lines.append(line.strip("\n").strip("\r"))
49
- line = r.readline()
50
- return lines
51
-
52
-
53
- # 读txt文件 一次全读完 返回list
54
- def readtxt_list_all(path):
55
- with codecs.open(path, 'r', 'utf-8') as r:
56
- lines = r.readlines()
57
- return lines
58
-
59
-
60
- # 读byte文件 读成一条string
61
- def readtxt_byte(path, encoding="utf-8"):
62
- with codecs.open(path, 'rb') as r:
63
- lines = r.read()
64
- lines = lines.decode(encoding)
65
- return lines.replace('\r', '')
66
-
67
-
68
- # 读txt文件 读成一条string
69
- def readtxt_string(path, encoding="utf-8"):
70
- with codecs.open(path, 'r', encoding) as r:
71
- lines = r.read()
72
- return lines.replace('\r', '')
73
-
74
-
75
- # 写txt文件覆盖
76
- def writetxt_w(txt, path, r='w'):
77
- with codecs.open(path, r, 'utf-8') as w:
78
- w.writelines(txt)
79
-
80
-
81
- # 写txt文件追加
82
- def writetxt_a(txt, path):
83
- with codecs.open(path, 'a', 'utf-8') as w:
84
- w.writelines(txt)
85
-
86
-
87
- def writetxt(txt, path, encoding="utf-8"):
88
- with codecs.open(path, 'w', encoding) as w:
89
- w.write(txt)
90
-
91
-
92
- def writetxt_wb(txt, path):
93
- with codecs.open(path, 'wb') as w:
94
- w.write(txt)
95
-
96
-
97
- # 写list 覆盖
98
- def writetxt_w_list(list, path, num_lf=1):
99
- with codecs.open(path, 'w', "utf-8") as w:
100
- for i in list:
101
- w.write(i)
102
- w.write("\n" * num_lf)
103
-
104
-
105
- # 写list 追加
106
- def writetxt_a_list(list, path, num_lf=2):
107
- with codecs.open(path, 'a', "utf-8") as w:
108
- for i in list:
109
- w.write(i)
110
- w.write("\n" * num_lf)
111
-
112
-
113
- # 写二维list 追加
114
- def writetxt_a_2list(list, path):
115
- with codecs.open(path, 'a', "utf-8") as w:
116
- for i in list:
117
- writetxt_a_list(i, path)
118
-
119
-
120
-
121
-
122
- def SaveToJson(content, path):
123
- with codecs.open(path, "w", "utf-8") as w:
124
- json.dump(content, w, ensure_ascii=False, indent=1)
125
-
126
-
127
- def LoadFromJson(path):
128
- with codecs.open(path, "r", "utf-8") as r:
129
- content = json.load(r)
130
- return content
131
-
132
-
133
- # 读txt文件 读成一条string if gb2312
134
- def readtxt_string_all_encoding(path):
135
- try:
136
- with codecs.open(path, 'rb', "utf-8-sig") as r:
137
- lines = r.read()
138
- return lines
139
- except:
140
- try:
141
- with codecs.open(path, 'rb', "utf-8") as r:
142
- lines = r.reacd()
143
- return lines
144
- except:
145
- try:
146
- with codecs.open(path, 'rb', "big5") as r:
147
- lines = r.read()
148
- return lines
149
- except:
150
- print(path)
151
- with codecs.open(path, 'rb', "gb2312", errors='ignore') as r:
152
- lines = r.read()
153
- return lines
154
-
155
-
156
- def readtxt_list_all_encoding(path):
157
- try:
158
- with codecs.open(path, 'rb', "utf-8-sig") as r:
159
- lines = r.readlines()
160
- return lines
161
- except:
162
- try:
163
- with codecs.open(path, 'rb', "utf-8") as r:
164
- lines = r.readlines()
165
- return lines
166
- except:
167
- try:
168
- with codecs.open(path, 'rb', "big5") as r:
169
- lines = r.readlines()
170
- return lines
171
- except:
172
- with codecs.open(path, 'rb', "gb2312", errors='ignore') as r:
173
- lines = r.readlines()
174
- return lines
175
-
176
- # line by line
177
- def save_to_json(corpus, path):
178
- with open(path, 'w', encoding='utf-8') as wt:
179
- for i in corpus:
180
- wt.write(json.dumps(i, ensure_ascii=False))
181
- wt.write('\n')
182
-
183
-
184
- # line by line
185
- def load_from_json(path):
186
- with open(path, 'r', encoding='utf-8') as rd:
187
- corpus = []
188
- while True:
189
- line = rd.readline()
190
- if line:
191
- corpus.append(json.loads(line))
192
- else:
193
- break
194
- return corpus
195
-
196
- def pickle_save(data, path):
197
- with open(path, 'wb') as f:
198
- pickle.dump(data, f)
199
-
200
-
201
- def pickle_load(path):
202
- with open(path, 'rb') as f:
203
- data = pickle.load(f)
204
- return data
205
-
1
+ #!/usr/bin/python3.8
2
+ # -*- coding: utf-8 -*-
3
+ # @Author : youshu.Ji
4
+ import codecs
5
+ import json
6
+ import pickle
7
+ import random
8
+ import time
9
+ from itertools import (takewhile, repeat)
10
+ import pandas as pd
11
+ # import omegaconf
12
+ # import yaml
13
+ from ..utils.package import *
14
+
15
+ LARGE_FILE_THRESHOLD = 1e5
16
+
17
+
18
def read_yaml(path, omega=False):
    """
    Load a YAML config file.
    :param path: path to the YAML file
    :param omega: if True, parse with omegaconf and return an OmegaConf object
    :return: the parsed configuration
    """
    if omega:
        return omegaconf.OmegaConf.load(path)
    # Fix: the original passed codecs.open(path) straight to yaml.load and
    # never closed the handle; a context manager closes the file.
    with codecs.open(path) as f:
        return yaml.load(f, Loader=yaml.FullLoader)
22
+
23
+
24
def _merge_file(filelist, save_filename, shuffle=False):
    """
    Concatenate the (stripped) lines of every file in *filelist* and write
    them to *save_filename*, optionally shuffling the merged lines first.
    """
    merged = []
    for fname in filelist:
        merged.extend(readtxt_list_all_strip(fname))
    if shuffle:
        random.shuffle(merged)
    writetxt_w_list(merged, save_filename)
32
+
33
+
34
+ # file's io ----------------------------------------------------------------------
35
def iter_count(file_name):
    """
    Fast line count: stream the file in 1 MiB chunks and sum '\n' occurrences.
    (A final line without a trailing newline is not counted.)
    author: unknown
    """
    chunk_size = 1024 * 1024
    total = 0
    with codecs.open(file_name, 'r', 'utf-8') as f:
        while True:
            chunk = f.read(chunk_size)
            if not chunk:
                break
            total += chunk.count('\n')
    return total
44
+
45
+
46
+ # 需要加入进度条的函数包括
47
+ """
48
+ readtxt_list_all_strip
49
+ save_to_json
50
+ load_from_json
51
+ """
52
+
53
+
54
+ # 读txt文件 一次全读完 返回list 去换行
55
def readtxt_list_all_strip(path, encoding='utf-8'):
    """
    Read the whole text file and return its lines with trailing newline
    characters ('\n'/'\r') stripped.  Files with more than
    LARGE_FILE_THRESHOLD lines get a tqdm progress bar.
    """
    file_line_num = iter_count(path)
    with codecs.open(path, 'r', encoding) as r:
        line_iter = r.readlines()
        if file_line_num > LARGE_FILE_THRESHOLD:
            # only pay for the progress bar on large files
            line_iter = tqdm(line_iter, total=file_line_num)
        # Fix: dropped the unused enumerate index (ldx) from the original loop
        return [line.strip('\n').strip('\r') for line in line_iter]
67
+
68
+
69
+ # 读txt 一次读一行 最后返回list
70
def readtxt_list_each(path):
    """Read *path* one line at a time and return all lines (newlines kept)."""
    with codecs.open(path, 'r', 'utf-8') as r:
        return [line for line in r]
78
+
79
+
80
def readtxt_list_each_strip(path):
    """
    Generator version: yield the lines of *path* one at a time with
    trailing '\n'/'\r' stripped.
    """
    with codecs.open(path, 'r', 'utf-8') as r:
        for line in r:
            yield line.strip("\n").strip("\r")
89
+
90
+
91
+ # 读txt文件 一次全读完 返回list
92
def readtxt_list_all(path):
    """Read the whole file at once and return the raw lines (newlines kept)."""
    with codecs.open(path, 'r', 'utf-8') as r:
        return list(r)
96
+
97
+
98
+ # 读byte文件 读成一条string
99
def readtxt_byte(path, encoding="utf-8"):
    """Read *path* as bytes, decode with *encoding*, and drop '\r' characters."""
    with codecs.open(path, 'rb') as r:
        raw = r.read()
    return raw.decode(encoding).replace('\r', '')
104
+
105
+
106
+ # 读txt文件 读成一条string
107
def readtxt_string(path, encoding="utf-8"):
    """Read the whole file into a single string, with '\r' removed."""
    with codecs.open(path, 'r', encoding) as r:
        content = r.read()
    return content.replace('\r', '')
111
+
112
+
113
+ # 写txt文件覆盖
114
def writetxt_w(txt, path, r='w'):
    """Write *txt* (string or iterable of strings) to *path* in utf-8; mode *r* defaults to overwrite."""
    with codecs.open(path, r, 'utf-8') as out:
        out.writelines(txt)
117
+
118
+
119
+ # 写txt文件追加
120
def writetxt_a(txt, path):
    """Append *txt* (string or iterable of strings) to *path* in utf-8."""
    with codecs.open(path, 'a', 'utf-8') as out:
        out.writelines(txt)
123
+
124
+
125
def writetxt(txt, path, encoding="utf-8"):
    """Write the string *txt* to *path* using *encoding* (overwrite)."""
    with codecs.open(path, 'w', encoding) as out:
        out.write(txt)
128
+
129
+
130
def writetxt_wb(txt, path):
    """Write raw bytes *txt* to *path* (overwrite, binary mode)."""
    with codecs.open(path, 'wb') as out:
        out.write(txt)
133
+
134
+
135
+ # 写list 覆盖
136
def writetxt_w_list(list, path, num_lf=1):
    """
    Write each item of *list* to *path* (overwrite), each followed by
    *num_lf* newline characters.
    """
    # NOTE: parameter name `list` shadows the builtin but is kept for API compatibility
    suffix = "\n" * num_lf
    with codecs.open(path, 'w', "utf-8") as out:
        for item in list:
            out.write(item + suffix)
141
+
142
+
143
+ # 写list 追加
144
def writetxt_a_list(list, path, num_lf=2):
    """
    Append each item of *list* to *path*, each followed by *num_lf*
    newline characters.
    """
    # NOTE: parameter name `list` shadows the builtin but is kept for API compatibility
    suffix = "\n" * num_lf
    with codecs.open(path, 'a', "utf-8") as out:
        for item in list:
            out.write(item + suffix)
149
+
150
+
151
def save_to_json(content, path):
    """Dump *content* to *path* as pretty-printed (indent=1) UTF-8 JSON."""
    with codecs.open(path, "w", "utf-8") as out:
        out.write(json.dumps(content, ensure_ascii=False, indent=1))
154
+
155
+
156
def load_from_json(path):
    """Load and return the JSON document stored at *path*."""
    with codecs.open(path, "r", "utf-8") as r:
        return json.load(r)
160
+
161
+
162
+ # 读txt文件 读成一条string if gb2312
163
def readtxt_string_all_encoding(path):
    """
    Read *path* into a single string, trying encodings in order:
    utf-8-sig, utf-8, big5, then gb2312 (with undecodable bytes ignored).
    """
    for encoding in ("utf-8-sig", "utf-8", "big5"):
        try:
            with codecs.open(path, 'rb', encoding) as r:
                # Fix: the original utf-8 branch called r.reacd() (typo),
                # which always raised and silently skipped utf-8 decoding.
                return r.read()
        except Exception:  # narrowed from bare except; decode errors fall through
            continue
    print(path)
    with codecs.open(path, 'rb', "gb2312", errors='ignore') as r:
        return r.read()
183
+
184
+
185
def readtxt_list_all_encoding(path):
    """
    Read *path* as a list of lines, trying encodings in order:
    utf-8-sig, utf-8, big5, then gb2312 (with undecodable bytes ignored).
    """
    for encoding in ("utf-8-sig", "utf-8", "big5"):
        try:
            with codecs.open(path, 'rb', encoding) as r:
                return r.readlines()
        except Exception:  # narrowed from bare except; decode errors fall through
            continue
    with codecs.open(path, 'rb', "gb2312", errors='ignore') as r:
        return r.readlines()
204
+
205
+
206
+ # line by line
207
def save_to_jsonl(corpus, path):
    """Write *corpus* to *path* as JSON Lines: one JSON document per line."""
    with open(path, 'w', encoding='utf-8') as wt:
        for record in corpus:
            wt.write(json.dumps(record, ensure_ascii=False) + '\n')
212
+
213
+
214
+ # line by line
215
def load_from_jsonl(path):
    """
    Read a JSON-Lines file into a list (one parsed document per line).

    Fix: the original branched on iter_count(path) but both branches were
    byte-identical, and the line count cost an extra full pass over the
    file; a single streaming pass does the same work.
    """
    with open(path, 'r', encoding='utf-8') as rd:
        return [json.loads(line) for line in rd]
237
+
238
+
239
def pickle_save(data, path):
    """Serialize *data* to *path* with pickle."""
    with open(path, 'wb') as out:
        pickle.dump(data, out)
242
+
243
+
244
def pickle_load(path):
    """Deserialize and return the pickled object stored at *path*."""
    with open(path, 'rb') as f:
        return pickle.load(f)
248
+
249
+
250
def save_to_csv(df, save_path, index_flag=False):
    """
    Write DataFrame *df* to *save_path* as CSV prefixed with a UTF-8 BOM
    (so spreadsheet apps detect the encoding).
    :param index_flag: whether to write the DataFrame index column
    """
    # Write and CLOSE the BOM first, then append the CSV content.  Fix:
    # appending while the 'wb+' handle is still open risks the buffered BOM
    # being flushed over the CSV bytes on close.
    with open(save_path, 'wb+') as bom_file:
        bom_file.write(codecs.BOM_UTF8)
    df.to_csv(save_path, mode='a', index=index_flag)
254
+
255
+
256
def save_to_mongo():
    # fake: placeholder only, nothing is implemented yet
    """
    Example (placeholder).

    TODO: implement saving to MongoDB.
    """
    pass
263
+
264
def load_from_mongo():
    # Placeholder: loading from MongoDB is not implemented yet.
    pass
266
+
267
+
268
def unmerge_cells_df(df) -> pd.DataFrame:
    """
    Fill the NaN cells left by unmerging spreadsheet cells: each NaN takes
    the value of the nearest non-NaN cell above it in the same column.
    Mutates *df* in place and returns it.

    Fix: the original indexed values[-1] and raised IndexError when a
    column's FIRST cell was NaN; forward-fill handles that by leaving
    leading NaNs in place.
    """
    for column in df.columns:
        df[column] = df[column].ffill()
    return df