nlpertools 1.0.5__py3-none-any.whl → 1.0.8__py3-none-any.whl

Sign up to get free protection for your applications and to get access to all the features.
Files changed (49) hide show
  1. nlpertools/__init__.py +23 -20
  2. nlpertools/algo/ac.py +18 -0
  3. nlpertools/algo/bit_ops.py +28 -0
  4. nlpertools/algo/kmp.py +94 -55
  5. nlpertools/algo/num_ops.py +12 -0
  6. nlpertools/algo/template.py +116 -0
  7. nlpertools/algo/union.py +13 -0
  8. nlpertools/cli.py +87 -0
  9. nlpertools/data_client.py +426 -257
  10. nlpertools/data_structure/base_structure.py +109 -13
  11. nlpertools/dataprocess.py +627 -3
  12. nlpertools/default_db_config.yml +41 -0
  13. nlpertools/draw/__init__.py +0 -0
  14. nlpertools/draw/draw.py +83 -0
  15. nlpertools/draw/math_func.py +33 -0
  16. nlpertools/get_2fa.py +0 -0
  17. nlpertools/io/__init__.py +3 -3
  18. nlpertools/io/dir.py +86 -36
  19. nlpertools/io/file.py +283 -222
  20. nlpertools/ml.py +511 -460
  21. nlpertools/monitor/__init__.py +0 -0
  22. nlpertools/monitor/gpu.py +18 -0
  23. nlpertools/monitor/memory.py +24 -0
  24. nlpertools/movie.py +36 -0
  25. nlpertools/nlpertools_config.yml +1 -0
  26. nlpertools/{openApi.py → open_api.py} +65 -65
  27. nlpertools/other.py +475 -249
  28. nlpertools/pic.py +288 -0
  29. nlpertools/plugin.py +43 -43
  30. nlpertools/reminder.py +98 -87
  31. nlpertools/utils/__init__.py +3 -3
  32. nlpertools/utils/lazy.py +727 -0
  33. nlpertools/utils/log_util.py +20 -0
  34. nlpertools/utils/package.py +89 -76
  35. nlpertools/utils/package_v1.py +94 -0
  36. nlpertools/utils/package_v2.py +117 -0
  37. nlpertools/utils_for_nlpertools.py +93 -93
  38. nlpertools/vector_index_demo.py +108 -0
  39. nlpertools/wrapper.py +161 -96
  40. {nlpertools-1.0.5.dist-info → nlpertools-1.0.8.dist-info}/LICENSE +200 -200
  41. nlpertools-1.0.8.dist-info/METADATA +132 -0
  42. nlpertools-1.0.8.dist-info/RECORD +49 -0
  43. {nlpertools-1.0.5.dist-info → nlpertools-1.0.8.dist-info}/WHEEL +1 -1
  44. nlpertools-1.0.8.dist-info/entry_points.txt +2 -0
  45. nlpertools-1.0.8.dist-info/top_level.txt +2 -0
  46. nlpertools_helper/__init__.py +10 -0
  47. nlpertools-1.0.5.dist-info/METADATA +0 -85
  48. nlpertools-1.0.5.dist-info/RECORD +0 -25
  49. nlpertools-1.0.5.dist-info/top_level.txt +0 -1
nlpertools/io/file.py CHANGED
@@ -1,222 +1,283 @@
1
- #!/usr/bin/python3.8
2
- # -*- coding: utf-8 -*-
3
- # @Author : youshu.Ji
4
- import codecs
5
- import json
6
- import pickle
7
- import random
8
- import time
9
-
10
- # import yaml
11
- from ..utils.package import *
12
-
13
-
14
- def read_yaml(path):
15
- return yaml.load(codecs.open(path), Loader=yaml.FullLoader)
16
-
17
-
18
- def _merge_file(filelist, save_filename, shuffle=False):
19
- contents = []
20
- for file in filelist:
21
- content = readtxt_list_all_strip(file)
22
- contents.extend(content)
23
- if shuffle:
24
- random.shuffle(contents)
25
- writetxt_w_list(contents, save_filename)
26
-
27
-
28
- # file's io ----------------------------------------------------------------------
29
- # 读txt文件 一次全读完 返回list 去换行
30
- def readtxt_list_all_strip(path, encoding='utf-8'):
31
- lines = []
32
- t_start = time.time()
33
- with codecs.open(path, 'r', encoding) as r:
34
- for ldx, line in enumerate(r.readlines()):
35
- line = line.strip('\n').strip("\r")
36
- lines.append(line)
37
- if ldx > 1e5:
38
- t_end = time.time()
39
- print("read {} over, cos time {} ms".format(path, t_end - t_start))
40
- return lines
41
-
42
-
43
- # 读txt 一次读一行 最后返回list
44
- def readtxt_list_each(path):
45
- lines = []
46
- with codecs.open(path, 'r', 'utf-8') as r:
47
- line = r.readline()
48
- while line:
49
- lines.append(line)
50
- line = r.readline()
51
- return lines
52
-
53
-
54
- def readtxt_list_each_strip(path):
55
- """
56
- yield方法
57
- """
58
- with codecs.open(path, 'r', 'utf-8') as r:
59
- line = r.readline()
60
- while line:
61
- line = r.readline()
62
- yield line.strip("\n").strip("\r")
63
-
64
-
65
- # 读txt文件 一次全读完 返回list
66
- def readtxt_list_all(path):
67
- with codecs.open(path, 'r', 'utf-8') as r:
68
- lines = r.readlines()
69
- return lines
70
-
71
-
72
- # 读byte文件 读成一条string
73
- def readtxt_byte(path, encoding="utf-8"):
74
- with codecs.open(path, 'rb') as r:
75
- lines = r.read()
76
- lines = lines.decode(encoding)
77
- return lines.replace('\r', '')
78
-
79
-
80
- # 读txt文件 读成一条string
81
- def readtxt_string(path, encoding="utf-8"):
82
- with codecs.open(path, 'r', encoding) as r:
83
- lines = r.read()
84
- return lines.replace('\r', '')
85
-
86
-
87
- # 写txt文件覆盖
88
- def writetxt_w(txt, path, r='w'):
89
- with codecs.open(path, r, 'utf-8') as w:
90
- w.writelines(txt)
91
-
92
-
93
- # 写txt文件追加
94
- def writetxt_a(txt, path):
95
- with codecs.open(path, 'a', 'utf-8') as w:
96
- w.writelines(txt)
97
-
98
-
99
- def writetxt(txt, path, encoding="utf-8"):
100
- with codecs.open(path, 'w', encoding) as w:
101
- w.write(txt)
102
-
103
-
104
- def writetxt_wb(txt, path):
105
- with codecs.open(path, 'wb') as w:
106
- w.write(txt)
107
-
108
-
109
- # 写list 覆盖
110
- def writetxt_w_list(list, path, num_lf=1):
111
- with codecs.open(path, 'w', "utf-8") as w:
112
- for i in list:
113
- w.write(i)
114
- w.write("\n" * num_lf)
115
-
116
-
117
- # 写list 追加
118
- def writetxt_a_list(list, path, num_lf=2):
119
- with codecs.open(path, 'a', "utf-8") as w:
120
- for i in list:
121
- w.write(i)
122
- w.write("\n" * num_lf)
123
-
124
-
125
- # 写二维list 追加
126
- def writetxt_a_2list(list, path):
127
- with codecs.open(path, 'a', "utf-8") as w:
128
- for i in list:
129
- writetxt_a_list(i, path)
130
-
131
-
132
- def SaveToJson(content, path):
133
- with codecs.open(path, "w", "utf-8") as w:
134
- json.dump(content, w, ensure_ascii=False, indent=1)
135
-
136
-
137
- def LoadFromJson(path):
138
- with codecs.open(path, "r", "utf-8") as r:
139
- content = json.load(r)
140
- return content
141
-
142
-
143
- # 读txt文件 读成一条string if gb2312
144
- def readtxt_string_all_encoding(path):
145
- try:
146
- with codecs.open(path, 'rb', "utf-8-sig") as r:
147
- lines = r.read()
148
- return lines
149
- except:
150
- try:
151
- with codecs.open(path, 'rb', "utf-8") as r:
152
- lines = r.reacd()
153
- return lines
154
- except:
155
- try:
156
- with codecs.open(path, 'rb', "big5") as r:
157
- lines = r.read()
158
- return lines
159
- except:
160
- print(path)
161
- with codecs.open(path, 'rb', "gb2312", errors='ignore') as r:
162
- lines = r.read()
163
- return lines
164
-
165
-
166
- def readtxt_list_all_encoding(path):
167
- try:
168
- with codecs.open(path, 'rb', "utf-8-sig") as r:
169
- lines = r.readlines()
170
- return lines
171
- except:
172
- try:
173
- with codecs.open(path, 'rb', "utf-8") as r:
174
- lines = r.readlines()
175
- return lines
176
- except:
177
- try:
178
- with codecs.open(path, 'rb', "big5") as r:
179
- lines = r.readlines()
180
- return lines
181
- except:
182
- with codecs.open(path, 'rb', "gb2312", errors='ignore') as r:
183
- lines = r.readlines()
184
- return lines
185
-
186
-
187
- # line by line
188
- def save_to_json(corpus, path):
189
- with open(path, 'w', encoding='utf-8') as wt:
190
- for i in corpus:
191
- wt.write(json.dumps(i, ensure_ascii=False))
192
- wt.write('\n')
193
-
194
-
195
- # line by line
196
- def load_from_json(path):
197
- with open(path, 'r', encoding='utf-8') as rd:
198
- corpus = []
199
- while True:
200
- line = rd.readline()
201
- if line:
202
- corpus.append(json.loads(line))
203
- else:
204
- break
205
- return corpus
206
-
207
-
208
- def pickle_save(data, path):
209
- with open(path, 'wb') as f:
210
- pickle.dump(data, f)
211
-
212
-
213
- def pickle_load(path):
214
- with open(path, 'rb') as f:
215
- data = pickle.load(f)
216
- return data
217
-
218
-
219
- def save_to_csv(df, save_path, index_flag=False):
220
- with open(save_path, 'wb+') as csvfile:
221
- csvfile.write(codecs.BOM_UTF8)
222
- df.to_csv(save_path, mode='a', index=index_flag)
1
#!/usr/bin/python3.8
# -*- coding: utf-8 -*-
# @Author : youshu.Ji
import codecs
import json
import pickle
import random
from itertools import (takewhile, repeat)
from typing import Iterator

import pandas as pd

# import omegaconf
# import yaml
from ..utils.package import *
13
+
14
+ LARGE_FILE_THRESHOLD = 1e5
15
+
16
+
17
def safe_filename(filename: str) -> str:
    """Replace characters that are illegal in Windows filenames with '_'."""
    illegal = ['\\', '/', ':', '*', '?', '"', '<', '>', '|']
    for ch in illegal:
        filename = filename.replace(ch, '_')
    return filename
21
+
22
+
23
def read_yaml(path, omega=False):
    """Load a YAML file.

    Args:
        path: path to the YAML file.
        omega: if True, load via OmegaConf and return an OmegaConf object;
            otherwise parse with PyYAML's FullLoader into plain Python objects.

    Fixes: the module-level ``import omegaconf`` is commented out, so the
    ``omega=True`` branch raised NameError; it is now imported lazily here.
    The original also never closed the handle opened by ``codecs.open``.
    """
    if omega:
        # Lazy import: omegaconf is an optional dependency, deliberately not
        # imported at module level.
        import omegaconf
        return omegaconf.OmegaConf.load(path)
    with codecs.open(path, encoding='utf-8') as f:
        return yaml.load(f, Loader=yaml.FullLoader)
27
+
28
+
29
def _merge_file(filelist, save_filename, shuffle=False):
    """Concatenate the (stripped) lines of several text files into one file.

    Optionally shuffles the combined lines in place before writing.
    """
    merged = []
    for filepath in filelist:
        merged.extend(readtxt_list_all_strip(filepath))
    if shuffle:
        random.shuffle(merged)
    writetxt_w_list(merged, save_filename)
37
+
38
+
39
+ # file's io ----------------------------------------------------------------------
40
def iter_count(file_name):
    """Count the newline characters in *file_name*.

    Streams the file in 1 MiB chunks so arbitrarily large files are counted
    without loading everything into memory. (Original note: possibly as fast
    as ``wc -l``; author unknown.)
    """
    chunk_size = 1024 * 1024
    total = 0
    with codecs.open(file_name, 'r', 'utf-8') as f:
        while True:
            chunk = f.read(chunk_size)
            if not chunk:
                break
            total += chunk.count('\n')
    return total
49
+
50
+
51
+ # 需要加入进度条的函数包括
52
+ """
53
+ readtxt_list_all_strip
54
+ save_to_json
55
+ load_from_json
56
+ """
57
+
58
+
59
# Read a whole text file at once; return a list of lines with CR/LF stripped.
def readtxt_list_all_strip(path, encoding='utf-8') -> list:
    """Read every line of *path*, strip trailing '\\n'/'\\r', return a list.

    Shows a tqdm progress bar when the file has more than
    LARGE_FILE_THRESHOLD lines (requires an extra counting pass first).
    """
    total = iter_count(path)
    with codecs.open(path, 'r', encoding) as reader:
        numbered = enumerate(reader.readlines())
        if total > LARGE_FILE_THRESHOLD:
            numbered = tqdm(numbered, total=total)
        return [text.strip('\n').strip("\r") for _, text in numbered]
72
+
73
+
74
# Read a text file one line at a time; return the raw lines (endings kept).
def readtxt_list_each(path) -> list:
    """Read *path* line by line and return the list of raw lines."""
    collected = []
    with codecs.open(path, 'r', 'utf-8') as reader:
        current = reader.readline()
        while current:
            collected.append(current)
            current = reader.readline()
    return collected
83
+
84
+
85
def readtxt_list_each_strip(path) -> Iterator[str]:
    """Lazily yield each line of *path* with trailing '\\n'/'\\r' stripped.

    Fix: this function is a generator, but it was annotated ``-> list``,
    which misleads callers into treating the result as a list; the
    annotation now reflects reality. Iterate it or wrap in ``list(...)``.
    """
    with codecs.open(path, 'r', 'utf-8') as reader:
        line = reader.readline()
        while line:
            yield line.strip("\n").strip("\r")
            line = reader.readline()
94
+
95
+
96
# Read the whole text file at once; return raw lines (endings kept).
def readtxt_list_all(path) -> list:
    """Return every line of *path* as produced by ``readlines``."""
    with codecs.open(path, 'r', 'utf-8') as reader:
        return reader.readlines()
101
+
102
+
103
# Read a file as raw bytes, decode it, and return one string without '\r'.
def readtxt_byte(path, encoding="utf-8") -> str:
    """Read *path* in binary mode, decode with *encoding*, drop '\\r'."""
    with codecs.open(path, 'rb') as reader:
        raw = reader.read()
    return raw.decode(encoding).replace('\r', '')
109
+
110
+
111
# Read a text file into a single string, removing carriage returns.
def readtxt_string(path, encoding="utf-8") -> str:
    """Return the full text of *path* with every '\\r' removed."""
    with codecs.open(path, 'r', encoding) as reader:
        content = reader.read()
    return content.replace('\r', '')
116
+
117
+
118
# Write text to a file; overwrites by default (mode selectable via *r*).
def writetxt_w(txt, path, r='w'):
    """Write *txt* (a string or iterable of strings) to *path* using mode *r*."""
    with codecs.open(path, r, 'utf-8') as writer:
        writer.writelines(txt)
122
+
123
+
124
# Append text to a file.
def writetxt_a(txt, path):
    """Append *txt* (a string or iterable of strings) to *path*."""
    with codecs.open(path, 'a', 'utf-8') as writer:
        writer.writelines(txt)
128
+
129
+
130
def writetxt(txt, path, encoding="utf-8"):
    """Overwrite *path* with the string *txt* using *encoding*."""
    with codecs.open(path, 'w', encoding) as writer:
        writer.write(txt)
133
+
134
+
135
def writetxt_wb(txt, path):
    """Overwrite *path* with the bytes object *txt* (binary mode)."""
    with codecs.open(path, 'wb') as writer:
        writer.write(txt)
138
+
139
+
140
# Overwrite a file with each list item followed by num_lf newline(s).
def writetxt_w_list(list, path, num_lf=1):
    """Write every element of *list* to *path*, each followed by ``num_lf``
    newline characters; the file is overwritten.

    (The parameter name ``list`` shadows the builtin but is kept for
    backward compatibility with keyword callers.)
    """
    with codecs.open(path, 'w', "utf-8") as writer:
        for item in list:
            writer.write(item + "\n" * num_lf)
146
+
147
+
148
# Append each list item to a file, followed by num_lf newline(s).
def writetxt_a_list(list, path, num_lf=2):
    """Append every element of *list* to *path*, each followed by
    ``num_lf`` newline characters.
    """
    with codecs.open(path, 'a', "utf-8") as writer:
        for item in list:
            writer.write(item + "\n" * num_lf)
154
+
155
+
156
def save_to_json(content, path):
    """Serialize *content* to *path* as indented, non-ASCII-escaped JSON."""
    with codecs.open(path, "w", "utf-8") as writer:
        json.dump(content, writer, ensure_ascii=False, indent=1)
159
+
160
+
161
def load_from_json(path):
    """Parse *path* as a single JSON document and return the result."""
    with codecs.open(path, "r", "utf-8") as reader:
        return json.load(reader)
165
+
166
+
167
# Read a file into one string, trying several encodings in turn.
def readtxt_string_all_encoding(path):
    """Read *path* as text, trying utf-8-sig, utf-8, big5, then gb2312.

    Returns the content decoded by the first encoding that succeeds; the
    final gb2312 attempt ignores undecodable bytes so it always returns.

    Fixes: the utf-8 branch called the nonexistent ``r.reacd()`` (a typo
    for ``read``), so it always fell through to big5; bare ``except:``
    clauses are narrowed to ``except Exception`` so KeyboardInterrupt and
    SystemExit are no longer swallowed.
    """
    for encoding in ("utf-8-sig", "utf-8", "big5"):
        try:
            with codecs.open(path, 'rb', encoding) as reader:
                return reader.read()
        except Exception:
            # Decoding failed -- try the next candidate encoding.
            continue
    # Last resort: gb2312, dropping undecodable bytes (original logged the path).
    print(path)
    with codecs.open(path, 'rb', "gb2312", errors='ignore') as reader:
        return reader.read()
188
+
189
+
190
def readtxt_list_all_encoding(path):
    """Read all lines of *path*, trying utf-8-sig, utf-8, big5, then gb2312.

    Returns the lines decoded by the first encoding that succeeds; the
    final gb2312 attempt ignores undecodable bytes so it always returns.

    Fixes: bare ``except:`` clauses narrowed to ``except Exception`` and
    the nested-try pyramid flattened into a loop, consistent with
    ``readtxt_string_all_encoding``.
    """
    for encoding in ("utf-8-sig", "utf-8", "big5"):
        try:
            with codecs.open(path, 'rb', encoding) as reader:
                return reader.readlines()
        except Exception:
            # Decoding failed -- try the next candidate encoding.
            continue
    with codecs.open(path, 'rb', "gb2312", errors='ignore') as reader:
        return reader.readlines()
209
+
210
+
211
# Write one JSON document per line (JSON Lines format).
def save_to_jsonl(corpus, path):
    """Dump each item of *corpus* to *path* as one JSON document per line."""
    with open(path, 'w', encoding='utf-8') as writer:
        for record in corpus:
            writer.write(json.dumps(record, ensure_ascii=False) + '\n')
217
+
218
+
219
# Read a JSON Lines file into a list of Python objects.
def load_from_jsonl(path):
    """Parse *path* as JSON Lines and return the list of decoded objects.

    Fix: the original counted the file's lines with ``iter_count`` and then
    branched on ``file_line_num > 1e5`` -- but both branches were identical,
    so the extra full pass over the file bought nothing and is removed.
    """
    corpus = []
    with open(path, 'r', encoding='utf-8') as reader:
        for line in reader:
            corpus.append(json.loads(line))
    return corpus
242
+
243
+
244
def pickle_save(data, path):
    """Serialize *data* to *path* with pickle (binary)."""
    with open(path, 'wb') as handle:
        pickle.dump(data, handle)
247
+
248
+
249
def pickle_load(path):
    """Deserialize and return the pickled object stored at *path*.

    NOTE(review): pickle executes arbitrary code on load -- only use on
    trusted files.
    """
    with open(path, 'rb') as handle:
        return pickle.load(handle)
253
+
254
+
255
# Write a DataFrame to CSV prefixed with a UTF-8 BOM (Excel-friendly).
def save_to_csv(df, save_path, index_flag=False):
    """Write *df* to *save_path* as CSV, first writing a UTF-8 BOM so that
    spreadsheet software detects the encoding; the CSV body is appended
    after the BOM.
    """
    with open(save_path, 'wb+') as out:
        out.write(codecs.BOM_UTF8)
    df.to_csv(save_path, mode='a', index=index_flag)
259
+
260
+
261
def save_to_mongo():
    # fake
    """
    Placeholder for saving data to MongoDB -- not implemented.

    An example was intended here (original docstring said "示例" / "example").
    """
    pass
268
+
269
+
270
def load_from_mongo():
    """Placeholder for loading data from MongoDB -- not implemented."""
    pass
272
+
273
+
274
def unmerge_cells_df(df) -> pd.DataFrame:
    """Forward-fill NaN cells left behind by unmerged spreadsheet cells.

    Each NaN in a column is replaced by the most recent non-NaN value above
    it (the value the merged cell displayed). Mutates *df* in place and also
    returns it.

    Fix: the original raised IndexError (``values[-1]`` on an empty list)
    when a column's FIRST cell was NaN; such leading NaNs are now kept as-is.
    """
    for column in df.columns:
        filled = []
        for value in df[column]:
            if pd.isna(value) and filled:
                filled.append(filled[-1])
            else:
                # Non-NaN value, or a leading NaN with nothing to copy from.
                filled.append(value)
        df[column] = filled
    return df