nlpertools-1.0.5-py3-none-any.whl → nlpertools-1.0.8-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49)
  1. nlpertools/__init__.py +23 -20
  2. nlpertools/algo/ac.py +18 -0
  3. nlpertools/algo/bit_ops.py +28 -0
  4. nlpertools/algo/kmp.py +94 -55
  5. nlpertools/algo/num_ops.py +12 -0
  6. nlpertools/algo/template.py +116 -0
  7. nlpertools/algo/union.py +13 -0
  8. nlpertools/cli.py +87 -0
  9. nlpertools/data_client.py +426 -257
  10. nlpertools/data_structure/base_structure.py +109 -13
  11. nlpertools/dataprocess.py +627 -3
  12. nlpertools/default_db_config.yml +41 -0
  13. nlpertools/draw/__init__.py +0 -0
  14. nlpertools/draw/draw.py +83 -0
  15. nlpertools/draw/math_func.py +33 -0
  16. nlpertools/get_2fa.py +0 -0
  17. nlpertools/io/__init__.py +3 -3
  18. nlpertools/io/dir.py +86 -36
  19. nlpertools/io/file.py +283 -222
  20. nlpertools/ml.py +511 -460
  21. nlpertools/monitor/__init__.py +0 -0
  22. nlpertools/monitor/gpu.py +18 -0
  23. nlpertools/monitor/memory.py +24 -0
  24. nlpertools/movie.py +36 -0
  25. nlpertools/nlpertools_config.yml +1 -0
  26. nlpertools/{openApi.py → open_api.py} +65 -65 (renamed; see the import shim after this list)
  27. nlpertools/other.py +475 -249
  28. nlpertools/pic.py +288 -0
  29. nlpertools/plugin.py +43 -43
  30. nlpertools/reminder.py +98 -87
  31. nlpertools/utils/__init__.py +3 -3
  32. nlpertools/utils/lazy.py +727 -0
  33. nlpertools/utils/log_util.py +20 -0
  34. nlpertools/utils/package.py +89 -76
  35. nlpertools/utils/package_v1.py +94 -0
  36. nlpertools/utils/package_v2.py +117 -0
  37. nlpertools/utils_for_nlpertools.py +93 -93
  38. nlpertools/vector_index_demo.py +108 -0
  39. nlpertools/wrapper.py +161 -96
  40. {nlpertools-1.0.5.dist-info → nlpertools-1.0.8.dist-info}/LICENSE +200 -200
  41. nlpertools-1.0.8.dist-info/METADATA +132 -0
  42. nlpertools-1.0.8.dist-info/RECORD +49 -0
  43. {nlpertools-1.0.5.dist-info → nlpertools-1.0.8.dist-info}/WHEEL +1 -1
  44. nlpertools-1.0.8.dist-info/entry_points.txt +2 -0
  45. nlpertools-1.0.8.dist-info/top_level.txt +2 -0
  46. nlpertools_helper/__init__.py +10 -0
  47. nlpertools-1.0.5.dist-info/METADATA +0 -85
  48. nlpertools-1.0.5.dist-info/RECORD +0 -25
  49. nlpertools-1.0.5.dist-info/top_level.txt +0 -1
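Note on item 26: the openApi.py → open_api.py rename is the only module path change in this release, so imports written against 1.0.5 need updating. A minimal, hypothetical shim (assuming only that the module is importable under its package path, as the file list shows) that runs against either version:

    # Hedged sketch: tolerate the nlpertools/openApi.py -> nlpertools/open_api.py
    # rename from item 26; only the module import itself is exercised, since the
    # symbols inside the module are not shown in this diff.
    try:
        from nlpertools import open_api  # 1.0.8
    except ImportError:
        from nlpertools import openApi as open_api  # 1.0.5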
nlpertools/io/file.py CHANGED
@@ -1,222 +1,283 @@
- #!/usr/bin/python3.8
- # -*- coding: utf-8 -*-
- # @Author : youshu.Ji
- import codecs
- import json
- import pickle
- import random
- import time
-
- # import yaml
- from ..utils.package import *
-
-
- def read_yaml(path):
-     return yaml.load(codecs.open(path), Loader=yaml.FullLoader)
-
-
- def _merge_file(filelist, save_filename, shuffle=False):
-     contents = []
-     for file in filelist:
-         content = readtxt_list_all_strip(file)
-         contents.extend(content)
-     if shuffle:
-         random.shuffle(contents)
-     writetxt_w_list(contents, save_filename)
-
-
- # file's io ----------------------------------------------------------------------
- # read a txt file all at once; return a list with line breaks stripped
- def readtxt_list_all_strip(path, encoding='utf-8'):
-     lines = []
-     t_start = time.time()
-     with codecs.open(path, 'r', encoding) as r:
-         for ldx, line in enumerate(r.readlines()):
-             line = line.strip('\n').strip("\r")
-             lines.append(line)
-             if ldx > 1e5:
-                 t_end = time.time()
-                 print("read {} over, cos time {} ms".format(path, t_end - t_start))
-     return lines
-
-
- # read a txt file one line at a time; return a list
- def readtxt_list_each(path):
-     lines = []
-     with codecs.open(path, 'r', 'utf-8') as r:
-         line = r.readline()
-         while line:
-             lines.append(line)
-             line = r.readline()
-     return lines
-
-
- def readtxt_list_each_strip(path):
-     """
-     yield-based version
-     """
-     with codecs.open(path, 'r', 'utf-8') as r:
-         line = r.readline()
-         while line:
-             line = r.readline()
-             yield line.strip("\n").strip("\r")
-
-
- # read a txt file all at once; return a list
- def readtxt_list_all(path):
-     with codecs.open(path, 'r', 'utf-8') as r:
-         lines = r.readlines()
-         return lines
-
-
- # read a byte file into a single string
- def readtxt_byte(path, encoding="utf-8"):
-     with codecs.open(path, 'rb') as r:
-         lines = r.read()
-     lines = lines.decode(encoding)
-     return lines.replace('\r', '')
-
-
- # read a txt file into a single string
- def readtxt_string(path, encoding="utf-8"):
-     with codecs.open(path, 'r', encoding) as r:
-         lines = r.read()
-     return lines.replace('\r', '')
-
-
- # write a txt file (overwrite)
- def writetxt_w(txt, path, r='w'):
-     with codecs.open(path, r, 'utf-8') as w:
-         w.writelines(txt)
-
-
- # write a txt file (append)
- def writetxt_a(txt, path):
-     with codecs.open(path, 'a', 'utf-8') as w:
-         w.writelines(txt)
-
-
- def writetxt(txt, path, encoding="utf-8"):
-     with codecs.open(path, 'w', encoding) as w:
-         w.write(txt)
-
-
- def writetxt_wb(txt, path):
-     with codecs.open(path, 'wb') as w:
-         w.write(txt)
-
-
- # write a list (overwrite)
- def writetxt_w_list(list, path, num_lf=1):
-     with codecs.open(path, 'w', "utf-8") as w:
-         for i in list:
-             w.write(i)
-             w.write("\n" * num_lf)
-
-
- # write a list (append)
- def writetxt_a_list(list, path, num_lf=2):
-     with codecs.open(path, 'a', "utf-8") as w:
-         for i in list:
-             w.write(i)
-             w.write("\n" * num_lf)
-
-
- # write a 2-D list (append)
- def writetxt_a_2list(list, path):
-     with codecs.open(path, 'a', "utf-8") as w:
-         for i in list:
-             writetxt_a_list(i, path)
-
-
- def SaveToJson(content, path):
-     with codecs.open(path, "w", "utf-8") as w:
-         json.dump(content, w, ensure_ascii=False, indent=1)
-
-
- def LoadFromJson(path):
-     with codecs.open(path, "r", "utf-8") as r:
-         content = json.load(r)
-     return content
-
-
- # read a txt file into a single string, trying multiple encodings (e.g. gb2312)
- def readtxt_string_all_encoding(path):
-     try:
-         with codecs.open(path, 'rb', "utf-8-sig") as r:
-             lines = r.read()
-             return lines
-     except:
-         try:
-             with codecs.open(path, 'rb', "utf-8") as r:
-                 lines = r.read()
-                 return lines
-         except:
-             try:
-                 with codecs.open(path, 'rb', "big5") as r:
-                     lines = r.read()
-                     return lines
-             except:
-                 print(path)
-                 with codecs.open(path, 'rb', "gb2312", errors='ignore') as r:
-                     lines = r.read()
-                     return lines
-
-
- def readtxt_list_all_encoding(path):
-     try:
-         with codecs.open(path, 'rb', "utf-8-sig") as r:
-             lines = r.readlines()
-             return lines
-     except:
-         try:
-             with codecs.open(path, 'rb', "utf-8") as r:
-                 lines = r.readlines()
-                 return lines
-         except:
-             try:
-                 with codecs.open(path, 'rb', "big5") as r:
-                     lines = r.readlines()
-                     return lines
-             except:
-                 with codecs.open(path, 'rb', "gb2312", errors='ignore') as r:
-                     lines = r.readlines()
-                     return lines
-
-
- # line by line
- def save_to_json(corpus, path):
-     with open(path, 'w', encoding='utf-8') as wt:
-         for i in corpus:
-             wt.write(json.dumps(i, ensure_ascii=False))
-             wt.write('\n')
-
-
- # line by line
- def load_from_json(path):
-     with open(path, 'r', encoding='utf-8') as rd:
-         corpus = []
-         while True:
-             line = rd.readline()
-             if line:
-                 corpus.append(json.loads(line))
-             else:
-                 break
-     return corpus
-
-
- def pickle_save(data, path):
-     with open(path, 'wb') as f:
-         pickle.dump(data, f)
-
-
- def pickle_load(path):
-     with open(path, 'rb') as f:
-         data = pickle.load(f)
-     return data
-
-
- def save_to_csv(df, save_path, index_flag=False):
-     with open(save_path, 'wb+') as csvfile:
-         csvfile.write(codecs.BOM_UTF8)
-     df.to_csv(save_path, mode='a', index=index_flag)
+ #!/usr/bin/python3.8
+ # -*- coding: utf-8 -*-
+ # @Author : youshu.Ji
+ import codecs
+ import json
+ import pickle
+ import random
+ from itertools import (takewhile, repeat)
+ import pandas as pd
+ # import omegaconf
+ # import yaml
+ from ..utils.package import *
+
+ LARGE_FILE_THRESHOLD = 1e5
+
+
+ def safe_filename(filename: str) -> str:
+     for char in ['\\', '/', ':', '*', '?', '"', '<', '>', '|']:
+         filename = filename.replace(char, '_')
+     return filename
+
+
+ def read_yaml(path, omega=False):
+     if omega:
+         return omegaconf.OmegaConf.load(path)
+     return yaml.load(codecs.open(path, encoding='utf-8'), Loader=yaml.FullLoader)
+
+
+ def _merge_file(filelist, save_filename, shuffle=False):
+     contents = []
+     for file in filelist:
+         content = readtxt_list_all_strip(file)
+         contents.extend(content)
+     if shuffle:
+         random.shuffle(contents)
+     writetxt_w_list(contents, save_filename)
+
+
+ # file's io ----------------------------------------------------------------------
+ def iter_count(file_name):
+     """
+     fast line count for a file; not sure whether it beats `wc -l`
+     author: unknown
+     """
+     buffer = 1024 * 1024
+     with codecs.open(file_name, 'r', 'utf-8') as f:
+         buf_gen = takewhile(lambda x: x, (f.read(buffer) for _ in repeat(None)))
+         return sum(buf.count('\n') for buf in buf_gen)
+
+
+ # functions that still need a progress bar:
+ """
+ readtxt_list_all_strip
+ save_to_json
+ load_from_json
+ """
+
+
+ # read a txt file all at once; return a list with line breaks stripped
+ def readtxt_list_all_strip(path, encoding='utf-8') -> list:
+     file_line_num = iter_count(path)
+     lines = []
+     with codecs.open(path, 'r', encoding) as r:
+         if file_line_num > LARGE_FILE_THRESHOLD:
+             iter_obj = tqdm(enumerate(r.readlines()), total=file_line_num)
+         else:
+             iter_obj = enumerate(r.readlines())
+
+         for ldx, line in iter_obj:
+             lines.append(line.strip('\n').strip("\r"))
+     return lines
+
+
+ # read a txt file one line at a time; return a list
+ def readtxt_list_each(path) -> list:
+     lines = []
+     with codecs.open(path, 'r', 'utf-8') as r:
+         line = r.readline()
+         while line:
+             lines.append(line)
+             line = r.readline()
+     return lines
+
+
+ def readtxt_list_each_strip(path) -> list:
+     """
+     yield-based version
+     """
+     with codecs.open(path, 'r', 'utf-8') as r:
+         line = r.readline()
+         while line:
+             yield line.strip("\n").strip("\r")
+             line = r.readline()
+
+
+ # read a txt file all at once; return a list
+ def readtxt_list_all(path) -> list:
+     with codecs.open(path, 'r', 'utf-8') as r:
+         lines = r.readlines()
+         return lines
+
+
+ # read a byte file into a single string
+ def readtxt_byte(path, encoding="utf-8") -> str:
+     with codecs.open(path, 'rb') as r:
+         lines = r.read()
+     lines = lines.decode(encoding)
+     return lines.replace('\r', '')
+
+
+ # read a txt file into a single string
+ def readtxt_string(path, encoding="utf-8") -> str:
+     with codecs.open(path, 'r', encoding) as r:
+         lines = r.read()
+     return lines.replace('\r', '')
+
+
+ # write a txt file (overwrite)
+ def writetxt_w(txt, path, r='w'):
+     with codecs.open(path, r, 'utf-8') as w:
+         w.writelines(txt)
+
+
+ # write a txt file (append)
+ def writetxt_a(txt, path):
+     with codecs.open(path, 'a', 'utf-8') as w:
+         w.writelines(txt)
+
+
+ def writetxt(txt, path, encoding="utf-8"):
+     with codecs.open(path, 'w', encoding) as w:
+         w.write(txt)
+
+
+ def writetxt_wb(txt, path):
+     with codecs.open(path, 'wb') as w:
+         w.write(txt)
+
+
+ # write a list (overwrite)
+ def writetxt_w_list(list, path, num_lf=1):
+     with codecs.open(path, 'w', "utf-8") as w:
+         for i in list:
+             w.write(i)
+             w.write("\n" * num_lf)
+
+
+ # write a list (append)
+ def writetxt_a_list(list, path, num_lf=2):
+     with codecs.open(path, 'a', "utf-8") as w:
+         for i in list:
+             w.write(i)
+             w.write("\n" * num_lf)
+
+
+ def save_to_json(content, path):
+     with codecs.open(path, "w", "utf-8") as w:
+         json.dump(content, w, ensure_ascii=False, indent=1)
+
+
+ def load_from_json(path):
+     with codecs.open(path, "r", "utf-8") as r:
+         content = json.load(r)
+     return content
+
+
+ # read a txt file into a single string, trying multiple encodings (e.g. gb2312)
+ def readtxt_string_all_encoding(path):
+     try:
+         with codecs.open(path, 'rb', "utf-8-sig") as r:
+             lines = r.read()
+             return lines
+     except:
+         try:
+             with codecs.open(path, 'rb', "utf-8") as r:
+                 lines = r.read()
+                 return lines
+         except:
+             try:
+                 with codecs.open(path, 'rb', "big5") as r:
+                     lines = r.read()
+                     return lines
+             except:
+                 print(path)
+                 with codecs.open(path, 'rb', "gb2312", errors='ignore') as r:
+                     lines = r.read()
+                     return lines
+
+
+ def readtxt_list_all_encoding(path):
+     try:
+         with codecs.open(path, 'rb', "utf-8-sig") as r:
+             lines = r.readlines()
+             return lines
+     except:
+         try:
+             with codecs.open(path, 'rb', "utf-8") as r:
+                 lines = r.readlines()
+                 return lines
+         except:
+             try:
+                 with codecs.open(path, 'rb', "big5") as r:
+                     lines = r.readlines()
+                     return lines
+             except:
+                 with codecs.open(path, 'rb', "gb2312", errors='ignore') as r:
+                     lines = r.readlines()
+                     return lines
+
+
+ # line by line
+ def save_to_jsonl(corpus, path):
+     with open(path, 'w', encoding='utf-8') as wt:
+         for i in corpus:
+             wt.write(json.dumps(i, ensure_ascii=False))
+             wt.write('\n')
+
+
+ # line by line
+ def load_from_jsonl(path):
+     with open(path, 'r', encoding='utf-8') as rd:
+         corpus = []
+         while True:
+             line = rd.readline()
+             if line:
+                 corpus.append(json.loads(line))
+             else:
+                 break
+     return corpus
+
+
+ def pickle_save(data, path):
+     with open(path, 'wb') as f:
+         pickle.dump(data, f)
+
+
+ def pickle_load(path):
+     with open(path, 'rb') as f:
+         data = pickle.load(f)
+     return data
+
+
+ def save_to_csv(df, save_path, index_flag=False):
+     with open(save_path, 'wb+') as csvfile:
+         csvfile.write(codecs.BOM_UTF8)
+     df.to_csv(save_path, mode='a', index=index_flag)
+
+
+ def save_to_mongo():
+     # fake
+     """
+     Example
+
+     """
+     pass
+
+
+ def load_from_mongo():
+     pass
+
+
+ def unmerge_cells_df(df) -> pd.DataFrame:
+     for column in df.columns:
+         values = []
+         for i in df[column]:
+             if pd.isna(i):
+                 values.append(values[-1])
+             else:
+                 values.append(i)
+         df[column] = values
+     return df
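
Taken together, the rewritten io/file.py adds filename sanitising (safe_filename), buffered line counting (iter_count), optional OmegaConf YAML loading, a tqdm progress bar for large reads, and renames the line-by-line JSON helpers to save_to_jsonl/load_from_jsonl. A short usage sketch of the changed helpers, assuming nlpertools 1.0.8 and pandas are installed and that nlpertools.io.file imports cleanly (its `from ..utils.package import *` may pull optional dependencies); the file names below are illustrative only:

    # Hedged sketch against the 1.0.8 helpers shown in the diff above.
    import pandas as pd

    from nlpertools.io.file import (
        iter_count, load_from_jsonl, safe_filename, save_to_jsonl, unmerge_cells_df,
    )

    # safe_filename: characters invalid in Windows filenames become underscores.
    assert safe_filename('run:3/eval?.json') == 'run_3_eval_.json'

    # save_to_jsonl/load_from_jsonl are 1.0.5's line-by-line
    # save_to_json/load_from_json under their new names.
    records = [{"id": 1, "text": "hello"}, {"id": 2, "text": "world"}]
    save_to_jsonl(records, "demo.jsonl")
    assert load_from_jsonl("demo.jsonl") == records

    # iter_count: buffered newline count; readtxt_list_all_strip uses it to decide
    # whether a tqdm bar is worth showing (LARGE_FILE_THRESHOLD = 1e5 lines).
    assert iter_count("demo.jsonl") == len(records)

    # unmerge_cells_df: forward-fills NaN cells left behind by merged
    # spreadsheet cells (the first row must be non-empty).
    df = pd.DataFrame({"group": ["a", None, "b"], "n": [1, 2, 3]})
    print(unmerge_cells_df(df))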