nlpertools 1.0.5__py3-none-any.whl → 1.0.6.dev0__py3-none-any.whl

Files changed (43)
  1. nlpertools/__init__.py +24 -20
  2. nlpertools/algo/ac.py +18 -0
  3. nlpertools/algo/bit_ops.py +28 -0
  4. nlpertools/algo/kmp.py +94 -55
  5. nlpertools/algo/num_ops.py +12 -0
  6. nlpertools/algo/template.py +116 -0
  7. nlpertools/algo/union.py +13 -0
  8. nlpertools/data_client.py +387 -257
  9. nlpertools/data_structure/base_structure.py +109 -13
  10. nlpertools/dataprocess.py +611 -3
  11. nlpertools/default_db_config.yml +41 -0
  12. nlpertools/io/__init__.py +3 -3
  13. nlpertools/io/dir.py +54 -36
  14. nlpertools/io/file.py +277 -222
  15. nlpertools/ml.py +483 -460
  16. nlpertools/monitor/__init__.py +0 -0
  17. nlpertools/monitor/gpu.py +18 -0
  18. nlpertools/monitor/memory.py +24 -0
  19. nlpertools/movie.py +36 -0
  20. nlpertools/nlpertools_config.yml +1 -0
  21. nlpertools/{openApi.py → open_api.py} +65 -65
  22. nlpertools/other.py +364 -249
  23. nlpertools/pic.py +288 -0
  24. nlpertools/plugin.py +43 -43
  25. nlpertools/reminder.py +98 -87
  26. nlpertools/utils/__init__.py +3 -3
  27. nlpertools/utils/lazy.py +727 -0
  28. nlpertools/utils/log_util.py +20 -0
  29. nlpertools/utils/package.py +89 -76
  30. nlpertools/utils/package_v1.py +94 -0
  31. nlpertools/utils/package_v2.py +117 -0
  32. nlpertools/utils_for_nlpertools.py +93 -93
  33. nlpertools/vector_index_demo.py +108 -0
  34. nlpertools/wrapper.py +161 -96
  35. {nlpertools-1.0.5.dist-info → nlpertools-1.0.6.dev0.dist-info}/LICENSE +200 -200
  36. nlpertools-1.0.6.dev0.dist-info/METADATA +111 -0
  37. nlpertools-1.0.6.dev0.dist-info/RECORD +43 -0
  38. {nlpertools-1.0.5.dist-info → nlpertools-1.0.6.dev0.dist-info}/WHEEL +1 -1
  39. nlpertools-1.0.6.dev0.dist-info/top_level.txt +2 -0
  40. nlpertools_helper/__init__.py +10 -0
  41. nlpertools-1.0.5.dist-info/METADATA +0 -85
  42. nlpertools-1.0.5.dist-info/RECORD +0 -25
  43. nlpertools-1.0.5.dist-info/top_level.txt +0 -1
nlpertools/io/dir.py CHANGED
@@ -1,36 +1,54 @@
- #!/usr/bin/python3.8
- # -*- coding: utf-8 -*-
- # @Author : youshu.Ji
- import os
-
-
- # dir ----------------------------------------------------------------------
- def j_mkdir(name):
-     os.makedirs(name, exist_ok=True)
-
-
- def get_filename(path):
-     '''
-     Return the final filename component of a path.
-     :param path:
-     :return:
-     '''
-     # path = r'***/**/***.txt'
-     filename = os.path.split(path)[-1]
-     return filename
-
-
- def j_listdir(dir_name, including_dir=True):
-     # yield
-     filenames = os.listdir(dir_name)
-     for filename in filenames:
-         if including_dir:
-             yield os.path.join(dir_name, filename)
-         else:
-             yield filename
-
-
- # Merge files. TODO: not written yet
- def imgrate_files(path):
-     filenames = os.listdir(path)
-     return None
+ #!/usr/bin/python3.8
+ # -*- coding: utf-8 -*-
+ # @Author : youshu.Ji
+ import os
+ from pathlib import Path
+
+
+ # dir ----------------------------------------------------------------------
+ def j_mkdir(name):
+     os.makedirs(name, exist_ok=True)
+
+
+ def get_filename(path) -> str:
+     """
+     Return the final filename component of a path.
+     :param path:
+     :return:
+     """
+     # path = r'***/**/***.txt'
+     filename = os.path.split(path)[-1]
+     return filename
+
+
+ def j_listdir(dir_name, including_dir=True):
+     # yield
+     filenames = os.listdir(dir_name)
+     for filename in filenames:
+         if including_dir:
+             yield os.path.join(dir_name, filename)
+         else:
+             yield filename
+
+
+ # Merge files. TODO: not written yet
+ def imgrate_files(path):
+     filenames = os.listdir(path)
+     return None
+
+
+ def case_sensitive_path_exists(path: str, relative_path=False):
+     """
+     https://juejin.cn/post/7316725867086692391
+     Check if the path exists in a case-sensitive manner.
+     """
+     # Build a Path object
+     if relative_path:
+         path = Path.cwd() / path
+     else:
+         path = Path(path)
+     if not path.exists():
+         return False
+     # resolved_path is the file's actual name on the filesystem
+     resolved_path = path.resolve()
+     return str(resolved_path) == str(path)
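
The notable addition to dir.py is case_sensitive_path_exists. On a case-insensitive filesystem os.path.exists matches any casing, but on Windows (Python 3.8+) Path.resolve() returns the path with its actual on-disk casing, so comparing the resolved string against the input catches a casing mismatch. A minimal usage sketch, assuming the module is importable as nlpertools.io.dir and that a file Data/Corpus.txt exists (both hypothetical):

    from nlpertools.io.dir import case_sensitive_path_exists

    # On-disk file: ./Data/Corpus.txt (hypothetical)
    case_sensitive_path_exists("Data/Corpus.txt", relative_path=True)  # True
    case_sensitive_path_exists("data/corpus.txt", relative_path=True)  # False on Windows: the path
                                                                       # exists, but its resolved
                                                                       # casing differs from the input

On a case-sensitive filesystem (most Linux setups) the second call already fails the path.exists() check, so the function returns False either way. Note also that with relative_path=False a relative input can never match, because resolve() returns an absolute path while str(path) stays relative.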
nlpertools/io/file.py CHANGED
@@ -1,222 +1,277 @@
- #!/usr/bin/python3.8
- # -*- coding: utf-8 -*-
- # @Author : youshu.Ji
- import codecs
- import json
- import pickle
- import random
- import time
-
- # import yaml
- from ..utils.package import *
-
-
- def read_yaml(path):
-     return yaml.load(codecs.open(path), Loader=yaml.FullLoader)
-
-
- def _merge_file(filelist, save_filename, shuffle=False):
-     contents = []
-     for file in filelist:
-         content = readtxt_list_all_strip(file)
-         contents.extend(content)
-     if shuffle:
-         random.shuffle(contents)
-     writetxt_w_list(contents, save_filename)
-
-
- # file's io ----------------------------------------------------------------------
- # Read a txt file all at once; return a list with line breaks stripped
- def readtxt_list_all_strip(path, encoding='utf-8'):
-     lines = []
-     t_start = time.time()
-     with codecs.open(path, 'r', encoding) as r:
-         for ldx, line in enumerate(r.readlines()):
-             line = line.strip('\n').strip("\r")
-             lines.append(line)
-             if ldx > 1e5:
-                 t_end = time.time()
-                 print("read {} over, cost time {} s".format(path, t_end - t_start))
-     return lines
-
-
- # Read a txt file one line at a time; return a list
- def readtxt_list_each(path):
-     lines = []
-     with codecs.open(path, 'r', 'utf-8') as r:
-         line = r.readline()
-         while line:
-             lines.append(line)
-             line = r.readline()
-     return lines
-
-
- def readtxt_list_each_strip(path):
-     """
-     Generator (yield) version
-     """
-     with codecs.open(path, 'r', 'utf-8') as r:
-         line = r.readline()
-         while line:
-             line = r.readline()
-             yield line.strip("\n").strip("\r")
-
-
- # Read a txt file all at once; return a list
- def readtxt_list_all(path):
-     with codecs.open(path, 'r', 'utf-8') as r:
-         lines = r.readlines()
-     return lines
-
-
- # Read a byte file into a single string
- def readtxt_byte(path, encoding="utf-8"):
-     with codecs.open(path, 'rb') as r:
-         lines = r.read()
-     lines = lines.decode(encoding)
-     return lines.replace('\r', '')
-
-
- # Read a txt file into a single string
- def readtxt_string(path, encoding="utf-8"):
-     with codecs.open(path, 'r', encoding) as r:
-         lines = r.read()
-     return lines.replace('\r', '')
-
-
- # Write a txt file (overwrite)
- def writetxt_w(txt, path, r='w'):
-     with codecs.open(path, r, 'utf-8') as w:
-         w.writelines(txt)
-
-
- # Write a txt file (append)
- def writetxt_a(txt, path):
-     with codecs.open(path, 'a', 'utf-8') as w:
-         w.writelines(txt)
-
-
- def writetxt(txt, path, encoding="utf-8"):
-     with codecs.open(path, 'w', encoding) as w:
-         w.write(txt)
-
-
- def writetxt_wb(txt, path):
-     with codecs.open(path, 'wb') as w:
-         w.write(txt)
-
-
- # Write a list (overwrite)
- def writetxt_w_list(list, path, num_lf=1):
-     with codecs.open(path, 'w', "utf-8") as w:
-         for i in list:
-             w.write(i)
-             w.write("\n" * num_lf)
-
-
- # Write a list (append)
- def writetxt_a_list(list, path, num_lf=2):
-     with codecs.open(path, 'a', "utf-8") as w:
-         for i in list:
-             w.write(i)
-             w.write("\n" * num_lf)
-
-
- # Write a 2D list (append)
- def writetxt_a_2list(list, path):
-     with codecs.open(path, 'a', "utf-8") as w:
-         for i in list:
-             writetxt_a_list(i, path)
-
-
- def SaveToJson(content, path):
-     with codecs.open(path, "w", "utf-8") as w:
-         json.dump(content, w, ensure_ascii=False, indent=1)
-
-
- def LoadFromJson(path):
-     with codecs.open(path, "r", "utf-8") as r:
-         content = json.load(r)
-     return content
-
-
- # Read a txt file into a single string, trying multiple encodings (e.g. gb2312)
- def readtxt_string_all_encoding(path):
-     try:
-         with codecs.open(path, 'rb', "utf-8-sig") as r:
-             lines = r.read()
-             return lines
-     except:
-         try:
-             with codecs.open(path, 'rb', "utf-8") as r:
-                 lines = r.read()
-                 return lines
-         except:
-             try:
-                 with codecs.open(path, 'rb', "big5") as r:
-                     lines = r.read()
-                     return lines
-             except:
-                 print(path)
-                 with codecs.open(path, 'rb', "gb2312", errors='ignore') as r:
-                     lines = r.read()
-                     return lines
-
-
- def readtxt_list_all_encoding(path):
-     try:
-         with codecs.open(path, 'rb', "utf-8-sig") as r:
-             lines = r.readlines()
-             return lines
-     except:
-         try:
-             with codecs.open(path, 'rb', "utf-8") as r:
-                 lines = r.readlines()
-                 return lines
-         except:
-             try:
-                 with codecs.open(path, 'rb', "big5") as r:
-                     lines = r.readlines()
-                     return lines
-             except:
-                 with codecs.open(path, 'rb', "gb2312", errors='ignore') as r:
-                     lines = r.readlines()
-                     return lines
-
-
- # line by line
- def save_to_json(corpus, path):
-     with open(path, 'w', encoding='utf-8') as wt:
-         for i in corpus:
-             wt.write(json.dumps(i, ensure_ascii=False))
-             wt.write('\n')
-
-
- # line by line
- def load_from_json(path):
-     with open(path, 'r', encoding='utf-8') as rd:
-         corpus = []
-         while True:
-             line = rd.readline()
-             if line:
-                 corpus.append(json.loads(line))
-             else:
-                 break
-     return corpus
-
-
- def pickle_save(data, path):
-     with open(path, 'wb') as f:
-         pickle.dump(data, f)
-
-
- def pickle_load(path):
-     with open(path, 'rb') as f:
-         data = pickle.load(f)
-     return data
-
-
- def save_to_csv(df, save_path, index_flag=False):
-     with open(save_path, 'wb+') as csvfile:
-         csvfile.write(codecs.BOM_UTF8)
-     df.to_csv(save_path, mode='a', index=index_flag)
+ #!/usr/bin/python3.8
+ # -*- coding: utf-8 -*-
+ # @Author : youshu.Ji
+ import codecs
+ import json
+ import pickle
+ import random
+ import time
+ from itertools import (takewhile, repeat)
+
+ import pandas as pd
+ # import omegaconf
+ # import yaml
+ from ..utils.package import *
+
+ LARGE_FILE_THRESHOLD = 1e5
+
+
+ def read_yaml(path, omega=False):
+     if omega:
+         return omegaconf.OmegaConf.load(path)
+     return yaml.load(codecs.open(path), Loader=yaml.FullLoader)
+
+
+ def _merge_file(filelist, save_filename, shuffle=False):
+     contents = []
+     for file in filelist:
+         content = readtxt_list_all_strip(file)
+         contents.extend(content)
+     if shuffle:
+         random.shuffle(contents)
+     writetxt_w_list(contents, save_filename)
+
+
+ # file's io ----------------------------------------------------------------------
+ def iter_count(file_name):
+     """
+     Fast line count for a file; unclear whether it beats wc -l
+     author: unknown
+     """
+     buffer = 1024 * 1024
+     with codecs.open(file_name, 'r', 'utf-8') as f:
+         buf_gen = takewhile(lambda x: x, (f.read(buffer) for _ in repeat(None)))
+         return sum(buf.count('\n') for buf in buf_gen)
+
+
+ # Functions that should get a progress bar:
+ """
+ readtxt_list_all_strip
+ save_to_json
+ load_from_json
+ """
+
+
+ # Read a txt file all at once; return a list with line breaks stripped
+ def readtxt_list_all_strip(path, encoding='utf-8'):
+     file_line_num = iter_count(path)
+     lines = []
+     with codecs.open(path, 'r', encoding) as r:
+         if file_line_num > LARGE_FILE_THRESHOLD:
+             iter_obj = tqdm(enumerate(r.readlines()), total=file_line_num)
+         else:
+             iter_obj = enumerate(r.readlines())
+
+         for ldx, line in iter_obj:
+             lines.append(line.strip('\n').strip("\r"))
+     return lines
+
+
+ # Read a txt file one line at a time; return a list
+ def readtxt_list_each(path):
+     lines = []
+     with codecs.open(path, 'r', 'utf-8') as r:
+         line = r.readline()
+         while line:
+             lines.append(line)
+             line = r.readline()
+     return lines
+
+
+ def readtxt_list_each_strip(path):
+     """
+     Generator (yield) version
+     """
+     with codecs.open(path, 'r', 'utf-8') as r:
+         line = r.readline()
+         while line:
+             yield line.strip("\n").strip("\r")
+             line = r.readline()
+
+
+ # Read a txt file all at once; return a list
+ def readtxt_list_all(path):
+     with codecs.open(path, 'r', 'utf-8') as r:
+         lines = r.readlines()
+     return lines
+
+
+ # Read a byte file into a single string
+ def readtxt_byte(path, encoding="utf-8"):
+     with codecs.open(path, 'rb') as r:
+         lines = r.read()
+     lines = lines.decode(encoding)
+     return lines.replace('\r', '')
+
+
+ # Read a txt file into a single string
+ def readtxt_string(path, encoding="utf-8"):
+     with codecs.open(path, 'r', encoding) as r:
+         lines = r.read()
+     return lines.replace('\r', '')
+
+
+ # Write a txt file (overwrite)
+ def writetxt_w(txt, path, r='w'):
+     with codecs.open(path, r, 'utf-8') as w:
+         w.writelines(txt)
+
+
+ # Write a txt file (append)
+ def writetxt_a(txt, path):
+     with codecs.open(path, 'a', 'utf-8') as w:
+         w.writelines(txt)
+
+
+ def writetxt(txt, path, encoding="utf-8"):
+     with codecs.open(path, 'w', encoding) as w:
+         w.write(txt)
+
+
+ def writetxt_wb(txt, path):
+     with codecs.open(path, 'wb') as w:
+         w.write(txt)
+
+
+ # Write a list (overwrite)
+ def writetxt_w_list(list, path, num_lf=1):
+     with codecs.open(path, 'w', "utf-8") as w:
+         for i in list:
+             w.write(i)
+             w.write("\n" * num_lf)
+
+
+ # Write a list (append)
+ def writetxt_a_list(list, path, num_lf=2):
+     with codecs.open(path, 'a', "utf-8") as w:
+         for i in list:
+             w.write(i)
+             w.write("\n" * num_lf)
+
+
+ def save_to_json(content, path):
+     with codecs.open(path, "w", "utf-8") as w:
+         json.dump(content, w, ensure_ascii=False, indent=1)
+
+
+ def load_from_json(path):
+     with codecs.open(path, "r", "utf-8") as r:
+         content = json.load(r)
+     return content
+
+
+ # Read a txt file into a single string, trying multiple encodings (e.g. gb2312)
+ def readtxt_string_all_encoding(path):
+     try:
+         with codecs.open(path, 'rb', "utf-8-sig") as r:
+             lines = r.read()
+             return lines
+     except:
+         try:
+             with codecs.open(path, 'rb', "utf-8") as r:
+                 lines = r.read()
+                 return lines
+         except:
+             try:
+                 with codecs.open(path, 'rb', "big5") as r:
+                     lines = r.read()
+                     return lines
+             except:
+                 print(path)
+                 with codecs.open(path, 'rb', "gb2312", errors='ignore') as r:
+                     lines = r.read()
+                     return lines
+
+
+ def readtxt_list_all_encoding(path):
+     try:
+         with codecs.open(path, 'rb', "utf-8-sig") as r:
+             lines = r.readlines()
+             return lines
+     except:
+         try:
+             with codecs.open(path, 'rb', "utf-8") as r:
+                 lines = r.readlines()
+                 return lines
+         except:
+             try:
+                 with codecs.open(path, 'rb', "big5") as r:
+                     lines = r.readlines()
+                     return lines
+             except:
+                 with codecs.open(path, 'rb', "gb2312", errors='ignore') as r:
+                     lines = r.readlines()
+                     return lines
+
+
+ # line by line
+ def save_to_jsonl(corpus, path):
+     with open(path, 'w', encoding='utf-8') as wt:
+         for i in corpus:
+             wt.write(json.dumps(i, ensure_ascii=False))
+             wt.write('\n')
+
+
+ # line by line
+ def load_from_jsonl(path):
+     file_line_num = iter_count(path)
+     if file_line_num > 1e5:
+         with open(path, 'r', encoding='utf-8') as rd:
+             corpus = []
+             while True:
+                 line = rd.readline()
+                 if line:
+                     corpus.append(json.loads(line))
+                 else:
+                     break
+             return corpus
+     else:
+         with open(path, 'r', encoding='utf-8') as rd:
+             corpus = []
+             while True:
+                 line = rd.readline()
+                 if line:
+                     corpus.append(json.loads(line))
+                 else:
+                     break
+             return corpus
+
+
+ def pickle_save(data, path):
+     with open(path, 'wb') as f:
+         pickle.dump(data, f)
+
+
+ def pickle_load(path):
+     with open(path, 'rb') as f:
+         data = pickle.load(f)
+     return data
+
+
+ def save_to_csv(df, save_path, index_flag=False):
+     with open(save_path, 'wb+') as csvfile:
+         csvfile.write(codecs.BOM_UTF8)
+     df.to_csv(save_path, mode='a', index=index_flag)
+
+
+ def save_to_mongo():
+     # fake
+     """
+     Example
+
+     """
+     pass
+
+
+ def load_from_mongo():
+     pass
+
+
+ def unmerge_cells_df(df) -> pd.DataFrame:
+     for column in df.columns:
+         values = []
+         for i in df[column]:
+             if pd.isna(i):
+                 values.append(values[-1])
+             else:
+                 values.append(i)
+         df[column] = values
+     return df
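
The thread running through the file.py changes is the new iter_count pre-pass: it counts newlines in 1 MiB chunks, and readtxt_list_all_strip uses the count to wrap its loop in a tqdm progress bar once a file crosses LARGE_FILE_THRESHOLD. The old line-by-line save_to_json/load_from_json pair is renamed to save_to_jsonl/load_from_jsonl, freeing the original names for whole-file JSON. A minimal round-trip sketch, assuming the functions are importable as nlpertools.io.file and that tqdm is supplied by the package's lazy-import layer (..utils.package); corpus.jsonl is a hypothetical file name:

    from nlpertools.io.file import (iter_count, load_from_jsonl,
                                    readtxt_list_all_strip, save_to_jsonl)

    records = [{"id": 1, "text": "hello"}, {"id": 2, "text": "world"}]

    # JSON Lines round trip: one JSON object per line
    save_to_jsonl(records, "corpus.jsonl")
    assert load_from_jsonl("corpus.jsonl") == records

    # iter_count counts newlines in 1 MiB chunks; readtxt_list_all_strip calls it
    # to decide whether to show a progress bar (only above LARGE_FILE_THRESHOLD lines)
    print(iter_count("corpus.jsonl"))              # 2
    print(readtxt_list_all_strip("corpus.jsonl"))  # ['{"id": 1, "text": "hello"}', ...]

Two caveats visible in the diff: both branches of load_from_jsonl are currently identical, so the line-count pass adds overhead without yet changing behavior for large files; and unmerge_cells_df, which forward-fills the blanks left by merged spreadsheet cells (roughly a per-column df.ffill()), indexes values[-1] and therefore raises IndexError when the first cell of a column is empty.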