nlpertools 1.0.5__py3-none-any.whl → 1.0.6.dev0__py3-none-any.whl

This diff covers publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registries.
Files changed (43)
  1. nlpertools/__init__.py +24 -20
  2. nlpertools/algo/ac.py +18 -0
  3. nlpertools/algo/bit_ops.py +28 -0
  4. nlpertools/algo/kmp.py +94 -55
  5. nlpertools/algo/num_ops.py +12 -0
  6. nlpertools/algo/template.py +116 -0
  7. nlpertools/algo/union.py +13 -0
  8. nlpertools/data_client.py +387 -257
  9. nlpertools/data_structure/base_structure.py +109 -13
  10. nlpertools/dataprocess.py +611 -3
  11. nlpertools/default_db_config.yml +41 -0
  12. nlpertools/io/__init__.py +3 -3
  13. nlpertools/io/dir.py +54 -36
  14. nlpertools/io/file.py +277 -222
  15. nlpertools/ml.py +483 -460
  16. nlpertools/monitor/__init__.py +0 -0
  17. nlpertools/monitor/gpu.py +18 -0
  18. nlpertools/monitor/memory.py +24 -0
  19. nlpertools/movie.py +36 -0
  20. nlpertools/nlpertools_config.yml +1 -0
  21. nlpertools/{openApi.py → open_api.py} +65 -65
  22. nlpertools/other.py +364 -249
  23. nlpertools/pic.py +288 -0
  24. nlpertools/plugin.py +43 -43
  25. nlpertools/reminder.py +98 -87
  26. nlpertools/utils/__init__.py +3 -3
  27. nlpertools/utils/lazy.py +727 -0
  28. nlpertools/utils/log_util.py +20 -0
  29. nlpertools/utils/package.py +89 -76
  30. nlpertools/utils/package_v1.py +94 -0
  31. nlpertools/utils/package_v2.py +117 -0
  32. nlpertools/utils_for_nlpertools.py +93 -93
  33. nlpertools/vector_index_demo.py +108 -0
  34. nlpertools/wrapper.py +161 -96
  35. {nlpertools-1.0.5.dist-info → nlpertools-1.0.6.dev0.dist-info}/LICENSE +200 -200
  36. nlpertools-1.0.6.dev0.dist-info/METADATA +111 -0
  37. nlpertools-1.0.6.dev0.dist-info/RECORD +43 -0
  38. {nlpertools-1.0.5.dist-info → nlpertools-1.0.6.dev0.dist-info}/WHEEL +1 -1
  39. nlpertools-1.0.6.dev0.dist-info/top_level.txt +2 -0
  40. nlpertools_helper/__init__.py +10 -0
  41. nlpertools-1.0.5.dist-info/METADATA +0 -85
  42. nlpertools-1.0.5.dist-info/RECORD +0 -25
  43. nlpertools-1.0.5.dist-info/top_level.txt +0 -1
nlpertools/io/dir.py CHANGED
@@ -1,36 +1,54 @@
- #!/usr/bin/python3.8
- # -*- coding: utf-8 -*-
- # @Author : youshu.Ji
- import os
-
-
- # dir ----------------------------------------------------------------------
- def j_mkdir(name):
-     os.makedirs(name, exist_ok=True)
-
-
- def get_filename(path):
-     '''
-     Return the last component (filename) of the path.
-     :param path:
-     :return:
-     '''
-     # path = r'***/**/***.txt'
-     filename = os.path.split(path)[-1]
-     return filename
-
-
- def j_listdir(dir_name, including_dir=True):
-     # yield
-     filenames = os.listdir(dir_name)
-     for filename in filenames:
-         if including_dir:
-             yield os.path.join(dir_name, filename)
-         else:
-             yield filename
-
-
- # Merge files. TODO: not written yet
- def imgrate_files(path):
-     filenames = os.listdir(path)
-     return None
+ #!/usr/bin/python3.8
+ # -*- coding: utf-8 -*-
+ # @Author : youshu.Ji
+ import os
+ from pathlib import Path
+
+
+ # dir ----------------------------------------------------------------------
+ def j_mkdir(name):
+     os.makedirs(name, exist_ok=True)
+
+
+ def get_filename(path) -> str:
+     """
+     Return the last component (filename) of the path.
+     :param path:
+     :return:
+     """
+     # path = r'***/**/***.txt'
+     filename = os.path.split(path)[-1]
+     return filename
+
+
+ def j_listdir(dir_name, including_dir=True):
+     # yield
+     filenames = os.listdir(dir_name)
+     for filename in filenames:
+         if including_dir:
+             yield os.path.join(dir_name, filename)
+         else:
+             yield filename
+
+
+ # Merge files. TODO: not written yet
+ def imgrate_files(path):
+     filenames = os.listdir(path)
+     return None
+
+
+ def case_sensitive_path_exists(path: str, relative_path=False):
+     """
+     https://juejin.cn/post/7316725867086692391
+     Check if the path exists in a case-sensitive manner.
+     """
+     # Build a Path object
+     if relative_path:
+         path = Path.cwd() / path
+     else:
+         path = Path(path)
+     if not path.exists():
+         return False
+     # resolved_path is the file's actual name as stored on the filesystem
+     resolved_path = path.resolve()
+     return str(resolved_path) == str(path)
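
The only functional addition to dir.py is case_sensitive_path_exists. A minimal usage sketch follows (the file names are hypothetical, and the import path is assumed from the file's location in this wheel). Because the check compares the string form of Path.resolve() against the input, it answers True only when the supplied casing matches what is stored on disk; note that resolve() also expands symlinks and relative segments, so a correctly-cased but unnormalized path can still compare unequal.

    # Sketch only; /tmp/Data.txt is a hypothetical file.
    from nlpertools.io.dir import case_sensitive_path_exists

    # Exact casing: exists() and resolve() agree -> True
    print(case_sensitive_path_exists("/tmp/Data.txt"))

    # Wrong casing: on a case-insensitive filesystem (macOS/Windows) the path
    # still exists, but resolve() reports the on-disk name "Data.txt" -> False.
    # On a case-sensitive filesystem exists() already fails -> also False.
    print(case_sensitive_path_exists("/tmp/data.txt"))

    # Relative form, joined onto the current working directory before checking.
    print(case_sensitive_path_exists("Data.txt", relative_path=True))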
nlpertools/io/file.py CHANGED
@@ -1,222 +1,277 @@
- #!/usr/bin/python3.8
- # -*- coding: utf-8 -*-
- # @Author : youshu.Ji
- import codecs
- import json
- import pickle
- import random
- import time
-
- # import yaml
- from ..utils.package import *
-
-
- def read_yaml(path):
-     return yaml.load(codecs.open(path), Loader=yaml.FullLoader)
-
-
- def _merge_file(filelist, save_filename, shuffle=False):
-     contents = []
-     for file in filelist:
-         content = readtxt_list_all_strip(file)
-         contents.extend(content)
-     if shuffle:
-         random.shuffle(contents)
-     writetxt_w_list(contents, save_filename)
-
-
- # file's io ----------------------------------------------------------------------
- # Read a txt file all at once; return a list, stripping newlines
- def readtxt_list_all_strip(path, encoding='utf-8'):
-     lines = []
-     t_start = time.time()
-     with codecs.open(path, 'r', encoding) as r:
-         for ldx, line in enumerate(r.readlines()):
-             line = line.strip('\n').strip("\r")
-             lines.append(line)
-             if ldx > 1e5:
-                 t_end = time.time()
-                 print("read {} over, cost time {} ms".format(path, t_end - t_start))
-     return lines
-
-
- # Read a txt file one line at a time; return a list
- def readtxt_list_each(path):
-     lines = []
-     with codecs.open(path, 'r', 'utf-8') as r:
-         line = r.readline()
-         while line:
-             lines.append(line)
-             line = r.readline()
-     return lines
-
-
- def readtxt_list_each_strip(path):
-     """
-     yield-based version
-     """
-     with codecs.open(path, 'r', 'utf-8') as r:
-         line = r.readline()
-         while line:
-             line = r.readline()
-             yield line.strip("\n").strip("\r")
-
-
- # Read a txt file all at once; return a list
- def readtxt_list_all(path):
-     with codecs.open(path, 'r', 'utf-8') as r:
-         lines = r.readlines()
-     return lines
-
-
- # Read a byte file into a single string
- def readtxt_byte(path, encoding="utf-8"):
-     with codecs.open(path, 'rb') as r:
-         lines = r.read()
-     lines = lines.decode(encoding)
-     return lines.replace('\r', '')
-
-
- # Read a txt file into a single string
- def readtxt_string(path, encoding="utf-8"):
-     with codecs.open(path, 'r', encoding) as r:
-         lines = r.read()
-     return lines.replace('\r', '')
-
-
- # Write a txt file (overwrite)
- def writetxt_w(txt, path, r='w'):
-     with codecs.open(path, r, 'utf-8') as w:
-         w.writelines(txt)
-
-
- # Write a txt file (append)
- def writetxt_a(txt, path):
-     with codecs.open(path, 'a', 'utf-8') as w:
-         w.writelines(txt)
-
-
- def writetxt(txt, path, encoding="utf-8"):
-     with codecs.open(path, 'w', encoding) as w:
-         w.write(txt)
-
-
- def writetxt_wb(txt, path):
-     with codecs.open(path, 'wb') as w:
-         w.write(txt)
-
-
- # Write a list (overwrite)
- def writetxt_w_list(list, path, num_lf=1):
-     with codecs.open(path, 'w', "utf-8") as w:
-         for i in list:
-             w.write(i)
-             w.write("\n" * num_lf)
-
-
- # Write a list (append)
- def writetxt_a_list(list, path, num_lf=2):
-     with codecs.open(path, 'a', "utf-8") as w:
-         for i in list:
-             w.write(i)
-             w.write("\n" * num_lf)
-
-
- # Write a 2-D list (append)
- def writetxt_a_2list(list, path):
-     with codecs.open(path, 'a', "utf-8") as w:
-         for i in list:
-             writetxt_a_list(i, path)
-
-
- def SaveToJson(content, path):
-     with codecs.open(path, "w", "utf-8") as w:
-         json.dump(content, w, ensure_ascii=False, indent=1)
-
-
- def LoadFromJson(path):
-     with codecs.open(path, "r", "utf-8") as r:
-         content = json.load(r)
-     return content
-
-
- # Read a txt file into a single string, trying fallback encodings (e.g. gb2312)
- def readtxt_string_all_encoding(path):
-     try:
-         with codecs.open(path, 'rb', "utf-8-sig") as r:
-             lines = r.read()
-             return lines
-     except:
-         try:
-             with codecs.open(path, 'rb', "utf-8") as r:
-                 lines = r.read()
-                 return lines
-         except:
-             try:
-                 with codecs.open(path, 'rb', "big5") as r:
-                     lines = r.read()
-                     return lines
-             except:
-                 print(path)
-                 with codecs.open(path, 'rb', "gb2312", errors='ignore') as r:
-                     lines = r.read()
-                     return lines
-
-
- def readtxt_list_all_encoding(path):
-     try:
-         with codecs.open(path, 'rb', "utf-8-sig") as r:
-             lines = r.readlines()
-             return lines
-     except:
-         try:
-             with codecs.open(path, 'rb', "utf-8") as r:
-                 lines = r.readlines()
-                 return lines
-         except:
-             try:
-                 with codecs.open(path, 'rb', "big5") as r:
-                     lines = r.readlines()
-                     return lines
-             except:
-                 with codecs.open(path, 'rb', "gb2312", errors='ignore') as r:
-                     lines = r.readlines()
-                     return lines
-
-
- # line by line
- def save_to_json(corpus, path):
-     with open(path, 'w', encoding='utf-8') as wt:
-         for i in corpus:
-             wt.write(json.dumps(i, ensure_ascii=False))
-             wt.write('\n')
-
-
- # line by line
- def load_from_json(path):
-     with open(path, 'r', encoding='utf-8') as rd:
-         corpus = []
-         while True:
-             line = rd.readline()
-             if line:
-                 corpus.append(json.loads(line))
-             else:
-                 break
-     return corpus
-
-
- def pickle_save(data, path):
-     with open(path, 'wb') as f:
-         pickle.dump(data, f)
-
-
- def pickle_load(path):
-     with open(path, 'rb') as f:
-         data = pickle.load(f)
-     return data
-
-
- def save_to_csv(df, save_path, index_flag=False):
-     with open(save_path, 'wb+') as csvfile:
-         csvfile.write(codecs.BOM_UTF8)
-     df.to_csv(save_path, mode='a', index=index_flag)
+ #!/usr/bin/python3.8
+ # -*- coding: utf-8 -*-
+ # @Author : youshu.Ji
+ import codecs
+ import json
+ import pickle
+ import random
+ import time
+ from itertools import (takewhile, repeat)
+ import pandas as pd
+ # import omegaconf
+ # import yaml
+ from ..utils.package import *
+
+ LARGE_FILE_THRESHOLD = 1e5
+
+
+ def read_yaml(path, omega=False):
+     if omega:
+         return omegaconf.OmegaConf.load(path)
+     return yaml.load(codecs.open(path), Loader=yaml.FullLoader)
+
+
+ def _merge_file(filelist, save_filename, shuffle=False):
+     contents = []
+     for file in filelist:
+         content = readtxt_list_all_strip(file)
+         contents.extend(content)
+     if shuffle:
+         random.shuffle(contents)
+     writetxt_w_list(contents, save_filename)
+
+
+ # file's io ----------------------------------------------------------------------
+ def iter_count(file_name):
+     """
+     Fastest file line count; not sure whether it beats wc -l.
+     author: unknown
+     """
+     buffer = 1024 * 1024
+     with codecs.open(file_name, 'r', 'utf-8') as f:
+         buf_gen = takewhile(lambda x: x, (f.read(buffer) for _ in repeat(None)))
+         return sum(buf.count('\n') for buf in buf_gen)
+
+
+ # Functions that still need a progress bar:
+ """
+ readtxt_list_all_strip
+ save_to_json
+ load_from_json
+ """
+
+
+ # Read a txt file all at once; return a list, stripping newlines
+ def readtxt_list_all_strip(path, encoding='utf-8'):
+     file_line_num = iter_count(path)
+     lines = []
+     with codecs.open(path, 'r', encoding) as r:
+         if file_line_num > LARGE_FILE_THRESHOLD:
+             iter_obj = tqdm(enumerate(r.readlines()), total=file_line_num)
+         else:
+             iter_obj = enumerate(r.readlines())
+
+         for ldx, line in iter_obj:
+             lines.append(line.strip('\n').strip("\r"))
+     return lines
+
+
+ # Read a txt file one line at a time; return a list
+ def readtxt_list_each(path):
+     lines = []
+     with codecs.open(path, 'r', 'utf-8') as r:
+         line = r.readline()
+         while line:
+             lines.append(line)
+             line = r.readline()
+     return lines
+
+
+ def readtxt_list_each_strip(path):
+     """
+     yield-based version
+     """
+     with codecs.open(path, 'r', 'utf-8') as r:
+         line = r.readline()
+         while line:
+             yield line.strip("\n").strip("\r")
+             line = r.readline()
+
+
+ # Read a txt file all at once; return a list
+ def readtxt_list_all(path):
+     with codecs.open(path, 'r', 'utf-8') as r:
+         lines = r.readlines()
+     return lines
+
+
+ # Read a byte file into a single string
+ def readtxt_byte(path, encoding="utf-8"):
+     with codecs.open(path, 'rb') as r:
+         lines = r.read()
+     lines = lines.decode(encoding)
+     return lines.replace('\r', '')
+
+
+ # Read a txt file into a single string
+ def readtxt_string(path, encoding="utf-8"):
+     with codecs.open(path, 'r', encoding) as r:
+         lines = r.read()
+     return lines.replace('\r', '')
+
+
+ # Write a txt file (overwrite)
+ def writetxt_w(txt, path, r='w'):
+     with codecs.open(path, r, 'utf-8') as w:
+         w.writelines(txt)
+
+
+ # Write a txt file (append)
+ def writetxt_a(txt, path):
+     with codecs.open(path, 'a', 'utf-8') as w:
+         w.writelines(txt)
+
+
+ def writetxt(txt, path, encoding="utf-8"):
+     with codecs.open(path, 'w', encoding) as w:
+         w.write(txt)
+
+
+ def writetxt_wb(txt, path):
+     with codecs.open(path, 'wb') as w:
+         w.write(txt)
+
+
+ # Write a list (overwrite)
+ def writetxt_w_list(list, path, num_lf=1):
+     with codecs.open(path, 'w', "utf-8") as w:
+         for i in list:
+             w.write(i)
+             w.write("\n" * num_lf)
+
+
+ # Write a list (append)
+ def writetxt_a_list(list, path, num_lf=2):
+     with codecs.open(path, 'a', "utf-8") as w:
+         for i in list:
+             w.write(i)
+             w.write("\n" * num_lf)
+
+
+ def save_to_json(content, path):
+     with codecs.open(path, "w", "utf-8") as w:
+         json.dump(content, w, ensure_ascii=False, indent=1)
+
+
+ def load_from_json(path):
+     with codecs.open(path, "r", "utf-8") as r:
+         content = json.load(r)
+     return content
+
+
+ # Read a txt file into a single string, trying fallback encodings (e.g. gb2312)
+ def readtxt_string_all_encoding(path):
+     try:
+         with codecs.open(path, 'rb', "utf-8-sig") as r:
+             lines = r.read()
+             return lines
+     except:
+         try:
+             with codecs.open(path, 'rb', "utf-8") as r:
+                 lines = r.read()
+                 return lines
+         except:
+             try:
+                 with codecs.open(path, 'rb', "big5") as r:
+                     lines = r.read()
+                     return lines
+             except:
+                 print(path)
+                 with codecs.open(path, 'rb', "gb2312", errors='ignore') as r:
+                     lines = r.read()
+                     return lines
+
+
+ def readtxt_list_all_encoding(path):
+     try:
+         with codecs.open(path, 'rb', "utf-8-sig") as r:
+             lines = r.readlines()
+             return lines
+     except:
+         try:
+             with codecs.open(path, 'rb', "utf-8") as r:
+                 lines = r.readlines()
+                 return lines
+         except:
+             try:
+                 with codecs.open(path, 'rb', "big5") as r:
+                     lines = r.readlines()
+                     return lines
+             except:
+                 with codecs.open(path, 'rb', "gb2312", errors='ignore') as r:
+                     lines = r.readlines()
+                     return lines
+
+
+ # line by line
+ def save_to_jsonl(corpus, path):
+     with open(path, 'w', encoding='utf-8') as wt:
+         for i in corpus:
+             wt.write(json.dumps(i, ensure_ascii=False))
+             wt.write('\n')
+
+
+ # line by line
+ def load_from_jsonl(path):
+     file_line_num = iter_count(path)
+     if file_line_num > 1e5:
+         with open(path, 'r', encoding='utf-8') as rd:
+             corpus = []
+             while True:
+                 line = rd.readline()
+                 if line:
+                     corpus.append(json.loads(line))
+                 else:
+                     break
+         return corpus
+     else:
+         with open(path, 'r', encoding='utf-8') as rd:
+             corpus = []
+             while True:
+                 line = rd.readline()
+                 if line:
+                     corpus.append(json.loads(line))
+                 else:
+                     break
+         return corpus
+
+
+ def pickle_save(data, path):
+     with open(path, 'wb') as f:
+         pickle.dump(data, f)
+
+
+ def pickle_load(path):
+     with open(path, 'rb') as f:
+         data = pickle.load(f)
+     return data
+
+
+ def save_to_csv(df, save_path, index_flag=False):
+     with open(save_path, 'wb+') as csvfile:
+         csvfile.write(codecs.BOM_UTF8)
+     df.to_csv(save_path, mode='a', index=index_flag)
+
+
+ def save_to_mongo():
+     # fake
+     """
+     Example
+
+     """
+     pass
+
+ def load_from_mongo():
+     pass
+
+
+ def unmerge_cells_df(df) -> pd.DataFrame:
+     for column in df.columns:
+         values = []
+         for i in df[column]:
+             if pd.isna(i):
+                 values.append(values[-1])
+             else:
+                 values.append(i)
+         df[column] = values
+     return df
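
The headline changes in file.py are the buffered line counter iter_count and the rename of the line-by-line JSON helpers to save_to_jsonl/load_from_jsonl (the whole-document save_to_json/load_from_json now cover what SaveToJson/LoadFromJson did in 1.0.5). A quick round-trip sketch, assuming the module path matches this wheel's layout; the records and path are made up for the demo:

    import os
    import tempfile
    from nlpertools.io.file import save_to_jsonl, load_from_jsonl, iter_count

    corpus = [{"text": "hello"}, {"text": "world"}]  # toy records
    path = os.path.join(tempfile.gettempdir(), "demo.jsonl")

    save_to_jsonl(corpus, path)        # one JSON object per line, '\n'-terminated
    assert iter_count(path) == 2       # newline count via 1 MiB buffered reads
    assert load_from_jsonl(path) == corpus

Note that in this release load_from_jsonl calls iter_count only to choose between two identical branches, so the check costs an extra pass over the file without changing behavior; presumably a tqdm progress bar was intended for the large-file branch, as readtxt_list_all_strip now does for files over LARGE_FILE_THRESHOLD lines.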