nlpertools 1.0.4__py3-none-any.whl → 1.0.6.dev0__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package, as published to a supported registry, and is provided for informational purposes only.
Files changed (46)
  1. nlpertools/__init__.py +24 -11
  2. nlpertools/algo/__init__.py +0 -0
  3. nlpertools/algo/ac.py +18 -0
  4. nlpertools/algo/bit_ops.py +28 -0
  5. nlpertools/algo/kmp.py +94 -0
  6. nlpertools/algo/num_ops.py +12 -0
  7. nlpertools/algo/template.py +116 -0
  8. nlpertools/algo/union.py +13 -0
  9. nlpertools/data_client.py +387 -0
  10. nlpertools/data_structure/__init__.py +0 -0
  11. nlpertools/data_structure/base_structure.py +109 -0
  12. nlpertools/dataprocess.py +611 -3
  13. nlpertools/default_db_config.yml +41 -0
  14. nlpertools/io/__init__.py +3 -3
  15. nlpertools/io/dir.py +54 -47
  16. nlpertools/io/file.py +277 -205
  17. nlpertools/ml.py +483 -317
  18. nlpertools/monitor/__init__.py +0 -0
  19. nlpertools/monitor/gpu.py +18 -0
  20. nlpertools/monitor/memory.py +24 -0
  21. nlpertools/movie.py +36 -0
  22. nlpertools/nlpertools_config.yml +1 -0
  23. nlpertools/{openApi.py → open_api.py} +65 -62
  24. nlpertools/other.py +364 -188
  25. nlpertools/pic.py +288 -0
  26. nlpertools/plugin.py +43 -34
  27. nlpertools/reminder.py +98 -15
  28. nlpertools/template/__init__.py +0 -0
  29. nlpertools/utils/__init__.py +3 -0
  30. nlpertools/utils/lazy.py +727 -0
  31. nlpertools/utils/log_util.py +20 -0
  32. nlpertools/utils/package.py +89 -0
  33. nlpertools/utils/package_v1.py +94 -0
  34. nlpertools/utils/package_v2.py +117 -0
  35. nlpertools/utils_for_nlpertools.py +93 -0
  36. nlpertools/vector_index_demo.py +108 -0
  37. nlpertools/wrapper.py +161 -0
  38. {nlpertools-1.0.4.dist-info → nlpertools-1.0.6.dev0.dist-info}/LICENSE +200 -200
  39. nlpertools-1.0.6.dev0.dist-info/METADATA +111 -0
  40. nlpertools-1.0.6.dev0.dist-info/RECORD +43 -0
  41. {nlpertools-1.0.4.dist-info → nlpertools-1.0.6.dev0.dist-info}/WHEEL +1 -1
  42. nlpertools-1.0.6.dev0.dist-info/top_level.txt +2 -0
  43. nlpertools_helper/__init__.py +10 -0
  44. nlpertools-1.0.4.dist-info/METADATA +0 -42
  45. nlpertools-1.0.4.dist-info/RECORD +0 -15
  46. nlpertools-1.0.4.dist-info/top_level.txt +0 -1
nlpertools/io/dir.py CHANGED
@@ -1,47 +1,54 @@
- #!/usr/bin/python3.8
- # -*- coding: utf-8 -*-
- # @Author : youshu.Ji
- import codecs
- import os
- import json
- import pickle
- import time
- # dir ----------------------------------------------------------------------
- def j_mkdir(name):
-     os.makedirs(name, exist_ok=True)
-
-
- def get_filename(path):
-     '''
-     return the last component (filename) of a path
-     :param path:
-     :return:
-     '''
-     # path = r'***/**/***.txt'
-     filename = os.path.split(path)[-1]
-     return filename
-
-
- # TODO not written yet
- def walk():
-     paths = os.walk(r'F:\**\**\**\***')
-     for root, dir, files in paths:
-         for name in files:
-             if name == '***.**':
-                 os.remove(os.path.join(root, name))
-
-
- def j_listdir(dir_name, including_dir=True):
-     # yield
-     filenames = os.listdir(dir_name)
-     for filename in filenames:
-         if including_dir:
-             yield os.path.join(dir_name, filename)
-         else:
-             yield filename
-
- # merge files, TODO not written yet
- def imgrate_files(path):
-     filenames = os.listdir(path)
-     return None
-
+ #!/usr/bin/python3.8
+ # -*- coding: utf-8 -*-
+ # @Author : youshu.Ji
+ import os
+ from pathlib import Path
+
+
+ # dir ----------------------------------------------------------------------
+ def j_mkdir(name):
+     os.makedirs(name, exist_ok=True)
+
+
+ def get_filename(path) -> str:
+     """
+     return the last component (filename) of a path
+     :param path:
+     :return:
+     """
+     # path = r'***/**/***.txt'
+     filename = os.path.split(path)[-1]
+     return filename
+
+
+ def j_listdir(dir_name, including_dir=True):
+     # yield
+     filenames = os.listdir(dir_name)
+     for filename in filenames:
+         if including_dir:
+             yield os.path.join(dir_name, filename)
+         else:
+             yield filename
+
+
+ # merge files, TODO not written yet
+ def imgrate_files(path):
+     filenames = os.listdir(path)
+     return None
+
+
+ def case_sensitive_path_exists(path: str, relative_path=False):
+     """
+     https://juejin.cn/post/7316725867086692391
+     Check if the path exists in a case-sensitive manner.
+     """
+     # build a Path object
+     if relative_path:
+         path = Path.cwd() / path
+     else:
+         path = Path(path)
+     if not path.exists():
+         return False
+     # resolved_path is the file's actual name on the filesystem
+     resolved_path = path.resolve()
+     return str(resolved_path) == str(path)
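
The notable addition here is case_sensitive_path_exists, which probes the path and then compares it against its resolve()-d form, since resolve() reports the name as actually stored on disk. A minimal usage sketch, assuming the module path nlpertools.io.dir, a file stored as "README.md", and a case-insensitive filesystem where resolve() yields the on-disk casing (e.g. NTFS on Windows):

from nlpertools.io.dir import case_sensitive_path_exists

# Both probes find the file, but only the exactly-cased query survives
# the comparison against the resolved (on-disk) name.
case_sensitive_path_exists("README.md", relative_path=True)   # True
case_sensitive_path_exists("readme.MD", relative_path=True)   # False

Note that relative_path=True anchors the query at Path.cwd(); with the default relative_path=False, a relative query is compared against an absolute resolved path and can spuriously return False.
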
nlpertools/io/file.py CHANGED
@@ -1,205 +1,277 @@
- #!/usr/bin/python3.8
- # -*- coding: utf-8 -*-
- # @Author : youshu.Ji
- import codecs
- import os
- import json
- import pickle
- import time
-
- def _merge_file(filelist, save_filename, shuffle=False):
-     contents = []
-     for file in filelist:
-         content = nlpertools.readtxt_list_all_strip(file)
-         contents.extend(content)
-     if shuffle:
-         random.shuffle(contents)
-     nlpertools.writetxt_w_list(contents, save_filename)
-
-
- # file's io ----------------------------------------------------------------------
- # read a txt file all at once; return a list with newlines stripped
- def readtxt_list_all_strip(path, encoding='utf-8'):
-     lines = []
-     with codecs.open(path, 'r', encoding) as r:
-         for line in r.readlines():
-             line = line.strip('\n').strip("\r")
-             lines.append(line)
-     return lines
-
-
- # read a txt file one line at a time; return a list
- def readtxt_list_each(path):
-     lines = []
-     with codecs.open(path, 'r', 'utf-8') as r:
-         line = r.readline()
-         while line:
-             lines.append(line)
-             line = r.readline()
-     return lines
-
-
- # read a txt file one line at a time; return a list with newlines stripped
- def readtxt_list_each_strip(path):
-     lines = []
-     with codecs.open(path, 'r', 'utf-8') as r:
-         line = r.readline()
-         while line:
-             lines.append(line.strip("\n").strip("\r"))
-             line = r.readline()
-     return lines
-
-
- # read a txt file all at once; return a list
- def readtxt_list_all(path):
-     with codecs.open(path, 'r', 'utf-8') as r:
-         lines = r.readlines()
-     return lines
-
-
- # read a byte file into a single string
- def readtxt_byte(path, encoding="utf-8"):
-     with codecs.open(path, 'rb') as r:
-         lines = r.read()
-     lines = lines.decode(encoding)
-     return lines.replace('\r', '')
-
-
- # read a txt file into a single string
- def readtxt_string(path, encoding="utf-8"):
-     with codecs.open(path, 'r', encoding) as r:
-         lines = r.read()
-     return lines.replace('\r', '')
-
-
- # write a txt file (overwrite)
- def writetxt_w(txt, path, r='w'):
-     with codecs.open(path, r, 'utf-8') as w:
-         w.writelines(txt)
-
-
- # write a txt file (append)
- def writetxt_a(txt, path):
-     with codecs.open(path, 'a', 'utf-8') as w:
-         w.writelines(txt)
-
-
- def writetxt(txt, path, encoding="utf-8"):
-     with codecs.open(path, 'w', encoding) as w:
-         w.write(txt)
-
-
- def writetxt_wb(txt, path):
-     with codecs.open(path, 'wb') as w:
-         w.write(txt)
-
-
- # write a list (overwrite)
- def writetxt_w_list(list, path, num_lf=1):
-     with codecs.open(path, 'w', "utf-8") as w:
-         for i in list:
-             w.write(i)
-             w.write("\n" * num_lf)
-
-
- # write a list (append)
- def writetxt_a_list(list, path, num_lf=2):
-     with codecs.open(path, 'a', "utf-8") as w:
-         for i in list:
-             w.write(i)
-             w.write("\n" * num_lf)
-
-
- # write a 2D list (append)
- def writetxt_a_2list(list, path):
-     with codecs.open(path, 'a', "utf-8") as w:
-         for i in list:
-             writetxt_a_list(i, path)
-
-
-
-
- def SaveToJson(content, path):
-     with codecs.open(path, "w", "utf-8") as w:
-         json.dump(content, w, ensure_ascii=False, indent=1)
-
-
- def LoadFromJson(path):
-     with codecs.open(path, "r", "utf-8") as r:
-         content = json.load(r)
-     return content
-
-
- # read a txt file into a single string, trying several encodings (e.g. gb2312)
- def readtxt_string_all_encoding(path):
-     try:
-         with codecs.open(path, 'rb', "utf-8-sig") as r:
-             lines = r.read()
-             return lines
-     except:
-         try:
-             with codecs.open(path, 'rb', "utf-8") as r:
-                 lines = r.read()
-                 return lines
-         except:
-             try:
-                 with codecs.open(path, 'rb', "big5") as r:
-                     lines = r.read()
-                     return lines
-             except:
-                 print(path)
-                 with codecs.open(path, 'rb', "gb2312", errors='ignore') as r:
-                     lines = r.read()
-                     return lines
-
-
- def readtxt_list_all_encoding(path):
-     try:
-         with codecs.open(path, 'rb', "utf-8-sig") as r:
-             lines = r.readlines()
-             return lines
-     except:
-         try:
-             with codecs.open(path, 'rb', "utf-8") as r:
-                 lines = r.readlines()
-                 return lines
-         except:
-             try:
-                 with codecs.open(path, 'rb', "big5") as r:
-                     lines = r.readlines()
-                     return lines
-             except:
-                 with codecs.open(path, 'rb', "gb2312", errors='ignore') as r:
-                     lines = r.readlines()
-                     return lines
-
- # line by line
- def save_to_json(corpus, path):
-     with open(path, 'w', encoding='utf-8') as wt:
-         for i in corpus:
-             wt.write(json.dumps(i, ensure_ascii=False))
-             wt.write('\n')
-
-
- # line by line
- def load_from_json(path):
-     with open(path, 'r', encoding='utf-8') as rd:
-         corpus = []
-         while True:
-             line = rd.readline()
-             if line:
-                 corpus.append(json.loads(line))
-             else:
-                 break
-         return corpus
-
- def pickle_save(data, path):
-     with open(path, 'wb') as f:
-         pickle.dump(data, f)
-
-
- def pickle_load(path):
-     with open(path, 'rb') as f:
-         data = pickle.load(f)
-     return data
-
+ #!/usr/bin/python3.8
+ # -*- coding: utf-8 -*-
+ # @Author : youshu.Ji
+ import codecs
+ import json
+ import pickle
+ import random
+ import time
+ from itertools import (takewhile, repeat)
+ import pandas as pd
+ # import omegaconf
+ # import yaml
+ from ..utils.package import *
+
+ LARGE_FILE_THRESHOLD = 1e5
+
+
+ def read_yaml(path, omega=False):
+     if omega:
+         return omegaconf.OmegaConf.load(path)
+     return yaml.load(codecs.open(path), Loader=yaml.FullLoader)
+
+
+ def _merge_file(filelist, save_filename, shuffle=False):
+     contents = []
+     for file in filelist:
+         content = readtxt_list_all_strip(file)
+         contents.extend(content)
+     if shuffle:
+         random.shuffle(contents)
+     writetxt_w_list(contents, save_filename)
+
+
+ # file's io ----------------------------------------------------------------------
+ def iter_count(file_name):
+     """
+     fastest way to count lines in a file; not sure how it compares to wc -l
+     author: unknown
+     """
+     buffer = 1024 * 1024
+     with codecs.open(file_name, 'r', 'utf-8') as f:
+         buf_gen = takewhile(lambda x: x, (f.read(buffer) for _ in repeat(None)))
+         return sum(buf.count('\n') for buf in buf_gen)
+
+
+ # functions that still need a progress bar include:
+ """
+ readtxt_list_all_strip
+ save_to_json
+ load_from_json
+ """
+
+
+ # read a txt file all at once; return a list with newlines stripped
+ def readtxt_list_all_strip(path, encoding='utf-8'):
+     file_line_num = iter_count(path)
+     lines = []
+     with codecs.open(path, 'r', encoding) as r:
+         if file_line_num > LARGE_FILE_THRESHOLD:
+             iter_obj = tqdm(enumerate(r.readlines()), total=file_line_num)
+         else:
+             iter_obj = enumerate(r.readlines())
+
+         for ldx, line in iter_obj:
+             lines.append(line.strip('\n').strip("\r"))
+     return lines
+
+
+ # read a txt file one line at a time; return a list
+ def readtxt_list_each(path):
+     lines = []
+     with codecs.open(path, 'r', 'utf-8') as r:
+         line = r.readline()
+         while line:
+             lines.append(line)
+             line = r.readline()
+     return lines
+
+
+ def readtxt_list_each_strip(path):
+     """
+     yield-based version
+     """
+     with codecs.open(path, 'r', 'utf-8') as r:
+         line = r.readline()
+         while line:
+             yield line.strip("\n").strip("\r")
+             line = r.readline()
+
+
+ # read a txt file all at once; return a list
+ def readtxt_list_all(path):
+     with codecs.open(path, 'r', 'utf-8') as r:
+         lines = r.readlines()
+     return lines
+
+
+ # read a byte file into a single string
+ def readtxt_byte(path, encoding="utf-8"):
+     with codecs.open(path, 'rb') as r:
+         lines = r.read()
+     lines = lines.decode(encoding)
+     return lines.replace('\r', '')
+
+
+ # read a txt file into a single string
+ def readtxt_string(path, encoding="utf-8"):
+     with codecs.open(path, 'r', encoding) as r:
+         lines = r.read()
+     return lines.replace('\r', '')
+
+
+ # write a txt file (overwrite)
+ def writetxt_w(txt, path, r='w'):
+     with codecs.open(path, r, 'utf-8') as w:
+         w.writelines(txt)
+
+
+ # write a txt file (append)
+ def writetxt_a(txt, path):
+     with codecs.open(path, 'a', 'utf-8') as w:
+         w.writelines(txt)
+
+
+ def writetxt(txt, path, encoding="utf-8"):
+     with codecs.open(path, 'w', encoding) as w:
+         w.write(txt)
+
+
+ def writetxt_wb(txt, path):
+     with codecs.open(path, 'wb') as w:
+         w.write(txt)
+
+
+ # write a list (overwrite)
+ def writetxt_w_list(list, path, num_lf=1):
+     with codecs.open(path, 'w', "utf-8") as w:
+         for i in list:
+             w.write(i)
+             w.write("\n" * num_lf)
+
+
+ # write a list (append)
+ def writetxt_a_list(list, path, num_lf=2):
+     with codecs.open(path, 'a', "utf-8") as w:
+         for i in list:
+             w.write(i)
+             w.write("\n" * num_lf)
+
+
+ def save_to_json(content, path):
+     with codecs.open(path, "w", "utf-8") as w:
+         json.dump(content, w, ensure_ascii=False, indent=1)
+
+
+ def load_from_json(path):
+     with codecs.open(path, "r", "utf-8") as r:
+         content = json.load(r)
+     return content
+
+
+ # read a txt file into a single string, trying several encodings (e.g. gb2312)
+ def readtxt_string_all_encoding(path):
+     try:
+         with codecs.open(path, 'rb', "utf-8-sig") as r:
+             lines = r.read()
+             return lines
+     except:
+         try:
+             with codecs.open(path, 'rb', "utf-8") as r:
+                 lines = r.read()
+                 return lines
+         except:
+             try:
+                 with codecs.open(path, 'rb', "big5") as r:
+                     lines = r.read()
+                     return lines
+             except:
+                 print(path)
+                 with codecs.open(path, 'rb', "gb2312", errors='ignore') as r:
+                     lines = r.read()
+                     return lines
+
+
+ def readtxt_list_all_encoding(path):
+     try:
+         with codecs.open(path, 'rb', "utf-8-sig") as r:
+             lines = r.readlines()
+             return lines
+     except:
+         try:
+             with codecs.open(path, 'rb', "utf-8") as r:
+                 lines = r.readlines()
+                 return lines
+         except:
+             try:
+                 with codecs.open(path, 'rb', "big5") as r:
+                     lines = r.readlines()
+                     return lines
+             except:
+                 with codecs.open(path, 'rb', "gb2312", errors='ignore') as r:
+                     lines = r.readlines()
+                     return lines
+
+
+ # line by line
+ def save_to_jsonl(corpus, path):
+     with open(path, 'w', encoding='utf-8') as wt:
+         for i in corpus:
+             wt.write(json.dumps(i, ensure_ascii=False))
+             wt.write('\n')
+
+
+ # line by line
+ def load_from_jsonl(path):
+     file_line_num = iter_count(path)
+     if file_line_num > 1e5:
+         with open(path, 'r', encoding='utf-8') as rd:
+             corpus = []
+             while True:
+                 line = rd.readline()
+                 if line:
+                     corpus.append(json.loads(line))
+                 else:
+                     break
+             return corpus
+     else:
+         with open(path, 'r', encoding='utf-8') as rd:
+             corpus = []
+             while True:
+                 line = rd.readline()
+                 if line:
+                     corpus.append(json.loads(line))
+                 else:
+                     break
+             return corpus
+
+
+ def pickle_save(data, path):
+     with open(path, 'wb') as f:
+         pickle.dump(data, f)
+
+
+ def pickle_load(path):
+     with open(path, 'rb') as f:
+         data = pickle.load(f)
+     return data
+
+
+ def save_to_csv(df, save_path, index_flag=False):
+     with open(save_path, 'wb+') as csvfile:
+         csvfile.write(codecs.BOM_UTF8)
+     df.to_csv(save_path, mode='a', index=index_flag)
+
+
+ def save_to_mongo():
+     # fake
+     """
+     example
+
+     """
+     pass
+
+ def load_from_mongo():
+     pass
+
+
+ def unmerge_cells_df(df) -> pd.DataFrame:
+     for column in df.columns:
+         values = []
+         for i in df[column]:
+             if pd.isna(i):
+                 values.append(values[-1])
+             else:
+                 values.append(i)
+         df[column] = values
+     return df
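
Taken together, the new io.file surface splits the old line-by-line save_to_json/load_from_json pair into explicit JSONL helpers, while save_to_json/load_from_json now carry the whole-file semantics that SaveToJson/LoadFromJson had in 1.0.4, and iter_count gates progress reporting on file size. A minimal round-trip sketch, assuming the module path nlpertools.io.file; the sample records and file name are illustrative:

from nlpertools.io.file import save_to_jsonl, load_from_jsonl, iter_count

corpus = [{"text": "hello", "label": 0}, {"text": "world", "label": 1}]
save_to_jsonl(corpus, "corpus.jsonl")             # one JSON object per line
assert load_from_jsonl("corpus.jsonl") == corpus
print(iter_count("corpus.jsonl"))                 # buffered newline count: 2

Callers of the old line-by-line save_to_json/load_from_json names must switch to the *_jsonl variants when upgrading.
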