nlpertools 1.0.5__py3-none-any.whl → 1.0.6.dev0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nlpertools/__init__.py +24 -20
- nlpertools/algo/ac.py +18 -0
- nlpertools/algo/bit_ops.py +28 -0
- nlpertools/algo/kmp.py +94 -55
- nlpertools/algo/num_ops.py +12 -0
- nlpertools/algo/template.py +116 -0
- nlpertools/algo/union.py +13 -0
- nlpertools/data_client.py +387 -257
- nlpertools/data_structure/base_structure.py +109 -13
- nlpertools/dataprocess.py +611 -3
- nlpertools/default_db_config.yml +41 -0
- nlpertools/io/__init__.py +3 -3
- nlpertools/io/dir.py +54 -36
- nlpertools/io/file.py +277 -222
- nlpertools/ml.py +483 -460
- nlpertools/monitor/__init__.py +0 -0
- nlpertools/monitor/gpu.py +18 -0
- nlpertools/monitor/memory.py +24 -0
- nlpertools/movie.py +36 -0
- nlpertools/nlpertools_config.yml +1 -0
- nlpertools/{openApi.py → open_api.py} +65 -65
- nlpertools/other.py +364 -249
- nlpertools/pic.py +288 -0
- nlpertools/plugin.py +43 -43
- nlpertools/reminder.py +98 -87
- nlpertools/utils/__init__.py +3 -3
- nlpertools/utils/lazy.py +727 -0
- nlpertools/utils/log_util.py +20 -0
- nlpertools/utils/package.py +89 -76
- nlpertools/utils/package_v1.py +94 -0
- nlpertools/utils/package_v2.py +117 -0
- nlpertools/utils_for_nlpertools.py +93 -93
- nlpertools/vector_index_demo.py +108 -0
- nlpertools/wrapper.py +161 -96
- {nlpertools-1.0.5.dist-info → nlpertools-1.0.6.dev0.dist-info}/LICENSE +200 -200
- nlpertools-1.0.6.dev0.dist-info/METADATA +111 -0
- nlpertools-1.0.6.dev0.dist-info/RECORD +43 -0
- {nlpertools-1.0.5.dist-info → nlpertools-1.0.6.dev0.dist-info}/WHEEL +1 -1
- nlpertools-1.0.6.dev0.dist-info/top_level.txt +2 -0
- nlpertools_helper/__init__.py +10 -0
- nlpertools-1.0.5.dist-info/METADATA +0 -85
- nlpertools-1.0.5.dist-info/RECORD +0 -25
- nlpertools-1.0.5.dist-info/top_level.txt +0 -1
nlpertools/io/dir.py
CHANGED
@@ -1,36 +1,54 @@
|
|
1
|
-
#!/usr/bin/python3.8
|
2
|
-
# -*- coding: utf-8 -*-
|
3
|
-
# @Author : youshu.Ji
|
4
|
-
import os
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
:
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
1
|
+
#!/usr/bin/python3.8
|
2
|
+
# -*- coding: utf-8 -*-
|
3
|
+
# @Author : youshu.Ji
|
4
|
+
import os
|
5
|
+
from pathlib import Path
|
6
|
+
|
7
|
+
|
8
|
+
# dir ----------------------------------------------------------------------
|
9
|
+
def j_mkdir(name):
    """Create directory *name* (including parents); no error if it already exists."""
    os.makedirs(name, exist_ok=True)
|
11
|
+
|
12
|
+
|
13
|
+
def get_filename(path) -> str:
    """
    Return the last component of *path* (the file name).

    :param path: a path like '***/**/***.txt'
    :return: the file name
    """
    return os.path.basename(path)
|
22
|
+
|
23
|
+
|
24
|
+
def j_listdir(dir_name, including_dir=True):
    """
    Yield the entries of *dir_name*.

    :param dir_name: directory to list
    :param including_dir: when True, yield the entry joined with dir_name;
                          otherwise yield the bare entry name
    """
    for entry in os.listdir(dir_name):
        yield os.path.join(dir_name, entry) if including_dir else entry
|
32
|
+
|
33
|
+
|
34
|
+
# TODO: merge files — not implemented yet
def imgrate_files(path):
    """Placeholder for a file-merging helper; currently only lists *path* and returns None."""
    os.listdir(path)  # kept so an invalid path still raises, as in the original
    return None
|
38
|
+
|
39
|
+
|
40
|
+
def case_sensitive_path_exists(path: str, relative_path=False):
    """
    Check whether *path* exists with exactly this casing, even on
    case-insensitive filesystems.

    Reference: https://juejin.cn/post/7316725867086692391

    NOTE(review): when relative_path is False the comparison assumes *path*
    is already absolute/resolved (e.g. no symlinks) — otherwise the string
    comparison against resolve() can report False for an existing file.
    """
    # Build a Path, anchoring at the current working directory if requested.
    candidate = (Path.cwd() / path) if relative_path else Path(path)
    if not candidate.exists():
        return False
    # resolve() yields the name as actually stored on disk; compare strings.
    actual = candidate.resolve()
    return str(actual) == str(candidate)
|
nlpertools/io/file.py
CHANGED
@@ -1,222 +1,277 @@
|
|
1
|
-
#!/usr/bin/python3.8
|
2
|
-
# -*- coding: utf-8 -*-
|
3
|
-
# @Author : youshu.Ji
|
4
|
-
import codecs
|
5
|
-
import json
|
6
|
-
import pickle
|
7
|
-
import random
|
8
|
-
import time
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
def
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
with codecs.open(path, 'r',
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
def
|
100
|
-
with codecs.open(path, '
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
#
|
144
|
-
def
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
|
156
|
-
|
157
|
-
|
158
|
-
|
159
|
-
|
160
|
-
|
161
|
-
|
162
|
-
|
163
|
-
|
164
|
-
|
165
|
-
|
166
|
-
|
167
|
-
|
168
|
-
|
169
|
-
|
170
|
-
|
171
|
-
|
172
|
-
|
173
|
-
|
174
|
-
|
175
|
-
|
176
|
-
|
177
|
-
|
178
|
-
|
179
|
-
|
180
|
-
|
181
|
-
|
182
|
-
|
183
|
-
|
184
|
-
|
185
|
-
|
186
|
-
|
187
|
-
|
188
|
-
|
189
|
-
|
190
|
-
|
191
|
-
|
192
|
-
|
193
|
-
|
194
|
-
|
195
|
-
|
196
|
-
|
197
|
-
|
198
|
-
|
199
|
-
|
200
|
-
|
201
|
-
|
202
|
-
|
203
|
-
|
204
|
-
|
205
|
-
|
206
|
-
|
207
|
-
|
208
|
-
|
209
|
-
|
210
|
-
|
211
|
-
|
212
|
-
|
213
|
-
|
214
|
-
|
215
|
-
|
216
|
-
|
217
|
-
|
218
|
-
|
219
|
-
|
220
|
-
|
221
|
-
|
222
|
-
|
1
|
+
#!/usr/bin/python3.8
|
2
|
+
# -*- coding: utf-8 -*-
|
3
|
+
# @Author : youshu.Ji
|
4
|
+
import codecs
|
5
|
+
import json
|
6
|
+
import pickle
|
7
|
+
import random
|
8
|
+
import time
|
9
|
+
from itertools import (takewhile, repeat)
|
10
|
+
import pandas as pd
|
11
|
+
# import omegaconf
|
12
|
+
# import yaml
|
13
|
+
from ..utils.package import *
|
14
|
+
|
15
|
+
LARGE_FILE_THRESHOLD = 1e5
|
16
|
+
|
17
|
+
|
18
|
+
def read_yaml(path, omega=False):
    """
    Load a YAML file.

    :param path: path to the YAML file
    :param omega: if True, parse with omegaconf instead of PyYAML
    :return: the parsed configuration object
    """
    if omega:
        return omegaconf.OmegaConf.load(path)
    # fix: the original passed codecs.open(path) directly to yaml.load and
    # never closed the handle; use a context manager so it is released.
    with codecs.open(path) as f:
        return yaml.load(f, Loader=yaml.FullLoader)
|
22
|
+
|
23
|
+
|
24
|
+
def _merge_file(filelist, save_filename, shuffle=False):
    """Concatenate the stripped lines of every file in *filelist* into one file, optionally shuffled."""
    merged = []
    for filename in filelist:
        merged.extend(readtxt_list_all_strip(filename))
    if shuffle:
        random.shuffle(merged)
    writetxt_w_list(merged, save_filename)
|
32
|
+
|
33
|
+
|
34
|
+
# file's io ----------------------------------------------------------------------
|
35
|
+
def iter_count(file_name):
    """
    Fast line count: stream the file in fixed-size chunks and sum the
    newline characters (comparable to `wc -l`).
    author: unknown
    """
    chunk_size = 1024 * 1024
    with codecs.open(file_name, 'r', 'utf-8') as f:
        # iter() with a '' sentinel stops cleanly at EOF
        return sum(chunk.count('\n') for chunk in iter(lambda: f.read(chunk_size), ''))
|
44
|
+
|
45
|
+
|
46
|
+
# 需要加入进度条的函数包括
|
47
|
+
"""
|
48
|
+
readtxt_list_all_strip
|
49
|
+
save_to_json
|
50
|
+
load_from_json
|
51
|
+
"""
|
52
|
+
|
53
|
+
|
54
|
+
# Read a whole text file into a list of lines with newlines stripped.
def readtxt_list_all_strip(path, encoding='utf-8'):
    """
    Read *path* and return all lines with trailing '\n'/'\r' stripped.
    Shows a tqdm progress bar for files over LARGE_FILE_THRESHOLD lines.

    :param path: file path
    :param encoding: text encoding, default utf-8
    :return: list of stripped lines
    """
    file_line_num = iter_count(path)
    lines = []
    with codecs.open(path, 'r', encoding) as r:
        # fix: iterate the file lazily instead of materializing readlines(),
        # and drop the unused enumerate index from the original loop
        if file_line_num > LARGE_FILE_THRESHOLD:
            iter_obj = tqdm(r, total=file_line_num)
        else:
            iter_obj = r
        for line in iter_obj:
            lines.append(line.strip('\n').strip("\r"))
    return lines
|
67
|
+
|
68
|
+
|
69
|
+
# Read a text file line by line; return the collected list (newlines kept).
def readtxt_list_each(path):
    """Return every line of *path* (utf-8), newline characters included."""
    collected = []
    with codecs.open(path, 'r', 'utf-8') as r:
        current = r.readline()
        while current:
            collected.append(current)
            current = r.readline()
    return collected
|
78
|
+
|
79
|
+
|
80
|
+
def readtxt_list_each_strip(path):
    """
    Generator version: yield each line of *path* (utf-8) with trailing
    '\n'/'\r' stripped.
    """
    with codecs.open(path, 'r', 'utf-8') as r:
        current = r.readline()
        while current:
            yield current.strip("\n").strip("\r")
            current = r.readline()
|
89
|
+
|
90
|
+
|
91
|
+
# Read a whole text file at once; return the list of raw lines.
def readtxt_list_all(path):
    """Return all lines of *path* (utf-8), newlines included."""
    with codecs.open(path, 'r', 'utf-8') as r:
        return r.readlines()
|
96
|
+
|
97
|
+
|
98
|
+
# Read a file as bytes and decode it into one string.
def readtxt_byte(path, encoding="utf-8"):
    """Read *path* in binary, decode with *encoding*, and drop '\r' characters."""
    with codecs.open(path, 'rb') as r:
        raw = r.read()
    text = raw.decode(encoding)
    return text.replace('\r', '')
|
104
|
+
|
105
|
+
|
106
|
+
# Read a text file into one string.
def readtxt_string(path, encoding="utf-8"):
    """Read *path* with *encoding* and return its content with '\r' removed."""
    with codecs.open(path, 'r', encoding) as r:
        content = r.read()
    return content.replace('\r', '')
|
111
|
+
|
112
|
+
|
113
|
+
# Write text to a file, overwriting by default.
def writetxt_w(txt, path, r='w'):
    """Write *txt* to *path* (utf-8); *r* selects the open mode, 'w' by default."""
    with codecs.open(path, r, 'utf-8') as out:
        out.writelines(txt)
|
117
|
+
|
118
|
+
|
119
|
+
# Append text to a file.
def writetxt_a(txt, path):
    """Append *txt* to *path* (utf-8)."""
    with codecs.open(path, 'a', 'utf-8') as out:
        out.writelines(txt)
|
123
|
+
|
124
|
+
|
125
|
+
def writetxt(txt, path, encoding="utf-8"):
    """Write the string *txt* to *path* using *encoding*, overwriting any content."""
    with codecs.open(path, 'w', encoding) as out:
        out.write(txt)
|
128
|
+
|
129
|
+
|
130
|
+
def writetxt_wb(txt, path):
    """Write the bytes *txt* to *path* in binary mode, overwriting any content."""
    with codecs.open(path, 'wb') as out:
        out.write(txt)
|
133
|
+
|
134
|
+
|
135
|
+
# Write a list of strings to a file, overwriting (one item per line group).
def writetxt_w_list(list, path, num_lf=1):
    """Write each item of *list* to *path* followed by *num_lf* newlines (overwrite)."""
    # NOTE: the parameter name `list` shadows the builtin; kept for caller compatibility
    with codecs.open(path, 'w', "utf-8") as out:
        for item in list:
            out.write(item)
            out.write("\n" * num_lf)
|
141
|
+
|
142
|
+
|
143
|
+
# Append a list of strings to a file (one item per line group).
def writetxt_a_list(list, path, num_lf=2):
    """Append each item of *list* to *path* followed by *num_lf* newlines."""
    # NOTE: the parameter name `list` shadows the builtin; kept for caller compatibility
    with codecs.open(path, 'a', "utf-8") as out:
        for item in list:
            out.write(item)
            out.write("\n" * num_lf)
|
149
|
+
|
150
|
+
|
151
|
+
def save_to_json(content, path):
    """Serialize *content* to *path* as JSON (indent=1, non-ASCII kept as-is)."""
    with codecs.open(path, "w", "utf-8") as handle:
        json.dump(content, handle, ensure_ascii=False, indent=1)
|
154
|
+
|
155
|
+
|
156
|
+
def load_from_json(path):
    """Read *path* (utf-8) and return the deserialized JSON object."""
    with codecs.open(path, "r", "utf-8") as handle:
        return json.load(handle)
|
160
|
+
|
161
|
+
|
162
|
+
# Read a text file into one string, trying several encodings (e.g. gb2312 files).
def readtxt_string_all_encoding(path):
    """
    Read *path* into a single string, trying utf-8-sig, utf-8 and big5 in
    turn; fall back to gb2312 with undecodable bytes ignored.

    :param path: file path
    :return: decoded file content
    """
    # fix: the original utf-8 branch called r.reacd() (typo), so that
    # fallback always raised AttributeError; also narrowed bare `except:`
    # to `except Exception:` so Ctrl-C is not swallowed.
    for encoding in ("utf-8-sig", "utf-8", "big5"):
        try:
            with codecs.open(path, 'rb', encoding) as r:
                return r.read()
        except Exception:
            continue
    # last resort, as in the original: log the path, then force gb2312
    print(path)
    with codecs.open(path, 'rb', "gb2312", errors='ignore') as r:
        return r.read()
|
183
|
+
|
184
|
+
|
185
|
+
def readtxt_list_all_encoding(path):
    """
    Read *path* into a list of lines, trying utf-8-sig, utf-8 and big5 in
    turn; fall back to gb2312 with undecodable bytes ignored.

    :param path: file path
    :return: list of decoded lines (newlines kept)
    """
    # narrowed bare `except:` to `except Exception:` so Ctrl-C is not
    # swallowed; flattened the try/except nesting of the original
    for encoding in ("utf-8-sig", "utf-8", "big5"):
        try:
            with codecs.open(path, 'rb', encoding) as r:
                return r.readlines()
        except Exception:
            continue
    with codecs.open(path, 'rb', "gb2312", errors='ignore') as r:
        return r.readlines()
|
204
|
+
|
205
|
+
|
206
|
+
# Write a corpus as JSON Lines: one JSON document per line.
def save_to_jsonl(corpus, path):
    """Serialize each item of *corpus* to *path* as one JSON line (utf-8)."""
    with open(path, 'w', encoding='utf-8') as wt:
        for record in corpus:
            wt.write(json.dumps(record, ensure_ascii=False))
            wt.write('\n')
|
212
|
+
|
213
|
+
|
214
|
+
# Read a JSON Lines file: one JSON document per line.
def load_from_jsonl(path):
    """
    Load *path* as JSON Lines and return the list of parsed objects.

    :param path: file path
    :return: list of deserialized records

    Note: the original counted lines first and then ran two byte-identical
    branches for small and large files; the duplicate branch (and the
    now-pointless pre-count pass) were removed — behavior is unchanged.
    """
    corpus = []
    with open(path, 'r', encoding='utf-8') as rd:
        for line in rd:
            corpus.append(json.loads(line))
    return corpus
|
237
|
+
|
238
|
+
|
239
|
+
def pickle_save(data, path):
    """Serialize *data* to *path* with pickle."""
    with open(path, 'wb') as handle:
        pickle.dump(data, handle)
|
242
|
+
|
243
|
+
|
244
|
+
def pickle_load(path):
    """Deserialize and return the pickled object stored at *path*."""
    # NOTE: pickle is unsafe on untrusted input — only load trusted files
    with open(path, 'rb') as handle:
        return pickle.load(handle)
|
248
|
+
|
249
|
+
|
250
|
+
def save_to_csv(df, save_path, index_flag=False):
    """Write *df* as a UTF-8 CSV prefixed with a BOM (so Excel detects the encoding)."""
    # write the BOM first, then append the CSV body to the same file
    with open(save_path, 'wb+') as handle:
        handle.write(codecs.BOM_UTF8)
    df.to_csv(save_path, mode='a', index=index_flag)
|
254
|
+
|
255
|
+
|
256
|
+
def save_to_mongo():
    # fake
    """
    Example placeholder: saving to MongoDB is not implemented yet.
    """
    pass
|
263
|
+
|
264
|
+
def load_from_mongo():
    """Placeholder: loading from MongoDB is not implemented yet."""
    pass
|
266
|
+
|
267
|
+
|
268
|
+
def unmerge_cells_df(df) -> pd.DataFrame:
    """
    Undo spreadsheet merged cells: in every column, replace each NaN with
    the value above it (forward fill), in place.

    NOTE(review): a NaN in the very first row raises IndexError, as in the
    original implementation.
    """
    for col in df.columns:
        filled = []
        for cell in df[col]:
            filled.append(filled[-1] if pd.isna(cell) else cell)
        df[col] = filled
    return df
|