nlpertools 1.0.5__py3-none-any.whl → 1.0.8__py3-none-any.whl
Sign up to get free protection for your applications and to get access to all the features.
- nlpertools/__init__.py +23 -20
- nlpertools/algo/ac.py +18 -0
- nlpertools/algo/bit_ops.py +28 -0
- nlpertools/algo/kmp.py +94 -55
- nlpertools/algo/num_ops.py +12 -0
- nlpertools/algo/template.py +116 -0
- nlpertools/algo/union.py +13 -0
- nlpertools/cli.py +87 -0
- nlpertools/data_client.py +426 -257
- nlpertools/data_structure/base_structure.py +109 -13
- nlpertools/dataprocess.py +627 -3
- nlpertools/default_db_config.yml +41 -0
- nlpertools/draw/__init__.py +0 -0
- nlpertools/draw/draw.py +83 -0
- nlpertools/draw/math_func.py +33 -0
- nlpertools/get_2fa.py +0 -0
- nlpertools/io/__init__.py +3 -3
- nlpertools/io/dir.py +86 -36
- nlpertools/io/file.py +283 -222
- nlpertools/ml.py +511 -460
- nlpertools/monitor/__init__.py +0 -0
- nlpertools/monitor/gpu.py +18 -0
- nlpertools/monitor/memory.py +24 -0
- nlpertools/movie.py +36 -0
- nlpertools/nlpertools_config.yml +1 -0
- nlpertools/{openApi.py → open_api.py} +65 -65
- nlpertools/other.py +475 -249
- nlpertools/pic.py +288 -0
- nlpertools/plugin.py +43 -43
- nlpertools/reminder.py +98 -87
- nlpertools/utils/__init__.py +3 -3
- nlpertools/utils/lazy.py +727 -0
- nlpertools/utils/log_util.py +20 -0
- nlpertools/utils/package.py +89 -76
- nlpertools/utils/package_v1.py +94 -0
- nlpertools/utils/package_v2.py +117 -0
- nlpertools/utils_for_nlpertools.py +93 -93
- nlpertools/vector_index_demo.py +108 -0
- nlpertools/wrapper.py +161 -96
- {nlpertools-1.0.5.dist-info → nlpertools-1.0.8.dist-info}/LICENSE +200 -200
- nlpertools-1.0.8.dist-info/METADATA +132 -0
- nlpertools-1.0.8.dist-info/RECORD +49 -0
- {nlpertools-1.0.5.dist-info → nlpertools-1.0.8.dist-info}/WHEEL +1 -1
- nlpertools-1.0.8.dist-info/entry_points.txt +2 -0
- nlpertools-1.0.8.dist-info/top_level.txt +2 -0
- nlpertools_helper/__init__.py +10 -0
- nlpertools-1.0.5.dist-info/METADATA +0 -85
- nlpertools-1.0.5.dist-info/RECORD +0 -25
- nlpertools-1.0.5.dist-info/top_level.txt +0 -1
nlpertools/other.py
CHANGED
@@ -1,249 +1,475 @@
|
|
1
|
-
#!/usr/bin/python3.8
|
2
|
-
# -*- coding: utf-8 -*-
|
3
|
-
# @Author : youshu.Ji
|
4
|
-
import
|
5
|
-
import
|
6
|
-
import
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
from .
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
#
|
18
|
-
#
|
19
|
-
#
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
"""
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
#
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
#
|
95
|
-
#
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
def
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
|
156
|
-
|
157
|
-
|
158
|
-
|
159
|
-
|
160
|
-
|
161
|
-
|
162
|
-
|
163
|
-
|
164
|
-
|
165
|
-
|
166
|
-
|
167
|
-
|
168
|
-
|
169
|
-
|
170
|
-
|
171
|
-
|
172
|
-
|
173
|
-
|
174
|
-
|
175
|
-
|
176
|
-
|
177
|
-
|
178
|
-
|
179
|
-
|
180
|
-
|
181
|
-
|
182
|
-
|
183
|
-
|
184
|
-
|
185
|
-
|
186
|
-
|
187
|
-
|
188
|
-
|
189
|
-
|
190
|
-
|
191
|
-
|
192
|
-
|
193
|
-
|
194
|
-
|
195
|
-
|
196
|
-
|
197
|
-
|
198
|
-
|
199
|
-
|
200
|
-
|
201
|
-
|
202
|
-
|
203
|
-
|
204
|
-
|
205
|
-
|
206
|
-
|
207
|
-
|
208
|
-
|
209
|
-
|
210
|
-
|
211
|
-
|
212
|
-
|
213
|
-
|
214
|
-
|
215
|
-
|
216
|
-
|
217
|
-
|
218
|
-
|
219
|
-
|
220
|
-
|
221
|
-
|
222
|
-
|
223
|
-
|
224
|
-
|
225
|
-
|
226
|
-
|
227
|
-
|
228
|
-
|
229
|
-
|
230
|
-
|
231
|
-
|
232
|
-
|
233
|
-
|
234
|
-
|
235
|
-
|
236
|
-
|
237
|
-
|
238
|
-
|
239
|
-
|
240
|
-
|
241
|
-
|
242
|
-
|
243
|
-
|
244
|
-
|
245
|
-
|
246
|
-
|
247
|
-
|
248
|
-
|
249
|
-
|
1
|
+
#!/usr/bin/python3.8
|
2
|
+
# -*- coding: utf-8 -*-
|
3
|
+
# @Author : youshu.Ji
|
4
|
+
import itertools
|
5
|
+
import os
|
6
|
+
import re
|
7
|
+
import string
|
8
|
+
import subprocess
|
9
|
+
import threading
|
10
|
+
from concurrent.futures import ThreadPoolExecutor
|
11
|
+
from functools import reduce
|
12
|
+
import math
|
13
|
+
import datetime
|
14
|
+
import difflib
|
15
|
+
import psutil
|
16
|
+
from .io.file import writetxt_w_list, writetxt_a
|
17
|
+
# import numpy as np
|
18
|
+
# import psutil
|
19
|
+
# import pyquery as pq
|
20
|
+
# import requests
|
21
|
+
# import torch
|
22
|
+
# from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
|
23
|
+
# from sklearn.metrics import precision_recall_fscore_support
|
24
|
+
# from tqdm import tqdm
|
25
|
+
# from win32evtlogutil import langid
|
26
|
+
from .utils.package import *
|
27
|
+
|
28
|
+
# Punctuation characters used in Chinese text, one character per list element.
CHINESE_PUNCTUATION = list(',。;:‘’“”!?《》「」【】<>()、')
# Punctuation characters used in English text, one character per list element.
ENGLISH_PUNCTUATION = list(',.;:\'"!?<>()')
# Additional ASCII symbols that are in neither of the two lists above.
OTHER_PUNCTUATION = list('!@#$%^&*')
|
31
|
+
|
32
|
+
|
33
|
+
def get_diff_parts(str1, str2):
    """
    Return the parts in which two sequences differ.

    :param str1: first sequence (typically a string)
    :param str2: second sequence (typically a string)
    :return: list of ``(tag, part_of_str1, part_of_str2)`` tuples, one per
        non-equal opcode reported by ``difflib.SequenceMatcher``; ``tag`` is
        ``'replace'``, ``'delete'`` or ``'insert'``
    """
    matcher = difflib.SequenceMatcher(None, str1, str2)
    # Keep every opcode except 'equal'; slice both inputs with the opcode ranges.
    return [
        (tag, str1[i1:i2], str2[j1:j2])
        for tag, i1, i2, j1, j2 in matcher.get_opcodes()
        if tag in ('replace', 'delete', 'insert')
    ]
|
44
|
+
|
45
|
+
|
46
|
+
def run_cmd_with_timeout(cmd, timeout):
    """
    Run a shell command and give up if it runs longer than ``timeout`` seconds.

    Reference: https://juejin.cn/post/7391703459803086848

    :param cmd: command line; it is executed through the shell (``shell=True``),
        so it must not be built from untrusted input
    :param timeout: maximum run time in seconds; ``None`` waits indefinitely
    :return: ``(True, stdout)`` when the command exits with status 0,
        ``(False, stdout)`` when it exits with a non-zero status, and
        ``(False, message)`` when it was terminated because of the timeout
    """
    process = subprocess.Popen(cmd, shell=True, encoding="utf-8", errors="ignore", stdout=subprocess.PIPE,
                               stderr=subprocess.PIPE)
    try:
        # communicate() drains both pipes, so a chatty command cannot block on a full pipe.
        output, _ = process.communicate(timeout=timeout)
    except subprocess.TimeoutExpired:
        print(f"Terminating {cmd}")
        process.terminate()
        # Reap the process and release the pipes after terminating it.
        process.communicate()
        print("Terminated successfully")
        return False, f"{cmd} is running over {timeout}s"
    # Callers receive stdout in both the success and the failure case.
    return process.returncode == 0, output
|
76
|
+
|
77
|
+
|
78
|
+
def print_three_line_table(df):
    """
    Render ``df`` as a "three-line" HTML table, print the markup and open it in a browser.

    The style block plus the table markup is printed to stdout. A complete
    HTML document (head, style and table) is written to ``temp.html`` in the
    current working directory and opened with ``webbrowser``.

    :param df: object exposing ``to_html(index=False)`` (e.g. a pandas DataFrame)
    :return: None
    """
    # TODO: support highlighting cells in red, as can be done in Excel
    import webbrowser

    # Example input:
    # import pandas as pd
    # data = {'from_pc': ['valid_data', 'illegal_char', 'more_data'],
    #         'rom_pc': ['another_valid_data', 'illegal_char', 'data']}
    # df = pd.DataFrame(data)

    # Convert the DataFrame to an HTML table and switch off the default border.
    html_table = df.to_html(index=False)
    html_table = html_table.replace('border="1"', 'border="0"')

    # Widths (in px) of the top rule, the rule under the header and the bottom rule.
    first_line_px = str(2)
    second_line_px = str(1)
    third_line_px = str(2)

    head = """<!DOCTYPE html>
<html lang="zh">
<head>
    <meta charset="UTF-8">
    <title>页面标题</title>
</head>"""
    # CSS for the three-line table: thead = header, tr = row, td = cell.
    style = """
<style>

    table {
        border-collapse: collapse;
    }

    tr, td, th {
        text-align: center; /* 水平居中文本 */
        vertical-align: middle; /* 垂直居中文本 */
    }
    thead tr {
        border-top: (first_line_px)px solid black;
        border-bottom: (second_line_px)px solid black;
    }

    thead th {
        border-bottom: (second_line_px)px solid black;
    }

    tbody tr td {
        border-bottom: 0px solid black;
    }

    tbody tr:last-child td {
        border-bottom: (third_line_px)px solid black;
    }
</style>"""
    style = style.replace("(first_line_px)", first_line_px).replace("(second_line_px)", second_line_px).replace(
        "(third_line_px)", third_line_px)

    # Combine the CSS with the table markup.
    html = f"{style}{html_table}"
    print(html)

    # Save a complete document; the head declares UTF-8, so write the file as UTF-8 as well.
    temp_file_path = "temp.html"
    document = f"{head}\n<body>\n{html}\n</body>\n</html>"
    with open(temp_file_path, "w", encoding="utf-8") as f:
        f.write(document)
    webbrowser.open('file://' + os.path.realpath(temp_file_path))
|
142
|
+
|
143
|
+
|
144
|
+
def jprint(obj, depth=0):
    """
    Pretty-print a nested structure of dicts and lists.

    Each dict key is printed on its own line, framed by dashes and prefixed
    with one ``|`` per nesting level; list items and dict values are printed
    one level deeper; any other object is printed with ``print``.

    :param obj: dict, list or any printable object
    :param depth: current nesting level (callers normally leave the default)
    :return: None
    """
    if isinstance(obj, dict):
        # The separator shrinks with depth but never disappears entirely.
        sep = "-" * max(1, 10 - depth * 3)
        for k, v in obj.items():
            print(depth * "|", sep, k, sep)
            # Pass the depth on so that nested dicts are indented too.
            jprint(v, depth + 1)
    elif isinstance(obj, list):
        for v in obj:
            jprint(v, depth + 1)
    else:
        print(obj)
|
155
|
+
|
156
|
+
|
157
|
+
def print_split(sign="=", num=20):
    """
    Print a separator line made of ``sign`` repeated ``num`` times.

    :param sign: string to repeat
    :param num: number of repetitions
    :return: None
    """
    print(sign * num)
|
159
|
+
|
160
|
+
|
161
|
+
def seed_everything(seed=7777777):
    """
    Seed the random number generators of Python, NumPy and PyTorch.

    :param seed: integer seed; the default keeps the value that used to be hard-coded
    :return: None
    """
    import random

    import torch

    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)  # CPU generator
    torch.cuda.manual_seed(seed)  # current CUDA device
    torch.cuda.manual_seed_all(seed)  # all CUDA devices
|
169
|
+
|
170
|
+
|
171
|
+
def sent_email(mail_user, mail_pass, receiver, title, content, attach_path=None):
    """
    Send a plain-text e-mail, optionally with one attachment, through smtp.qq.com (port 25).

    :param mail_user: SMTP login name; also used as the sender address
    :param mail_pass: SMTP password (or authorization code)
    :param receiver: recipient address
    :param title: subject line
    :param content: plain-text body
    :param attach_path: optional path of a file to attach
    :return: None; success or failure is reported on stdout
    """
    import smtplib
    from email.mime.multipart import MIMEMultipart
    from email.mime.text import MIMEText
    from email.mime.application import MIMEApplication

    mail_host = 'smtp.qq.com'
    sender = mail_user

    message = MIMEMultipart()
    message.attach(MIMEText(content, 'plain', 'utf-8'))
    if attach_path:
        with open(attach_path, 'rb') as attach_file:
            # MIMEApplication already sets Content-Type: application/octet-stream.
            attachment = MIMEApplication(attach_file.read())
        # The (charset, language, value) tuple lets non-ASCII file names survive (RFC 2231).
        attachment.add_header('Content-Disposition', 'attachment',
                              filename=('utf-8', '', os.path.basename(attach_path)))
        message.attach(attachment)
    message['Subject'] = title
    message['From'] = sender
    message['To'] = receiver

    try:
        # The context manager sends QUIT and closes the socket even when a step fails.
        with smtplib.SMTP() as smtp_obj:
            smtp_obj.connect(mail_host, 25)
            smtp_obj.login(mail_user, mail_pass)
            smtp_obj.sendmail(sender, receiver, message.as_string())
        print('send email success')
    except smtplib.SMTPException as e:
        print('send failed', e)
|
203
|
+
|
204
|
+
|
205
|
+
def convert_np_to_py(obj):
    """
    Recursively replace NumPy scalars inside dicts and lists with plain Python values.

    :param obj: dict, list, NumPy scalar or any other object
    :return: a new dict/list with converted contents, the Python equivalent of
        a NumPy scalar (``np.float64`` -> ``float``, ``np.int64`` -> ``int``, ...),
        or ``obj`` itself for every other type (including ``np.ndarray``)
    """
    if isinstance(obj, dict):
        return {k: convert_np_to_py(v) for k, v in obj.items()}
    if isinstance(obj, list):
        return [convert_np_to_py(v) for v in obj]
    if isinstance(obj, np.generic):
        # .item() returns the closest built-in Python scalar.
        return obj.item()
    return obj
|
214
|
+
|
215
|
+
|
216
|
+
def snake_to_camel(s: str) -> str:
    """
    Convert a snake_case name to (upper) camel case, e.g. ``hello_world`` -> ``HelloWorld``.

    The conversion relies on ``str.title``, so every letter that follows a
    non-letter is upper-cased and all other letters are lower-cased.

    author: u

    :param s: snake case variable
    :return: the name with underscores removed and words capitalised
    """
    return s.title().replace("_", "")
|
224
|
+
|
225
|
+
|
226
|
+
def camel_to_snake(s: str) -> str:
    """
    Convert a camel case name to snake_case, e.g. ``helloWorld`` -> ``hello_world``.

    An underscore is inserted before every upper-case character except the
    first character of the string, then the result is lower-cased.

    :param s: camel case variable
    :return: the snake case name; an empty string is returned unchanged
    """
    head, tail = s[:1], s[1:]
    return (head + "".join("_" + ch if ch.isupper() else ch for ch in tail)).lower()
|
233
|
+
|
234
|
+
|
235
|
+
# other ----------------------------------------------------------------------
|
236
|
+
# 统计词频
|
237
|
+
def calc_word_count(list_word, mode, path='tempcount.txt', sort_id=1, is_reverse=True):
    """
    Count word frequencies and write, print or return the sorted result.

    :param list_word: iterable of hashable items to count
    :param mode: ``'w'`` appends ``word<TAB>count`` lines to ``path`` via ``writetxt_a``,
        ``'p'`` prints the same lines, ``'u'`` returns the sorted list
    :param path: output file used by mode ``'w'``
    :param sort_id: tuple index to sort by (0 = word, 1 = count)
    :param is_reverse: sort in descending order when True
    :return: list of ``(word, count)`` tuples for mode ``'u'``, otherwise None
    """
    from collections import Counter

    # Counter keeps first-seen order, so ties sort the same way as a hand-filled dict.
    word_count = Counter(list_word)
    word_dict_sort = sorted(word_count.items(), key=lambda x: x[sort_id], reverse=is_reverse)
    if mode == 'w':
        for word, count in word_dict_sort:
            writetxt_a(str(word) + '\t' + str(count) + '\n', path)
    elif mode == 'p':
        for word, count in word_dict_sort:
            print(str(word) + '\t' + str(count))
    elif mode == 'u':
        return word_dict_sort
|
253
|
+
|
254
|
+
|
255
|
+
# 字典去重
|
256
|
+
def dupl_dict(dict_list, key):
    """
    De-duplicate a list of dicts by the value stored under ``key``.

    The first dict seen for each value is kept; order is preserved.

    :param dict_list: sized iterable of dicts that all contain ``key``
    :param key: key whose value identifies duplicates
    :return: new list with the first occurrence of every distinct value
    """
    new_dict_list = []
    seen_hashable = set()
    seen_unhashable = []  # fallback for values such as lists that cannot go into a set
    print('去重中...')
    for item in tqdm(dict_list):
        value = item[key]
        try:
            if value in seen_hashable:
                continue
            seen_hashable.add(value)
        except TypeError:
            if value in seen_unhashable:
                continue
            seen_unhashable.append(value)
        new_dict_list.append(item)
    return new_dict_list
|
264
|
+
|
265
|
+
|
266
|
+
def multi_thread_run(_task, data):
    """
    Apply ``_task`` to every element of ``data`` in a thread pool, with a progress bar.

    :param _task: callable taking one element of ``data``
    :param data: iterable of inputs
    :return: list of results, in the same order as ``data``
    """
    # Generators have no len(); tqdm accepts total=None in that case.
    total = len(data) if hasattr(data, "__len__") else None
    with ThreadPoolExecutor() as executor:
        result = list(tqdm(executor.map(_task, data), total=total))
    return result
|
270
|
+
|
271
|
+
|
272
|
+
def del_special_char(sentence):
    """
    Remove a fixed set of special characters from ``sentence``.

    Removed characters: U+FEFF, U+00A0, U+3000 and U+E627.

    :param sentence: input string
    :return: the string without those characters
    """
    special_chars = ['\ufeff', '\xa0', '\u3000', '\ue627']
    # One pass over the string instead of one replace() call per character.
    return sentence.translate({ord(ch): None for ch in special_chars})
|
277
|
+
|
278
|
+
|
279
|
+
def en_pun_2_zh_pun(sentence):
    """
    Placeholder for converting English punctuation to Chinese punctuation.

    Not implemented: the function does nothing and returns None.

    :param sentence: input string (currently unused)
    :return: None
    """
    # TODO: not written yet because of the quotation-mark problem
    return None
|
283
|
+
|
284
|
+
|
285
|
+
def spider(url):
    """
    Fetch an article and return its title and body text.

    Only URLs containing ``baijiahao`` are handled.

    :param url: page URL
    :return: ``"<title>\\n<body>"`` for supported URLs, otherwise None
    """
    if 'baijiahao' not in url:
        return None
    content = requests.get(url)
    html = pq.PyQuery(content.text)
    title = html('.index-module_articleTitle_28fPT').text()
    res = html('.index-module_articleWrap_2Zphx').text()
    # Drop the trailing report/feedback label. str.rstrip() would strip a *set*
    # of characters and could eat real content, so remove the exact suffix only.
    suffix = '举报/反馈'
    if res.endswith(suffix):
        res = res[:-len(suffix)]
    return '{}\n{}'.format(title, res)
|
298
|
+
|
299
|
+
|
300
|
+
def eda(sentence):
    """
    Send ``sentence`` to the EDA HTTP service and return the ``eda`` field of its JSON reply.

    NOTE: the URL below is a placeholder and has to be replaced with a real endpoint.

    :param sentence: text to send
    :return: value stored under ``'eda'`` in the JSON response
    """
    url = 'https://x.x.x.x:x/eda'
    res = requests.post(url, json={"sentence": sentence})
    return res.json()['eda']
|
305
|
+
|
306
|
+
|
307
|
+
def find_language(text):
    """
    Guess whether ``text`` is English from the share of ASCII letters in its first 50 characters.

    :param text: input string
    :return: ``"en"`` when more than half of the inspected characters are ASCII
        letters, otherwise ``"not en"`` (also for empty input)
    """
    # TODO: replace with an open-source language-detection package
    passage = text[:50]
    if not passage:
        # Nothing to inspect; also avoids dividing by zero below.
        return "not en"
    letter_count = sum(1 for ch in passage if ch in string.ascii_letters)
    if letter_count / len(passage) > 0.5:
        return "en"
    return "not en"
|
323
|
+
|
324
|
+
|
325
|
+
def print_prf(y_true, y_pred, label=None):
    """
    Print precision, recall and F-score for every label.

    One line per label is printed as ``label: precision recall fscore`` with
    five decimal places.

    :param y_true: ground-truth labels
    :param y_pred: predicted labels
    :param label: labels to report; when None, all labels that occur in
        ``y_true`` or ``y_pred`` are used in sorted order
    :return: None
    """
    if label is None:
        label = sorted(set(y_true) | set(y_pred))
    result = precision_recall_fscore_support(y_true=y_true, y_pred=y_pred, labels=label)

    for idx, name in enumerate(label):
        # result holds (precision, recall, fscore, support) arrays; support is not printed.
        scores = ['%.5f' % metric[idx] for metric in result]
        print('{}: {} {} {}'.format(name, *scores[:3]))
|
340
|
+
|
341
|
+
|
342
|
+
def print_cpu():
    """
    Print the CPU count reported by ``psutil.cpu_count()``.

    :return: None
    """
    print(psutil.cpu_count())
|
346
|
+
|
347
|
+
|
348
|
+
def stress_test(func, ipts):
    """
    Call ``func`` on every input concurrently in a thread pool, with a progress bar.

    :param func: callable taking one element of ``ipts``
    :param ipts: iterable of inputs
    :return: list of results, in the same order as ``ipts``
    """
    # Generators have no len(); tqdm accepts total=None in that case.
    total = len(ipts) if hasattr(ipts, "__len__") else None
    with ThreadPoolExecutor() as executor:
        results = list(tqdm(executor.map(func, ipts), total=total))
    return results
|
352
|
+
|
353
|
+
|
354
|
+
def squeeze_list(high_dim_list):
    """
    Flatten an iterable of iterables by one level.

    :param high_dim_list: iterable whose elements are themselves iterable
    :return: flat list containing the elements of every inner iterable, in order
    """
    return list(itertools.chain.from_iterable(high_dim_list))
|
356
|
+
|
357
|
+
|
358
|
+
def unsqueeze_list(flatten_list, each_element_len):
    """
    Split a flat list into consecutive chunks of ``each_element_len`` items.

    The last chunk may be shorter when the length is not a multiple of the
    chunk size (earlier versions silently dropped those trailing items).

    :param flatten_list: list (or other sliceable sequence) to split
    :param each_element_len: chunk size, must be positive
    :return: list of chunks
    """
    return [flatten_list[start:start + each_element_len]
            for start in range(0, len(flatten_list), each_element_len)]
|
363
|
+
|
364
|
+
def split_list(input_list, chunk_size):
    """
    Split a list into consecutive chunks of at most ``chunk_size`` items.

    :param input_list: list (or other sliceable sequence) to split
    :param chunk_size: maximum chunk length
    :return: list of chunks; the last one may be shorter
    """
    return [input_list[i:i + chunk_size] for i in range(0, len(input_list), chunk_size)]
|
367
|
+
|
368
|
+
def auto_close():
    """
    Keep the desktop session looking active and schedule a shutdown for 23:30.

    Written for WeCom, which shows the user as away after 15 minutes: the
    mouse is moved to a fixed screen position and clicked every 840 seconds.
    Before the loop starts, a one-off scheduled task named ``shut`` is
    registered with ``schtasks`` to run ``shutdown -s -f`` at 23:30.

    The function never returns.
    """
    import pyautogui as pg
    import time
    import os

    cmd = 'schtasks /create /tn shut /tr "shutdown -s -f" /sc once /st 23:30'
    os.system(cmd)
    while 1:
        # presumably (x=970, y=17) with a 2 s move duration; confirm against pyautogui.moveTo
        pg.moveTo(970, 17, 2)
        pg.click()
        time.sleep(840)
|
381
|
+
|
382
|
+
|
383
|
+
def tf_idf(corpus, save_path):
    """
    Compute corpus-wide TF-IDF totals per word and write them to ``save_path``.

    For every word the TF-IDF weights over all documents are summed; the
    words are written one per line as ``word score``, highest score first.

    :param corpus: iterable of documents accepted by ``CountVectorizer.fit_transform``
    :param save_path: output path passed to ``writetxt_w_list``
    :return: None
    """
    vectorizer = CountVectorizer()  # turns the documents into a term-count matrix
    transformer = TfidfTransformer()  # turns term counts into TF-IDF weights
    tfidf = transformer.fit_transform(vectorizer.fit_transform(corpus))
    # get_feature_names() was removed in scikit-learn 1.2; use its replacement when available.
    if hasattr(vectorizer, "get_feature_names_out"):
        words = vectorizer.get_feature_names_out()
    else:
        words = vectorizer.get_feature_names()
    weights = tfidf.toarray()  # weights[i][j] is the TF-IDF weight of word j in document i

    totals = {}
    for row in weights:
        for word, value in zip(words, row):
            if value != 0:  # skip words that do not occur in this document
                totals[word] = totals.get(word, 0.0) + float(value)

    sorted_tfidf = sorted(totals.items(), key=lambda item: item[1], reverse=True)
    to_write = ['{} {}'.format(word, score) for word, score in sorted_tfidf]
    writetxt_w_list(to_write, save_path, num_lf=1)
|
403
|
+
|
404
|
+
|
405
|
+
class GaussDecay(object):
    """
    Gaussian decay scoring that blends a time-based decay with a raw relevance score.

    Only the time-based variant is implemented, and it always uses built-in
    values for ``scale`` and ``offset`` (see ``translate``).
    """

    def __init__(self, origin='2022-08-02', scale='90d', offset='5d', decay=0.5, task="time"):
        """
        :param origin: stored on the instance; not used by the current calculations
        :param scale: passed to ``translate``, which currently ignores it
        :param offset: passed to ``translate``, which currently ignores it
        :param decay: decay factor used in ``calc_gauss``; its logarithm is a divisor there,
            so a value of 1 raises ZeroDivisionError
        :param task: task name, stored on the instance
        """
        self.origin = origin
        self.task = task
        self.scale, self.offset = self.translate(scale, offset)
        self.decay = decay
        # Weights used by calc_gauss to blend the decay score with the raw score.
        self.time_coefficient = 0.6
        self.related_coefficient = 0.4

    def translate(self, scale, offset):
        """
        Convert the domain-specific ``scale``/``offset`` input into numeric values.

        NOTE: the arguments are currently ignored; ``(180, 5)`` is returned for every task.

        :return: ``(scale, offset)`` as numbers of days
        """
        return 180, 5

    @staticmethod
    def translated_minus(field_value):
        """
        Return the number of whole days between now and ``field_value``.

        :param field_value: timestamp string in ``%Y-%m-%d %H:%M:%S`` format
        :return: ``(now - field_value).days``
        """
        origin = datetime.datetime.now()
        field_value = datetime.datetime.strptime(field_value, '%Y-%m-%d %H:%M:%S')
        return (origin - field_value).days

    def calc_exp(self):
        """Exponential decay; not implemented."""
        pass

    def calc_liner(self):
        """Linear decay; not implemented."""
        pass

    def calc_gauss(self, raw_score, field_value):
        r"""
        Blend the Gaussian time decay of ``field_value`` with ``raw_score``.

        $$S(doc)=exp(-\frac{max(0,|fieldvalues_{doc}-origin|-offset)^2}{2σ^2})$$
        $$σ^2=-scale^2/(2·ln(decay))$$

        :param raw_score: relevance score, weighted by ``related_coefficient``
        :param field_value: timestamp string in ``%Y-%m-%d %H:%M:%S`` format
        :return: ``time_coefficient * S + related_coefficient * raw_score``, rounded to 7 decimals
        """
        numerator = max(0, (abs(self.translated_minus(field_value)) - self.offset)) ** 2
        sigma_square = -1 * self.scale ** 2 / (2 * math.log(self.decay))
        denominator = 2 * sigma_square
        s = math.exp(-1 * numerator / denominator)
        return round(self.time_coefficient * s + self.related_coefficient * raw_score, 7)
|
456
|
+
|
457
|
+
|
458
|
+
if __name__ == '__main__':
    # Manual smoke test: score a fixed timestamp with the default settings.
    gauss_decay = GaussDecay()
    res = gauss_decay.calc_gauss(raw_score=1, field_value="2021-05-29 14:31:13")
    print(res)
|
462
|
+
# res = gauss_decay.calc_gauss(raw_score=1, field_value="2022-05-29 14:31:13")
|
463
|
+
# print(res)
|
464
|
+
# res = gauss_decay.calc_gauss(raw_score=1, field_value="2022-05-29 14:31:13")
|
465
|
+
# print(res)
|
466
|
+
# res = gauss_decay.calc_gauss(raw_score=1, field_value="2022-05-29 14:31:13")
|
467
|
+
# print(res)
|
468
|
+
|
469
|
+
# 常用函数参考
|
470
|
+
# import tensorflow as tf
|
471
|
+
#
|
472
|
+
# gpu_options = tf.compat.v1.GPUOptions(allow_growth=True)
|
473
|
+
# sess = tf.compat.v1.Session(config=tf.compat.v1.ConfigProto(gpu_options=gpu_options))
|
474
|
+
# for gpu in tf.config.experimental.list_physical_devices('GPU'):
|
475
|
+
# tf.config.experimental.set_memory_growth()
|