nlpertools 1.0.5__py3-none-any.whl → 1.0.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49)
  1. nlpertools/__init__.py +23 -20
  2. nlpertools/algo/ac.py +18 -0
  3. nlpertools/algo/bit_ops.py +28 -0
  4. nlpertools/algo/kmp.py +94 -55
  5. nlpertools/algo/num_ops.py +12 -0
  6. nlpertools/algo/template.py +116 -0
  7. nlpertools/algo/union.py +13 -0
  8. nlpertools/cli.py +87 -0
  9. nlpertools/data_client.py +426 -257
  10. nlpertools/data_structure/base_structure.py +109 -13
  11. nlpertools/dataprocess.py +627 -3
  12. nlpertools/default_db_config.yml +41 -0
  13. nlpertools/draw/__init__.py +0 -0
  14. nlpertools/draw/draw.py +83 -0
  15. nlpertools/draw/math_func.py +33 -0
  16. nlpertools/get_2fa.py +0 -0
  17. nlpertools/io/__init__.py +3 -3
  18. nlpertools/io/dir.py +86 -36
  19. nlpertools/io/file.py +283 -222
  20. nlpertools/ml.py +511 -460
  21. nlpertools/monitor/__init__.py +0 -0
  22. nlpertools/monitor/gpu.py +18 -0
  23. nlpertools/monitor/memory.py +24 -0
  24. nlpertools/movie.py +36 -0
  25. nlpertools/nlpertools_config.yml +1 -0
  26. nlpertools/{openApi.py → open_api.py} +65 -65
  27. nlpertools/other.py +475 -249
  28. nlpertools/pic.py +288 -0
  29. nlpertools/plugin.py +43 -43
  30. nlpertools/reminder.py +98 -87
  31. nlpertools/utils/__init__.py +3 -3
  32. nlpertools/utils/lazy.py +727 -0
  33. nlpertools/utils/log_util.py +20 -0
  34. nlpertools/utils/package.py +89 -76
  35. nlpertools/utils/package_v1.py +94 -0
  36. nlpertools/utils/package_v2.py +117 -0
  37. nlpertools/utils_for_nlpertools.py +93 -93
  38. nlpertools/vector_index_demo.py +108 -0
  39. nlpertools/wrapper.py +161 -96
  40. {nlpertools-1.0.5.dist-info → nlpertools-1.0.8.dist-info}/LICENSE +200 -200
  41. nlpertools-1.0.8.dist-info/METADATA +132 -0
  42. nlpertools-1.0.8.dist-info/RECORD +49 -0
  43. {nlpertools-1.0.5.dist-info → nlpertools-1.0.8.dist-info}/WHEEL +1 -1
  44. nlpertools-1.0.8.dist-info/entry_points.txt +2 -0
  45. nlpertools-1.0.8.dist-info/top_level.txt +2 -0
  46. nlpertools_helper/__init__.py +10 -0
  47. nlpertools-1.0.5.dist-info/METADATA +0 -85
  48. nlpertools-1.0.5.dist-info/RECORD +0 -25
  49. nlpertools-1.0.5.dist-info/top_level.txt +0 -1
nlpertools/other.py CHANGED
@@ -1,249 +1,475 @@
1
- #!/usr/bin/python3.8
2
- # -*- coding: utf-8 -*-
3
- # @Author : youshu.Ji
4
- import os
5
- import re
6
- import string
7
- from concurrent.futures import ThreadPoolExecutor
8
- from functools import reduce
9
-
10
- from .io.file import writetxt_w_list, writetxt_a
11
- # import numpy as np
12
- # import psutil
13
- # import pyquery as pq
14
- # import requests
15
- # import torch
16
- # from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
17
- # from sklearn.metrics import precision_recall_fscore_support
18
- # from tqdm import tqdm
19
- # from win32evtlogutil import langid
20
- from .utils.package import *
21
-
22
- CHINESE_PUNCTUATION = list(',。;:‘’“”!?《》「」【】<>()、')
23
- ENGLISH_PUNCTUATION = list(',.;:\'"!?<>()')
24
-
25
-
26
- def seed_everything():
27
- # seed everything
28
- seed = 7777777
29
- np.random.seed(seed)
30
- torch.manual_seed(seed) # fix the CPU random seed
31
- torch.cuda.manual_seed(seed)
32
- torch.cuda.manual_seed_all(seed)
33
-
34
-
35
- def convert_np_to_py(res):
36
- np2py = {
37
- np.float64: float,
38
- np.int32: int
39
- }
40
- news_dict = {}
41
- for k, v in res.best_params_.items():
42
- if type(v) in np2py:
43
- v = np2py[type(v)](v)
44
- news_dict[k] = v
45
- return news_dict
46
-
47
-
48
- def git_push():
49
- """
50
- Pushing to GitHub often fails from mainland China, so retry the push automatically.
51
- """
52
- num = -1
53
- while 1:
54
- num += 1
55
- print("retry num: {}".format(num))
56
- res = os.system("git push --set-upstream origin main")
57
- print(str(res))
58
- if not str(res).startswith("fatal"):
59
- print("scucess")
60
- break
61
-
62
-
63
- def snake_to_camel(s: str) -> str:
64
- """
65
- author: u
66
- Convert snake case to camel case.
67
- :param s: snake case variable
68
- :return:
69
- """
70
- return s.title().replace("_", "")
71
-
72
-
73
- def camel_to_snake(s: str) -> str:
74
- """
75
- Convert camel case to snake case.
76
- :param s: camel case variable
77
- :return:
78
- """
79
- return reduce(lambda x, y: x + ('_' if y.isupper() else '') + y, s).lower()
80
-
81
-
82
- def identify_language(text):
83
- language = langid.classify(text[:200])[0]
84
- # print(language)
85
- if language == 'zh':
86
- return 'zh'
87
- elif language == 'en':
88
- return 'en'
89
- else:
90
- return 'other'
91
- # return 'en'
92
-
93
-
94
- # other ----------------------------------------------------------------------
95
- # count word frequencies
96
- def calc_word_count(list_word, mode, path='tempcount.txt', sort_id=1, is_reverse=True):
97
- word_count = {}
98
- for key in list_word:
99
- if key not in word_count:
100
- word_count[key] = 1
101
- else:
102
- word_count[key] += 1
103
- word_dict_sort = sorted(word_count.items(), key=lambda x: x[sort_id], reverse=is_reverse)
104
- if mode == 'w':
105
- for key in word_dict_sort:
106
- writetxt_a(str(key[0]) + '\t' + str(key[1]) + '\n', path)
107
- elif mode == 'p':
108
- for key in word_dict_sort:
109
- print(str(key[0]) + '\t' + str(key[1]))
110
- elif mode == 'u':
111
- return word_dict_sort
112
-
113
-
114
- # deduplicate a list of dicts by a key
115
- def dupl_dict(dict_list, key):
116
- new_dict_list, value_set = [], []
117
- print('去重中...')
118
- for i in tqdm(dict_list):
119
- if i[key] not in value_set:
120
- new_dict_list.append(i)
121
- value_set.append(i[key])
122
- return new_dict_list
123
-
124
-
125
- def multi_thread_run(_task, data):
126
- with ThreadPoolExecutor() as executor:
127
- result = list(tqdm(executor.map(_task, data), total=len(ata)))
128
- return result
129
-
130
-
131
- def del_special_char(sentence):
132
- special_chars = ['\ufeff', '\xa0', '\u3000', '\xa0', '\ue627']
133
- for i in special_chars:
134
- sentence = sentence.replace(i, '')
135
- return sentence
136
-
137
-
138
- def en_pun_2_zh_pun(sentence):
139
- # TODO not implemented yet because of the quote-mark problem
140
- for i in ENGLISH_PUNCTUATION:
141
- pass
142
-
143
-
144
- def spider(url):
145
- """
146
-
147
- :param url:
148
- :return:
149
- """
150
- if 'baijiahao' in url:
151
- content = requests.get(url)
152
- # print(content.text)
153
- html = pq.PyQuery(content.text)
154
- title = html('.index-module_articleTitle_28fPT').text()
155
- res = html('.index-module_articleWrap_2Zphx').text().rstrip('举报/反馈')
156
- return '{}\n{}'.format(title, res)
157
-
158
-
159
- def eda(sentence):
160
- url = 'http://x.x.x.x:x/eda'
161
- json_data = dict({"sentence": sentence})
162
- res = requests.post(url, json=json_data)
163
- return res.json()['eda']
164
-
165
-
166
- def find_language(text):
167
- # TODO replace with an open-source package
168
- letters = list(string.ascii_letters)
169
- if len(text) > 50:
170
- passage = text[:50]
171
- len_passage = 50
172
- else:
173
- len_passage = len(text)
174
- count = 0
175
- for c in passage:
176
- if c in letters:
177
- count += 1
178
- if count / len_passage > 0.5:
179
- return "en"
180
- else:
181
- return "not en"
182
-
183
-
184
- def print_prf(y_true, y_pred, label=None):
185
- # y_true = [0, 1, 2, 1, 1, 2, 3, 1, 1, 1]
186
- # y_pred = [0, 1, 2, 1, 1, 2, 3, 1, 1, 1]
187
- # p, r, f, s = precision_recall_fscore_support(y_true=y_true, y_pred=y_pred)
188
- # print("p\t{}".format(p))
189
- # print("r\t{}".format(r))
190
- # print("f\t{}".format(f))
191
- # print("s\t{}".format(s))
192
- result = precision_recall_fscore_support(y_true=y_true, y_pred=y_pred, labels=label)
193
-
194
- for i in range(len(label)):
195
- res = []
196
- for k in result:
197
- res.append('%.5f' % k[i])
198
- print('{}: {} {} {}'.format(label[i], *res[:3]))
199
-
200
-
201
- def print_cpu():
202
- p = psutil.Process()
203
- # pro_info = p.as_dict(attrs=['pid', 'name', 'username'])
204
- print(psutil.cpu_count())
205
-
206
-
207
- def stress_test(func, ipts):
208
- with ThreadPoolExecutor() as executor:
209
- results = list(tqdm(executor.map(func, ipts), total=len(ipts)))
210
- return results
211
-
212
-
213
- def get_substring_loc(text, subtext):
214
- res = re.finditer(
215
- subtext.replace('\\', '\\\\').replace('?', '\?').replace('(', '\(').replace(')', '\)').replace(']',
216
- '\]').replace(
217
- '[', '\[').replace('+', '\+'), text)
218
- l, r = [i for i in res][0].regs[0]
219
- return l, r
220
-
221
-
222
- def tf_idf(corpus, save_path):
223
- tfidfdict = {}
224
- vectorizer = CountVectorizer() # builds the term-frequency matrix; element a[i][j] is the frequency of word j in document i
225
- transformer = TfidfTransformer() # computes the tf-idf weight of each word
226
- tfidf = transformer.fit_transform(
227
- vectorizer.fit_transform(corpus)) # the inner fit_transform builds the term-frequency matrix, the outer one computes tf-idf
228
- word = vectorizer.get_feature_names() # all words in the bag-of-words vocabulary
229
- weight = tfidf.toarray() # extract the tf-idf matrix; a[i][j] is the tf-idf weight of word j in document i
230
- for i in range(len(weight)): # outer loop over documents, inner loop over the word weights of each document
231
- for j in range(len(word)):
232
- getword = word[j]
233
- getvalue = weight[i][j]
234
- if getvalue != 0: # skip zero weights
235
- if getword in tfidfdict: # accumulate the global tf-idf value
236
- tfidfdict[getword] += float(getvalue)
237
- else:
238
- tfidfdict.update({getword: getvalue})
239
- sorted_tfidf = sorted(tfidfdict.items(), key=lambda d: d[1], reverse=True)
240
- to_write = ['{} {}'.format(i[0], i[1]) for i in sorted_tfidf]
241
- writetxt_w_list(to_write, save_path, num_lf=1)
242
-
243
- # reference snippets for commonly used functions
244
- # import tensorflow as tf
245
- #
246
- # gpu_options = tf.compat.v1.GPUOptions(allow_growth=True)
247
- # sess = tf.compat.v1.Session(config=tf.compat.v1.ConfigProto(gpu_options=gpu_options))
248
- # for gpu in tf.config.experimental.list_physical_devices('GPU'):
249
- # tf.config.experimental.set_memory_growth()
1
+ #!/usr/bin/python3.8
2
+ # -*- coding: utf-8 -*-
3
+ # @Author : youshu.Ji
4
+ import itertools
5
+ import os
6
+ import re
7
+ import string
8
+ import subprocess
9
+ import threading
10
+ from concurrent.futures import ThreadPoolExecutor
11
+ from functools import reduce
12
+ import math
13
+ import datetime
14
+ import difflib
15
+ import psutil
16
+ from .io.file import writetxt_w_list, writetxt_a
17
+ # import numpy as np
18
+ # import psutil
19
+ # import pyquery as pq
20
+ # import requests
21
+ # import torch
22
+ # from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
23
+ # from sklearn.metrics import precision_recall_fscore_support
24
+ # from tqdm import tqdm
25
+ # from win32evtlogutil import langid
26
+ from .utils.package import *
27
+
28
+ CHINESE_PUNCTUATION = list(',。;:‘’“”!?《》「」【】<>()、')
29
+ ENGLISH_PUNCTUATION = list(',.;:\'"!?<>()')
30
+ OTHER_PUNCTUATION = list('!@#$%^&*')
31
+
32
+
33
+ def get_diff_parts(str1, str2):
34
+ # create a SequenceMatcher object
35
+ matcher = difflib.SequenceMatcher(None, str1, str2)
36
+
37
+ # collect the differing parts
38
+ diff_parts = []
39
+ for tag, i1, i2, j1, j2 in matcher.get_opcodes():
40
+ if tag == 'replace' or tag == 'delete' or tag == 'insert':
41
+ diff_parts.append((tag, str1[i1:i2], str2[j1:j2]))
42
+
43
+ return diff_parts
44
+
45
+
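A minimal usage sketch for get_diff_parts (assuming the function is importable from nlpertools.other): SequenceMatcher.get_opcodes() yields (tag, i1, i2, j1, j2) tuples, and only the non-equal spans are kept.

    from nlpertools.other import get_diff_parts  # assumed import path

    # "abcdef" vs "abXdef": a single 'replace' opcode covering "c" -> "X"
    print(get_diff_parts("abcdef", "abXdef"))  # [('replace', 'c', 'X')]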
46
+ def run_cmd_with_timeout(cmd, timeout):
47
+ """
48
+ https://juejin.cn/post/7391703459803086848
49
+ """
50
+ process = subprocess.Popen(cmd, shell=True, encoding="utf-8", errors="ignore", stdout=subprocess.PIPE,
51
+ stderr=subprocess.PIPE)
52
+ res = [None]
53
+
54
+ def target():
55
+ try:
56
+ ans = process.communicate()
57
+ res[0] = ans
58
+ except subprocess.TimeoutExpired:
59
+ process.kill()
60
+ process.communicate()
61
+
62
+ thread = threading.Thread(target=target)
63
+ thread.start()
64
+ thread.join(timeout)
65
+ if thread.is_alive():
66
+ print(f"Terminating {cmd}")
67
+ process.terminate()
68
+ thread.join()
69
+ print("Terminated successfully")
70
+ return False, f"{cmd} is running over {timeout}s"
71
+ if process.returncode == 0:
72
+ # res[0][0] is the stdout output
73
+ return True, res[0][0]
74
+ else:
75
+ return False, res[0][0]
76
+
77
+
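A small sketch of how run_cmd_with_timeout behaves (hypothetical commands, POSIX shell assumed): a command that finishes returns (True, stdout), while one that exceeds the timeout is terminated and (False, message) is returned.

    from nlpertools.other import run_cmd_with_timeout  # assumed import path

    ok, out = run_cmd_with_timeout("echo hello", timeout=5)
    print(ok, out)   # True hello

    ok, msg = run_cmd_with_timeout("sleep 10", timeout=1)
    print(ok, msg)   # False sleep 10 is running over 1s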
78
+ def print_three_line_table(df):
79
+ # TODO add support for highlighting cells in red, as in Excel
80
+ import webbrowser
81
+
82
+ # import pandas as pd
83
+ # data = {'from_pc': ['valid_data', 'illegal_char', 'more_data'],
84
+ # 'rom_pc': ['another_valid_data', 'illegal_char', 'data']}
85
+ # df = pd.DataFrame(data)
86
+
87
+ # convert the DataFrame to an HTML table
88
+ html_table = df.to_html(index=False)
89
+ html_table = html_table.replace('border="1"', 'border="0"')
90
+
91
+ first_line_px = str(2)
92
+ second_line_px = str(1)
93
+ third_line_px = str(2)
94
+ # CSS styles for the three-line (booktabs-style) table
95
+ # // thead: table header
96
+ # // tr
97
+ # // td: table cell
98
+ head = """<!DOCTYPE html>
99
+ <html lang="zh">
100
+ <head>
101
+ <meta charset="UTF-8">
102
+ <title>页面标题</title>
103
+ </head>"""
104
+ style = """
105
+ <style>
106
+
107
+ table {
108
+ border-collapse: collapse;
109
+ }
110
+
111
+ tr, td, th {
112
+ text-align: center; /* center text horizontally */
113
+ vertical-align: middle; /* center text vertically */
114
+ }
115
+ thead tr {
116
+ border-top: (first_line_px)px solid black;
117
+ border-bottom: (second_line_px)px solid black;
118
+ }
119
+
120
+ thead th {
121
+ border-bottom: (second_line_px)px solid black;
122
+ }
123
+
124
+ tbody tr td {
125
+ border-bottom: 0px solid black;
126
+ }
127
+
128
+ tbody tr:last-child td {
129
+ border-bottom: (third_line_px)px solid black;
130
+ }
131
+ </style>"""
132
+ style = style.replace("(first_line_px)", first_line_px).replace("(second_line_px)", second_line_px).replace(
133
+ "(third_line_px)", third_line_px)
134
+ # combine the CSS styles with the HTML table
135
+ html = f"{style}{html_table}"
136
+ print(html)
137
+ temp_file_path = "temp.html"
138
+ # save the HTML to a file
139
+ with open(temp_file_path, "w") as f:
140
+ f.write(html)
141
+ webbrowser.open('file://' + os.path.realpath(temp_file_path))
142
+
143
+
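A sketch of calling print_three_line_table with a small pandas DataFrame (pandas assumed to be installed): the frame is rendered as an HTML three-line table, written to temp.html and opened in the default browser.

    import pandas as pd
    from nlpertools.other import print_three_line_table  # assumed import path

    df = pd.DataFrame({"model": ["bert", "roberta"], "f1": [0.91, 0.93]})
    print_three_line_table(df)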
144
+ def jprint(obj, depth=0):
145
+ if isinstance(obj, dict):
146
+ sep = "-" * (10 - depth * 3)
147
+ for k, v in obj.items():
148
+ print(depth * "|", sep, k, sep)
149
+ jprint(v)
150
+ elif isinstance(obj, list):
151
+ for v in obj:
152
+ jprint(v, depth + 1)
153
+ else:
154
+ print(obj)
155
+
156
+
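A quick sketch of jprint on a nested structure: dict keys are printed between dashed separators whose width shrinks with depth, lists are walked element by element, and everything else is printed as-is.

    from nlpertools.other import jprint  # assumed import path

    jprint({"model": {"name": "bert", "layers": [12, 24]}, "lr": 1e-5})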
157
+ def print_split(sign="=", num=20):
158
+ print(sign * num)
159
+
160
+
161
+ def seed_everything():
162
+ import torch
163
+ # seed everything
164
+ seed = 7777777
165
+ np.random.seed(seed)
166
+ torch.manual_seed(seed) # fix the CPU random seed
167
+ torch.cuda.manual_seed(seed)
168
+ torch.cuda.manual_seed_all(seed)
169
+
170
+
171
+ def sent_email(mail_user, mail_pass, receiver, title, content, attach_path=None):
172
+ import smtplib
173
+ from email.mime.multipart import MIMEMultipart
174
+ from email.mime.text import MIMEText
175
+ from email.mime.application import MIMEApplication
176
+
177
+ mail_host = 'smtp.qq.com'
178
+ mail_user = mail_user
179
+ mail_pass = mail_pass
180
+ sender = mail_user
181
+
182
+ message = MIMEMultipart()
183
+ message.attach(MIMEText(content, 'plain', 'utf-8'))
184
+ if attach_path:
185
+ attachment = MIMEApplication(open(attach_path, 'rb').read())
186
+ attachment["Content-Type"] = 'application/octet-stream'
187
+ attachment.add_header('Content-Dispositon', 'attachment',
188
+ filename=('utf-8', '', attach_path)) # note: the basename should be GBK-encoded here, otherwise Chinese filenames will be garbled
189
+ message.attach(attachment)
190
+ message['Subject'] = title
191
+ message['From'] = sender
192
+ message['To'] = receiver
193
+
194
+ try:
195
+ smtp_obj = smtplib.SMTP()
196
+ smtp_obj.connect(mail_host, 25)
197
+ smtp_obj.login(mail_user, mail_pass)
198
+ smtp_obj.sendmail(sender, receiver, message.as_string())
199
+ smtp_obj.quit()
200
+ print('send email success')
201
+ except smtplib.SMTPException as e:
202
+ print('send failed', e)
203
+
204
+
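A hedged usage sketch of sent_email with placeholder credentials (the helper is hard-wired to smtp.qq.com on port 25; the attachment is optional).

    from nlpertools.other import sent_email  # assumed import path

    sent_email(
        mail_user="sender@example.com",   # placeholder account
        mail_pass="smtp-auth-code",       # placeholder SMTP password / auth code
        receiver="receiver@example.com",
        title="daily report",
        content="training finished",
        attach_path=None,                 # or a path to a file to attach
    )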
205
+ def convert_np_to_py(obj):
206
+ if isinstance(obj, dict):
207
+ return {k: convert_np_to_py(v) for k, v in obj.items()}
208
+ elif isinstance(obj, list):
209
+ return [convert_np_to_py(v) for v in obj]
210
+ elif isinstance(obj, np.float64) or isinstance(obj, np.float32):
211
+ return float(obj)
212
+ else:
213
+ return obj
214
+
215
+
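A minimal sketch of convert_np_to_py (numpy assumed available): nested dicts and lists are walked recursively and numpy float64/float32 values become plain Python floats, e.g. so the result can be JSON-serialized.

    import numpy as np
    from nlpertools.other import convert_np_to_py  # assumed import path

    params = {"lr": np.float64(0.001), "layers": [np.float32(1.5), 2]}
    print(convert_np_to_py(params))  # {'lr': 0.001, 'layers': [1.5, 2]}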
216
+ def snake_to_camel(s: str) -> str:
217
+ """
218
+ author: u
219
+ Convert snake case to camel case.
220
+ :param s: snake case variable
221
+ :return:
222
+ """
223
+ return s.title().replace("_", "")
224
+
225
+
226
+ def camel_to_snake(s: str) -> str:
227
+ """
228
+ Convert camel case to snake case.
229
+ :param s: camel case variable
230
+ :return:
231
+ """
232
+ return reduce(lambda x, y: x + ('_' if y.isupper() else '') + y, s).lower()
233
+
234
+
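Two quick sketches of the case converters (assumed importable from nlpertools.other):

    print(snake_to_camel("max_seq_len"))  # MaxSeqLen
    print(camel_to_snake("maxSeqLen"))    # max_seq_len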
235
+ # other ----------------------------------------------------------------------
236
+ # count word frequencies
237
+ def calc_word_count(list_word, mode, path='tempcount.txt', sort_id=1, is_reverse=True):
238
+ word_count = {}
239
+ for key in list_word:
240
+ if key not in word_count:
241
+ word_count[key] = 1
242
+ else:
243
+ word_count[key] += 1
244
+ word_dict_sort = sorted(word_count.items(), key=lambda x: x[sort_id], reverse=is_reverse)
245
+ if mode == 'w':
246
+ for key in word_dict_sort:
247
+ writetxt_a(str(key[0]) + '\t' + str(key[1]) + '\n', path)
248
+ elif mode == 'p':
249
+ for key in word_dict_sort:
250
+ print(str(key[0]) + '\t' + str(key[1]))
251
+ elif mode == 'u':
252
+ return word_dict_sort
253
+
254
+
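A small sketch of calc_word_count's three modes ('w' appends the counts to a file, 'p' prints them, 'u' returns the sorted pairs):

    words = ["nlp", "tools", "nlp", "nlp", "tools", "er"]
    print(calc_word_count(words, mode="u"))  # [('nlp', 3), ('tools', 2), ('er', 1)]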
255
+ # deduplicate a list of dicts by a key
256
+ def dupl_dict(dict_list, key):
257
+ new_dict_list, value_set = [], []
258
+ print('去重中...')
259
+ for i in tqdm(dict_list):
260
+ if i[key] not in value_set:
261
+ new_dict_list.append(i)
262
+ value_set.append(i[key])
263
+ return new_dict_list
264
+
265
+
266
+ def multi_thread_run(_task, data):
267
+ with ThreadPoolExecutor() as executor:
268
+ result = list(tqdm(executor.map(_task, data), total=len(data)))
269
+ return result
270
+
271
+
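A sketch of multi_thread_run mapping an I/O-bound task over a list with a thread pool (tqdm assumed available, as in the package's optional imports); executor.map preserves input order.

    def fetch(i):
        return i * i  # stand-in for an I/O-bound call

    print(multi_thread_run(fetch, list(range(5))))  # [0, 1, 4, 9, 16]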
272
+ def del_special_char(sentence):
273
+ special_chars = ['\ufeff', '\xa0', '\u3000', '\xa0', '\ue627']
274
+ for i in special_chars:
275
+ sentence = sentence.replace(i, '')
276
+ return sentence
277
+
278
+
279
+ def en_pun_2_zh_pun(sentence):
280
+ # TODO not implemented yet because of the quote-mark problem
281
+ for i in ENGLISH_PUNCTUATION:
282
+ pass
283
+
284
+
285
+ def spider(url):
286
+ """
287
+
288
+ :param url:
289
+ :return:
290
+ """
291
+ if 'baijiahao' in url:
292
+ content = requests.get(url)
293
+ # print(content.text)
294
+ html = pq.PyQuery(content.text)
295
+ title = html('.index-module_articleTitle_28fPT').text()
296
+ res = html('.index-module_articleWrap_2Zphx').text().rstrip('举报/反馈')
297
+ return '{}\n{}'.format(title, res)
298
+
299
+
300
+ def eda(sentence):
301
+ url = 'https://x.x.x.x:x/eda'
302
+ json_data = dict({"sentence": sentence})
303
+ res = requests.post(url, json=json_data)
304
+ return res.json()['eda']
305
+
306
+
307
+ def find_language(text):
308
+ # TODO replace with an open-source package
309
+ letters = list(string.ascii_letters)
310
+ if len(text) > 50:
311
+ passage = text[:50]
312
+ len_passage = 50
313
+ else:
314
+ len_passage = len(text)
315
+ count = 0
316
+ for c in passage:
317
+ if c in letters:
318
+ count += 1
319
+ if count / len_passage > 0.5:
320
+ return "en"
321
+ else:
322
+ return "not en"
323
+
324
+
325
+ def print_prf(y_true, y_pred, label=None):
326
+ # y_true = [0, 1, 2, 1, 1, 2, 3, 1, 1, 1]
327
+ # y_pred = [0, 1, 2, 1, 1, 2, 3, 1, 1, 1]
328
+ # p, r, f, s = precision_recall_fscore_support(y_true=y_true, y_pred=y_pred)
329
+ # print("p\t{}".format(p))
330
+ # print("r\t{}".format(r))
331
+ # print("f\t{}".format(f))
332
+ # print("s\t{}".format(s))
333
+ result = precision_recall_fscore_support(y_true=y_true, y_pred=y_pred, labels=label)
334
+
335
+ for i in range(len(label)):
336
+ res = []
337
+ for k in result:
338
+ res.append('%.5f' % k[i])
339
+ print('{}: {} {} {}'.format(label[i], *res[:3]))
340
+
341
+
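A usage sketch of print_prf with an explicit label list (scikit-learn's precision_recall_fscore_support assumed importable, as in the commented imports above); one "label: precision recall f1" line is printed per class.

    y_true = [0, 1, 2, 1, 1, 2]
    y_pred = [0, 1, 1, 1, 1, 2]
    print_prf(y_true, y_pred, label=[0, 1, 2])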
342
+ def print_cpu():
343
+ p = psutil.Process()
344
+ # pro_info = p.as_dict(attrs=['pid', 'name', 'username'])
345
+ print(psutil.cpu_count())
346
+
347
+
348
+ def stress_test(func, ipts):
349
+ with ThreadPoolExecutor() as executor:
350
+ results = list(tqdm(executor.map(func, ipts), total=len(ipts)))
351
+ return results
352
+
353
+
354
+ def squeeze_list(high_dim_list):
355
+ return list(itertools.chain.from_iterable(high_dim_list))
356
+
357
+
358
+ def unsqueeze_list(flatten_list, each_element_len):
359
+ # this function is wrong and has been superseded by split_list
360
+ two_dim_list = [flatten_list[i * each_element_len:(i + 1) * each_element_len] for i in
361
+ range(len(flatten_list) // each_element_len)]
362
+ return two_dim_list
363
+
364
+ def split_list(input_list, chunk_size):
365
+ # split the list into chunks with a list comprehension
366
+ return [input_list[i:i + chunk_size] for i in range(0, len(input_list), chunk_size)]
367
+
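Quick sketches of flattening with squeeze_list and chunking with split_list:

    print(squeeze_list([[1, 2], [3, 4], [5]]))        # [1, 2, 3, 4, 5]
    print(split_list([1, 2, 3, 4, 5], chunk_size=2))  # [[1, 2], [3, 4], [5]]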
368
+ def auto_close():
369
+ """
370
+ Work around WeCom (WeChat Work) marking you as away after 15 minutes of inactivity by pretending you are still at work.
371
+ """
372
+ import pyautogui as pg
373
+ import time
374
+ import os
375
+ cmd = 'schtasks /create /tn shut /tr "shutdown -s -f" /sc once /st 23:30'
376
+ os.system(cmd)
377
+ while 1:
378
+ pg.moveTo(970, 17, 2)
379
+ pg.click()
380
+ time.sleep(840)
381
+
382
+
383
+ def tf_idf(corpus, save_path):
384
+ tfidfdict = {}
385
+ vectorizer = CountVectorizer() # builds the term-frequency matrix; element a[i][j] is the frequency of word j in document i
386
+ transformer = TfidfTransformer() # computes the tf-idf weight of each word
387
+ tfidf = transformer.fit_transform(
388
+ vectorizer.fit_transform(corpus)) # the inner fit_transform builds the term-frequency matrix, the outer one computes tf-idf
389
+ word = vectorizer.get_feature_names() # all words in the bag-of-words vocabulary
390
+ weight = tfidf.toarray() # extract the tf-idf matrix; a[i][j] is the tf-idf weight of word j in document i
391
+ for i in range(len(weight)): # outer loop over documents, inner loop over the word weights of each document
392
+ for j in range(len(word)):
393
+ getword = word[j]
394
+ getvalue = weight[i][j]
395
+ if getvalue != 0: # skip zero weights
396
+ if getword in tfidfdict: # accumulate the global tf-idf value
397
+ tfidfdict[getword] += float(getvalue)
398
+ else:
399
+ tfidfdict.update({getword: getvalue})
400
+ sorted_tfidf = sorted(tfidfdict.items(), key=lambda d: d[1], reverse=True)
401
+ to_write = ['{} {}'.format(i[0], i[1]) for i in sorted_tfidf]
402
+ writetxt_w_list(to_write, save_path, num_lf=1)
403
+
404
+
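A sketch of tf_idf's expected input (a list of whitespace-tokenized documents): the global tf-idf weight of every word is accumulated and written one "word weight" pair per line, sorted by weight. Note that recent scikit-learn releases expose get_feature_names_out() instead of get_feature_names().

    corpus = [
        "machine learning is fun",
        "deep learning is a subset of machine learning",
    ]
    tf_idf(corpus, save_path="tfidf_top_words.txt")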
405
+ class GaussDecay(object):
406
+ """
407
+ Only the time-based decay is implemented for now; all parameters use their default values.
408
+ """
409
+
410
+ def __init__(self, origin='2022-08-02', scale='90d', offset='5d', decay=0.5, task="time"):
411
+ self.origin = origin
412
+ self.task = task
413
+ self.scale, self.offset = self.translate(scale, offset)
414
+ self.decay = decay
415
+ self.time_coefficient = 0.6
416
+ self.related_coefficient = 0.4
417
+
418
+ def translate(self, scale, offset):
419
+ """
420
+ Translate the domain-specific inputs into standard scale/offset values.
421
+ :return:
422
+ """
423
+ if self.task == "time":
424
+ scale = 180
425
+ offset = 5
426
+ else:
427
+ scale = 180
428
+ offset = 5
429
+ return scale, offset
430
+
431
+ @staticmethod
432
+ def translated_minus(field_value):
433
+ origin = datetime.datetime.now()
434
+ field_value = datetime.datetime.strptime(field_value, '%Y-%m-%d %H:%M:%S')
435
+ return (origin - field_value).days
436
+
437
+ def calc_exp(self):
438
+ pass
439
+
440
+ def calc_liner(self):
441
+ pass
442
+
443
+ def calc_gauss(self, raw_score, field_value):
444
+ """
445
+ $$S(doc)=exp(-\frac{max(0,|fieldvalues_{doc}-origin|-offset)^2}{2σ^2})$$ -
446
+ $$σ^2=-scale^2/(2·ln(decay))$$
447
+ :param raw_score:
448
+ :param field_value:
449
+ :return:
450
+ """
451
+ numerator = max(0, (abs(self.translated_minus(field_value)) - self.offset)) ** 2
452
+ sigma_square = -1 * self.scale ** 2 / (2 * math.log(self.decay, math.e))
453
+ denominator = 2 * sigma_square
454
+ s = math.exp(-1 * numerator / denominator)
455
+ return round(self.time_coefficient * s + self.related_coefficient * raw_score, 7)
456
+
457
+
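A worked sketch of the gauss branch under the defaults (scale=180, offset=5, decay=0.5) for a hypothetical document that is 35 days old: sigma^2 = -scale^2 / (2 * ln(decay)) ≈ 23371.6, so the decay factor is exp(-(35 - 5)^2 / (2 * sigma^2)) ≈ 0.981, which is then blended as 0.6 * s + 0.4 * raw_score.

    import math

    scale, offset, decay = 180, 5, 0.5
    sigma_square = -scale ** 2 / (2 * math.log(decay))           # ~23371.6
    s = math.exp(-max(0, 35 - offset) ** 2 / (2 * sigma_square))
    print(round(s, 3))                                           # 0.981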
458
+ if __name__ == '__main__':
459
+ gauss_decay = GaussDecay()
460
+ res = gauss_decay.calc_gauss(raw_score=1, field_value="2021-05-29 14:31:13")
461
+ print(res)
462
+ # res = gauss_decay.calc_gauss(raw_score=1, field_value="2022-05-29 14:31:13")
463
+ # print(res)
464
+ # res = gauss_decay.calc_gauss(raw_score=1, field_value="2022-05-29 14:31:13")
465
+ # print(res)
466
+ # res = gauss_decay.calc_gauss(raw_score=1, field_value="2022-05-29 14:31:13")
467
+ # print(res)
468
+
469
+ # reference snippets for commonly used functions
470
+ # import tensorflow as tf
471
+ #
472
+ # gpu_options = tf.compat.v1.GPUOptions(allow_growth=True)
473
+ # sess = tf.compat.v1.Session(config=tf.compat.v1.ConfigProto(gpu_options=gpu_options))
474
+ # for gpu in tf.config.experimental.list_physical_devices('GPU'):
475
+ # tf.config.experimental.set_memory_growth()