nlpertools 1.0.5__py3-none-any.whl → 1.0.8__py3-none-any.whl

Sign up to get free protection for your applications and to get access to all the features.
Files changed (49) hide show
  1. nlpertools/__init__.py +23 -20
  2. nlpertools/algo/ac.py +18 -0
  3. nlpertools/algo/bit_ops.py +28 -0
  4. nlpertools/algo/kmp.py +94 -55
  5. nlpertools/algo/num_ops.py +12 -0
  6. nlpertools/algo/template.py +116 -0
  7. nlpertools/algo/union.py +13 -0
  8. nlpertools/cli.py +87 -0
  9. nlpertools/data_client.py +426 -257
  10. nlpertools/data_structure/base_structure.py +109 -13
  11. nlpertools/dataprocess.py +627 -3
  12. nlpertools/default_db_config.yml +41 -0
  13. nlpertools/draw/__init__.py +0 -0
  14. nlpertools/draw/draw.py +83 -0
  15. nlpertools/draw/math_func.py +33 -0
  16. nlpertools/get_2fa.py +0 -0
  17. nlpertools/io/__init__.py +3 -3
  18. nlpertools/io/dir.py +86 -36
  19. nlpertools/io/file.py +283 -222
  20. nlpertools/ml.py +511 -460
  21. nlpertools/monitor/__init__.py +0 -0
  22. nlpertools/monitor/gpu.py +18 -0
  23. nlpertools/monitor/memory.py +24 -0
  24. nlpertools/movie.py +36 -0
  25. nlpertools/nlpertools_config.yml +1 -0
  26. nlpertools/{openApi.py → open_api.py} +65 -65
  27. nlpertools/other.py +475 -249
  28. nlpertools/pic.py +288 -0
  29. nlpertools/plugin.py +43 -43
  30. nlpertools/reminder.py +98 -87
  31. nlpertools/utils/__init__.py +3 -3
  32. nlpertools/utils/lazy.py +727 -0
  33. nlpertools/utils/log_util.py +20 -0
  34. nlpertools/utils/package.py +89 -76
  35. nlpertools/utils/package_v1.py +94 -0
  36. nlpertools/utils/package_v2.py +117 -0
  37. nlpertools/utils_for_nlpertools.py +93 -93
  38. nlpertools/vector_index_demo.py +108 -0
  39. nlpertools/wrapper.py +161 -96
  40. {nlpertools-1.0.5.dist-info → nlpertools-1.0.8.dist-info}/LICENSE +200 -200
  41. nlpertools-1.0.8.dist-info/METADATA +132 -0
  42. nlpertools-1.0.8.dist-info/RECORD +49 -0
  43. {nlpertools-1.0.5.dist-info → nlpertools-1.0.8.dist-info}/WHEEL +1 -1
  44. nlpertools-1.0.8.dist-info/entry_points.txt +2 -0
  45. nlpertools-1.0.8.dist-info/top_level.txt +2 -0
  46. nlpertools_helper/__init__.py +10 -0
  47. nlpertools-1.0.5.dist-info/METADATA +0 -85
  48. nlpertools-1.0.5.dist-info/RECORD +0 -25
  49. nlpertools-1.0.5.dist-info/top_level.txt +0 -1
nlpertools/other.py CHANGED
@@ -1,249 +1,475 @@
1
- #!/usr/bin/python3.8
2
- # -*- coding: utf-8 -*-
3
- # @Author : youshu.Ji
4
- import os
5
- import re
6
- import string
7
- from concurrent.futures import ThreadPoolExecutor
8
- from functools import reduce
9
-
10
- from .io.file import writetxt_w_list, writetxt_a
11
- # import numpy as np
12
- # import psutil
13
- # import pyquery as pq
14
- # import requests
15
- # import torch
16
- # from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
17
- # from sklearn.metrics import precision_recall_fscore_support
18
- # from tqdm import tqdm
19
- # from win32evtlogutil import langid
20
- from .utils.package import *
21
-
22
- CHINESE_PUNCTUATION = list(',。;:‘’“”!?《》「」【】<>()、')
23
- ENGLISH_PUNCTUATION = list(',.;:\'"!?<>()')
24
-
25
-
26
- def seed_everything():
27
- # seed everything
28
- seed = 7777777
29
- np.random.seed(seed)
30
- torch.manual_seed(seed) # CPU随机种子确定
31
- torch.cuda.manual_seed(seed)
32
- torch.cuda.manual_seed_all(seed)
33
-
34
-
35
- def convert_np_to_py(res):
36
- np2py = {
37
- np.float64: float,
38
- np.int32: int
39
- }
40
- news_dict = {}
41
- for k, v in res.best_params_.items():
42
- if type(v) in np2py:
43
- v = np2py[type(v)](v)
44
- news_dict[k] = v
45
- return news_dict
46
-
47
-
48
- def git_push():
49
- """
50
- 针对国内提交github经常失败,自动提交
51
- """
52
- num = -1
53
- while 1:
54
- num += 1
55
- print("retry num: {}".format(num))
56
- res = os.system("git push --set-upstream origin main")
57
- print(str(res))
58
- if not str(res).startswith("fatal"):
59
- print("scucess")
60
- break
61
-
62
-
63
- def snake_to_camel(s: str) -> str:
64
- """
65
- author: u
66
- snake case 转换到 camel case.
67
- :param s: snake case variable
68
- :return:
69
- """
70
- return s.title().replace("_", "")
71
-
72
-
73
- def camel_to_snake(s: str) -> str:
74
- """
75
- camel case 转换到 snake case.
76
- :param s: camel case variable
77
- :return:
78
- """
79
- return reduce(lambda x, y: x + ('_' if y.isupper() else '') + y, s).lower()
80
-
81
-
82
- def identify_language(text):
83
- language = langid.classify(text[:200])[0]
84
- # print(language)
85
- if language == 'zh':
86
- return 'zh'
87
- elif language == 'en':
88
- return 'en'
89
- else:
90
- return 'other'
91
- # return 'en'
92
-
93
-
94
- # other ----------------------------------------------------------------------
95
- # 统计词频
96
- def calc_word_count(list_word, mode, path='tempcount.txt', sort_id=1, is_reverse=True):
97
- word_count = {}
98
- for key in list_word:
99
- if key not in word_count:
100
- word_count[key] = 1
101
- else:
102
- word_count[key] += 1
103
- word_dict_sort = sorted(word_count.items(), key=lambda x: x[sort_id], reverse=is_reverse)
104
- if mode == 'w':
105
- for key in word_dict_sort:
106
- writetxt_a(str(key[0]) + '\t' + str(key[1]) + '\n', path)
107
- elif mode == 'p':
108
- for key in word_dict_sort:
109
- print(str(key[0]) + '\t' + str(key[1]))
110
- elif mode == 'u':
111
- return word_dict_sort
112
-
113
-
114
- # 字典去重
115
- def dupl_dict(dict_list, key):
116
- new_dict_list, value_set = [], []
117
- print('去重中...')
118
- for i in tqdm(dict_list):
119
- if i[key] not in value_set:
120
- new_dict_list.append(i)
121
- value_set.append(i[key])
122
- return new_dict_list
123
-
124
-
125
- def multi_thread_run(_task, data):
126
- with ThreadPoolExecutor() as executor:
127
- result = list(tqdm(executor.map(_task, data), total=len(ata)))
128
- return result
129
-
130
-
131
- def del_special_char(sentence):
132
- special_chars = ['\ufeff', '\xa0', '\u3000', '\xa0', '\ue627']
133
- for i in special_chars:
134
- sentence = sentence.replace(i, '')
135
- return sentence
136
-
137
-
138
- def en_pun_2_zh_pun(sentence):
139
- # TODO 因为引号的问题,所以我没有写
140
- for i in ENGLISH_PUNCTUATION:
141
- pass
142
-
143
-
144
- def spider(url):
145
- """
146
-
147
- :param url:
148
- :return:
149
- """
150
- if 'baijiahao' in url:
151
- content = requests.get(url)
152
- # print(content.text)
153
- html = pq.PyQuery(content.text)
154
- title = html('.index-module_articleTitle_28fPT').text()
155
- res = html('.index-module_articleWrap_2Zphx').text().rstrip('举报/反馈')
156
- return '{}\n{}'.format(title, res)
157
-
158
-
159
- def eda(sentence):
160
- url = 'http://x.x.x.x:x/eda'
161
- json_data = dict({"sentence": sentence})
162
- res = requests.post(url, json=json_data)
163
- return res.json()['eda']
164
-
165
-
166
- def find_language(text):
167
- # TODO 替换为开源包
168
- letters = list(string.ascii_letters)
169
- if len(text) > 50:
170
- passage = text[:50]
171
- len_passage = 50
172
- else:
173
- len_passage = len(text)
174
- count = 0
175
- for c in passage:
176
- if c in letters:
177
- count += 1
178
- if count / len_passage > 0.5:
179
- return "en"
180
- else:
181
- return "not en"
182
-
183
-
184
- def print_prf(y_true, y_pred, label=None):
185
- # y_true = [0, 1, 2, 1, 1, 2, 3, 1, 1, 1]
186
- # y_pred = [0, 1, 2, 1, 1, 2, 3, 1, 1, 1]
187
- # p, r, f, s = precision_recall_fscore_support(y_true=y_true, y_pred=y_pred)
188
- # print("p\t{}".format(p))
189
- # print("r\t{}".format(r))
190
- # print("f\t{}".format(f))
191
- # print("s\t{}".format(s))
192
- result = precision_recall_fscore_support(y_true=y_true, y_pred=y_pred, labels=label)
193
-
194
- for i in range(len(label)):
195
- res = []
196
- for k in result:
197
- res.append('%.5f' % k[i])
198
- print('{}: {} {} {}'.format(label[i], *res[:3]))
199
-
200
-
201
- def print_cpu():
202
- p = psutil.Process()
203
- # pro_info = p.as_dict(attrs=['pid', 'name', 'username'])
204
- print(psutil.cpu_count())
205
-
206
-
207
- def stress_test(func, ipts):
208
- with ThreadPoolExecutor() as executor:
209
- results = list(tqdm(executor.map(func, ipts), total=len(ipts)))
210
- return results
211
-
212
-
213
- def get_substring_loc(text, subtext):
214
- res = re.finditer(
215
- subtext.replace('\\', '\\\\').replace('?', '\?').replace('(', '\(').replace(')', '\)').replace(']',
216
- '\]').replace(
217
- '[', '\[').replace('+', '\+'), text)
218
- l, r = [i for i in res][0].regs[0]
219
- return l, r
220
-
221
-
222
- def tf_idf(corpus, save_path):
223
- tfidfdict = {}
224
- vectorizer = CountVectorizer() # 该类会将文本中的词语转换为词频矩阵,矩阵元素a[i][j] 表示j词在i类文本下的词频
225
- transformer = TfidfTransformer() # 该类会统计每个词语的tf-idf权值
226
- tfidf = transformer.fit_transform(
227
- vectorizer.fit_transform(corpus)) # 第一个fit_transform是计算tf-idf,第二个fit_transform是将文本转为词频矩阵
228
- word = vectorizer.get_feature_names() # 获取词袋模型中的所有词语
229
- weight = tfidf.toarray() # 将tf-idf矩阵抽取出来,元素a[i][j]表示j词在i类文本中的tf-idf权重
230
- for i in range(len(weight)): # 打印每类文本的tf-idf词语权重,第一个for遍历所有文本,第二个for便利某一类文本下的词语权重
231
- for j in range(len(word)):
232
- getword = word[j]
233
- getvalue = weight[i][j]
234
- if getvalue != 0: # 去掉值为0的项
235
- if getword in tfidfdict: # 更新全局TFIDF值
236
- tfidfdict[getword] += float(getvalue)
237
- else:
238
- tfidfdict.update({getword: getvalue})
239
- sorted_tfidf = sorted(tfidfdict.items(), key=lambda d: d[1], reverse=True)
240
- to_write = ['{} {}'.format(i[0], i[1]) for i in sorted_tfidf]
241
- writetxt_w_list(to_write, save_path, num_lf=1)
242
-
243
- # 常用函数参考
244
- # import tensorflow as tf
245
- #
246
- # gpu_options = tf.compat.v1.GPUOptions(allow_growth=True)
247
- # sess = tf.compat.v1.Session(config=tf.compat.v1.ConfigProto(gpu_options=gpu_options))
248
- # for gpu in tf.config.experimental.list_physical_devices('GPU'):
249
- # tf.config.experimental.set_memory_growth()
1
+ #!/usr/bin/python3.8
2
+ # -*- coding: utf-8 -*-
3
+ # @Author : youshu.Ji
4
+ import itertools
5
+ import os
6
+ import re
7
+ import string
8
+ import subprocess
9
+ import threading
10
+ from concurrent.futures import ThreadPoolExecutor
11
+ from functools import reduce
12
+ import math
13
+ import datetime
14
+ import difflib
15
+ import psutil
16
+ from .io.file import writetxt_w_list, writetxt_a
17
+ # import numpy as np
18
+ # import psutil
19
+ # import pyquery as pq
20
+ # import requests
21
+ # import torch
22
+ # from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
23
+ # from sklearn.metrics import precision_recall_fscore_support
24
+ # from tqdm import tqdm
25
+ # from win32evtlogutil import langid
26
+ from .utils.package import *
27
+
28
# Punctuation tables; each one is a list of single-character strings.
CHINESE_PUNCTUATION = [*',。;:‘’“”!?《》「」【】<>()、']
ENGLISH_PUNCTUATION = [*',.;:\'"!?<>()']
OTHER_PUNCTUATION = [*'!@#$%^&*']
31
+
32
+
33
def get_diff_parts(str1, str2):
    """Return the segments in which ``str1`` and ``str2`` differ.

    The two inputs are compared with ``difflib.SequenceMatcher``. Each entry of
    the result is ``(tag, part_of_str1, part_of_str2)`` where ``tag`` is one of
    the opcodes ``'replace'``, ``'delete'`` or ``'insert'``; any other opcode
    (i.e. the matching spans) is left out.
    """
    changed_tags = ('replace', 'delete', 'insert')
    opcodes = difflib.SequenceMatcher(None, str1, str2).get_opcodes()
    return [
        (tag, str1[a_lo:a_hi], str2[b_lo:b_hi])
        for tag, a_lo, a_hi, b_lo, b_hi in opcodes
        if tag in changed_tags
    ]
44
+
45
+
46
def run_cmd_with_timeout(cmd, timeout):
    """Run ``cmd`` through the shell, giving up after ``timeout`` seconds.

    :param cmd: command line, executed with ``shell=True``. Do not pass
        untrusted input: the string is interpreted by the shell.
    :param timeout: maximum number of seconds to wait (``None`` waits forever).
    :return: ``(ok, text)``. ``ok`` is ``True`` only when the command exits
        with return code 0; ``text`` is the captured stdout in that case and
        also when the command fails. On timeout the result is
        ``(False, "<cmd> is running over <timeout>s")``.
    """
    process = subprocess.Popen(cmd, shell=True, encoding="utf-8", errors="ignore",
                               stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    try:
        # communicate() enforces the timeout itself, so no watcher thread is
        # needed and the TimeoutExpired branch is actually reachable.
        stdout, _stderr = process.communicate(timeout=timeout)
    except subprocess.TimeoutExpired:
        print(f"Terminating {cmd}")
        process.kill()
        # Reap the killed process and drain its pipes.
        process.communicate()
        print("Terminated successfully")
        return False, f"{cmd} is running over {timeout}s"
    return process.returncode == 0, stdout
76
+
77
+
78
def print_three_line_table(df):
    """Render ``df`` as a three-line HTML table, print it and open it.

    The table gets a rule above the header, a rule below the header and a rule
    under the last body row. The generated page is printed to stdout, written
    to ``temp.html`` in the current working directory and handed to
    ``webbrowser.open``.

    :param df: object exposing ``to_html(index=False)`` (e.g. a pandas
        DataFrame).
    """
    # TODO: support highlighting cells in red, as can be done in Excel
    import webbrowser

    # Convert the DataFrame into an HTML table and drop the default border.
    html_table = df.to_html(index=False).replace('border="1"', 'border="0"')

    first_line_px = str(2)
    second_line_px = str(1)
    third_line_px = str(2)

    # The head declares UTF-8 so that non-ASCII cell content renders correctly.
    head = """<!DOCTYPE html>
<html lang="zh">
<head>
<meta charset="UTF-8">
<title>页面标题</title>
"""
    # CSS for the three rules: thead = header row, tr = row, td = cell.
    style = """
<style>

table {
    border-collapse: collapse;
}

tr, td, th {
    text-align: center; /* center text horizontally */
    vertical-align: middle; /* center text vertically */
}
thead tr {
    border-top: (first_line_px)px solid black;
    border-bottom: (second_line_px)px solid black;
}

thead th {
    border-bottom: (second_line_px)px solid black;
}

tbody tr td {
    border-bottom: 0px solid black;
}

tbody tr:last-child td {
    border-bottom: (third_line_px)px solid black;
}
</style>"""
    style = (style.replace("(first_line_px)", first_line_px)
             .replace("(second_line_px)", second_line_px)
             .replace("(third_line_px)", third_line_px))

    # Assemble a complete document: head + style, then the table in the body.
    html = f"{head}{style}\n</head>\n<body>\n{html_table}\n</body>\n</html>"
    print(html)

    temp_file_path = "temp.html"
    # Write with the same encoding the <meta charset> tag announces.
    with open(temp_file_path, "w", encoding="utf-8") as f:
        f.write(html)
    webbrowser.open('file://' + os.path.realpath(temp_file_path))
142
+
143
+
144
def jprint(obj, depth=0):
    """Pretty-print a nested structure of dicts and lists.

    Each dict key is printed on its own line, framed by dashes and prefixed
    with one ``|`` per nesting level; the dash frame gets shorter as the depth
    grows. Lists are expanded element by element and any other value is
    printed as is.

    :param obj: dict, list or scalar to print.
    :param depth: current nesting level (callers normally leave it at 0).
    """
    if isinstance(obj, dict):
        sep = "-" * (10 - depth * 3)
        for k, v in obj.items():
            print(depth * "|", sep, k, sep)
            # Pass the depth down so nested dicts are indented; without it
            # every nested dict is printed as if it were at the top level.
            jprint(v, depth + 1)
    elif isinstance(obj, list):
        for v in obj:
            jprint(v, depth + 1)
    else:
        print(obj)
155
+
156
+
157
def print_split(sign="=", num=20):
    """Print a separator line: ``sign`` repeated ``num`` times."""
    divider = sign * num
    print(divider)
159
+
160
+
161
def seed_everything(seed=7777777):
    """Seed Python's ``random``, NumPy and PyTorch for reproducible runs.

    :param seed: integer seed applied to every generator; defaults to the
        value that used to be hard-coded.
    """
    import random

    import torch

    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)  # fixes the CPU random seed
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
169
+
170
+
171
def sent_email(mail_user, mail_pass, receiver, title, content, attach_path=None):
    """Send a plain-text e-mail, optionally with one attachment, via smtp.qq.com.

    :param mail_user: login name; also used as the sender address.
    :param mail_pass: password / authorization code for the login.
    :param receiver: recipient address.
    :param title: subject line.
    :param content: plain-text body (sent as UTF-8).
    :param attach_path: optional path of a file to attach.

    SMTP errors are caught and reported with ``print``; nothing is returned.
    """
    import smtplib
    from email.mime.application import MIMEApplication
    from email.mime.multipart import MIMEMultipart
    from email.mime.text import MIMEText

    mail_host = 'smtp.qq.com'
    sender = mail_user

    message = MIMEMultipart()
    message.attach(MIMEText(content, 'plain', 'utf-8'))
    if attach_path:
        # Close the file deterministically instead of leaking the handle.
        with open(attach_path, 'rb') as attach_file:
            # MIMEApplication already sets Content-Type: application/octet-stream.
            attachment = MIMEApplication(attach_file.read())
        # The (charset, language, value) tuple lets non-ASCII file names be
        # encoded per RFC 2231; only the base name is exposed to the receiver.
        attachment.add_header('Content-Disposition', 'attachment',
                              filename=('utf-8', '', os.path.basename(attach_path)))
        message.attach(attachment)
    message['Subject'] = title
    message['From'] = sender
    message['To'] = receiver

    try:
        # The context manager sends QUIT and closes the socket even on failure.
        with smtplib.SMTP(mail_host, 25) as smtp_obj:
            smtp_obj.login(mail_user, mail_pass)
            smtp_obj.sendmail(sender, receiver, message.as_string())
        print('send email success')
    except smtplib.SMTPException as e:
        print('send failed', e)
203
+
204
+
205
def convert_np_to_py(obj):
    """Recursively replace NumPy values with built-in Python equivalents.

    Dicts (values only), lists and plain tuples are walked recursively. NumPy
    scalars of any dtype become the matching Python scalar and NumPy arrays
    become nested lists. Everything else is returned unchanged.
    """
    if isinstance(obj, dict):
        return {k: convert_np_to_py(v) for k, v in obj.items()}
    if isinstance(obj, list):
        return [convert_np_to_py(v) for v in obj]
    # Exact type check on purpose: rebuilding a namedtuple as a bare tuple
    # would lose its field names.
    if type(obj) is tuple:
        return tuple(convert_np_to_py(v) for v in obj)
    if isinstance(obj, np.ndarray):
        return obj.tolist()
    if isinstance(obj, np.generic):
        # Covers every NumPy scalar (float32/64, int32/64, bool_, ...).
        return obj.item()
    return obj
214
+
215
+
216
def snake_to_camel(s: str) -> str:
    """Convert a snake_case name to camel case.

    The string is title-cased with ``str.title`` and the underscores are then
    removed, so the first word is capitalized as well
    (``"hello_world"`` -> ``"HelloWorld"``).

    :param s: snake case variable name
    :return: the converted name
    """
    title_cased = s.title()
    return title_cased.replace("_", "")
224
+
225
+
226
def camel_to_snake(s: str) -> str:
    """Convert a camel case name to snake_case.

    An underscore is inserted before every upper-case character except the
    first character of the string, then the result is lower-cased. An empty
    string is returned unchanged.

    :param s: camel case variable name
    :return: the snake_case name
    """
    pieces = (
        '_' + char if index and char.isupper() else char
        for index, char in enumerate(s)
    )
    return ''.join(pieces).lower()
233
+
234
+
235
+ # other ----------------------------------------------------------------------
236
+ # 统计词频
237
def calc_word_count(list_word, mode, path='tempcount.txt', sort_id=1, is_reverse=True):
    """Count word frequencies and write, print or return them.

    :param list_word: iterable of hashable items to count.
    :param mode: ``'w'`` appends ``word<TAB>count`` lines to ``path`` through
        ``writetxt_a``; ``'p'`` prints the same lines; ``'u'`` returns the
        sorted ``(word, count)`` list. Any other value does nothing.
    :param path: output file used by mode ``'w'``.
    :param sort_id: tuple index to sort by (0 = word, 1 = count).
    :param is_reverse: sort in descending order when true.
    :return: the sorted list for mode ``'u'``, otherwise ``None``.
    """
    from collections import Counter

    # Counter keeps first-seen order, so ties sort the same way as a plain dict.
    word_count = Counter(list_word)
    word_dict_sort = sorted(word_count.items(), key=lambda x: x[sort_id], reverse=is_reverse)
    if mode == 'w':
        for word, count in word_dict_sort:
            writetxt_a(f'{word}\t{count}\n', path)
    elif mode == 'p':
        for word, count in word_dict_sort:
            print(f'{word}\t{count}')
    elif mode == 'u':
        return word_dict_sort
    return None
253
+
254
+
255
+ # 字典去重
256
def dupl_dict(dict_list, key):
    """Drop dicts whose ``key`` value has already been seen, keeping order.

    :param dict_list: iterable of dicts that all contain ``key``.
    :param key: field whose value identifies duplicates.
    :return: list with the first dict for every distinct ``item[key]``.
    """
    new_dict_list = []
    # Hashable values are tracked in a set (O(1) lookups); unhashable ones
    # (lists, dicts, ...) fall back to a linear scan.
    # NOTE(review): a hashable value that compares equal to an unhashable one
    # (e.g. frozenset vs. set) is no longer treated as a duplicate.
    seen_hashable = set()
    seen_unhashable = []
    print('去重中...')
    for item in tqdm(dict_list):
        value = item[key]
        try:
            is_new = value not in seen_hashable
            if is_new:
                seen_hashable.add(value)
        except TypeError:
            is_new = value not in seen_unhashable
            if is_new:
                seen_unhashable.append(value)
        if is_new:
            new_dict_list.append(item)
    return new_dict_list
264
+
265
+
266
def multi_thread_run(_task, data):
    """Apply ``_task`` to every item of ``data`` in a thread pool.

    :param _task: callable taking one item.
    :param data: iterable of inputs; when it has no ``len()`` (e.g. a
        generator) the progress bar is shown without a total.
    :return: list of results in input order.
    """
    try:
        total = len(data)
    except TypeError:
        total = None
    with ThreadPoolExecutor() as executor:
        result = list(tqdm(executor.map(_task, data), total=total))
    return result
270
+
271
+
272
def del_special_char(sentence):
    """Remove the characters U+FEFF, U+00A0, U+3000 and U+E627 from ``sentence``.

    :param sentence: input string.
    :return: the string with every occurrence of those characters deleted.
    """
    special_chars = '\ufeff\xa0\u3000\ue627'
    # A single translate() pass replaces the chain of replace() calls.
    return sentence.translate({ord(char): None for char in special_chars})
277
+
278
+
279
def en_pun_2_zh_pun(sentence):
    """Placeholder for English-to-Chinese punctuation conversion.

    Not implemented: the input is ignored and ``None`` is returned.
    """
    # TODO: not written yet because of the quotation-mark problem
    return None
283
+
284
+
285
def spider(url):
    """Fetch a page and return its title and article text.

    Only URLs containing ``'baijiahao'`` are handled; any other URL yields
    ``None``.

    :param url: page address.
    :return: ``"<title>\\n<article text>"`` or ``None``.
    """
    if 'baijiahao' not in url:
        return None
    content = requests.get(url)
    html = pq.PyQuery(content.text)
    title = html('.index-module_articleTitle_28fPT').text()
    res = html('.index-module_articleWrap_2Zphx').text()
    # Remove the trailing footer as a whole suffix. str.rstrip() treats its
    # argument as a set of characters and could eat real article text.
    footer = '举报/反馈'
    if res.endswith(footer):
        res = res[:-len(footer)]
    return '{}\n{}'.format(title, res)
298
+
299
+
300
def eda(sentence):
    """POST ``sentence`` to the EDA endpoint and return the ``'eda'`` field.

    The request body is the JSON object ``{"sentence": sentence}``; the
    response is decoded as JSON.
    """
    endpoint = 'https://x.x.x.x:x/eda'
    payload = {"sentence": sentence}
    response = requests.post(endpoint, json=payload)
    return response.json()['eda']
305
+
306
+
307
def find_language(text):
    """Guess whether ``text`` is English from its first 50 characters.

    :param text: input string.
    :return: ``"en"`` when more than half of the inspected characters are
        ASCII letters, otherwise ``"not en"`` (including for empty input).
    """
    # TODO: replace with an open-source language-detection package
    passage = text[:50]
    if not passage:
        # Nothing to inspect; also avoids dividing by zero below.
        return "not en"
    letters = set(string.ascii_letters)
    count = sum(1 for c in passage if c in letters)
    if count / len(passage) > 0.5:
        return "en"
    return "not en"
323
+
324
+
325
def print_prf(y_true, y_pred, label=None):
    """Print precision, recall and F-score for every label.

    :param y_true: ground-truth labels.
    :param y_pred: predicted labels.
    :param label: labels to report, in output order. When omitted, the sorted
        union of the labels found in ``y_true`` and ``y_pred`` is used.

    One line ``<label>: <precision> <recall> <fscore>`` is printed per label,
    each metric formatted with five decimals.
    """
    if label is None:
        # The default used to crash at len(None); derive the labels instead.
        label = sorted(set(y_true) | set(y_pred))
    result = precision_recall_fscore_support(y_true=y_true, y_pred=y_pred, labels=label)

    for index, name in enumerate(label):
        # result holds (precision, recall, fscore, support); support is not printed.
        precision, recall, fscore = ('%.5f' % metric[index] for metric in result[:3])
        print('{}: {} {} {}'.format(name, precision, recall, fscore))
340
+
341
+
342
def print_cpu():
    """Print the CPU count reported by ``psutil.cpu_count()``."""
    print(psutil.cpu_count())
346
+
347
+
348
def stress_test(func, ipts):
    """Call ``func`` on every item of ``ipts`` concurrently.

    Same behavior as ``multi_thread_run``: a thread pool with a progress bar,
    results returned as a list in input order.

    :param func: callable taking one input.
    :param ipts: sized collection of inputs.
    :return: list of results.
    """
    # Delegate instead of duplicating the thread-pool code.
    return multi_thread_run(func, ipts)
352
+
353
+
354
def squeeze_list(high_dim_list):
    """Flatten one nesting level: ``[[1, 2], [3]]`` -> ``[1, 2, 3]``."""
    return [element for inner in high_dim_list for element in inner]
356
+
357
+
358
def unsqueeze_list(flatten_list, each_element_len):
    """Split a flat list into consecutive chunks of ``each_element_len`` items.

    The last chunk may be shorter when the length is not an exact multiple.
    Earlier versions silently dropped that remainder; it is now kept, giving
    the same result as ``split_list``.

    :param flatten_list: list to split.
    :param each_element_len: chunk size.
    :return: list of chunks.
    """
    return [
        flatten_list[start:start + each_element_len]
        for start in range(0, len(flatten_list), each_element_len)
    ]
363
+
364
def split_list(input_list, chunk_size):
    """Cut ``input_list`` into consecutive slices of at most ``chunk_size`` items."""
    chunks = []
    for start in range(0, len(input_list), chunk_size):
        stop = start + chunk_size
        chunks.append(input_list[start:stop])
    return chunks
367
+
368
def auto_close():
    """Keep the desktop session looking active and schedule a shutdown.

    Written for WeCom (Enterprise WeChat), which shows the user as away after
    15 minutes of inactivity. The function first registers a one-time
    ``schtasks`` task named ``shut`` that runs ``shutdown -s -f`` at 23:30,
    then loops forever: move the mouse to (970, 17), click, sleep 840 seconds.
    It never returns.
    """
    import pyautogui as pg
    import time
    import os

    shutdown_task = 'schtasks /create /tn shut /tr "shutdown -s -f" /sc once /st 23:30'
    os.system(shutdown_task)
    while True:
        pg.moveTo(970, 17, 2)
        pg.click()
        # 840 s = 14 minutes, just under the 15-minute idle threshold.
        time.sleep(840)
381
+
382
+
383
def tf_idf(corpus, save_path):
    """Compute corpus-wide TF-IDF weights and write them to ``save_path``.

    Every document is vectorized with ``CountVectorizer`` and weighted with
    ``TfidfTransformer``. The weights of each word are summed over all
    documents, and the ``"<word> <weight>"`` lines are written in descending
    weight order through ``writetxt_w_list``.

    :param corpus: iterable of documents (strings).
    :param save_path: output file path.
    """
    vectorizer = CountVectorizer()  # turns the texts into a term-count matrix
    transformer = TfidfTransformer()  # computes the tf-idf weight of every term
    # The inner fit_transform builds the count matrix, the outer one weights it.
    tfidf = transformer.fit_transform(vectorizer.fit_transform(corpus))

    # get_feature_names() no longer exists in recent scikit-learn releases;
    # prefer the replacement and fall back for old versions.
    if hasattr(vectorizer, 'get_feature_names_out'):
        words = vectorizer.get_feature_names_out()
    else:
        words = vectorizer.get_feature_names()

    weight = tfidf.toarray()  # weight[i][j] = tf-idf of word j in document i
    tfidfdict = {}
    for doc_weights in weight:
        for getword, getvalue in zip(words, doc_weights):
            if getvalue != 0:  # skip words absent from this document
                getword = str(getword)
                tfidfdict[getword] = tfidfdict.get(getword, 0.0) + float(getvalue)

    sorted_tfidf = sorted(tfidfdict.items(), key=lambda d: d[1], reverse=True)
    to_write = ['{} {}'.format(word, score) for word, score in sorted_tfidf]
    writetxt_w_list(to_write, save_path, num_lf=1)
403
+
404
+
405
class GaussDecay(object):
    """Gaussian decay scoring that blends document age with a relevance score.

    Only the time-based variant is implemented, and it always works with the
    built-in defaults.

    NOTE(review): the ``scale`` and ``offset`` constructor arguments are
    replaced by the fixed values 180 and 5 in ``translate``, and ``origin`` is
    stored but not used by ``translated_minus`` (which measures from the
    current time) — confirm this is intended.
    """

    def __init__(self, origin='2022-08-02', scale='90d', offset='5d', decay=0.5, task="time"):
        self.origin = origin
        self.task = task
        self.scale, self.offset = self.translate(scale, offset)
        self.decay = decay
        # Final score = time_coefficient * decay_score + related_coefficient * raw_score.
        self.time_coefficient = 0.6
        self.related_coefficient = 0.4

    def translate(self, scale, offset):
        """Convert the domain-specific inputs to standard numeric values.

        :return: ``(scale, offset)``; currently always ``(180, 5)`` whatever
            the task or the arguments are.
        """
        return 180, 5

    @staticmethod
    def translated_minus(field_value):
        """Return the whole days between now and ``field_value``.

        :param field_value: timestamp string in ``'%Y-%m-%d %H:%M:%S'`` format.
        """
        now = datetime.datetime.now()
        parsed = datetime.datetime.strptime(field_value, '%Y-%m-%d %H:%M:%S')
        elapsed = now - parsed
        return elapsed.days

    def calc_exp(self):
        """Exponential decay (not implemented)."""
        pass

    def calc_liner(self):
        """Linear decay (not implemented)."""
        pass

    def calc_gauss(self, raw_score, field_value):
        r"""Blend a Gaussian time-decay score with ``raw_score``.

        $$S(doc)=exp(-\frac{max(0,|fieldvalues_{doc}-origin|-offset)^2}{2σ^2})$$
        $$σ^2=-scale^2/(2·ln(decay))$$

        :param raw_score: relevance score to blend in.
        :param field_value: document timestamp, ``'%Y-%m-%d %H:%M:%S'``.
        :return: ``time_coefficient * S + related_coefficient * raw_score``
            rounded to 7 decimals.
        """
        age_in_days = abs(self.translated_minus(field_value))
        numerator = max(0, (age_in_days - self.offset)) ** 2
        log_decay = math.log(self.decay, math.e)
        sigma_square = -1 * self.scale ** 2 / (2 * log_decay)
        denominator = 2 * sigma_square
        decay_score = math.exp(-1 * numerator / denominator)
        blended = self.time_coefficient * decay_score + self.related_coefficient * raw_score
        return round(blended, 7)
456
+
457
+
458
if __name__ == '__main__':
    # Demo: score a document dated 2021-05-29 with the default settings.
    demo_decay = GaussDecay()
    demo_score = demo_decay.calc_gauss(raw_score=1, field_value="2021-05-29 14:31:13")
    print(demo_score)
462
+ # res = gauss_decay.calc_gauss(raw_score=1, field_value="2022-05-29 14:31:13")
463
+ # print(res)
464
+ # res = gauss_decay.calc_gauss(raw_score=1, field_value="2022-05-29 14:31:13")
465
+ # print(res)
466
+ # res = gauss_decay.calc_gauss(raw_score=1, field_value="2022-05-29 14:31:13")
467
+ # print(res)
468
+
469
+ # 常用函数参考
470
+ # import tensorflow as tf
471
+ #
472
+ # gpu_options = tf.compat.v1.GPUOptions(allow_growth=True)
473
+ # sess = tf.compat.v1.Session(config=tf.compat.v1.ConfigProto(gpu_options=gpu_options))
474
+ # for gpu in tf.config.experimental.list_physical_devices('GPU'):
475
+ # tf.config.experimental.set_memory_growth()