nlpertools 1.0.6.dev0__py3-none-any.whl → 1.0.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nlpertools/__init__.py +3 -4
- nlpertools/cli.py +87 -0
- nlpertools/data_client.py +56 -17
- nlpertools/dataprocess.py +28 -12
- nlpertools/draw/__init__.py +0 -0
- nlpertools/draw/draw.py +83 -0
- nlpertools/draw/math_func.py +33 -0
- nlpertools/get_2fa.py +0 -0
- nlpertools/io/dir.py +34 -2
- nlpertools/io/file.py +15 -9
- nlpertools/ml.py +52 -24
- nlpertools/other.py +135 -24
- {nlpertools-1.0.6.dev0.dist-info → nlpertools-1.0.8.dist-info}/METADATA +29 -8
- {nlpertools-1.0.6.dev0.dist-info → nlpertools-1.0.8.dist-info}/RECORD +18 -12
- {nlpertools-1.0.6.dev0.dist-info → nlpertools-1.0.8.dist-info}/WHEEL +1 -1
- nlpertools-1.0.8.dist-info/entry_points.txt +2 -0
- {nlpertools-1.0.6.dev0.dist-info → nlpertools-1.0.8.dist-info}/LICENSE +0 -0
- {nlpertools-1.0.6.dev0.dist-info → nlpertools-1.0.8.dist-info}/top_level.txt +0 -0
nlpertools/__init__.py
CHANGED
@@ -3,6 +3,7 @@
|
|
3
3
|
# @Author : youshu.Ji
|
4
4
|
from .algo.kmp import *
|
5
5
|
from .data_structure.base_structure import *
|
6
|
+
from .draw import *
|
6
7
|
from .dataprocess import *
|
7
8
|
from .io.dir import *
|
8
9
|
from .io.file import *
|
@@ -15,10 +16,8 @@ from .reminder import *
|
|
15
16
|
from .utils_for_nlpertools import *
|
16
17
|
from .wrapper import *
|
17
18
|
from .monitor import *
|
19
|
+
from .cli import *
|
18
20
|
|
19
|
-
import os
|
20
21
|
|
21
22
|
|
22
|
-
|
23
|
-
|
24
|
-
__version__ = '1.0.5'
|
23
|
+
__version__ = '1.0.8'
|
nlpertools/cli.py
ADDED
@@ -0,0 +1,87 @@
|
|
1
|
+
import argparse
|
2
|
+
import os
|
3
|
+
import uuid
|
4
|
+
import sys
|
5
|
+
|
6
|
+
import pyotp
|
7
|
+
|
8
|
+
"""
|
9
|
+
如何Debug cli.py
|
10
|
+
"""
|
11
|
+
|
12
|
+
|
13
|
+
def git_push(max_retries=None):
    """
    Push to ``origin main``, re-running the command until git succeeds.

    Pushing to GitHub from networks inside mainland China fails often, so the
    push is simply retried until it exits cleanly.

    :param max_retries: maximum number of retries after the first attempt;
        ``None`` (the default) keeps retrying indefinitely.
    :return: True once a push succeeded, False if the retry budget ran out.
    """
    num = 0
    while True:
        print("retry num: {}".format(num))
        # os.system returns the command's exit status (an int), never git's
        # textual output, so success must be detected from the status code.
        status = os.system("git push --set-upstream origin main")
        print(str(status))
        if status == 0:
            print("scucess")
            return True
        if max_retries is not None and num >= max_retries:
            return False
        num += 1
|
26
|
+
|
27
|
+
|
28
|
+
def git_pull(max_retries=None):
    """
    Run ``git pull``, re-running the command until git succeeds.

    Pulling from GitHub from networks inside mainland China fails often, so
    the pull is simply retried until it exits cleanly.

    :param max_retries: maximum number of retries after the first attempt;
        ``None`` (the default) keeps retrying indefinitely.
    :return: True once a pull succeeded, False if the retry budget ran out.
    """
    num = 0
    while True:
        print("retry num: {}".format(num))
        # os.system returns the command's exit status (an int), never git's
        # "fatal"/"error" text, so success must be detected from the status.
        status = os.system("git pull")
        print(str(status))
        if status == 0:
            print("scucess")
            return True
        if max_retries is not None and num >= max_retries:
            return False
        num += 1
|
41
|
+
|
42
|
+
|
43
|
+
def get_mac_address():
    """
    Print and return the node id reported by ``uuid.getnode()`` as a MAC string.

    The value is formatted as six colon-separated lowercase hex octets. The
    printed notice warns that the address is not guaranteed to be accurate.

    :return: the formatted address, e.g. ``"aa:bb:cc:dd:ee:ff"``
    """
    node_hex = uuid.UUID(int=uuid.getnode()).hex[-12:]
    octets = []
    for start in range(0, 11, 2):
        octets.append(node_hex[start:start + 2])
    mac_address = ":".join(octets)
    print("mac address 不一定准确")
    print(mac_address)
    return mac_address
|
49
|
+
|
50
|
+
|
51
|
+
def get_2af_value(key):
    """
    Print the given secret and the current TOTP code derived from it.

    Original author's note: the key is expected to be 7 characters long
    (NOTE(review): nothing in this function enforces that - confirm).

    :param key: secret passed straight to ``pyotp.TOTP``
    """
    print(key)
    current_code = pyotp.TOTP(key).now()
    print(current_code)
|
58
|
+
|
59
|
+
|
60
|
+
def main():
    """
    Command-line entry point: parse the flags and run the first matching action.

    Flags are checked in the order ``--gitpush``, ``--gitpull``,
    ``--mac_address``, ``--get_2fa`` (which also needs ``--get_2fa_key``).
    When no action flag is given, a usage hint is printed.
    """
    parser = argparse.ArgumentParser(description="CLI tool for git operations and getting MAC address.")
    parser.add_argument('--gitpush', action='store_true', help='Perform git push operation.')
    parser.add_argument('--gitpull', action='store_true', help='Perform git pull operation.')
    parser.add_argument('--mac_address', action='store_true', help='Get the MAC address.')

    parser.add_argument('--get_2fa', action='store_true', help='Get the 2fa value.')
    parser.add_argument('--get_2fa_key', type=str, help='Secret key used to compute the 2fa value.')

    args = parser.parse_args()

    if args.gitpush:
        git_push()
    elif args.gitpull:
        git_pull()
    elif args.mac_address:
        get_mac_address()
    elif args.get_2fa:
        if args.get_2fa_key:
            get_2af_value(args.get_2fa_key)
        else:
            print("Please provide a key as an argument.")
    else:
        # List the flags that actually exist (the old hint named a
        # non-existent --get_mac_address option).
        print("No operation specified. Use --gitpush, --gitpull, --mac_address or --get_2fa.")
|
84
|
+
|
85
|
+
|
86
|
+
if __name__ == '__main__':
|
87
|
+
main()
|
nlpertools/data_client.py
CHANGED
@@ -1,3 +1,4 @@
|
|
1
|
+
#encoding=utf-8
|
1
2
|
# !/usr/bin/python3.8
|
2
3
|
# -*- coding: utf-8 -*-
|
3
4
|
# @Author : youshu.Ji
|
@@ -5,9 +6,11 @@ import datetime
|
|
5
6
|
import json
|
6
7
|
import logging
|
7
8
|
|
8
|
-
from . import DB_CONFIG_FILE
|
9
9
|
from .io.file import read_yaml
|
10
10
|
from .utils.package import *
|
11
|
+
import os
|
12
|
+
|
13
|
+
DB_CONFIG_FILE = os.path.join(os.path.dirname(__file__), "default_db_config.yml")
|
11
14
|
|
12
15
|
# import aioredis
|
13
16
|
# import happybase
|
@@ -28,21 +31,24 @@ class Neo4jOps(object):
|
|
28
31
|
NEO4J_TIMEOUT = 0.3
|
29
32
|
pass
|
30
33
|
|
34
|
+
|
31
35
|
class SqliteOps(object):
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
conn.
|
36
|
+
pass
|
37
|
+
# import sqlite3
|
38
|
+
# database_path = r'xx.db'
|
39
|
+
# conn = sqlite3.connect(database_path)
|
40
|
+
# c = conn.cursor()
|
41
|
+
# sql = "select name from sqlite_master where type='table' order by name"
|
42
|
+
# c.execute(sql)
|
43
|
+
# print(c.fetchall())
|
44
|
+
# sql = "select * from typecho_contents"
|
45
|
+
# c.execute(sql)
|
46
|
+
# res = c.fetchall()
|
47
|
+
# print(res[3])
|
48
|
+
#
|
49
|
+
# conn.commit()
|
50
|
+
# conn.close()
|
51
|
+
|
46
52
|
|
47
53
|
class MysqlOps(object):
|
48
54
|
import pandas as pd
|
@@ -116,6 +122,41 @@ class EsOps(object):
|
|
116
122
|
print(f"批量保存数据: {_res}")
|
117
123
|
|
118
124
|
|
125
|
+
class MongoDB_BETA:
    """
    Minimal MongoDB helper bound to one database and one collection.

    The connection is opened by :meth:`connect`; the data methods connect on
    first use if that has not happened yet. The object can also be used as a
    context manager, which connects on entry and closes on exit.
    """

    def __init__(self, host='localhost', port=27017, db_name=None, collection_name=None):
        """
        Store the connection settings; no connection is opened here.

        :param host: MongoDB host name
        :param port: MongoDB port
        :param db_name: name of the database to use
        :param collection_name: name of the collection to use
        """
        self.host = host
        self.port = port
        self.db_name = db_name
        self.collection_name = collection_name
        self.client = None
        self.db = None
        self.collection = None

    def __enter__(self):
        self.connect()
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()
        return False

    def connect(self):
        """Open the client and resolve the configured database and collection."""
        # Imported here so MongoClient is guaranteed to be in scope and the
        # module stays importable when pymongo is not installed.
        from pymongo import MongoClient

        self.client = MongoClient(self.host, self.port)
        self.db = self.client[self.db_name]
        self.collection = self.db[self.collection_name]

    def close(self):
        """Close the client if one is open and forget the cached handles."""
        if self.client is not None:
            self.client.close()
        self.client = None
        self.db = None
        self.collection = None

    def _ensure_connected(self):
        # Compare with None explicitly: pymongo collections do not support
        # truth-value testing.
        if self.collection is None:
            self.connect()

    def insert_data(self, data):
        """
        Insert one document, or every document of a list.

        :param data: a single document, or a list of documents; an empty list
            is a no-op (``insert_many`` rejects empty input)
        """
        if isinstance(data, list):
            if not data:
                return
            self._ensure_connected()
            self.collection.insert_many(data)
        else:
            self._ensure_connected()
            self.collection.insert_one(data)

    def check_data_exists(self, query):
        """
        Check whether at least one document matches the query.

        :param query: query filter
        :return: bool, True when a matching document exists
        """
        self._ensure_connected()
        return self.collection.count_documents(query) > 0
|
157
|
+
|
158
|
+
|
159
|
+
|
119
160
|
class MongoOps(object):
|
120
161
|
from pymongo import MongoClient
|
121
162
|
def __init__(self, config=global_db_config["mongo"]):
|
@@ -348,8 +389,6 @@ class KafkaOps(object):
|
|
348
389
|
print(recv)
|
349
390
|
|
350
391
|
|
351
|
-
|
352
|
-
|
353
392
|
class MilvusOps(object):
|
354
393
|
def __init__(self, config=global_db_config.milvus):
|
355
394
|
from pymilvus import connections, Collection
|
nlpertools/dataprocess.py
CHANGED
@@ -55,9 +55,9 @@ class Pattern:
|
|
55
55
|
# 中文人名
|
56
56
|
chinese_name_pattern = "(?:[\u4e00-\u9fa5·]{2,3})"
|
57
57
|
# 英文人名
|
58
|
-
english_name_pattern = "(^[a-zA-Z][a-zA-Z\s]{0,20}[a-zA-Z]$)"
|
58
|
+
english_name_pattern = r"(^[a-zA-Z][a-zA-Z\s]{0,20}[a-zA-Z]$)"
|
59
59
|
# 纯数字
|
60
|
-
pure_num_pattern = "\d+"
|
60
|
+
pure_num_pattern = r"\d+"
|
61
61
|
# xxxx图/表 之类的表述
|
62
62
|
pic_table_descript_pattern = ".{1,15}图"
|
63
63
|
|
@@ -66,20 +66,20 @@ class Pattern:
|
|
66
66
|
hlink_pattern = (
|
67
67
|
r"(https?|ftp|file)://[-A-Za-z0-9+&@#/%?=~_|!:,.;]+[-A-Za-z0-9+&@#/%=~_|]"
|
68
68
|
)
|
69
|
-
http_pattern = "(http|https):\/\/([\w.]+\/?)\S*/\S*"
|
69
|
+
http_pattern = r"(http|https):\/\/([\w.]+\/?)\S*/\S*"
|
70
70
|
# 邮箱
|
71
|
-
email_pattern = "[A-Za-z0-9\u4e00-\u9fa5]+@[a-zA-Z0-9_-]+(\.[a-zA-Z0-9_-]+)+"
|
71
|
+
email_pattern = r"[A-Za-z0-9\u4e00-\u9fa5]+@[a-zA-Z0-9_-]+(\.[a-zA-Z0-9_-]+)+"
|
72
72
|
# html 可能过于严格了
|
73
|
-
html_pattern = "<[\s\S]*?>"
|
73
|
+
html_pattern = r"<[\s\S]*?>"
|
74
74
|
# 重复 “asdasdasdasd”
|
75
75
|
repeat_pattern = "(.)\1+"
|
76
76
|
# 日期
|
77
|
-
day_time_pattern = "\d{1,4}(-)(1[0-2]|0?[1-9])\1(0?[1-9]|[1-2]\d|30|31)"
|
77
|
+
day_time_pattern = r"\d{1,4}(-)(1[0-2]|0?[1-9])\1(0?[1-9]|[1-2]\d|30|31)"
|
78
78
|
# 小时
|
79
|
-
hour_time_pattern = "(?:[01]\d|2[0-3]):[0-5]\d:[0-5]\d"
|
79
|
+
hour_time_pattern = r"(?:[01]\d|2[0-3]):[0-5]\d:[0-5]\d"
|
80
80
|
# 股票
|
81
81
|
stock_pattern = (
|
82
|
-
"(s[hz]|S[HZ])(000[\d]{3}|002[\d]{3}|300[\d]{3}|600[\d]{3}|60[\d]{4})"
|
82
|
+
r"(s[hz]|S[HZ])(000[\d]{3}|002[\d]{3}|300[\d]{3}|600[\d]{3}|60[\d]{4})"
|
83
83
|
)
|
84
84
|
|
85
85
|
# 一般是需要替换的
|
@@ -91,7 +91,7 @@ class Pattern:
|
|
91
91
|
# 微博视频等
|
92
92
|
weibo_pattern = r"([\s]\w+(的微博视频)|#|【|】|转发微博)"
|
93
93
|
# @
|
94
|
-
at_pattern = "@\w+"
|
94
|
+
at_pattern = r"@\w+"
|
95
95
|
|
96
96
|
# from https://github.com/bigscience-workshop/data-preparation pii
|
97
97
|
year_patterns = [
|
@@ -116,7 +116,7 @@ class Pattern:
|
|
116
116
|
ipv4_pattern = r'(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)(?:\.(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)){3}'
|
117
117
|
ipv6_pattern = r'(?:[0-9a-fA-F]{1,4}:){7,7}[0-9a-fA-F]{1,4}|(?:[0-9a-fA-F]{1,4}:){1,7}:|(?:[0-9a-fA-F]{1,4}:){1,6}:[0-9a-fA-F]{1,4}|(?:[0-9a-fA-F]{1,4}:){1,5}(?::[0-9a-fA-F]{1,4}){1,2}|(?:[0-9a-fA-F]{1,4}:){1,4}(?::[0-9a-fA-F]{1,4}){1,3}|(?:[0-9a-fA-F]{1,4}:){1,3}(?::[0-9a-fA-F]{1,4}){1,4}|(?:[0-9a-fA-F]{1,4}:){1,2}(?::[0-9a-fA-F]{1,4}){1,5}|[0-9a-fA-F]{1,4}:(?:(?::[0-9a-fA-F]{1,4}){1,6})|:(?:(?::[0-9a-fA-F]{1,4}){1,7}|:)|fe80:(?::[0-9a-fA-F]{0,4}){0,4}%[0-9a-zA-Z]{1,}|::(?:ffff(?::0{1,4}){0,1}:){0,1}(?:(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3,3}(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])|(?:[0-9a-fA-F]{1,4}:){1,4}:(?:(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3,3}(25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])'
|
118
118
|
ip_pattern = r"(?:^|[\b\s@?,!;:\'\")(.\p{Han}])(" + r"|".join(
|
119
|
-
[ipv4_pattern, ipv6_pattern]) + ")(?:$|[\s@,?!;:\'\"(.\p{Han}])"
|
119
|
+
[ipv4_pattern, ipv6_pattern]) + r")(?:$|[\s@,?!;:\'\"(.\p{Han}])"
|
120
120
|
|
121
121
|
# https://regex101.com/r/EpA5B7/1
|
122
122
|
email_line_pattern = r'''
|
@@ -466,7 +466,7 @@ class TextProcess(object):
|
|
466
466
|
p = re.compile(pattern, re.S)
|
467
467
|
text = p.sub("", text)
|
468
468
|
|
469
|
-
dr = re.compile("@\w+", re.S)
|
469
|
+
dr = re.compile(r"@\w+", re.S)
|
470
470
|
text = dr.sub("", text)
|
471
471
|
|
472
472
|
return text
|
@@ -527,7 +527,7 @@ class TextProcess(object):
|
|
527
527
|
text = re.sub(pattern, replace, text)
|
528
528
|
return text
|
529
529
|
|
530
|
-
def calc_proportion_zh(self,text):
|
530
|
+
def calc_proportion_zh(self, text):
|
531
531
|
text = text.strip()
|
532
532
|
# 如果是中国英文的情况,并且英文有空格分开
|
533
533
|
if " " in text:
|
@@ -538,6 +538,8 @@ class TextProcess(object):
|
|
538
538
|
chinese_count += 1
|
539
539
|
else:
|
540
540
|
pass
|
541
|
+
|
542
|
+
|
541
543
|
class CopyFunc():
|
542
544
|
# from https://github.com/lemon234071/clean-dialog
|
543
545
|
def is_chinese_char(cp):
|
@@ -597,6 +599,20 @@ def convert_basic2fullwidth(sentence):
|
|
597
599
|
new_sentence += char
|
598
600
|
return new_sentence
|
599
601
|
|
602
|
+
|
603
|
+
def clean_illegal_chars_for_excel(df):
    """
    Return a copy of ``df`` with control characters stripped from string cells.

    openpyxl refuses to write some ASCII control characters to Excel files, so
    they are removed from every string cell; non-string cells are left as is.

    :param df: pandas DataFrame to clean
    :return: new DataFrame with the illegal characters removed
    """
    # Control characters in the ranges rejected when writing Excel files.
    illegal_chars = re.compile(r'[\x00-\x08\x0B\x0C\x0E-\x1F]')

    def remove_illegal_chars(s):
        if isinstance(s, str):
            return illegal_chars.sub('', s)
        return s

    # DataFrame.map only exists on pandas >= 2.1; older releases provide the
    # same element-wise operation as applymap. Check the class, not the
    # instance, so a column that happens to be named "map" cannot interfere.
    if hasattr(type(df), "map"):
        return df.map(remove_illegal_chars)
    return df.applymap(remove_illegal_chars)
|
614
|
+
|
615
|
+
|
600
616
|
if __name__ == "__main__":
|
601
617
|
pattern_for_filter = [
|
602
618
|
Pattern.redundancy_space_pattern,
|
File without changes
|
nlpertools/draw/draw.py
ADDED
@@ -0,0 +1,83 @@
|
|
1
|
+
#!/usr/bin/python3.8
|
2
|
+
# -*- coding: utf-8 -*-
|
3
|
+
# @Author : youshu.Ji
|
4
|
+
from ..utils.package import plt
|
5
|
+
|
6
|
+
|
7
|
+
def confused_matrix(confuse_matrix, ticklabels=None, save_path='tmp.jpg'):
    """
    Draw a confusion matrix as an annotated heatmap, show it and save it.

    :param confuse_matrix: 2-D matrix of values passed to ``seaborn.heatmap``
    :param ticklabels: labels used on both axes; defaults to the historical
        three-class labels ``["l1", "l2", "l31"]``
    :param save_path: file the figure is saved to (default ``'tmp.jpg'``)
    """
    import seaborn as sns

    if ticklabels is None:
        # Previous hard-coded labels, kept as the default for compatibility.
        ticklabels = ["l1", "l2", "l31"]

    sns.set()
    f, ax = plt.subplots()
    # Draw the heatmap.
    sns.heatmap(confuse_matrix, annot=True, fmt=".3g", ax=ax, cmap='rainbow',
                xticklabels=ticklabels, yticklabels=ticklabels)

    ax.set_title('confusion matrix')  # title
    ax.set_xlabel('predict')  # x axis
    ax.set_ylabel('true')  # y axis
    plt.show()

    f.savefig(save_path, bbox_inches='tight')
|
21
|
+
|
22
|
+
|
23
|
+
def plot_histogram(data, bin_size, max_value=1000):
    """
    Plot a histogram of ``data`` with one extra overflow bin labelled "∞".

    Values above ``max_value`` are all counted in the final bin. A dashed red
    line marks the mean of the raw (uncapped) data.

    :param data: sequence of numbers to plot
    :param bin_size: width of each bin
    :param max_value: values above this are counted in the overflow bin
        (default 1000, the previous hard-coded cap)
    :return: None; the figure is shown with ``plt.show()``
    """
    import matplotlib.pyplot as plt
    import numpy as np
    from matplotlib.ticker import MaxNLocator

    # Put every overflowing value in the middle of the bin that starts at
    # max_value, so it is counted for any bin_size (the old fixed marker 1003
    # fell outside the bins when bin_size was very small).
    overflow_marker = max_value + bin_size / 2
    processed_data = [length if length <= max_value else overflow_marker for length in data]

    plt.figure(figsize=(12, 8))
    bins = np.arange(0, max_value + 2 * bin_size, bin_size)
    # Draw the histogram.
    n, _, patches = plt.hist(processed_data, bins=bins, edgecolor='black', color='skyblue', alpha=0.7,
                             linewidth=0)

    # Label the right-most tick "∞" to mark the overflow bin.
    plt.gca().set_xticks(bins)
    plt.gca().set_xticklabels([str(i) for i in plt.xticks()[0][:-1]] + ["∞"])

    # The mean is undefined for empty input, so the marker is skipped then.
    if len(data) > 0:
        mean_val = np.mean(data)
        plt.axvline(mean_val, color='red', linestyle='dashed', linewidth=1)
        plt.text(mean_val + bin_size / 10, max(n) * 0.9, f'Mean: {mean_val:.2f}', color='red')

    # Title and axis labels.
    plt.title('Module Line Number Distribution', fontsize=16, fontweight='bold')
    plt.xlabel('module line number', fontsize=14)
    plt.ylabel('frequency', fontsize=14)

    # Grid.
    plt.grid(True, linestyle='--', alpha=0.6)

    # Tick font sizes.
    plt.xticks(fontsize=12)
    plt.yticks(fontsize=12)

    # Write the count above every bar.
    for count, patch in zip(n, patches):
        plt.text(patch.get_x() + patch.get_width() / 2, patch.get_height(),
                 str(int(count)), ha='center', va='bottom', fontsize=12)
    plt.gca().yaxis.set_major_locator(MaxNLocator(integer=True))
    # Show the chart.
    plt.show()
|
77
|
+
|
78
|
+
|
79
|
+
if __name__ == '__main__':
|
80
|
+
# 调整区间大小
|
81
|
+
bin_size = 50
|
82
|
+
# 示例模块长度数据
|
83
|
+
plot_histogram([1, 100, 999, 1000, 1002, 1100, 1150], bin_size)
|
@@ -0,0 +1,33 @@
|
|
1
|
+
# 数学函数
|
2
|
+
def draw_log():
    """
    Plot the natural logarithm over [0.1, 10] and show the figure.

    Dashed gray guide lines are drawn at x=1 and y=0, and the x axis gets a
    major tick every 1 unit, formatted with one decimal place.
    """
    import matplotlib.pyplot as plt
    import numpy as np
    from matplotlib.ticker import MultipleLocator, FormatStrFormatter

    # Sample data; np.log is the natural logarithm (base e).
    xs = np.linspace(0.1, 10, 100)
    ys = np.log(xs)

    # New figure and axes with the curve drawn on them.
    _figure, axes = plt.subplots()
    axes.plot(xs, ys)

    # Title and axis labels.
    axes.set_title("Logarithmic Function")
    axes.set_xlabel("x")
    axes.set_ylabel("log(x)")

    # One major tick per unit on the x axis, shown with one decimal place.
    x_axis = axes.xaxis
    x_axis.set_major_locator(MultipleLocator(1))
    x_axis.set_major_formatter(FormatStrFormatter("%.1f"))

    # Dashed guide lines at x=1 and y=0.
    axes.axvline(x=1, linestyle="--", color="gray")
    axes.axhline(y=0, linestyle="--", color="gray")

    # Show the figure.
    plt.show()
|
nlpertools/get_2fa.py
ADDED
File without changes
|
nlpertools/io/dir.py
CHANGED
@@ -10,7 +10,30 @@ def j_mkdir(name):
|
|
10
10
|
os.makedirs(name, exist_ok=True)
|
11
11
|
|
12
12
|
|
13
|
-
def
|
13
|
+
def j_walk(name, suffix=None):
    """
    Recursively collect the paths of all files below a directory.

    :param name: directory to walk
    :param suffix: when given (and non-empty), keep only paths ending with it
    :return: list of file paths, each joined with the directory it was found in
    """
    collected = []
    for folder, _subdirs, filenames in os.walk(name):
        for filename in filenames:
            full_path = os.path.join(folder, filename)
            if suffix and not full_path.endswith(suffix):
                continue
            collected.append(full_path)
    return collected
|
21
|
+
|
22
|
+
|
23
|
+
def windows_to_wsl_path(windows_path):
    """
    Convert a Windows path to the equivalent WSL path.

    A path starting with a drive letter (``C:\\...`` or ``C:/...``) is mapped
    to ``/mnt/<lowercase letter>/...``. Any other path only has its
    backslashes replaced with forward slashes.

    :param windows_path: Windows-style path string
    :return: WSL-style path string
    """
    has_drive = (
        len(windows_path) >= 3
        and windows_path[0].lower() in 'abcdefghijklmnopqrstuvwxyz'
        and windows_path[1] == ':'
        and windows_path[2] in '\\/'
    )
    if has_drive:
        # Map the drive letter to its mount point under /mnt.
        drive_letter = windows_path[0].lower()
        rest = windows_path[2:].replace('\\', '/')
        return f'/mnt/{drive_letter}{rest}'
    # No drive letter: only normalize the separators. The former
    # .replace("'", "\'") call was a no-op ("\'" is just "'"), so quotes are
    # intentionally left untouched here.
    return windows_path.replace('\\', '/')
|
34
|
+
|
35
|
+
|
36
|
+
def get_filename(path, suffix=True) -> str:
|
14
37
|
"""
|
15
38
|
返回路径最后的文件名
|
16
39
|
:param path:
|
@@ -18,11 +41,20 @@ def get_filename(path) -> str:
|
|
18
41
|
"""
|
19
42
|
# path = r'***/**/***.txt'
|
20
43
|
filename = os.path.split(path)[-1]
|
44
|
+
if not suffix:
|
45
|
+
filename = filename.split('.')[0]
|
21
46
|
return filename
|
22
47
|
|
23
48
|
|
24
49
|
def j_listdir(dir_name, including_dir=True):
    """
    List the entries of a directory.

    :param dir_name: directory to list
    :param including_dir: when True, each entry is joined with ``dir_name``;
        when False, the bare entry names are returned
    :return: list of entries
    """
    entries = os.listdir(dir_name)
    if not including_dir:
        return list(entries)
    return [os.path.join(dir_name, entry) for entry in entries]
|
55
|
+
|
56
|
+
|
57
|
+
def j_listdir_yield(dir_name, including_dir=True):
|
26
58
|
filenames = os.listdir(dir_name)
|
27
59
|
for filename in filenames:
|
28
60
|
if including_dir:
|
nlpertools/io/file.py
CHANGED
@@ -5,7 +5,6 @@ import codecs
|
|
5
5
|
import json
|
6
6
|
import pickle
|
7
7
|
import random
|
8
|
-
import time
|
9
8
|
from itertools import (takewhile, repeat)
|
10
9
|
import pandas as pd
|
11
10
|
# import omegaconf
|
@@ -15,10 +14,16 @@ from ..utils.package import *
|
|
15
14
|
LARGE_FILE_THRESHOLD = 1e5
|
16
15
|
|
17
16
|
|
17
|
+
def safe_filename(filename: str) -> str:
    """
    Replace characters that are not allowed in file names with underscores.

    The replaced characters are: backslash, slash, colon, asterisk, question
    mark, double quote, angle brackets and the pipe symbol.

    :param filename: proposed file name
    :return: the name with every forbidden character replaced by ``_``
    """
    forbidden = '\\/:*?"<>|'
    replacement_table = str.maketrans(forbidden, '_' * len(forbidden))
    return filename.translate(replacement_table)
|
21
|
+
|
22
|
+
|
18
23
|
def read_yaml(path, omega=False):
    """
    Load a YAML file.

    :param path: path of the YAML file (read as UTF-8 unless ``omega`` is set)
    :param omega: when True, load through ``omegaconf.OmegaConf.load`` instead
    :return: the parsed content
    """
    if omega:
        return omegaconf.OmegaConf.load(path)
    # Use a context manager so the file handle is closed after parsing.
    with codecs.open(path, encoding='utf-8') as f:
        # NOTE(review): FullLoader should only be used on trusted files.
        return yaml.load(f, Loader=yaml.FullLoader)
|
22
27
|
|
23
28
|
|
24
29
|
def _merge_file(filelist, save_filename, shuffle=False):
|
@@ -52,7 +57,7 @@ load_from_json
|
|
52
57
|
|
53
58
|
|
54
59
|
# 读txt文件 一次全读完 返回list 去换行
|
55
|
-
def readtxt_list_all_strip(path, encoding='utf-8'):
|
60
|
+
def readtxt_list_all_strip(path, encoding='utf-8') -> list:
|
56
61
|
file_line_num = iter_count(path)
|
57
62
|
lines = []
|
58
63
|
with codecs.open(path, 'r', encoding) as r:
|
@@ -67,7 +72,7 @@ def readtxt_list_all_strip(path, encoding='utf-8'):
|
|
67
72
|
|
68
73
|
|
69
74
|
# 读txt 一次读一行 最后返回list
|
70
|
-
def readtxt_list_each(path):
|
75
|
+
def readtxt_list_each(path) -> list:
|
71
76
|
lines = []
|
72
77
|
with codecs.open(path, 'r', 'utf-8') as r:
|
73
78
|
line = r.readline()
|
@@ -77,7 +82,7 @@ def readtxt_list_each(path):
|
|
77
82
|
return lines
|
78
83
|
|
79
84
|
|
80
|
-
def readtxt_list_each_strip(path):
|
85
|
+
def readtxt_list_each_strip(path) -> list:
|
81
86
|
"""
|
82
87
|
yield方法
|
83
88
|
"""
|
@@ -89,14 +94,14 @@ def readtxt_list_each_strip(path):
|
|
89
94
|
|
90
95
|
|
91
96
|
# 读txt文件 一次全读完 返回list
|
92
|
-
def readtxt_list_all(path):
|
97
|
+
def readtxt_list_all(path) -> list:
|
93
98
|
with codecs.open(path, 'r', 'utf-8') as r:
|
94
99
|
lines = r.readlines()
|
95
100
|
return lines
|
96
101
|
|
97
102
|
|
98
103
|
# 读byte文件 读成一条string
|
99
|
-
def readtxt_byte(path, encoding="utf-8"):
|
104
|
+
def readtxt_byte(path, encoding="utf-8") -> str:
|
100
105
|
with codecs.open(path, 'rb') as r:
|
101
106
|
lines = r.read()
|
102
107
|
lines = lines.decode(encoding)
|
@@ -104,7 +109,7 @@ def readtxt_byte(path, encoding="utf-8"):
|
|
104
109
|
|
105
110
|
|
106
111
|
# 读txt文件 读成一条string
|
107
|
-
def readtxt_string(path, encoding="utf-8"):
|
112
|
+
def readtxt_string(path, encoding="utf-8") -> str:
|
108
113
|
with codecs.open(path, 'r', encoding) as r:
|
109
114
|
lines = r.read()
|
110
115
|
return lines.replace('\r', '')
|
@@ -261,6 +266,7 @@ def save_to_mongo():
|
|
261
266
|
"""
|
262
267
|
pass
|
263
268
|
|
269
|
+
|
264
270
|
def load_from_mongo():
|
265
271
|
pass
|
266
272
|
|
@@ -274,4 +280,4 @@ def unmerge_cells_df(df) -> pd.DataFrame:
|
|
274
280
|
else:
|
275
281
|
values.append(i)
|
276
282
|
df[column] = values
|
277
|
-
return df
|
283
|
+
return df
|
nlpertools/ml.py
CHANGED
@@ -18,9 +18,8 @@ from .utils.package import *
|
|
18
18
|
|
19
19
|
|
20
20
|
def calc_llm_train_activation_memory(
|
21
|
-
|
21
|
+
model_name, sequence_length, batch_size, hidden_dim, lay_number, attention_heads_num, gpu_num=1
|
22
22
|
):
|
23
|
-
|
24
23
|
"""
|
25
24
|
return bytes
|
26
25
|
|
@@ -33,18 +32,18 @@ def calc_llm_train_activation_memory(
|
|
33
32
|
# FFN
|
34
33
|
# Layer Norm
|
35
34
|
r1 = (
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
35
|
+
sequence_length
|
36
|
+
* batch_size
|
37
|
+
* hidden_dim
|
38
|
+
* lay_number
|
39
|
+
* (34 + 5 * attention_heads_num * sequence_length / hidden_dim)
|
41
40
|
)
|
42
41
|
# reference2
|
43
42
|
r2 = (
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
43
|
+
lay_number * (2 * sequence_length * attention_heads_num + 16 * hidden_dim)
|
44
|
+
* sequence_length
|
45
|
+
* batch_size
|
46
|
+
/ gpu_num
|
48
47
|
)
|
49
48
|
print(r1)
|
50
49
|
print(r2)
|
@@ -80,7 +79,7 @@ class DataStructure:
|
|
80
79
|
}
|
81
80
|
ner_input_example = "这句话一共有两个实体分别为大象和老鼠。"
|
82
81
|
ner_label_example = (
|
83
|
-
|
82
|
+
list("OOOOOOOOOOOOO") + ["B-s", "I-s"] + ["O"] + ["B-o", "I-o"] + ["O"]
|
84
83
|
)
|
85
84
|
|
86
85
|
|
@@ -135,7 +134,7 @@ class STEM(object):
|
|
135
134
|
if each_srl:
|
136
135
|
args = []
|
137
136
|
for arg in each_srl:
|
138
|
-
args.extend(seg[arg[1]
|
137
|
+
args.extend(seg[arg[1]: arg[2] + 1])
|
139
138
|
# 添加上谓词
|
140
139
|
args.insert(each_srl[0][2] - each_srl[0][1] + 1, seg[wdx])
|
141
140
|
events.append(args)
|
@@ -174,7 +173,7 @@ def subject_object_labeling(spo_list, text):
|
|
174
173
|
q_list_length = len(q_list)
|
175
174
|
k_list_length = len(k_list)
|
176
175
|
for idx in range(k_list_length - q_list_length + 1):
|
177
|
-
t = [q == k for q, k in zip(q_list, k_list[idx
|
176
|
+
t = [q == k for q, k in zip(q_list, k_list[idx: idx + q_list_length])]
|
178
177
|
# print(idx, t)
|
179
178
|
if all(t):
|
180
179
|
# print(idx)
|
@@ -187,8 +186,8 @@ def subject_object_labeling(spo_list, text):
|
|
187
186
|
if len(spo) == 2:
|
188
187
|
labeling_list[idx_start + 1] = "I-" + spo_type
|
189
188
|
elif len(spo) >= 3:
|
190
|
-
labeling_list[idx_start + 1
|
191
|
-
|
189
|
+
labeling_list[idx_start + 1: idx_start + len(spo)] = ["I-" + spo_type] * (
|
190
|
+
len(spo) - 1
|
192
191
|
)
|
193
192
|
else:
|
194
193
|
pass
|
@@ -239,12 +238,12 @@ def convert_crf_format_10_fold(corpus, objdir_path):
|
|
239
238
|
split_position = int(len(corpus) / 10)
|
240
239
|
for k in range(0, 10):
|
241
240
|
if k == 9:
|
242
|
-
dev_set = corpus[k * split_position
|
241
|
+
dev_set = corpus[k * split_position:]
|
243
242
|
train_set = corpus[: k * split_position]
|
244
243
|
else:
|
245
|
-
dev_set = corpus[k * split_position
|
244
|
+
dev_set = corpus[k * split_position: (k + 1) * split_position]
|
246
245
|
train_set = (
|
247
|
-
|
246
|
+
corpus[: k * split_position] + corpus[(k + 1) * split_position:]
|
248
247
|
)
|
249
248
|
writetxt_w_list(
|
250
249
|
train_set, os.path.join(objdir_path, "train{}.txt".format(k + 1))
|
@@ -292,12 +291,41 @@ def kfold_txt(corpus, path, k=9, is_shuffle=True):
|
|
292
291
|
if is_shuffle:
|
293
292
|
random.shuffle(corpus)
|
294
293
|
split_position = int(len(corpus) / 10)
|
295
|
-
train_set, dev_set = corpus[: k * split_position], corpus[k * split_position
|
294
|
+
train_set, dev_set = corpus[: k * split_position], corpus[k * split_position:]
|
296
295
|
writetxt_w_list(train_set, os.path.join(path, "train.tsv"), num_lf=1)
|
297
296
|
writetxt_w_list(dev_set, os.path.join(path, "test.tsv"), num_lf=1)
|
298
297
|
writetxt_w_list(dev_set, os.path.join(path, "dev.tsv"), num_lf=1)
|
299
298
|
|
300
299
|
|
300
|
+
def sample():
    """
    Demonstrate a stratified 90/10 train/test split on a toy DataFrame.

    Builds a 100-row frame, splits it with ``StratifiedShuffleSplit``
    stratified on column ``y``, and prints the frame and both subset sizes.
    """
    import pandas as pd
    from sklearn.model_selection import StratifiedShuffleSplit

    # Toy frame standing in for real data.
    df = pd.DataFrame({
        "count_line": list(range(100)),
        "x": list(range(100)),
        "y": [value // 10 for value in range(100)],
    })
    print(df)

    # One split with 10% of the rows held out as the test set.
    split = StratifiedShuffleSplit(n_splits=1, test_size=0.1, random_state=42)

    # Row indices of both subsets; column "y" is the stratification key.
    index_pairs = split.split(df, df['y'])
    train_index, test_index = next(index_pairs)

    # Select the rows of each subset.
    train_df = df.loc[train_index]
    test_df = df.loc[test_index]

    # Report how many rows each subset contains.
    print("训练集行数:", len(train_df))
    print("测试集行数:", len(test_df))
|
327
|
+
|
328
|
+
|
301
329
|
def kfold_df(df, save_dir=None):
|
302
330
|
"""
|
303
331
|
划分train test val集, 写为windows可读的csv。
|
@@ -389,7 +417,7 @@ def split_sentence(sentence, language="chinese", cross_line=True):
|
|
389
417
|
for idx, char in enumerate(sentence):
|
390
418
|
if idx == len(sentence) - 1:
|
391
419
|
if char in split_signs:
|
392
|
-
sentences.append(sentence[start_idx
|
420
|
+
sentences.append(sentence[start_idx: idx + 1].strip())
|
393
421
|
start_idx = idx + 1
|
394
422
|
else:
|
395
423
|
sentences.append(sentence[start_idx:].strip())
|
@@ -399,10 +427,10 @@ def split_sentence(sentence, language="chinese", cross_line=True):
|
|
399
427
|
if idx < len(sentence) - 2:
|
400
428
|
# 处理。”。
|
401
429
|
if sentence[idx + 2] not in split_signs:
|
402
|
-
sentences.append(sentence[start_idx
|
430
|
+
sentences.append(sentence[start_idx: idx + 2].strip())
|
403
431
|
start_idx = idx + 2
|
404
432
|
elif sentence[idx + 1] not in split_signs:
|
405
|
-
sentences.append(sentence[start_idx
|
433
|
+
sentences.append(sentence[start_idx: idx + 1].strip())
|
406
434
|
start_idx = idx + 1
|
407
435
|
return sentences
|
408
436
|
|
@@ -480,4 +508,4 @@ if __name__ == "__main__":
|
|
480
508
|
attention_heads_num=32,
|
481
509
|
gpu_num=1
|
482
510
|
)
|
483
|
-
print(res, "G")
|
511
|
+
print(res, "G")
|
nlpertools/other.py
CHANGED
@@ -5,10 +5,13 @@ import itertools
|
|
5
5
|
import os
|
6
6
|
import re
|
7
7
|
import string
|
8
|
+
import subprocess
|
9
|
+
import threading
|
8
10
|
from concurrent.futures import ThreadPoolExecutor
|
9
11
|
from functools import reduce
|
10
12
|
import math
|
11
13
|
import datetime
|
14
|
+
import difflib
|
12
15
|
import psutil
|
13
16
|
from .io.file import writetxt_w_list, writetxt_a
|
14
17
|
# import numpy as np
|
@@ -27,6 +30,134 @@ ENGLISH_PUNCTUATION = list(',.;:\'"!?<>()')
|
|
27
30
|
OTHER_PUNCTUATION = list('!@#$%^&*')
|
28
31
|
|
29
32
|
|
33
|
+
def get_diff_parts(str1, str2):
    """
    List the parts in which two sequences differ.

    :param str1: first sequence
    :param str2: second sequence
    :return: list of ``(tag, part_of_str1, part_of_str2)`` tuples, one per
        ``difflib`` opcode tagged 'replace', 'delete' or 'insert'
    """
    changed_tags = ('replace', 'delete', 'insert')
    opcodes = difflib.SequenceMatcher(None, str1, str2).get_opcodes()
    return [
        (tag, str1[a_start:a_end], str2[b_start:b_end])
        for tag, a_start, a_end, b_start, b_end in opcodes
        if tag in changed_tags
    ]
|
44
|
+
|
45
|
+
|
46
|
+
def run_cmd_with_timeout(cmd, timeout):
    """
    Run a shell command and give up if it runs longer than ``timeout`` seconds.

    Background: https://juejin.cn/post/7391703459803086848

    :param cmd: command line, executed through the shell
    :param timeout: time limit in seconds (``None`` waits indefinitely)
    :return: ``(True, stdout)`` when the command exits with status 0,
        ``(False, stdout)`` when it exits with another status, and
        ``(False, message)`` when it was stopped for exceeding the limit
    """
    # NOTE(review): shell=True runs cmd through the shell; never pass
    # untrusted input here.
    process = subprocess.Popen(cmd, shell=True, encoding="utf-8", errors="ignore", stdout=subprocess.PIPE,
                               stderr=subprocess.PIPE)
    try:
        # communicate() enforces the limit itself, so no helper thread is needed.
        stdout, _stderr = process.communicate(timeout=timeout)
    except subprocess.TimeoutExpired:
        print(f"Terminating {cmd}")
        process.kill()
        # Reap the process and drain its pipes after killing it.
        process.communicate()
        print("Terminated successfully")
        return False, f"{cmd} is running over {timeout}s"
    return process.returncode == 0, stdout
|
76
|
+
|
77
|
+
|
78
|
+
def print_three_line_table(df):
    """
    Render a DataFrame as a "three-line table" and open it in the browser.

    The table gets a top rule, a rule below the header and a bottom rule. The
    HTML is printed, written to ``temp.html`` in the current directory as
    UTF-8, and opened with the default web browser.

    :param df: pandas DataFrame to render
    """
    # TODO: support highlighting cells in red, as can be done in Excel
    import webbrowser

    # Convert the DataFrame to an HTML table without the default border.
    html_table = df.to_html(index=False)
    html_table = html_table.replace('border="1"', 'border="0"')

    first_line_px = str(2)
    second_line_px = str(1)
    third_line_px = str(2)
    # CSS for the three rules (thead: header, tr: row, td/th: cell).
    style = """
<style>

    table {
        border-collapse: collapse;
    }

    tr, td, th {
        text-align: center; /* 水平居中文本 */
        vertical-align: middle; /* 垂直居中文本 */
    }
    thead tr {
        border-top: (first_line_px)px solid black;
        border-bottom: (second_line_px)px solid black;
    }

    thead th {
        border-bottom: (second_line_px)px solid black;
    }

    tbody tr td {
        border-bottom: 0px solid black;
    }

    tbody tr:last-child td {
        border-bottom: (third_line_px)px solid black;
    }
</style>"""
    style = style.replace("(first_line_px)", first_line_px).replace("(second_line_px)", second_line_px).replace(
        "(third_line_px)", third_line_px)
    # Build a complete document. The head (with the charset declaration) used
    # to be defined but never emitted, so non-ASCII cells could render garbled.
    html = f"""<!DOCTYPE html>
<html lang="zh">
<head>
<meta charset="UTF-8">
<title>页面标题</title>{style}
</head>
<body>
{html_table}
</body>
</html>"""
    print(html)
    temp_file_path = "temp.html"
    # Write as UTF-8 explicitly so the file matches the declared charset.
    with open(temp_file_path, "w", encoding="utf-8") as f:
        f.write(html)
    webbrowser.open('file://' + os.path.realpath(temp_file_path))
|
142
|
+
|
143
|
+
|
144
|
+
def jprint(obj, depth=0):
    """
    Pretty-print a nested structure of dicts and lists.

    Each dict key is printed as a header line, prefixed with one ``|`` per
    nesting level and surrounded by dashes that get shorter with depth. Dict
    values and list items are printed recursively; anything else is printed
    as is.

    :param obj: object to print
    :param depth: current nesting level (callers normally leave this at 0)
    """
    if isinstance(obj, dict):
        sep = "-" * (10 - depth * 3)
        for k, v in obj.items():
            print(depth * "|", sep, k, sep)
            # Pass the depth on; without it nested dicts were printed as if
            # they were top level.
            jprint(v, depth + 1)
    elif isinstance(obj, list):
        for v in obj:
            jprint(v, depth + 1)
    else:
        print(obj)
|
155
|
+
|
156
|
+
|
157
|
+
def print_split(sign="=", num=20):
    """
    Print a separator line.

    :param sign: text the separator is built from
    :param num: how many times ``sign`` is repeated
    """
    separator = sign * num
    print(separator)
|
159
|
+
|
160
|
+
|
30
161
|
def seed_everything():
|
31
162
|
import torch
|
32
163
|
# seed everything
|
@@ -82,21 +213,6 @@ def convert_np_to_py(obj):
|
|
82
213
|
return obj
|
83
214
|
|
84
215
|
|
85
|
-
def git_push():
|
86
|
-
"""
|
87
|
-
针对国内提交github经常失败,自动提交
|
88
|
-
"""
|
89
|
-
num = -1
|
90
|
-
while 1:
|
91
|
-
num += 1
|
92
|
-
print("retry num: {}".format(num))
|
93
|
-
info = os.system("git push --set-upstream origin main")
|
94
|
-
print(str(info))
|
95
|
-
if not str(info).startswith("fatal"):
|
96
|
-
print("scucess")
|
97
|
-
break
|
98
|
-
|
99
|
-
|
100
216
|
def snake_to_camel(s: str) -> str:
|
101
217
|
"""
|
102
218
|
author: u
|
@@ -235,24 +351,19 @@ def stress_test(func, ipts):
|
|
235
351
|
return results
|
236
352
|
|
237
353
|
|
238
|
-
def get_substring_loc(text, subtext):
|
239
|
-
res = re.finditer(
|
240
|
-
subtext.replace('\\', '\\\\').replace('?', '\?').replace('(', '\(').replace(')', '\)').replace(']',
|
241
|
-
'\]').replace(
|
242
|
-
'[', '\[').replace('+', '\+'), text)
|
243
|
-
l, r = [i for i in res][0].regs[0]
|
244
|
-
return l, r
|
245
|
-
|
246
|
-
|
247
354
|
def squeeze_list(high_dim_list):
    """Flatten one level of nesting.

    The items of every inner iterable are concatenated, in order, into a
    single new list. Deeper nesting levels are left untouched.
    """
    flattened = []
    for inner in high_dim_list:
        flattened.extend(inner)
    return flattened
|
249
356
|
|
250
357
|
|
251
358
|
def unsqueeze_list(flatten_list, each_element_len):
    """Split a flat list into consecutive chunks of ``each_element_len`` items.

    The chunk count is computed with ceiling division, so when the input
    length is not a multiple of ``each_element_len`` the remaining items
    are returned as a final, shorter chunk rather than being dropped.
    ``split_list`` in this module performs the same chunking and is the
    preferred entry point.

    :param flatten_list: sliceable sequence to split
    :param each_element_len: number of items per chunk
    :return: list of slices of ``flatten_list``
    :raises ZeroDivisionError: if ``each_element_len`` is 0
    """
    # Ceiling division without importing math: -(-a // b) == ceil(a / b).
    chunk_count = -(-len(flatten_list) // each_element_len)
    two_dim_list = [flatten_list[i * each_element_len:(i + 1) * each_element_len]
                    for i in range(chunk_count)]
    return two_dim_list
|
255
363
|
|
364
|
+
def split_list(input_list, chunk_size):
    """Split a list into consecutive chunks of at most ``chunk_size`` items.

    The last chunk is shorter when the input length is not a multiple of
    ``chunk_size``. An empty input yields an empty list.

    :param input_list: sliceable sequence to split
    :param chunk_size: maximum number of items per chunk; must be positive
    :return: list of slices of ``input_list``
    :raises ValueError: if ``chunk_size`` is not positive
    """
    # A negative step would make ``range`` empty and silently discard every
    # item; reject it with the same exception type ``range`` raises for 0.
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    return [input_list[start:start + chunk_size]
            for start in range(0, len(input_list), chunk_size)]
|
256
367
|
|
257
368
|
def auto_close():
|
258
369
|
"""
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: nlpertools
|
3
|
-
Version: 1.0.
|
3
|
+
Version: 1.0.8
|
4
4
|
Summary: A small package about small basic IO operation when coding
|
5
5
|
Home-page: https://github.com/lvzii/nlpertools
|
6
6
|
Author: youshuJi
|
@@ -12,6 +12,11 @@ Classifier: Operating System :: OS Independent
|
|
12
12
|
Requires-Python: >=3.6
|
13
13
|
Description-Content-Type: text/markdown
|
14
14
|
License-File: LICENSE
|
15
|
+
Requires-Dist: numpy
|
16
|
+
Requires-Dist: pandas
|
17
|
+
Requires-Dist: psutil
|
18
|
+
Provides-Extra: torch
|
19
|
+
Requires-Dist: torch; extra == "torch"
|
15
20
|
|
16
21
|
<div align="center">
|
17
22
|
<h4 align="center">
|
@@ -23,9 +28,6 @@ License-File: LICENSE
|
|
23
28
|
</div>
|
24
29
|
|
25
30
|
|
26
|
-
# 当前版本
|
27
|
-
|
28
|
-
1.0.5
|
29
31
|
|
30
32
|
# 说明
|
31
33
|
|
@@ -75,9 +77,9 @@ https://nlpertools.readthedocs.io/en/latest/
|
|
75
77
|
def __init__(self, IPT_MODEL_PATH):
|
76
78
|
self.ltp = LTP(IPT_MODEL_PATH)
|
77
79
|
```
|
78
|
-
|
80
|
+
通过`pyinstrument`判断,超过1s的包即采用这种方式
|
79
81
|
- 2s+ happybase、seaborn、torch、jieba
|
80
|
-
- 1s+
|
82
|
+
- 1s+ /
|
81
83
|
- 0.5s+ pandas elasticsearch transformers xgboost nltk mongo
|
82
84
|
|
83
85
|
|
@@ -85,6 +87,8 @@ https://nlpertools.readthedocs.io/en/latest/
|
|
85
87
|
|
86
88
|
- [readthedoc 检查文档构建状况](https://readthedocs.org/projects/nlpertools/builds)
|
87
89
|
|
90
|
+
- [打包发布指南](https://juejin.cn/post/7369413136224878644)
|
91
|
+
|
88
92
|
- 发布版本需要加tag
|
89
93
|
|
90
94
|
## 开发哲学
|
@@ -106,6 +110,23 @@ b = nlpertools.io.file.readtxt_list_all_strip('res.txt')
|
|
106
110
|
```
|
107
111
|
|
108
112
|
```bash
|
109
|
-
#
|
110
|
-
python -m nlpertools
|
113
|
+
# 生成pypi双因素认证的实时密钥(需要提供key)
|
114
|
+
python -m nlpertools.get_2fa your_key
|
115
|
+
|
116
|
+
## git
|
117
|
+
python -m nlpertools.cli --git_push
|
118
|
+
python -m nlpertools.cli --git_pull
|
119
|
+
|
120
|
+
# 以下功能被nvitop替代,不推荐使用
|
121
|
+
## 监控gpu显存
|
122
|
+
python -m nlpertools.monitor.gpu
|
123
|
+
## 监控cpu
|
124
|
+
python -m nlpertools.monitor.memory
|
111
125
|
```
|
126
|
+
|
127
|
+
## 一些常用项目
|
128
|
+
|
129
|
+
nvitop
|
130
|
+
|
131
|
+
ydata-profiling
|
132
|
+
|
@@ -1,12 +1,14 @@
|
|
1
|
-
nlpertools/__init__.py,sha256=
|
2
|
-
nlpertools/
|
3
|
-
nlpertools/
|
1
|
+
nlpertools/__init__.py,sha256=h7JJEN_JRn3iKcqIcaFgYtAjP90XiT1KILrm8utoHvQ,483
|
2
|
+
nlpertools/cli.py,sha256=xDl_tWl9pfqQ3PUdd7oesvgM2FVqnaw8dFFliEX5c4Y,2203
|
3
|
+
nlpertools/data_client.py,sha256=esX8lUQrTui4uVkqPfhpHVok7Eq6ywpuemKjLeqoglc,14674
|
4
|
+
nlpertools/dataprocess.py,sha256=v1mobuYN7I3dT6xIKlNOHVtcg31YtjF6FwNPTxeBFFY,23153
|
4
5
|
nlpertools/default_db_config.yml,sha256=E1K9k_xzXVlsf-HJQh8kyHXHYuvTpD12jD4Hfe5rUk8,606
|
5
|
-
nlpertools/
|
6
|
+
nlpertools/get_2fa.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
7
|
+
nlpertools/ml.py,sha256=z-0ep9svAyzcS2n7Lsyqo65VEQRGzWKFMLdZofCv1LQ,17716
|
6
8
|
nlpertools/movie.py,sha256=rkyOnAXdsbWfMSbi1sE1VNRT7f66Hp9BnZsN_58Afmw,897
|
7
9
|
nlpertools/nlpertools_config.yml,sha256=ksXejxFs7pxR47tNAsrN88_4gvq9PCA2ZMO07H-dJXY,26
|
8
10
|
nlpertools/open_api.py,sha256=uyTY00OUlM57Cn0Wm0yZXcIS8vAszy9rKnDMBEWfWJM,1744
|
9
|
-
nlpertools/other.py,sha256=
|
11
|
+
nlpertools/other.py,sha256=CeUea17Oe5MV_r-CmeYdAhdj5kWLvmxoDDgRc56o7bE,14704
|
10
12
|
nlpertools/pic.py,sha256=13aaFJh3USGYGs4Y9tAKTvWjmdQR4YDjl3LlIhJheOA,9906
|
11
13
|
nlpertools/plugin.py,sha256=LB7j9GdoQi6TITddH-6EglHlOa0WIHLUT7X5vb_aIZY,1168
|
12
14
|
nlpertools/reminder.py,sha256=wiXwZQmxMck5vY3EvG8_oakP3FAdjGTikAIOiTPUQrs,2977
|
@@ -22,9 +24,12 @@ nlpertools/algo/template.py,sha256=9vsHr4g3jZZ5KVU_2I9i97o8asRXq-8pSaCXIv0sHeM,2
|
|
22
24
|
nlpertools/algo/union.py,sha256=0l7lGZbw1qIfW1z5TE8Oo3tybL1bKIP5rzpa5ZT-vLQ,249
|
23
25
|
nlpertools/data_structure/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
24
26
|
nlpertools/data_structure/base_structure.py,sha256=gVUvJZ5jsCAswRETTpMwcEjLKoageWiTuCKNEwIWKWk,2641
|
27
|
+
nlpertools/draw/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
28
|
+
nlpertools/draw/draw.py,sha256=PgdG7unpCtbbQdYISODTYMV7p10GwWDh9czeURkG0x4,2629
|
29
|
+
nlpertools/draw/math_func.py,sha256=0NQ22Dfi9DFG6Bg_hXnCT27w65-dqpOOIgZX7oUIW-Q,881
|
25
30
|
nlpertools/io/__init__.py,sha256=YMuKtC2Ddh5dL5MvXjyUKYOOuqzFYUhBPFaP2kyFG9I,68
|
26
|
-
nlpertools/io/dir.py,sha256=
|
27
|
-
nlpertools/io/file.py,sha256=
|
31
|
+
nlpertools/io/dir.py,sha256=p7J34qUxYCqKSO5DQMhL8FxFcHDrwn_1lIxNl0klasU,2267
|
32
|
+
nlpertools/io/file.py,sha256=CsFdluEczuz3fonbeZi9dHPasL1Hm18JL3Aux2ziQMU,7198
|
28
33
|
nlpertools/monitor/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
29
34
|
nlpertools/monitor/gpu.py,sha256=M59O6i0hlew7AzXZlaVZqbZA5IR93OhBY2WI0-T_HtY,531
|
30
35
|
nlpertools/monitor/memory.py,sha256=9t6q9BC8VVx4o3G4sBCn7IoQRx272zMPjSnL3yvTBAQ,657
|
@@ -36,8 +41,9 @@ nlpertools/utils/package.py,sha256=wLg_M8j7Y6ReRjWHWCWoZJHrzEwuAr9TyG2jvb7OQCo,3
|
|
36
41
|
nlpertools/utils/package_v1.py,sha256=sqgFb-zbTdMd5ziJLY6YUPqR49qUNZjxBH35DnyR5Wg,3542
|
37
42
|
nlpertools/utils/package_v2.py,sha256=WOcsguWfUd4XSAfmPgCtL8HtUbqJ6GRSMHb0OsB47r0,3932
|
38
43
|
nlpertools_helper/__init__.py,sha256=obxRUdZDctvcvK_iA1Dx2HmQFMlMzJto-xDPryq1lJ0,198
|
39
|
-
nlpertools-1.0.
|
40
|
-
nlpertools-1.0.
|
41
|
-
nlpertools-1.0.
|
42
|
-
nlpertools-1.0.
|
43
|
-
nlpertools-1.0.
|
44
|
+
nlpertools-1.0.8.dist-info/LICENSE,sha256=SBcMozykvTbZJ--MqSiKUmHLLROdnr25V70xCQgEwqw,11331
|
45
|
+
nlpertools-1.0.8.dist-info/METADATA,sha256=v2doRda1amZbXXfIYuzo-rFPvTICt3ByDCKVr6gsUw0,3276
|
46
|
+
nlpertools-1.0.8.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
|
47
|
+
nlpertools-1.0.8.dist-info/entry_points.txt,sha256=XEazQ4vUwJMoMAgAwk1Lq4PRQGklPkPBaFkiP0zN_JE,45
|
48
|
+
nlpertools-1.0.8.dist-info/top_level.txt,sha256=_4q4MIFvMr4cAUbhWKWYdRXIXsF4PJDg4BUsZvgk94s,29
|
49
|
+
nlpertools-1.0.8.dist-info/RECORD,,
|
File without changes
|
File without changes
|