nlpertools 1.0.6.dev0__tar.gz → 1.0.9__tar.gz
Sign up to get free protection for your applications and to get access to all the features.
- {nlpertools-1.0.6.dev0/src/nlpertools.egg-info → nlpertools-1.0.9}/PKG-INFO +33 -10
- {nlpertools-1.0.6.dev0 → nlpertools-1.0.9}/README.md +24 -8
- {nlpertools-1.0.6.dev0 → nlpertools-1.0.9}/setup.cfg +0 -1
- nlpertools-1.0.9/setup.py +36 -0
- {nlpertools-1.0.6.dev0 → nlpertools-1.0.9}/src/nlpertools/__init__.py +3 -4
- nlpertools-1.0.9/src/nlpertools/cli.py +143 -0
- {nlpertools-1.0.6.dev0 → nlpertools-1.0.9}/src/nlpertools/data_client.py +56 -17
- {nlpertools-1.0.6.dev0 → nlpertools-1.0.9}/src/nlpertools/dataprocess.py +28 -12
- nlpertools-1.0.9/src/nlpertools/draw/draw.py +81 -0
- nlpertools-1.0.9/src/nlpertools/draw/math_func.py +33 -0
- {nlpertools-1.0.6.dev0 → nlpertools-1.0.9}/src/nlpertools/io/dir.py +35 -3
- {nlpertools-1.0.6.dev0 → nlpertools-1.0.9}/src/nlpertools/io/file.py +17 -11
- {nlpertools-1.0.6.dev0 → nlpertools-1.0.9}/src/nlpertools/ml.py +74 -24
- nlpertools-1.0.9/src/nlpertools/monitor/__init__.py +0 -0
- {nlpertools-1.0.6.dev0 → nlpertools-1.0.9}/src/nlpertools/other.py +152 -24
- nlpertools-1.0.9/src/nlpertools/template/__init__.py +0 -0
- {nlpertools-1.0.6.dev0 → nlpertools-1.0.9/src/nlpertools.egg-info}/PKG-INFO +33 -10
- {nlpertools-1.0.6.dev0 → nlpertools-1.0.9}/src/nlpertools.egg-info/SOURCES.txt +8 -0
- nlpertools-1.0.9/src/nlpertools.egg-info/entry_points.txt +2 -0
- nlpertools-1.0.9/src/nlpertools.egg-info/requires.txt +6 -0
- {nlpertools-1.0.6.dev0 → nlpertools-1.0.9}/LICENSE +0 -0
- {nlpertools-1.0.6.dev0 → nlpertools-1.0.9}/pyproject.toml +0 -0
- {nlpertools-1.0.6.dev0 → nlpertools-1.0.9}/src/nlpertools/algo/__init__.py +0 -0
- {nlpertools-1.0.6.dev0 → nlpertools-1.0.9}/src/nlpertools/algo/ac.py +0 -0
- {nlpertools-1.0.6.dev0 → nlpertools-1.0.9}/src/nlpertools/algo/bit_ops.py +0 -0
- {nlpertools-1.0.6.dev0 → nlpertools-1.0.9}/src/nlpertools/algo/kmp.py +0 -0
- {nlpertools-1.0.6.dev0 → nlpertools-1.0.9}/src/nlpertools/algo/num_ops.py +0 -0
- {nlpertools-1.0.6.dev0 → nlpertools-1.0.9}/src/nlpertools/algo/template.py +0 -0
- {nlpertools-1.0.6.dev0 → nlpertools-1.0.9}/src/nlpertools/algo/union.py +0 -0
- {nlpertools-1.0.6.dev0 → nlpertools-1.0.9}/src/nlpertools/data_structure/__init__.py +0 -0
- {nlpertools-1.0.6.dev0 → nlpertools-1.0.9}/src/nlpertools/data_structure/base_structure.py +0 -0
- {nlpertools-1.0.6.dev0 → nlpertools-1.0.9}/src/nlpertools/default_db_config.yml +0 -0
- {nlpertools-1.0.6.dev0/src/nlpertools/monitor → nlpertools-1.0.9/src/nlpertools/draw}/__init__.py +0 -0
- /nlpertools-1.0.6.dev0/src/nlpertools/template/__init__.py → /nlpertools-1.0.9/src/nlpertools/get_2fa.py +0 -0
- {nlpertools-1.0.6.dev0 → nlpertools-1.0.9}/src/nlpertools/io/__init__.py +0 -0
- {nlpertools-1.0.6.dev0 → nlpertools-1.0.9}/src/nlpertools/monitor/gpu.py +0 -0
- {nlpertools-1.0.6.dev0 → nlpertools-1.0.9}/src/nlpertools/monitor/memory.py +0 -0
- {nlpertools-1.0.6.dev0 → nlpertools-1.0.9}/src/nlpertools/movie.py +0 -0
- {nlpertools-1.0.6.dev0 → nlpertools-1.0.9}/src/nlpertools/nlpertools_config.yml +0 -0
- {nlpertools-1.0.6.dev0 → nlpertools-1.0.9}/src/nlpertools/open_api.py +0 -0
- {nlpertools-1.0.6.dev0 → nlpertools-1.0.9}/src/nlpertools/pic.py +0 -0
- {nlpertools-1.0.6.dev0 → nlpertools-1.0.9}/src/nlpertools/plugin.py +0 -0
- {nlpertools-1.0.6.dev0 → nlpertools-1.0.9}/src/nlpertools/reminder.py +0 -0
- {nlpertools-1.0.6.dev0 → nlpertools-1.0.9}/src/nlpertools/utils/__init__.py +0 -0
- {nlpertools-1.0.6.dev0 → nlpertools-1.0.9}/src/nlpertools/utils/lazy.py +0 -0
- {nlpertools-1.0.6.dev0 → nlpertools-1.0.9}/src/nlpertools/utils/log_util.py +0 -0
- {nlpertools-1.0.6.dev0 → nlpertools-1.0.9}/src/nlpertools/utils/package.py +0 -0
- {nlpertools-1.0.6.dev0 → nlpertools-1.0.9}/src/nlpertools/utils/package_v1.py +0 -0
- {nlpertools-1.0.6.dev0 → nlpertools-1.0.9}/src/nlpertools/utils/package_v2.py +0 -0
- {nlpertools-1.0.6.dev0 → nlpertools-1.0.9}/src/nlpertools/utils_for_nlpertools.py +0 -0
- {nlpertools-1.0.6.dev0 → nlpertools-1.0.9}/src/nlpertools/vector_index_demo.py +0 -0
- {nlpertools-1.0.6.dev0 → nlpertools-1.0.9}/src/nlpertools/wrapper.py +0 -0
- {nlpertools-1.0.6.dev0 → nlpertools-1.0.9}/src/nlpertools.egg-info/dependency_links.txt +0 -0
- {nlpertools-1.0.6.dev0 → nlpertools-1.0.9}/src/nlpertools.egg-info/top_level.txt +0 -0
- {nlpertools-1.0.6.dev0 → nlpertools-1.0.9}/src/nlpertools_helper/__init__.py +0 -0
- {nlpertools-1.0.6.dev0 → nlpertools-1.0.9}/tests/test_kmp.py +0 -0
- {nlpertools-1.0.6.dev0 → nlpertools-1.0.9}/tests/test_path_exists.py +0 -0
@@ -1,6 +1,6 @@
|
|
1
|
-
Metadata-Version: 2.
|
1
|
+
Metadata-Version: 2.2
|
2
2
|
Name: nlpertools
|
3
|
-
Version: 1.0.
|
3
|
+
Version: 1.0.9
|
4
4
|
Summary: A small package about small basic IO operation when coding
|
5
5
|
Home-page: https://github.com/lvzii/nlpertools
|
6
6
|
Author: youshuJi
|
@@ -12,6 +12,13 @@ Classifier: Operating System :: OS Independent
|
|
12
12
|
Requires-Python: >=3.6
|
13
13
|
Description-Content-Type: text/markdown
|
14
14
|
License-File: LICENSE
|
15
|
+
Requires-Dist: numpy
|
16
|
+
Requires-Dist: pandas
|
17
|
+
Requires-Dist: psutil
|
18
|
+
Provides-Extra: torch
|
19
|
+
Requires-Dist: torch; extra == "torch"
|
20
|
+
Dynamic: provides-extra
|
21
|
+
Dynamic: requires-dist
|
15
22
|
|
16
23
|
<div align="center">
|
17
24
|
<h4 align="center">
|
@@ -23,9 +30,6 @@ License-File: LICENSE
|
|
23
30
|
</div>
|
24
31
|
|
25
32
|
|
26
|
-
# 当前版本
|
27
|
-
|
28
|
-
1.0.5
|
29
33
|
|
30
34
|
# 说明
|
31
35
|
|
@@ -33,7 +37,7 @@ License-File: LICENSE
|
|
33
37
|
|
34
38
|
它解决了什么问题:
|
35
39
|
|
36
|
-
- 很多函数是记不住的,
|
40
|
+
- 很多函数是记不住的, 每次写都要~~搜~~问大模型 ,例如pandas排序
|
37
41
|
- 刷题的时候,树结构的题目很难调试
|
38
42
|
|
39
43
|
|
@@ -75,9 +79,9 @@ https://nlpertools.readthedocs.io/en/latest/
|
|
75
79
|
def __init__(self, IPT_MODEL_PATH):
|
76
80
|
self.ltp = LTP(IPT_MODEL_PATH)
|
77
81
|
```
|
78
|
-
|
82
|
+
通过`pyinstrument`判断,超过1s的包即采用这种方式
|
79
83
|
- 2s+ happybase、seaborn、torch、jieba
|
80
|
-
- 1s+
|
84
|
+
- 1s+ /
|
81
85
|
- 0.5s+ pandas elasticsearch transformers xgboost nltk mongo
|
82
86
|
|
83
87
|
|
@@ -85,6 +89,8 @@ https://nlpertools.readthedocs.io/en/latest/
|
|
85
89
|
|
86
90
|
- [readthedoc 检查文档构建状况](https://readthedocs.org/projects/nlpertools/builds)
|
87
91
|
|
92
|
+
- [打包发布指南](https://juejin.cn/post/7369413136224878644)
|
93
|
+
|
88
94
|
- 发布版本需要加tag
|
89
95
|
|
90
96
|
## 开发哲学
|
@@ -106,6 +112,23 @@ b = nlpertools.io.file.readtxt_list_all_strip('res.txt')
|
|
106
112
|
```
|
107
113
|
|
108
114
|
```bash
|
109
|
-
#
|
110
|
-
python -m nlpertools
|
115
|
+
# 生成pypi双因素认证的实时密钥(需要提供key)
|
116
|
+
python -m nlpertools.cli --get_2fa --get_2fa_key your_key
|
117
|
+
|
118
|
+
## git
|
119
|
+
python -m nlpertools.cli --git_push
|
120
|
+
python -m nlpertools.cli --git_pull
|
121
|
+
|
122
|
+
# 以下功能被nvitop替代,不推荐使用
|
123
|
+
## 监控gpu显存
|
124
|
+
python -m nlpertools.monitor.gpu
|
125
|
+
## 监控cpu
|
126
|
+
python -m nlpertools.monitor.memory
|
111
127
|
```
|
128
|
+
|
129
|
+
## 一些常用项目
|
130
|
+
|
131
|
+
nvitop
|
132
|
+
|
133
|
+
ydata-profiling
|
134
|
+
|
@@ -8,9 +8,6 @@
|
|
8
8
|
</div>
|
9
9
|
|
10
10
|
|
11
|
-
# 当前版本
|
12
|
-
|
13
|
-
1.0.5
|
14
11
|
|
15
12
|
# 说明
|
16
13
|
|
@@ -18,7 +15,7 @@
|
|
18
15
|
|
19
16
|
它解决了什么问题:
|
20
17
|
|
21
|
-
- 很多函数是记不住的,
|
18
|
+
- 很多函数是记不住的, 每次写都要~~搜~~问大模型 ,例如pandas排序
|
22
19
|
- 刷题的时候,树结构的题目很难调试
|
23
20
|
|
24
21
|
|
@@ -60,9 +57,9 @@ https://nlpertools.readthedocs.io/en/latest/
|
|
60
57
|
def __init__(self, IPT_MODEL_PATH):
|
61
58
|
self.ltp = LTP(IPT_MODEL_PATH)
|
62
59
|
```
|
63
|
-
|
60
|
+
通过`pyinstrument`判断,超过1s的包即采用这种方式
|
64
61
|
- 2s+ happybase、seaborn、torch、jieba
|
65
|
-
- 1s+
|
62
|
+
- 1s+ /
|
66
63
|
- 0.5s+ pandas elasticsearch transformers xgboost nltk mongo
|
67
64
|
|
68
65
|
|
@@ -70,6 +67,8 @@ https://nlpertools.readthedocs.io/en/latest/
|
|
70
67
|
|
71
68
|
- [readthedoc 检查文档构建状况](https://readthedocs.org/projects/nlpertools/builds)
|
72
69
|
|
70
|
+
- [打包发布指南](https://juejin.cn/post/7369413136224878644)
|
71
|
+
|
73
72
|
- 发布版本需要加tag
|
74
73
|
|
75
74
|
## 开发哲学
|
@@ -91,6 +90,23 @@ b = nlpertools.io.file.readtxt_list_all_strip('res.txt')
|
|
91
90
|
```
|
92
91
|
|
93
92
|
```bash
|
94
|
-
#
|
95
|
-
python -m nlpertools
|
93
|
+
# 生成pypi双因素认证的实时密钥(需要提供key)
|
94
|
+
python -m nlpertools.cli --get_2fa --get_2fa_key your_key
|
95
|
+
|
96
|
+
## git
|
97
|
+
python -m nlpertools.cli --git_push
|
98
|
+
python -m nlpertools.cli --git_pull
|
99
|
+
|
100
|
+
# 以下功能被nvitop替代,不推荐使用
|
101
|
+
## 监控gpu显存
|
102
|
+
python -m nlpertools.monitor.gpu
|
103
|
+
## 监控cpu
|
104
|
+
python -m nlpertools.monitor.memory
|
96
105
|
```
|
106
|
+
|
107
|
+
## 一些常用项目
|
108
|
+
|
109
|
+
nvitop
|
110
|
+
|
111
|
+
ydata-profiling
|
112
|
+
|
@@ -0,0 +1,36 @@
|
|
1
|
+
import os
|
2
|
+
import re
|
3
|
+
|
4
|
+
from setuptools import setup
|
5
|
+
|
6
|
+
|
7
|
+
def get_version():
    """Read the package version from ``src/nlpertools/__init__.py``.

    The file is searched for a ``__version__ = '<value>'`` assignment at the
    start of a line. Both single and double quotes are accepted, and the
    captured value can never extend past its closing quote.

    Returns:
        str: The version string, e.g. ``"1.0.9"``.

    Raises:
        ValueError: If no ``__version__`` assignment is found.
    """
    init_path = os.path.join("src", "nlpertools", "__init__.py")
    with open(init_path, "r", encoding="utf-8") as f:
        file_content = f.read()
    # The value class excludes both quote characters and line breaks, so a
    # later quoted string in the file cannot be swallowed into the version.
    pattern = r"""^[ \t]*__version__\s*=\s*(['"])([^'"\r\n]+)\1"""
    match = re.search(pattern, file_content, re.MULTILINE)
    if match is None:
        raise ValueError("__version__ assignment not found in {}".format(init_path))
    return match.group(2)
|
13
|
+
|
14
|
+
|
15
|
+
def main():
    """Invoke ``setuptools.setup`` with the dependencies, version and CLI entry point.

    The version is obtained from ``get_version()``; the ``ncli`` console
    script is mapped to ``nlpertools.cli:main``.
    """
    # Packaging reference: https://juejin.cn/post/7369349560421040128
    runtime_requirements = ["numpy", "pandas", "psutil"]
    optional_requirements = {"torch": ["torch"]}
    script_entry_points = {"console_scripts": ["ncli=nlpertools.cli:main"]}

    setup(
        install_requires=runtime_requirements,
        extras_require=optional_requirements,
        version=get_version(),
        entry_points=script_entry_points,
    )
|
33
|
+
|
34
|
+
|
35
|
+
if __name__ == '__main__':
|
36
|
+
main()
|
@@ -3,6 +3,7 @@
|
|
3
3
|
# @Author : youshu.Ji
|
4
4
|
from .algo.kmp import *
|
5
5
|
from .data_structure.base_structure import *
|
6
|
+
from .draw import *
|
6
7
|
from .dataprocess import *
|
7
8
|
from .io.dir import *
|
8
9
|
from .io.file import *
|
@@ -15,10 +16,8 @@ from .reminder import *
|
|
15
16
|
from .utils_for_nlpertools import *
|
16
17
|
from .wrapper import *
|
17
18
|
from .monitor import *
|
19
|
+
from .cli import *
|
18
20
|
|
19
|
-
import os
|
20
21
|
|
21
22
|
|
22
|
-
|
23
|
-
|
24
|
-
__version__ = '1.0.5'
|
23
|
+
__version__ = '1.0.9'
|
@@ -0,0 +1,143 @@
|
|
1
|
+
import argparse
|
2
|
+
import os
|
3
|
+
import uuid
|
4
|
+
import sys
|
5
|
+
|
6
|
+
"""
|
7
|
+
如何Debug cli.py
|
8
|
+
"""
|
9
|
+
|
10
|
+
|
11
|
+
def git_push(max_retries=None, retry_delay=1.0):
    """Run ``git push --set-upstream origin main``, retrying until it succeeds.

    Intended for networks where pushing to GitHub fails intermittently.
    Success is decided by git's exit status: the previous implementation
    inspected the return value of ``os.system``, which is an integer status
    and never the text "fatal", so it always stopped after one attempt.

    Args:
        max_retries: Maximum number of retries after the first attempt.
            ``None`` (the default) retries until the push succeeds or the
            process is interrupted, including for permanent failures such
            as a rejected push.
        retry_delay: Seconds to wait between attempts.

    Returns:
        bool: ``True`` once the push succeeded, ``False`` if git could not
        be started or the retry budget was exhausted.
    """
    import subprocess
    import time

    command = ["git", "push", "--set-upstream", "origin", "main"]
    num = 0
    while True:
        print("retry num: {}".format(num))
        try:
            returncode = subprocess.run(command).returncode
        except OSError as exc:
            # The git executable is missing or not runnable; retrying cannot help.
            print("failed to run git: {}".format(exc))
            return False
        print(str(returncode))
        if returncode == 0:
            print("success")
            return True
        if max_retries is not None and num >= max_retries:
            print("giving up after {} retries".format(num))
            return False
        num += 1
        time.sleep(retry_delay)
|
24
|
+
|
25
|
+
|
26
|
+
def git_pull(max_retries=None, retry_delay=1.0):
    """Run ``git pull``, retrying until it succeeds.

    Intended for networks where pulling from GitHub fails intermittently.
    Success is decided by git's exit status: the previous implementation
    inspected the return value of ``os.system``, which is an integer status
    and never the text "fatal" or "error", so it always stopped after one
    attempt.

    Args:
        max_retries: Maximum number of retries after the first attempt.
            ``None`` (the default) retries until the pull succeeds or the
            process is interrupted, including for permanent failures such
            as merge conflicts.
        retry_delay: Seconds to wait between attempts.

    Returns:
        bool: ``True`` once the pull succeeded, ``False`` if git could not
        be started or the retry budget was exhausted.
    """
    import subprocess
    import time

    command = ["git", "pull"]
    num = 0
    while True:
        print("retry num: {}".format(num))
        try:
            returncode = subprocess.run(command).returncode
        except OSError as exc:
            # The git executable is missing or not runnable; retrying cannot help.
            print("failed to run git: {}".format(exc))
            return False
        print(str(returncode))
        if returncode == 0:
            print("success")
            return True
        if max_retries is not None and num >= max_retries:
            print("giving up after {} retries".format(num))
            return False
        num += 1
        time.sleep(retry_delay)
|
39
|
+
|
40
|
+
|
41
|
+
def get_mac_address():
    """Print and return the node's hardware address as ``aa:bb:cc:dd:ee:ff``.

    The address is derived from ``uuid.getnode()``; as the printed notice
    says, the value is not guaranteed to be accurate.

    Returns:
        str: Six colon-separated lowercase hex octets.
    """
    node_hex = uuid.UUID(int=uuid.getnode()).hex[-12:]
    octets = []
    for start in range(0, 11, 2):
        octets.append(node_hex[start:start + 2])
    mac_address = ":".join(octets)
    print("mac address 不一定准确")
    print(mac_address)
    return mac_address
|
47
|
+
|
48
|
+
|
49
|
+
def get_2af_value(key):
    """Print and return the current TOTP code for a two-factor secret.

    Args:
        key: The shared secret passed to ``pyotp.TOTP``. The original note
            said the key should be 7 characters long; TODO confirm, as
            nothing in this function enforces a length.

    Returns:
        The current one-time code produced by ``pyotp.TOTP(key).now()``.
    """
    # Imported lazily so the CLI works without pyotp unless this command is used.
    import pyotp

    # The secret itself is deliberately not echoed: it would otherwise end
    # up in terminal scrollback and captured logs.
    code = pyotp.TOTP(key).now()
    print(code)
    return code
|
57
|
+
|
58
|
+
|
59
|
+
def start_gpu_usage_notify_server():
    """Start a Flask server that reports how many GPUs currently use no memory.

    ``GET /notify`` runs ``nvidia-smi --query-gpu=memory.used --format=csv``
    and answers with the number of rows (after the CSV header) whose value
    starts with ``0``, as plain text with status 200. The server listens on
    all interfaces (``0.0.0.0``) on port 5000 and blocks until stopped.
    """
    import subprocess

    from flask import Flask

    app = Flask(__name__)

    def count_idle_gpus():
        """Return the number of GPUs whose reported used memory starts with "0"."""
        try:
            # subprocess.run waits for the process and closes its pipe, and
            # avoids spawning a shell for every request.
            result = subprocess.run(
                ["nvidia-smi", "--query-gpu=memory.used", "--format=csv"],
                stdout=subprocess.PIPE,
                universal_newlines=True,
            )
        except OSError:
            # nvidia-smi is not available: report zero idle GPUs, matching
            # the empty output the previous shell pipeline produced.
            return 0
        rows = result.stdout.split("\n")[1:]  # drop the CSV header line
        return sum(1 for row in rows if row.startswith("0"))

    @app.route("/notify", methods=["GET"])
    def notify():
        # The response body is generated on every request.
        res = count_idle_gpus()
        print(res)
        return str(res), 200

    app.run(host="0.0.0.0", port=5000)
|
76
|
+
|
77
|
+
|
78
|
+
def start_gpu_usage_notify_client():
    """Poll the GPU notify server once per second and show desktop notifications.

    Each cycle requests ``SERVER_URL``. On a 200 response the body is parsed
    as an integer; a positive value triggers a ``plyer`` desktop notification
    and a console message, otherwise a "no new notification" message is
    printed. Any failure in a cycle is printed and the loop continues. The
    function never returns.
    """
    import requests
    from plyer import notification
    import time

    SERVER_URL = 'http://127.0.0.1:5000/notify'  # API address of the notify server
    # Without a timeout a stalled connection would block the poll loop forever.
    REQUEST_TIMEOUT = 5  # seconds

    def notify(text):
        # Send the desktop notification through plyer.
        notification.notify(
            title='远程通知',
            message=text,
            timeout=10  # keep the notification visible for 10 seconds
        )

    # Poll the server periodically for notifications.
    while True:
        try:
            response = requests.get(SERVER_URL, timeout=REQUEST_TIMEOUT)
            if response.status_code == 200:
                num = int(response.text)
                if num > 0:
                    notify(f"服务器有{num}张卡")
                    print(f"服务器有{num}张卡")
                else:
                    print("服务器没有新通知")
        except Exception as e:
            # Best-effort loop: report the failure and try again next cycle.
            print(f"与服务器连接失败: {e}")

        time.sleep(1)
|
108
|
+
|
109
|
+
|
110
|
+
def main(argv=None):
    """Parse command-line flags and run the first selected operation.

    Flags are checked in a fixed order (git push, git pull, MAC address,
    GPU monitor client, GPU monitor server, 2FA); only the first one that
    is set runs. ``--git_push`` and ``--git_pull`` are accepted as aliases
    of ``--gitpush`` and ``--gitpull``.

    Args:
        argv: Argument list to parse. ``None`` (the default) makes argparse
            read ``sys.argv[1:]``, which is what the console entry point uses.
    """
    parser = argparse.ArgumentParser(
        description="CLI tool for git operations, MAC address lookup, 2FA codes and GPU monitoring.")
    parser.add_argument('--gitpush', '--git_push', dest='gitpush', action='store_true',
                        help='Perform git push operation.')
    parser.add_argument('--gitpull', '--git_pull', dest='gitpull', action='store_true',
                        help='Perform git pull operation.')
    parser.add_argument('--mac_address', action='store_true', help='Get the MAC address.')

    parser.add_argument('--get_2fa', action='store_true', help='Get the 2fa value.')
    parser.add_argument('--get_2fa_key', type=str, help='Secret key used to compute the 2fa value.')
    parser.add_argument('--monitor_gpu_cli', action='store_true',
                        help='Start the GPU usage notification client.')
    parser.add_argument('--monitor_gpu_ser', action='store_true',
                        help='Start the GPU usage notification server.')

    args = parser.parse_args(argv)

    if args.gitpush:
        git_push()
    elif args.gitpull:
        git_pull()
    elif args.mac_address:
        get_mac_address()
    elif args.monitor_gpu_cli:
        start_gpu_usage_notify_client()
    elif args.monitor_gpu_ser:
        start_gpu_usage_notify_server()
    elif args.get_2fa:
        if args.get_2fa_key:
            get_2af_value(args.get_2fa_key)
        else:
            print("Please provide a key as an argument.")
    else:
        print("No operation specified.")
|
140
|
+
|
141
|
+
|
142
|
+
if __name__ == '__main__':
|
143
|
+
main()
|
@@ -1,3 +1,4 @@
|
|
1
|
+
#encoding=utf-8
|
1
2
|
# !/usr/bin/python3.8
|
2
3
|
# -*- coding: utf-8 -*-
|
3
4
|
# @Author : youshu.Ji
|
@@ -5,9 +6,11 @@ import datetime
|
|
5
6
|
import json
|
6
7
|
import logging
|
7
8
|
|
8
|
-
from . import DB_CONFIG_FILE
|
9
9
|
from .io.file import read_yaml
|
10
10
|
from .utils.package import *
|
11
|
+
import os
|
12
|
+
|
13
|
+
DB_CONFIG_FILE = os.path.join(os.path.dirname(__file__), "default_db_config.yml")
|
11
14
|
|
12
15
|
# import aioredis
|
13
16
|
# import happybase
|
@@ -28,21 +31,24 @@ class Neo4jOps(object):
|
|
28
31
|
NEO4J_TIMEOUT = 0.3
|
29
32
|
pass
|
30
33
|
|
34
|
+
|
31
35
|
class SqliteOps(object):
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
conn.
|
36
|
+
pass
|
37
|
+
# import sqlite3
|
38
|
+
# database_path = r'xx.db'
|
39
|
+
# conn = sqlite3.connect(database_path)
|
40
|
+
# c = conn.cursor()
|
41
|
+
# sql = "select name from sqlite_master where type='table' order by name"
|
42
|
+
# c.execute(sql)
|
43
|
+
# print(c.fetchall())
|
44
|
+
# sql = "select * from typecho_contents"
|
45
|
+
# c.execute(sql)
|
46
|
+
# res = c.fetchall()
|
47
|
+
# print(res[3])
|
48
|
+
#
|
49
|
+
# conn.commit()
|
50
|
+
# conn.close()
|
51
|
+
|
46
52
|
|
47
53
|
class MysqlOps(object):
|
48
54
|
import pandas as pd
|
@@ -116,6 +122,41 @@ class EsOps(object):
|
|
116
122
|
print(f"批量保存数据: {_res}")
|
117
123
|
|
118
124
|
|
125
|
+
class MongoDB_BETA:
    """Small convenience wrapper around one MongoDB collection.

    Connection parameters are stored by ``__init__``; nothing is opened until
    ``connect()`` is called (or the instance is used as a context manager).
    """

    def __init__(self, host='localhost', port=27017, db_name=None, collection_name=None):
        """Store the connection settings without connecting.

        :param host: server host name passed to ``MongoClient``
        :param port: server port passed to ``MongoClient``
        :param db_name: name of the database selected by ``connect()``
        :param collection_name: name of the collection selected by ``connect()``
        """
        self.host = host
        self.port = port
        self.db_name = db_name
        self.collection_name = collection_name
        # Handles populated by connect().
        self.client = None
        self.db = None
        self.collection = None

    def __enter__(self):
        """Connect and return the instance, for use in a ``with`` statement."""
        self.connect()
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Close the client when leaving the ``with`` block; never suppresses exceptions."""
        self.close()
        return False

    def connect(self):
        """Create the client and select the configured database and collection."""
        self.client = MongoClient(self.host, self.port)
        self.db = self.client[self.db_name]
        self.collection = self.db[self.collection_name]

    def close(self):
        """Close the client if one was created."""
        # Explicit None check: do not rely on the client's truth value.
        if self.client is not None:
            self.client.close()

    def insert_data(self, data):
        """Insert one document, or every document of a list.

        :param data: a single document, or a list of documents; an empty
            list is a no-op instead of being passed to ``insert_many``
        """
        if isinstance(data, list):
            if data:
                self.collection.insert_many(data)
        else:
            self.collection.insert_one(data)

    def check_data_exists(self, query):
        """
        Check whether any document matching the query exists.
        :param query: query filter
        :return: bool, True when at least one matching document exists
        """
        # limit=1 lets the server stop at the first match instead of
        # counting every matching document.
        return self.collection.count_documents(query, limit=1) > 0
|
157
|
+
|
158
|
+
|
159
|
+
|
119
160
|
class MongoOps(object):
|
120
161
|
from pymongo import MongoClient
|
121
162
|
def __init__(self, config=global_db_config["mongo"]):
|
@@ -348,8 +389,6 @@ class KafkaOps(object):
|
|
348
389
|
print(recv)
|
349
390
|
|
350
391
|
|
351
|
-
|
352
|
-
|
353
392
|
class MilvusOps(object):
|
354
393
|
def __init__(self, config=global_db_config.milvus):
|
355
394
|
from pymilvus import connections, Collection
|
@@ -55,9 +55,9 @@ class Pattern:
|
|
55
55
|
# 中文人名
|
56
56
|
chinese_name_pattern = "(?:[\u4e00-\u9fa5·]{2,3})"
|
57
57
|
# 英文人名
|
58
|
-
english_name_pattern = "(^[a-zA-Z][a-zA-Z\s]{0,20}[a-zA-Z]$)"
|
58
|
+
english_name_pattern = r"(^[a-zA-Z][a-zA-Z\s]{0,20}[a-zA-Z]$)"
|
59
59
|
# 纯数字
|
60
|
-
pure_num_pattern = "\d+"
|
60
|
+
pure_num_pattern = r"\d+"
|
61
61
|
# xxxx图/表 之类的表述
|
62
62
|
pic_table_descript_pattern = ".{1,15}图"
|
63
63
|
|
@@ -66,20 +66,20 @@ class Pattern:
|
|
66
66
|
hlink_pattern = (
|
67
67
|
r"(https?|ftp|file)://[-A-Za-z0-9+&@#/%?=~_|!:,.;]+[-A-Za-z0-9+&@#/%=~_|]"
|
68
68
|
)
|
69
|
-
http_pattern = "(http|https):\/\/([\w.]+\/?)\S*/\S*"
|
69
|
+
http_pattern = r"(http|https):\/\/([\w.]+\/?)\S*/\S*"
|
70
70
|
# 邮箱
|
71
|
-
email_pattern = "[A-Za-z0-9\u4e00-\u9fa5]+@[a-zA-Z0-9_-]+(\.[a-zA-Z0-9_-]+)+"
|
71
|
+
email_pattern = r"[A-Za-z0-9\u4e00-\u9fa5]+@[a-zA-Z0-9_-]+(\.[a-zA-Z0-9_-]+)+"
|
72
72
|
# html 可能过于严格了
|
73
|
-
html_pattern = "<[\s\S]*?>"
|
73
|
+
html_pattern = r"<[\s\S]*?>"
|
74
74
|
# 重复 “asdasdasdasd”
|
75
75
|
# Must be a raw string: in a normal literal "\1" is the control character
# U+0001, not a backreference to group 1.
repeat_pattern = r"(.)\1+"
|
76
76
|
# 日期
|
77
|
-
day_time_pattern = "\d{1,4}(-)(1[0-2]|0?[1-9])\1(0?[1-9]|[1-2]\d|30|31)"
|
77
|
+
day_time_pattern = r"\d{1,4}(-)(1[0-2]|0?[1-9])\1(0?[1-9]|[1-2]\d|30|31)"
|
78
78
|
# 小时
|
79
|
-
hour_time_pattern = "(?:[01]\d|2[0-3]):[0-5]\d:[0-5]\d"
|
79
|
+
hour_time_pattern = r"(?:[01]\d|2[0-3]):[0-5]\d:[0-5]\d"
|
80
80
|
# 股票
|
81
81
|
stock_pattern = (
|
82
|
-
"(s[hz]|S[HZ])(000[\d]{3}|002[\d]{3}|300[\d]{3}|600[\d]{3}|60[\d]{4})"
|
82
|
+
r"(s[hz]|S[HZ])(000[\d]{3}|002[\d]{3}|300[\d]{3}|600[\d]{3}|60[\d]{4})"
|
83
83
|
)
|
84
84
|
|
85
85
|
# 一般是需要替换的
|
@@ -91,7 +91,7 @@ class Pattern:
|
|
91
91
|
# 微博视频等
|
92
92
|
weibo_pattern = r"([\s]\w+(的微博视频)|#|【|】|转发微博)"
|
93
93
|
# @
|
94
|
-
at_pattern = "@\w+"
|
94
|
+
at_pattern = r"@\w+"
|
95
95
|
|
96
96
|
# from https://github.com/bigscience-workshop/data-preparation pii
|
97
97
|
year_patterns = [
|
@@ -116,7 +116,7 @@ class Pattern:
|
|
116
116
|
ipv4_pattern = r'(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)(?:\.(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)){3}'
|
117
117
|
ipv6_pattern = r'(?:[0-9a-fA-F]{1,4}:){7,7}[0-9a-fA-F]{1,4}|(?:[0-9a-fA-F]{1,4}:){1,7}:|(?:[0-9a-fA-F]{1,4}:){1,6}:[0-9a-fA-F]{1,4}|(?:[0-9a-fA-F]{1,4}:){1,5}(?::[0-9a-fA-F]{1,4}){1,2}|(?:[0-9a-fA-F]{1,4}:){1,4}(?::[0-9a-fA-F]{1,4}){1,3}|(?:[0-9a-fA-F]{1,4}:){1,3}(?::[0-9a-fA-F]{1,4}){1,4}|(?:[0-9a-fA-F]{1,4}:){1,2}(?::[0-9a-fA-F]{1,4}){1,5}|[0-9a-fA-F]{1,4}:(?:(?::[0-9a-fA-F]{1,4}){1,6})|:(?:(?::[0-9a-fA-F]{1,4}){1,7}|:)|fe80:(?::[0-9a-fA-F]{0,4}){0,4}%[0-9a-zA-Z]{1,}|::(?:ffff(?::0{1,4}){0,1}:){0,1}(?:(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3,3}(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])|(?:[0-9a-fA-F]{1,4}:){1,4}:(?:(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3,3}(25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])'
|
118
118
|
ip_pattern = r"(?:^|[\b\s@?,!;:\'\")(.\p{Han}])(" + r"|".join(
|
119
|
-
[ipv4_pattern, ipv6_pattern]) + ")(?:$|[\s@,?!;:\'\"(.\p{Han}])"
|
119
|
+
[ipv4_pattern, ipv6_pattern]) + r")(?:$|[\s@,?!;:\'\"(.\p{Han}])"
|
120
120
|
|
121
121
|
# https://regex101.com/r/EpA5B7/1
|
122
122
|
email_line_pattern = r'''
|
@@ -466,7 +466,7 @@ class TextProcess(object):
|
|
466
466
|
p = re.compile(pattern, re.S)
|
467
467
|
text = p.sub("", text)
|
468
468
|
|
469
|
-
dr = re.compile("@\w+", re.S)
|
469
|
+
dr = re.compile(r"@\w+", re.S)
|
470
470
|
text = dr.sub("", text)
|
471
471
|
|
472
472
|
return text
|
@@ -527,7 +527,7 @@ class TextProcess(object):
|
|
527
527
|
text = re.sub(pattern, replace, text)
|
528
528
|
return text
|
529
529
|
|
530
|
-
def calc_proportion_zh(self,text):
|
530
|
+
def calc_proportion_zh(self, text):
|
531
531
|
text = text.strip()
|
532
532
|
# 如果是中国英文的情况,并且英文有空格分开
|
533
533
|
if " " in text:
|
@@ -538,6 +538,8 @@ class TextProcess(object):
|
|
538
538
|
chinese_count += 1
|
539
539
|
else:
|
540
540
|
pass
|
541
|
+
|
542
|
+
|
541
543
|
class CopyFunc():
|
542
544
|
# from https://github.com/lemon234071/clean-dialog
|
543
545
|
def is_chinese_char(cp):
|
@@ -597,6 +599,20 @@ def convert_basic2fullwidth(sentence):
|
|
597
599
|
new_sentence += char
|
598
600
|
return new_sentence
|
599
601
|
|
602
|
+
|
603
|
+
def clean_illegal_chars_for_excel(df):
    """Return a copy of ``df`` with control characters stripped from string cells.

    Writing an Excel file through openpyxl fails on certain control
    characters, so the characters U+0000-U+0008, U+000B, U+000C and
    U+000E-U+001F are removed from every ``str`` element. Non-string
    elements are returned unchanged, and ``df`` itself is not modified.

    :param df: pandas DataFrame to clean
    :return: a new DataFrame with the cleaned values
    """
    illegal_chars = re.compile(r'[\x00-\x08\x0B\x0C\x0E-\x1F]')

    def remove_illegal_chars(s):
        # Only strings can carry the offending characters.
        if isinstance(s, str):
            return illegal_chars.sub('', s)
        return s

    # DataFrame.map only exists in pandas >= 2.1; older releases provide the
    # same element-wise operation as DataFrame.applymap.
    element_wise = getattr(df, "map", None)
    if element_wise is None:
        element_wise = df.applymap
    return element_wise(remove_illegal_chars)
|
614
|
+
|
615
|
+
|
600
616
|
if __name__ == "__main__":
|
601
617
|
pattern_for_filter = [
|
602
618
|
Pattern.redundancy_space_pattern,
|
@@ -0,0 +1,81 @@
|
|
1
|
+
#!/usr/bin/python3.8
|
2
|
+
# -*- coding: utf-8 -*-
|
3
|
+
# @Author : youshu.Ji
|
4
|
+
from ..utils.package import plt
|
5
|
+
|
6
|
+
|
7
|
+
def confused_matrix(confuse_matrix, ticklabels=None, save_path='tmp.jpg'):
    """Draw a confusion matrix as an annotated heatmap, show it and save it.

    :param confuse_matrix: matrix of values passed to ``seaborn.heatmap``
    :param ticklabels: labels used on both axes; ``None`` keeps the previous
        hard-coded labels ``["l1", "l2", "l31"]``
    :param save_path: file the figure is saved to after it has been shown
    """
    import seaborn as sns

    if ticklabels is None:
        ticklabels = ["l1", "l2", "l31"]

    sns.set()
    f, ax = plt.subplots()
    # Heatmap with the cell values annotated.
    sns.heatmap(confuse_matrix, annot=True, fmt=".3g", ax=ax, cmap='rainbow',
                xticklabels=ticklabels, yticklabels=ticklabels)

    ax.set_title('confusion matrix')  # title
    ax.set_xlabel('predict')  # x axis
    ax.set_ylabel('true')  # y axis
    plt.show()

    f.savefig(save_path, bbox_inches='tight')
|
21
|
+
|
22
|
+
|
23
|
+
def plot_histogram(data, bin_size, max_bin,
                   title='Module Line Number Distribution',
                   xlabel='module line number',
                   ylabel='frequency'):
    """Plot a histogram in which every value above ``max_bin`` shares one final bin.

    Bin edges run from 0 in steps of ``bin_size``; the last tick is labelled
    "∞" and its bin collects all values greater than ``max_bin``. A dashed
    line marks the mean of the original (unclamped) data and each bar is
    annotated with its count. The figure is shown with ``plt.show()``.

    :param data: sequence of numeric values
    :param bin_size: width of each bin
    :param max_bin: values greater than this are moved into the final bin
    :param title: figure title
    :param xlabel: x-axis label
    :param ylabel: y-axis label
    :return: None
    """
    import matplotlib.pyplot as plt
    import numpy as np
    from matplotlib.ticker import MaxNLocator

    # Bins are closed on the left and open on the right, except the last one.
    bins = np.arange(0, max_bin + 2 * bin_size, bin_size)

    # Clamp overflow to the last edge, which the final bin includes. The
    # previous sentinel (max_bin + 3) fell outside the bins when bin_size < 3.
    overflow_value = bins[-1]
    processed_data = [length if length <= max_bin else overflow_value for length in data]

    plt.figure(figsize=(12, 8))
    # Draw the histogram.
    n, new_bins, patches = plt.hist(processed_data, bins=bins, edgecolor='black', color='skyblue', alpha=0.7,
                                    linewidth=0)

    # One tick per bin edge; the last edge is labelled "∞".
    plt.gca().set_xticks(bins)
    plt.gca().set_xticklabels([str(edge) for edge in bins[:-1]] + ["∞"])

    # The mean uses the raw data, so it may lie beyond the last bin.
    mean_val = np.mean(data)
    plt.axvline(mean_val, color='red', linestyle='dashed', linewidth=1)
    plt.text(mean_val + bin_size / 10, max(n) * 0.9, f'Mean: {mean_val:.2f}', color='red')

    # Title and axis labels.
    plt.title(title, fontsize=16, fontweight='bold')
    plt.xlabel(xlabel, fontsize=14)
    plt.ylabel(ylabel, fontsize=14)

    plt.grid(True, linestyle='--', alpha=0.6)

    plt.xticks(fontsize=12)
    plt.yticks(fontsize=12)

    # Write the count above every bar.
    for count, patch in zip(n, patches):
        plt.text(patch.get_x() + patch.get_width() / 2, patch.get_height(),
                 str(int(count)), ha='center', va='bottom', fontsize=12)
    plt.gca().yaxis.set_major_locator(MaxNLocator(integer=True))
    # Show the chart.
    plt.show()
|
75
|
+
|
76
|
+
|
77
|
+
if __name__ == '__main__':
|
78
|
+
# 调整区间大小
|
79
|
+
bin_size = 50
|
80
|
+
# 示例模块长度数据
|
81
|
+
plot_histogram([1, 100, 999, 1000, 1002, 1100, 1150], bin_size, max_bin=1000)
|
@@ -0,0 +1,33 @@
|
|
1
|
+
# 数学函数
|
2
|
+
def draw_log():
    """Plot the natural logarithm over [0.1, 10] and show the figure.

    The x axis gets a major tick every 1 unit formatted with one decimal;
    dashed gray guide lines are drawn at x = 1 and y = 0.
    """
    import matplotlib.pyplot as plt
    import numpy as np
    from matplotlib.ticker import MultipleLocator, FormatStrFormatter

    # Sample points; np.log is the natural logarithm (base e).
    xs = np.linspace(0.1, 10, 100)
    ys = np.log(xs)

    # New figure and axes holding the curve.
    fig, ax = plt.subplots()
    ax.plot(xs, ys)

    # Title and axis labels.
    ax.set_title("Logarithmic Function")
    ax.set_xlabel("x")
    ax.set_ylabel("log(x)")

    # Major x ticks every 1 unit, rendered with one decimal place.
    x_axis = ax.xaxis
    x_axis.set_major_locator(MultipleLocator(1))
    x_axis.set_major_formatter(FormatStrFormatter("%.1f"))

    # Dashed guide lines at x = 1 and y = 0.
    ax.axvline(x=1, linestyle="--", color="gray")
    ax.axhline(y=0, linestyle="--", color="gray")

    # Show the figure.
    plt.show()
|