nlpertools 1.0.6.dev0__tar.gz → 1.0.8__tar.gz
Sign up to get free protection for your applications and to get access to all the features.
- {nlpertools-1.0.6.dev0/src/nlpertools.egg-info → nlpertools-1.0.8}/PKG-INFO +29 -8
- {nlpertools-1.0.6.dev0 → nlpertools-1.0.8}/README.md +23 -7
- {nlpertools-1.0.6.dev0 → nlpertools-1.0.8}/setup.cfg +0 -1
- nlpertools-1.0.8/setup.py +36 -0
- {nlpertools-1.0.6.dev0 → nlpertools-1.0.8}/src/nlpertools/__init__.py +3 -4
- nlpertools-1.0.8/src/nlpertools/cli.py +87 -0
- {nlpertools-1.0.6.dev0 → nlpertools-1.0.8}/src/nlpertools/data_client.py +56 -17
- {nlpertools-1.0.6.dev0 → nlpertools-1.0.8}/src/nlpertools/dataprocess.py +28 -12
- nlpertools-1.0.8/src/nlpertools/draw/draw.py +83 -0
- nlpertools-1.0.8/src/nlpertools/draw/math_func.py +33 -0
- {nlpertools-1.0.6.dev0 → nlpertools-1.0.8}/src/nlpertools/io/dir.py +34 -2
- {nlpertools-1.0.6.dev0 → nlpertools-1.0.8}/src/nlpertools/io/file.py +15 -9
- {nlpertools-1.0.6.dev0 → nlpertools-1.0.8}/src/nlpertools/ml.py +52 -24
- nlpertools-1.0.8/src/nlpertools/monitor/__init__.py +0 -0
- {nlpertools-1.0.6.dev0 → nlpertools-1.0.8}/src/nlpertools/other.py +135 -24
- nlpertools-1.0.8/src/nlpertools/template/__init__.py +0 -0
- {nlpertools-1.0.6.dev0 → nlpertools-1.0.8/src/nlpertools.egg-info}/PKG-INFO +29 -8
- {nlpertools-1.0.6.dev0 → nlpertools-1.0.8}/src/nlpertools.egg-info/SOURCES.txt +8 -0
- nlpertools-1.0.8/src/nlpertools.egg-info/entry_points.txt +2 -0
- nlpertools-1.0.8/src/nlpertools.egg-info/requires.txt +6 -0
- {nlpertools-1.0.6.dev0 → nlpertools-1.0.8}/LICENSE +0 -0
- {nlpertools-1.0.6.dev0 → nlpertools-1.0.8}/pyproject.toml +0 -0
- {nlpertools-1.0.6.dev0 → nlpertools-1.0.8}/src/nlpertools/algo/__init__.py +0 -0
- {nlpertools-1.0.6.dev0 → nlpertools-1.0.8}/src/nlpertools/algo/ac.py +0 -0
- {nlpertools-1.0.6.dev0 → nlpertools-1.0.8}/src/nlpertools/algo/bit_ops.py +0 -0
- {nlpertools-1.0.6.dev0 → nlpertools-1.0.8}/src/nlpertools/algo/kmp.py +0 -0
- {nlpertools-1.0.6.dev0 → nlpertools-1.0.8}/src/nlpertools/algo/num_ops.py +0 -0
- {nlpertools-1.0.6.dev0 → nlpertools-1.0.8}/src/nlpertools/algo/template.py +0 -0
- {nlpertools-1.0.6.dev0 → nlpertools-1.0.8}/src/nlpertools/algo/union.py +0 -0
- {nlpertools-1.0.6.dev0 → nlpertools-1.0.8}/src/nlpertools/data_structure/__init__.py +0 -0
- {nlpertools-1.0.6.dev0 → nlpertools-1.0.8}/src/nlpertools/data_structure/base_structure.py +0 -0
- {nlpertools-1.0.6.dev0 → nlpertools-1.0.8}/src/nlpertools/default_db_config.yml +0 -0
- {nlpertools-1.0.6.dev0/src/nlpertools/monitor → nlpertools-1.0.8/src/nlpertools/draw}/__init__.py +0 -0
- /nlpertools-1.0.6.dev0/src/nlpertools/template/__init__.py → /nlpertools-1.0.8/src/nlpertools/get_2fa.py +0 -0
- {nlpertools-1.0.6.dev0 → nlpertools-1.0.8}/src/nlpertools/io/__init__.py +0 -0
- {nlpertools-1.0.6.dev0 → nlpertools-1.0.8}/src/nlpertools/monitor/gpu.py +0 -0
- {nlpertools-1.0.6.dev0 → nlpertools-1.0.8}/src/nlpertools/monitor/memory.py +0 -0
- {nlpertools-1.0.6.dev0 → nlpertools-1.0.8}/src/nlpertools/movie.py +0 -0
- {nlpertools-1.0.6.dev0 → nlpertools-1.0.8}/src/nlpertools/nlpertools_config.yml +0 -0
- {nlpertools-1.0.6.dev0 → nlpertools-1.0.8}/src/nlpertools/open_api.py +0 -0
- {nlpertools-1.0.6.dev0 → nlpertools-1.0.8}/src/nlpertools/pic.py +0 -0
- {nlpertools-1.0.6.dev0 → nlpertools-1.0.8}/src/nlpertools/plugin.py +0 -0
- {nlpertools-1.0.6.dev0 → nlpertools-1.0.8}/src/nlpertools/reminder.py +0 -0
- {nlpertools-1.0.6.dev0 → nlpertools-1.0.8}/src/nlpertools/utils/__init__.py +0 -0
- {nlpertools-1.0.6.dev0 → nlpertools-1.0.8}/src/nlpertools/utils/lazy.py +0 -0
- {nlpertools-1.0.6.dev0 → nlpertools-1.0.8}/src/nlpertools/utils/log_util.py +0 -0
- {nlpertools-1.0.6.dev0 → nlpertools-1.0.8}/src/nlpertools/utils/package.py +0 -0
- {nlpertools-1.0.6.dev0 → nlpertools-1.0.8}/src/nlpertools/utils/package_v1.py +0 -0
- {nlpertools-1.0.6.dev0 → nlpertools-1.0.8}/src/nlpertools/utils/package_v2.py +0 -0
- {nlpertools-1.0.6.dev0 → nlpertools-1.0.8}/src/nlpertools/utils_for_nlpertools.py +0 -0
- {nlpertools-1.0.6.dev0 → nlpertools-1.0.8}/src/nlpertools/vector_index_demo.py +0 -0
- {nlpertools-1.0.6.dev0 → nlpertools-1.0.8}/src/nlpertools/wrapper.py +0 -0
- {nlpertools-1.0.6.dev0 → nlpertools-1.0.8}/src/nlpertools.egg-info/dependency_links.txt +0 -0
- {nlpertools-1.0.6.dev0 → nlpertools-1.0.8}/src/nlpertools.egg-info/top_level.txt +0 -0
- {nlpertools-1.0.6.dev0 → nlpertools-1.0.8}/src/nlpertools_helper/__init__.py +0 -0
- {nlpertools-1.0.6.dev0 → nlpertools-1.0.8}/tests/test_kmp.py +0 -0
- {nlpertools-1.0.6.dev0 → nlpertools-1.0.8}/tests/test_path_exists.py +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: nlpertools
|
3
|
-
Version: 1.0.
|
3
|
+
Version: 1.0.8
|
4
4
|
Summary: A small package about small basic IO operation when coding
|
5
5
|
Home-page: https://github.com/lvzii/nlpertools
|
6
6
|
Author: youshuJi
|
@@ -12,6 +12,11 @@ Classifier: Operating System :: OS Independent
|
|
12
12
|
Requires-Python: >=3.6
|
13
13
|
Description-Content-Type: text/markdown
|
14
14
|
License-File: LICENSE
|
15
|
+
Requires-Dist: numpy
|
16
|
+
Requires-Dist: pandas
|
17
|
+
Requires-Dist: psutil
|
18
|
+
Provides-Extra: torch
|
19
|
+
Requires-Dist: torch; extra == "torch"
|
15
20
|
|
16
21
|
<div align="center">
|
17
22
|
<h4 align="center">
|
@@ -23,9 +28,6 @@ License-File: LICENSE
|
|
23
28
|
</div>
|
24
29
|
|
25
30
|
|
26
|
-
# 当前版本
|
27
|
-
|
28
|
-
1.0.5
|
29
31
|
|
30
32
|
# 说明
|
31
33
|
|
@@ -75,9 +77,9 @@ https://nlpertools.readthedocs.io/en/latest/
|
|
75
77
|
def __init__(self, IPT_MODEL_PATH):
|
76
78
|
self.ltp = LTP(IPT_MODEL_PATH)
|
77
79
|
```
|
78
|
-
|
80
|
+
通过`pyinstrument`判断,超过1s的包即采用这种方式
|
79
81
|
- 2s+ happybase、seaborn、torch、jieba
|
80
|
-
- 1s+
|
82
|
+
- 1s+ /
|
81
83
|
- 0.5s+ pandas elasticsearch transformers xgboost nltk mongo
|
82
84
|
|
83
85
|
|
@@ -85,6 +87,8 @@ https://nlpertools.readthedocs.io/en/latest/
|
|
85
87
|
|
86
88
|
- [readthedoc 检查文档构建状况](https://readthedocs.org/projects/nlpertools/builds)
|
87
89
|
|
90
|
+
- [打包发布指南](https://juejin.cn/post/7369413136224878644)
|
91
|
+
|
88
92
|
- 发布版本需要加tag
|
89
93
|
|
90
94
|
## 开发哲学
|
@@ -106,6 +110,23 @@ b = nlpertools.io.file.readtxt_list_all_strip('res.txt')
|
|
106
110
|
```
|
107
111
|
|
108
112
|
```bash
|
109
|
-
#
|
110
|
-
python -m nlpertools
|
113
|
+
# 生成pypi双因素认证的实时密钥(需要提供key)
|
114
|
+
python -m nlpertools.get_2fa your_key
|
115
|
+
|
116
|
+
## git
|
117
|
+
python -m nlpertools.cli --gitpush
|
118
|
+
python -m nlpertools.cli --gitpull
|
119
|
+
|
120
|
+
# 以下功能被nvitop替代,不推荐使用
|
121
|
+
## 监控gpu显存
|
122
|
+
python -m nlpertools.monitor.gpu
|
123
|
+
## 监控cpu
|
124
|
+
python -m nlpertools.monitor.memory
|
111
125
|
```
|
126
|
+
|
127
|
+
## 一些常用项目
|
128
|
+
|
129
|
+
nvitop
|
130
|
+
|
131
|
+
ydata-profiling
|
132
|
+
|
@@ -8,9 +8,6 @@
|
|
8
8
|
</div>
|
9
9
|
|
10
10
|
|
11
|
-
# 当前版本
|
12
|
-
|
13
|
-
1.0.5
|
14
11
|
|
15
12
|
# 说明
|
16
13
|
|
@@ -60,9 +57,9 @@ https://nlpertools.readthedocs.io/en/latest/
|
|
60
57
|
def __init__(self, IPT_MODEL_PATH):
|
61
58
|
self.ltp = LTP(IPT_MODEL_PATH)
|
62
59
|
```
|
63
|
-
|
60
|
+
通过`pyinstrument`判断,超过1s的包即采用这种方式
|
64
61
|
- 2s+ happybase、seaborn、torch、jieba
|
65
|
-
- 1s+
|
62
|
+
- 1s+ /
|
66
63
|
- 0.5s+ pandas elasticsearch transformers xgboost nltk mongo
|
67
64
|
|
68
65
|
|
@@ -70,6 +67,8 @@ https://nlpertools.readthedocs.io/en/latest/
|
|
70
67
|
|
71
68
|
- [readthedoc 检查文档构建状况](https://readthedocs.org/projects/nlpertools/builds)
|
72
69
|
|
70
|
+
- [打包发布指南](https://juejin.cn/post/7369413136224878644)
|
71
|
+
|
73
72
|
- 发布版本需要加tag
|
74
73
|
|
75
74
|
## 开发哲学
|
@@ -91,6 +90,23 @@ b = nlpertools.io.file.readtxt_list_all_strip('res.txt')
|
|
91
90
|
```
|
92
91
|
|
93
92
|
```bash
|
94
|
-
#
|
95
|
-
python -m nlpertools
|
93
|
+
# 生成pypi双因素认证的实时密钥(需要提供key)
|
94
|
+
python -m nlpertools.get_2fa your_key
|
95
|
+
|
96
|
+
## git
|
97
|
+
python -m nlpertools.cli --gitpush
|
98
|
+
python -m nlpertools.cli --gitpull
|
99
|
+
|
100
|
+
# 以下功能被nvitop替代,不推荐使用
|
101
|
+
## 监控gpu显存
|
102
|
+
python -m nlpertools.monitor.gpu
|
103
|
+
## 监控cpu
|
104
|
+
python -m nlpertools.monitor.memory
|
96
105
|
```
|
106
|
+
|
107
|
+
## 一些常用项目
|
108
|
+
|
109
|
+
nvitop
|
110
|
+
|
111
|
+
ydata-profiling
|
112
|
+
|
@@ -0,0 +1,36 @@
|
|
1
|
+
import os
|
2
|
+
import re
|
3
|
+
|
4
|
+
from setuptools import setup
|
5
|
+
|
6
|
+
|
7
|
+
def get_version():
    """Read the package version from ``src/nlpertools/__init__.py``.

    The file is parsed as text (not imported) and must contain exactly one
    ``__version__ = '<version>'`` assignment; single or double quotes are
    accepted.

    :return: the version string, e.g. ``"1.0.8"``
    :raises ValueError: if zero or more than one assignment is found
    """
    init_path = os.path.join("src", "nlpertools", "__init__.py")
    with open(init_path, "r", encoding="utf-8") as f:
        file_content = f.read()

    # The value class excludes both quote characters and line breaks so the
    # match cannot run on into a later string literal in the same file.
    pattern = r"""^[ \t]*__version__\s*=\s*(['"])([^'"\r\n]+)\1"""
    matches = re.findall(pattern, file_content, flags=re.MULTILINE)
    if len(matches) != 1:
        raise ValueError(
            "expected exactly one __version__ assignment in {}, found {}".format(
                init_path, len(matches)
            )
        )
    return matches[0][1]
|
13
|
+
|
14
|
+
|
15
|
+
def main():
    """Run ``setuptools.setup`` with the dependencies, version and CLI entry point.

    Metadata not passed here is presumably supplied by ``setup.cfg`` /
    ``pyproject.toml`` (both ship with the package) -- TODO confirm.
    """
    setup(
        # https://juejin.cn/post/7369349560421040128
        install_requires=[
            "numpy",
            "pandas",
            "psutil",
            # nlpertools/cli.py imports pyotp at module level and
            # nlpertools/__init__.py star-imports cli, so a plain
            # ``import nlpertools`` fails unless pyotp is installed.
            "pyotp",
        ],
        extras_require={
            "torch": ["torch"],
        },
        version=get_version(),
        entry_points={
            "console_scripts": [
                "ncli=nlpertools.cli:main",
            ]
        },
    )


if __name__ == '__main__':
    main()
|
@@ -3,6 +3,7 @@
|
|
3
3
|
# @Author : youshu.Ji
|
4
4
|
from .algo.kmp import *
|
5
5
|
from .data_structure.base_structure import *
|
6
|
+
from .draw import *
|
6
7
|
from .dataprocess import *
|
7
8
|
from .io.dir import *
|
8
9
|
from .io.file import *
|
@@ -15,10 +16,8 @@ from .reminder import *
|
|
15
16
|
from .utils_for_nlpertools import *
|
16
17
|
from .wrapper import *
|
17
18
|
from .monitor import *
|
19
|
+
from .cli import *
|
18
20
|
|
19
|
-
import os
|
20
21
|
|
21
22
|
|
22
|
-
|
23
|
-
|
24
|
-
__version__ = '1.0.5'
|
23
|
+
__version__ = '1.0.8'
|
@@ -0,0 +1,87 @@
|
|
1
|
+
import argparse
|
2
|
+
import os
|
3
|
+
import uuid
|
4
|
+
import sys
|
5
|
+
|
6
|
+
import pyotp
|
7
|
+
|
8
|
+
"""
|
9
|
+
如何Debug cli.py
|
10
|
+
"""
|
11
|
+
|
12
|
+
|
13
|
+
def git_push(max_retries=None):
    """Run ``git push --set-upstream origin main`` repeatedly until it succeeds.

    Pushing to GitHub from mainland China fails frequently, so the command is
    simply re-run after every failure.

    :param max_retries: maximum number of attempts; ``None`` (the default)
        keeps retrying until the push succeeds.
    :return: ``True`` once a push succeeded, ``False`` if the attempts ran out.
    """
    attempt = 0
    while max_retries is None or attempt < max_retries:
        print("retry num: {}".format(attempt))
        # os.system returns the command's exit status (an int), never git's
        # "fatal: ..." text, so success has to be detected via a zero status.
        status = os.system("git push --set-upstream origin main")
        print(str(status))
        if status == 0:
            print("scucess")
            return True
        attempt += 1
    return False
|
26
|
+
|
27
|
+
|
28
|
+
def git_pull(max_retries=None):
    """Run ``git pull`` repeatedly until it succeeds.

    Pulling from GitHub from mainland China fails frequently, so the command
    is simply re-run after every failure.

    :param max_retries: maximum number of attempts; ``None`` (the default)
        keeps retrying until the pull succeeds.
    :return: ``True`` once a pull succeeded, ``False`` if the attempts ran out.
    """
    attempt = 0
    while max_retries is None or attempt < max_retries:
        print("retry num: {}".format(attempt))
        # os.system returns the command's exit status (an int), never git's
        # "fatal"/"error" text, so success has to be detected via a zero status.
        status = os.system("git pull")
        print(str(status))
        if status == 0:
            print("scucess")
            return True
        attempt += 1
    return False
|
41
|
+
|
42
|
+
|
43
|
+
def get_mac_address():
    """Print and return the value of ``uuid.getnode()`` formatted as a MAC address.

    The result is six lowercase hex pairs joined by colons. As the printed
    notice says, the value is not guaranteed to be accurate.

    :return: the formatted address string
    """
    # The last 12 hex digits of the UUID built from the node number.
    node_hex = uuid.UUID(int=uuid.getnode()).hex[-12:]
    octets = []
    for start in range(0, 11, 2):
        octets.append(node_hex[start:start + 2])
    mac_address = ":".join(octets)
    print("mac address 不一定准确")
    print(mac_address)
    return mac_address
|
49
|
+
|
50
|
+
|
51
|
+
def get_2af_value(key):
    """Print and return the current TOTP code for ``key``.

    The original note states the key should be 7 characters long; that is
    not enforced here.

    :param key: the shared TOTP secret passed to ``pyotp.TOTP``
    :return: the current one-time code as returned by ``TOTP.now()``
    """
    # The secret itself is deliberately not echoed: stdout often ends up in
    # terminal scrollback or logs, and the secret is all that is needed to
    # generate valid codes.
    totp = pyotp.TOTP(key)
    code = totp.now()
    print(code)
    return code
|
58
|
+
|
59
|
+
|
60
|
+
def main(argv=None):
    """Command-line entry point: run the single operation selected by a flag.

    Flags are checked in the order ``--gitpush``, ``--gitpull``,
    ``--mac_address``, ``--get_2fa``; only the first one present is executed.
    ``--get_2fa`` additionally needs ``--get_2fa_key``.

    :param argv: argument list to parse; ``None`` (the default) makes
        argparse read ``sys.argv[1:]``.
    """
    parser = argparse.ArgumentParser(description="CLI tool for git operations and getting MAC address.")
    parser.add_argument('--gitpush', action='store_true', help='Perform git push operation.')
    parser.add_argument('--gitpull', action='store_true', help='Perform git pull operation.')
    parser.add_argument('--mac_address', action='store_true', help='Get the MAC address.')

    parser.add_argument('--get_2fa', action='store_true', help='Get the 2fa value.')
    parser.add_argument('--get_2fa_key', type=str, help='Secret key used to compute the 2fa value.')

    args = parser.parse_args(argv)

    if args.gitpush:
        git_push()
    elif args.gitpull:
        git_pull()
    elif args.mac_address:
        get_mac_address()
    elif args.get_2fa:
        if args.get_2fa_key:
            get_2af_value(args.get_2fa_key)
        else:
            print("Please provide a key as an argument.")
    else:
        # The hint lists the flags that actually exist on the parser above.
        print("No operation specified. Use --gitpush, --gitpull, --mac_address or --get_2fa.")


if __name__ == '__main__':
    main()
|
@@ -1,3 +1,4 @@
|
|
1
|
+
#encoding=utf-8
|
1
2
|
# !/usr/bin/python3.8
|
2
3
|
# -*- coding: utf-8 -*-
|
3
4
|
# @Author : youshu.Ji
|
@@ -5,9 +6,11 @@ import datetime
|
|
5
6
|
import json
|
6
7
|
import logging
|
7
8
|
|
8
|
-
from . import DB_CONFIG_FILE
|
9
9
|
from .io.file import read_yaml
|
10
10
|
from .utils.package import *
|
11
|
+
import os
|
12
|
+
|
13
|
+
DB_CONFIG_FILE = os.path.join(os.path.dirname(__file__), "default_db_config.yml")
|
11
14
|
|
12
15
|
# import aioredis
|
13
16
|
# import happybase
|
@@ -28,21 +31,24 @@ class Neo4jOps(object):
|
|
28
31
|
NEO4J_TIMEOUT = 0.3
|
29
32
|
pass
|
30
33
|
|
34
|
+
|
31
35
|
class SqliteOps(object):
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
conn.
|
36
|
+
pass
|
37
|
+
# import sqlite3
|
38
|
+
# database_path = r'xx.db'
|
39
|
+
# conn = sqlite3.connect(database_path)
|
40
|
+
# c = conn.cursor()
|
41
|
+
# sql = "select name from sqlite_master where type='table' order by name"
|
42
|
+
# c.execute(sql)
|
43
|
+
# print(c.fetchall())
|
44
|
+
# sql = "select * from typecho_contents"
|
45
|
+
# c.execute(sql)
|
46
|
+
# res = c.fetchall()
|
47
|
+
# print(res[3])
|
48
|
+
#
|
49
|
+
# conn.commit()
|
50
|
+
# conn.close()
|
51
|
+
|
46
52
|
|
47
53
|
class MysqlOps(object):
|
48
54
|
import pandas as pd
|
@@ -116,6 +122,41 @@ class EsOps(object):
|
|
116
122
|
print(f"批量保存数据: {_res}")
|
117
123
|
|
118
124
|
|
125
|
+
class MongoDB_BETA:
    """Small wrapper around a single MongoDB collection.

    Connection parameters are stored by ``__init__``; nothing is opened until
    ``connect()`` is called. The object can also be used as a context manager,
    which connects on entry and closes on exit.
    """

    def __init__(self, host='localhost', port=27017, db_name=None, collection_name=None):
        """Store the connection parameters without connecting.

        :param host: host passed to ``MongoClient``
        :param port: port passed to ``MongoClient``
        :param db_name: name of the database to select on ``connect()``
        :param collection_name: name of the collection to select on ``connect()``
        """
        self.host = host
        self.port = port
        self.db_name = db_name
        self.collection_name = collection_name
        # Populated by connect(), reset by close().
        self.client = None
        self.db = None
        self.collection = None

    def connect(self):
        """Create the client and select the configured database and collection."""
        # NOTE(review): the only visible MongoClient import in this module sits
        # inside the MongoOps class body, which does not put the name in module
        # scope; importing locally keeps this class self-contained.
        from pymongo import MongoClient

        self.client = MongoClient(self.host, self.port)
        self.db = self.client[self.db_name]
        self.collection = self.db[self.collection_name]

    def close(self):
        """Close the client, if any, and drop the cached handles."""
        if self.client is not None:
            self.client.close()
        # Clear the handles so a closed wrapper is not mistaken for a live one.
        self.client = None
        self.db = None
        self.collection = None

    def __enter__(self):
        self.connect()
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()
        return False

    def insert_data(self, data):
        """Insert one document, or every document of a list.

        :param data: a single document, or a list of documents; an empty list
            is a no-op
        """
        if isinstance(data, list):
            # insert_many rejects an empty list, so there is nothing to send.
            if not data:
                return
            self.collection.insert_many(data)
        else:
            self.collection.insert_one(data)

    def check_data_exists(self, query):
        """
        Check whether at least one document matches ``query``.

        :param query: filter passed to ``count_documents``
        :return: ``True`` if the count is greater than zero, else ``False``
        """
        return self.collection.count_documents(query) > 0
|
157
|
+
|
158
|
+
|
159
|
+
|
119
160
|
class MongoOps(object):
|
120
161
|
from pymongo import MongoClient
|
121
162
|
def __init__(self, config=global_db_config["mongo"]):
|
@@ -348,8 +389,6 @@ class KafkaOps(object):
|
|
348
389
|
print(recv)
|
349
390
|
|
350
391
|
|
351
|
-
|
352
|
-
|
353
392
|
class MilvusOps(object):
|
354
393
|
def __init__(self, config=global_db_config.milvus):
|
355
394
|
from pymilvus import connections, Collection
|
@@ -55,9 +55,9 @@ class Pattern:
|
|
55
55
|
# 中文人名
|
56
56
|
chinese_name_pattern = "(?:[\u4e00-\u9fa5·]{2,3})"
|
57
57
|
# 英文人名
|
58
|
-
english_name_pattern = "(^[a-zA-Z][a-zA-Z\s]{0,20}[a-zA-Z]$)"
|
58
|
+
english_name_pattern = r"(^[a-zA-Z][a-zA-Z\s]{0,20}[a-zA-Z]$)"
|
59
59
|
# 纯数字
|
60
|
-
pure_num_pattern = "\d+"
|
60
|
+
pure_num_pattern = r"\d+"
|
61
61
|
# xxxx图/表 之类的表述
|
62
62
|
pic_table_descript_pattern = ".{1,15}图"
|
63
63
|
|
@@ -66,20 +66,20 @@ class Pattern:
|
|
66
66
|
hlink_pattern = (
|
67
67
|
r"(https?|ftp|file)://[-A-Za-z0-9+&@#/%?=~_|!:,.;]+[-A-Za-z0-9+&@#/%=~_|]"
|
68
68
|
)
|
69
|
-
http_pattern = "(http|https):\/\/([\w.]+\/?)\S*/\S*"
|
69
|
+
http_pattern = r"(http|https):\/\/([\w.]+\/?)\S*/\S*"
|
70
70
|
# 邮箱
|
71
|
-
email_pattern = "[A-Za-z0-9\u4e00-\u9fa5]+@[a-zA-Z0-9_-]+(\.[a-zA-Z0-9_-]+)+"
|
71
|
+
email_pattern = r"[A-Za-z0-9\u4e00-\u9fa5]+@[a-zA-Z0-9_-]+(\.[a-zA-Z0-9_-]+)+"
|
72
72
|
# html 可能过于严格了
|
73
|
-
html_pattern = "<[\s\S]*?>"
|
73
|
+
html_pattern = r"<[\s\S]*?>"
|
74
74
|
# 重复 “asdasdasdasd”
|
75
75
|
repeat_pattern = "(.)\1+"
|
76
76
|
# 日期
|
77
|
-
day_time_pattern = "\d{1,4}(-)(1[0-2]|0?[1-9])\1(0?[1-9]|[1-2]\d|30|31)"
|
77
|
+
day_time_pattern = r"\d{1,4}(-)(1[0-2]|0?[1-9])\1(0?[1-9]|[1-2]\d|30|31)"
|
78
78
|
# 小时
|
79
|
-
hour_time_pattern = "(?:[01]\d|2[0-3]):[0-5]\d:[0-5]\d"
|
79
|
+
hour_time_pattern = r"(?:[01]\d|2[0-3]):[0-5]\d:[0-5]\d"
|
80
80
|
# 股票
|
81
81
|
stock_pattern = (
|
82
|
-
"(s[hz]|S[HZ])(000[\d]{3}|002[\d]{3}|300[\d]{3}|600[\d]{3}|60[\d]{4})"
|
82
|
+
r"(s[hz]|S[HZ])(000[\d]{3}|002[\d]{3}|300[\d]{3}|600[\d]{3}|60[\d]{4})"
|
83
83
|
)
|
84
84
|
|
85
85
|
# 一般是需要替换的
|
@@ -91,7 +91,7 @@ class Pattern:
|
|
91
91
|
# 微博视频等
|
92
92
|
weibo_pattern = r"([\s]\w+(的微博视频)|#|【|】|转发微博)"
|
93
93
|
# @
|
94
|
-
at_pattern = "@\w+"
|
94
|
+
at_pattern = r"@\w+"
|
95
95
|
|
96
96
|
# from https://github.com/bigscience-workshop/data-preparation pii
|
97
97
|
year_patterns = [
|
@@ -116,7 +116,7 @@ class Pattern:
|
|
116
116
|
ipv4_pattern = r'(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)(?:\.(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)){3}'
|
117
117
|
ipv6_pattern = r'(?:[0-9a-fA-F]{1,4}:){7,7}[0-9a-fA-F]{1,4}|(?:[0-9a-fA-F]{1,4}:){1,7}:|(?:[0-9a-fA-F]{1,4}:){1,6}:[0-9a-fA-F]{1,4}|(?:[0-9a-fA-F]{1,4}:){1,5}(?::[0-9a-fA-F]{1,4}){1,2}|(?:[0-9a-fA-F]{1,4}:){1,4}(?::[0-9a-fA-F]{1,4}){1,3}|(?:[0-9a-fA-F]{1,4}:){1,3}(?::[0-9a-fA-F]{1,4}){1,4}|(?:[0-9a-fA-F]{1,4}:){1,2}(?::[0-9a-fA-F]{1,4}){1,5}|[0-9a-fA-F]{1,4}:(?:(?::[0-9a-fA-F]{1,4}){1,6})|:(?:(?::[0-9a-fA-F]{1,4}){1,7}|:)|fe80:(?::[0-9a-fA-F]{0,4}){0,4}%[0-9a-zA-Z]{1,}|::(?:ffff(?::0{1,4}){0,1}:){0,1}(?:(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3,3}(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])|(?:[0-9a-fA-F]{1,4}:){1,4}:(?:(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3,3}(25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])'
|
118
118
|
ip_pattern = r"(?:^|[\b\s@?,!;:\'\")(.\p{Han}])(" + r"|".join(
|
119
|
-
[ipv4_pattern, ipv6_pattern]) + ")(?:$|[\s@,?!;:\'\"(.\p{Han}])"
|
119
|
+
[ipv4_pattern, ipv6_pattern]) + r")(?:$|[\s@,?!;:\'\"(.\p{Han}])"
|
120
120
|
|
121
121
|
# https://regex101.com/r/EpA5B7/1
|
122
122
|
email_line_pattern = r'''
|
@@ -466,7 +466,7 @@ class TextProcess(object):
|
|
466
466
|
p = re.compile(pattern, re.S)
|
467
467
|
text = p.sub("", text)
|
468
468
|
|
469
|
-
dr = re.compile("@\w+", re.S)
|
469
|
+
dr = re.compile(r"@\w+", re.S)
|
470
470
|
text = dr.sub("", text)
|
471
471
|
|
472
472
|
return text
|
@@ -527,7 +527,7 @@ class TextProcess(object):
|
|
527
527
|
text = re.sub(pattern, replace, text)
|
528
528
|
return text
|
529
529
|
|
530
|
-
def calc_proportion_zh(self,text):
|
530
|
+
def calc_proportion_zh(self, text):
|
531
531
|
text = text.strip()
|
532
532
|
# 如果是中国英文的情况,并且英文有空格分开
|
533
533
|
if " " in text:
|
@@ -538,6 +538,8 @@ class TextProcess(object):
|
|
538
538
|
chinese_count += 1
|
539
539
|
else:
|
540
540
|
pass
|
541
|
+
|
542
|
+
|
541
543
|
class CopyFunc():
|
542
544
|
# from https://github.com/lemon234071/clean-dialog
|
543
545
|
def is_chinese_char(cp):
|
@@ -597,6 +599,20 @@ def convert_basic2fullwidth(sentence):
|
|
597
599
|
new_sentence += char
|
598
600
|
return new_sentence
|
599
601
|
|
602
|
+
|
603
|
+
def clean_illegal_chars_for_excel(df):
    """Return a copy of ``df`` with control characters removed from string cells.

    openpyxl refuses to write certain control characters to an Excel file, so
    they are stripped from every string cell; non-string cells are returned
    unchanged. Tab, line feed and carriage return are kept.

    :param df: the pandas DataFrame to clean
    :return: a new DataFrame; ``df`` itself is not modified
    """
    illegal_chars = re.compile(r'[\x00-\x08\x0B\x0C\x0E-\x1F]')

    def remove_illegal_chars(s):
        if isinstance(s, str):
            return illegal_chars.sub('', s)
        return s

    # DataFrame.map only exists in pandas >= 2.1; earlier versions call the
    # same element-wise operation applymap. Looking the method up on the type
    # avoids mistaking a column named "map" for it.
    elementwise = getattr(type(df), "map", None)
    if elementwise is None:
        elementwise = type(df).applymap
    return elementwise(df, remove_illegal_chars)
|
614
|
+
|
615
|
+
|
600
616
|
if __name__ == "__main__":
|
601
617
|
pattern_for_filter = [
|
602
618
|
Pattern.redundancy_space_pattern,
|
@@ -0,0 +1,83 @@
|
|
1
|
+
#!/usr/bin/python3.8
|
2
|
+
# -*- coding: utf-8 -*-
|
3
|
+
# @Author : youshu.Ji
|
4
|
+
from ..utils.package import plt
|
5
|
+
|
6
|
+
|
7
|
+
def confused_matrix(confuse_matrix, ticklabels=None, save_path='tmp.jpg'):
    """Draw a confusion matrix as a seaborn heatmap, show it, then save it.

    :param confuse_matrix: 2-D data passed straight to ``sns.heatmap``
    :param ticklabels: labels used on both axes; defaults to the original
        hard-coded ``["l1", "l2", "l31"]``
    :param save_path: file the figure is written to after it has been shown;
        defaults to ``'tmp.jpg'``
    """
    # seaborn is imported here rather than at module level, so it is only
    # loaded when a plot is actually drawn.
    import seaborn as sns

    if ticklabels is None:
        ticklabels = ["l1", "l2", "l31"]

    sns.set()
    f, ax = plt.subplots()
    sns.heatmap(confuse_matrix, annot=True, fmt=".3g", ax=ax, cmap='rainbow',
                xticklabels=ticklabels, yticklabels=ticklabels)  # draw the heatmap

    ax.set_title('confusion matrix')  # title
    ax.set_xlabel('predict')  # x axis
    ax.set_ylabel('true')  # y axis
    plt.show()

    f.savefig(save_path, bbox_inches='tight')
|
21
|
+
|
22
|
+
|
23
|
+
def plot_histogram(data, bin_size, max_value=1000):
    """Plot a histogram of ``data`` with one extra overflow bin labelled "∞".

    Values up to and including ``max_value`` are plotted as they are; larger
    values are all counted in a single overflow bin to the right of
    ``max_value``. As in the original layout, a value exactly equal to
    ``max_value`` also falls into that last bin. A dashed red line marks the
    mean of the raw (unclipped) data.

    :param data: iterable of numbers
    :param bin_size: width of each regular bin; must be positive
    :param max_value: upper limit of the regular bins (default 1000)
    :return: None; the figure is shown with ``plt.show()``
    :raises ValueError: if ``bin_size`` is not positive
    """
    import matplotlib.pyplot as plt
    import numpy as np
    from matplotlib.ticker import MaxNLocator

    if bin_size <= 0:
        raise ValueError("bin_size must be positive")

    # Regular edges 0, bin_size, ... below max_value, then max_value itself,
    # then one more edge that closes the overflow bin. Building the edges
    # explicitly keeps the overflow bin in place even when bin_size does not
    # divide max_value.
    bins = np.append(np.arange(0, max_value, bin_size), [max_value, max_value + bin_size])
    # Any value inside the overflow bin works; its midpoint is used so the
    # marker is valid for every bin_size.
    overflow_marker = max_value + bin_size / 2
    processed_data = [length if length <= max_value else overflow_marker for length in data]

    plt.figure(figsize=(12, 8))
    # Draw the histogram.
    n, new_bins, patches = plt.hist(processed_data, bins=bins, edgecolor='black', color='skyblue', alpha=0.7,
                                    linewidth=0)

    # Label every edge with its value, except the last one, which gets "∞".
    plt.gca().set_xticks(bins)
    plt.gca().set_xticklabels([str(i) for i in bins[:-1]] + ["∞"])

    mean_val = np.mean(data)
    plt.axvline(mean_val, color='red', linestyle='dashed', linewidth=1)
    plt.text(mean_val + bin_size / 10, max(n) * 0.9, f'Mean: {mean_val:.2f}', color='red')

    # Title and axis labels.
    plt.title('Module Line Number Distribution', fontsize=16, fontweight='bold')
    plt.xlabel('module line number', fontsize=14)
    plt.ylabel('frequency', fontsize=14)

    # Grid.
    plt.grid(True, linestyle='--', alpha=0.6)

    # Tick font sizes.
    plt.xticks(fontsize=12)
    plt.yticks(fontsize=12)

    # Write each bar's count above it.
    for count, patch in zip(n, patches):
        plt.text(patch.get_x() + patch.get_width() / 2, patch.get_height(),
                 str(int(count)), ha='center', va='bottom', fontsize=12)
    plt.gca().yaxis.set_major_locator(MaxNLocator(integer=True))
    # Show the chart.
    plt.show()


if __name__ == '__main__':
    # Bin width for the demo.
    bin_size = 50
    # Sample module-length data.
    plot_histogram([1, 100, 999, 1000, 1002, 1100, 1150], bin_size)
|
@@ -0,0 +1,33 @@
|
|
1
|
+
# 数学函数
|
2
|
+
def draw_log():
    """Plot the natural logarithm on [0.1, 10] and show the figure.

    The x axis has a major tick every 1 unit, formatted with one decimal.
    Dashed gray guide lines are drawn at x=1 and y=0.
    """
    import numpy as np
    import matplotlib.pyplot as plt
    from matplotlib.ticker import FormatStrFormatter, MultipleLocator

    # Sample points; np.log is the natural logarithm (base e).
    xs = np.linspace(0.1, 10, 100)
    ys = np.log(xs)

    # New figure with a single set of axes, then the curve itself.
    figure, axes = plt.subplots()
    axes.plot(xs, ys)

    # Title and axis labels.
    axes.set_title("Logarithmic Function")
    axes.set_xlabel("x")
    axes.set_ylabel("log(x)")

    # Major x ticks every 1 unit, shown with one decimal place.
    x_axis = axes.xaxis
    x_axis.set_major_locator(MultipleLocator(1))
    x_axis.set_major_formatter(FormatStrFormatter("%.1f"))

    # Dashed guide lines at x=1 and y=0.
    guide_style = {"linestyle": "--", "color": "gray"}
    axes.axvline(x=1, **guide_style)
    axes.axhline(y=0, **guide_style)

    # Show the figure.
    plt.show()
|
@@ -10,7 +10,30 @@ def j_mkdir(name):
|
|
10
10
|
os.makedirs(name, exist_ok=True)
|
11
11
|
|
12
12
|
|
13
|
-
def
|
13
|
+
def j_walk(name, suffix=None):
    """Recursively collect the paths of all files below ``name``.

    :param name: directory to walk with ``os.walk``
    :param suffix: if truthy, only paths ending with it are kept; it is passed
        to ``str.endswith``
    :return: list of file paths, each joined with the directory it was found in
    """
    collected = []
    for current_dir, _subdirs, file_names in os.walk(name):
        for file_name in file_names:
            full_path = os.path.join(current_dir, file_name)
            # Skip files that fail the (optional) suffix filter.
            if suffix and not full_path.endswith(suffix):
                continue
            collected.append(full_path)
    return collected
|
21
|
+
|
22
|
+
|
23
|
+
def windows_to_wsl_path(windows_path):
    """Convert a Windows path to its WSL (``/mnt/<drive>/...``) form.

    A path that starts with a drive letter followed by ``:\\`` or ``:/`` is
    mapped to ``/mnt/<lowercase drive letter>/...``. Any other path only has
    its backslashes turned into forward slashes.

    :param windows_path: the Windows-style path string
    :return: the converted path string
    """
    # Accept both separators after the drive letter: "C:\\dir" and "C:/dir".
    if windows_path[1:3] in (':\\', ':/'):
        drive_letter = windows_path[0].lower()
        path = windows_path[2:].replace('\\', '/')
        return f'/mnt/{drive_letter}{path}'

    # No drive prefix: only the separators need replacing. The original
    # additionally replaced "'" with "\'", which is the same one-character
    # string and therefore changed nothing; that call is dropped.
    return windows_path.replace('\\', '/')
|
34
|
+
|
35
|
+
|
36
|
+
def get_filename(path, suffix=True) -> str:
|
14
37
|
"""
|
15
38
|
返回路径最后的文件名
|
16
39
|
:param path:
|
@@ -18,11 +41,20 @@ def get_filename(path) -> str:
|
|
18
41
|
"""
|
19
42
|
# path = r'***/**/***.txt'
|
20
43
|
filename = os.path.split(path)[-1]
|
44
|
+
if not suffix:
|
45
|
+
filename = filename.split('.')[0]
|
21
46
|
return filename
|
22
47
|
|
23
48
|
|
24
49
|
def j_listdir(dir_name, including_dir=True):
    """List the entries of ``dir_name``.

    :param dir_name: directory passed to ``os.listdir``
    :param including_dir: if true, each entry is joined with ``dir_name``;
        otherwise the bare entry names are returned
    :return: a new list of entries
    """
    entries = os.listdir(dir_name)
    if not including_dir:
        return list(entries)
    return [os.path.join(dir_name, entry) for entry in entries]
|
55
|
+
|
56
|
+
|
57
|
+
def j_listdir_yield(dir_name, including_dir=True):
|
26
58
|
filenames = os.listdir(dir_name)
|
27
59
|
for filename in filenames:
|
28
60
|
if including_dir:
|