nlpertools 1.0.6.dev0__tar.gz → 1.0.9__tar.gz

Sign up to get free protection for your applications and to get access to all the features.
Files changed (57) hide show
  1. {nlpertools-1.0.6.dev0/src/nlpertools.egg-info → nlpertools-1.0.9}/PKG-INFO +33 -10
  2. {nlpertools-1.0.6.dev0 → nlpertools-1.0.9}/README.md +24 -8
  3. {nlpertools-1.0.6.dev0 → nlpertools-1.0.9}/setup.cfg +0 -1
  4. nlpertools-1.0.9/setup.py +36 -0
  5. {nlpertools-1.0.6.dev0 → nlpertools-1.0.9}/src/nlpertools/__init__.py +3 -4
  6. nlpertools-1.0.9/src/nlpertools/cli.py +143 -0
  7. {nlpertools-1.0.6.dev0 → nlpertools-1.0.9}/src/nlpertools/data_client.py +56 -17
  8. {nlpertools-1.0.6.dev0 → nlpertools-1.0.9}/src/nlpertools/dataprocess.py +28 -12
  9. nlpertools-1.0.9/src/nlpertools/draw/draw.py +81 -0
  10. nlpertools-1.0.9/src/nlpertools/draw/math_func.py +33 -0
  11. {nlpertools-1.0.6.dev0 → nlpertools-1.0.9}/src/nlpertools/io/dir.py +35 -3
  12. {nlpertools-1.0.6.dev0 → nlpertools-1.0.9}/src/nlpertools/io/file.py +17 -11
  13. {nlpertools-1.0.6.dev0 → nlpertools-1.0.9}/src/nlpertools/ml.py +74 -24
  14. nlpertools-1.0.9/src/nlpertools/monitor/__init__.py +0 -0
  15. {nlpertools-1.0.6.dev0 → nlpertools-1.0.9}/src/nlpertools/other.py +152 -24
  16. nlpertools-1.0.9/src/nlpertools/template/__init__.py +0 -0
  17. {nlpertools-1.0.6.dev0 → nlpertools-1.0.9/src/nlpertools.egg-info}/PKG-INFO +33 -10
  18. {nlpertools-1.0.6.dev0 → nlpertools-1.0.9}/src/nlpertools.egg-info/SOURCES.txt +8 -0
  19. nlpertools-1.0.9/src/nlpertools.egg-info/entry_points.txt +2 -0
  20. nlpertools-1.0.9/src/nlpertools.egg-info/requires.txt +6 -0
  21. {nlpertools-1.0.6.dev0 → nlpertools-1.0.9}/LICENSE +0 -0
  22. {nlpertools-1.0.6.dev0 → nlpertools-1.0.9}/pyproject.toml +0 -0
  23. {nlpertools-1.0.6.dev0 → nlpertools-1.0.9}/src/nlpertools/algo/__init__.py +0 -0
  24. {nlpertools-1.0.6.dev0 → nlpertools-1.0.9}/src/nlpertools/algo/ac.py +0 -0
  25. {nlpertools-1.0.6.dev0 → nlpertools-1.0.9}/src/nlpertools/algo/bit_ops.py +0 -0
  26. {nlpertools-1.0.6.dev0 → nlpertools-1.0.9}/src/nlpertools/algo/kmp.py +0 -0
  27. {nlpertools-1.0.6.dev0 → nlpertools-1.0.9}/src/nlpertools/algo/num_ops.py +0 -0
  28. {nlpertools-1.0.6.dev0 → nlpertools-1.0.9}/src/nlpertools/algo/template.py +0 -0
  29. {nlpertools-1.0.6.dev0 → nlpertools-1.0.9}/src/nlpertools/algo/union.py +0 -0
  30. {nlpertools-1.0.6.dev0 → nlpertools-1.0.9}/src/nlpertools/data_structure/__init__.py +0 -0
  31. {nlpertools-1.0.6.dev0 → nlpertools-1.0.9}/src/nlpertools/data_structure/base_structure.py +0 -0
  32. {nlpertools-1.0.6.dev0 → nlpertools-1.0.9}/src/nlpertools/default_db_config.yml +0 -0
  33. {nlpertools-1.0.6.dev0/src/nlpertools/monitor → nlpertools-1.0.9/src/nlpertools/draw}/__init__.py +0 -0
  34. /nlpertools-1.0.6.dev0/src/nlpertools/template/__init__.py → /nlpertools-1.0.9/src/nlpertools/get_2fa.py +0 -0
  35. {nlpertools-1.0.6.dev0 → nlpertools-1.0.9}/src/nlpertools/io/__init__.py +0 -0
  36. {nlpertools-1.0.6.dev0 → nlpertools-1.0.9}/src/nlpertools/monitor/gpu.py +0 -0
  37. {nlpertools-1.0.6.dev0 → nlpertools-1.0.9}/src/nlpertools/monitor/memory.py +0 -0
  38. {nlpertools-1.0.6.dev0 → nlpertools-1.0.9}/src/nlpertools/movie.py +0 -0
  39. {nlpertools-1.0.6.dev0 → nlpertools-1.0.9}/src/nlpertools/nlpertools_config.yml +0 -0
  40. {nlpertools-1.0.6.dev0 → nlpertools-1.0.9}/src/nlpertools/open_api.py +0 -0
  41. {nlpertools-1.0.6.dev0 → nlpertools-1.0.9}/src/nlpertools/pic.py +0 -0
  42. {nlpertools-1.0.6.dev0 → nlpertools-1.0.9}/src/nlpertools/plugin.py +0 -0
  43. {nlpertools-1.0.6.dev0 → nlpertools-1.0.9}/src/nlpertools/reminder.py +0 -0
  44. {nlpertools-1.0.6.dev0 → nlpertools-1.0.9}/src/nlpertools/utils/__init__.py +0 -0
  45. {nlpertools-1.0.6.dev0 → nlpertools-1.0.9}/src/nlpertools/utils/lazy.py +0 -0
  46. {nlpertools-1.0.6.dev0 → nlpertools-1.0.9}/src/nlpertools/utils/log_util.py +0 -0
  47. {nlpertools-1.0.6.dev0 → nlpertools-1.0.9}/src/nlpertools/utils/package.py +0 -0
  48. {nlpertools-1.0.6.dev0 → nlpertools-1.0.9}/src/nlpertools/utils/package_v1.py +0 -0
  49. {nlpertools-1.0.6.dev0 → nlpertools-1.0.9}/src/nlpertools/utils/package_v2.py +0 -0
  50. {nlpertools-1.0.6.dev0 → nlpertools-1.0.9}/src/nlpertools/utils_for_nlpertools.py +0 -0
  51. {nlpertools-1.0.6.dev0 → nlpertools-1.0.9}/src/nlpertools/vector_index_demo.py +0 -0
  52. {nlpertools-1.0.6.dev0 → nlpertools-1.0.9}/src/nlpertools/wrapper.py +0 -0
  53. {nlpertools-1.0.6.dev0 → nlpertools-1.0.9}/src/nlpertools.egg-info/dependency_links.txt +0 -0
  54. {nlpertools-1.0.6.dev0 → nlpertools-1.0.9}/src/nlpertools.egg-info/top_level.txt +0 -0
  55. {nlpertools-1.0.6.dev0 → nlpertools-1.0.9}/src/nlpertools_helper/__init__.py +0 -0
  56. {nlpertools-1.0.6.dev0 → nlpertools-1.0.9}/tests/test_kmp.py +0 -0
  57. {nlpertools-1.0.6.dev0 → nlpertools-1.0.9}/tests/test_path_exists.py +0 -0
@@ -1,6 +1,6 @@
1
- Metadata-Version: 2.1
1
+ Metadata-Version: 2.2
2
2
  Name: nlpertools
3
- Version: 1.0.6.dev0
3
+ Version: 1.0.9
4
4
  Summary: A small package about small basic IO operation when coding
5
5
  Home-page: https://github.com/lvzii/nlpertools
6
6
  Author: youshuJi
@@ -12,6 +12,13 @@ Classifier: Operating System :: OS Independent
12
12
  Requires-Python: >=3.6
13
13
  Description-Content-Type: text/markdown
14
14
  License-File: LICENSE
15
+ Requires-Dist: numpy
16
+ Requires-Dist: pandas
17
+ Requires-Dist: psutil
18
+ Provides-Extra: torch
19
+ Requires-Dist: torch; extra == "torch"
20
+ Dynamic: provides-extra
21
+ Dynamic: requires-dist
15
22
 
16
23
  <div align="center">
17
24
  <h4 align="center">
@@ -23,9 +30,6 @@ License-File: LICENSE
23
30
  </div>
24
31
 
25
32
 
26
- # 当前版本
27
-
28
- 1.0.5
29
33
 
30
34
  # 说明
31
35
 
@@ -33,7 +37,7 @@ License-File: LICENSE
33
37
 
34
38
  它解决了什么问题:
35
39
 
36
- - 很多函数是记不住的, ~~每次写每次都要搜~~ 每次都要问大模型 ,例如pandas排序
40
+ - 很多函数是记不住的, 每次写都要~~搜~~问大模型 ,例如pandas排序
37
41
  - 刷题的时候,树结构的题目很难调试
38
42
 
39
43
 
@@ -75,9 +79,9 @@ https://nlpertools.readthedocs.io/en/latest/
75
79
  def __init__(self, IPT_MODEL_PATH):
76
80
  self.ltp = LTP(IPT_MODEL_PATH)
77
81
  ```
78
- 通过pyinstrument判断,超过1s的包即采用这种方式
82
+ 通过`pyinstrument`判断,超过1s的包即采用这种方式
79
83
  - 2s+ happybase、seaborn、torch、jieba
80
- - 1s+
84
+ - 1s+ /
81
85
  - 0.5s+ pandas elasticsearch transformers xgboost nltk mongo
82
86
 
83
87
 
@@ -85,6 +89,8 @@ https://nlpertools.readthedocs.io/en/latest/
85
89
 
86
90
  - [readthedoc 检查文档构建状况](https://readthedocs.org/projects/nlpertools/builds)
87
91
 
92
+ - [打包发布指南](https://juejin.cn/post/7369413136224878644)
93
+
88
94
  - 发布版本需要加tag
89
95
 
90
96
  ## 开发哲学
@@ -106,6 +112,23 @@ b = nlpertools.io.file.readtxt_list_all_strip('res.txt')
106
112
  ```
107
113
 
108
114
  ```bash
109
- # 监控gpu显存
110
- python -m nlpertools
115
+ # 生成pypi双因素认证的实时密钥(需要提供key)
116
+ python -m nlpertools.cli --get_2fa --get_2fa_key your_key
117
+
118
+ ## git
119
+ python -m nlpertools.cli --gitpush
120
+ python -m nlpertools.cli --gitpull
121
+
122
+ # 以下功能被nvitop替代,不推荐使用
123
+ ## 监控gpu显存
124
+ python -m nlpertools.monitor.gpu
125
+ ## 监控内存
126
+ python -m nlpertools.monitor.memory
111
127
  ```
128
+
129
+ ## 一些常用项目
130
+
131
+ nvitop
132
+
133
+ ydata-profiling
134
+
@@ -8,9 +8,6 @@
8
8
  </div>
9
9
 
10
10
 
11
- # 当前版本
12
-
13
- 1.0.5
14
11
 
15
12
  # 说明
16
13
 
@@ -18,7 +15,7 @@
18
15
 
19
16
  它解决了什么问题:
20
17
 
21
- - 很多函数是记不住的, ~~每次写每次都要搜~~ 每次都要问大模型 ,例如pandas排序
18
+ - 很多函数是记不住的, 每次写都要~~搜~~问大模型 ,例如pandas排序
22
19
  - 刷题的时候,树结构的题目很难调试
23
20
 
24
21
 
@@ -60,9 +57,9 @@ https://nlpertools.readthedocs.io/en/latest/
60
57
  def __init__(self, IPT_MODEL_PATH):
61
58
  self.ltp = LTP(IPT_MODEL_PATH)
62
59
  ```
63
- 通过pyinstrument判断,超过1s的包即采用这种方式
60
+ 通过`pyinstrument`判断,超过1s的包即采用这种方式
64
61
  - 2s+ happybase、seaborn、torch、jieba
65
- - 1s+
62
+ - 1s+ /
66
63
  - 0.5s+ pandas elasticsearch transformers xgboost nltk mongo
67
64
 
68
65
 
@@ -70,6 +67,8 @@ https://nlpertools.readthedocs.io/en/latest/
70
67
 
71
68
  - [readthedoc 检查文档构建状况](https://readthedocs.org/projects/nlpertools/builds)
72
69
 
70
+ - [打包发布指南](https://juejin.cn/post/7369413136224878644)
71
+
73
72
  - 发布版本需要加tag
74
73
 
75
74
  ## 开发哲学
@@ -91,6 +90,23 @@ b = nlpertools.io.file.readtxt_list_all_strip('res.txt')
91
90
  ```
92
91
 
93
92
  ```bash
94
- # 监控gpu显存
95
- python -m nlpertools
93
+ # 生成pypi双因素认证的实时密钥(需要提供key)
94
+ python -m nlpertools.cli --get_2fa --get_2fa_key your_key
95
+
96
+ ## git
97
+ python -m nlpertools.cli --gitpush
98
+ python -m nlpertools.cli --gitpull
99
+
100
+ # 以下功能被nvitop替代,不推荐使用
101
+ ## 监控gpu显存
102
+ python -m nlpertools.monitor.gpu
103
+ ## 监控内存
104
+ python -m nlpertools.monitor.memory
96
105
  ```
106
+
107
+ ## 一些常用项目
108
+
109
+ nvitop
110
+
111
+ ydata-profiling
112
+
@@ -1,6 +1,5 @@
1
1
  [metadata]
2
2
  name = nlpertools
3
- version = 1.0.6dev
4
3
  author = youshuJi
5
4
  author_email = zjs20001205@gmail.com
6
5
  description = A small package about small basic IO operation when coding
@@ -0,0 +1,36 @@
1
+ import os
2
+ import re
3
+
4
+ from setuptools import setup
5
+
6
+
7
+ def get_version():
8
+ with open(os.path.join("src", "nlpertools", "__init__.py"), "r", encoding="utf-8") as f:
9
+ file_content = f.read()
10
+ pattern = r"{}\W*=\W*\'([^\"]+)\'".format("__version__")
11
+ (version,) = re.findall(pattern, file_content)
12
+ return version
13
+
14
+
15
+ def main():
16
+ setup(
17
+ # https://juejin.cn/post/7369349560421040128
18
+ install_requires=[
19
+ "numpy",
20
+ "pandas",
21
+ "psutil"
22
+ ],
23
+ extras_require={
24
+ "torch": ["torch"],
25
+ },
26
+ version=get_version(),
27
+ entry_points={
28
+ "console_scripts": [
29
+ "ncli=nlpertools.cli:main",
30
+ ]
31
+ }
32
+ )
33
+
34
+
35
+ if __name__ == '__main__':
36
+ main()
@@ -3,6 +3,7 @@
3
3
  # @Author : youshu.Ji
4
4
  from .algo.kmp import *
5
5
  from .data_structure.base_structure import *
6
+ from .draw import *
6
7
  from .dataprocess import *
7
8
  from .io.dir import *
8
9
  from .io.file import *
@@ -15,10 +16,8 @@ from .reminder import *
15
16
  from .utils_for_nlpertools import *
16
17
  from .wrapper import *
17
18
  from .monitor import *
19
+ from .cli import *
18
20
 
19
- import os
20
21
 
21
22
 
22
- DB_CONFIG_FILE = os.path.join(os.path.dirname(__file__),"default_db_config.yml")
23
-
24
- __version__ = '1.0.5'
23
+ __version__ = '1.0.9'
@@ -0,0 +1,143 @@
1
+ import argparse
2
+ import os
3
+ import uuid
4
+ import sys
5
+
6
+ """
7
+ 如何Debug cli.py
8
+ """
9
+
10
+
11
+ def git_push():
12
+ """
13
+ 针对国内提交github经常失败,自动提交
14
+ """
15
+ num = -1
16
+ while 1:
17
+ num += 1
18
+ print("retry num: {}".format(num))
19
+ info = os.system("git push --set-upstream origin main")
20
+ print(str(info))
21
+ if not str(info).startswith("fatal"):
22
+ print("scucess")
23
+ break
24
+
25
+
26
+ def git_pull():
27
+ """
28
+ 针对国内拉取github经常失败,自动重试拉取
29
+ """
30
+ num = -1
31
+ while 1:
32
+ num += 1
33
+ print("retry num: {}".format(num))
34
+ info = os.system("git pull")
35
+ print(str(info))
36
+ if not str(info).startswith("fatal") and not str(info).startswith("error"):
37
+ print("scucess")
38
+ break
39
+
40
+
41
+ def get_mac_address():
42
+ mac = uuid.UUID(int=uuid.getnode()).hex[-12:]
43
+ mac_address = ":".join([mac[e:e + 2] for e in range(0, 11, 2)])
44
+ print("mac address 不一定准确")
45
+ print(mac_address)
46
+ return mac_address
47
+
48
+
49
+ def get_2af_value(key):
50
+ import pyotp
51
+ """
52
+ key应该是7位的
53
+ """
54
+ print(key)
55
+ totp = pyotp.TOTP(key)
56
+ print(totp.now())
57
+
58
+
59
+ def start_gpu_usage_notify_server():
60
+ from flask import Flask
61
+
62
+ app = Flask(__name__)
63
+
64
+ @app.route("/notify", methods=["GET"])
65
+ def notify():
66
+ # 这里可以根据需要动态生成通知内容
67
+ usage = os.popen("nvidia-smi --query-gpu=memory.used --format=csv").read().split("\n")[1:]
68
+ res = 0
69
+ for edx, each in enumerate(usage):
70
+ if each.startswith("0"):
71
+ res += 1
72
+ print(res)
73
+ return str(res), 200
74
+
75
+ app.run(host="0.0.0.0", port=5000)
76
+
77
+
78
+ def start_gpu_usage_notify_client():
79
+ import requests
80
+ from plyer import notification
81
+ import time
82
+
83
+ SERVER_URL = 'http://127.0.0.1:5000/notify' # 服务器的 API 地址
84
+
85
+ def notify(text):
86
+ # 使用 plyer 发送通知
87
+ notification.notify(
88
+ title='远程通知',
89
+ message=text,
90
+ timeout=10 # 10秒的通知显示时间
91
+ )
92
+
93
+ """定时轮询服务器获取通知"""
94
+ while True:
95
+ try:
96
+ response = requests.get(SERVER_URL)
97
+ if response.status_code == 200:
98
+ num = int(response.text)
99
+ if num > 0:
100
+ notify(f"服务器有{num}张卡")
101
+ print(f"服务器有{num}张卡")
102
+ else:
103
+ print("服务器没有新通知")
104
+ except Exception as e:
105
+ print(f"与服务器连接失败: {e}")
106
+
107
+ time.sleep(1)
108
+
109
+
110
+ def main():
111
+ parser = argparse.ArgumentParser(description="CLI tool for git operations and getting MAC address.")
112
+ parser.add_argument('--gitpush', action='store_true', help='Perform git push operation.')
113
+ parser.add_argument('--gitpull', action='store_true', help='Perform git pull operation.')
114
+ parser.add_argument('--mac_address', action='store_true', help='Get the MAC address.')
115
+
116
+ parser.add_argument('--get_2fa', action='store_true', help='Get the 2fa value.')
117
+ parser.add_argument('--get_2fa_key', type=str, help='Get the 2fa value.')
118
+ parser.add_argument('--monitor_gpu_cli', action='store_true', help='Get the 2fa value.')
119
+ parser.add_argument('--monitor_gpu_ser', action='store_true', help='Get the 2fa value.')
120
+
121
+ args = parser.parse_args()
122
+
123
+ if args.gitpush:
124
+ git_push()
125
+ elif args.gitpull:
126
+ git_pull()
127
+ elif args.mac_address:
128
+ get_mac_address()
129
+ elif args.monitor_gpu_cli:
130
+ start_gpu_usage_notify_client()
131
+ elif args.monitor_gpu_ser:
132
+ start_gpu_usage_notify_server()
133
+ elif args.get_2fa:
134
+ if args.get_2fa_key:
135
+ get_2af_value(args.get_2fa_key)
136
+ else:
137
+ print("Please provide a key as an argument.")
138
+ else:
139
+ print("No operation specified.")
140
+
141
+
142
+ if __name__ == '__main__':
143
+ main()
@@ -1,3 +1,4 @@
1
+ #encoding=utf-8
1
2
  # !/usr/bin/python3.8
2
3
  # -*- coding: utf-8 -*-
3
4
  # @Author : youshu.Ji
@@ -5,9 +6,11 @@ import datetime
5
6
  import json
6
7
  import logging
7
8
 
8
- from . import DB_CONFIG_FILE
9
9
  from .io.file import read_yaml
10
10
  from .utils.package import *
11
+ import os
12
+
13
+ DB_CONFIG_FILE = os.path.join(os.path.dirname(__file__), "default_db_config.yml")
11
14
 
12
15
  # import aioredis
13
16
  # import happybase
@@ -28,21 +31,24 @@ class Neo4jOps(object):
28
31
  NEO4J_TIMEOUT = 0.3
29
32
  pass
30
33
 
34
+
31
35
  class SqliteOps(object):
32
- import sqlite3
33
- database_path = r'xx.db'
34
- conn = sqlite3.connect(database_path)
35
- c = conn.cursor()
36
- sql = "select name from sqlite_master where type='table' order by name"
37
- c.execute(sql)
38
- print(c.fetchall())
39
- sql = "select * from typecho_contents"
40
- c.execute(sql)
41
- res = c.fetchall()
42
- print(res[3])
43
-
44
- conn.commit()
45
- conn.close()
36
+ pass
37
+ # import sqlite3
38
+ # database_path = r'xx.db'
39
+ # conn = sqlite3.connect(database_path)
40
+ # c = conn.cursor()
41
+ # sql = "select name from sqlite_master where type='table' order by name"
42
+ # c.execute(sql)
43
+ # print(c.fetchall())
44
+ # sql = "select * from typecho_contents"
45
+ # c.execute(sql)
46
+ # res = c.fetchall()
47
+ # print(res[3])
48
+ #
49
+ # conn.commit()
50
+ # conn.close()
51
+
46
52
 
47
53
  class MysqlOps(object):
48
54
  import pandas as pd
@@ -116,6 +122,41 @@ class EsOps(object):
116
122
  print(f"批量保存数据: {_res}")
117
123
 
118
124
 
125
+ class MongoDB_BETA:
126
+ def __init__(self, host='localhost', port=27017, db_name=None, collection_name=None):
127
+ self.host = host
128
+ self.port = port
129
+ self.db_name = db_name
130
+ self.collection_name = collection_name
131
+ self.client = None
132
+ self.db = None
133
+ self.collection = None
134
+
135
+ def connect(self):
136
+ self.client = MongoClient(self.host, self.port)
137
+ self.db = self.client[self.db_name]
138
+ self.collection = self.db[self.collection_name]
139
+
140
+ def close(self):
141
+ if self.client:
142
+ self.client.close()
143
+
144
+ def insert_data(self, data):
145
+ if isinstance(data, list):
146
+ self.collection.insert_many(data)
147
+ else:
148
+ self.collection.insert_one(data)
149
+
150
+ def check_data_exists(self, query):
151
+ """
152
+ 检查某个数据是否存在于数据库中
153
+ :param query: 查询条件
154
+ :return: 布尔值,表示数据是否存在
155
+ """
156
+ return self.collection.count_documents(query) > 0
157
+
158
+
159
+
119
160
  class MongoOps(object):
120
161
  from pymongo import MongoClient
121
162
  def __init__(self, config=global_db_config["mongo"]):
@@ -348,8 +389,6 @@ class KafkaOps(object):
348
389
  print(recv)
349
390
 
350
391
 
351
-
352
-
353
392
  class MilvusOps(object):
354
393
  def __init__(self, config=global_db_config.milvus):
355
394
  from pymilvus import connections, Collection
@@ -55,9 +55,9 @@ class Pattern:
55
55
  # 中文人名
56
56
  chinese_name_pattern = "(?:[\u4e00-\u9fa5·]{2,3})"
57
57
  # 英文人名
58
- english_name_pattern = "(^[a-zA-Z][a-zA-Z\s]{0,20}[a-zA-Z]$)"
58
+ english_name_pattern = r"(^[a-zA-Z][a-zA-Z\s]{0,20}[a-zA-Z]$)"
59
59
  # 纯数字
60
- pure_num_pattern = "\d+"
60
+ pure_num_pattern = r"\d+"
61
61
  # xxxx图/表 之类的表述
62
62
  pic_table_descript_pattern = ".{1,15}图"
63
63
 
@@ -66,20 +66,20 @@ class Pattern:
66
66
  hlink_pattern = (
67
67
  r"(https?|ftp|file)://[-A-Za-z0-9+&@#/%?=~_|!:,.;]+[-A-Za-z0-9+&@#/%=~_|]"
68
68
  )
69
- http_pattern = "(http|https):\/\/([\w.]+\/?)\S*/\S*"
69
+ http_pattern = r"(http|https):\/\/([\w.]+\/?)\S*/\S*"
70
70
  # 邮箱
71
- email_pattern = "[A-Za-z0-9\u4e00-\u9fa5]+@[a-zA-Z0-9_-]+(\.[a-zA-Z0-9_-]+)+"
71
+ email_pattern = r"[A-Za-z0-9\u4e00-\u9fa5]+@[a-zA-Z0-9_-]+(\.[a-zA-Z0-9_-]+)+"
72
72
  # html 可能过于严格了
73
- html_pattern = "<[\s\S]*?>"
73
+ html_pattern = r"<[\s\S]*?>"
74
74
  # 重复 “asdasdasdasd”
75
75
  repeat_pattern = "(.)\1+"
76
76
  # 日期
77
- day_time_pattern = "\d{1,4}(-)(1[0-2]|0?[1-9])\1(0?[1-9]|[1-2]\d|30|31)"
77
+ day_time_pattern = r"\d{1,4}(-)(1[0-2]|0?[1-9])\1(0?[1-9]|[1-2]\d|30|31)"
78
78
  # 小时
79
- hour_time_pattern = "(?:[01]\d|2[0-3]):[0-5]\d:[0-5]\d"
79
+ hour_time_pattern = r"(?:[01]\d|2[0-3]):[0-5]\d:[0-5]\d"
80
80
  # 股票
81
81
  stock_pattern = (
82
- "(s[hz]|S[HZ])(000[\d]{3}|002[\d]{3}|300[\d]{3}|600[\d]{3}|60[\d]{4})"
82
+ r"(s[hz]|S[HZ])(000[\d]{3}|002[\d]{3}|300[\d]{3}|600[\d]{3}|60[\d]{4})"
83
83
  )
84
84
 
85
85
  # 一般是需要替换的
@@ -91,7 +91,7 @@ class Pattern:
91
91
  # 微博视频等
92
92
  weibo_pattern = r"([\s]\w+(的微博视频)|#|【|】|转发微博)"
93
93
  # @
94
- at_pattern = "@\w+"
94
+ at_pattern = r"@\w+"
95
95
 
96
96
  # from https://github.com/bigscience-workshop/data-preparation pii
97
97
  year_patterns = [
@@ -116,7 +116,7 @@ class Pattern:
116
116
  ipv4_pattern = r'(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)(?:\.(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)){3}'
117
117
  ipv6_pattern = r'(?:[0-9a-fA-F]{1,4}:){7,7}[0-9a-fA-F]{1,4}|(?:[0-9a-fA-F]{1,4}:){1,7}:|(?:[0-9a-fA-F]{1,4}:){1,6}:[0-9a-fA-F]{1,4}|(?:[0-9a-fA-F]{1,4}:){1,5}(?::[0-9a-fA-F]{1,4}){1,2}|(?:[0-9a-fA-F]{1,4}:){1,4}(?::[0-9a-fA-F]{1,4}){1,3}|(?:[0-9a-fA-F]{1,4}:){1,3}(?::[0-9a-fA-F]{1,4}){1,4}|(?:[0-9a-fA-F]{1,4}:){1,2}(?::[0-9a-fA-F]{1,4}){1,5}|[0-9a-fA-F]{1,4}:(?:(?::[0-9a-fA-F]{1,4}){1,6})|:(?:(?::[0-9a-fA-F]{1,4}){1,7}|:)|fe80:(?::[0-9a-fA-F]{0,4}){0,4}%[0-9a-zA-Z]{1,}|::(?:ffff(?::0{1,4}){0,1}:){0,1}(?:(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3,3}(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])|(?:[0-9a-fA-F]{1,4}:){1,4}:(?:(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3,3}(25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])'
118
118
  ip_pattern = r"(?:^|[\b\s@?,!;:\'\")(.\p{Han}])(" + r"|".join(
119
- [ipv4_pattern, ipv6_pattern]) + ")(?:$|[\s@,?!;:\'\"(.\p{Han}])"
119
+ [ipv4_pattern, ipv6_pattern]) + r")(?:$|[\s@,?!;:\'\"(.\p{Han}])"
120
120
 
121
121
  # https://regex101.com/r/EpA5B7/1
122
122
  email_line_pattern = r'''
@@ -466,7 +466,7 @@ class TextProcess(object):
466
466
  p = re.compile(pattern, re.S)
467
467
  text = p.sub("", text)
468
468
 
469
- dr = re.compile("@\w+", re.S)
469
+ dr = re.compile(r"@\w+", re.S)
470
470
  text = dr.sub("", text)
471
471
 
472
472
  return text
@@ -527,7 +527,7 @@ class TextProcess(object):
527
527
  text = re.sub(pattern, replace, text)
528
528
  return text
529
529
 
530
- def calc_proportion_zh(self,text):
530
+ def calc_proportion_zh(self, text):
531
531
  text = text.strip()
532
532
  # 如果是中国英文的情况,并且英文有空格分开
533
533
  if " " in text:
@@ -538,6 +538,8 @@ class TextProcess(object):
538
538
  chinese_count += 1
539
539
  else:
540
540
  pass
541
+
542
+
541
543
  class CopyFunc():
542
544
  # from https://github.com/lemon234071/clean-dialog
543
545
  def is_chinese_char(cp):
@@ -597,6 +599,20 @@ def convert_basic2fullwidth(sentence):
597
599
  new_sentence += char
598
600
  return new_sentence
599
601
 
602
+
603
+ def clean_illegal_chars_for_excel(df):
604
+ # openpyxl 库写入 Excel 文件时,有一些非法字符,需要删除
605
+ # 定义一个函数来移除字符串中的非法字符
606
+ def remove_illegal_chars(s):
607
+ if isinstance(s, str):
608
+ # 移除 ASCII 码在非法范围内的字符
609
+ return re.sub(r'[\x00-\x08\x0B\x0C\x0E-\x1F]', '', s)
610
+ return s
611
+
612
+ # 应用清理函数到数据框的每个元素
613
+ return df.map(remove_illegal_chars)
614
+
615
+
600
616
  if __name__ == "__main__":
601
617
  pattern_for_filter = [
602
618
  Pattern.redundancy_space_pattern,
@@ -0,0 +1,81 @@
1
+ #!/usr/bin/python3.8
2
+ # -*- coding: utf-8 -*-
3
+ # @Author : youshu.Ji
4
+ from ..utils.package import plt
5
+
6
+
7
+ def confused_matrix(confuse_matrix):
8
+ import seaborn as sns
9
+ sns.set()
10
+ f, ax = plt.subplots()
11
+ ticklabels = ["l1", "l2", "l31"]
12
+ sns.heatmap(confuse_matrix, annot=True, fmt=".3g", ax=ax, cmap='rainbow',
13
+ xticklabels=ticklabels, yticklabels=ticklabels) # 画热力图
14
+
15
+ ax.set_title('confusion matrix') # 标题
16
+ ax.set_xlabel('predict') # x轴
17
+ ax.set_ylabel('true') # y轴
18
+ plt.show()
19
+
20
+ f.savefig('tmp.jpg', bbox_inches='tight')
21
+
22
+
23
+ def plot_histogram(data, bin_size, max_bin):
24
+ """
25
+ 画直方图,超过max_bin的值统一归入最后一个"∞"区间
26
+ :param data:
27
+ :param bin_size:
28
+ :return:
29
+ """
30
+ import matplotlib.pyplot as plt
31
+ import numpy as np
32
+ import pandas as pd
33
+ from matplotlib.ticker import MaxNLocator
34
+ # 将超过max_bin的值改为max_bin + 3,使其落入最后一个区间
35
+ def process_lengths(data):
36
+ return [length if length <= max_bin else max_bin + 3 for length in data]
37
+
38
+ # 前闭后开
39
+ # min_num, max_num = 0, 1000
40
+ # min_num, max_num = min(data), max(data)
41
+
42
+ plt.figure(figsize=(12, 8))
43
+ processed_data = process_lengths(data)
44
+ bins = np.arange(0, max_bin + 2 * bin_size, bin_size)
45
+ # 绘制直方图
46
+ n, new_bins, patches = plt.hist(processed_data, bins=bins, edgecolor='black', color='skyblue', alpha=0.7,
47
+ linewidth=0)
48
+
49
+ # 添加"∞"的标签
50
+ # bins会改变
51
+ plt.gca().set_xticks(bins)
52
+ plt.gca().set_xticklabels([str(i) for i in plt.xticks()[0][:-1]] + ["∞"])
53
+
54
+ mean_val = np.mean(data)
55
+ plt.axvline(mean_val, color='red', linestyle='dashed', linewidth=1)
56
+ plt.text(mean_val + bin_size / 10, max(n) * 0.9, f'Mean: {mean_val:.2f}', color='red')
57
+
58
+ # 添加标题和标签
59
+ plt.title('Module Line Number Distribution', fontsize=16, fontweight='bold')
60
+ plt.xlabel('module line number', fontsize=14)
61
+ plt.ylabel('frequency', fontsize=14)
62
+
63
+ plt.grid(True, linestyle='--', alpha=0.6)
64
+
65
+ plt.xticks(fontsize=12)
66
+ plt.yticks(fontsize=12)
67
+
68
+ # 在每个柱状图上显示数值
69
+ for i in range(len(patches)):
70
+ plt.text(patches[i].get_x() + patches[i].get_width() / 2, patches[i].get_height(),
71
+ str(int(n[i])), ha='center', va='bottom', fontsize=12)
72
+ plt.gca().yaxis.set_major_locator(MaxNLocator(integer=True))
73
+ # 显示图表
74
+ plt.show()
75
+
76
+
77
+ if __name__ == '__main__':
78
+ # 调整区间大小
79
+ bin_size = 50
80
+ # 示例模块长度数据
81
+ plot_histogram([1, 100, 999, 1000, 1002, 1100, 1150], bin_size, max_bin=1000)
@@ -0,0 +1,33 @@
1
+ # 数学函数
2
+ def draw_log():
3
+ import matplotlib.pyplot as plt
4
+ import numpy as np
5
+ from matplotlib.ticker import MultipleLocator, FormatStrFormatter
6
+
7
+ # 生成一些数据
8
+ x = np.linspace(0.1, 10, 100)
9
+ # 默认log指的时loge
10
+ y = np.log(x)
11
+
12
+ # 创建一个新的图形和轴
13
+ fig, ax = plt.subplots()
14
+
15
+ # 绘制log图像
16
+ ax.plot(x, y)
17
+
18
+ # 设置图像标题和轴标签
19
+ ax.set_title("Logarithmic Function")
20
+ ax.set_xlabel("x")
21
+ ax.set_ylabel("log(x)")
22
+ # 设置横坐标的刻度间隔为1
23
+ ax.xaxis.set_major_locator(MultipleLocator(1))
24
+
25
+ # 设置横坐标的刻度格式
26
+ ax.xaxis.set_major_formatter(FormatStrFormatter("%.1f"))
27
+ # 添加x=1的虚线
28
+ ax.axvline(x=1, linestyle="--", color="gray")
29
+ # 添加y=0的虚线
30
+ ax.axhline(y=0, linestyle="--", color="gray")
31
+
32
+ # 显示图像
33
+ plt.show()