nlpertools 1.0.10__tar.gz → 1.0.11__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {nlpertools-1.0.10/src/nlpertools.egg-info → nlpertools-1.0.11}/PKG-INFO +3 -1
- {nlpertools-1.0.10 → nlpertools-1.0.11}/README.md +2 -0
- {nlpertools-1.0.10 → nlpertools-1.0.11}/src/nlpertools/__init__.py +2 -1
- {nlpertools-1.0.10 → nlpertools-1.0.11}/src/nlpertools/cli.py +14 -17
- nlpertools-1.0.11/src/nlpertools/dataprocess/__init__.py +1 -0
- nlpertools-1.0.11/src/nlpertools/dataprocess/dedupl.py +9 -0
- nlpertools-1.0.10/src/nlpertools/dataprocess.py → nlpertools-1.0.11/src/nlpertools/dataprocess/dp_main.py +1 -1
- {nlpertools-1.0.10 → nlpertools-1.0.11}/src/nlpertools/io/dir.py +25 -5
- {nlpertools-1.0.10 → nlpertools-1.0.11}/src/nlpertools/io/file.py +46 -43
- nlpertools-1.0.11/src/nlpertools/llm/__init__.py +3 -0
- nlpertools-1.0.11/src/nlpertools/llm/call_llm_once.py +60 -0
- {nlpertools-1.0.10 → nlpertools-1.0.11}/src/nlpertools/llm/infer.py +50 -5
- {nlpertools-1.0.10 → nlpertools-1.0.11}/src/nlpertools/other.py +77 -51
- {nlpertools-1.0.10 → nlpertools-1.0.11}/src/nlpertools/utils/package.py +9 -10
- {nlpertools-1.0.10 → nlpertools-1.0.11}/src/nlpertools/wrapper.py +6 -4
- {nlpertools-1.0.10 → nlpertools-1.0.11/src/nlpertools.egg-info}/PKG-INFO +3 -1
- {nlpertools-1.0.10 → nlpertools-1.0.11}/src/nlpertools.egg-info/SOURCES.txt +3 -1
- nlpertools-1.0.10/src/nlpertools/llm/call_llm_once.py +0 -30
- nlpertools-1.0.10/src/nlpertools/template/__init__.py +0 -0
- {nlpertools-1.0.10 → nlpertools-1.0.11}/LICENSE +0 -0
- {nlpertools-1.0.10 → nlpertools-1.0.11}/pyproject.toml +0 -0
- {nlpertools-1.0.10 → nlpertools-1.0.11}/setup.cfg +0 -0
- {nlpertools-1.0.10 → nlpertools-1.0.11}/setup.py +0 -0
- {nlpertools-1.0.10 → nlpertools-1.0.11}/src/nlpertools/algo/__init__.py +0 -0
- {nlpertools-1.0.10 → nlpertools-1.0.11}/src/nlpertools/algo/ac.py +0 -0
- {nlpertools-1.0.10 → nlpertools-1.0.11}/src/nlpertools/algo/bit_ops.py +0 -0
- {nlpertools-1.0.10 → nlpertools-1.0.11}/src/nlpertools/algo/kmp.py +0 -0
- {nlpertools-1.0.10 → nlpertools-1.0.11}/src/nlpertools/algo/num_ops.py +0 -0
- {nlpertools-1.0.10 → nlpertools-1.0.11}/src/nlpertools/algo/template.py +0 -0
- {nlpertools-1.0.10 → nlpertools-1.0.11}/src/nlpertools/algo/union.py +0 -0
- {nlpertools-1.0.10 → nlpertools-1.0.11}/src/nlpertools/data_client.py +0 -0
- {nlpertools-1.0.10 → nlpertools-1.0.11}/src/nlpertools/data_structure/__init__.py +0 -0
- {nlpertools-1.0.10 → nlpertools-1.0.11}/src/nlpertools/data_structure/base_structure.py +0 -0
- {nlpertools-1.0.10 → nlpertools-1.0.11}/src/nlpertools/default_db_config.yml +0 -0
- {nlpertools-1.0.10 → nlpertools-1.0.11}/src/nlpertools/draw/__init__.py +0 -0
- {nlpertools-1.0.10 → nlpertools-1.0.11}/src/nlpertools/draw/draw.py +0 -0
- {nlpertools-1.0.10 → nlpertools-1.0.11}/src/nlpertools/draw/math_func.py +0 -0
- {nlpertools-1.0.10 → nlpertools-1.0.11}/src/nlpertools/get_2fa.py +0 -0
- {nlpertools-1.0.10 → nlpertools-1.0.11}/src/nlpertools/io/__init__.py +0 -0
- {nlpertools-1.0.10 → nlpertools-1.0.11}/src/nlpertools/llm/price.py +0 -0
- {nlpertools-1.0.10 → nlpertools-1.0.11}/src/nlpertools/ml.py +0 -0
- {nlpertools-1.0.10/src/nlpertools/llm → nlpertools-1.0.11/src/nlpertools/monitor}/__init__.py +0 -0
- {nlpertools-1.0.10 → nlpertools-1.0.11}/src/nlpertools/monitor/gpu.py +0 -0
- {nlpertools-1.0.10 → nlpertools-1.0.11}/src/nlpertools/monitor/memory.py +0 -0
- {nlpertools-1.0.10 → nlpertools-1.0.11}/src/nlpertools/movie.py +0 -0
- {nlpertools-1.0.10 → nlpertools-1.0.11}/src/nlpertools/nlpertools_config.yml +0 -0
- {nlpertools-1.0.10 → nlpertools-1.0.11}/src/nlpertools/open_api.py +0 -0
- {nlpertools-1.0.10 → nlpertools-1.0.11}/src/nlpertools/pic.py +0 -0
- {nlpertools-1.0.10 → nlpertools-1.0.11}/src/nlpertools/plugin.py +0 -0
- {nlpertools-1.0.10 → nlpertools-1.0.11}/src/nlpertools/reminder.py +0 -0
- {nlpertools-1.0.10/src/nlpertools/monitor → nlpertools-1.0.11/src/nlpertools/template}/__init__.py +0 -0
- {nlpertools-1.0.10 → nlpertools-1.0.11}/src/nlpertools/utils/__init__.py +0 -0
- {nlpertools-1.0.10 → nlpertools-1.0.11}/src/nlpertools/utils/lazy.py +0 -0
- {nlpertools-1.0.10 → nlpertools-1.0.11}/src/nlpertools/utils/log_util.py +0 -0
- {nlpertools-1.0.10 → nlpertools-1.0.11}/src/nlpertools/utils/package_v1.py +0 -0
- {nlpertools-1.0.10 → nlpertools-1.0.11}/src/nlpertools/utils/package_v2.py +0 -0
- {nlpertools-1.0.10 → nlpertools-1.0.11}/src/nlpertools/utils_for_nlpertools.py +0 -0
- {nlpertools-1.0.10 → nlpertools-1.0.11}/src/nlpertools/vector_index_demo.py +0 -0
- {nlpertools-1.0.10 → nlpertools-1.0.11}/src/nlpertools.egg-info/dependency_links.txt +0 -0
- {nlpertools-1.0.10 → nlpertools-1.0.11}/src/nlpertools.egg-info/entry_points.txt +0 -0
- {nlpertools-1.0.10 → nlpertools-1.0.11}/src/nlpertools.egg-info/requires.txt +0 -0
- {nlpertools-1.0.10 → nlpertools-1.0.11}/src/nlpertools.egg-info/top_level.txt +0 -0
- {nlpertools-1.0.10 → nlpertools-1.0.11}/src/nlpertools_helper/__init__.py +0 -0
- {nlpertools-1.0.10 → nlpertools-1.0.11}/tests/test_kmp.py +0 -0
- {nlpertools-1.0.10 → nlpertools-1.0.11}/tests/test_path_exists.py +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: nlpertools
|
3
|
-
Version: 1.0.
|
3
|
+
Version: 1.0.11
|
4
4
|
Summary: A small package about small basic IO operation when coding
|
5
5
|
Home-page: https://github.com/lvzii/nlpertools
|
6
6
|
Author: youshuJi
|
@@ -64,6 +64,8 @@ json_data = nlpertools.load_from_json('res.json')
|
|
64
64
|
```bash
|
65
65
|
## git, 连接github不稳定的时候非常有用
|
66
66
|
ncli git pull
|
67
|
+
## 带有参数时,加上--以避免-u被解析
|
68
|
+
ncli -- git push -u origin main
|
67
69
|
|
68
70
|
# 生成pypi双因素认证的实时密钥(需要提供key)
|
69
71
|
ncli --get_2fa --get_2fa_key your_key
|
@@ -4,6 +4,7 @@
|
|
4
4
|
from .algo.kmp import *
|
5
5
|
from .data_structure.base_structure import *
|
6
6
|
from .draw import *
|
7
|
+
from .dataprocess.dp_main import *
|
7
8
|
from .dataprocess import *
|
8
9
|
from .io.dir import *
|
9
10
|
from .io.file import *
|
@@ -20,4 +21,4 @@ from .cli import *
|
|
20
21
|
from .llm import *
|
21
22
|
|
22
23
|
|
23
|
-
__version__ = "1.0.
|
24
|
+
__version__ = "1.0.11"
|
@@ -2,7 +2,7 @@ import argparse
|
|
2
2
|
import os
|
3
3
|
import uuid
|
4
4
|
import sys
|
5
|
-
from .dataprocess import startwith
|
5
|
+
from .dataprocess.dp_main import startwith
|
6
6
|
|
7
7
|
|
8
8
|
def run_git_command(command):
|
@@ -17,7 +17,7 @@ def run_git_command(command):
|
|
17
17
|
info = os.system(command)
|
18
18
|
print(str(info))
|
19
19
|
# 检查命令执行结果,若未出现错误则认为执行成功
|
20
|
-
if not startwith(str(info), ["fatal", "error", "128", "1"]):
|
20
|
+
if (not startwith(str(info), ["fatal", "error", "128", "1"])) and "fatal" not in str(info):
|
21
21
|
print("success")
|
22
22
|
print(f"success info : ##{info}##")
|
23
23
|
break
|
@@ -25,7 +25,7 @@ def run_git_command(command):
|
|
25
25
|
|
26
26
|
def get_mac_address():
|
27
27
|
mac = uuid.UUID(int=uuid.getnode()).hex[-12:]
|
28
|
-
mac_address = ":".join([mac[e:e + 2] for e in range(0, 11, 2)])
|
28
|
+
mac_address = ":".join([mac[e : e + 2] for e in range(0, 11, 2)])
|
29
29
|
print("mac address 不一定准确")
|
30
30
|
print(mac_address)
|
31
31
|
return mac_address
|
@@ -33,6 +33,7 @@ def get_mac_address():
|
|
33
33
|
|
34
34
|
def get_2af_value(key):
|
35
35
|
import pyotp
|
36
|
+
|
36
37
|
"""
|
37
38
|
key应该是7位的
|
38
39
|
"""
|
@@ -65,15 +66,11 @@ def start_gpu_usage_notify_client():
|
|
65
66
|
from plyer import notification
|
66
67
|
import time
|
67
68
|
|
68
|
-
SERVER_URL =
|
69
|
+
SERVER_URL = "http://127.0.0.1:5000/notify" # 服务器的 API 地址
|
69
70
|
|
70
71
|
def notify(text):
|
71
72
|
# 使用 plyer 发送通知
|
72
|
-
notification.notify(
|
73
|
-
title='远程通知',
|
74
|
-
message=text,
|
75
|
-
timeout=10 # 10秒的通知显示时间
|
76
|
-
)
|
73
|
+
notification.notify(title="远程通知", message=text, timeout=10) # 10秒的通知显示时间
|
77
74
|
|
78
75
|
"""定时轮询服务器获取通知"""
|
79
76
|
while True:
|
@@ -94,12 +91,12 @@ def start_gpu_usage_notify_client():
|
|
94
91
|
|
95
92
|
def main():
|
96
93
|
parser = argparse.ArgumentParser(description="CLI tool for git operations and other functions.")
|
97
|
-
parser.add_argument(
|
98
|
-
parser.add_argument(
|
99
|
-
parser.add_argument(
|
100
|
-
parser.add_argument(
|
101
|
-
parser.add_argument(
|
102
|
-
parser.add_argument(
|
94
|
+
parser.add_argument("git_command", nargs="*", help="Any git command (e.g., push, pull)")
|
95
|
+
parser.add_argument("--mac_address", action="store_true", help="Get the MAC address.")
|
96
|
+
parser.add_argument("--get_2fa", action="store_true", help="Get the 2fa value.")
|
97
|
+
parser.add_argument("--get_2fa_key", type=str, help="Get the 2fa value.")
|
98
|
+
parser.add_argument("--monitor_gpu_cli", action="store_true", help="monitor gpu cli")
|
99
|
+
parser.add_argument("--monitor_gpu_ser", action="store_true", help="monitor gpu ser")
|
103
100
|
|
104
101
|
args = parser.parse_args()
|
105
102
|
|
@@ -121,5 +118,5 @@ def main():
|
|
121
118
|
print("No operation specified.")
|
122
119
|
|
123
120
|
|
124
|
-
if __name__ ==
|
125
|
-
main()
|
121
|
+
if __name__ == "__main__":
|
122
|
+
main()
|
@@ -0,0 +1 @@
|
|
1
|
+
from .dedupl import *
|
@@ -8,7 +8,7 @@ from typing import List
|
|
8
8
|
import numpy as np
|
9
9
|
|
10
10
|
# from . import DB_CONFIG_FILE # cannot import name 'DB_CONFIG_FILE' from partially initialized module 'nlpertools'
|
11
|
-
from
|
11
|
+
from ..utils.package import *
|
12
12
|
|
13
13
|
main_special_characters = string.punctuation + string.digits + string.whitespace
|
14
14
|
other_special_characters = (
|
@@ -3,6 +3,7 @@
|
|
3
3
|
# @Author : youshu.Ji
|
4
4
|
import os
|
5
5
|
from pathlib import Path
|
6
|
+
from typing import overload,Literal,Union
|
6
7
|
|
7
8
|
|
8
9
|
# dir ----------------------------------------------------------------------
|
@@ -45,15 +46,34 @@ def get_filename(path, suffix=True) -> str:
|
|
45
46
|
filename = filename.split('.')[0]
|
46
47
|
return filename
|
47
48
|
|
48
|
-
|
49
|
-
|
50
|
-
|
49
|
+
"""
|
50
|
+
因为os.listdir无法支持Path类型,虽然是bytelikepath,但是传入Path后只会返回字符串
|
51
|
+
且无法只返回文件名
|
52
|
+
故重新实现
|
53
|
+
"""
|
54
|
+
@overload
|
55
|
+
def listdir(dir_name: Path, including_dir: Literal[True]) -> list[Path]: ...
|
56
|
+
@overload
|
57
|
+
def listdir(dir_name: str, including_dir: Literal[True]) -> list[str]: ...
|
58
|
+
@overload
|
59
|
+
def listdir(dir_name: Path, including_dir: Literal[False] = False) -> list[str]: ...
|
60
|
+
@overload
|
61
|
+
def listdir(dir_name: str, including_dir: Literal[False] = False) -> list[str]: ...
|
62
|
+
|
63
|
+
def listdir(dir_name: Union[Path, str], including_dir: bool = False) -> list[Path] | list[str]:
|
64
|
+
"""
|
65
|
+
including_dir=True -> list[Path] or list[str]
|
66
|
+
including_dir=False -> list[str]
|
67
|
+
"""
|
68
|
+
filenames = os.listdir(str(dir_name))
|
51
69
|
if including_dir:
|
52
|
-
|
70
|
+
if isinstance(dir_name, Path):
|
71
|
+
return [dir_name / filename for filename in filenames]
|
72
|
+
else:
|
73
|
+
return [os.path.join(dir_name, filename) for filename in filenames]
|
53
74
|
else:
|
54
75
|
return list(filenames)
|
55
76
|
|
56
|
-
|
57
77
|
def listdir_yield(dir_name, including_dir=True):
|
58
78
|
filenames = os.listdir(dir_name)
|
59
79
|
for filename in filenames:
|
@@ -5,8 +5,11 @@ import codecs
|
|
5
5
|
import json
|
6
6
|
import pickle
|
7
7
|
import random
|
8
|
-
from itertools import
|
8
|
+
from itertools import takewhile, repeat
|
9
|
+
from typing import Optional
|
10
|
+
from pathlib import Path
|
9
11
|
import pandas as pd
|
12
|
+
|
10
13
|
# import omegaconf
|
11
14
|
# import yaml
|
12
15
|
from ..utils.package import *
|
@@ -15,18 +18,18 @@ LARGE_FILE_THRESHOLD = 1e5
|
|
15
18
|
|
16
19
|
|
17
20
|
def safe_filename(filename: str) -> str:
|
18
|
-
for char in [
|
19
|
-
filename = filename.replace(char,
|
21
|
+
for char in ["\\", "/", ":", "*", "?", '"', "<", ">", "|"]:
|
22
|
+
filename = filename.replace(char, "_")
|
20
23
|
return filename
|
21
24
|
|
22
25
|
|
23
26
|
def read_yaml(path, omega=False):
|
24
27
|
if omega:
|
25
28
|
return omegaconf.OmegaConf.load(path)
|
26
|
-
return yaml.load(codecs.open(path, encoding=
|
29
|
+
return yaml.load(codecs.open(path, encoding="utf-8"), Loader=yaml.FullLoader)
|
27
30
|
|
28
31
|
|
29
|
-
def
|
32
|
+
def merge_file(filelist, save_filename, shuffle=False):
|
30
33
|
contents = []
|
31
34
|
for file in filelist:
|
32
35
|
content = readtxt_list_all_strip(file)
|
@@ -43,9 +46,9 @@ def iter_count(file_name):
|
|
43
46
|
author: unknown
|
44
47
|
"""
|
45
48
|
buffer = 1024 * 1024
|
46
|
-
with codecs.open(file_name,
|
49
|
+
with codecs.open(file_name, "r", "utf-8") as f:
|
47
50
|
buf_gen = takewhile(lambda x: x, (f.read(buffer) for _ in repeat(None)))
|
48
|
-
return sum(buf.count(
|
51
|
+
return sum(buf.count("\n") for buf in buf_gen)
|
49
52
|
|
50
53
|
|
51
54
|
# 需要加入进度条的函数包括
|
@@ -57,24 +60,24 @@ load_from_json
|
|
57
60
|
|
58
61
|
|
59
62
|
# 读txt文件 一次全读完 返回list 去换行
|
60
|
-
def readtxt_list_all_strip(path, encoding=
|
63
|
+
def readtxt_list_all_strip(path, encoding="utf-8") -> list:
|
61
64
|
file_line_num = iter_count(path)
|
62
65
|
lines = []
|
63
|
-
with codecs.open(path,
|
66
|
+
with codecs.open(path, "r", encoding) as r:
|
64
67
|
if file_line_num > LARGE_FILE_THRESHOLD:
|
65
68
|
iter_obj = tqdm(enumerate(r.readlines()), total=file_line_num)
|
66
69
|
else:
|
67
70
|
iter_obj = enumerate(r.readlines())
|
68
71
|
|
69
72
|
for ldx, line in iter_obj:
|
70
|
-
lines.append(line.strip(
|
73
|
+
lines.append(line.strip("\n").strip("\r"))
|
71
74
|
return lines
|
72
75
|
|
73
76
|
|
74
77
|
# 读txt 一次读一行 最后返回list
|
75
78
|
def readtxt_list_each(path) -> list:
|
76
79
|
lines = []
|
77
|
-
with codecs.open(path,
|
80
|
+
with codecs.open(path, "r", "utf-8") as r:
|
78
81
|
line = r.readline()
|
79
82
|
while line:
|
80
83
|
lines.append(line)
|
@@ -82,11 +85,11 @@ def readtxt_list_each(path) -> list:
|
|
82
85
|
return lines
|
83
86
|
|
84
87
|
|
85
|
-
def readtxt_list_each_strip(path
|
88
|
+
def readtxt_list_each_strip(path: Optional[str | Path]):
|
86
89
|
"""
|
87
90
|
yield方法
|
88
91
|
"""
|
89
|
-
with codecs.open(path,
|
92
|
+
with codecs.open(path, "r", "utf-8") as r:
|
90
93
|
line = r.readline()
|
91
94
|
while line:
|
92
95
|
yield line.strip("\n").strip("\r")
|
@@ -95,51 +98,51 @@ def readtxt_list_each_strip(path) -> list:
|
|
95
98
|
|
96
99
|
# 读txt文件 一次全读完 返回list
|
97
100
|
def readtxt_list_all(path) -> list:
|
98
|
-
with codecs.open(path,
|
101
|
+
with codecs.open(path, "r", "utf-8") as r:
|
99
102
|
lines = r.readlines()
|
100
103
|
return lines
|
101
104
|
|
102
105
|
|
103
106
|
# 读byte文件 读成一条string
|
104
107
|
def readtxt_byte(path, encoding="utf-8") -> str:
|
105
|
-
with codecs.open(path,
|
108
|
+
with codecs.open(path, "rb") as r:
|
106
109
|
lines = r.read()
|
107
110
|
lines = lines.decode(encoding)
|
108
|
-
return lines.replace(
|
111
|
+
return lines.replace("\r", "")
|
109
112
|
|
110
113
|
|
111
114
|
# 读txt文件 读成一条string
|
112
|
-
def
|
113
|
-
with codecs.open(path,
|
115
|
+
def read_text(path, encoding="utf-8") -> str:
|
116
|
+
with codecs.open(path, "r", encoding) as r:
|
114
117
|
lines = r.read()
|
115
|
-
return lines.replace(
|
118
|
+
return lines.replace("\r", "")
|
116
119
|
|
117
120
|
|
118
121
|
# 写txt文件覆盖
|
119
|
-
def writetxt_w(txt, path, r=
|
120
|
-
with codecs.open(path, r,
|
122
|
+
def writetxt_w(txt, path, r="w"):
|
123
|
+
with codecs.open(path, r, "utf-8") as w:
|
121
124
|
w.writelines(txt)
|
122
125
|
|
123
126
|
|
124
127
|
# 写txt文件追加
|
125
128
|
def writetxt_a(txt, path):
|
126
|
-
with codecs.open(path,
|
129
|
+
with codecs.open(path, "a", "utf-8") as w:
|
127
130
|
w.writelines(txt)
|
128
131
|
|
129
132
|
|
130
133
|
def writetxt(txt, path, encoding="utf-8"):
|
131
|
-
with codecs.open(path,
|
134
|
+
with codecs.open(path, "w", encoding) as w:
|
132
135
|
w.write(txt)
|
133
136
|
|
134
137
|
|
135
138
|
def writetxt_wb(txt, path):
|
136
|
-
with codecs.open(path,
|
139
|
+
with codecs.open(path, "wb") as w:
|
137
140
|
w.write(txt)
|
138
141
|
|
139
142
|
|
140
143
|
# 写list 覆盖
|
141
144
|
def writetxt_w_list(list, path, num_lf=1):
|
142
|
-
with codecs.open(path,
|
145
|
+
with codecs.open(path, "w", "utf-8") as w:
|
143
146
|
for i in list:
|
144
147
|
w.write(i)
|
145
148
|
w.write("\n" * num_lf)
|
@@ -147,7 +150,7 @@ def writetxt_w_list(list, path, num_lf=1):
|
|
147
150
|
|
148
151
|
# 写list 追加
|
149
152
|
def writetxt_a_list(list, path, num_lf=2):
|
150
|
-
with codecs.open(path,
|
153
|
+
with codecs.open(path, "a", "utf-8") as w:
|
151
154
|
for i in list:
|
152
155
|
w.write(i)
|
153
156
|
w.write("\n" * num_lf)
|
@@ -158,7 +161,7 @@ def save_to_json(content, path):
|
|
158
161
|
json.dump(content, w, ensure_ascii=False, indent=1)
|
159
162
|
|
160
163
|
|
161
|
-
def load_from_json(path):
|
164
|
+
def load_from_json(path: Optional[str | Path]):
|
162
165
|
with codecs.open(path, "r", "utf-8") as r:
|
163
166
|
content = json.load(r)
|
164
167
|
return content
|
@@ -167,60 +170,60 @@ def load_from_json(path):
|
|
167
170
|
# 读txt文件 读成一条string if gb2312
|
168
171
|
def readtxt_string_all_encoding(path):
|
169
172
|
try:
|
170
|
-
with codecs.open(path,
|
173
|
+
with codecs.open(path, "rb", "utf-8-sig") as r:
|
171
174
|
lines = r.read()
|
172
175
|
return lines
|
173
176
|
except:
|
174
177
|
try:
|
175
|
-
with codecs.open(path,
|
178
|
+
with codecs.open(path, "rb", "utf-8") as r:
|
176
179
|
lines = r.reacd()
|
177
180
|
return lines
|
178
181
|
except:
|
179
182
|
try:
|
180
|
-
with codecs.open(path,
|
183
|
+
with codecs.open(path, "rb", "big5") as r:
|
181
184
|
lines = r.read()
|
182
185
|
return lines
|
183
186
|
except:
|
184
187
|
print(path)
|
185
|
-
with codecs.open(path,
|
188
|
+
with codecs.open(path, "rb", "gb2312", errors="ignore") as r:
|
186
189
|
lines = r.read()
|
187
190
|
return lines
|
188
191
|
|
189
192
|
|
190
193
|
def readtxt_list_all_encoding(path):
|
191
194
|
try:
|
192
|
-
with codecs.open(path,
|
195
|
+
with codecs.open(path, "rb", "utf-8-sig") as r:
|
193
196
|
lines = r.readlines()
|
194
197
|
return lines
|
195
198
|
except:
|
196
199
|
try:
|
197
|
-
with codecs.open(path,
|
200
|
+
with codecs.open(path, "rb", "utf-8") as r:
|
198
201
|
lines = r.readlines()
|
199
202
|
return lines
|
200
203
|
except:
|
201
204
|
try:
|
202
|
-
with codecs.open(path,
|
205
|
+
with codecs.open(path, "rb", "big5") as r:
|
203
206
|
lines = r.readlines()
|
204
207
|
return lines
|
205
208
|
except:
|
206
|
-
with codecs.open(path,
|
209
|
+
with codecs.open(path, "rb", "gb2312", errors="ignore") as r:
|
207
210
|
lines = r.readlines()
|
208
211
|
return lines
|
209
212
|
|
210
213
|
|
211
214
|
# line by line
|
212
215
|
def save_to_jsonl(corpus, path):
|
213
|
-
with open(path,
|
216
|
+
with open(path, "w", encoding="utf-8") as wt:
|
214
217
|
for i in corpus:
|
215
218
|
wt.write(json.dumps(i, ensure_ascii=False))
|
216
|
-
wt.write(
|
219
|
+
wt.write("\n")
|
217
220
|
|
218
221
|
|
219
222
|
# line by line
|
220
223
|
def load_from_jsonl(path):
|
221
224
|
file_line_num = iter_count(path)
|
222
225
|
if file_line_num > 1e5:
|
223
|
-
with open(path,
|
226
|
+
with open(path, "r", encoding="utf-8") as rd:
|
224
227
|
corpus = []
|
225
228
|
while True:
|
226
229
|
line = rd.readline()
|
@@ -230,7 +233,7 @@ def load_from_jsonl(path):
|
|
230
233
|
break
|
231
234
|
return corpus
|
232
235
|
else:
|
233
|
-
with open(path,
|
236
|
+
with open(path, "r", encoding="utf-8") as rd:
|
234
237
|
corpus = []
|
235
238
|
while True:
|
236
239
|
line = rd.readline()
|
@@ -242,20 +245,20 @@ def load_from_jsonl(path):
|
|
242
245
|
|
243
246
|
|
244
247
|
def save_pkl(data, path):
|
245
|
-
with open(path,
|
248
|
+
with open(path, "wb") as f:
|
246
249
|
pickle.dump(data, f)
|
247
250
|
|
248
251
|
|
249
252
|
def load_pkl(path):
|
250
|
-
with open(path,
|
253
|
+
with open(path, "rb") as f:
|
251
254
|
data = pickle.load(f)
|
252
255
|
return data
|
253
256
|
|
254
257
|
|
255
258
|
def save_to_csv(df, save_path, index_flag=False):
|
256
|
-
with open(save_path,
|
259
|
+
with open(save_path, "wb+") as csvfile:
|
257
260
|
csvfile.write(codecs.BOM_UTF8)
|
258
|
-
df.to_csv(save_path, mode=
|
261
|
+
df.to_csv(save_path, mode="a", index=index_flag)
|
259
262
|
|
260
263
|
|
261
264
|
def save_to_mongo():
|
@@ -0,0 +1,60 @@
|
|
1
|
+
from ..io.file import read_yaml
|
2
|
+
from tqdm import tqdm
|
3
|
+
import os
|
4
|
+
from typing import Optional, Union
|
5
|
+
|
6
|
+
"""
|
7
|
+
从你当前的项目里找到.key文件 获取url和key
|
8
|
+
"""
|
9
|
+
|
10
|
+
|
11
|
+
def call_once_stream(
|
12
|
+
client, input: Optional[Union[str, list]], model_name: str = "qwen3-0626-e4", max_tokens: int = 8192, temperature=0.2
|
13
|
+
) -> str:
|
14
|
+
"""
|
15
|
+
调用LLM模型进行一次推理
|
16
|
+
:param prompt: 输入的提示文本
|
17
|
+
:param model_name: 模型名称
|
18
|
+
:param max_tokens: 最大输出token数
|
19
|
+
:return: 模型的输出文本
|
20
|
+
"""
|
21
|
+
from openai import OpenAI
|
22
|
+
|
23
|
+
if isinstance(input, str):
|
24
|
+
message = [{"role": "user", "content": input}]
|
25
|
+
elif isinstance(input, list):
|
26
|
+
message = input
|
27
|
+
|
28
|
+
completion = client.chat.completions.create(model=model_name, messages=message, max_tokens=max_tokens, stream=True)
|
29
|
+
text = ""
|
30
|
+
for chunk in completion:
|
31
|
+
if chunk.choices:
|
32
|
+
c = chunk.choices[0].delta.content or ""
|
33
|
+
text += c
|
34
|
+
print(c, end="")
|
35
|
+
else:
|
36
|
+
print()
|
37
|
+
print(chunk.usage)
|
38
|
+
return text
|
39
|
+
|
40
|
+
|
41
|
+
def call_once(
|
42
|
+
client, input: Optional[Union[str, list]], model_name: str = "qwen3-0626-e4", max_tokens: int = 8192, temperature=0.8
|
43
|
+
) -> str:
|
44
|
+
"""
|
45
|
+
调用LLM模型进行一次推理
|
46
|
+
:param prompt: 输入的提示文本
|
47
|
+
:param model_name: 模型名称
|
48
|
+
:param max_tokens: 最大输出token数
|
49
|
+
:return: 模型的输出文本
|
50
|
+
"""
|
51
|
+
from openai import OpenAI
|
52
|
+
|
53
|
+
if isinstance(input, str):
|
54
|
+
message = [{"role": "user", "content": input}]
|
55
|
+
elif isinstance(input, list):
|
56
|
+
message = input
|
57
|
+
|
58
|
+
response = client.chat.completions.create(model=model_name, messages=message, max_tokens=max_tokens,temperature=temperature)
|
59
|
+
|
60
|
+
return response.choices[0].message.content
|
@@ -1,7 +1,7 @@
|
|
1
1
|
import os
|
2
2
|
from tqdm import tqdm
|
3
|
-
from openai import OpenAI
|
4
3
|
import concurrent.futures
|
4
|
+
import itertools
|
5
5
|
|
6
6
|
|
7
7
|
INFER_PARAS = {
|
@@ -15,14 +15,17 @@ INFER_PARAS = {
|
|
15
15
|
|
16
16
|
|
17
17
|
def parse_infer_data(infer_data: list):
|
18
|
+
# 解释一下为什么要[][],因为message本来就必须得是[]
|
18
19
|
if isinstance(infer_data[0], str):
|
19
|
-
message = [{"role": "user", "content": i} for i in infer_data]
|
20
|
+
message = [[{"role": "user", "content": i}] for i in infer_data]
|
20
21
|
elif isinstance(infer_data[0], list):
|
21
22
|
message = infer_data
|
22
23
|
return message
|
23
24
|
|
24
25
|
|
25
|
-
def common_api_infer_func(model_name, infer_data: list, infer_paras, client
|
26
|
+
def common_api_infer_func(model_name, infer_data: list, infer_paras, client):
|
27
|
+
from openai import OpenAI
|
28
|
+
|
26
29
|
"""
|
27
30
|
infer_data: list of messages/prompt
|
28
31
|
"""
|
@@ -31,16 +34,58 @@ def common_api_infer_func(model_name, infer_data: list, infer_paras, client: Ope
|
|
31
34
|
def get_response(model_name, messages, infer_paras):
|
32
35
|
responses = []
|
33
36
|
infer_times = infer_paras.get("infer_times", 1)
|
37
|
+
|
34
38
|
for _ in range(infer_times):
|
35
39
|
# 使用OpenAI API进行推理
|
36
|
-
response = client.chat.completions.create(
|
40
|
+
response = client.chat.completions.create(
|
41
|
+
model=model_name,
|
42
|
+
messages=messages,
|
43
|
+
temperature=infer_paras.get("temperature", 0.7),
|
44
|
+
max_tokens=infer_paras.get("max_tokens", 8192),
|
45
|
+
)
|
37
46
|
text = response.choices[0].message.content
|
38
47
|
responses.append({"text": text})
|
39
48
|
return responses
|
40
49
|
|
41
50
|
with concurrent.futures.ThreadPoolExecutor(16) as executor:
|
42
51
|
futures = [executor.submit(get_response, model_name, message, infer_paras) for message in messages]
|
43
|
-
results = [future.result() for future in concurrent.futures.as_completed(futures)]
|
52
|
+
# results = [future.result() for future in tqdm(concurrent.futures.as_completed(futures))] # 乱序
|
53
|
+
results = [future.result() for future in tqdm(futures)]
|
54
|
+
|
55
|
+
return results
|
56
|
+
|
57
|
+
|
58
|
+
def common_api_infer_func_multi_client(model_name, infer_data: list, infer_paras, clients: list):
|
59
|
+
"""
|
60
|
+
infer_data: list of messages/prompt
|
61
|
+
"""
|
62
|
+
messages = parse_infer_data(infer_data)
|
63
|
+
iter_cycle = itertools.cycle(clients)
|
64
|
+
|
65
|
+
def get_response(model_name, messages, infer_paras):
|
66
|
+
client = next(iter_cycle)
|
67
|
+
# print(client.base_url)
|
68
|
+
responses = []
|
69
|
+
infer_times = infer_paras.get("infer_times", 1)
|
70
|
+
for _ in range(infer_times):
|
71
|
+
# 使用OpenAI API进行推理
|
72
|
+
try:
|
73
|
+
response = client.chat.completions.create(
|
74
|
+
model=model_name,
|
75
|
+
messages=messages,
|
76
|
+
temperature=infer_paras.get("temperature", 0.7),
|
77
|
+
max_tokens=infer_paras.get("max_tokens", 8192),
|
78
|
+
)
|
79
|
+
text = response.choices[0].message.content
|
80
|
+
except Exception as e:
|
81
|
+
print(e.__str__())
|
82
|
+
text = ""
|
83
|
+
responses.append({"text": text})
|
84
|
+
return responses
|
85
|
+
|
86
|
+
with concurrent.futures.ThreadPoolExecutor(128) as executor:
|
87
|
+
futures = [executor.submit(get_response, model_name, message, infer_paras) for message in messages]
|
88
|
+
results = [future.result() for future in tqdm(futures)]
|
44
89
|
|
45
90
|
return results
|
46
91
|
|