nlpertools 1.0.9__py3-none-any.whl → 1.0.10__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nlpertools/__init__.py +2 -2
- nlpertools/cli.py +19 -37
- nlpertools/dataprocess.py +12 -0
- nlpertools/llm/__init__.py +0 -0
- nlpertools/llm/call_llm_once.py +30 -0
- nlpertools/llm/infer.py +74 -0
- nlpertools/llm/price.py +13 -0
- nlpertools/ml.py +72 -59
- nlpertools/other.py +5 -2
- {nlpertools-1.0.9.dist-info → nlpertools-1.0.10.dist-info}/METADATA +25 -25
- {nlpertools-1.0.9.dist-info → nlpertools-1.0.10.dist-info}/RECORD +15 -11
- {nlpertools-1.0.9.dist-info → nlpertools-1.0.10.dist-info}/WHEEL +1 -1
- {nlpertools-1.0.9.dist-info → nlpertools-1.0.10.dist-info}/entry_points.txt +0 -0
- {nlpertools-1.0.9.dist-info → nlpertools-1.0.10.dist-info/licenses}/LICENSE +0 -0
- {nlpertools-1.0.9.dist-info → nlpertools-1.0.10.dist-info}/top_level.txt +0 -0
nlpertools/__init__.py
CHANGED
nlpertools/cli.py
CHANGED
@@ -2,39 +2,24 @@ import argparse
|
|
2
2
|
import os
|
3
3
|
import uuid
|
4
4
|
import sys
|
5
|
+
from .dataprocess import startwith
|
5
6
|
|
6
|
-
"""
|
7
|
-
如何Debug cli.py
|
8
|
-
"""
|
9
7
|
|
10
|
-
|
11
|
-
def git_push():
|
8
|
+
def run_git_command(command):
|
12
9
|
"""
|
13
|
-
|
10
|
+
循环执行git命令,直到成功
|
14
11
|
"""
|
12
|
+
print(command)
|
15
13
|
num = -1
|
16
|
-
while
|
17
|
-
num += 1
|
18
|
-
print("retry num: {}".format(num))
|
19
|
-
info = os.system("git push --set-upstream origin main")
|
20
|
-
print(str(info))
|
21
|
-
if not str(info).startswith("fatal"):
|
22
|
-
print("scucess")
|
23
|
-
break
|
24
|
-
|
25
|
-
|
26
|
-
def git_pull():
|
27
|
-
"""
|
28
|
-
针对国内提交github经常失败,自动提交
|
29
|
-
"""
|
30
|
-
num = -1
|
31
|
-
while 1:
|
14
|
+
while True:
|
32
15
|
num += 1
|
33
|
-
print("retry num: {}"
|
34
|
-
info = os.system(
|
16
|
+
print(f"retry num: {num}")
|
17
|
+
info = os.system(command)
|
35
18
|
print(str(info))
|
36
|
-
|
37
|
-
|
19
|
+
# 检查命令执行结果,若未出现错误则认为执行成功
|
20
|
+
if not startwith(str(info), ["fatal", "error", "128", "1"]):
|
21
|
+
print("success")
|
22
|
+
print(f"success info : ##{info}##")
|
38
23
|
break
|
39
24
|
|
40
25
|
|
@@ -108,22 +93,19 @@ def start_gpu_usage_notify_client():
|
|
108
93
|
|
109
94
|
|
110
95
|
def main():
|
111
|
-
parser = argparse.ArgumentParser(description="CLI tool for git operations and
|
112
|
-
parser.add_argument('
|
113
|
-
parser.add_argument('--gitpull', action='store_true', help='Perform git pull operation.')
|
96
|
+
parser = argparse.ArgumentParser(description="CLI tool for git operations and other functions.")
|
97
|
+
parser.add_argument('git_command', nargs='*', help='Any git command (e.g., push, pull)')
|
114
98
|
parser.add_argument('--mac_address', action='store_true', help='Get the MAC address.')
|
115
|
-
|
116
99
|
parser.add_argument('--get_2fa', action='store_true', help='Get the 2fa value.')
|
117
100
|
parser.add_argument('--get_2fa_key', type=str, help='Get the 2fa value.')
|
118
|
-
parser.add_argument('--monitor_gpu_cli', action='store_true', help='
|
119
|
-
parser.add_argument('--monitor_gpu_ser', action='store_true', help='
|
101
|
+
parser.add_argument('--monitor_gpu_cli', action='store_true', help='monitor gpu cli')
|
102
|
+
parser.add_argument('--monitor_gpu_ser', action='store_true', help='monitor gpu ser')
|
120
103
|
|
121
104
|
args = parser.parse_args()
|
122
105
|
|
123
|
-
if args.
|
124
|
-
|
125
|
-
|
126
|
-
git_pull()
|
106
|
+
if args.git_command:
|
107
|
+
git_cmd = " ".join(args.git_command)
|
108
|
+
run_git_command(git_cmd)
|
127
109
|
elif args.mac_address:
|
128
110
|
get_mac_address()
|
129
111
|
elif args.monitor_gpu_cli:
|
@@ -140,4 +122,4 @@ def main():
|
|
140
122
|
|
141
123
|
|
142
124
|
if __name__ == '__main__':
|
143
|
-
main()
|
125
|
+
main()
|
nlpertools/dataprocess.py
CHANGED
@@ -19,6 +19,18 @@ other_special_characters = (
|
|
19
19
|
"」﴾》"
|
20
20
|
)
|
21
21
|
|
22
|
+
def startwith(text: str, pattern_list: list) -> bool:
|
23
|
+
"""
|
24
|
+
判断text是否以pattern_list中的某个pattern开头
|
25
|
+
:param text:
|
26
|
+
:param pattern_list:
|
27
|
+
:return:
|
28
|
+
"""
|
29
|
+
for pattern in pattern_list:
|
30
|
+
if text.startswith(pattern):
|
31
|
+
return True
|
32
|
+
return False
|
33
|
+
|
22
34
|
|
23
35
|
class Pattern:
|
24
36
|
"""
|
File without changes
|
@@ -0,0 +1,30 @@
|
|
1
|
+
from ..io.file import readtxt_string, read_yaml
|
2
|
+
from tqdm import tqdm
|
3
|
+
import os
|
4
|
+
from openai import Openai
|
5
|
+
from typing import Optional, Union
|
6
|
+
|
7
|
+
"""
|
8
|
+
从你当前的项目里找到.key文件 获取url和key
|
9
|
+
"""
|
10
|
+
|
11
|
+
|
12
|
+
def call_once(
|
13
|
+
client: Openai, input: Optional[Union[str, list]], model_name: str = "qwen3-0626-e4", max_tokens: int = 8192
|
14
|
+
) -> str:
|
15
|
+
"""
|
16
|
+
调用LLM模型进行一次推理
|
17
|
+
:param prompt: 输入的提示文本
|
18
|
+
:param model_name: 模型名称
|
19
|
+
:param max_tokens: 最大输出token数
|
20
|
+
:return: 模型的输出文本
|
21
|
+
"""
|
22
|
+
|
23
|
+
if isinstance(input, str):
|
24
|
+
message = [{"role": "user", "content": input}]
|
25
|
+
elif isinstance(input, list):
|
26
|
+
message = input
|
27
|
+
|
28
|
+
response = client.chat.completions.create(model=model_name, messages=message, max_tokens=max_tokens)
|
29
|
+
|
30
|
+
return response.choices[0].message.content
|
nlpertools/llm/infer.py
ADDED
@@ -0,0 +1,74 @@
|
|
1
|
+
import os
|
2
|
+
from tqdm import tqdm
|
3
|
+
from openai import OpenAI
|
4
|
+
import concurrent.futures
|
5
|
+
|
6
|
+
|
7
|
+
INFER_PARAS = {
|
8
|
+
"temperature": 0.7,
|
9
|
+
"infer_times": 1,
|
10
|
+
"max_tokens": 8192,
|
11
|
+
"top_p": 0.95,
|
12
|
+
"top_k": 40,
|
13
|
+
"repetition_penalty": 1.0,
|
14
|
+
}
|
15
|
+
|
16
|
+
|
17
|
+
def parse_infer_data(infer_data: list):
|
18
|
+
if isinstance(infer_data[0], str):
|
19
|
+
message = [{"role": "user", "content": i} for i in infer_data]
|
20
|
+
elif isinstance(infer_data[0], list):
|
21
|
+
message = infer_data
|
22
|
+
return message
|
23
|
+
|
24
|
+
|
25
|
+
def common_api_infer_func(model_name, infer_data: list, infer_paras, client: OpenAI):
|
26
|
+
"""
|
27
|
+
infer_data: list of messages/prompt
|
28
|
+
"""
|
29
|
+
messages = parse_infer_data(infer_data)
|
30
|
+
|
31
|
+
def get_response(model_name, messages, infer_paras):
|
32
|
+
responses = []
|
33
|
+
infer_times = infer_paras.get("infer_times", 1)
|
34
|
+
for _ in range(infer_times):
|
35
|
+
# 使用OpenAI API进行推理
|
36
|
+
response = client.chat.completions.create(model=model_name, messages=messages, **infer_paras)
|
37
|
+
text = response.choices[0].message.content
|
38
|
+
responses.append({"text": text})
|
39
|
+
return responses
|
40
|
+
|
41
|
+
with concurrent.futures.ThreadPoolExecutor(16) as executor:
|
42
|
+
futures = [executor.submit(get_response, model_name, message, infer_paras) for message in messages]
|
43
|
+
results = [future.result() for future in concurrent.futures.as_completed(futures)]
|
44
|
+
|
45
|
+
return results
|
46
|
+
|
47
|
+
|
48
|
+
def common_vllm_infer_func(model_path, infer_data: list, infer_paras: dict):
|
49
|
+
"""
|
50
|
+
infer_data: list of messages/prompt
|
51
|
+
"""
|
52
|
+
messages = parse_infer_data(infer_data)
|
53
|
+
from vllm import LLM, SamplingParams
|
54
|
+
|
55
|
+
temperature = infer_paras.get("temperature", 0.7)
|
56
|
+
infer_times = infer_paras.get("infer_times", 1)
|
57
|
+
vllm_card_num = len(os.environ["CUDA_VISIBLE_DEVICES"].split(","))
|
58
|
+
|
59
|
+
llm = LLM(model=model_path, tensor_parallel_size=vllm_card_num, trust_remote_code=True, gpu_memory_utilization=0.85)
|
60
|
+
sampling_params = SamplingParams(
|
61
|
+
temperature=temperature,
|
62
|
+
n=infer_times,
|
63
|
+
max_tokens=8192,
|
64
|
+
# qwen3非思考模式推荐参数
|
65
|
+
# **infer_paras.get(template_name, {}),
|
66
|
+
# qwen3思考模式推荐参数
|
67
|
+
)
|
68
|
+
conversation = messages
|
69
|
+
outputs = llm.chat(conversation, sampling_params=sampling_params, use_tqdm=True)
|
70
|
+
return_texts = []
|
71
|
+
for idx, output in tqdm(enumerate(outputs)):
|
72
|
+
result = [{"text": i.text} for i in output.outputs]
|
73
|
+
return_texts.append(result)
|
74
|
+
return return_texts
|
nlpertools/llm/price.py
ADDED
@@ -0,0 +1,13 @@
|
|
1
|
+
def estimate_cost(input_token_num, output_token_num, example_num=1, input_price=1, output_price=4):
|
2
|
+
"""
|
3
|
+
估算成本
|
4
|
+
:param input_token_num: 输入token数量
|
5
|
+
:param output_token_num: 输出token数量
|
6
|
+
:param example_num: 示例数量
|
7
|
+
:param input_price: 输入token单价 / 1M
|
8
|
+
:param output_price: 输出token单价 / 1M
|
9
|
+
:return: 成本
|
10
|
+
"""
|
11
|
+
price = (input_token_num * input_price + output_token_num * output_price) * example_num / 1000000
|
12
|
+
print(f"Estimated cost: {price:.2f} 元")
|
13
|
+
return price
|
nlpertools/ml.py
CHANGED
@@ -2,9 +2,11 @@
|
|
2
2
|
import codecs
|
3
3
|
import os
|
4
4
|
import random
|
5
|
+
import itertools
|
5
6
|
|
6
7
|
from .io.dir import j_mkdir
|
7
8
|
from .io.file import readtxt_list_all_strip, writetxt_w_list, save_to_csv
|
9
|
+
|
8
10
|
# import numpy as np
|
9
11
|
# import seaborn as sns
|
10
12
|
# import torch
|
@@ -17,10 +19,11 @@ from .io.file import readtxt_list_all_strip, writetxt_w_list, save_to_csv
|
|
17
19
|
from .utils.package import *
|
18
20
|
|
19
21
|
|
20
|
-
def estimate_pass_at_k(num_samples:list, num_correct:list, k):
|
22
|
+
def estimate_pass_at_k(num_samples: list, num_correct: list, k):
|
21
23
|
"""
|
22
24
|
copy from https://huggingface.co/spaces/evaluate-metric/code_eval/blob/main/code_eval.py
|
23
25
|
num_samples: list
|
26
|
+
Note: if num sample < k, acc = 1, it's incomprehensibly
|
24
27
|
"""
|
25
28
|
"""Estimates pass@k of each problem and returns them in an array."""
|
26
29
|
|
@@ -39,8 +42,21 @@ def estimate_pass_at_k(num_samples:list, num_correct:list, k):
|
|
39
42
|
return np.array([estimator(int(n), int(c), k) for n, c in zip(num_samples_it, num_correct)])
|
40
43
|
|
41
44
|
|
45
|
+
def estimate_pass_at_k_fixed(num_samples: list, num_correct: list, k):
|
46
|
+
"""
|
47
|
+
优化了num_samples小于 k的情况
|
48
|
+
"""
|
49
|
+
num_samples = [k if i < k else i for i in num_samples]
|
50
|
+
return estimate_pass_at_k(num_samples, num_correct, k)
|
51
|
+
|
52
|
+
|
53
|
+
def estimate_pass_at_k_return_num(num_samples: list, num_correct: list, k):
|
54
|
+
"""直接返回求完平均的"""
|
55
|
+
return round(estimate_pass_at_k(num_samples, num_correct, k).mean() * 100, 2)
|
56
|
+
|
57
|
+
|
42
58
|
def calc_llm_train_activation_memory(
|
43
|
-
|
59
|
+
model_name, sequence_length, batch_size, hidden_dim, lay_number, attention_heads_num, gpu_num=1
|
44
60
|
):
|
45
61
|
"""
|
46
62
|
return bytes
|
@@ -54,18 +70,19 @@ def calc_llm_train_activation_memory(
|
|
54
70
|
# FFN
|
55
71
|
# Layer Norm
|
56
72
|
r1 = (
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
73
|
+
sequence_length
|
74
|
+
* batch_size
|
75
|
+
* hidden_dim
|
76
|
+
* lay_number
|
77
|
+
* (34 + 5 * attention_heads_num * sequence_length / hidden_dim)
|
62
78
|
)
|
63
79
|
# reference2
|
64
80
|
r2 = (
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
81
|
+
lay_number
|
82
|
+
* (2 * sequence_length * attention_heads_num + 16 * hidden_dim)
|
83
|
+
* sequence_length
|
84
|
+
* batch_size
|
85
|
+
/ gpu_num
|
69
86
|
)
|
70
87
|
print(r1)
|
71
88
|
print(r2)
|
@@ -100,9 +117,7 @@ class DataStructure:
|
|
100
117
|
"source": "baidu",
|
101
118
|
}
|
102
119
|
ner_input_example = "这句话一共有两个实体分别为大象和老鼠。"
|
103
|
-
ner_label_example = (
|
104
|
-
list("OOOOOOOOOOOOO") + ["B-s", "I-s"] + ["O"] + ["B-o", "I-o"] + ["O"]
|
105
|
-
)
|
120
|
+
ner_label_example = list("OOOOOOOOOOOOO") + ["B-s", "I-s"] + ["O"] + ["B-o", "I-o"] + ["O"]
|
106
121
|
|
107
122
|
|
108
123
|
def text_jaccard(ipt1, ipt2, ipt_level="char", sim_level="char"):
|
@@ -156,7 +171,7 @@ class STEM(object):
|
|
156
171
|
if each_srl:
|
157
172
|
args = []
|
158
173
|
for arg in each_srl:
|
159
|
-
args.extend(seg[arg[1]: arg[2] + 1])
|
174
|
+
args.extend(seg[arg[1] : arg[2] + 1])
|
160
175
|
# 添加上谓词
|
161
176
|
args.insert(each_srl[0][2] - each_srl[0][1] + 1, seg[wdx])
|
162
177
|
events.append(args)
|
@@ -195,7 +210,7 @@ def subject_object_labeling(spo_list, text):
|
|
195
210
|
q_list_length = len(q_list)
|
196
211
|
k_list_length = len(k_list)
|
197
212
|
for idx in range(k_list_length - q_list_length + 1):
|
198
|
-
t = [q == k for q, k in zip(q_list, k_list[idx: idx + q_list_length])]
|
213
|
+
t = [q == k for q, k in zip(q_list, k_list[idx : idx + q_list_length])]
|
199
214
|
# print(idx, t)
|
200
215
|
if all(t):
|
201
216
|
# print(idx)
|
@@ -208,9 +223,7 @@ def subject_object_labeling(spo_list, text):
|
|
208
223
|
if len(spo) == 2:
|
209
224
|
labeling_list[idx_start + 1] = "I-" + spo_type
|
210
225
|
elif len(spo) >= 3:
|
211
|
-
labeling_list[idx_start + 1: idx_start + len(spo)] = ["I-" + spo_type] * (
|
212
|
-
len(spo) - 1
|
213
|
-
)
|
226
|
+
labeling_list[idx_start + 1 : idx_start + len(spo)] = ["I-" + spo_type] * (len(spo) - 1)
|
214
227
|
else:
|
215
228
|
pass
|
216
229
|
|
@@ -219,7 +232,7 @@ def subject_object_labeling(spo_list, text):
|
|
219
232
|
# count = 0
|
220
233
|
for predicate, spo_list_form in spo_predicate_dict.items():
|
221
234
|
if predicate in text:
|
222
|
-
for
|
235
|
+
for spo_subject, spo_object in spo_list_form:
|
223
236
|
# if predicate not in spo_subject and predicate not in spo_object:
|
224
237
|
_labeling_type(spo_subject, "SUB")
|
225
238
|
_labeling_type(spo_object, "OBJ")
|
@@ -241,10 +254,7 @@ def label(text, labels):
|
|
241
254
|
:return:
|
242
255
|
"""
|
243
256
|
train_sequence = "\n".join(
|
244
|
-
[
|
245
|
-
"\t".join(i) if i[0] != " " else "[null]\t{}".format(i[1])
|
246
|
-
for i in zip(list(text), labels)
|
247
|
-
]
|
257
|
+
["\t".join(i) if i[0] != " " else "[null]\t{}".format(i[1]) for i in zip(list(text), labels)]
|
248
258
|
)
|
249
259
|
return train_sequence
|
250
260
|
|
@@ -260,16 +270,12 @@ def convert_crf_format_10_fold(corpus, objdir_path):
|
|
260
270
|
split_position = int(len(corpus) / 10)
|
261
271
|
for k in range(0, 10):
|
262
272
|
if k == 9:
|
263
|
-
dev_set = corpus[k * split_position:]
|
273
|
+
dev_set = corpus[k * split_position :]
|
264
274
|
train_set = corpus[: k * split_position]
|
265
275
|
else:
|
266
|
-
dev_set = corpus[k * split_position: (k + 1) * split_position]
|
267
|
-
train_set = (
|
268
|
-
|
269
|
-
)
|
270
|
-
writetxt_w_list(
|
271
|
-
train_set, os.path.join(objdir_path, "train{}.txt".format(k + 1))
|
272
|
-
)
|
276
|
+
dev_set = corpus[k * split_position : (k + 1) * split_position]
|
277
|
+
train_set = corpus[: k * split_position] + corpus[(k + 1) * split_position :]
|
278
|
+
writetxt_w_list(train_set, os.path.join(objdir_path, "train{}.txt".format(k + 1)))
|
273
279
|
writetxt_w_list(dev_set, os.path.join(objdir_path, "test{}.txt".format(k + 1)))
|
274
280
|
writetxt_w_list(dev_set, os.path.join(objdir_path, "dev{}.txt".format(k + 1)))
|
275
281
|
|
@@ -305,31 +311,19 @@ def read_seq_res(path, labels):
|
|
305
311
|
return text, raw_label, predict_label
|
306
312
|
|
307
313
|
|
308
|
-
def kfold_txt(corpus, path, k=9, is_shuffle=True):
|
309
|
-
"""
|
310
|
-
k是10份中训练集占了几份
|
311
|
-
"""
|
312
|
-
j_mkdir(path)
|
313
|
-
if is_shuffle:
|
314
|
-
random.shuffle(corpus)
|
315
|
-
split_position = int(len(corpus) / 10)
|
316
|
-
train_set, dev_set = corpus[: k * split_position], corpus[k * split_position:]
|
317
|
-
writetxt_w_list(train_set, os.path.join(path, "train.tsv"), num_lf=1)
|
318
|
-
writetxt_w_list(dev_set, os.path.join(path, "test.tsv"), num_lf=1)
|
319
|
-
writetxt_w_list(dev_set, os.path.join(path, "dev.tsv"), num_lf=1)
|
320
|
-
|
321
|
-
|
322
314
|
def sample():
|
323
315
|
import pandas as pd
|
324
316
|
from sklearn.model_selection import StratifiedShuffleSplit
|
325
317
|
|
326
318
|
# 假设 df 是你的 DataFrame
|
327
319
|
|
328
|
-
df = pd.DataFrame(
|
329
|
-
|
330
|
-
|
331
|
-
|
332
|
-
|
320
|
+
df = pd.DataFrame(
|
321
|
+
{
|
322
|
+
"count_line": [i for i in range(100)],
|
323
|
+
"x": [i for i in range(100)],
|
324
|
+
"y": [i // 10 for i in range(100)],
|
325
|
+
}
|
326
|
+
)
|
333
327
|
print(df)
|
334
328
|
# count_line 是用于分层抽样的字段
|
335
329
|
|
@@ -337,7 +331,7 @@ def sample():
|
|
337
331
|
split = StratifiedShuffleSplit(n_splits=1, test_size=0.1, random_state=42)
|
338
332
|
|
339
333
|
# 获取训练集和测试集的索引
|
340
|
-
train_index, test_index = next(split.split(df, df[
|
334
|
+
train_index, test_index = next(split.split(df, df["y"]))
|
341
335
|
|
342
336
|
# 根据索引划分训练集和测试集
|
343
337
|
train_df = df.loc[train_index]
|
@@ -348,6 +342,27 @@ def sample():
|
|
348
342
|
print("测试集行数:", len(test_df))
|
349
343
|
|
350
344
|
|
345
|
+
def kfold_txt(corpus, path, k=9, is_shuffle=True):
|
346
|
+
"""
|
347
|
+
k是10份中训练集占了几份
|
348
|
+
"""
|
349
|
+
j_mkdir(path)
|
350
|
+
if is_shuffle:
|
351
|
+
random.shuffle(corpus)
|
352
|
+
split_position = int(len(corpus) / 10)
|
353
|
+
train_set, dev_set = corpus[: k * split_position], corpus[k * split_position :]
|
354
|
+
writetxt_w_list(train_set, os.path.join(path, "train.tsv"), num_lf=1)
|
355
|
+
writetxt_w_list(dev_set, os.path.join(path, "test.tsv"), num_lf=1)
|
356
|
+
writetxt_w_list(dev_set, os.path.join(path, "dev.tsv"), num_lf=1)
|
357
|
+
|
358
|
+
|
359
|
+
def kfold_list(list_data):
|
360
|
+
"""
|
361
|
+
sklearn.model_selection.train_test_split
|
362
|
+
"""
|
363
|
+
pass
|
364
|
+
|
365
|
+
|
351
366
|
def kfold_df(df, save_dir=None):
|
352
367
|
"""
|
353
368
|
划分train test val集, 写为windows可读的csv。
|
@@ -360,9 +375,7 @@ def kfold_df(df, save_dir=None):
|
|
360
375
|
|
361
376
|
train_idx, test_and_val_idx = KFold(n_splits=8, shuffle=True).split(df).__next__()
|
362
377
|
df_test_and_val = df.iloc[test_and_val_idx]
|
363
|
-
test_idx, val_idx = (
|
364
|
-
KFold(n_splits=2, shuffle=True).split(df_test_and_val).__next__()
|
365
|
-
)
|
378
|
+
test_idx, val_idx = KFold(n_splits=2, shuffle=True).split(df_test_and_val).__next__()
|
366
379
|
df_train = df.iloc[train_idx]
|
367
380
|
df_val = df.iloc[val_idx]
|
368
381
|
df_test = df.iloc[test_idx]
|
@@ -439,7 +452,7 @@ def split_sentence(sentence, language="chinese", cross_line=True):
|
|
439
452
|
for idx, char in enumerate(sentence):
|
440
453
|
if idx == len(sentence) - 1:
|
441
454
|
if char in split_signs:
|
442
|
-
sentences.append(sentence[start_idx: idx + 1].strip())
|
455
|
+
sentences.append(sentence[start_idx : idx + 1].strip())
|
443
456
|
start_idx = idx + 1
|
444
457
|
else:
|
445
458
|
sentences.append(sentence[start_idx:].strip())
|
@@ -449,10 +462,10 @@ def split_sentence(sentence, language="chinese", cross_line=True):
|
|
449
462
|
if idx < len(sentence) - 2:
|
450
463
|
# 处理。”。
|
451
464
|
if sentence[idx + 2] not in split_signs:
|
452
|
-
sentences.append(sentence[start_idx: idx + 2].strip())
|
465
|
+
sentences.append(sentence[start_idx : idx + 2].strip())
|
453
466
|
start_idx = idx + 2
|
454
467
|
elif sentence[idx + 1] not in split_signs:
|
455
|
-
sentences.append(sentence[start_idx: idx + 1].strip())
|
468
|
+
sentences.append(sentence[start_idx : idx + 1].strip())
|
456
469
|
start_idx = idx + 1
|
457
470
|
return sentences
|
458
471
|
|
@@ -528,6 +541,6 @@ if __name__ == "__main__":
|
|
528
541
|
hidden_dim=4096,
|
529
542
|
lay_number=28,
|
530
543
|
attention_heads_num=32,
|
531
|
-
gpu_num=1
|
544
|
+
gpu_num=1,
|
532
545
|
)
|
533
546
|
print(res, "G")
|
nlpertools/other.py
CHANGED
@@ -169,8 +169,11 @@ def jprint(obj, depth=0):
|
|
169
169
|
print(obj)
|
170
170
|
|
171
171
|
|
172
|
-
def print_split(sign="=", num=20):
|
173
|
-
|
172
|
+
def print_split(sign="=", num=20, char: str = None):
|
173
|
+
if char:
|
174
|
+
print(sign * num // 2, char, sign * num // 2)
|
175
|
+
else:
|
176
|
+
print(sign * num)
|
174
177
|
|
175
178
|
|
176
179
|
def seed_everything():
|
@@ -1,6 +1,6 @@
|
|
1
|
-
Metadata-Version: 2.
|
1
|
+
Metadata-Version: 2.4
|
2
2
|
Name: nlpertools
|
3
|
-
Version: 1.0.
|
3
|
+
Version: 1.0.10
|
4
4
|
Summary: A small package about small basic IO operation when coding
|
5
5
|
Home-page: https://github.com/lvzii/nlpertools
|
6
6
|
Author: youshuJi
|
@@ -15,8 +15,10 @@ License-File: LICENSE
|
|
15
15
|
Requires-Dist: numpy
|
16
16
|
Requires-Dist: pandas
|
17
17
|
Requires-Dist: psutil
|
18
|
+
Requires-Dist: openai
|
18
19
|
Provides-Extra: torch
|
19
20
|
Requires-Dist: torch; extra == "torch"
|
21
|
+
Dynamic: license-file
|
20
22
|
Dynamic: provides-extra
|
21
23
|
Dynamic: requires-dist
|
22
24
|
|
@@ -50,6 +52,23 @@ nlpertools
|
|
50
52
|
|
51
53
|
```
|
52
54
|
|
55
|
+
# 最常用/喜欢的功能(使用示例)
|
56
|
+
```python
|
57
|
+
# 读txt, json文件
|
58
|
+
import nlpertools
|
59
|
+
|
60
|
+
txt_data = nlpertools.readtxt_list_all_strip('res.txt')
|
61
|
+
json_data = nlpertools.load_from_json('res.json')
|
62
|
+
```
|
63
|
+
|
64
|
+
```bash
|
65
|
+
## git, 连接github不稳定的时候非常有用
|
66
|
+
ncli git pull
|
67
|
+
|
68
|
+
# 生成pypi双因素认证的实时密钥(需要提供key)
|
69
|
+
ncli --get_2fa --get_2fa_key your_key
|
70
|
+
```
|
71
|
+
|
53
72
|
# 安装
|
54
73
|
|
55
74
|
Install the latest release version
|
@@ -101,30 +120,7 @@ https://nlpertools.readthedocs.io/en/latest/
|
|
101
120
|
|
102
121
|
一些可能需要配置才能用的函数,写上示例
|
103
122
|
|
104
|
-
## 使用示例
|
105
|
-
|
106
|
-
```python
|
107
|
-
import nlpertools
|
108
|
-
|
109
|
-
a = nlpertools.readtxt_list_all_strip('res.txt')
|
110
|
-
# 或
|
111
|
-
b = nlpertools.io.file.readtxt_list_all_strip('res.txt')
|
112
|
-
```
|
113
123
|
|
114
|
-
```bash
|
115
|
-
# 生成pypi双因素认证的实时密钥(需要提供key)
|
116
|
-
python -m nlpertools.cli --get_2fa --get_2fa_key your_key
|
117
|
-
|
118
|
-
## git
|
119
|
-
python -m nlpertools.cli --git_push
|
120
|
-
python -m nlpertools.cli --git_pull
|
121
|
-
|
122
|
-
# 以下功能被nvitop替代,不推荐使用
|
123
|
-
## 监控gpu显存
|
124
|
-
python -m nlpertools.monitor.gpu
|
125
|
-
## 监控cpu
|
126
|
-
python -m nlpertools.monitor.memory
|
127
|
-
```
|
128
124
|
|
129
125
|
## 一些常用项目
|
130
126
|
|
@@ -132,3 +128,7 @@ nvitop
|
|
132
128
|
|
133
129
|
ydata-profiling
|
134
130
|
|
131
|
+
## 贡献
|
132
|
+
|
133
|
+
https://github.com/bigscience-workshop/data-preparation
|
134
|
+
|
@@ -1,14 +1,14 @@
|
|
1
|
-
nlpertools/__init__.py,sha256=
|
2
|
-
nlpertools/cli.py,sha256=
|
1
|
+
nlpertools/__init__.py,sha256=3tjuCeGz_Q2DAGXn2K6n58YEQ8dpwGx4yg_rh_npw9M,502
|
2
|
+
nlpertools/cli.py,sha256=LlHZV9x9ZeqC9rILG4aYmNM2PymdkzYVc7lcbu1tMRw,3615
|
3
3
|
nlpertools/data_client.py,sha256=esX8lUQrTui4uVkqPfhpHVok7Eq6ywpuemKjLeqoglc,14674
|
4
|
-
nlpertools/dataprocess.py,sha256=
|
4
|
+
nlpertools/dataprocess.py,sha256=3ayCZAFc5t-Ov06oenRhMoGmnQrmCy-gtPhswecjEa4,23451
|
5
5
|
nlpertools/default_db_config.yml,sha256=E1K9k_xzXVlsf-HJQh8kyHXHYuvTpD12jD4Hfe5rUk8,606
|
6
6
|
nlpertools/get_2fa.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
7
|
-
nlpertools/ml.py,sha256=
|
7
|
+
nlpertools/ml.py,sha256=fjI-WMM1lRnOnRFfTLEGplUx_Uamgr2gfmoAvGlgF7E,18994
|
8
8
|
nlpertools/movie.py,sha256=rkyOnAXdsbWfMSbi1sE1VNRT7f66Hp9BnZsN_58Afmw,897
|
9
9
|
nlpertools/nlpertools_config.yml,sha256=ksXejxFs7pxR47tNAsrN88_4gvq9PCA2ZMO07H-dJXY,26
|
10
10
|
nlpertools/open_api.py,sha256=uyTY00OUlM57Cn0Wm0yZXcIS8vAszy9rKnDMBEWfWJM,1744
|
11
|
-
nlpertools/other.py,sha256=
|
11
|
+
nlpertools/other.py,sha256=WWUPwdBkRQrWpsmAMOYBm6GFFnKlyN1ANlFx5bLkj8s,15125
|
12
12
|
nlpertools/pic.py,sha256=13aaFJh3USGYGs4Y9tAKTvWjmdQR4YDjl3LlIhJheOA,9906
|
13
13
|
nlpertools/plugin.py,sha256=LB7j9GdoQi6TITddH-6EglHlOa0WIHLUT7X5vb_aIZY,1168
|
14
14
|
nlpertools/reminder.py,sha256=wiXwZQmxMck5vY3EvG8_oakP3FAdjGTikAIOiTPUQrs,2977
|
@@ -30,6 +30,10 @@ nlpertools/draw/math_func.py,sha256=0NQ22Dfi9DFG6Bg_hXnCT27w65-dqpOOIgZX7oUIW-Q,
|
|
30
30
|
nlpertools/io/__init__.py,sha256=YMuKtC2Ddh5dL5MvXjyUKYOOuqzFYUhBPFaP2kyFG9I,68
|
31
31
|
nlpertools/io/dir.py,sha256=FPY62COQN8Ji72pk0dYRoXkrORYaUlybKNcL4474uUI,2263
|
32
32
|
nlpertools/io/file.py,sha256=mLWl09IEi0rWPN4tTq3LwdYMvAjj4e_QsjEMhufuPPo,7192
|
33
|
+
nlpertools/llm/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
34
|
+
nlpertools/llm/call_llm_once.py,sha256=vswnPDZmDZO2Gz2U1m7X7OhaCHUkyVnEzDy4g7CQhVU,856
|
35
|
+
nlpertools/llm/infer.py,sha256=u9DbopRY1-xQymcNGucsnVwo9Bgyrqg2ncWlK1f00rA,2483
|
36
|
+
nlpertools/llm/price.py,sha256=8zzEaLrbGiDUbTFSnuBGAduiSfDVXQUk4Oc_lE6eJFw,544
|
33
37
|
nlpertools/monitor/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
34
38
|
nlpertools/monitor/gpu.py,sha256=M59O6i0hlew7AzXZlaVZqbZA5IR93OhBY2WI0-T_HtY,531
|
35
39
|
nlpertools/monitor/memory.py,sha256=9t6q9BC8VVx4o3G4sBCn7IoQRx272zMPjSnL3yvTBAQ,657
|
@@ -40,10 +44,10 @@ nlpertools/utils/log_util.py,sha256=ftJDoTOtroLH-LadOygZljeyltOQn0D2Xb5x7Td1Qdg,
|
|
40
44
|
nlpertools/utils/package.py,sha256=wLg_M8j7Y6ReRjWHWCWoZJHrzEwuAr9TyG2jvb7OQCo,3261
|
41
45
|
nlpertools/utils/package_v1.py,sha256=sqgFb-zbTdMd5ziJLY6YUPqR49qUNZjxBH35DnyR5Wg,3542
|
42
46
|
nlpertools/utils/package_v2.py,sha256=WOcsguWfUd4XSAfmPgCtL8HtUbqJ6GRSMHb0OsB47r0,3932
|
47
|
+
nlpertools-1.0.10.dist-info/licenses/LICENSE,sha256=SBcMozykvTbZJ--MqSiKUmHLLROdnr25V70xCQgEwqw,11331
|
43
48
|
nlpertools_helper/__init__.py,sha256=obxRUdZDctvcvK_iA1Dx2HmQFMlMzJto-xDPryq1lJ0,198
|
44
|
-
nlpertools-1.0.
|
45
|
-
nlpertools-1.0.
|
46
|
-
nlpertools-1.0.
|
47
|
-
nlpertools-1.0.
|
48
|
-
nlpertools-1.0.
|
49
|
-
nlpertools-1.0.9.dist-info/RECORD,,
|
49
|
+
nlpertools-1.0.10.dist-info/METADATA,sha256=z6WqwEQxdq4xOF3Pw8QXMcrckcMTYfaeRyEqs0aM428,3304
|
50
|
+
nlpertools-1.0.10.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
51
|
+
nlpertools-1.0.10.dist-info/entry_points.txt,sha256=XEazQ4vUwJMoMAgAwk1Lq4PRQGklPkPBaFkiP0zN_JE,45
|
52
|
+
nlpertools-1.0.10.dist-info/top_level.txt,sha256=_4q4MIFvMr4cAUbhWKWYdRXIXsF4PJDg4BUsZvgk94s,29
|
53
|
+
nlpertools-1.0.10.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|