nlpertools 1.0.9__py3-none-any.whl → 1.0.11__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nlpertools/__init__.py +3 -2
- nlpertools/cli.py +26 -47
- nlpertools/dataprocess/__init__.py +1 -0
- nlpertools/dataprocess/dedupl.py +9 -0
- nlpertools/{dataprocess.py → dataprocess/dp_main.py} +13 -1
- nlpertools/io/dir.py +25 -5
- nlpertools/io/file.py +46 -43
- nlpertools/llm/__init__.py +3 -0
- nlpertools/llm/call_llm_once.py +60 -0
- nlpertools/llm/infer.py +119 -0
- nlpertools/llm/price.py +13 -0
- nlpertools/ml.py +72 -59
- nlpertools/other.py +82 -53
- nlpertools/utils/package.py +9 -10
- nlpertools/wrapper.py +6 -4
- {nlpertools-1.0.9.dist-info → nlpertools-1.0.11.dist-info}/METADATA +27 -25
- {nlpertools-1.0.9.dist-info → nlpertools-1.0.11.dist-info}/RECORD +21 -15
- {nlpertools-1.0.9.dist-info → nlpertools-1.0.11.dist-info}/WHEEL +1 -1
- {nlpertools-1.0.9.dist-info → nlpertools-1.0.11.dist-info}/entry_points.txt +0 -0
- {nlpertools-1.0.9.dist-info → nlpertools-1.0.11.dist-info/licenses}/LICENSE +0 -0
- {nlpertools-1.0.9.dist-info → nlpertools-1.0.11.dist-info}/top_level.txt +0 -0
nlpertools/llm/infer.py
ADDED
@@ -0,0 +1,119 @@
|
|
1
|
+
import os
|
2
|
+
from tqdm import tqdm
|
3
|
+
import concurrent.futures
|
4
|
+
import itertools
|
5
|
+
|
6
|
+
|
7
|
+
# Sampling parameters keyed by name.
# NOTE(review): presumably meant as the default ``infer_paras`` argument for the
# inference helpers; nothing in the visible code reads this constant -- confirm.
INFER_PARAS = dict(
    temperature=0.7,
    infer_times=1,
    max_tokens=8192,
    top_p=0.95,
    top_k=40,
    repetition_penalty=1.0,
)
|
15
|
+
|
16
|
+
|
17
|
+
def parse_infer_data(infer_data: list):
    """Normalize inference inputs into a list of chat conversations.

    A chat ``messages`` payload must itself be a list, so a list of plain
    prompts becomes a list of single-message lists (hence the nested ``[[...]]``).
    The type of the first element decides how the whole input is treated.

    :param infer_data: a list of prompt strings, or a list of conversations
        (each already a list of messages; passed through unchanged).
    :return: a list of conversations; an empty input yields an empty list.
    :raises TypeError: if the first element is neither ``str`` nor ``list``.
    """
    # Guard the empty case: indexing element 0 below would raise IndexError.
    if len(infer_data) == 0:
        return []

    first = infer_data[0]
    if isinstance(first, str):
        return [[{"role": "user", "content": prompt}] for prompt in infer_data]
    if isinstance(first, list):
        return infer_data

    # Fail with a clear message instead of an UnboundLocalError further down.
    raise TypeError(
        "infer_data must be a list of str prompts or a list of message lists, "
        "got element of type {}".format(type(first).__name__)
    )
|
24
|
+
|
25
|
+
|
26
|
+
def common_api_infer_func(model_name, infer_data: list, infer_paras, client, max_workers=16):
    """Run chat-completion inference for every item of ``infer_data`` through one client.

    Requests are issued concurrently from a thread pool. Errors raised by the
    client are not caught here; they propagate out of ``future.result()``.

    :param model_name: model identifier forwarded as ``model`` to the client.
    :param infer_data: list of messages/prompt (see ``parse_infer_data``).
    :param infer_paras: dict of parameters; reads ``infer_times`` (default 1),
        ``temperature`` (default 0.7) and ``max_tokens`` (default 8192).
    :param client: object exposing ``chat.completions.create(...)``
        (presumably an OpenAI-compatible client -- confirm against callers).
    :param max_workers: size of the thread pool (default 16).
    :return: one entry per input, in input order; each entry is a list of
        ``{"text": ...}`` dicts with ``infer_times`` elements.
    """
    conversations = parse_infer_data(infer_data)

    def get_response(conversation):
        # Query the same conversation ``infer_times`` times and collect the texts.
        infer_times = infer_paras.get("infer_times", 1)
        responses = []
        for _ in range(infer_times):
            response = client.chat.completions.create(
                model=model_name,
                messages=conversation,
                temperature=infer_paras.get("temperature", 0.7),
                max_tokens=infer_paras.get("max_tokens", 8192),
            )
            responses.append({"text": response.choices[0].message.content})
        return responses

    with concurrent.futures.ThreadPoolExecutor(max_workers) as executor:
        futures = [executor.submit(get_response, conversation) for conversation in conversations]
        # Iterate the futures in submission order (not as_completed, which would
        # yield them out of order) so results line up with infer_data.
        results = [future.result() for future in tqdm(futures)]

    return results
|
56
|
+
|
57
|
+
|
58
|
+
def common_api_infer_func_multi_client(model_name, infer_data: list, infer_paras, clients: list, max_workers=128):
    """Run chat-completion inference, spreading the inputs over several clients.

    Inputs are assigned to clients round-robin. A request that raises is
    reported with ``print`` and recorded as an empty text, so one failure does
    not abort the whole batch.

    :param model_name: model identifier forwarded as ``model`` to each client.
    :param infer_data: list of messages/prompt (see ``parse_infer_data``).
    :param infer_paras: dict of parameters; reads ``infer_times`` (default 1),
        ``temperature`` (default 0.7) and ``max_tokens`` (default 8192).
    :param clients: non-empty list of objects exposing
        ``chat.completions.create(...)``.
    :param max_workers: size of the thread pool (default 128).
    :return: one entry per input, in input order; each entry is a list of
        ``{"text": ...}`` dicts with ``infer_times`` elements.
    :raises ValueError: if ``clients`` is empty.
    """
    if not clients:
        raise ValueError("clients must contain at least one client")

    conversations = parse_infer_data(infer_data)

    def get_response(conversation, client):
        # All ``infer_times`` repetitions of one conversation use the same client.
        infer_times = infer_paras.get("infer_times", 1)
        responses = []
        for _ in range(infer_times):
            try:
                response = client.chat.completions.create(
                    model=model_name,
                    messages=conversation,
                    temperature=infer_paras.get("temperature", 0.7),
                    max_tokens=infer_paras.get("max_tokens", 8192),
                )
                text = response.choices[0].message.content
            except Exception as e:
                # Best effort: report the failure and keep an empty placeholder.
                print(str(e))
                text = ""
            responses.append({"text": text})
        return responses

    with concurrent.futures.ThreadPoolExecutor(max_workers) as executor:
        # Pick the client here, in the submitting thread, so the round-robin
        # iterator is never advanced concurrently by worker threads.
        futures = [
            executor.submit(get_response, conversation, client)
            for conversation, client in zip(conversations, itertools.cycle(clients))
        ]
        # Submission order keeps results aligned with infer_data.
        results = [future.result() for future in tqdm(futures)]

    return results
|
91
|
+
|
92
|
+
|
93
|
+
def common_vllm_infer_func(model_path, infer_data: list, infer_paras: dict):
    """Run offline chat inference with vLLM for every item of ``infer_data``.

    :param model_path: value passed as ``model`` to ``vllm.LLM``.
    :param infer_data: list of messages/prompt (see ``parse_infer_data``).
    :param infer_paras: dict of parameters; reads ``temperature`` (default 0.7),
        ``infer_times`` (default 1, used as the number of samples ``n``) and
        ``max_tokens`` (default 8192). Other keys are not forwarded.
    :return: one entry per vLLM output; each entry is a list of
        ``{"text": ...}`` dicts, one per generated sample.
    """
    conversations = parse_infer_data(infer_data)
    # vLLM is imported only when this function is actually called.
    from vllm import LLM, SamplingParams

    temperature = infer_paras.get("temperature", 0.7)
    infer_times = infer_paras.get("infer_times", 1)
    # Honour the caller's max_tokens, consistent with the API-based helpers.
    max_tokens = infer_paras.get("max_tokens", 8192)

    # Tensor-parallel size = number of devices listed in CUDA_VISIBLE_DEVICES.
    # Fall back to 1 when the variable is unset or empty instead of raising KeyError.
    visible_devices = os.environ.get("CUDA_VISIBLE_DEVICES", "")
    vllm_card_num = len([dev for dev in visible_devices.split(",") if dev.strip()]) or 1

    llm = LLM(model=model_path, tensor_parallel_size=vllm_card_num, trust_remote_code=True, gpu_memory_utilization=0.85)
    sampling_params = SamplingParams(
        temperature=temperature,
        n=infer_times,
        max_tokens=max_tokens,
    )
    outputs = llm.chat(conversations, sampling_params=sampling_params, use_tqdm=True)

    # Flatten each request's samples into plain {"text": ...} dicts.
    return_texts = [[{"text": sample.text} for sample in output.outputs] for output in tqdm(outputs)]
    return return_texts
|
nlpertools/llm/price.py
ADDED
@@ -0,0 +1,13 @@
|
|
1
|
+
def estimate_cost(input_token_num, output_token_num, example_num=1, input_price=1, output_price=4):
    """Estimate the cost of a run and print it.

    The token counts are priced, multiplied by ``example_num`` and scaled from
    a per-million-token price to an absolute amount.

    :param input_token_num: number of input tokens
    :param output_token_num: number of output tokens
    :param example_num: number of examples
    :param input_price: price of input tokens per 1M tokens
    :param output_price: price of output tokens per 1M tokens
    :return: the estimated cost
    """
    input_cost = input_token_num * input_price
    output_cost = output_token_num * output_price
    price = (input_cost + output_cost) * example_num / 1000000
    print(f"Estimated cost: {price:.2f} 元")
    return price
|
nlpertools/ml.py
CHANGED
@@ -2,9 +2,11 @@
|
|
2
2
|
import codecs
|
3
3
|
import os
|
4
4
|
import random
|
5
|
+
import itertools
|
5
6
|
|
6
7
|
from .io.dir import j_mkdir
|
7
8
|
from .io.file import readtxt_list_all_strip, writetxt_w_list, save_to_csv
|
9
|
+
|
8
10
|
# import numpy as np
|
9
11
|
# import seaborn as sns
|
10
12
|
# import torch
|
@@ -17,10 +19,11 @@ from .io.file import readtxt_list_all_strip, writetxt_w_list, save_to_csv
|
|
17
19
|
from .utils.package import *
|
18
20
|
|
19
21
|
|
20
|
-
def estimate_pass_at_k(num_samples:list, num_correct:list, k):
|
22
|
+
def estimate_pass_at_k(num_samples: list, num_correct: list, k):
|
21
23
|
"""
|
22
24
|
copy from https://huggingface.co/spaces/evaluate-metric/code_eval/blob/main/code_eval.py
|
23
25
|
num_samples: list
|
26
|
+
Note: if num sample < k, acc = 1, it's incomprehensibly
|
24
27
|
"""
|
25
28
|
"""Estimates pass@k of each problem and returns them in an array."""
|
26
29
|
|
@@ -39,8 +42,21 @@ def estimate_pass_at_k(num_samples:list, num_correct:list, k):
|
|
39
42
|
return np.array([estimator(int(n), int(c), k) for n, c in zip(num_samples_it, num_correct)])
|
40
43
|
|
41
44
|
|
45
|
+
def estimate_pass_at_k_fixed(num_samples: list, num_correct: list, k):
    """Variant of ``estimate_pass_at_k`` for sample counts smaller than ``k``.

    Every entry of ``num_samples`` that is below ``k`` is raised to ``k``
    before the computation is delegated to ``estimate_pass_at_k``.
    """
    raised_samples = [max(count, k) for count in num_samples]
    return estimate_pass_at_k(raised_samples, num_correct, k)
|
51
|
+
|
52
|
+
|
53
|
+
def estimate_pass_at_k_return_num(num_samples: list, num_correct: list, k):
    """Return the mean of ``estimate_pass_at_k`` as a percentage rounded to 2 decimals."""
    per_problem = estimate_pass_at_k(num_samples, num_correct, k)
    mean_percent = per_problem.mean() * 100
    return round(mean_percent, 2)
|
56
|
+
|
57
|
+
|
42
58
|
def calc_llm_train_activation_memory(
|
43
|
-
|
59
|
+
model_name, sequence_length, batch_size, hidden_dim, lay_number, attention_heads_num, gpu_num=1
|
44
60
|
):
|
45
61
|
"""
|
46
62
|
return bytes
|
@@ -54,18 +70,19 @@ def calc_llm_train_activation_memory(
|
|
54
70
|
# FFN
|
55
71
|
# Layer Norm
|
56
72
|
r1 = (
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
73
|
+
sequence_length
|
74
|
+
* batch_size
|
75
|
+
* hidden_dim
|
76
|
+
* lay_number
|
77
|
+
* (34 + 5 * attention_heads_num * sequence_length / hidden_dim)
|
62
78
|
)
|
63
79
|
# reference2
|
64
80
|
r2 = (
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
81
|
+
lay_number
|
82
|
+
* (2 * sequence_length * attention_heads_num + 16 * hidden_dim)
|
83
|
+
* sequence_length
|
84
|
+
* batch_size
|
85
|
+
/ gpu_num
|
69
86
|
)
|
70
87
|
print(r1)
|
71
88
|
print(r2)
|
@@ -100,9 +117,7 @@ class DataStructure:
|
|
100
117
|
"source": "baidu",
|
101
118
|
}
|
102
119
|
ner_input_example = "这句话一共有两个实体分别为大象和老鼠。"
|
103
|
-
ner_label_example = (
|
104
|
-
list("OOOOOOOOOOOOO") + ["B-s", "I-s"] + ["O"] + ["B-o", "I-o"] + ["O"]
|
105
|
-
)
|
120
|
+
ner_label_example = list("OOOOOOOOOOOOO") + ["B-s", "I-s"] + ["O"] + ["B-o", "I-o"] + ["O"]
|
106
121
|
|
107
122
|
|
108
123
|
def text_jaccard(ipt1, ipt2, ipt_level="char", sim_level="char"):
|
@@ -156,7 +171,7 @@ class STEM(object):
|
|
156
171
|
if each_srl:
|
157
172
|
args = []
|
158
173
|
for arg in each_srl:
|
159
|
-
args.extend(seg[arg[1]: arg[2] + 1])
|
174
|
+
args.extend(seg[arg[1] : arg[2] + 1])
|
160
175
|
# 添加上谓词
|
161
176
|
args.insert(each_srl[0][2] - each_srl[0][1] + 1, seg[wdx])
|
162
177
|
events.append(args)
|
@@ -195,7 +210,7 @@ def subject_object_labeling(spo_list, text):
|
|
195
210
|
q_list_length = len(q_list)
|
196
211
|
k_list_length = len(k_list)
|
197
212
|
for idx in range(k_list_length - q_list_length + 1):
|
198
|
-
t = [q == k for q, k in zip(q_list, k_list[idx: idx + q_list_length])]
|
213
|
+
t = [q == k for q, k in zip(q_list, k_list[idx : idx + q_list_length])]
|
199
214
|
# print(idx, t)
|
200
215
|
if all(t):
|
201
216
|
# print(idx)
|
@@ -208,9 +223,7 @@ def subject_object_labeling(spo_list, text):
|
|
208
223
|
if len(spo) == 2:
|
209
224
|
labeling_list[idx_start + 1] = "I-" + spo_type
|
210
225
|
elif len(spo) >= 3:
|
211
|
-
labeling_list[idx_start + 1: idx_start + len(spo)] = ["I-" + spo_type] * (
|
212
|
-
len(spo) - 1
|
213
|
-
)
|
226
|
+
labeling_list[idx_start + 1 : idx_start + len(spo)] = ["I-" + spo_type] * (len(spo) - 1)
|
214
227
|
else:
|
215
228
|
pass
|
216
229
|
|
@@ -219,7 +232,7 @@ def subject_object_labeling(spo_list, text):
|
|
219
232
|
# count = 0
|
220
233
|
for predicate, spo_list_form in spo_predicate_dict.items():
|
221
234
|
if predicate in text:
|
222
|
-
for
|
235
|
+
for spo_subject, spo_object in spo_list_form:
|
223
236
|
# if predicate not in spo_subject and predicate not in spo_object:
|
224
237
|
_labeling_type(spo_subject, "SUB")
|
225
238
|
_labeling_type(spo_object, "OBJ")
|
@@ -241,10 +254,7 @@ def label(text, labels):
|
|
241
254
|
:return:
|
242
255
|
"""
|
243
256
|
train_sequence = "\n".join(
|
244
|
-
[
|
245
|
-
"\t".join(i) if i[0] != " " else "[null]\t{}".format(i[1])
|
246
|
-
for i in zip(list(text), labels)
|
247
|
-
]
|
257
|
+
["\t".join(i) if i[0] != " " else "[null]\t{}".format(i[1]) for i in zip(list(text), labels)]
|
248
258
|
)
|
249
259
|
return train_sequence
|
250
260
|
|
@@ -260,16 +270,12 @@ def convert_crf_format_10_fold(corpus, objdir_path):
|
|
260
270
|
split_position = int(len(corpus) / 10)
|
261
271
|
for k in range(0, 10):
|
262
272
|
if k == 9:
|
263
|
-
dev_set = corpus[k * split_position:]
|
273
|
+
dev_set = corpus[k * split_position :]
|
264
274
|
train_set = corpus[: k * split_position]
|
265
275
|
else:
|
266
|
-
dev_set = corpus[k * split_position: (k + 1) * split_position]
|
267
|
-
train_set = (
|
268
|
-
|
269
|
-
)
|
270
|
-
writetxt_w_list(
|
271
|
-
train_set, os.path.join(objdir_path, "train{}.txt".format(k + 1))
|
272
|
-
)
|
276
|
+
dev_set = corpus[k * split_position : (k + 1) * split_position]
|
277
|
+
train_set = corpus[: k * split_position] + corpus[(k + 1) * split_position :]
|
278
|
+
writetxt_w_list(train_set, os.path.join(objdir_path, "train{}.txt".format(k + 1)))
|
273
279
|
writetxt_w_list(dev_set, os.path.join(objdir_path, "test{}.txt".format(k + 1)))
|
274
280
|
writetxt_w_list(dev_set, os.path.join(objdir_path, "dev{}.txt".format(k + 1)))
|
275
281
|
|
@@ -305,31 +311,19 @@ def read_seq_res(path, labels):
|
|
305
311
|
return text, raw_label, predict_label
|
306
312
|
|
307
313
|
|
308
|
-
def kfold_txt(corpus, path, k=9, is_shuffle=True):
|
309
|
-
"""
|
310
|
-
k是10份中训练集占了几份
|
311
|
-
"""
|
312
|
-
j_mkdir(path)
|
313
|
-
if is_shuffle:
|
314
|
-
random.shuffle(corpus)
|
315
|
-
split_position = int(len(corpus) / 10)
|
316
|
-
train_set, dev_set = corpus[: k * split_position], corpus[k * split_position:]
|
317
|
-
writetxt_w_list(train_set, os.path.join(path, "train.tsv"), num_lf=1)
|
318
|
-
writetxt_w_list(dev_set, os.path.join(path, "test.tsv"), num_lf=1)
|
319
|
-
writetxt_w_list(dev_set, os.path.join(path, "dev.tsv"), num_lf=1)
|
320
|
-
|
321
|
-
|
322
314
|
def sample():
|
323
315
|
import pandas as pd
|
324
316
|
from sklearn.model_selection import StratifiedShuffleSplit
|
325
317
|
|
326
318
|
# 假设 df 是你的 DataFrame
|
327
319
|
|
328
|
-
df = pd.DataFrame(
|
329
|
-
|
330
|
-
|
331
|
-
|
332
|
-
|
320
|
+
df = pd.DataFrame(
|
321
|
+
{
|
322
|
+
"count_line": [i for i in range(100)],
|
323
|
+
"x": [i for i in range(100)],
|
324
|
+
"y": [i // 10 for i in range(100)],
|
325
|
+
}
|
326
|
+
)
|
333
327
|
print(df)
|
334
328
|
# count_line 是用于分层抽样的字段
|
335
329
|
|
@@ -337,7 +331,7 @@ def sample():
|
|
337
331
|
split = StratifiedShuffleSplit(n_splits=1, test_size=0.1, random_state=42)
|
338
332
|
|
339
333
|
# 获取训练集和测试集的索引
|
340
|
-
train_index, test_index = next(split.split(df, df[
|
334
|
+
train_index, test_index = next(split.split(df, df["y"]))
|
341
335
|
|
342
336
|
# 根据索引划分训练集和测试集
|
343
337
|
train_df = df.loc[train_index]
|
@@ -348,6 +342,27 @@ def sample():
|
|
348
342
|
print("测试集行数:", len(test_df))
|
349
343
|
|
350
344
|
|
345
|
+
def kfold_txt(corpus, path, k=9, is_shuffle=True):
    """Split ``corpus`` into a train part and a held-out part and write them under ``path``.

    The corpus is cut into tenths of ``len(corpus) // 10`` items. The first
    ``k`` tenths are written to ``train.tsv``; everything after them is written
    to both ``test.tsv`` and ``dev.tsv`` (the two files get identical content).
    With fewer than 10 items the tenth size is 0, so all items are held out.

    :param corpus: sequence of items to split; it is not modified.
    :param path: output directory, created via ``j_mkdir``.
    :param k: how many of the 10 parts go to the training set.
    :param is_shuffle: shuffle the items before splitting.
    """
    j_mkdir(path)

    # Work on a copy so that shuffling does not reorder the caller's list.
    items = list(corpus)
    if is_shuffle:
        random.shuffle(items)

    split_position = len(items) // 10
    boundary = k * split_position
    train_set, dev_set = items[:boundary], items[boundary:]

    writetxt_w_list(train_set, os.path.join(path, "train.tsv"), num_lf=1)
    writetxt_w_list(dev_set, os.path.join(path, "test.tsv"), num_lf=1)
    writetxt_w_list(dev_set, os.path.join(path, "dev.tsv"), num_lf=1)
|
357
|
+
|
358
|
+
|
359
|
+
def kfold_list(list_data):
    """Placeholder for splitting a list; not implemented and returns ``None``.

    The original note points to ``sklearn.model_selection.train_test_split``.
    """
    return None
|
364
|
+
|
365
|
+
|
351
366
|
def kfold_df(df, save_dir=None):
|
352
367
|
"""
|
353
368
|
划分train test val集, 写为windows可读的csv。
|
@@ -360,9 +375,7 @@ def kfold_df(df, save_dir=None):
|
|
360
375
|
|
361
376
|
train_idx, test_and_val_idx = KFold(n_splits=8, shuffle=True).split(df).__next__()
|
362
377
|
df_test_and_val = df.iloc[test_and_val_idx]
|
363
|
-
test_idx, val_idx = (
|
364
|
-
KFold(n_splits=2, shuffle=True).split(df_test_and_val).__next__()
|
365
|
-
)
|
378
|
+
test_idx, val_idx = KFold(n_splits=2, shuffle=True).split(df_test_and_val).__next__()
|
366
379
|
df_train = df.iloc[train_idx]
|
367
380
|
df_val = df.iloc[val_idx]
|
368
381
|
df_test = df.iloc[test_idx]
|
@@ -439,7 +452,7 @@ def split_sentence(sentence, language="chinese", cross_line=True):
|
|
439
452
|
for idx, char in enumerate(sentence):
|
440
453
|
if idx == len(sentence) - 1:
|
441
454
|
if char in split_signs:
|
442
|
-
sentences.append(sentence[start_idx: idx + 1].strip())
|
455
|
+
sentences.append(sentence[start_idx : idx + 1].strip())
|
443
456
|
start_idx = idx + 1
|
444
457
|
else:
|
445
458
|
sentences.append(sentence[start_idx:].strip())
|
@@ -449,10 +462,10 @@ def split_sentence(sentence, language="chinese", cross_line=True):
|
|
449
462
|
if idx < len(sentence) - 2:
|
450
463
|
# 处理。”。
|
451
464
|
if sentence[idx + 2] not in split_signs:
|
452
|
-
sentences.append(sentence[start_idx: idx + 2].strip())
|
465
|
+
sentences.append(sentence[start_idx : idx + 2].strip())
|
453
466
|
start_idx = idx + 2
|
454
467
|
elif sentence[idx + 1] not in split_signs:
|
455
|
-
sentences.append(sentence[start_idx: idx + 1].strip())
|
468
|
+
sentences.append(sentence[start_idx : idx + 1].strip())
|
456
469
|
start_idx = idx + 1
|
457
470
|
return sentences
|
458
471
|
|
@@ -528,6 +541,6 @@ if __name__ == "__main__":
|
|
528
541
|
hidden_dim=4096,
|
529
542
|
lay_number=28,
|
530
543
|
attention_heads_num=32,
|
531
|
-
gpu_num=1
|
544
|
+
gpu_num=1,
|
532
545
|
)
|
533
546
|
print(res, "G")
|