nlpertools 1.0.9__py3-none-any.whl → 1.0.11__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,119 @@
+ import os
+ from tqdm import tqdm
+ import concurrent.futures
+ import itertools
+
+
+ INFER_PARAS = {
+     "temperature": 0.7,
+     "infer_times": 1,
+     "max_tokens": 8192,
+     "top_p": 0.95,
+     "top_k": 40,
+     "repetition_penalty": 1.0,
+ }
+
+
+ def parse_infer_data(infer_data: list):
+     # Why the nested [[...]]: each request's messages field must itself be a list
+     if isinstance(infer_data[0], str):
+         message = [[{"role": "user", "content": i}] for i in infer_data]
+     elif isinstance(infer_data[0], list):
+         message = infer_data
+     else:
+         raise TypeError("infer_data must be a list of prompts (str) or of message lists")
+     return message
+
+
+ def common_api_infer_func(model_name, infer_data: list, infer_paras, client):
+     """
+     infer_data: list of messages/prompts
+     """
+     messages = parse_infer_data(infer_data)
+
+     def get_response(model_name, messages, infer_paras):
+         responses = []
+         infer_times = infer_paras.get("infer_times", 1)
+
+         for _ in range(infer_times):
+             # run inference through the OpenAI-compatible API
+             response = client.chat.completions.create(
+                 model=model_name,
+                 messages=messages,
+                 temperature=infer_paras.get("temperature", 0.7),
+                 max_tokens=infer_paras.get("max_tokens", 8192),
+             )
+             text = response.choices[0].message.content
+             responses.append({"text": text})
+         return responses
+
+     with concurrent.futures.ThreadPoolExecutor(16) as executor:
+         futures = [executor.submit(get_response, model_name, message, infer_paras) for message in messages]
+         # results = [future.result() for future in tqdm(concurrent.futures.as_completed(futures))]  # out of order
+         results = [future.result() for future in tqdm(futures)]
+
+     return results
+
+
+ def common_api_infer_func_multi_client(model_name, infer_data: list, infer_paras, clients: list):
+     """
+     infer_data: list of messages/prompts
+     """
+     messages = parse_infer_data(infer_data)
+     iter_cycle = itertools.cycle(clients)
+
+     def get_response(model_name, messages, infer_paras):
+         # round-robin over the available clients
+         client = next(iter_cycle)
+         # print(client.base_url)
+         responses = []
+         infer_times = infer_paras.get("infer_times", 1)
+         for _ in range(infer_times):
+             # run inference through the OpenAI-compatible API
+             try:
+                 response = client.chat.completions.create(
+                     model=model_name,
+                     messages=messages,
+                     temperature=infer_paras.get("temperature", 0.7),
+                     max_tokens=infer_paras.get("max_tokens", 8192),
+                 )
+                 text = response.choices[0].message.content
+             except Exception as e:
+                 print(str(e))
+                 text = ""
+             responses.append({"text": text})
+         return responses
+
+     with concurrent.futures.ThreadPoolExecutor(128) as executor:
+         futures = [executor.submit(get_response, model_name, message, infer_paras) for message in messages]
+         results = [future.result() for future in tqdm(futures)]
+
+     return results
+
+
+ def common_vllm_infer_func(model_path, infer_data: list, infer_paras: dict):
+     """
+     infer_data: list of messages/prompts
+     """
+     messages = parse_infer_data(infer_data)
+     from vllm import LLM, SamplingParams
+
+     temperature = infer_paras.get("temperature", 0.7)
+     infer_times = infer_paras.get("infer_times", 1)
+     vllm_card_num = len(os.environ["CUDA_VISIBLE_DEVICES"].split(","))
+
+     llm = LLM(model=model_path, tensor_parallel_size=vllm_card_num, trust_remote_code=True, gpu_memory_utilization=0.85)
+     sampling_params = SamplingParams(
+         temperature=temperature,
+         n=infer_times,
+         max_tokens=8192,
+         # recommended params for qwen3 non-thinking mode
+         # **infer_paras.get(template_name, {}),
+         # recommended params for qwen3 thinking mode
+     )
+     conversation = messages
+     outputs = llm.chat(conversation, sampling_params=sampling_params, use_tqdm=True)
+     return_texts = []
+     for idx, output in tqdm(enumerate(outputs)):
+         result = [{"text": i.text} for i in output.outputs]
+         return_texts.append(result)
+     return return_texts
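
For orientation, a minimal usage sketch of the helpers added above (not part of the package); the endpoint, API key, and model name are hypothetical placeholders for any OpenAI-compatible server:

from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")  # hypothetical endpoint and key
prompts = ["What is 1 + 1?", "Name a prime number."]
results = common_api_infer_func("my-model", prompts, INFER_PARAS, client)  # "my-model" is a placeholder
# results[i] holds infer_times dicts of the form {"text": ...} for prompts[i]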
@@ -0,0 +1,13 @@
+ def estimate_cost(input_token_num, output_token_num, example_num=1, input_price=1, output_price=4):
+     """
+     Estimate inference cost
+     :param input_token_num: number of input tokens
+     :param output_token_num: number of output tokens
+     :param example_num: number of examples
+     :param input_price: price per 1M input tokens
+     :param output_price: price per 1M output tokens
+     :return: estimated cost
+     """
+     price = (input_token_num * input_price + output_token_num * output_price) * example_num / 1000000
+     print(f"Estimated cost: {price:.2f} CNY")
+     return price
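
A quick sanity check of the arithmetic, with made-up token counts: prices are per 1M tokens, so 2,000 input tokens and 500 output tokens per example, across 1,000 examples at the default prices, come to (2000 * 1 + 500 * 4) * 1000 / 1e6 = 4.00.

estimate_cost(2000, 500, example_num=1000)  # prints "Estimated cost: 4.00 CNY" and returns 4.0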
nlpertools/ml.py CHANGED
@@ -2,9 +2,11 @@
  import codecs
  import os
  import random
+ import itertools

  from .io.dir import j_mkdir
  from .io.file import readtxt_list_all_strip, writetxt_w_list, save_to_csv
+
  # import numpy as np
  # import seaborn as sns
  # import torch
@@ -17,10 +19,11 @@ from .io.file import readtxt_list_all_strip, writetxt_w_list, save_to_csv
  from .utils.package import *


- def estimate_pass_at_k(num_samples:list, num_correct:list, k):
+ def estimate_pass_at_k(num_samples: list, num_correct: list, k):
      """
      copy from https://huggingface.co/spaces/evaluate-metric/code_eval/blob/main/code_eval.py
      num_samples: list
+     Note: if num_samples < k, the estimate comes out as 1, which is counterintuitive
      """
      """Estimates pass@k of each problem and returns them in an array."""

@@ -39,8 +42,21 @@ def estimate_pass_at_k(num_samples:list, num_correct:list, k):
      return np.array([estimator(int(n), int(c), k) for n, c in zip(num_samples_it, num_correct)])


+ def estimate_pass_at_k_fixed(num_samples: list, num_correct: list, k):
+     """
+     Handles the case where num_samples is smaller than k
+     """
+     num_samples = [k if i < k else i for i in num_samples]
+     return estimate_pass_at_k(num_samples, num_correct, k)
+
+
+ def estimate_pass_at_k_return_num(num_samples: list, num_correct: list, k):
+     """Return the mean pass@k directly, as a percentage"""
+     return round(estimate_pass_at_k(num_samples, num_correct, k).mean() * 100, 2)
+
+
  def calc_llm_train_activation_memory(
-         model_name, sequence_length, batch_size, hidden_dim, lay_number, attention_heads_num, gpu_num=1
+     model_name, sequence_length, batch_size, hidden_dim, lay_number, attention_heads_num, gpu_num=1
  ):
      """
      return bytes
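
Background on the fix: the estimator computes pass@k = 1 - C(n - c, k) / C(n, k) for n samples with c correct, and the linked code_eval implementation short-circuits to 1.0 whenever n - c < k, so a problem with fewer than k samples can score 1.0 even with zero correct answers. A small illustration with made-up counts:

estimate_pass_at_k([2], [0], 4)        # array([1.]): 2 samples, none correct, yet pass@4 = 1
estimate_pass_at_k_fixed([2], [0], 4)  # array([0.]): n is first padded up to k, so the estimator applies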
@@ -54,18 +70,19 @@ def calc_llm_train_activation_memory(
      # FFN
      # Layer Norm
      r1 = (
-             sequence_length
-             * batch_size
-             * hidden_dim
-             * lay_number
-             * (34 + 5 * attention_heads_num * sequence_length / hidden_dim)
+         sequence_length
+         * batch_size
+         * hidden_dim
+         * lay_number
+         * (34 + 5 * attention_heads_num * sequence_length / hidden_dim)
      )
      # reference2
      r2 = (
-         lay_number * (2 * sequence_length * attention_heads_num + 16 * hidden_dim)
-         * sequence_length
-         * batch_size
-         / gpu_num
+         lay_number
+         * (2 * sequence_length * attention_heads_num + 16 * hidden_dim)
+         * sequence_length
+         * batch_size
+         / gpu_num
      )
      print(r1)
      print(r2)
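
To make reference1 concrete, a hand-computed instance with hypothetical sizes (sequence length 1024 and batch size 1 are made up; the other values mirror the __main__ demo at the bottom of this file):

s, b, h, L, a = 1024, 1, 4096, 28, 32  # hypothetical sequence length and batch size
per_token_factor = 34 + 5 * a * s / h  # 34 + 40 = 74
r1 = s * b * h * L * per_token_factor  # 8,690,597,888 bytes
print(r1 / 1024**3)                    # about 8.09 GiB of activation memory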
@@ -100,9 +117,7 @@ class DataStructure:
          "source": "baidu",
      }
      ner_input_example = "这句话一共有两个实体分别为大象和老鼠。"
-     ner_label_example = (
-         list("OOOOOOOOOOOOO") + ["B-s", "I-s"] + ["O"] + ["B-o", "I-o"] + ["O"]
-     )
+     ner_label_example = list("OOOOOOOOOOOOO") + ["B-s", "I-s"] + ["O"] + ["B-o", "I-o"] + ["O"]


  def text_jaccard(ipt1, ipt2, ipt_level="char", sim_level="char"):
@@ -156,7 +171,7 @@ class STEM(object):
          if each_srl:
              args = []
              for arg in each_srl:
-                 args.extend(seg[arg[1]: arg[2] + 1])
+                 args.extend(seg[arg[1] : arg[2] + 1])
              # also insert the predicate
              args.insert(each_srl[0][2] - each_srl[0][1] + 1, seg[wdx])
              events.append(args)
@@ -195,7 +210,7 @@ def subject_object_labeling(spo_list, text):
          q_list_length = len(q_list)
          k_list_length = len(k_list)
          for idx in range(k_list_length - q_list_length + 1):
-             t = [q == k for q, k in zip(q_list, k_list[idx: idx + q_list_length])]
+             t = [q == k for q, k in zip(q_list, k_list[idx : idx + q_list_length])]
              # print(idx, t)
              if all(t):
                  # print(idx)
@@ -208,9 +223,7 @@ def subject_object_labeling(spo_list, text):
            if len(spo) == 2:
                labeling_list[idx_start + 1] = "I-" + spo_type
            elif len(spo) >= 3:
-               labeling_list[idx_start + 1: idx_start + len(spo)] = ["I-" + spo_type] * (
-                   len(spo) - 1
-               )
+               labeling_list[idx_start + 1 : idx_start + len(spo)] = ["I-" + spo_type] * (len(spo) - 1)
            else:
                pass

@@ -219,7 +232,7 @@ def subject_object_labeling(spo_list, text):
      # count = 0
      for predicate, spo_list_form in spo_predicate_dict.items():
          if predicate in text:
-             for (spo_subject, spo_object) in spo_list_form:
+             for spo_subject, spo_object in spo_list_form:
                  # if predicate not in spo_subject and predicate not in spo_object:
                  _labeling_type(spo_subject, "SUB")
                  _labeling_type(spo_object, "OBJ")
@@ -241,10 +254,7 @@ def label(text, labels):
      :return:
      """
      train_sequence = "\n".join(
-         [
-             "\t".join(i) if i[0] != " " else "[null]\t{}".format(i[1])
-             for i in zip(list(text), labels)
-         ]
+         ["\t".join(i) if i[0] != " " else "[null]\t{}".format(i[1]) for i in zip(list(text), labels)]
      )
      return train_sequence

@@ -260,16 +270,12 @@ def convert_crf_format_10_fold(corpus, objdir_path):
      split_position = int(len(corpus) / 10)
      for k in range(0, 10):
          if k == 9:
-             dev_set = corpus[k * split_position:]
+             dev_set = corpus[k * split_position :]
              train_set = corpus[: k * split_position]
          else:
-             dev_set = corpus[k * split_position: (k + 1) * split_position]
-             train_set = (
-                 corpus[: k * split_position] + corpus[(k + 1) * split_position:]
-             )
-             writetxt_w_list(
-                 train_set, os.path.join(objdir_path, "train{}.txt".format(k + 1))
-             )
+             dev_set = corpus[k * split_position : (k + 1) * split_position]
+             train_set = corpus[: k * split_position] + corpus[(k + 1) * split_position :]
+             writetxt_w_list(train_set, os.path.join(objdir_path, "train{}.txt".format(k + 1)))
          writetxt_w_list(dev_set, os.path.join(objdir_path, "test{}.txt".format(k + 1)))
          writetxt_w_list(dev_set, os.path.join(objdir_path, "dev{}.txt".format(k + 1)))

@@ -305,31 +311,19 @@ def read_seq_res(path, labels):
      return text, raw_label, predict_label


- def kfold_txt(corpus, path, k=9, is_shuffle=True):
-     """
-     k is the number of tenths allocated to the training set
-     """
-     j_mkdir(path)
-     if is_shuffle:
-         random.shuffle(corpus)
-     split_position = int(len(corpus) / 10)
-     train_set, dev_set = corpus[: k * split_position], corpus[k * split_position:]
-     writetxt_w_list(train_set, os.path.join(path, "train.tsv"), num_lf=1)
-     writetxt_w_list(dev_set, os.path.join(path, "test.tsv"), num_lf=1)
-     writetxt_w_list(dev_set, os.path.join(path, "dev.tsv"), num_lf=1)
-
-
  def sample():
      import pandas as pd
      from sklearn.model_selection import StratifiedShuffleSplit

      # assume df is your DataFrame

-     df = pd.DataFrame({
-         "count_line": [i for i in range(100)],
-         "x": [i for i in range(100)],
-         "y": [i // 10 for i in range(100)],
-     })
+     df = pd.DataFrame(
+         {
+             "count_line": [i for i in range(100)],
+             "x": [i for i in range(100)],
+             "y": [i // 10 for i in range(100)],
+         }
+     )
      print(df)
      # count_line is the field used for stratified sampling

@@ -337,7 +331,7 @@ def sample():
      split = StratifiedShuffleSplit(n_splits=1, test_size=0.1, random_state=42)

      # get the train and test indices
-     train_index, test_index = next(split.split(df, df['y']))
+     train_index, test_index = next(split.split(df, df["y"]))

      # split into train and test sets by index
      train_df = df.loc[train_index]
@@ -348,6 +342,27 @@ def sample():
      print("Number of test rows:", len(test_df))


+ def kfold_txt(corpus, path, k=9, is_shuffle=True):
+     """
+     k is the number of tenths allocated to the training set
+     """
+     j_mkdir(path)
+     if is_shuffle:
+         random.shuffle(corpus)
+     split_position = int(len(corpus) / 10)
+     train_set, dev_set = corpus[: k * split_position], corpus[k * split_position :]
+     writetxt_w_list(train_set, os.path.join(path, "train.tsv"), num_lf=1)
+     writetxt_w_list(dev_set, os.path.join(path, "test.tsv"), num_lf=1)
+     writetxt_w_list(dev_set, os.path.join(path, "dev.tsv"), num_lf=1)
+
+
+ def kfold_list(list_data):
+     """
+     sklearn.model_selection.train_test_split
+     """
+     pass
+
+
  def kfold_df(df, save_dir=None):
      """
      Split into train/test/val sets and write Windows-readable CSVs.
@@ -360,9 +375,7 @@ def kfold_df(df, save_dir=None):

      train_idx, test_and_val_idx = KFold(n_splits=8, shuffle=True).split(df).__next__()
      df_test_and_val = df.iloc[test_and_val_idx]
-     test_idx, val_idx = (
-         KFold(n_splits=2, shuffle=True).split(df_test_and_val).__next__()
-     )
+     test_idx, val_idx = KFold(n_splits=2, shuffle=True).split(df_test_and_val).__next__()
      df_train = df.iloc[train_idx]
      df_val = df.iloc[val_idx]
      df_test = df.iloc[test_idx]
@@ -439,7 +452,7 @@ def split_sentence(sentence, language="chinese", cross_line=True):
      for idx, char in enumerate(sentence):
          if idx == len(sentence) - 1:
              if char in split_signs:
-                 sentences.append(sentence[start_idx: idx + 1].strip())
+                 sentences.append(sentence[start_idx : idx + 1].strip())
                  start_idx = idx + 1
              else:
                  sentences.append(sentence[start_idx:].strip())
@@ -449,10 +462,10 @@ def split_sentence(sentence, language="chinese", cross_line=True):
              if idx < len(sentence) - 2:
                  # handle the 。”。 pattern (closing quote right after sentence-final punctuation)
                  if sentence[idx + 2] not in split_signs:
-                     sentences.append(sentence[start_idx: idx + 2].strip())
+                     sentences.append(sentence[start_idx : idx + 2].strip())
                      start_idx = idx + 2
              elif sentence[idx + 1] not in split_signs:
-                 sentences.append(sentence[start_idx: idx + 1].strip())
+                 sentences.append(sentence[start_idx : idx + 1].strip())
                  start_idx = idx + 1
      return sentences

@@ -528,6 +541,6 @@ if __name__ == "__main__":
          hidden_dim=4096,
          lay_number=28,
          attention_heads_num=32,
-         gpu_num=1
+         gpu_num=1,
      )
      print(res, "G")