PyPI - nlpertools - Versions diffs - 1.0.9__py3-none-any.whl → 1.0.11__py3-none-any.whl - Mend

nlpertools-1.0.9-py3-none-any.whl → nlpertools-1.0.11-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (21) hide show

nlpertools/__init__.py +3 -2
nlpertools/cli.py +26 -47
nlpertools/dataprocess/__init__.py +1 -0
nlpertools/dataprocess/dedupl.py +9 -0
nlpertools/{dataprocess.py → dataprocess/dp_main.py} +13 -1
nlpertools/io/dir.py +25 -5
nlpertools/io/file.py +46 -43
nlpertools/llm/__init__.py +3 -0
nlpertools/llm/call_llm_once.py +60 -0
nlpertools/llm/infer.py +119 -0
nlpertools/llm/price.py +13 -0
nlpertools/ml.py +72 -59
nlpertools/other.py +82 -53
nlpertools/utils/package.py +9 -10
nlpertools/wrapper.py +6 -4
{nlpertools-1.0.9.dist-info → nlpertools-1.0.11.dist-info}/METADATA +27 -25
{nlpertools-1.0.9.dist-info → nlpertools-1.0.11.dist-info}/RECORD +21 -15
{nlpertools-1.0.9.dist-info → nlpertools-1.0.11.dist-info}/WHEEL +1 -1
{nlpertools-1.0.9.dist-info → nlpertools-1.0.11.dist-info}/entry_points.txt +0 -0
{nlpertools-1.0.9.dist-info → nlpertools-1.0.11.dist-info/licenses}/LICENSE +0 -0
{nlpertools-1.0.9.dist-info → nlpertools-1.0.11.dist-info}/top_level.txt +0 -0

nlpertools/llm/infer.py ADDED Viewed

@@ -0,0 +1,119 @@
+import os
+from tqdm import tqdm
+import concurrent.futures
+import itertools
+# Reference set of inference parameters, keyed the same way as the `infer_paras`
+# dicts accepted by the functions in this module.
+# NOTE(review): nothing visible in this module reads INFER_PARAS itself; the
+# functions below read "temperature", "infer_times" and "max_tokens" from their
+# `infer_paras` argument with matching fallbacks, while "top_p", "top_k" and
+# "repetition_penalty" are not consumed by any visible code.
+INFER_PARAS = {
+    "temperature": 0.7,
+    "infer_times": 1,
+    "max_tokens": 8192,
+    "top_p": 0.95,
+    "top_k": 40,
+    "repetition_penalty": 1.0,
+}
+def parse_infer_data(infer_data: list):
+    """Normalize inference inputs into a list of chat conversations.
+
+    A single conversation is itself a list of message dicts, which is why the
+    result is a list of lists.
+
+    :param infer_data: either a list of prompt strings (each one is wrapped as a
+        single-turn ``user`` message) or a list of ready-made message lists
+        (returned unchanged). Only the first element is inspected to decide.
+    :return: list of conversations; an empty input yields an empty list
+    :raises TypeError: if the first element is neither a ``str`` nor a ``list``
+    """
+    if not infer_data:
+        # Nothing to infer; also avoids an IndexError on infer_data[0].
+        return []
+    first = infer_data[0]
+    if isinstance(first, str):
+        return [[{"role": "user", "content": prompt}] for prompt in infer_data]
+    if isinstance(first, list):
+        return infer_data
+    # Without this, an unsupported element type fell through both branches and
+    # surfaced as an UnboundLocalError on the result variable.
+    raise TypeError(
+        "infer_data must be a list of str prompts or a list of message lists, "
+        f"got element of type {type(first).__name__}"
+    )
+def common_api_infer_func(model_name, infer_data: list, infer_paras, client, max_workers=16):
+    """Run chat-completion requests for every input through one client, concurrently.
+
+    :param model_name: model identifier passed to ``client.chat.completions.create``
+    :param infer_data: list of prompts or list of message lists (see ``parse_infer_data``)
+    :param infer_paras: dict of inference parameters; only ``infer_times``
+        (default 1), ``temperature`` (default 0.7) and ``max_tokens``
+        (default 8192) are read here
+    :param client: object exposing ``chat.completions.create(...)``
+    :param max_workers: size of the thread pool used to issue requests
+    :return: one entry per input, in input order; each entry is a list of
+        ``{"text": ...}`` dicts, one per repetition (``infer_times``)
+    """
+    conversations = parse_infer_data(infer_data)
+    infer_times = infer_paras.get("infer_times", 1)
+    temperature = infer_paras.get("temperature", 0.7)
+    max_tokens = infer_paras.get("max_tokens", 8192)
+    def get_response(conversation):
+        # Issue the same request `infer_times` times and collect each reply.
+        responses = []
+        for _ in range(infer_times):
+            response = client.chat.completions.create(
+                model=model_name,
+                messages=conversation,
+                temperature=temperature,
+                max_tokens=max_tokens,
+            )
+            responses.append({"text": response.choices[0].message.content})
+        return responses
+    with concurrent.futures.ThreadPoolExecutor(max_workers) as executor:
+        futures = [executor.submit(get_response, conversation) for conversation in conversations]
+        # Walk the futures in submission order (not as_completed) so that the
+        # results line up with infer_data.
+        results = [future.result() for future in tqdm(futures)]
+    return results
+def common_api_infer_func_multi_client(model_name, infer_data: list, infer_paras, clients: list):
+    """Run chat-completion requests for every input, spread round-robin over several clients.
+
+    A failed request does not abort the batch: the exception is printed and an
+    empty string is recorded as the text for that repetition.
+
+    :param model_name: model identifier passed to ``client.chat.completions.create``
+    :param infer_data: list of prompts or list of message lists (see ``parse_infer_data``)
+    :param infer_paras: dict of inference parameters; only ``infer_times``
+        (default 1), ``temperature`` (default 0.7) and ``max_tokens``
+        (default 8192) are read here
+    :param clients: non-empty list of objects exposing ``chat.completions.create(...)``
+    :return: one entry per input, in input order; each entry is a list of
+        ``{"text": ...}`` dicts, one per repetition (``infer_times``)
+    :raises ValueError: if ``clients`` is empty
+    """
+    if not clients:
+        raise ValueError("clients must contain at least one client")
+    conversations = parse_infer_data(infer_data)
+    infer_times = infer_paras.get("infer_times", 1)
+    temperature = infer_paras.get("temperature", 0.7)
+    max_tokens = infer_paras.get("max_tokens", 8192)
+    def get_response(client, conversation):
+        # Issue the same request `infer_times` times against the assigned client.
+        responses = []
+        for _ in range(infer_times):
+            try:
+                response = client.chat.completions.create(
+                    model=model_name,
+                    messages=conversation,
+                    temperature=temperature,
+                    max_tokens=max_tokens,
+                )
+                text = response.choices[0].message.content
+            except Exception as e:
+                # Best effort: report the failure and keep the slot with empty text.
+                print(e)
+                text = ""
+            responses.append({"text": text})
+        return responses
+    with concurrent.futures.ThreadPoolExecutor(128) as executor:
+        # Assign clients here, on the submitting thread, so the i-th input always
+        # goes to clients[i % len(clients)] instead of depending on which worker
+        # thread happens to advance a shared iterator first.
+        futures = [
+            executor.submit(get_response, client, conversation)
+            for client, conversation in zip(itertools.cycle(clients), conversations)
+        ]
+        # Submission order keeps results aligned with infer_data.
+        results = [future.result() for future in tqdm(futures)]
+    return results
+def common_vllm_infer_func(model_path, infer_data: list, infer_paras: dict):
+    """Run local batch inference with vLLM and return the generated texts.
+
+    The tensor-parallel size is the number of device ids listed in the
+    ``CUDA_VISIBLE_DEVICES`` environment variable, falling back to 1 when the
+    variable is unset or empty.
+
+    :param model_path: model path or name handed to ``vllm.LLM``
+    :param infer_data: list of prompts or list of message lists (see ``parse_infer_data``)
+    :param infer_paras: dict of inference parameters; ``temperature``
+        (default 0.7), ``infer_times`` (default 1, used as ``n``) and
+        ``max_tokens`` (default 8192) are read here
+    :return: one entry per input; each entry is a list of ``{"text": ...}``
+        dicts, one per generated sample
+    """
+    from vllm import LLM, SamplingParams
+    conversations = parse_infer_data(infer_data)
+    # Ignore empty fragments (e.g. a trailing comma) and tolerate a missing
+    # variable instead of raising KeyError.
+    visible_devices = os.environ.get("CUDA_VISIBLE_DEVICES", "")
+    vllm_card_num = len([d for d in visible_devices.split(",") if d.strip()]) or 1
+    llm = LLM(model=model_path, tensor_parallel_size=vllm_card_num, trust_remote_code=True, gpu_memory_utilization=0.85)
+    # NOTE: model-specific recommended sampling settings (e.g. Qwen3 thinking /
+    # non-thinking mode) are not applied here.
+    sampling_params = SamplingParams(
+        temperature=infer_paras.get("temperature", 0.7),
+        n=infer_paras.get("infer_times", 1),
+        # Honour the caller's limit, consistent with the API-based helpers.
+        max_tokens=infer_paras.get("max_tokens", 8192),
+    )
+    outputs = llm.chat(conversations, sampling_params=sampling_params, use_tqdm=True)
+    return [[{"text": completion.text} for completion in output.outputs] for output in outputs]

nlpertools/llm/price.py ADDED Viewed

@@ -0,0 +1,13 @@
+def estimate_cost(input_token_num, output_token_num, example_num=1, input_price=1, output_price=4):
+    """Estimate the cost of a run, print it, and return it.
+
+    The priced token total is multiplied by ``example_num`` and divided by one
+    million, because both prices are quoted per 1M tokens.
+
+    :param input_token_num: number of input tokens
+    :param output_token_num: number of output tokens
+    :param example_num: number of examples
+    :param input_price: unit price of input tokens, per 1M tokens
+    :param output_price: unit price of output tokens, per 1M tokens
+    :return: the estimated cost
+    """
+    priced_tokens = input_token_num * input_price + output_token_num * output_price
+    price = priced_tokens * example_num / 1000000
+    print(f"Estimated cost: {price:.2f} 元")
+    return price

nlpertools/ml.py CHANGED Viewed

@@ -2,9 +2,11 @@
 import codecs
 import os
 import random
+import itertools
 from .io.dir import j_mkdir
 from .io.file import readtxt_list_all_strip, writetxt_w_list, save_to_csv
 # import numpy as np
 # import seaborn as sns
 # import torch
@@ -17,10 +19,11 @@ from .io.file import readtxt_list_all_strip, writetxt_w_list, save_to_csv
 from .utils.package import *
-def estimate_pass_at_k(num_samples:list, num_correct:list, k):
+def estimate_pass_at_k(num_samples: list, num_correct: list, k):
     """
     copy from https://huggingface.co/spaces/evaluate-metric/code_eval/blob/main/code_eval.py
     num_samples: list
+    Note: if num sample < k, acc = 1, it's incomprehensibly
     """
     """Estimates pass@k of each problem and returns them in an array."""
@@ -39,8 +42,21 @@ def estimate_pass_at_k(num_samples:list, num_correct:list, k):
     return np.array([estimator(int(n), int(c), k) for n, c in zip(num_samples_it, num_correct)])
+def estimate_pass_at_k_fixed(num_samples: list, num_correct: list, k):
+    """
+    Variant of estimate_pass_at_k that handles sample counts smaller than k:
+    every entry of num_samples below k is raised to k before delegating.
+    """
+    padded_samples = [max(count, k) for count in num_samples]
+    return estimate_pass_at_k(padded_samples, num_correct, k)
+def estimate_pass_at_k_return_num(num_samples: list, num_correct: list, k):
+    """Return the mean pass@k directly, as a percentage rounded to 2 decimals."""
+    per_problem = estimate_pass_at_k(num_samples, num_correct, k)
+    mean_percent = per_problem.mean() * 100
+    return round(mean_percent, 2)
 def calc_llm_train_activation_memory(
-        model_name, sequence_length, batch_size, hidden_dim, lay_number, attention_heads_num, gpu_num=1
+    model_name, sequence_length, batch_size, hidden_dim, lay_number, attention_heads_num, gpu_num=1
 ):
     """
     return bytes
@@ -54,18 +70,19 @@ def calc_llm_train_activation_memory(
     # FFN
     # Layer Norm
     r1 = (
-            sequence_length
-            * batch_size
-            * hidden_dim
-            * lay_number
-            * (34 + 5 * attention_heads_num * sequence_length / hidden_dim)
+        sequence_length
+        * batch_size
+        * hidden_dim
+        * lay_number
+        * (34 + 5 * attention_heads_num * sequence_length / hidden_dim)
     )
     # reference2
     r2 = (
-            lay_number * (2 * sequence_length * attention_heads_num + 16 * hidden_dim)
-            * sequence_length
-            * batch_size
-            / gpu_num
+        lay_number
+        * (2 * sequence_length * attention_heads_num + 16 * hidden_dim)
+        * sequence_length
+        * batch_size
+        / gpu_num
     )
     print(r1)
     print(r2)
@@ -100,9 +117,7 @@ class DataStructure:
         "source": "baidu",
     }
     ner_input_example = "这句话一共有两个实体分别为大象和老鼠。"
-    ner_label_example = (
-            list("OOOOOOOOOOOOO") + ["B-s", "I-s"] + ["O"] + ["B-o", "I-o"] + ["O"]
-    )
+    ner_label_example = list("OOOOOOOOOOOOO") + ["B-s", "I-s"] + ["O"] + ["B-o", "I-o"] + ["O"]
 def text_jaccard(ipt1, ipt2, ipt_level="char", sim_level="char"):
@@ -156,7 +171,7 @@ class STEM(object):
             if each_srl:
                 args = []
                 for arg in each_srl:
-                    args.extend(seg[arg[1]: arg[2] + 1])
+                    args.extend(seg[arg[1] : arg[2] + 1])
                 # 添加上谓词
                 args.insert(each_srl[0][2] - each_srl[0][1] + 1, seg[wdx])
                 events.append(args)
@@ -195,7 +210,7 @@ def subject_object_labeling(spo_list, text):
         q_list_length = len(q_list)
         k_list_length = len(k_list)
         for idx in range(k_list_length - q_list_length + 1):
-            t = [q == k for q, k in zip(q_list, k_list[idx: idx + q_list_length])]
+            t = [q == k for q, k in zip(q_list, k_list[idx : idx + q_list_length])]
             # print(idx, t)
             if all(t):
                 # print(idx)
@@ -208,9 +223,7 @@ def subject_object_labeling(spo_list, text):
         if len(spo) == 2:
             labeling_list[idx_start + 1] = "I-" + spo_type
         elif len(spo) >= 3:
-            labeling_list[idx_start + 1: idx_start + len(spo)] = ["I-" + spo_type] * (
-                    len(spo) - 1
-            )
+            labeling_list[idx_start + 1 : idx_start + len(spo)] = ["I-" + spo_type] * (len(spo) - 1)
         else:
             pass
@@ -219,7 +232,7 @@ def subject_object_labeling(spo_list, text):
     # count = 0
     for predicate, spo_list_form in spo_predicate_dict.items():
         if predicate in text:
-            for (spo_subject, spo_object) in spo_list_form:
+            for spo_subject, spo_object in spo_list_form:
                 # if predicate not in spo_subject and predicate not in spo_object:
                 _labeling_type(spo_subject, "SUB")
                 _labeling_type(spo_object, "OBJ")
@@ -241,10 +254,7 @@ def label(text, labels):
     :return:
     """
     train_sequence = "\n".join(
-        [
-            "\t".join(i) if i[0] != " " else "[null]\t{}".format(i[1])
-            for i in zip(list(text), labels)
-        ]
+        ["\t".join(i) if i[0] != " " else "[null]\t{}".format(i[1]) for i in zip(list(text), labels)]
     )
     return train_sequence
@@ -260,16 +270,12 @@ def convert_crf_format_10_fold(corpus, objdir_path):
     split_position = int(len(corpus) / 10)
     for k in range(0, 10):
         if k == 9:
-            dev_set = corpus[k * split_position:]
+            dev_set = corpus[k * split_position :]
             train_set = corpus[: k * split_position]
         else:
-            dev_set = corpus[k * split_position: (k + 1) * split_position]
-            train_set = (
-                    corpus[: k * split_position] + corpus[(k + 1) * split_position:]
-            )
-        writetxt_w_list(
-            train_set, os.path.join(objdir_path, "train{}.txt".format(k + 1))
-        )
+            dev_set = corpus[k * split_position : (k + 1) * split_position]
+            train_set = corpus[: k * split_position] + corpus[(k + 1) * split_position :]
+        writetxt_w_list(train_set, os.path.join(objdir_path, "train{}.txt".format(k + 1)))
         writetxt_w_list(dev_set, os.path.join(objdir_path, "test{}.txt".format(k + 1)))
         writetxt_w_list(dev_set, os.path.join(objdir_path, "dev{}.txt".format(k + 1)))
@@ -305,31 +311,19 @@ def read_seq_res(path, labels):
     return text, raw_label, predict_label
-def kfold_txt(corpus, path, k=9, is_shuffle=True):
-    """
-    k是10份中训练集占了几份
-    """
-    j_mkdir(path)
-    if is_shuffle:
-        random.shuffle(corpus)
-    split_position = int(len(corpus) / 10)
-    train_set, dev_set = corpus[: k * split_position], corpus[k * split_position:]
-    writetxt_w_list(train_set, os.path.join(path, "train.tsv"), num_lf=1)
-    writetxt_w_list(dev_set, os.path.join(path, "test.tsv"), num_lf=1)
-    writetxt_w_list(dev_set, os.path.join(path, "dev.tsv"), num_lf=1)
 def sample():
     import pandas as pd
     from sklearn.model_selection import StratifiedShuffleSplit
     # 假设 df 是你的 DataFrame
-    df = pd.DataFrame({
-        "count_line": [i for i in range(100)],
-        "x": [i for i in range(100)],
-        "y": [i // 10 for i in range(100)],
-    })
+    df = pd.DataFrame(
+        {
+            "count_line": [i for i in range(100)],
+            "x": [i for i in range(100)],
+            "y": [i // 10 for i in range(100)],
+        }
+    )
     print(df)
     # count_line 是用于分层抽样的字段
@@ -337,7 +331,7 @@ def sample():
     split = StratifiedShuffleSplit(n_splits=1, test_size=0.1, random_state=42)
     # 获取训练集和测试集的索引
-    train_index, test_index = next(split.split(df, df['y']))
+    train_index, test_index = next(split.split(df, df["y"]))
     # 根据索引划分训练集和测试集
     train_df = df.loc[train_index]
@@ -348,6 +342,27 @@ def sample():
     print("测试集行数：", len(test_df))
+def kfold_txt(corpus, path, k=9, is_shuffle=True):
+    """
+    Split corpus into a training part and a held-out part and write them under path.
+    k is how many of the 10 equal parts go to the training set; the remainder is
+    written to both test.tsv and dev.tsv.
+    """
+    j_mkdir(path)
+    if is_shuffle:
+        # In-place shuffle: the caller's list is reordered too.
+        random.shuffle(corpus)
+    fold_size = int(len(corpus) / 10)
+    boundary = k * fold_size
+    train_set = corpus[:boundary]
+    dev_set = corpus[boundary:]
+    outputs = (("train.tsv", train_set), ("test.tsv", dev_set), ("dev.tsv", dev_set))
+    for file_name, rows in outputs:
+        writetxt_w_list(rows, os.path.join(path, file_name), num_lf=1)
+def kfold_list(list_data):
+    """
+    Placeholder, not implemented: does nothing and returns None.
+    The original note points at sklearn.model_selection.train_test_split.
+    """
 def kfold_df(df, save_dir=None):
     """
     划分train test val集， 写为windows可读的csv。
@@ -360,9 +375,7 @@ def kfold_df(df, save_dir=None):
     train_idx, test_and_val_idx = KFold(n_splits=8, shuffle=True).split(df).__next__()
     df_test_and_val = df.iloc[test_and_val_idx]
-    test_idx, val_idx = (
-        KFold(n_splits=2, shuffle=True).split(df_test_and_val).__next__()
-    )
+    test_idx, val_idx = KFold(n_splits=2, shuffle=True).split(df_test_and_val).__next__()
     df_train = df.iloc[train_idx]
     df_val = df.iloc[val_idx]
     df_test = df.iloc[test_idx]
@@ -439,7 +452,7 @@ def split_sentence(sentence, language="chinese", cross_line=True):
     for idx, char in enumerate(sentence):
         if idx == len(sentence) - 1:
             if char in split_signs:
-                sentences.append(sentence[start_idx: idx + 1].strip())
+                sentences.append(sentence[start_idx : idx + 1].strip())
                 start_idx = idx + 1
             else:
                 sentences.append(sentence[start_idx:].strip())
@@ -449,10 +462,10 @@ def split_sentence(sentence, language="chinese", cross_line=True):
                     if idx < len(sentence) - 2:
                         # 处理。”。
                         if sentence[idx + 2] not in split_signs:
-                            sentences.append(sentence[start_idx: idx + 2].strip())
+                            sentences.append(sentence[start_idx : idx + 2].strip())
                             start_idx = idx + 2
                 elif sentence[idx + 1] not in split_signs:
-                    sentences.append(sentence[start_idx: idx + 1].strip())
+                    sentences.append(sentence[start_idx : idx + 1].strip())
                     start_idx = idx + 1
     return sentences
@@ -528,6 +541,6 @@ if __name__ == "__main__":
         hidden_dim=4096,
         lay_number=28,
         attention_heads_num=32,
-        gpu_num=1
+        gpu_num=1,
     )
     print(res, "G")

nlpertools 1.0.9__py3-none-any.whl → 1.0.11__py3-none-any.whl

nlpertools-1.0.9-py3-none-any.whl → nlpertools-1.0.11-py3-none-any.whl