PyPI - nlpertools - Versions diffs - 1.0.10__tar.gz → 1.0.11__tar.gz - Mend

nlpertools 1.0.10tar.gz → 1.0.11tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (65) hide show

{nlpertools-1.0.10/src/nlpertools.egg-info → nlpertools-1.0.11}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: nlpertools
-Version: 1.0.10
+Version: 1.0.11
 Summary: A small package about small basic IO operation when coding
 Home-page: https://github.com/lvzii/nlpertools
 Author: youshuJi
@@ -64,6 +64,8 @@ json_data = nlpertools.load_from_json('res.json')
 ```bash
 ## git, 连接github不稳定的时候非常有用
 ncli git pull
+## 带有参数时，加上--以避免-u被解析
+ncli -- git push -u origin main
 # 生成pypi双因素认证的实时密钥(需要提供key)
 ncli --get_2fa --get_2fa_key your_key

{nlpertools-1.0.10 → nlpertools-1.0.11}/README.md RENAMED Viewed

@@ -40,6 +40,8 @@ json_data = nlpertools.load_from_json('res.json')
 ```bash
 ## git, 连接github不稳定的时候非常有用
 ncli git pull
+## 带有参数时，加上--以避免-u被解析
+ncli -- git push -u origin main
 # 生成pypi双因素认证的实时密钥(需要提供key)
 ncli --get_2fa --get_2fa_key your_key

{nlpertools-1.0.10 → nlpertools-1.0.11}/src/nlpertools/__init__.py RENAMED Viewed

@@ -4,6 +4,7 @@
 from .algo.kmp import *
 from .data_structure.base_structure import *
 from .draw import *
+from .dataprocess.dp_main import *
 from .dataprocess import *
 from .io.dir import *
 from .io.file import *
@@ -20,4 +21,4 @@ from .cli import *
 from .llm import *
-__version__ = "1.0.10"
+__version__ = "1.0.11"

{nlpertools-1.0.10 → nlpertools-1.0.11}/src/nlpertools/cli.py RENAMED Viewed

@@ -2,7 +2,7 @@ import argparse
 import os
 import uuid
 import sys
-from .dataprocess import startwith
+from .dataprocess.dp_main import startwith
 def run_git_command(command):
@@ -17,7 +17,7 @@ def run_git_command(command):
         info = os.system(command)
         print(str(info))
         # 检查命令执行结果，若未出现错误则认为执行成功
-        if not startwith(str(info), ["fatal", "error", "128", "1"]):
+        if (not startwith(str(info), ["fatal", "error", "128", "1"])) and "fatal" not in str(info):
             print("success")
             print(f"success info : ##{info}##")
             break
@@ -25,7 +25,7 @@ def run_git_command(command):
 def get_mac_address():
     mac = uuid.UUID(int=uuid.getnode()).hex[-12:]
-    mac_address = ":".join([mac[e:e + 2] for e in range(0, 11, 2)])
+    mac_address = ":".join([mac[e : e + 2] for e in range(0, 11, 2)])
     print("mac address 不一定准确")
     print(mac_address)
     return mac_address
@@ -33,6 +33,7 @@ def get_mac_address():
 def get_2af_value(key):
     import pyotp
     """
     key应该是7位的
     """
@@ -65,15 +66,11 @@ def start_gpu_usage_notify_client():
     from plyer import notification
     import time
-    SERVER_URL = 'http://127.0.0.1:5000/notify'  # 服务器的 API 地址
+    SERVER_URL = "http://127.0.0.1:5000/notify"  # 服务器的 API 地址
     def notify(text):
         # 使用 plyer 发送通知
-        notification.notify(
-            title='远程通知',
-            message=text,
-            timeout=10  # 10秒的通知显示时间
-        )
+        notification.notify(title="远程通知", message=text, timeout=10)  # 10秒的通知显示时间
     """定时轮询服务器获取通知"""
     while True:
@@ -94,12 +91,12 @@ def start_gpu_usage_notify_client():
 def main():
     parser = argparse.ArgumentParser(description="CLI tool for git operations and other functions.")
-    parser.add_argument('git_command', nargs='*', help='Any git command (e.g., push, pull)')
-    parser.add_argument('--mac_address', action='store_true', help='Get the MAC address.')
-    parser.add_argument('--get_2fa', action='store_true', help='Get the 2fa value.')
-    parser.add_argument('--get_2fa_key', type=str, help='Get the 2fa value.')
-    parser.add_argument('--monitor_gpu_cli', action='store_true', help='monitor gpu cli')
-    parser.add_argument('--monitor_gpu_ser', action='store_true', help='monitor gpu ser')
+    parser.add_argument("git_command", nargs="*", help="Any git command (e.g., push, pull)")
+    parser.add_argument("--mac_address", action="store_true", help="Get the MAC address.")
+    parser.add_argument("--get_2fa", action="store_true", help="Get the 2fa value.")
+    parser.add_argument("--get_2fa_key", type=str, help="Get the 2fa value.")
+    parser.add_argument("--monitor_gpu_cli", action="store_true", help="monitor gpu cli")
+    parser.add_argument("--monitor_gpu_ser", action="store_true", help="monitor gpu ser")
     args = parser.parse_args()
@@ -121,5 +118,5 @@ def main():
         print("No operation specified.")
-if __name__ == '__main__':
-    main()
+if __name__ == "__main__":
+    main()

nlpertools-1.0.11/src/nlpertools/dataprocess/__init__.py ADDED Viewed

@@ -0,0 +1 @@
+from .dedupl import *

nlpertools-1.0.11/src/nlpertools/dataprocess/dedupl.py ADDED Viewed

@@ -0,0 +1,9 @@
+# 根据字段对一个元素为dict的list去重
+def deduplicate_dict_list(dict_list: list, key: str) -> list:
+    seen = set()
+    result = []
+    for d in dict_list:
+        if key in d and d[key] not in seen:
+            seen.add(d[key])
+            result.append(d)
+    return result

nlpertools-1.0.10/src/nlpertools/dataprocess.py → nlpertools-1.0.11/src/nlpertools/dataprocess/dp_main.py RENAMED Viewed

@@ -8,7 +8,7 @@ from typing import List
 import numpy as np
 # from . import DB_CONFIG_FILE # cannot import name 'DB_CONFIG_FILE' from partially initialized module 'nlpertools'
-from .utils.package import *
+from ..utils.package import *
 main_special_characters = string.punctuation + string.digits + string.whitespace
 other_special_characters = (

{nlpertools-1.0.10 → nlpertools-1.0.11}/src/nlpertools/io/dir.py RENAMED Viewed

@@ -3,6 +3,7 @@
 # @Author  : youshu.Ji
 import os
 from pathlib import Path
+from typing import overload,Literal,Union
 # dir ----------------------------------------------------------------------
@@ -45,15 +46,34 @@ def get_filename(path, suffix=True) -> str:
         filename = filename.split('.')[0]
     return filename
-def listdir(dir_name, including_dir=True):
-    filenames = os.listdir(dir_name)
+"""
+因为os.listdir无法支持Path类型，虽然是bytelikepath,但是传入Path后只会返回字符串
+且无法只返回文件名
+故重新实现
+"""
+@overload
+def listdir(dir_name: Path, including_dir: Literal[True]) -> list[Path]: ...
+@overload
+def listdir(dir_name: str, including_dir: Literal[True]) -> list[str]: ...
+@overload
+def listdir(dir_name: Path, including_dir: Literal[False] = False) -> list[str]: ...
+@overload
+def listdir(dir_name: str, including_dir: Literal[False] = False) -> list[str]: ...
+def listdir(dir_name: Union[Path, str], including_dir: bool = False) -> list[Path] | list[str]:
+    """
+    including_dir=True -> list[Path] or list[str]
+    including_dir=False -> list[str]
+    """
+    filenames = os.listdir(str(dir_name))
     if including_dir:
-        return [os.path.join(dir_name, filename) for filename in filenames]
+        if isinstance(dir_name, Path):
+            return [dir_name / filename for filename in filenames]
+        else:
+            return [os.path.join(dir_name, filename) for filename in filenames]
     else:
         return list(filenames)
 def listdir_yield(dir_name, including_dir=True):
     filenames = os.listdir(dir_name)
     for filename in filenames:

{nlpertools-1.0.10 → nlpertools-1.0.11}/src/nlpertools/io/file.py RENAMED Viewed

@@ -5,8 +5,11 @@ import codecs
 import json
 import pickle
 import random
-from itertools import (takewhile, repeat)
+from itertools import takewhile, repeat
+from typing import Optional
+from pathlib import Path
 import pandas as pd
 # import omegaconf
 # import yaml
 from ..utils.package import *
@@ -15,18 +18,18 @@ LARGE_FILE_THRESHOLD = 1e5
 def safe_filename(filename: str) -> str:
-    for char in ['\\', '/', ':', '*', '?', '"', '<', '>', '|']:
-        filename = filename.replace(char, '_')
+    for char in ["\\", "/", ":", "*", "?", '"', "<", ">", "|"]:
+        filename = filename.replace(char, "_")
     return filename
 def read_yaml(path, omega=False):
     if omega:
         return omegaconf.OmegaConf.load(path)
-    return yaml.load(codecs.open(path, encoding='utf-8'), Loader=yaml.FullLoader)
+    return yaml.load(codecs.open(path, encoding="utf-8"), Loader=yaml.FullLoader)
-def _merge_file(filelist, save_filename, shuffle=False):
+def merge_file(filelist, save_filename, shuffle=False):
     contents = []
     for file in filelist:
         content = readtxt_list_all_strip(file)
@@ -43,9 +46,9 @@ def iter_count(file_name):
     author: unknown
     """
     buffer = 1024 * 1024
-    with codecs.open(file_name, 'r', 'utf-8') as f:
+    with codecs.open(file_name, "r", "utf-8") as f:
         buf_gen = takewhile(lambda x: x, (f.read(buffer) for _ in repeat(None)))
-        return sum(buf.count('\n') for buf in buf_gen)
+        return sum(buf.count("\n") for buf in buf_gen)
 # 需要加入进度条的函数包括
@@ -57,24 +60,24 @@ load_from_json
 # 读txt文件 一次全读完 返回list 去换行
-def readtxt_list_all_strip(path, encoding='utf-8') -> list:
+def readtxt_list_all_strip(path, encoding="utf-8") -> list:
     file_line_num = iter_count(path)
     lines = []
-    with codecs.open(path, 'r', encoding) as r:
+    with codecs.open(path, "r", encoding) as r:
         if file_line_num > LARGE_FILE_THRESHOLD:
             iter_obj = tqdm(enumerate(r.readlines()), total=file_line_num)
         else:
             iter_obj = enumerate(r.readlines())
         for ldx, line in iter_obj:
-            lines.append(line.strip('\n').strip("\r"))
+            lines.append(line.strip("\n").strip("\r"))
         return lines
 # 读txt 一次读一行 最后返回list
 def readtxt_list_each(path) -> list:
     lines = []
-    with codecs.open(path, 'r', 'utf-8') as r:
+    with codecs.open(path, "r", "utf-8") as r:
         line = r.readline()
         while line:
             lines.append(line)
@@ -82,11 +85,11 @@ def readtxt_list_each(path) -> list:
     return lines
-def readtxt_list_each_strip(path) -> list:
+def readtxt_list_each_strip(path: Optional[str | Path]):
     """
     yield方法
     """
-    with codecs.open(path, 'r', 'utf-8') as r:
+    with codecs.open(path, "r", "utf-8") as r:
         line = r.readline()
         while line:
             yield line.strip("\n").strip("\r")
@@ -95,51 +98,51 @@ def readtxt_list_each_strip(path) -> list:
 # 读txt文件 一次全读完 返回list
 def readtxt_list_all(path) -> list:
-    with codecs.open(path, 'r', 'utf-8') as r:
+    with codecs.open(path, "r", "utf-8") as r:
         lines = r.readlines()
         return lines
 # 读byte文件 读成一条string
 def readtxt_byte(path, encoding="utf-8") -> str:
-    with codecs.open(path, 'rb') as r:
+    with codecs.open(path, "rb") as r:
         lines = r.read()
         lines = lines.decode(encoding)
-        return lines.replace('\r', '')
+        return lines.replace("\r", "")
 # 读txt文件 读成一条string
-def readtxt_string(path, encoding="utf-8") -> str:
-    with codecs.open(path, 'r', encoding) as r:
+def read_text(path, encoding="utf-8") -> str:
+    with codecs.open(path, "r", encoding) as r:
         lines = r.read()
-        return lines.replace('\r', '')
+        return lines.replace("\r", "")
 # 写txt文件覆盖
-def writetxt_w(txt, path, r='w'):
-    with codecs.open(path, r, 'utf-8') as w:
+def writetxt_w(txt, path, r="w"):
+    with codecs.open(path, r, "utf-8") as w:
         w.writelines(txt)
 # 写txt文件追加
 def writetxt_a(txt, path):
-    with codecs.open(path, 'a', 'utf-8') as w:
+    with codecs.open(path, "a", "utf-8") as w:
         w.writelines(txt)
 def writetxt(txt, path, encoding="utf-8"):
-    with codecs.open(path, 'w', encoding) as w:
+    with codecs.open(path, "w", encoding) as w:
         w.write(txt)
 def writetxt_wb(txt, path):
-    with codecs.open(path, 'wb') as w:
+    with codecs.open(path, "wb") as w:
         w.write(txt)
 # 写list 覆盖
 def writetxt_w_list(list, path, num_lf=1):
-    with codecs.open(path, 'w', "utf-8") as w:
+    with codecs.open(path, "w", "utf-8") as w:
         for i in list:
             w.write(i)
             w.write("\n" * num_lf)
@@ -147,7 +150,7 @@ def writetxt_w_list(list, path, num_lf=1):
 # 写list 追加
 def writetxt_a_list(list, path, num_lf=2):
-    with codecs.open(path, 'a', "utf-8") as w:
+    with codecs.open(path, "a", "utf-8") as w:
         for i in list:
             w.write(i)
             w.write("\n" * num_lf)
@@ -158,7 +161,7 @@ def save_to_json(content, path):
         json.dump(content, w, ensure_ascii=False, indent=1)
-def load_from_json(path):
+def load_from_json(path: Optional[str | Path]):
     with codecs.open(path, "r", "utf-8") as r:
         content = json.load(r)
         return content
@@ -167,60 +170,60 @@ def load_from_json(path):
 # 读txt文件 读成一条string if gb2312
 def readtxt_string_all_encoding(path):
     try:
-        with codecs.open(path, 'rb', "utf-8-sig") as r:
+        with codecs.open(path, "rb", "utf-8-sig") as r:
             lines = r.read()
             return lines
     except:
         try:
-            with codecs.open(path, 'rb', "utf-8") as r:
+            with codecs.open(path, "rb", "utf-8") as r:
                 lines = r.reacd()
                 return lines
         except:
             try:
-                with codecs.open(path, 'rb', "big5") as r:
+                with codecs.open(path, "rb", "big5") as r:
                     lines = r.read()
                     return lines
             except:
                 print(path)
-                with codecs.open(path, 'rb', "gb2312", errors='ignore') as r:
+                with codecs.open(path, "rb", "gb2312", errors="ignore") as r:
                     lines = r.read()
                     return lines
 def readtxt_list_all_encoding(path):
     try:
-        with codecs.open(path, 'rb', "utf-8-sig") as r:
+        with codecs.open(path, "rb", "utf-8-sig") as r:
             lines = r.readlines()
             return lines
     except:
         try:
-            with codecs.open(path, 'rb', "utf-8") as r:
+            with codecs.open(path, "rb", "utf-8") as r:
                 lines = r.readlines()
                 return lines
         except:
             try:
-                with codecs.open(path, 'rb', "big5") as r:
+                with codecs.open(path, "rb", "big5") as r:
                     lines = r.readlines()
                     return lines
             except:
-                with codecs.open(path, 'rb', "gb2312", errors='ignore') as r:
+                with codecs.open(path, "rb", "gb2312", errors="ignore") as r:
                     lines = r.readlines()
                     return lines
 # line by line
 def save_to_jsonl(corpus, path):
-    with open(path, 'w', encoding='utf-8') as wt:
+    with open(path, "w", encoding="utf-8") as wt:
         for i in corpus:
             wt.write(json.dumps(i, ensure_ascii=False))
-            wt.write('\n')
+            wt.write("\n")
 # line by line
 def load_from_jsonl(path):
     file_line_num = iter_count(path)
     if file_line_num > 1e5:
-        with open(path, 'r', encoding='utf-8') as rd:
+        with open(path, "r", encoding="utf-8") as rd:
             corpus = []
             while True:
                 line = rd.readline()
@@ -230,7 +233,7 @@ def load_from_jsonl(path):
                     break
         return corpus
     else:
-        with open(path, 'r', encoding='utf-8') as rd:
+        with open(path, "r", encoding="utf-8") as rd:
             corpus = []
             while True:
                 line = rd.readline()
@@ -242,20 +245,20 @@ def load_from_jsonl(path):
 def save_pkl(data, path):
-    with open(path, 'wb') as f:
+    with open(path, "wb") as f:
         pickle.dump(data, f)
 def load_pkl(path):
-    with open(path, 'rb') as f:
+    with open(path, "rb") as f:
         data = pickle.load(f)
     return data
 def save_to_csv(df, save_path, index_flag=False):
-    with open(save_path, 'wb+') as csvfile:
+    with open(save_path, "wb+") as csvfile:
         csvfile.write(codecs.BOM_UTF8)
-    df.to_csv(save_path, mode='a', index=index_flag)
+    df.to_csv(save_path, mode="a", index=index_flag)
 def save_to_mongo():

nlpertools-1.0.11/src/nlpertools/llm/__init__.py ADDED Viewed

@@ -0,0 +1,3 @@
+from .call_llm_once import *
+from .infer import *
+from .price import *

nlpertools-1.0.11/src/nlpertools/llm/call_llm_once.py ADDED Viewed

@@ -0,0 +1,60 @@
+from ..io.file import read_yaml
+from tqdm import tqdm
+import os
+from typing import Optional, Union
+"""
+从你当前的项目里找到.key文件 获取url和key
+"""
+def call_once_stream(
+    client, input: Optional[Union[str, list]], model_name: str = "qwen3-0626-e4", max_tokens: int = 8192, temperature=0.2
+) -> str:
+    """
+    调用LLM模型进行一次推理
+    :param prompt: 输入的提示文本
+    :param model_name: 模型名称
+    :param max_tokens: 最大输出token数
+    :return: 模型的输出文本
+    """
+    from openai import OpenAI
+    if isinstance(input, str):
+        message = [{"role": "user", "content": input}]
+    elif isinstance(input, list):
+        message = input
+    completion = client.chat.completions.create(model=model_name, messages=message, max_tokens=max_tokens, stream=True)
+    text = ""
+    for chunk in completion:
+        if chunk.choices:
+            c = chunk.choices[0].delta.content or ""
+            text += c
+            print(c, end="")
+        else:
+            print()
+            print(chunk.usage)
+    return text
+def call_once(
+    client, input: Optional[Union[str, list]], model_name: str = "qwen3-0626-e4", max_tokens: int = 8192, temperature=0.8
+) -> str:
+    """
+    调用LLM模型进行一次推理
+    :param prompt: 输入的提示文本
+    :param model_name: 模型名称
+    :param max_tokens: 最大输出token数
+    :return: 模型的输出文本
+    """
+    from openai import OpenAI
+    if isinstance(input, str):
+        message = [{"role": "user", "content": input}]
+    elif isinstance(input, list):
+        message = input
+    response = client.chat.completions.create(model=model_name, messages=message, max_tokens=max_tokens,temperature=temperature)
+    return response.choices[0].message.content

{nlpertools-1.0.10 → nlpertools-1.0.11}/src/nlpertools/llm/infer.py RENAMED Viewed

@@ -1,7 +1,7 @@
 import os
 from tqdm import tqdm
-from openai import OpenAI
 import concurrent.futures
+import itertools
 INFER_PARAS = {
@@ -15,14 +15,17 @@ INFER_PARAS = {
 def parse_infer_data(infer_data: list):
+    # 解释一下为什么要[][]，因为message本来就必须得是[]
     if isinstance(infer_data[0], str):
-        message = [{"role": "user", "content": i} for i in infer_data]
+        message = [[{"role": "user", "content": i}] for i in infer_data]
     elif isinstance(infer_data[0], list):
         message = infer_data
     return message
-def common_api_infer_func(model_name, infer_data: list, infer_paras, client: OpenAI):
+def common_api_infer_func(model_name, infer_data: list, infer_paras, client):
+    from openai import OpenAI
     """
     infer_data: list of messages/prompt
     """
@@ -31,16 +34,58 @@ def common_api_infer_func(model_name, infer_data: list, infer_paras, client: Ope
     def get_response(model_name, messages, infer_paras):
         responses = []
         infer_times = infer_paras.get("infer_times", 1)
         for _ in range(infer_times):
             # 使用OpenAI API进行推理
-            response = client.chat.completions.create(model=model_name, messages=messages, **infer_paras)
+            response = client.chat.completions.create(
+                model=model_name,
+                messages=messages,
+                temperature=infer_paras.get("temperature", 0.7),
+                max_tokens=infer_paras.get("max_tokens", 8192),
+            )
             text = response.choices[0].message.content
             responses.append({"text": text})
         return responses
     with concurrent.futures.ThreadPoolExecutor(16) as executor:
         futures = [executor.submit(get_response, model_name, message, infer_paras) for message in messages]
-        results = [future.result() for future in concurrent.futures.as_completed(futures)]
+        # results = [future.result() for future in tqdm(concurrent.futures.as_completed(futures))] # 乱序
+        results = [future.result() for future in tqdm(futures)]
+    return results
+def common_api_infer_func_multi_client(model_name, infer_data: list, infer_paras, clients: list):
+    """
+    infer_data: list of messages/prompt
+    """
+    messages = parse_infer_data(infer_data)
+    iter_cycle = itertools.cycle(clients)
+    def get_response(model_name, messages, infer_paras):
+        client = next(iter_cycle)
+        # print(client.base_url)
+        responses = []
+        infer_times = infer_paras.get("infer_times", 1)
+        for _ in range(infer_times):
+            # 使用OpenAI API进行推理
+            try:
+                response = client.chat.completions.create(
+                    model=model_name,
+                    messages=messages,
+                    temperature=infer_paras.get("temperature", 0.7),
+                    max_tokens=infer_paras.get("max_tokens", 8192),
+                )
+                text = response.choices[0].message.content
+            except Exception as e:
+                print(e.__str__())
+                text = ""
+            responses.append({"text": text})
+        return responses
+    with concurrent.futures.ThreadPoolExecutor(128) as executor:
+        futures = [executor.submit(get_response, model_name, message, infer_paras) for message in messages]
+        results = [future.result() for future in tqdm(futures)]
     return results

nlpertools 1.0.10__tar.gz → 1.0.11__tar.gz

nlpertools 1.0.10tar.gz → 1.0.11tar.gz