nlpertools 1.0.9__py3-none-any.whl → 1.0.11__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
nlpertools/__init__.py CHANGED
@@ -4,6 +4,7 @@
 from .algo.kmp import *
 from .data_structure.base_structure import *
 from .draw import *
+from .dataprocess.dp_main import *
 from .dataprocess import *
 from .io.dir import *
 from .io.file import *
@@ -17,7 +18,7 @@ from .utils_for_nlpertools import *
 from .wrapper import *
 from .monitor import *
 from .cli import *
+from .llm import *
 
 
-
-__version__ = '1.0.9'
+__version__ = "1.0.11"
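
Because `__init__.py` re-exports the new modules with star imports, helpers defined in `dataprocess.dp_main` (and the `llm` subpackage) become reachable from the package top level after this upgrade. A minimal sketch, assuming `dp_main` exposes `startwith` without an `__all__` that hides it (see its hunk further down):

import nlpertools

print(nlpertools.__version__)  # "1.0.11" in this release

# startwith is defined in dataprocess/dp_main.py and re-exported at the top level
print(nlpertools.startwith("fatal: not a git repository", ["fatal", "error"]))  # True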
nlpertools/cli.py CHANGED
@@ -2,45 +2,30 @@ import argparse
 import os
 import uuid
 import sys
+from .dataprocess.dp_main import startwith
 
-"""
-如何Debug cli.py
-"""
 
-
-def git_push():
-    """
-    针对国内提交github经常失败,自动提交
-    """
-    num = -1
-    while 1:
-        num += 1
-        print("retry num: {}".format(num))
-        info = os.system("git push --set-upstream origin main")
-        print(str(info))
-        if not str(info).startswith("fatal"):
-            print("scucess")
-            break
-
-
-def git_pull():
+def run_git_command(command):
     """
-    针对国内提交github经常失败,自动提交
+    循环执行git命令,直到成功
     """
+    print(command)
     num = -1
-    while 1:
+    while True:
         num += 1
-        print("retry num: {}".format(num))
-        info = os.system("git pull")
+        print(f"retry num: {num}")
+        info = os.system(command)
         print(str(info))
-        if not str(info).startswith("fatal") and not str(info).startswith("error"):
-            print("scucess")
+        # 检查命令执行结果,若未出现错误则认为执行成功
+        if (not startwith(str(info), ["fatal", "error", "128", "1"])) and "fatal" not in str(info):
+            print("success")
+            print(f"success info : ##{info}##")
             break
 
 
 def get_mac_address():
     mac = uuid.UUID(int=uuid.getnode()).hex[-12:]
-    mac_address = ":".join([mac[e:e + 2] for e in range(0, 11, 2)])
+    mac_address = ":".join([mac[e : e + 2] for e in range(0, 11, 2)])
     print("mac address 不一定准确")
     print(mac_address)
     return mac_address
@@ -48,6 +33,7 @@ def get_mac_address():
 
 def get_2af_value(key):
     import pyotp
+
     """
     key应该是7位的
    """
@@ -80,15 +66,11 @@ def start_gpu_usage_notify_client():
     from plyer import notification
     import time
 
-    SERVER_URL = 'http://127.0.0.1:5000/notify'  # 服务器的 API 地址
+    SERVER_URL = "http://127.0.0.1:5000/notify"  # 服务器的 API 地址
 
     def notify(text):
         # 使用 plyer 发送通知
-        notification.notify(
-            title='远程通知',
-            message=text,
-            timeout=10  # 10秒的通知显示时间
-        )
+        notification.notify(title="远程通知", message=text, timeout=10)  # 10秒的通知显示时间
 
     """定时轮询服务器获取通知"""
     while True:
@@ -108,22 +90,19 @@ def start_gpu_usage_notify_client():
 
 
 def main():
-    parser = argparse.ArgumentParser(description="CLI tool for git operations and getting MAC address.")
-    parser.add_argument('--gitpush', action='store_true', help='Perform git push operation.')
-    parser.add_argument('--gitpull', action='store_true', help='Perform git pull operation.')
-    parser.add_argument('--mac_address', action='store_true', help='Get the MAC address.')
-
-    parser.add_argument('--get_2fa', action='store_true', help='Get the 2fa value.')
-    parser.add_argument('--get_2fa_key', type=str, help='Get the 2fa value.')
-    parser.add_argument('--monitor_gpu_cli', action='store_true', help='Get the 2fa value.')
-    parser.add_argument('--monitor_gpu_ser', action='store_true', help='Get the 2fa value.')
+    parser = argparse.ArgumentParser(description="CLI tool for git operations and other functions.")
+    parser.add_argument("git_command", nargs="*", help="Any git command (e.g., push, pull)")
+    parser.add_argument("--mac_address", action="store_true", help="Get the MAC address.")
+    parser.add_argument("--get_2fa", action="store_true", help="Get the 2fa value.")
+    parser.add_argument("--get_2fa_key", type=str, help="Get the 2fa value.")
+    parser.add_argument("--monitor_gpu_cli", action="store_true", help="monitor gpu cli")
+    parser.add_argument("--monitor_gpu_ser", action="store_true", help="monitor gpu ser")
 
     args = parser.parse_args()
 
-    if args.gitpush:
-        git_push()
-    elif args.gitpull:
-        git_pull()
+    if args.git_command:
+        git_cmd = " ".join(args.git_command)
+        run_git_command(git_cmd)
     elif args.mac_address:
         get_mac_address()
     elif args.monitor_gpu_cli:
@@ -139,5 +118,5 @@ def main():
         print("No operation specified.")
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     main()
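
The old `git_push`/`git_pull` helpers are folded into `run_git_command`, which retries whatever command string it is given. A minimal sketch of calling it directly from Python; the retry loop and the success check are the ones in the hunk above, and the command string is just an example:

from nlpertools.cli import run_git_command

# Retries the command via os.system until the return value no longer looks
# like a failure, then prints a success message and stops.
run_git_command("git push --set-upstream origin main")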
@@ -0,0 +1 @@
+from .dedupl import *
@@ -0,0 +1,9 @@
+# 根据字段对一个元素为dict的list去重
+def deduplicate_dict_list(dict_list: list, key: str) -> list:
+    seen = set()
+    result = []
+    for d in dict_list:
+        if key in d and d[key] not in seen:
+            seen.add(d[key])
+            result.append(d)
+    return result
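
A quick usage sketch of `deduplicate_dict_list` from the new file above. The import path is an assumption: it presumes the file lands in the new `dataprocess` subpackage that the `from .dedupl import *` line points at, since this diff does not show the new file names.

from nlpertools.dataprocess.dedupl import deduplicate_dict_list  # hypothetical path, see note above

records = [
    {"id": 1, "text": "a"},
    {"id": 1, "text": "b"},  # dropped: duplicate "id"
    {"text": "c"},           # dropped: missing the "id" key
]
print(deduplicate_dict_list(records, key="id"))  # [{'id': 1, 'text': 'a'}]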
@@ -8,7 +8,7 @@ from typing import List
 import numpy as np
 
 # from . import DB_CONFIG_FILE  # cannot import name 'DB_CONFIG_FILE' from partially initialized module 'nlpertools'
-from .utils.package import *
+from ..utils.package import *
 
 main_special_characters = string.punctuation + string.digits + string.whitespace
 other_special_characters = (
@@ -19,6 +19,18 @@ other_special_characters = (
     "」﴾》"
 )
 
+def startwith(text: str, pattern_list: list) -> bool:
+    """
+    判断text是否以pattern_list中的某个pattern开头
+    :param text:
+    :param pattern_list:
+    :return:
+    """
+    for pattern in pattern_list:
+        if text.startswith(pattern):
+            return True
+    return False
+
 
 class Pattern:
     """
nlpertools/io/dir.py CHANGED
@@ -3,6 +3,7 @@
 # @Author : youshu.Ji
 import os
 from pathlib import Path
+from typing import overload,Literal,Union
 
 
 # dir ----------------------------------------------------------------------
@@ -45,15 +46,34 @@ def get_filename(path, suffix=True) -> str:
         filename = filename.split('.')[0]
     return filename
 
-
-def listdir(dir_name, including_dir=True):
-    filenames = os.listdir(dir_name)
+"""
+因为os.listdir无法支持Path类型,虽然是bytelikepath,但是传入Path后只会返回字符串
+且无法只返回文件名
+故重新实现
+"""
+@overload
+def listdir(dir_name: Path, including_dir: Literal[True]) -> list[Path]: ...
+@overload
+def listdir(dir_name: str, including_dir: Literal[True]) -> list[str]: ...
+@overload
+def listdir(dir_name: Path, including_dir: Literal[False] = False) -> list[str]: ...
+@overload
+def listdir(dir_name: str, including_dir: Literal[False] = False) -> list[str]: ...
+
+def listdir(dir_name: Union[Path, str], including_dir: bool = False) -> list[Path] | list[str]:
+    """
+    including_dir=True -> list[Path] or list[str]
+    including_dir=False -> list[str]
+    """
+    filenames = os.listdir(str(dir_name))
     if including_dir:
-        return [os.path.join(dir_name, filename) for filename in filenames]
+        if isinstance(dir_name, Path):
+            return [dir_name / filename for filename in filenames]
+        else:
+            return [os.path.join(dir_name, filename) for filename in filenames]
     else:
         return list(filenames)
 
-
 def listdir_yield(dir_name, including_dir=True):
     filenames = os.listdir(dir_name)
     for filename in filenames:
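
A minimal sketch of how the overloaded `listdir` behaves, following the implementation above; the directory name is just an example. With the default `including_dir=False` it returns bare file names, and with `including_dir=True` the element type follows the argument type.

from pathlib import Path
from nlpertools.io.dir import listdir

names = listdir("some_dir")                               # list[str] of bare file names
as_paths = listdir(Path("some_dir"), including_dir=True)  # list[Path], joined with "/"
as_strs = listdir("some_dir", including_dir=True)         # list[str], joined with os.path.join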
nlpertools/io/file.py CHANGED
@@ -5,8 +5,11 @@ import codecs
 import json
 import pickle
 import random
-from itertools import (takewhile, repeat)
+from itertools import takewhile, repeat
+from typing import Optional
+from pathlib import Path
 import pandas as pd
+
 # import omegaconf
 # import yaml
 from ..utils.package import *
@@ -15,18 +18,18 @@ LARGE_FILE_THRESHOLD = 1e5
 
 
 def safe_filename(filename: str) -> str:
-    for char in ['\\', '/', ':', '*', '?', '"', '<', '>', '|']:
-        filename = filename.replace(char, '_')
+    for char in ["\\", "/", ":", "*", "?", '"', "<", ">", "|"]:
+        filename = filename.replace(char, "_")
     return filename
 
 
 def read_yaml(path, omega=False):
     if omega:
         return omegaconf.OmegaConf.load(path)
-    return yaml.load(codecs.open(path, encoding='utf-8'), Loader=yaml.FullLoader)
+    return yaml.load(codecs.open(path, encoding="utf-8"), Loader=yaml.FullLoader)
 
 
-def _merge_file(filelist, save_filename, shuffle=False):
+def merge_file(filelist, save_filename, shuffle=False):
     contents = []
     for file in filelist:
         content = readtxt_list_all_strip(file)
@@ -43,9 +46,9 @@ def iter_count(file_name):
     author: unknown
     """
     buffer = 1024 * 1024
-    with codecs.open(file_name, 'r', 'utf-8') as f:
+    with codecs.open(file_name, "r", "utf-8") as f:
         buf_gen = takewhile(lambda x: x, (f.read(buffer) for _ in repeat(None)))
-        return sum(buf.count('\n') for buf in buf_gen)
+        return sum(buf.count("\n") for buf in buf_gen)
 
 
 # 需要加入进度条的函数包括
@@ -57,24 +60,24 @@ load_from_json
 
 
 # 读txt文件 一次全读完 返回list 去换行
-def readtxt_list_all_strip(path, encoding='utf-8') -> list:
+def readtxt_list_all_strip(path, encoding="utf-8") -> list:
     file_line_num = iter_count(path)
     lines = []
-    with codecs.open(path, 'r', encoding) as r:
+    with codecs.open(path, "r", encoding) as r:
         if file_line_num > LARGE_FILE_THRESHOLD:
             iter_obj = tqdm(enumerate(r.readlines()), total=file_line_num)
         else:
             iter_obj = enumerate(r.readlines())
 
         for ldx, line in iter_obj:
-            lines.append(line.strip('\n').strip("\r"))
+            lines.append(line.strip("\n").strip("\r"))
     return lines
 
 
 # 读txt 一次读一行 最后返回list
 def readtxt_list_each(path) -> list:
     lines = []
-    with codecs.open(path, 'r', 'utf-8') as r:
+    with codecs.open(path, "r", "utf-8") as r:
         line = r.readline()
         while line:
             lines.append(line)
@@ -82,11 +85,11 @@ def readtxt_list_each(path) -> list:
     return lines
 
 
-def readtxt_list_each_strip(path) -> list:
+def readtxt_list_each_strip(path: Optional[str | Path]):
     """
     yield方法
     """
-    with codecs.open(path, 'r', 'utf-8') as r:
+    with codecs.open(path, "r", "utf-8") as r:
         line = r.readline()
         while line:
             yield line.strip("\n").strip("\r")
@@ -95,51 +98,51 @@ def readtxt_list_each_strip(path) -> list:
 
 # 读txt文件 一次全读完 返回list
 def readtxt_list_all(path) -> list:
-    with codecs.open(path, 'r', 'utf-8') as r:
+    with codecs.open(path, "r", "utf-8") as r:
         lines = r.readlines()
         return lines
 
 
 # 读byte文件 读成一条string
 def readtxt_byte(path, encoding="utf-8") -> str:
-    with codecs.open(path, 'rb') as r:
+    with codecs.open(path, "rb") as r:
         lines = r.read()
         lines = lines.decode(encoding)
-        return lines.replace('\r', '')
+        return lines.replace("\r", "")
 
 
 # 读txt文件 读成一条string
-def readtxt_string(path, encoding="utf-8") -> str:
-    with codecs.open(path, 'r', encoding) as r:
+def read_text(path, encoding="utf-8") -> str:
+    with codecs.open(path, "r", encoding) as r:
         lines = r.read()
-        return lines.replace('\r', '')
+        return lines.replace("\r", "")
 
 
 # 写txt文件覆盖
-def writetxt_w(txt, path, r='w'):
-    with codecs.open(path, r, 'utf-8') as w:
+def writetxt_w(txt, path, r="w"):
+    with codecs.open(path, r, "utf-8") as w:
         w.writelines(txt)
 
 
 # 写txt文件追加
 def writetxt_a(txt, path):
-    with codecs.open(path, 'a', 'utf-8') as w:
+    with codecs.open(path, "a", "utf-8") as w:
         w.writelines(txt)
 
 
 def writetxt(txt, path, encoding="utf-8"):
-    with codecs.open(path, 'w', encoding) as w:
+    with codecs.open(path, "w", encoding) as w:
         w.write(txt)
 
 
 def writetxt_wb(txt, path):
-    with codecs.open(path, 'wb') as w:
+    with codecs.open(path, "wb") as w:
         w.write(txt)
 
 
 # 写list 覆盖
 def writetxt_w_list(list, path, num_lf=1):
-    with codecs.open(path, 'w', "utf-8") as w:
+    with codecs.open(path, "w", "utf-8") as w:
         for i in list:
             w.write(i)
             w.write("\n" * num_lf)
@@ -147,7 +150,7 @@ def writetxt_w_list(list, path, num_lf=1):
 
 # 写list 追加
 def writetxt_a_list(list, path, num_lf=2):
-    with codecs.open(path, 'a', "utf-8") as w:
+    with codecs.open(path, "a", "utf-8") as w:
         for i in list:
             w.write(i)
             w.write("\n" * num_lf)
@@ -158,7 +161,7 @@ def save_to_json(content, path):
         json.dump(content, w, ensure_ascii=False, indent=1)
 
 
-def load_from_json(path):
+def load_from_json(path: Optional[str | Path]):
     with codecs.open(path, "r", "utf-8") as r:
         content = json.load(r)
         return content
@@ -167,60 +170,60 @@ def load_from_json(path):
 
 # 读txt文件 读成一条string if gb2312
 def readtxt_string_all_encoding(path):
     try:
-        with codecs.open(path, 'rb', "utf-8-sig") as r:
+        with codecs.open(path, "rb", "utf-8-sig") as r:
             lines = r.read()
             return lines
     except:
         try:
-            with codecs.open(path, 'rb', "utf-8") as r:
+            with codecs.open(path, "rb", "utf-8") as r:
                 lines = r.reacd()
                 return lines
         except:
             try:
-                with codecs.open(path, 'rb', "big5") as r:
+                with codecs.open(path, "rb", "big5") as r:
                     lines = r.read()
                     return lines
             except:
                 print(path)
-                with codecs.open(path, 'rb', "gb2312", errors='ignore') as r:
+                with codecs.open(path, "rb", "gb2312", errors="ignore") as r:
                     lines = r.read()
                     return lines
 
 
 def readtxt_list_all_encoding(path):
     try:
-        with codecs.open(path, 'rb', "utf-8-sig") as r:
+        with codecs.open(path, "rb", "utf-8-sig") as r:
             lines = r.readlines()
             return lines
     except:
         try:
-            with codecs.open(path, 'rb', "utf-8") as r:
+            with codecs.open(path, "rb", "utf-8") as r:
                 lines = r.readlines()
                 return lines
         except:
             try:
-                with codecs.open(path, 'rb', "big5") as r:
+                with codecs.open(path, "rb", "big5") as r:
                     lines = r.readlines()
                     return lines
             except:
-                with codecs.open(path, 'rb', "gb2312", errors='ignore') as r:
+                with codecs.open(path, "rb", "gb2312", errors="ignore") as r:
                     lines = r.readlines()
                    return lines
 
 
 # line by line
 def save_to_jsonl(corpus, path):
-    with open(path, 'w', encoding='utf-8') as wt:
+    with open(path, "w", encoding="utf-8") as wt:
         for i in corpus:
             wt.write(json.dumps(i, ensure_ascii=False))
-            wt.write('\n')
+            wt.write("\n")
 
 
 # line by line
 def load_from_jsonl(path):
     file_line_num = iter_count(path)
     if file_line_num > 1e5:
-        with open(path, 'r', encoding='utf-8') as rd:
+        with open(path, "r", encoding="utf-8") as rd:
             corpus = []
             while True:
                 line = rd.readline()
@@ -230,7 +233,7 @@ def load_from_jsonl(path):
                     break
             return corpus
     else:
-        with open(path, 'r', encoding='utf-8') as rd:
+        with open(path, "r", encoding="utf-8") as rd:
             corpus = []
             while True:
                 line = rd.readline()
@@ -242,20 +245,20 @@
 
 
 def save_pkl(data, path):
-    with open(path, 'wb') as f:
+    with open(path, "wb") as f:
         pickle.dump(data, f)
 
 
 def load_pkl(path):
-    with open(path, 'rb') as f:
+    with open(path, "rb") as f:
         data = pickle.load(f)
         return data
 
 
 def save_to_csv(df, save_path, index_flag=False):
-    with open(save_path, 'wb+') as csvfile:
+    with open(save_path, "wb+") as csvfile:
         csvfile.write(codecs.BOM_UTF8)
-    df.to_csv(save_path, mode='a', index=index_flag)
+    df.to_csv(save_path, mode="a", index=index_flag)
 
 
 def save_to_mongo():
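
A short usage sketch of the jsonl round trip and the renamed `read_text`, based on the functions above; the file name is just an example, and `load_from_jsonl` is expected to parse one JSON object per line:

from nlpertools.io.file import save_to_jsonl, load_from_jsonl, read_text

rows = [{"q": "hello"}, {"q": "world"}]
save_to_jsonl(rows, "demo.jsonl")       # writes one JSON object per line
data = load_from_jsonl("demo.jsonl")    # expected to round-trip back to rows
print(read_text("demo.jsonl"))          # whole file as a single string, "\r" removed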
@@ -0,0 +1,3 @@
+from .call_llm_once import *
+from .infer import *
+from .price import *
@@ -0,0 +1,60 @@
+from ..io.file import read_yaml
+from tqdm import tqdm
+import os
+from typing import Optional, Union
+
+"""
+从你当前的项目里找到.key文件 获取url和key
+"""
+
+
+def call_once_stream(
+    client, input: Optional[Union[str, list]], model_name: str = "qwen3-0626-e4", max_tokens: int = 8192, temperature=0.2
+) -> str:
+    """
+    调用LLM模型进行一次推理
+    :param prompt: 输入的提示文本
+    :param model_name: 模型名称
+    :param max_tokens: 最大输出token数
+    :return: 模型的输出文本
+    """
+    from openai import OpenAI
+
+    if isinstance(input, str):
+        message = [{"role": "user", "content": input}]
+    elif isinstance(input, list):
+        message = input
+
+    completion = client.chat.completions.create(model=model_name, messages=message, max_tokens=max_tokens, stream=True)
+    text = ""
+    for chunk in completion:
+        if chunk.choices:
+            c = chunk.choices[0].delta.content or ""
+            text += c
+            print(c, end="")
+        else:
+            print()
+            print(chunk.usage)
+    return text
+
+
+def call_once(
+    client, input: Optional[Union[str, list]], model_name: str = "qwen3-0626-e4", max_tokens: int = 8192, temperature=0.8
+) -> str:
+    """
+    调用LLM模型进行一次推理
+    :param prompt: 输入的提示文本
+    :param model_name: 模型名称
+    :param max_tokens: 最大输出token数
+    :return: 模型的输出文本
+    """
+    from openai import OpenAI
+
+    if isinstance(input, str):
+        message = [{"role": "user", "content": input}]
+    elif isinstance(input, list):
+        message = input
+
+    response = client.chat.completions.create(model=model_name, messages=message, max_tokens=max_tokens,temperature=temperature)
+
+    return response.choices[0].message.content
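
A minimal sketch of driving `call_once` / `call_once_stream` with an OpenAI-compatible client. The base URL and API key below are placeholders, the module path is assumed from the `from .call_llm_once import *` line above, and the default model name comes from the signatures in the new file:

from openai import OpenAI
from nlpertools.llm.call_llm_once import call_once, call_once_stream  # assumed module path, see note above

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")  # placeholder endpoint and key

answer = call_once(client, "用一句话介绍nlpertools", model_name="qwen3-0626-e4")
print(answer)

# The streaming variant prints chunks as they arrive and returns the concatenated text.
text = call_once_stream(client, [{"role": "user", "content": "hello"}])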