nlpertools 1.0.9__py3-none-any.whl → 1.0.11__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
nlpertools/__init__.py CHANGED
@@ -4,6 +4,7 @@
 from .algo.kmp import *
 from .data_structure.base_structure import *
 from .draw import *
+from .dataprocess.dp_main import *
 from .dataprocess import *
 from .io.dir import *
 from .io.file import *
@@ -17,7 +18,7 @@ from .utils_for_nlpertools import *
 from .wrapper import *
 from .monitor import *
 from .cli import *
+from .llm import *
 
 
-
-__version__ = '1.0.9'
+__version__ = "1.0.11"
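
Because `__init__.py` re-exports the new modules with star imports, helpers defined in `dataprocess.dp_main` (and the `llm` subpackage) become reachable from the package top level after this upgrade. A minimal sketch, assuming `dp_main` exposes `startwith` without an `__all__` that hides it (see its hunk further down):

import nlpertools

print(nlpertools.__version__)  # "1.0.11" in this release

# startwith is defined in dataprocess/dp_main.py and re-exported at the top level
print(nlpertools.startwith("fatal: not a git repository", ["fatal", "error"]))  # True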
nlpertools/cli.py CHANGED
@@ -2,45 +2,30 @@ import argparse
 import os
 import uuid
 import sys
+from .dataprocess.dp_main import startwith
 
-"""
-如何Debug cli.py
-"""
 
-
-def git_push():
-    """
-    针对国内提交github经常失败,自动提交
-    """
-    num = -1
-    while 1:
-        num += 1
-        print("retry num: {}".format(num))
-        info = os.system("git push --set-upstream origin main")
-        print(str(info))
-        if not str(info).startswith("fatal"):
-            print("scucess")
-            break
-
-
-def git_pull():
+def run_git_command(command):
     """
-    针对国内提交github经常失败,自动提交
+    循环执行git命令,直到成功
     """
+    print(command)
     num = -1
-    while 1:
+    while True:
         num += 1
-        print("retry num: {}".format(num))
-        info = os.system("git pull")
+        print(f"retry num: {num}")
+        info = os.system(command)
         print(str(info))
-        if not str(info).startswith("fatal") and not str(info).startswith("error"):
-            print("scucess")
+        # 检查命令执行结果,若未出现错误则认为执行成功
+        if (not startwith(str(info), ["fatal", "error", "128", "1"])) and "fatal" not in str(info):
+            print("success")
+            print(f"success info : ##{info}##")
             break
 
 
 def get_mac_address():
     mac = uuid.UUID(int=uuid.getnode()).hex[-12:]
-    mac_address = ":".join([mac[e:e + 2] for e in range(0, 11, 2)])
+    mac_address = ":".join([mac[e : e + 2] for e in range(0, 11, 2)])
     print("mac address 不一定准确")
     print(mac_address)
     return mac_address
@@ -48,6 +33,7 @@ def get_mac_address():
 
 def get_2af_value(key):
     import pyotp
+
     """
     key应该是7位的
    """
@@ -80,15 +66,11 @@ def start_gpu_usage_notify_client():
     from plyer import notification
     import time
 
-    SERVER_URL = 'http://127.0.0.1:5000/notify'  # 服务器的 API 地址
+    SERVER_URL = "http://127.0.0.1:5000/notify"  # 服务器的 API 地址
 
     def notify(text):
         # 使用 plyer 发送通知
-        notification.notify(
-            title='远程通知',
-            message=text,
-            timeout=10  # 10秒的通知显示时间
-        )
+        notification.notify(title="远程通知", message=text, timeout=10)  # 10秒的通知显示时间
 
     """定时轮询服务器获取通知"""
     while True:
@@ -108,22 +90,19 @@ def start_gpu_usage_notify_client():
 
 
 def main():
-    parser = argparse.ArgumentParser(description="CLI tool for git operations and getting MAC address.")
-    parser.add_argument('--gitpush', action='store_true', help='Perform git push operation.')
-    parser.add_argument('--gitpull', action='store_true', help='Perform git pull operation.')
-    parser.add_argument('--mac_address', action='store_true', help='Get the MAC address.')
-
-    parser.add_argument('--get_2fa', action='store_true', help='Get the 2fa value.')
-    parser.add_argument('--get_2fa_key', type=str, help='Get the 2fa value.')
-    parser.add_argument('--monitor_gpu_cli', action='store_true', help='Get the 2fa value.')
-    parser.add_argument('--monitor_gpu_ser', action='store_true', help='Get the 2fa value.')
+    parser = argparse.ArgumentParser(description="CLI tool for git operations and other functions.")
+    parser.add_argument("git_command", nargs="*", help="Any git command (e.g., push, pull)")
+    parser.add_argument("--mac_address", action="store_true", help="Get the MAC address.")
+    parser.add_argument("--get_2fa", action="store_true", help="Get the 2fa value.")
+    parser.add_argument("--get_2fa_key", type=str, help="Get the 2fa value.")
+    parser.add_argument("--monitor_gpu_cli", action="store_true", help="monitor gpu cli")
+    parser.add_argument("--monitor_gpu_ser", action="store_true", help="monitor gpu ser")
 
     args = parser.parse_args()
 
-    if args.gitpush:
-        git_push()
-    elif args.gitpull:
-        git_pull()
+    if args.git_command:
+        git_cmd = " ".join(args.git_command)
+        run_git_command(git_cmd)
     elif args.mac_address:
         get_mac_address()
     elif args.monitor_gpu_cli:
@@ -139,5 +118,5 @@ def main():
         print("No operation specified.")
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     main()
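
The old `git_push`/`git_pull` helpers are folded into `run_git_command`, which retries whatever command string it is given. A minimal sketch of calling it directly from Python; the retry loop and the success check are the ones in the hunk above, and the command string is just an example:

from nlpertools.cli import run_git_command

# Retries the command via os.system until the return value no longer looks
# like a failure, then prints a success message and stops.
run_git_command("git push --set-upstream origin main")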
@@ -0,0 +1 @@
+from .dedupl import *
@@ -0,0 +1,9 @@
+# 根据字段对一个元素为dict的list去重
+def deduplicate_dict_list(dict_list: list, key: str) -> list:
+    seen = set()
+    result = []
+    for d in dict_list:
+        if key in d and d[key] not in seen:
+            seen.add(d[key])
+            result.append(d)
+    return result
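
A quick usage sketch of `deduplicate_dict_list` from the new file above. The import path is an assumption: it presumes the file lands in the new `dataprocess` subpackage that the `from .dedupl import *` line points at, since this diff does not show the new file names.

from nlpertools.dataprocess.dedupl import deduplicate_dict_list  # hypothetical path, see note above

records = [
    {"id": 1, "text": "a"},
    {"id": 1, "text": "b"},  # dropped: duplicate "id"
    {"text": "c"},           # dropped: missing the "id" key
]
print(deduplicate_dict_list(records, key="id"))  # [{'id': 1, 'text': 'a'}]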
@@ -8,7 +8,7 @@ from typing import List
 import numpy as np
 
 # from . import DB_CONFIG_FILE  # cannot import name 'DB_CONFIG_FILE' from partially initialized module 'nlpertools'
-from .utils.package import *
+from ..utils.package import *
 
 main_special_characters = string.punctuation + string.digits + string.whitespace
 other_special_characters = (
@@ -19,6 +19,18 @@ other_special_characters = (
     "」﴾》"
 )
 
+def startwith(text: str, pattern_list: list) -> bool:
+    """
+    判断text是否以pattern_list中的某个pattern开头
+    :param text:
+    :param pattern_list:
+    :return:
+    """
+    for pattern in pattern_list:
+        if text.startswith(pattern):
+            return True
+    return False
+
 
 class Pattern:
     """
nlpertools/io/dir.py CHANGED
@@ -3,6 +3,7 @@
 # @Author : youshu.Ji
 import os
 from pathlib import Path
+from typing import overload,Literal,Union
 
 
 # dir ----------------------------------------------------------------------
@@ -45,15 +46,34 @@ def get_filename(path, suffix=True) -> str:
         filename = filename.split('.')[0]
     return filename
 
-
-def listdir(dir_name, including_dir=True):
-    filenames = os.listdir(dir_name)
+"""
+因为os.listdir无法支持Path类型,虽然是bytelikepath,但是传入Path后只会返回字符串
+且无法只返回文件名
+故重新实现
+"""
+@overload
+def listdir(dir_name: Path, including_dir: Literal[True]) -> list[Path]: ...
+@overload
+def listdir(dir_name: str, including_dir: Literal[True]) -> list[str]: ...
+@overload
+def listdir(dir_name: Path, including_dir: Literal[False] = False) -> list[str]: ...
+@overload
+def listdir(dir_name: str, including_dir: Literal[False] = False) -> list[str]: ...
+
+def listdir(dir_name: Union[Path, str], including_dir: bool = False) -> list[Path] | list[str]:
+    """
+    including_dir=True -> list[Path] or list[str]
+    including_dir=False -> list[str]
+    """
+    filenames = os.listdir(str(dir_name))
     if including_dir:
-        return [os.path.join(dir_name, filename) for filename in filenames]
+        if isinstance(dir_name, Path):
+            return [dir_name / filename for filename in filenames]
+        else:
+            return [os.path.join(dir_name, filename) for filename in filenames]
     else:
         return list(filenames)
 
-
 def listdir_yield(dir_name, including_dir=True):
     filenames = os.listdir(dir_name)
     for filename in filenames:
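
A minimal sketch of how the overloaded `listdir` behaves, following the implementation above; the directory name is just an example. With the default `including_dir=False` it returns bare file names, and with `including_dir=True` the element type follows the argument type.

from pathlib import Path
from nlpertools.io.dir import listdir

names = listdir("some_dir")                               # list[str] of bare file names
as_paths = listdir(Path("some_dir"), including_dir=True)  # list[Path], joined with "/"
as_strs = listdir("some_dir", including_dir=True)         # list[str], joined with os.path.join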
nlpertools/io/file.py CHANGED
@@ -5,8 +5,11 @@ import codecs
 import json
 import pickle
 import random
-from itertools import (takewhile, repeat)
+from itertools import takewhile, repeat
+from typing import Optional
+from pathlib import Path
 import pandas as pd
+
 # import omegaconf
 # import yaml
 from ..utils.package import *
@@ -15,18 +18,18 @@ LARGE_FILE_THRESHOLD = 1e5
 
 
 def safe_filename(filename: str) -> str:
-    for char in ['\\', '/', ':', '*', '?', '"', '<', '>', '|']:
-        filename = filename.replace(char, '_')
+    for char in ["\\", "/", ":", "*", "?", '"', "<", ">", "|"]:
+        filename = filename.replace(char, "_")
     return filename
 
 
 def read_yaml(path, omega=False):
     if omega:
         return omegaconf.OmegaConf.load(path)
-    return yaml.load(codecs.open(path, encoding='utf-8'), Loader=yaml.FullLoader)
+    return yaml.load(codecs.open(path, encoding="utf-8"), Loader=yaml.FullLoader)
 
 
-def _merge_file(filelist, save_filename, shuffle=False):
+def merge_file(filelist, save_filename, shuffle=False):
     contents = []
     for file in filelist:
         content = readtxt_list_all_strip(file)
@@ -43,9 +46,9 @@ def iter_count(file_name):
     author: unknown
     """
     buffer = 1024 * 1024
-    with codecs.open(file_name, 'r', 'utf-8') as f:
+    with codecs.open(file_name, "r", "utf-8") as f:
         buf_gen = takewhile(lambda x: x, (f.read(buffer) for _ in repeat(None)))
-        return sum(buf.count('\n') for buf in buf_gen)
+        return sum(buf.count("\n") for buf in buf_gen)
 
 
 # 需要加入进度条的函数包括
@@ -57,24 +60,24 @@ load_from_json
 
 
 # 读txt文件 一次全读完 返回list 去换行
-def readtxt_list_all_strip(path, encoding='utf-8') -> list:
+def readtxt_list_all_strip(path, encoding="utf-8") -> list:
     file_line_num = iter_count(path)
     lines = []
-    with codecs.open(path, 'r', encoding) as r:
+    with codecs.open(path, "r", encoding) as r:
         if file_line_num > LARGE_FILE_THRESHOLD:
             iter_obj = tqdm(enumerate(r.readlines()), total=file_line_num)
         else:
             iter_obj = enumerate(r.readlines())
 
         for ldx, line in iter_obj:
-            lines.append(line.strip('\n').strip("\r"))
+            lines.append(line.strip("\n").strip("\r"))
     return lines
 
 
 # 读txt 一次读一行 最后返回list
 def readtxt_list_each(path) -> list:
     lines = []
-    with codecs.open(path, 'r', 'utf-8') as r:
+    with codecs.open(path, "r", "utf-8") as r:
         line = r.readline()
         while line:
             lines.append(line)
@@ -82,11 +85,11 @@ def readtxt_list_each(path) -> list:
     return lines
 
 
-def readtxt_list_each_strip(path) -> list:
+def readtxt_list_each_strip(path: Optional[str | Path]):
     """
     yield方法
     """
-    with codecs.open(path, 'r', 'utf-8') as r:
+    with codecs.open(path, "r", "utf-8") as r:
         line = r.readline()
         while line:
             yield line.strip("\n").strip("\r")
@@ -95,51 +98,51 @@ def readtxt_list_each_strip(path) -> list:
 
 # 读txt文件 一次全读完 返回list
 def readtxt_list_all(path) -> list:
-    with codecs.open(path, 'r', 'utf-8') as r:
+    with codecs.open(path, "r", "utf-8") as r:
         lines = r.readlines()
         return lines
 
 
 # 读byte文件 读成一条string
 def readtxt_byte(path, encoding="utf-8") -> str:
-    with codecs.open(path, 'rb') as r:
+    with codecs.open(path, "rb") as r:
         lines = r.read()
         lines = lines.decode(encoding)
-        return lines.replace('\r', '')
+        return lines.replace("\r", "")
 
 
 # 读txt文件 读成一条string
-def readtxt_string(path, encoding="utf-8") -> str:
-    with codecs.open(path, 'r', encoding) as r:
+def read_text(path, encoding="utf-8") -> str:
+    with codecs.open(path, "r", encoding) as r:
         lines = r.read()
-        return lines.replace('\r', '')
+        return lines.replace("\r", "")
 
 
 # 写txt文件覆盖
-def writetxt_w(txt, path, r='w'):
-    with codecs.open(path, r, 'utf-8') as w:
+def writetxt_w(txt, path, r="w"):
+    with codecs.open(path, r, "utf-8") as w:
         w.writelines(txt)
 
 
 # 写txt文件追加
 def writetxt_a(txt, path):
-    with codecs.open(path, 'a', 'utf-8') as w:
+    with codecs.open(path, "a", "utf-8") as w:
         w.writelines(txt)
 
 
 def writetxt(txt, path, encoding="utf-8"):
-    with codecs.open(path, 'w', encoding) as w:
+    with codecs.open(path, "w", encoding) as w:
         w.write(txt)
 
 
 def writetxt_wb(txt, path):
-    with codecs.open(path, 'wb') as w:
+    with codecs.open(path, "wb") as w:
         w.write(txt)
 
 
 # 写list 覆盖
 def writetxt_w_list(list, path, num_lf=1):
-    with codecs.open(path, 'w', "utf-8") as w:
+    with codecs.open(path, "w", "utf-8") as w:
         for i in list:
             w.write(i)
             w.write("\n" * num_lf)
@@ -147,7 +150,7 @@ def writetxt_w_list(list, path, num_lf=1):
 
 # 写list 追加
 def writetxt_a_list(list, path, num_lf=2):
-    with codecs.open(path, 'a', "utf-8") as w:
+    with codecs.open(path, "a", "utf-8") as w:
         for i in list:
             w.write(i)
             w.write("\n" * num_lf)
@@ -158,7 +161,7 @@ def save_to_json(content, path):
         json.dump(content, w, ensure_ascii=False, indent=1)
 
 
-def load_from_json(path):
+def load_from_json(path: Optional[str | Path]):
     with codecs.open(path, "r", "utf-8") as r:
         content = json.load(r)
         return content
@@ -167,60 +170,60 @@ def load_from_json(path):
 
 # 读txt文件 读成一条string if gb2312
 def readtxt_string_all_encoding(path):
     try:
-        with codecs.open(path, 'rb', "utf-8-sig") as r:
+        with codecs.open(path, "rb", "utf-8-sig") as r:
             lines = r.read()
             return lines
     except:
         try:
-            with codecs.open(path, 'rb', "utf-8") as r:
+            with codecs.open(path, "rb", "utf-8") as r:
                 lines = r.reacd()
                 return lines
         except:
             try:
-                with codecs.open(path, 'rb', "big5") as r:
+                with codecs.open(path, "rb", "big5") as r:
                     lines = r.read()
                     return lines
             except:
                 print(path)
-                with codecs.open(path, 'rb', "gb2312", errors='ignore') as r:
+                with codecs.open(path, "rb", "gb2312", errors="ignore") as r:
                     lines = r.read()
                     return lines
 
 
 def readtxt_list_all_encoding(path):
     try:
-        with codecs.open(path, 'rb', "utf-8-sig") as r:
+        with codecs.open(path, "rb", "utf-8-sig") as r:
             lines = r.readlines()
             return lines
     except:
         try:
-            with codecs.open(path, 'rb', "utf-8") as r:
+            with codecs.open(path, "rb", "utf-8") as r:
                 lines = r.readlines()
                 return lines
         except:
             try:
-                with codecs.open(path, 'rb', "big5") as r:
+                with codecs.open(path, "rb", "big5") as r:
                     lines = r.readlines()
                     return lines
             except:
-                with codecs.open(path, 'rb', "gb2312", errors='ignore') as r:
+                with codecs.open(path, "rb", "gb2312", errors="ignore") as r:
                     lines = r.readlines()
                    return lines
 
 
 # line by line
 def save_to_jsonl(corpus, path):
-    with open(path, 'w', encoding='utf-8') as wt:
+    with open(path, "w", encoding="utf-8") as wt:
         for i in corpus:
             wt.write(json.dumps(i, ensure_ascii=False))
-            wt.write('\n')
+            wt.write("\n")
 
 
 # line by line
 def load_from_jsonl(path):
     file_line_num = iter_count(path)
     if file_line_num > 1e5:
-        with open(path, 'r', encoding='utf-8') as rd:
+        with open(path, "r", encoding="utf-8") as rd:
             corpus = []
             while True:
                 line = rd.readline()
@@ -230,7 +233,7 @@ def load_from_jsonl(path):
                     break
             return corpus
     else:
-        with open(path, 'r', encoding='utf-8') as rd:
+        with open(path, "r", encoding="utf-8") as rd:
             corpus = []
             while True:
                 line = rd.readline()
@@ -242,20 +245,20 @@
 
 
 def save_pkl(data, path):
-    with open(path, 'wb') as f:
+    with open(path, "wb") as f:
         pickle.dump(data, f)
 
 
 def load_pkl(path):
-    with open(path, 'rb') as f:
+    with open(path, "rb") as f:
         data = pickle.load(f)
         return data
 
 
 def save_to_csv(df, save_path, index_flag=False):
-    with open(save_path, 'wb+') as csvfile:
+    with open(save_path, "wb+") as csvfile:
         csvfile.write(codecs.BOM_UTF8)
-    df.to_csv(save_path, mode='a', index=index_flag)
+    df.to_csv(save_path, mode="a", index=index_flag)
 
 
 def save_to_mongo():
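
A short usage sketch of the jsonl round trip and the renamed `read_text`, based on the functions above; the file name is just an example, and `load_from_jsonl` is expected to parse one JSON object per line:

from nlpertools.io.file import save_to_jsonl, load_from_jsonl, read_text

rows = [{"q": "hello"}, {"q": "world"}]
save_to_jsonl(rows, "demo.jsonl")       # writes one JSON object per line
data = load_from_jsonl("demo.jsonl")    # expected to round-trip back to rows
print(read_text("demo.jsonl"))          # whole file as a single string, "\r" removed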
@@ -0,0 +1,3 @@
+from .call_llm_once import *
+from .infer import *
+from .price import *
@@ -0,0 +1,60 @@
+from ..io.file import read_yaml
+from tqdm import tqdm
+import os
+from typing import Optional, Union
+
+"""
+从你当前的项目里找到.key文件 获取url和key
+"""
+
+
+def call_once_stream(
+    client, input: Optional[Union[str, list]], model_name: str = "qwen3-0626-e4", max_tokens: int = 8192, temperature=0.2
+) -> str:
+    """
+    调用LLM模型进行一次推理
+    :param prompt: 输入的提示文本
+    :param model_name: 模型名称
+    :param max_tokens: 最大输出token数
+    :return: 模型的输出文本
+    """
+    from openai import OpenAI
+
+    if isinstance(input, str):
+        message = [{"role": "user", "content": input}]
+    elif isinstance(input, list):
+        message = input
+
+    completion = client.chat.completions.create(model=model_name, messages=message, max_tokens=max_tokens, stream=True)
+    text = ""
+    for chunk in completion:
+        if chunk.choices:
+            c = chunk.choices[0].delta.content or ""
+            text += c
+            print(c, end="")
+        else:
+            print()
+            print(chunk.usage)
+    return text
+
+
+def call_once(
+    client, input: Optional[Union[str, list]], model_name: str = "qwen3-0626-e4", max_tokens: int = 8192, temperature=0.8
+) -> str:
+    """
+    调用LLM模型进行一次推理
+    :param prompt: 输入的提示文本
+    :param model_name: 模型名称
+    :param max_tokens: 最大输出token数
+    :return: 模型的输出文本
+    """
+    from openai import OpenAI
+
+    if isinstance(input, str):
+        message = [{"role": "user", "content": input}]
+    elif isinstance(input, list):
+        message = input
+
+    response = client.chat.completions.create(model=model_name, messages=message, max_tokens=max_tokens,temperature=temperature)
+
+    return response.choices[0].message.content
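
A minimal sketch of driving `call_once` / `call_once_stream` with an OpenAI-compatible client. The base URL and API key below are placeholders, the module path is assumed from the `from .call_llm_once import *` line above, and the default model name comes from the signatures in the new file:

from openai import OpenAI
from nlpertools.llm.call_llm_once import call_once, call_once_stream  # assumed module path, see note above

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")  # placeholder endpoint and key

answer = call_once(client, "用一句话介绍nlpertools", model_name="qwen3-0626-e4")
print(answer)

# The streaming variant prints chunks as they arrive and returns the concatenated text.
text = call_once_stream(client, [{"role": "user", "content": "hello"}])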