nlpertools 1.0.5__py3-none-any.whl → 1.0.6.dev0__py3-none-any.whl

Sign up to get free protection for your applications and to get access to all the features.
Files changed (43) hide show
  1. nlpertools/__init__.py +24 -20
  2. nlpertools/algo/ac.py +18 -0
  3. nlpertools/algo/bit_ops.py +28 -0
  4. nlpertools/algo/kmp.py +94 -55
  5. nlpertools/algo/num_ops.py +12 -0
  6. nlpertools/algo/template.py +116 -0
  7. nlpertools/algo/union.py +13 -0
  8. nlpertools/data_client.py +387 -257
  9. nlpertools/data_structure/base_structure.py +109 -13
  10. nlpertools/dataprocess.py +611 -3
  11. nlpertools/default_db_config.yml +41 -0
  12. nlpertools/io/__init__.py +3 -3
  13. nlpertools/io/dir.py +54 -36
  14. nlpertools/io/file.py +277 -222
  15. nlpertools/ml.py +483 -460
  16. nlpertools/monitor/__init__.py +0 -0
  17. nlpertools/monitor/gpu.py +18 -0
  18. nlpertools/monitor/memory.py +24 -0
  19. nlpertools/movie.py +36 -0
  20. nlpertools/nlpertools_config.yml +1 -0
  21. nlpertools/{openApi.py → open_api.py} +65 -65
  22. nlpertools/other.py +364 -249
  23. nlpertools/pic.py +288 -0
  24. nlpertools/plugin.py +43 -43
  25. nlpertools/reminder.py +98 -87
  26. nlpertools/utils/__init__.py +3 -3
  27. nlpertools/utils/lazy.py +727 -0
  28. nlpertools/utils/log_util.py +20 -0
  29. nlpertools/utils/package.py +89 -76
  30. nlpertools/utils/package_v1.py +94 -0
  31. nlpertools/utils/package_v2.py +117 -0
  32. nlpertools/utils_for_nlpertools.py +93 -93
  33. nlpertools/vector_index_demo.py +108 -0
  34. nlpertools/wrapper.py +161 -96
  35. {nlpertools-1.0.5.dist-info → nlpertools-1.0.6.dev0.dist-info}/LICENSE +200 -200
  36. nlpertools-1.0.6.dev0.dist-info/METADATA +111 -0
  37. nlpertools-1.0.6.dev0.dist-info/RECORD +43 -0
  38. {nlpertools-1.0.5.dist-info → nlpertools-1.0.6.dev0.dist-info}/WHEEL +1 -1
  39. nlpertools-1.0.6.dev0.dist-info/top_level.txt +2 -0
  40. nlpertools_helper/__init__.py +10 -0
  41. nlpertools-1.0.5.dist-info/METADATA +0 -85
  42. nlpertools-1.0.5.dist-info/RECORD +0 -25
  43. nlpertools-1.0.5.dist-info/top_level.txt +0 -1
@@ -0,0 +1,108 @@
1
+ #!/usr/bin/python3.8
2
+ # -*- coding: utf-8 -*-
3
+ # @Author : youshu.Ji
4
+ import math
5
+
6
+ import faiss
7
+ import gensim
8
+ import numpy as np
9
+ import pandas as pd
10
+
11
+
12
def build_index_use(vectors, nlist=100):
    """Build, train and populate a faiss IVF-Flat index using the L2 metric.

    Args:
        vectors: 2-D array-like of shape (num_vectors, dim). It is converted
            to a C-contiguous float32 array before being handed to faiss.
        nlist(int): requested number of inverted lists (coarse clusters).
            It is clamped to the range [1, num_vectors] because the coarse
            quantizer cannot be trained with more clusters than points.

    Return: the trained faiss index containing every row of ``vectors``.

    Raises:
        ValueError: if ``vectors`` is not a non-empty 2-D array.
    """
    vectors = np.ascontiguousarray(vectors, dtype=np.float32)
    if vectors.ndim != 2 or vectors.shape[0] == 0:
        raise ValueError("vectors must be a non-empty 2-D array")

    num_vectors, d = vectors.shape
    # Never ask for more clusters than there are training points.
    nlist = max(1, min(int(nlist), num_vectors))

    quantizer = faiss.IndexFlatL2(d)
    index = faiss.IndexIVFFlat(quantizer, d, nlist, faiss.METRIC_L2)
    index.train(vectors)
    index.add(vectors)
    return index
20
+
21
+
22
def build_index(vectors, distances="L2", nprobe=10):
    """Build a faiss index through ``faiss.index_factory``.

    Args:
        vectors(numpy.array): vector matrix, shape=(number of vectors, dimension).
        distances(str): distance metric; supports "L2", "COS" and
            "INNER_PRODUCT". "COS" is built as an inner-product index preceded
            by an "L2norm" stage.
        nprobe(int): number of clusters to visit at search time. Only applied
            when an IVF stage is used.

    Return: the populated faiss index object.

    Raises:
        NotImplementedError: if ``distances`` is not one of the supported names.
    """
    if distances == "L2":
        metric_type = faiss.METRIC_L2
    elif distances in ("COS", "INNER_PRODUCT"):
        metric_type = faiss.METRIC_INNER_PRODUCT
    else:
        raise NotImplementedError(f"unsupported distance: {distances!r}")

    num_vectors, dim = vectors.shape[0], vectors.shape[1]

    index_pipes = []
    if distances == "COS":
        index_pipes.append("L2norm")

    # Cluster-count heuristic: 4 * sqrt(N), used only when there are at least
    # 30 points per cluster. The threshold is evaluated on the un-rounded
    # value so the IVF cut-over point is unchanged.
    clusters = 4 * math.sqrt(num_vectors)
    use_ivf = num_vectors > 0 and num_vectors >= 30 * clusters
    if use_ivf:
        # The factory token needs an integer cluster count ("IVF480", not
        # "IVF480.0").
        K = int(clusters)
        index_pipes.append(f"IVF{K}")

    index_pipes.append("Flat")

    index = faiss.index_factory(dim, ",".join(index_pipes), metric_type)

    vectors = np.ascontiguousarray(vectors, dtype=np.float32)
    if not index.is_trained:
        index.train(vectors)

    index.add(vectors)

    # Must be executed for IVF indexes before ``reconstruct`` can be used.
    if use_ivf:
        ivf_index = faiss.extract_index_ivf(index)
        ivf_index.make_direct_map()
        ivf_index.nprobe = nprobe

    return index
70
+
71
+
72
def read_index_from_file(filename):
    """Load a faiss index object from the index file at ``filename``."""
    loaded_index = faiss.read_index(filename)
    return loaded_index
75
+
76
+
77
def write_index_to_file(index, filename):
    """Write the faiss index object ``index`` to the file ``filename``."""
    faiss.write_index(index, filename)
    return None
80
+
81
+
82
# Demo: load text-format word vectors, index them with faiss and save both the
# index and a table of example names.
# NOTE(review): "glove_vector_path" looks like a placeholder - confirm.
word2vec_path = "glove_vector_path"
wv_from_text = gensim.models.KeyedVectors.load_word2vec_format(
    word2vec_path,
    binary=False,
    no_header=True,
)
vectors = wv_from_text.vectors

# One example file name per vector, cycling through 1.jpg .. 9.jpg.
name_example = ["{}.jpg".format(1 + (i % 9)) for i in range(len(vectors))]
# Only the "name" column is written; the "vector" and "text" columns were
# left disabled in the original.
df = pd.DataFrame({"name": name_example})

test_index = build_index_use(vectors)
write_index_to_file(test_index, "test.index")

df.to_csv("test.csv", index=False)
96
+
97
+ import gensim
98
+ import hnswlib
99
+
100
# Demo: load the same text-format word vectors and save them as an hnswlib
# index.
word2vec_path = "glove_vector_path"
wv_from_text = gensim.models.KeyedVectors.load_word2vec_format(
    word2vec_path,
    binary=False,
    no_header=True,
)
vectors = wv_from_text.vectors

# Each vector is labelled with its row position.
labels = list(range(len(vectors)))
index = hnswlib.Index(space="l2", dim=len(vectors[0]))
index.init_index(max_elements=len(vectors), ef_construction=200, M=16)
index.add_items(vectors, labels)
index.save_index("hnswlib.index")
nlpertools/wrapper.py CHANGED
@@ -1,96 +1,161 @@
1
- #!/usr/bin/python3.8
2
- # -*- coding: utf-8 -*-
3
- # @Author : youshu.Ji
4
- # 定义装饰器
5
- from functools import wraps
6
-
7
-
8
- def fn_timer(function):
9
- @wraps(function)
10
- def function_timer(*args, **kwargs):
11
- t0 = time.time()
12
- result = function(*args, **kwargs)
13
- t1 = time.time()
14
- print('[finished {func_name} in {time:.2f}s]'.format(func_name=function.__name__, time=t1 - t0))
15
- return result
16
-
17
- return function_timer
18
-
19
-
20
- def fn_timeout_checker(wait_time, callback):
21
- """
22
- 超时判断的装饰器
23
- 两个包,使用gevent出现bug
24
- """
25
- # from gevent import Timeout
26
- # from gevent.monkey import patch_all
27
-
28
- # patch_all() # thread=False加了这个参数,配合flask app的threaded=True,会报错,目前还没有理解阻塞,线程之间的关系。不加即thread=True时没问题
29
-
30
- from eventlet import Timeout
31
- from eventlet import monkey_patch
32
-
33
- monkey_patch(time=True)
34
-
35
- def wrapper(func):
36
- def inner(*args, **kwargs):
37
- finish_flag = False
38
- with Timeout(wait_time, False):
39
- res = func(*args, **kwargs)
40
- finish_flag = True
41
- if not finish_flag:
42
- res = callback()
43
- return res
44
-
45
- return inner
46
-
47
- return wrapper
48
-
49
-
50
- def fn_try(parameter):
51
- """
52
- 该函数把try...catch...封装成装饰器,
53
- 接收一个字典参数,并把其中的msg字段改为具体报错信息
54
- :param parameter: {"msg": "", etc.}
55
- :return: parameter: {"msg": 内容填充为具体的报错信息, etc.}
56
- """
57
-
58
- def wrapper(function):
59
- def inner(*args, **kwargs):
60
- try:
61
- result = function(*args, **kwargs)
62
- return result
63
- except Exception as e:
64
- msg = "报错!"
65
- print('[func_name: {func_name} {msg}]'.format(func_name=function.__name__, msg=msg))
66
- parameter["msg"] = parameter["msg"].format(str(e))
67
- return parameter
68
- finally:
69
- pass
70
-
71
- return inner
72
-
73
- return wrapper
74
-
75
-
76
- def example(function):
77
- @wraps(function)
78
- def function_example(*args, **kwargs):
79
- print("此方法仅仅用于提示该方法怎么写")
80
- result = function(*args, **kwargs)
81
- return result
82
-
83
- return function_example
84
-
85
-
86
- def singleton(cls):
87
- instances = {}
88
-
89
- def _singleton(*args, **kwargs):
90
- if cls not in instances:
91
- instances[cls] = cls(*args, **kwargs)
92
- return instances[cls]
93
-
94
- return _singleton
95
-
96
-
1
+ #!/usr/bin/python3.8
2
+ # -*- coding: utf-8 -*-
3
+ # @Author : youshu.Ji
4
+ # 定义装饰器
5
+ import logging
6
+ import time
7
+ from functools import wraps
8
+ import asyncio
9
+
10
def fn_async_timer(function):
    """Decorator for coroutine functions that prints how long each call took.

    The wrapped coroutine is awaited, its elapsed time is printed as
    ``[finished <name> in <seconds>s]`` and its result is returned unchanged.
    Nothing is printed if the coroutine raises.

    :param function: the coroutine function to time
    :return: a coroutine function with the same metadata as ``function``
    """

    @wraps(function)
    async def function_timer(*args, **kwargs):
        # perf_counter is monotonic, so the interval cannot go negative when
        # the system clock is adjusted during the call.
        t0 = time.perf_counter()
        result = await function(*args, **kwargs)
        t1 = time.perf_counter()
        print('[finished {func_name} in {time:.2f}s]'.format(func_name=function.__name__, time=t1 - t0))
        return result

    return function_timer
23
+
24
+
25
def fn_timer(async_func=False, analyse=False):
    """Decorator factory that reports how long the wrapped callable took.

    >>> @fn_timer()
    ... def example():
    ...     time.sleep(2)

    :param async_func: truthy when the decorated callable is a coroutine
        function; the wrapper is then a coroutine function that awaits it.
        Takes precedence over ``analyse``.
    :param analyse: when truthy (and ``async_func`` is falsy), the call is
        profiled with ``pyinstrument`` and the profile is printed instead of
        the elapsed-time line.
    :return: a decorator that returns the wrapped callable
    """

    def wrapper(func):
        @wraps(func)
        async def func_time_async(*args, **kwargs):
            t0 = time.perf_counter()
            # Awaited directly so any awaitable works, not only coroutines.
            result = await func(*args, **kwargs)
            t1 = time.perf_counter()
            print('[finished {func_name} in {time:.2f}s]'.format(func_name=func.__name__, time=t1 - t0))
            return result

        @wraps(func)
        def func_time(*args, **kwargs):
            t0 = time.perf_counter()
            result = func(*args, **kwargs)
            t1 = time.perf_counter()
            print('[finished {func_name} in {time:.2f}s]'.format(func_name=func.__name__, time=t1 - t0))
            return result

        @wraps(func)
        def func_time_analyse(*args, **kwargs):
            # Imported lazily so pyinstrument is only needed when profiling.
            from pyinstrument import Profiler

            profiler = Profiler()
            profiler.start()
            try:
                return func(*args, **kwargs)
            finally:
                # Always stop the profiler, even when the call raises.
                profiler.stop()
                profiler.print()

        if async_func:
            return func_time_async
        if analyse:
            return func_time_analyse
        return func_time

    return wrapper
70
+
71
+
72
def fn_timeout_checker(wait_time, callback):
    """Decorator factory for timeout checking.

    The decorated function is run inside an eventlet ``Timeout``. If it does
    not finish, ``callback()`` is called with no arguments and its return
    value is returned instead.

    Two packages were tried; using gevent showed a bug, so eventlet is used.

    :param wait_time: timeout value passed to ``eventlet.Timeout``
    :param callback: zero-argument callable invoked when the call did not finish
    :return: a decorator that returns the wrapped function
    """
    # gevent alternative kept for reference:
    #   from gevent import Timeout
    #   from gevent.monkey import patch_all
    #   patch_all()
    # Original author's note: passing thread=False together with a Flask app
    # running threaded=True raised errors (relationship between blocking and
    # threads not yet understood); the default thread=True had no problem.

    from eventlet import Timeout
    from eventlet import monkey_patch

    # Runs once per decorator creation, exactly as before.
    monkey_patch(time=True)

    def wrapper(func):
        @wraps(func)
        def inner(*args, **kwargs):
            finish_flag = False
            # NOTE(review): the second argument False presumably makes the
            # timeout exit the block silently - confirm against eventlet docs.
            with Timeout(wait_time, False):
                res = func(*args, **kwargs)
                finish_flag = True
            if not finish_flag:
                res = callback()
            return res

        return inner

    return wrapper
100
+
101
+
102
def fn_try(parameter):
    """Wrap try/except handling into a decorator.

    When the decorated function raises, a notice is printed and a copy of
    ``parameter`` is returned whose ``"msg"`` entry is the template formatted
    with the error text. The template dict is never modified, so every failure
    reports its own error message.

    :param parameter: dict such as ``{"msg": "failed: {}", ...}``; ``"msg"``
        is a ``str.format`` template that receives ``str(error)`` as its only
        positional argument
    :return: a decorator; the wrapped function returns the original result on
        success, or the filled-in copy of ``parameter`` on failure
    """

    def wrapper(function):
        @wraps(function)
        def inner(*args, **kwargs):
            try:
                return function(*args, **kwargs)
            except Exception as e:
                msg = "报错!"
                print('[func_name: {func_name} {msg}]'.format(func_name=function.__name__, msg=msg))
                # Fill a copy: overwriting the shared template would make all
                # later failures return the first error message.
                failure = dict(parameter)
                failure["msg"] = parameter["msg"].format(str(e))
                return failure

        return inner

    return wrapper
126
+
127
+
128
def try_log(function):
    """Decorator that logs any exception raised by ``function`` instead of propagating it.

    On success the function's result is returned. On failure the call
    arguments and the full traceback are logged at ERROR level through the
    root logger and ``None`` is returned.

    :param function: the callable to guard
    :return: the wrapped callable
    """

    @wraps(function)
    def inner(*args, **kwargs):
        try:
            return function(*args, **kwargs)
        except Exception:
            # The arguments are logged as data, never as the format string,
            # so this works for any arguments (including none).
            # logging.exception appends the traceback, which points at the
            # real failure site.
            logging.exception(
                "%s failed with args=%r kwargs=%r",
                function.__name__, args, kwargs,
            )
            return None

    return inner
141
+
142
+
143
def example(function):
    """Decorator that prints a fixed notice, then calls ``function`` and returns its result.

    The notice says the decorated method only illustrates how such a method
    should be written.
    """

    @wraps(function)
    def _announce_then_call(*args, **kwargs):
        print("此方法仅仅用于提示该方法怎么写")
        return function(*args, **kwargs)

    return _announce_then_call
151
+
152
+
153
def singleton(cls):
    """Class decorator that creates the instance on first call and returns it on every later call.

    Constructor arguments given after the first call are ignored. The
    decorated name is bound to a factory function rather than to the class.
    """
    cache = {}

    def _singleton(*args, **kwargs):
        # Guard clause: hand back the existing instance if there is one.
        if cls in cache:
            return cache[cls]
        created = cls(*args, **kwargs)
        cache[cls] = created
        return created

    return _singleton