nlpertools 1.0.5__py3-none-any.whl → 1.0.6.dev0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43) hide show
  1. nlpertools/__init__.py +24 -20
  2. nlpertools/algo/ac.py +18 -0
  3. nlpertools/algo/bit_ops.py +28 -0
  4. nlpertools/algo/kmp.py +94 -55
  5. nlpertools/algo/num_ops.py +12 -0
  6. nlpertools/algo/template.py +116 -0
  7. nlpertools/algo/union.py +13 -0
  8. nlpertools/data_client.py +387 -257
  9. nlpertools/data_structure/base_structure.py +109 -13
  10. nlpertools/dataprocess.py +611 -3
  11. nlpertools/default_db_config.yml +41 -0
  12. nlpertools/io/__init__.py +3 -3
  13. nlpertools/io/dir.py +54 -36
  14. nlpertools/io/file.py +277 -222
  15. nlpertools/ml.py +483 -460
  16. nlpertools/monitor/__init__.py +0 -0
  17. nlpertools/monitor/gpu.py +18 -0
  18. nlpertools/monitor/memory.py +24 -0
  19. nlpertools/movie.py +36 -0
  20. nlpertools/nlpertools_config.yml +1 -0
  21. nlpertools/{openApi.py → open_api.py} +65 -65
  22. nlpertools/other.py +364 -249
  23. nlpertools/pic.py +288 -0
  24. nlpertools/plugin.py +43 -43
  25. nlpertools/reminder.py +98 -87
  26. nlpertools/utils/__init__.py +3 -3
  27. nlpertools/utils/lazy.py +727 -0
  28. nlpertools/utils/log_util.py +20 -0
  29. nlpertools/utils/package.py +89 -76
  30. nlpertools/utils/package_v1.py +94 -0
  31. nlpertools/utils/package_v2.py +117 -0
  32. nlpertools/utils_for_nlpertools.py +93 -93
  33. nlpertools/vector_index_demo.py +108 -0
  34. nlpertools/wrapper.py +161 -96
  35. {nlpertools-1.0.5.dist-info → nlpertools-1.0.6.dev0.dist-info}/LICENSE +200 -200
  36. nlpertools-1.0.6.dev0.dist-info/METADATA +111 -0
  37. nlpertools-1.0.6.dev0.dist-info/RECORD +43 -0
  38. {nlpertools-1.0.5.dist-info → nlpertools-1.0.6.dev0.dist-info}/WHEEL +1 -1
  39. nlpertools-1.0.6.dev0.dist-info/top_level.txt +2 -0
  40. nlpertools_helper/__init__.py +10 -0
  41. nlpertools-1.0.5.dist-info/METADATA +0 -85
  42. nlpertools-1.0.5.dist-info/RECORD +0 -25
  43. nlpertools-1.0.5.dist-info/top_level.txt +0 -1
@@ -0,0 +1,108 @@
1
+ #!/usr/bin/python3.8
2
+ # -*- coding: utf-8 -*-
3
+ # @Author : youshu.Ji
4
+ import math
5
+
6
+ import faiss
7
+ import gensim
8
+ import numpy as np
9
+ import pandas as pd
10
+
11
+
12
def build_index_use(vectors, nlist=100):
    """Build and populate an IVF-Flat faiss index that uses the L2 metric.

    Args:
        vectors: 2-D array-like of shape (n_vectors, dim). It is converted to
            a C-contiguous float32 array before being handed to faiss, the
            same dtype ``build_index`` casts to.
        nlist (int): Number of inverted lists (coarse clusters) of the index.
            Defaults to 100, the value that used to be hard-coded.

    Returns:
        The trained ``faiss.IndexIVFFlat`` holding every row of ``vectors``.
    """
    vectors = np.ascontiguousarray(vectors, dtype=np.float32)
    dim = vectors.shape[1]
    # The flat L2 index acts as the coarse quantizer that assigns vectors
    # to one of the ``nlist`` inverted lists.
    quantizer = faiss.IndexFlatL2(dim)
    index = faiss.IndexIVFFlat(quantizer, dim, nlist, faiss.METRIC_L2)
    index.train(vectors)
    index.add(vectors)
    return index
20
+
21
+
22
def build_index(vectors, distances="L2", nprobe=10):
    """Build a faiss index over ``vectors``.

    Args:
        vectors (numpy.ndarray): Matrix of shape (n_vectors, dim).
        distances (str): Metric name; one of "L2", "COS" or "INNER_PRODUCT".
        nprobe (int): Number of clusters visited per query. Only applied
            when an IVF index is actually built.

    Returns:
        The populated faiss index.

    Raises:
        NotImplementedError: If ``distances`` is not a supported metric.
    """
    if distances == "L2":
        metric_type = faiss.METRIC_L2
    elif distances in ("COS", "INNER_PRODUCT"):
        metric_type = faiss.METRIC_INNER_PRODUCT
    else:
        raise NotImplementedError(
            "unsupported distance: {!r}".format(distances))

    index_pipes = []

    if distances == "COS":
        # Inner product over L2-normalised vectors equals cosine similarity.
        index_pipes.append("L2norm")

    n_vectors = vectors.shape[0]
    # The factory string needs an integer cluster count ("IVF480");
    # math.sqrt returns a float, which would render as "IVF480.0".
    n_clusters = int(4 * math.sqrt(n_vectors))
    # Only use IVF when there are enough vectors per cluster, and never
    # with zero clusters (empty input).
    use_ivf = n_clusters > 0 and n_vectors >= 30 * n_clusters
    if use_ivf:
        index_pipes.append(f"IVF{n_clusters}")

    index_pipes.append("Flat")

    index = faiss.index_factory(vectors.shape[1], ",".join(index_pipes),
                                metric_type)

    vectors = vectors.astype(np.float32)
    if not index.is_trained:
        index.train(vectors)

    index.add(vectors)

    # An IVF index needs a direct map before reconstruct() can be used.
    if use_ivf:
        ivf_index = faiss.extract_index_ivf(index)
        ivf_index.make_direct_map()
        ivf_index.nprobe = nprobe

    return index
70
+
71
+
72
def read_index_from_file(filename):
    """Load a faiss index object from the index file ``filename``."""
    loaded_index = faiss.read_index(filename)
    return loaded_index
75
+
76
+
77
def write_index_to_file(index, filename):
    """Write the faiss index object ``index`` to the file ``filename``."""
    faiss.write_index(index, filename)
    return None
80
+
81
+
82
# --- Demo 1: index the word vectors with faiss and dump a name table -------
word2vec_path = "glove_vector_path"
wv_from_text = gensim.models.KeyedVectors.load_word2vec_format(
    word2vec_path,
    binary=False,
    no_header=True,
)
vectors = wv_from_text.vectors

# One placeholder image name per vector, cycling through 1.jpg .. 9.jpg.
name_example = ["{}.jpg".format((i % 9) + 1) for i in range(len(vectors))]
df = pd.DataFrame({
    "name": name_example,
    # "vector": str(vectors[0]),
    # "text": list(wv_from_text.key_to_index.keys()),
})
test_index = build_index_use(vectors)
write_index_to_file(test_index, "test.index")

df.to_csv("test.csv", index=False)

# --- Demo 2: index the same vectors with hnswlib ---------------------------
import gensim
import hnswlib

word2vec_path = "glove_vector_path"
wv_from_text = gensim.models.KeyedVectors.load_word2vec_format(
    word2vec_path,
    binary=False,
    no_header=True,
)
vectors = wv_from_text.vectors

# Each vector is labelled with its row position.
labels = list(range(len(vectors)))
index = hnswlib.Index(space="l2", dim=len(vectors[0]))
index.init_index(max_elements=len(vectors), ef_construction=200, M=16)
index.add_items(vectors, labels)
index.save_index("hnswlib.index")
nlpertools/wrapper.py CHANGED
@@ -1,96 +1,161 @@
1
- #!/usr/bin/python3.8
2
- # -*- coding: utf-8 -*-
3
- # @Author : youshu.Ji
4
- # 定义装饰器
5
- from functools import wraps
6
-
7
-
8
- def fn_timer(function):
9
- @wraps(function)
10
- def function_timer(*args, **kwargs):
11
- t0 = time.time()
12
- result = function(*args, **kwargs)
13
- t1 = time.time()
14
- print('[finished {func_name} in {time:.2f}s]'.format(func_name=function.__name__, time=t1 - t0))
15
- return result
16
-
17
- return function_timer
18
-
19
-
20
- def fn_timeout_checker(wait_time, callback):
21
- """
22
- 超时判断的装饰器
23
- 两个包,使用gevent出现bug
24
- """
25
- # from gevent import Timeout
26
- # from gevent.monkey import patch_all
27
-
28
- # patch_all() # thread=False加了这个参数,配合flask app的threaded=True,会报错,目前还没有理解阻塞,线程之间的关系。不加即thread=True时没问题
29
-
30
- from eventlet import Timeout
31
- from eventlet import monkey_patch
32
-
33
- monkey_patch(time=True)
34
-
35
- def wrapper(func):
36
- def inner(*args, **kwargs):
37
- finish_flag = False
38
- with Timeout(wait_time, False):
39
- res = func(*args, **kwargs)
40
- finish_flag = True
41
- if not finish_flag:
42
- res = callback()
43
- return res
44
-
45
- return inner
46
-
47
- return wrapper
48
-
49
-
50
- def fn_try(parameter):
51
- """
52
- 该函数把try...catch...封装成装饰器,
53
- 接收一个字典参数,并把其中的msg字段改为具体报错信息
54
- :param parameter: {"msg": "", etc.}
55
- :return: parameter: {"msg": 内容填充为具体的报错信息, etc.}
56
- """
57
-
58
- def wrapper(function):
59
- def inner(*args, **kwargs):
60
- try:
61
- result = function(*args, **kwargs)
62
- return result
63
- except Exception as e:
64
- msg = "报错!"
65
- print('[func_name: {func_name} {msg}]'.format(func_name=function.__name__, msg=msg))
66
- parameter["msg"] = parameter["msg"].format(str(e))
67
- return parameter
68
- finally:
69
- pass
70
-
71
- return inner
72
-
73
- return wrapper
74
-
75
-
76
- def example(function):
77
- @wraps(function)
78
- def function_example(*args, **kwargs):
79
- print("此方法仅仅用于提示该方法怎么写")
80
- result = function(*args, **kwargs)
81
- return result
82
-
83
- return function_example
84
-
85
-
86
- def singleton(cls):
87
- instances = {}
88
-
89
- def _singleton(*args, **kwargs):
90
- if cls not in instances:
91
- instances[cls] = cls(*args, **kwargs)
92
- return instances[cls]
93
-
94
- return _singleton
95
-
96
-
1
+ #!/usr/bin/python3.8
2
+ # -*- coding: utf-8 -*-
3
+ # @Author : youshu.Ji
4
+ # 定义装饰器
5
+ import logging
6
+ import time
7
+ from functools import wraps
8
+ import asyncio
9
+
10
def fn_async_timer(function):
    """Timing decorator for coroutine functions.

    The wrapped coroutine is awaited, the elapsed wall-clock time is
    printed, and the awaited result is returned unchanged.
    """

    @wraps(function)
    async def function_timer(*args, **kwargs):
        started = time.time()
        outcome = await function(*args, **kwargs)
        elapsed = time.time() - started
        print('[finished {func_name} in {time:.2f}s]'.format(func_name=function.__name__, time=elapsed))
        return outcome

    return function_timer
23
+
24
+
25
def fn_timer(async_func=False, analyse=False):
    """Decorator that reports how long the decorated callable takes.

    Usage::

        @fn_timer()
        def example():
            time.sleep(2)

        @fn_timer            # the bare form is accepted as well
        def example():
            time.sleep(2)

    :param async_func: pass True when the decorated callable is a coroutine
        function; the returned wrapper is then a coroutine function too.
    :param analyse: when true (and ``async_func`` is not True) the call is
        profiled with pyinstrument and the profile is printed instead of
        the elapsed time.
    :return: the decorator, or the decorated function when used bare.
    """

    def wrapper(func):
        @wraps(func)
        async def func_time_async(*args, **kwargs):
            t0 = time.time()
            result = await asyncio.create_task(func(*args, **kwargs))
            t1 = time.time()
            print('[finished {func_name} in {time:.2f}s]'.format(func_name=func.__name__, time=t1 - t0))
            return result

        @wraps(func)
        def func_time(*args, **kwargs):
            t0 = time.time()
            result = func(*args, **kwargs)
            t1 = time.time()
            print('[finished {func_name} in {time:.2f}s]'.format(func_name=func.__name__, time=t1 - t0))
            return result

        @wraps(func)
        def func_time_analyse(*args, **kwargs):
            # Imported lazily so pyinstrument is only required when
            # profiling is actually requested.
            from pyinstrument import Profiler

            profiler = Profiler()
            profiler.start()

            result = func(*args, **kwargs)

            profiler.stop()
            profiler.print()
            return result

        if async_func is True:
            return func_time_async
        if analyse:
            return func_time_analyse
        return func_time

    if callable(async_func):
        # Bare ``@fn_timer`` usage: the decorated function arrives in the
        # first positional slot, so decorate it directly with the defaults.
        func, async_func = async_func, False
        return wrapper(func)

    return wrapper
70
+
71
+
72
def fn_timeout_checker(wait_time, callback):
    """Decorator factory that enforces a time limit on the wrapped call.

    The wrapped function runs under an eventlet ``Timeout`` of
    ``wait_time``; when it does not finish in time, the result of
    ``callback()`` is returned instead.

    Two packages were considered; gevent showed bugs, so eventlet is used.
    """
    # from gevent import Timeout
    # from gevent.monkey import patch_all

    # patch_all()  # With thread=False, combined with a Flask app running
    # threaded=True, this raised errors; the interaction between blocking
    # and threads is not understood yet. Leaving thread=True works.

    from eventlet import Timeout
    from eventlet import monkey_patch

    monkey_patch(time=True)

    def wrapper(func):
        def inner(*args, **kwargs):
            # Timeout(..., False) swallows its own expiry, so control falls
            # through to the callback when the call is cut short.
            with Timeout(wait_time, False):
                return func(*args, **kwargs)
            return callback()

        return inner

    return wrapper
100
+
101
+
102
def fn_try(parameter):
    """Wrap a function in try/except and return an error payload on failure.

    :param parameter: dict template such as ``{"msg": "failed: {}", ...}``;
        its ``msg`` value is a ``str.format`` template that receives the
        text of the raised exception.
    :return: decorator. The decorated function returns its normal result,
        or, when it raises, a copy of ``parameter`` whose ``msg`` field is
        filled with the error text.
    """

    def wrapper(function):
        @wraps(function)
        def inner(*args, **kwargs):
            try:
                return function(*args, **kwargs)
            except Exception as e:
                msg = "报错!"
                print('[func_name: {func_name} {msg}]'.format(func_name=function.__name__, msg=msg))
                # Fill a copy: formatting the shared template in place
                # would consume its placeholder, so every later failure
                # would report the first error again.
                failure = dict(parameter)
                failure["msg"] = parameter["msg"].format(str(e))
                return failure

        return inner

    return wrapper
126
+
127
+
128
def try_log(function):
    """Decorator that logs any exception of ``function`` and swallows it.

    On success the wrapped function's result is returned. On failure the
    call arguments, the file and line where the exception was raised, and
    the exception itself are logged at ERROR level, and None is returned.
    """

    @wraps(function)
    def inner(*args, **kwargs):
        try:
            return function(*args, **kwargs)
        except Exception as e:
            # Walk to the innermost traceback entry so the logged location
            # is where the error was raised, not this wrapper's frame.
            tb = e.__traceback__
            while tb.tb_next is not None:
                tb = tb.tb_next
            # Pass the arguments as data, never as the format string:
            # logging.error(*args) fails when args is empty and misformats
            # when the first argument contains '%'.
            logging.error("args=%r kwargs=%r", args, kwargs)
            logging.error("%s", tb.tb_frame.f_code.co_filename)
            logging.error("%s", tb.tb_lineno)
            logging.error("%s", repr(e))
            return None

    return inner
141
+
142
+
143
def example(function):
    """Template decorator that only demonstrates how a decorator is written.

    It prints a notice and then forwards the call to ``function``.
    """

    @wraps(function)
    def function_example(*args, **kwargs):
        print("此方法仅仅用于提示该方法怎么写")
        return function(*args, **kwargs)

    return function_example
151
+
152
+
153
def singleton(cls):
    """Class decorator that makes ``cls`` produce one shared instance.

    The first call constructs the instance with the given arguments; every
    later call returns that same instance and ignores its arguments.
    """
    instances = {}

    def _singleton(*args, **kwargs):
        try:
            return instances[cls]
        except KeyError:
            created = cls(*args, **kwargs)
            instances[cls] = created
            return created

    return _singleton