nlpertools 1.0.5__py3-none-any.whl → 1.0.6.dev0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nlpertools/__init__.py +24 -20
- nlpertools/algo/ac.py +18 -0
- nlpertools/algo/bit_ops.py +28 -0
- nlpertools/algo/kmp.py +94 -55
- nlpertools/algo/num_ops.py +12 -0
- nlpertools/algo/template.py +116 -0
- nlpertools/algo/union.py +13 -0
- nlpertools/data_client.py +387 -257
- nlpertools/data_structure/base_structure.py +109 -13
- nlpertools/dataprocess.py +611 -3
- nlpertools/default_db_config.yml +41 -0
- nlpertools/io/__init__.py +3 -3
- nlpertools/io/dir.py +54 -36
- nlpertools/io/file.py +277 -222
- nlpertools/ml.py +483 -460
- nlpertools/monitor/__init__.py +0 -0
- nlpertools/monitor/gpu.py +18 -0
- nlpertools/monitor/memory.py +24 -0
- nlpertools/movie.py +36 -0
- nlpertools/nlpertools_config.yml +1 -0
- nlpertools/{openApi.py → open_api.py} +65 -65
- nlpertools/other.py +364 -249
- nlpertools/pic.py +288 -0
- nlpertools/plugin.py +43 -43
- nlpertools/reminder.py +98 -87
- nlpertools/utils/__init__.py +3 -3
- nlpertools/utils/lazy.py +727 -0
- nlpertools/utils/log_util.py +20 -0
- nlpertools/utils/package.py +89 -76
- nlpertools/utils/package_v1.py +94 -0
- nlpertools/utils/package_v2.py +117 -0
- nlpertools/utils_for_nlpertools.py +93 -93
- nlpertools/vector_index_demo.py +108 -0
- nlpertools/wrapper.py +161 -96
- {nlpertools-1.0.5.dist-info → nlpertools-1.0.6.dev0.dist-info}/LICENSE +200 -200
- nlpertools-1.0.6.dev0.dist-info/METADATA +111 -0
- nlpertools-1.0.6.dev0.dist-info/RECORD +43 -0
- {nlpertools-1.0.5.dist-info → nlpertools-1.0.6.dev0.dist-info}/WHEEL +1 -1
- nlpertools-1.0.6.dev0.dist-info/top_level.txt +2 -0
- nlpertools_helper/__init__.py +10 -0
- nlpertools-1.0.5.dist-info/METADATA +0 -85
- nlpertools-1.0.5.dist-info/RECORD +0 -25
- nlpertools-1.0.5.dist-info/top_level.txt +0 -1
@@ -0,0 +1,108 @@
|
|
1
|
+
#!/usr/bin/python3.8
|
2
|
+
# -*- coding: utf-8 -*-
|
3
|
+
# @Author : youshu.Ji
|
4
|
+
import math
|
5
|
+
|
6
|
+
import faiss
|
7
|
+
import gensim
|
8
|
+
import numpy as np
|
9
|
+
import pandas as pd
|
10
|
+
|
11
|
+
|
12
|
+
def build_index_use(vectors, nlist=100):
    """Build and populate a faiss ``IndexIVFFlat`` (L2 metric) over ``vectors``.

    Args:
        vectors: 2-D array-like with one vector per row. It is converted to a
            C-contiguous float32 array before being handed to faiss.
        nlist(int): requested number of IVF clusters. The value is capped at
            the number of rows so that training is never asked for more
            centroids than there are training vectors.

    Return: the trained faiss index holding every row of ``vectors``.
    """
    # No copy is made when the input is already contiguous float32.
    data = np.ascontiguousarray(vectors, dtype=np.float32)
    n_vectors, d = data.shape
    nlist = max(1, min(nlist, n_vectors))
    quantizer = faiss.IndexFlatL2(d)
    index = faiss.IndexIVFFlat(quantizer, d, nlist, faiss.METRIC_L2)
    index.train(data)
    index.add(data)
    return index
|
20
|
+
|
21
|
+
|
22
|
+
def build_index(vectors, distances="L2", nprobe=10):
    """Build a faiss index over ``vectors``.

    Args:
        vectors(numpy.array): vector matrix, shape=(number of vectors, vector dimension).
        distances(str): distance metric; supports L2, COS and INNER_PRODUCT.
        nprobe(int): number of clusters probed during a search. Only applied
            when an IVF index is actually built.

    Return: the populated faiss index object.

    Raises:
        NotImplementedError: if ``distances`` is not one of the supported names.
    """
    if distances == "L2":
        metric_type = faiss.METRIC_L2
    elif distances in ("COS", "INNER_PRODUCT"):
        metric_type = faiss.METRIC_INNER_PRODUCT
    else:
        raise NotImplementedError(
            "unsupported distance {!r}; expected L2, COS or INNER_PRODUCT".format(distances))

    index_pipes = []

    if distances == "COS":
        # COS uses the inner-product metric behind an "L2norm" stage.
        index_pipes.append("L2norm")

    n_vectors = vectors.shape[0]
    K = 4 * math.sqrt(n_vectors)
    # The IVF stage is only added once the data set is large relative to K.
    use_ivf = n_vectors > 0 and n_vectors >= 30 * K
    if use_ivf:
        # The factory string needs an integer cluster count: "IVF480",
        # not "IVF480.0" as the float K would render.
        index_pipes.append(f"IVF{int(K)}")

    index_pipes.append("Flat")

    index = faiss.index_factory(vectors.shape[1], ",".join(index_pipes),
                                metric_type)

    vectors = vectors.astype(np.float32)
    if not index.is_trained:
        index.train(vectors)

    index.add(vectors)

    # An IVF index must run make_direct_map() before reconstruct can be used.
    if use_ivf:
        ivf_index = faiss.extract_index_ivf(index)
        ivf_index.make_direct_map()
        ivf_index.nprobe = nprobe

    return index
|
70
|
+
|
71
|
+
|
72
|
+
def read_index_from_file(filename):
    """Load a faiss index object from the file at ``filename``."""
    loaded_index = faiss.read_index(filename)
    return loaded_index
|
75
|
+
|
76
|
+
|
77
|
+
def write_index_to_file(index, filename):
    """Persist the faiss ``index`` object to the file at ``filename``."""
    faiss.write_index(index, filename)
    return None
|
80
|
+
|
81
|
+
|
82
|
+
# Demo: index word vectors with faiss and save the index plus a CSV of names.
# NOTE(review): this runs at import time, and "glove_vector_path" looks like a
# placeholder to be replaced with a real file path - confirm.
word2vec_path = "glove_vector_path"
wv_from_text = gensim.models.KeyedVectors.load_word2vec_format(
    word2vec_path,
    binary=False,
    no_header=True,
)
vectors = wv_from_text.vectors

# One example file name per vector, cycling through 1.jpg .. 9.jpg.
name_example = []
for i in range(len(vectors)):
    name_example.append("{}.jpg".format((i % 9) + 1))

# Optional extra columns left disabled by the author:
#   "vector": str(vectors[0]),
#   "text": list(wv_from_text.key_to_index.keys()),
df = pd.DataFrame({"name": name_example})

test_index = build_index_use(vectors)
write_index_to_file(test_index, "test.index")

df.to_csv("test.csv", index=False)
|
96
|
+
|
97
|
+
import gensim
|
98
|
+
import hnswlib
|
99
|
+
|
100
|
+
# Demo: index the same word vectors with hnswlib and save the index to disk.
# NOTE(review): this runs at import time, and "glove_vector_path" looks like a
# placeholder to be replaced with a real file path - confirm.
word2vec_path = "glove_vector_path"
wv_from_text = gensim.models.KeyedVectors.load_word2vec_format(
    word2vec_path,
    binary=False,
    no_header=True,
)
vectors = wv_from_text.vectors

# Each vector is labelled with its row position.
labels = list(range(len(vectors)))
index = hnswlib.Index(space="l2", dim=len(vectors[0]))
index.init_index(
    max_elements=len(vectors),
    ef_construction=200,
    M=16,
)
index.add_items(vectors, labels)
index.save_index("hnswlib.index")
|
nlpertools/wrapper.py
CHANGED
@@ -1,96 +1,161 @@
|
|
1
|
-
#!/usr/bin/python3.8
|
2
|
-
# -*- coding: utf-8 -*-
|
3
|
-
# @Author : youshu.Ji
|
4
|
-
# 定义装饰器
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
return
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
1
|
+
#!/usr/bin/python3.8
|
2
|
+
# -*- coding: utf-8 -*-
|
3
|
+
# @Author : youshu.Ji
|
4
|
+
# 定义装饰器
|
5
|
+
import logging
|
6
|
+
import time
|
7
|
+
from functools import wraps
|
8
|
+
import asyncio
|
9
|
+
|
10
|
+
def fn_async_timer(function):
    """
    Timing decorator for coroutine functions.

    The wrapped coroutine is awaited, the elapsed wall-clock time is printed,
    and the coroutine's result is returned unchanged.
    """

    @wraps(function)
    async def _timed_call(*args, **kwargs):
        started_at = time.time()
        outcome = await function(*args, **kwargs)
        elapsed = time.time() - started_at
        print('[finished {func_name} in {time:.2f}s]'.format(func_name=function.__name__, time=elapsed))
        return outcome

    return _timed_call
|
23
|
+
|
24
|
+
|
25
|
+
def fn_timer(async_func=False, analyse=False):
    """
    Decorator factory that reports how long the decorated callable takes.

    Example::

        @fn_timer()
        def example():
            time.sleep(2)

    :param async_func: pass the literal ``True`` when decorating a coroutine
        function; the returned wrapper is then a coroutine function as well.
    :param analyse: for synchronous functions only. When truthy, the call is
        profiled with ``pyinstrument`` and the profile is printed instead of
        the one-line timing message. Ignored when ``async_func`` is ``True``.
    :return: a decorator that wraps the target and returns its result unchanged.
    """

    def wrapper(func):
        # wraps() keeps the decorated function's __name__ and __doc__.
        @wraps(func)
        async def func_time_async(*args, **kwargs):
            t0 = time.time()
            result = await asyncio.create_task(func(*args, **kwargs))
            t1 = time.time()
            print('[finished {func_name} in {time:.2f}s]'.format(func_name=func.__name__, time=t1 - t0))
            return result

        @wraps(func)
        def func_time(*args, **kwargs):
            t0 = time.time()
            result = func(*args, **kwargs)
            t1 = time.time()
            print('[finished {func_name} in {time:.2f}s]'.format(func_name=func.__name__, time=t1 - t0))
            return result

        @wraps(func)
        def func_time_analyse(*args, **kwargs):
            # Imported here so pyinstrument is only needed when analyse is used.
            from pyinstrument import Profiler

            profiler = Profiler()
            profiler.start()

            result = func(*args, **kwargs)

            profiler.stop()
            profiler.print()
            return result

        if async_func is True:
            return func_time_async
        if analyse:
            return func_time_analyse
        return func_time

    return wrapper
|
70
|
+
|
71
|
+
|
72
|
+
def fn_timeout_checker(wait_time, callback):
    """
    Decorator factory that falls back to ``callback`` when a call times out.

    Two packages were tried for this; using gevent hit a bug, so eventlet is
    used instead.

    :param wait_time: timeout value passed to ``eventlet.Timeout``.
    :param callback: zero-argument callable; its return value is returned in
        place of the wrapped function's result when the call does not finish.
    :return: a decorator for the function to guard.
    """
    # from gevent import Timeout
    # from gevent.monkey import patch_all

    # patch_all()  # Passing thread=False here together with a Flask app using
    # threaded=True raised errors; the relationship between blocking and
    # threads is not understood yet. Leaving it out (thread=True) works.

    from eventlet import Timeout
    from eventlet import monkey_patch

    # NOTE: this patches the time module process-wide each time the factory is called.
    monkey_patch(time=True)

    def wrapper(func):
        # wraps() keeps the guarded function's __name__ and __doc__.
        @wraps(func)
        def inner(*args, **kwargs):
            finish_flag = False
            # Timeout(..., False) leaves the with-block silently on expiry.
            with Timeout(wait_time, False):
                res = func(*args, **kwargs)
                finish_flag = True
            if not finish_flag:
                res = callback()
            return res

        return inner

    return wrapper
|
100
|
+
|
101
|
+
|
102
|
+
def fn_try(parameter):
    """
    Wrap try...except as a decorator.

    Takes a dict whose ``msg`` entry is a format template. When the decorated
    function raises, a copy of that dict is returned with ``msg`` filled in
    with the error text. The template itself is never modified, so every
    failure is formatted from the original template.

    :param parameter: {"msg": "", etc.} where ``msg`` may contain a ``{}`` placeholder
    :return: on success the function's own result; on failure a copy of
        ``parameter`` with ``msg`` set to the formatted error message
    """

    def wrapper(function):
        @wraps(function)
        def inner(*args, **kwargs):
            try:
                return function(*args, **kwargs)
            except Exception as e:
                msg = "报错!"
                print('[func_name: {func_name} {msg}]'.format(func_name=function.__name__, msg=msg))
                # Work on a copy: formatting the shared dict in place would
                # consume the placeholder on the first failure.
                failure = parameter.copy()
                failure["msg"] = parameter["msg"].format(str(e))
                return failure

        return inner

    return wrapper
|
126
|
+
|
127
|
+
|
128
|
+
def try_log(function):
    """
    Decorator that logs any exception raised by ``function`` instead of
    propagating it.

    On failure it logs the call arguments, the file and line where the
    exception was raised, and ``repr`` of the exception, all at ERROR level
    on the root logger. The wrapper then returns ``None``.

    :param function: the callable to guard
    :return: the wrapped callable
    """

    @wraps(function)
    def inner(*args, **kwargs):
        try:
            return function(*args, **kwargs)
        except Exception as e:
            # Walk to the innermost frame: the first traceback entry is this
            # wrapper itself, not the place where the error was raised.
            tb = e.__traceback__
            while tb.tb_next is not None:
                tb = tb.tb_next
            # Pass the arguments as log parameters; using them as the message
            # fails when there are no positional arguments.
            logging.error("%s failed with args=%r kwargs=%r", function.__name__, args, kwargs)
            logging.error("%s", tb.tb_frame.f_code.co_filename)
            logging.error("%s", tb.tb_lineno)
            logging.error("%s", repr(e))
            return None

    return inner
|
141
|
+
|
142
|
+
|
143
|
+
def example(function):
    """
    Template decorator showing how decorators in this module are written.

    It prints a fixed notice, then calls ``function`` and returns its result
    unchanged.
    """

    @wraps(function)
    def _announcing_call(*args, **kwargs):
        print("此方法仅仅用于提示该方法怎么写")
        return function(*args, **kwargs)

    return _announcing_call
|
151
|
+
|
152
|
+
|
153
|
+
def singleton(cls):
    """
    Class decorator that creates at most one instance of ``cls``.

    The first call constructs the instance with the given arguments. Every
    later call returns that same instance and ignores its arguments.
    """
    instances = {}

    def _singleton(*args, **kwargs):
        if cls in instances:
            return instances[cls]
        created = cls(*args, **kwargs)
        instances[cls] = created
        return created

    return _singleton
|