nlpertools 1.0.4__py3-none-any.whl → 1.0.6.dev0__py3-none-any.whl
Sign up to get free protection for your applications and to get access to all the features.
- nlpertools/__init__.py +24 -11
- nlpertools/algo/__init__.py +0 -0
- nlpertools/algo/ac.py +18 -0
- nlpertools/algo/bit_ops.py +28 -0
- nlpertools/algo/kmp.py +94 -0
- nlpertools/algo/num_ops.py +12 -0
- nlpertools/algo/template.py +116 -0
- nlpertools/algo/union.py +13 -0
- nlpertools/data_client.py +387 -0
- nlpertools/data_structure/__init__.py +0 -0
- nlpertools/data_structure/base_structure.py +109 -0
- nlpertools/dataprocess.py +611 -3
- nlpertools/default_db_config.yml +41 -0
- nlpertools/io/__init__.py +3 -3
- nlpertools/io/dir.py +54 -47
- nlpertools/io/file.py +277 -205
- nlpertools/ml.py +483 -317
- nlpertools/monitor/__init__.py +0 -0
- nlpertools/monitor/gpu.py +18 -0
- nlpertools/monitor/memory.py +24 -0
- nlpertools/movie.py +36 -0
- nlpertools/nlpertools_config.yml +1 -0
- nlpertools/{openApi.py → open_api.py} +65 -62
- nlpertools/other.py +364 -188
- nlpertools/pic.py +288 -0
- nlpertools/plugin.py +43 -34
- nlpertools/reminder.py +98 -15
- nlpertools/template/__init__.py +0 -0
- nlpertools/utils/__init__.py +3 -0
- nlpertools/utils/lazy.py +727 -0
- nlpertools/utils/log_util.py +20 -0
- nlpertools/utils/package.py +89 -0
- nlpertools/utils/package_v1.py +94 -0
- nlpertools/utils/package_v2.py +117 -0
- nlpertools/utils_for_nlpertools.py +93 -0
- nlpertools/vector_index_demo.py +108 -0
- nlpertools/wrapper.py +161 -0
- {nlpertools-1.0.4.dist-info → nlpertools-1.0.6.dev0.dist-info}/LICENSE +200 -200
- nlpertools-1.0.6.dev0.dist-info/METADATA +111 -0
- nlpertools-1.0.6.dev0.dist-info/RECORD +43 -0
- {nlpertools-1.0.4.dist-info → nlpertools-1.0.6.dev0.dist-info}/WHEEL +1 -1
- nlpertools-1.0.6.dev0.dist-info/top_level.txt +2 -0
- nlpertools_helper/__init__.py +10 -0
- nlpertools-1.0.4.dist-info/METADATA +0 -42
- nlpertools-1.0.4.dist-info/RECORD +0 -15
- nlpertools-1.0.4.dist-info/top_level.txt +0 -1
@@ -0,0 +1,20 @@
|
|
1
|
+
#!/usr/bin/python3.8
# -*- coding: utf-8 -*-
# @Author : youshu.Ji
# NOTE: the entire logging setup below is deliberately disabled by wrapping
# it in a module-level string literal.  The first line inside it (translated
# from Chinese) says: "this project currently has no log output".
"""
# 该项目暂时没有日志输出
import codecs
import logging.config

import nlpertools
import yaml

nlpertools.j_mkdir("logs")

with codecs.open('log_config.yml', 'r', 'utf-8') as stream:
    config = yaml.load(stream, Loader=yaml.FullLoader)

# logging.basicConfig(level=logging.INFO)
logging.config.dictConfig(config)
logger = logging.getLogger()
"""
|
@@ -0,0 +1,89 @@
|
|
1
|
+
#!/usr/bin/python3.8
|
2
|
+
# -*- coding: utf-8 -*-
|
3
|
+
# @Author : youshu.Ji
|
4
|
+
import importlib
|
5
|
+
from importlib import import_module
|
6
|
+
|
7
|
+
|
8
|
+
def try_import(name, package):
    """Import *name* (optionally relative to *package*), returning None on failure.

    Used for optional third-party dependencies: a missing or broken package
    must not prevent nlpertools itself from importing.

    :param name: module name, absolute or relative (e.g. "numpy", ".sub").
    :param package: anchor package for relative imports, or None.
    :return: the imported module, or None if the import failed.
    """
    try:
        return import_module(name, package=package)
    except Exception:
        # Narrower than the original bare ``except:`` (no longer swallows
        # SystemExit/KeyboardInterrupt).  Still deliberately best-effort:
        # callers must check for None before use.
        return None
|
16
|
+
|
17
|
+
|
18
|
+
def lazy_import(importer_name, to_import):
    """Return the importing module and a callable for lazy importing.

    (Recipe found on the web, original author unknown; currently unused in
    this project.  The original had a second, dead string expression after
    the docstring -- merged into this one.)

    The module named by importer_name represents the module performing the
    import to help facilitate resolving relative imports.

    to_import is an iterable of the modules to be potentially imported (absolute
    or relative). The `as` form of importing is also supported,
    e.g. `pkg.mod as spam`.

    This function returns a tuple of two items. The first is the importer
    module for easy reference within itself. The second item is a callable to be
    set to `__getattr__`.
    """
    module = importlib.import_module(importer_name)
    import_mapping = {}
    for name in to_import:
        importing, _, binding = name.partition(' as ')
        if not binding:
            _, _, binding = importing.rpartition('.')
        import_mapping[binding] = importing

    def __getattr__(name):
        if name not in import_mapping:
            message = f'module {importer_name!r} has no attribute {name!r}'
            raise AttributeError(message)
        importing = import_mapping[name]
        # importlib.import_module() implicitly sets submodules on this module
        # as appropriate for direct imports.  (Typo "imortlib" fixed.)
        imported = importlib.import_module(importing,
                                           module.__spec__.parent)
        setattr(module, name, imported)
        return imported

    return module, __getattr__
|
58
|
+
|
59
|
+
|
60
|
+
# Optional third-party modules, resolved best-effort via try_import: each
# name is either the module object or None when the package is missing.
# NOTE(review): the second argument is passed as ``package`` to
# importlib.import_module, which only affects *relative* names -- so e.g.
# plt is the top-level matplotlib module, not matplotlib.pyplot; verify
# against callers before relying on it.
# jieba = try_import("jieba", None)
# sns = try_import("seaborn", None)
# torch = try_import("torch", None)
# nn = try_import("torch.nn", None)
# BertTokenizer = try_import("transformers", "BertTokenizer")
# BertForMaskedLM = try_import("transformers", "BertForMaskedLM")
# Elasticsearch = try_import("elasticsearch", "Elasticsearch")
# pd = try_import("pandas", None)
# xgb = try_import("xgboost", None)

aioredis = try_import("aioredis", None)
pymysql = try_import("pymysql", None)
zhconv = try_import("zhconv", None)
KafkaProducer = try_import("kafka", "KafkaProducer")
KafkaConsumer = try_import("kafka", "KafkaConsumer")
np = try_import("numpy", None)
plt = try_import("matplotlib", "pyplot")
WordNetLemmatizer = try_import("nltk.stem", "WordNetLemmatizer")
metrics = try_import("sklearn", "metrics")
requests = try_import("requests", None)
pq = try_import("pyquery", None)
CountVectorizer = try_import("sklearn.feature_extraction.text", "CountVectorizer")
precision_recall_fscore_support = try_import("sklearn.metrics", "precision_recall_fscore_support")
tqdm = try_import("tqdm", "tqdm")
# TODO: auto-exporting langid and win32evtlogutil is buggy (translated from Chinese)
langid = try_import("langid", None)
win32evtlogutil = try_import("win32evtlogutil", None)
TfidfTransformer = try_import("sklearn.feature_extraction.text", "TfidfTransformer")
yaml = try_import("yaml", None)
omegaconf = try_import("omegaconf", None)
|
@@ -0,0 +1,94 @@
|
|
1
|
+
#!/usr/bin/python3.8
|
2
|
+
# -*- coding: utf-8 -*-
|
3
|
+
# @Author : youshu.Ji
|
4
|
+
import importlib
|
5
|
+
from importlib import import_module
|
6
|
+
from importlib.util import LazyLoader
|
7
|
+
from .lazy import lazy_module
|
8
|
+
|
9
|
+
EXCLUDE_LAZYIMPORT = {"torch", "torch.nn", "numpy"}
|
10
|
+
|
11
|
+
|
12
|
+
def try_import(name, package):
    """Lazily import *name*, returning None (with a console note) on failure.

    Modules listed in EXCLUDE_LAZYIMPORT are imported eagerly because lazy
    loading is known to break them.

    :param name: module or attribute-holding module name.
    :param package: when given, "<package>.<name>" is lazy-loaded instead.
    :return: a (possibly lazy) module object, or None on failure.
    """
    try:
        if package:
            # e.g. name="KafkaProducer", package="kafka" -> "kafka.KafkaProducer"
            return lazy_module("{}.{}".format(package, name))
        if name in EXCLUDE_LAZYIMPORT:
            return import_module(name, package=package)
        return lazy_module(name)
        # return import_module(name, package=package)
    except Exception:
        # Narrower than the original bare ``except:``; the dead ``pass``
        # before the print is removed.
        print("import {} failed".format(name))
        return None
|
27
|
+
|
28
|
+
|
29
|
+
def lazy_import(importer_name, to_import):
    """Return the importing module and a callable for lazy importing.

    The module named by importer_name represents the module performing the
    import to help facilitate resolving relative imports.

    to_import is an iterable of the modules to be potentially imported (absolute
    or relative). The `as` form of importing is also supported,
    e.g. `pkg.mod as spam`.

    This function returns a tuple of two items. The first is the importer
    module for easy reference within itself. The second item is a callable to be
    set to `__getattr__`.
    """
    module = importlib.import_module(importer_name)

    # Map each binding name (the ``as`` alias, or the last dotted part)
    # to the module path that should be imported on first access.
    import_mapping = {}
    for spec in to_import:
        target, _, alias = spec.partition(' as ')
        if not alias:
            alias = target.rpartition('.')[2]
        import_mapping[alias] = target

    def __getattr__(name):
        if name not in import_mapping:
            raise AttributeError(
                f'module {importer_name!r} has no attribute {name!r}')
        target = import_mapping[name]
        # import_module() implicitly sets submodules on this module as
        # appropriate for direct imports.
        imported = importlib.import_module(target, module.__spec__.parent)
        setattr(module, name, imported)
        return imported

    return module, __getattr__
|
64
|
+
|
65
|
+
|
66
|
+
# Optional third-party modules resolved through the lazy try_import above:
# each binding is a lazy module proxy, an eagerly imported module (for the
# EXCLUDE_LAZYIMPORT set), or None when the package is absent.
aioredis = try_import("aioredis", None)
happybase = try_import("happybase", None)
pd = try_import("pandas", None)
pymysql = try_import("pymysql", None)
Elasticsearch = try_import("elasticsearch", "Elasticsearch")
KafkaProducer = try_import("kafka", "KafkaProducer")
MongoClient = try_import("pymongo", "MongoClient")
helpers = try_import("elasticsearch", "helpers")
KafkaConsumer = try_import("kafka", "KafkaConsumer")
np = try_import("numpy", None)
sns = try_import("seaborn", None)
torch = try_import("torch", None)
nn = try_import("torch.nn", None)
xgb = try_import("xgboost", None)
plt = try_import("matplotlib", "pyplot")
WordNetLemmatizer = try_import("nltk.stem", "WordNetLemmatizer")
metrics = try_import("sklearn", "metrics")
BertTokenizer = try_import("transformers", "BertTokenizer")
BertForMaskedLM = try_import("transformers", "BertForMaskedLM")
requests = try_import("requests", None)
psutil = try_import("psutil", None)
pq = try_import("pyquery", None)
CountVectorizer = try_import("sklearn.feature_extraction.text", "CountVectorizer")
precision_recall_fscore_support = try_import("sklearn.metrics", "precision_recall_fscore_support")
tqdm = try_import("tqdm", "tqdm")
langid = try_import("langid", None)
# win32evtlogutil?
TfidfTransformer = try_import("sklearn.feature_extraction.text", "TfidfTransformer")
yaml = try_import("yaml", None)
|
@@ -0,0 +1,117 @@
|
|
1
|
+
# !/usr/bin/python3.8
|
2
|
+
# -*- coding: utf-8 -*-
|
3
|
+
# @Author : youshu.Ji
|
4
|
+
import importlib
|
5
|
+
from importlib import import_module
|
6
|
+
import os
|
7
|
+
|
8
|
+
|
9
|
+
def try_import(name, package):
    """Import *name* (optionally relative to *package*), returning None on failure.

    Best-effort import for optional dependencies; callers must check the
    result for None before use.

    :param name: module name, absolute or relative.
    :param package: anchor package for relative imports, or None.
    :return: the imported module, or None if the import failed.
    """
    try:
        return import_module(name, package=package)
    except Exception:
        # Narrower than the original bare ``except:``; still deliberately
        # silent so a missing package never breaks module import.
        return None
|
17
|
+
|
18
|
+
|
19
|
+
# Public placeholders: every alias exists (as None) even when the backing
# package is missing, so ``from ... import X`` never raises NameError.
aioredis = None
happybase = None
pd = None
pymysql = None
Elasticsearch = None
KafkaProducer = None
MongoClient = None
helpers = None
KafkaConsumer = None
np = None
sns = None
torch = None
nn = None
xgb = None
plt = None
WordNetLemmatizer = None
metrics = None
BertTokenizer = None
BertForMaskedLM = None
requests = None
psutil = None
pq = None
CountVectorizer = None
precision_recall_fscore_support = None
tqdm = None
langid = None
win32evtlogutil = None
TfidfTransformer = None
yaml = None

# alias -> (module name, anchor package) as consumed by try_import.
import_dict = {
    "aioredis": ("aioredis", None),
    "happybase": ("happybase", None),
    "pd": ("pandas", None),
    "pymysql": ("pymysql", None),
    "Elasticsearch": ("elasticsearch", "Elasticsearch"),
    "KafkaProducer": ("kafka", "KafkaProducer"),
    "MongoClient": ("pymongo", "MongoClient"),
    "helpers": ("elasticsearch", "helpers"),
    "KafkaConsumer": ("kafka", "KafkaConsumer"),
    "np": ("numpy", None),
    "sns": ("seaborn", None),
    "torch": ("torch", None),
    "nn": ("torch.nn", None),
    "xgb": ("xgboost", None),
    "plt": ("matplotlib", "pyplot"),
    "WordNetLemmatizer": ("nltk.stem", "WordNetLemmatizer"),
    "metrics": ("sklearn", "metrics"),
    "BertTokenizer": ("transformers", "BertTokenizer"),
    "BertForMaskedLM": ("transformers", "BertForMaskedLM"),
    "requests": ("requests", None),
    "psutil": ("psutil", None),
    "pq": ("pyquery", None),
    "CountVectorizer": ("sklearn.feature_extraction.text", "CountVectorizer"),
    "precision_recall_fscore_support": ("sklearn.metrics", "precision_recall_fscore_support"),
    "tqdm": ("tqdm", "tqdm"),
    "langid": ("langid", None),
    "win32evtlogutil": ("win32evtlogutil", None),
    "TfidfTransformer": ("sklearn.feature_extraction.text", "TfidfTransformer"),
    "yaml": ("yaml", None)
}
if "nlpertools_helper" in os.environ.keys():
    # TODO (translated from Chinese): this path has not been tested.
    # Import only the aliases listed in the env var, comma-separated,
    # e.g. nlpertools_helper="np,pd,requests".
    # BUGFIX: the env value is a *string*; iterating it directly looped over
    # single characters and raised KeyError -- split it first.
    import_list = [k.strip() for k in os.environ["nlpertools_helper"].split(",") if k.strip()]

    for k in import_list:
        name, package = import_dict[k]
        globals()[k] = try_import(name, package)
else:
    # Import everything declared in import_dict (the original repeated each
    # assignment by hand; the alias set is identical).
    for k, (name, package) in import_dict.items():
        globals()[k] = try_import(name, package)
|
@@ -0,0 +1,93 @@
|
|
1
|
+
import os
|
2
|
+
import shutil
|
3
|
+
from importlib import import_module
|
4
|
+
|
5
|
+
from .io.dir import j_mkdir
|
6
|
+
from .io.file import readtxt_list_all_strip, writetxt_w_list
|
7
|
+
|
8
|
+
|
9
|
+
def try_import(name, package):
    """Import *name* (optionally relative to *package*), printing a note and
    returning None on failure.

    :param name: module name, absolute or relative.
    :param package: anchor package for relative imports, or None.
    :return: the imported module, or None if the import failed.
    """
    try:
        return import_module(name, package=package)
    except Exception:
        # Narrower than the original bare ``except:``; keep the best-effort
        # behaviour of reporting and continuing.
        print("import {} failed".format(name))
        return None
|
16
|
+
|
17
|
+
|
18
|
+
def convert_import_to_try_import(from_path, to_path):
    """Mirror the package tree at *from_path* into *to_path*, rewriting the
    top-of-file import section of each eligible .py file into try_import
    calls (via convert_import_string_to_import_list).

    Files named in ``excluded_file`` (and utils_for_nlpertools.py itself, and
    non-.py files) are copied verbatim.

    :param from_path: source package directory.
    :param to_path: destination directory (created if missing).
    """
    j_mkdir(to_path)
    for root, dirs, files in os.walk(from_path):
        for sub_dir in dirs:
            j_mkdir(os.path.join(root.replace(from_path, to_path), sub_dir))
        for file in files:
            src = os.path.join(root, file)
            dst = os.path.join(root.replace(from_path, to_path), file)
            excluded_file = ["wrapper.py", "kmp.py", "__init__.py"]
            if file.endswith(".py") and file != "utils_for_nlpertools.py" and file not in excluded_file:
                raw_code = readtxt_list_all_strip(src)
                # start_idx/end_idx delimit the run of import lines that
                # failed to execute here (i.e. third-party imports).
                start_idx, end_idx = 0, 0

                # Only scan the first 30 lines for the import section.
                for idx, each_line in enumerate(raw_code[:30]):
                    each_line = each_line.lstrip("# ")
                    if start_idx == 0 and (each_line.startswith("from") or each_line.startswith("import")):
                        try:
                            # HACK: executes each import line in this frame to
                            # probe availability -- only safe on trusted source.
                            exec(each_line)
                        except:
                            start_idx = idx
                    # First blank line after the failing imports ends the span.
                    if start_idx != 0 and not each_line:
                        end_idx = idx
                        break
                # print(file, start_idx, end_idx)
                if start_idx != 0 and end_idx != 0:
                    new_code = raw_code[:start_idx] + convert_import_string_to_import_list(
                        "\n".join(raw_code[start_idx:end_idx])) + raw_code[end_idx:]
                else:
                    # No rewritable import span found: keep the file as-is.
                    new_code = raw_code
                writetxt_w_list(new_code, dst)
            else:
                shutil.copy(src=src, dst=dst)
    print("convert over")
|
51
|
+
|
52
|
+
|
53
|
+
def get_import_info(text):
    """Placeholder: parse import information from *text*. Not implemented."""
    pass
|
55
|
+
|
56
|
+
|
57
|
+
def convert_import_string_to_import_list(text):
    """
    Convert a block of plain import statements into try_import call lines.
    (Original docstring, translated: "this method turns import into try
    import".)

    :param text: newline-separated import statements.
    :return: list of generated source lines, starting with a blank line and
        ``from utils_for_nlpertools import try_import``.
    """
    models_to_import = []
    import_list = text.split("\n")
    # NOTE: import_list is intentionally appended to while being iterated --
    # comma-separated imports are expanded into synthesized single-import
    # lines that get processed on later iterations.
    for each in import_list:
        print(each)
        name, package, as_name = None, None, None
        elements = each.split(" ")
        for pre, cur in zip(elements, elements[1:]):
            if cur.endswith(","):
                cur = cur.rstrip(",")
            # (Translated) To treat "from x import y" and "import x"
            # uniformly, package/name meanings are swapped here and fixed
            # up after the loop.
            if pre == "import":
                package = cur
            if pre == "from":
                name = cur
            if pre == "as":
                as_name = cur
            if pre[-1] == ",":
                # (Translated) Handles multi-imports such as
                # "from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer"
                # by combining the prefix with the current name into a new line.
                prefix = each.split("import")[0]
                import_list.append("{}import {}".format(prefix, cur))
        if not as_name:
            as_name = package.split(".")[-1]
        if not name:
            name, package = package, name
        models_to_import.append((name, package, as_name))
    # (Translated) Emit / print the generated lines.
    all_import_info = ["", "from utils_for_nlpertools import try_import", ""]
    for name, package, as_name in models_to_import:
        import_info = '{} = try_import("{}", {})'.format(as_name, name, '"{}"'.format(package) if package else package)
        all_import_info.append(import_info)
        print(import_info)
    return all_import_info
|
@@ -0,0 +1,108 @@
|
|
1
|
+
#!/usr/bin/python3.8
|
2
|
+
# -*- coding: utf-8 -*-
|
3
|
+
# @Author : youshu.Ji
|
4
|
+
import math
|
5
|
+
|
6
|
+
import faiss
|
7
|
+
import gensim
|
8
|
+
import numpy as np
|
9
|
+
import pandas as pd
|
10
|
+
|
11
|
+
|
12
|
+
def build_index_use(vectors):
    """Build a fixed-configuration IVF-Flat faiss index (L2 metric, 100 cells)
    over *vectors* and return it.
    """
    dim = len(vectors[0])
    n_cells = 100
    coarse_quantizer = faiss.IndexFlatL2(dim)
    ivf_index = faiss.IndexIVFFlat(coarse_quantizer, dim, n_cells, faiss.METRIC_L2)
    ivf_index.train(vectors)
    ivf_index.add(vectors)
    return ivf_index
|
20
|
+
|
21
|
+
|
22
|
+
def build_index(vectors, distances="L2", nprobe=10):
    """Build a faiss index.

    Args:
        vectors(numpy.array): vector matrix, shape=(n_vectors, dim).
        distances(str): metric, one of "L2", "COS", "INNER_PRODUCT".
            COS is implemented as inner product over L2-normalized vectors.
        nprobe(int): number of IVF cells to probe at search time.

    Return: the faiss index object.
    """
    if distances == "L2":
        metric_type = faiss.METRIC_L2
    elif distances in ("COS", "INNER_PRODUCT"):
        metric_type = faiss.METRIC_INNER_PRODUCT
    else:
        raise NotImplementedError

    index_pipes = []

    if distances == "COS":
        index_pipes.append("L2norm")

    # Rule of thumb: ~4*sqrt(N) IVF cells.  BUGFIX: this must be an int --
    # the float result of math.sqrt produced an invalid factory string such
    # as "IVF126.49,Flat", which faiss.index_factory rejects.
    K = int(4 * math.sqrt(vectors.shape[0]))
    use_ivf = False
    if vectors.shape[0] >= 30 * K:
        index_pipes.append(f"IVF{K}")
        use_ivf = True

    index_pipes.append("Flat")

    index = faiss.index_factory(vectors.shape[1], ",".join(index_pipes),
                                metric_type)

    # faiss requires float32 input.
    vectors = vectors.astype(np.float32)
    if not index.is_trained:
        index.train(vectors)

    index.add(vectors)

    # IVF indexes must build a direct map before reconstruct() can be used.
    if use_ivf:
        ivf_index = faiss.extract_index_ivf(index)
        ivf_index.make_direct_map()
        ivf_index.nprobe = nprobe

    return index
|
70
|
+
|
71
|
+
|
72
|
+
def read_index_from_file(filename):
    """Load and return a faiss index previously saved to *filename*."""
    return faiss.read_index(filename)
|
75
|
+
|
76
|
+
|
77
|
+
def write_index_to_file(index, filename):
    """Persist the faiss *index* to *filename*."""
    faiss.write_index(index, filename)
|
80
|
+
|
81
|
+
|
82
|
+
# --- Demo 1: build a faiss index from GloVe-format word vectors. ---
# NOTE(review): "glove_vector_path" is a placeholder path; this module runs
# these demos at import time -- verify that is intended.
word2vec_path = "glove_vector_path"
wv_from_text = gensim.models.KeyedVectors.load_word2vec_format(word2vec_path, binary=False, no_header=True)
vectors = wv_from_text.vectors

# Fake metadata: cycles through names "1.jpg" .. "9.jpg".
name_example = ["{}.jpg".format((i % 9) + 1) for i in range(len(vectors))]
df = pd.DataFrame({
    "name": name_example,
    # "vector": str(vectors[0]),
    # "text": list(wv_from_text.key_to_index.keys()),
})
test_index = build_index_use(vectors)
write_index_to_file(test_index, "test.index")

df.to_csv("test.csv", index=False)

# --- Demo 2: build an hnswlib index over the same vectors. ---
import gensim
import hnswlib

word2vec_path = "glove_vector_path"
wv_from_text = gensim.models.KeyedVectors.load_word2vec_format(word2vec_path, binary=False, no_header=True)
vectors = wv_from_text.vectors

labels = [idx for idx, i in enumerate(vectors)]
index = hnswlib.Index(space="l2", dim=len(vectors[0]))
index.init_index(max_elements=len(vectors), ef_construction=200, M=16)
index.add_items(vectors, labels)
index.save_index("hnswlib.index")
|
nlpertools/wrapper.py
ADDED
@@ -0,0 +1,161 @@
|
|
1
|
+
#!/usr/bin/python3.8
|
2
|
+
# -*- coding: utf-8 -*-
|
3
|
+
# @Author : youshu.Ji
|
4
|
+
# 定义装饰器
|
5
|
+
import logging
|
6
|
+
import time
|
7
|
+
from functools import wraps
|
8
|
+
import asyncio
|
9
|
+
|
10
|
+
def fn_async_timer(function):
    """Timing decorator for async functions: awaits the wrapped coroutine,
    prints the elapsed seconds, and returns its result.
    """

    @wraps(function)
    async def function_timer(*args, **kwargs):
        started = time.time()
        result = await function(*args, **kwargs)
        elapsed = time.time() - started
        print('[finished {func_name} in {time:.2f}s]'.format(func_name=function.__name__, time=elapsed))
        return result

    return function_timer
|
23
|
+
|
24
|
+
|
25
|
+
def fn_timer(async_func=False, analyse=False):
    """Decorator factory for timing (or profiling) a function call.

    >>> @fn_timer()
    >>> def example():
    >>>     time.sleep(2)

    :param async_func: when True, wrap an async (coroutine) function.
    :param analyse: when True (sync only), profile with pyinstrument
        instead of simple wall-clock timing.
    :return: a decorator.
    """

    def wrapper(func):
        # BUGFIX/consistency: the inner wrappers previously lacked @wraps,
        # unlike fn_async_timer in this module, so __name__/__doc__ of the
        # decorated function were lost.
        @wraps(func)
        async def func_time_async(*args, **kwargs):
            t0 = time.time()
            result = await asyncio.create_task(func(*args, **kwargs))
            t1 = time.time()
            print('[finished {func_name} in {time:.2f}s]'.format(func_name=func.__name__, time=t1 - t0))
            return result

        @wraps(func)
        def func_time(*args, **kwargs):
            t0 = time.time()
            result = func(*args, **kwargs)
            t1 = time.time()
            print('[finished {func_name} in {time:.2f}s]'.format(func_name=func.__name__, time=t1 - t0))
            return result

        @wraps(func)
        def func_time_analyse(*args, **kwargs):
            # Imported lazily: pyinstrument is an optional dependency.
            from pyinstrument import Profiler

            profiler = Profiler()
            profiler.start()

            result = func(*args, **kwargs)

            profiler.stop()
            profiler.print()
            return result

        if async_func is True:
            return func_time_async
        if analyse:
            return func_time_analyse
        return func_time

    return wrapper
|
70
|
+
|
71
|
+
|
72
|
+
def fn_timeout_checker(wait_time, callback):
    """
    Timeout-checking decorator factory: run the wrapped function with a
    budget of *wait_time* seconds; if it does not finish in time, return
    ``callback()`` instead.

    (Original notes, translated from Chinese: of the two green-thread
    packages, gevent showed bugs here, so eventlet is used.)

    :param wait_time: timeout in seconds.
    :param callback: zero-argument fallback producing the result on timeout.
    """
    # from gevent import Timeout
    # from gevent.monkey import patch_all

    # patch_all()  # (Translated) thread=False together with a Flask app's
    # threaded=True caused errors; the blocking/threading interaction is not
    # fully understood.  With thread=True (the default) there is no problem.

    from eventlet import Timeout
    from eventlet import monkey_patch

    # NOTE(review): monkey-patches time globally on first decoration --
    # affects the whole process; confirm this is acceptable for callers.
    monkey_patch(time=True)

    def wrapper(func):
        def inner(*args, **kwargs):
            finish_flag = False
            # Timeout(..., False) swallows the timeout instead of raising;
            # if the body did not complete, finish_flag stays False.
            with Timeout(wait_time, False):
                res = func(*args, **kwargs)
                finish_flag = True
            if not finish_flag:
                res = callback()
            return res

        return inner

    return wrapper
|
100
|
+
|
101
|
+
|
102
|
+
def fn_try(parameter):
    """Decorator factory wrapping a function in try/except.

    On success the wrapped function's own result is returned.  On any
    exception, a copy of *parameter* is returned with its "msg" field
    formatted with the error text.

    :param parameter: {"msg": "...{}...", etc.} -- "msg" must contain a
        "{}" placeholder for the error message.
    :return: decorator whose wrapper returns either the function result or
        the filled-in parameter dict.
    """

    def wrapper(function):
        @wraps(function)  # consistency: preserve __name__/__doc__ like fn_async_timer
        def inner(*args, **kwargs):
            try:
                return function(*args, **kwargs)
            except Exception as e:
                msg = "报错!"
                print('[func_name: {func_name} {msg}]'.format(func_name=function.__name__, msg=msg))
                # BUGFIX: format into a shallow copy -- formatting the
                # caller's template in place destroyed the "{}" placeholder,
                # so a second failure returned the first failure's message.
                filled = dict(parameter)
                filled["msg"] = filled["msg"].format(str(e))
                return filled

        return inner

    return wrapper
|
126
|
+
|
127
|
+
|
128
|
+
def try_log(function):
    """Decorator: run the function, logging any exception (arguments, file,
    line number, repr) instead of propagating it; returns None on failure.
    """

    @wraps(function)
    def inner(*args, **kwargs):
        try:
            return function(*args, **kwargs)
        except Exception as e:
            # BUGFIX: the original ``logging.error(*args)`` unpacked the
            # wrapped function's positional args as logging's own parameters
            # and raised TypeError whenever args was empty.
            logging.error("args: %s", args)
            logging.error(e.__traceback__.tb_frame.f_globals["__file__"])
            logging.error(e.__traceback__.tb_lineno)
            logging.error(repr(e))

    return inner
|
141
|
+
|
142
|
+
|
143
|
+
def example(function):
    """Template decorator kept as a reference for how to write decorators.
    (The printed message, in Chinese, says exactly that.)
    """

    @wraps(function)
    def function_example(*args, **kwargs):
        print("此方法仅仅用于提示该方法怎么写")
        return function(*args, **kwargs)

    return function_example
|
151
|
+
|
152
|
+
|
153
|
+
def singleton(cls):
    """Class decorator: construct *cls* at most once and always return that
    single cached instance (constructor args are only used on first call).
    """
    _cache = {}

    def _singleton(*args, **kwargs):
        try:
            return _cache[cls]
        except KeyError:
            instance = cls(*args, **kwargs)
            _cache[cls] = instance
            return instance

    return _singleton
|