nlpertools 1.0.4__py3-none-any.whl → 1.0.6.dev0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nlpertools/__init__.py +24 -11
- nlpertools/algo/__init__.py +0 -0
- nlpertools/algo/ac.py +18 -0
- nlpertools/algo/bit_ops.py +28 -0
- nlpertools/algo/kmp.py +94 -0
- nlpertools/algo/num_ops.py +12 -0
- nlpertools/algo/template.py +116 -0
- nlpertools/algo/union.py +13 -0
- nlpertools/data_client.py +387 -0
- nlpertools/data_structure/__init__.py +0 -0
- nlpertools/data_structure/base_structure.py +109 -0
- nlpertools/dataprocess.py +611 -3
- nlpertools/default_db_config.yml +41 -0
- nlpertools/io/__init__.py +3 -3
- nlpertools/io/dir.py +54 -47
- nlpertools/io/file.py +277 -205
- nlpertools/ml.py +483 -317
- nlpertools/monitor/__init__.py +0 -0
- nlpertools/monitor/gpu.py +18 -0
- nlpertools/monitor/memory.py +24 -0
- nlpertools/movie.py +36 -0
- nlpertools/nlpertools_config.yml +1 -0
- nlpertools/{openApi.py → open_api.py} +65 -62
- nlpertools/other.py +364 -188
- nlpertools/pic.py +288 -0
- nlpertools/plugin.py +43 -34
- nlpertools/reminder.py +98 -15
- nlpertools/template/__init__.py +0 -0
- nlpertools/utils/__init__.py +3 -0
- nlpertools/utils/lazy.py +727 -0
- nlpertools/utils/log_util.py +20 -0
- nlpertools/utils/package.py +89 -0
- nlpertools/utils/package_v1.py +94 -0
- nlpertools/utils/package_v2.py +117 -0
- nlpertools/utils_for_nlpertools.py +93 -0
- nlpertools/vector_index_demo.py +108 -0
- nlpertools/wrapper.py +161 -0
- {nlpertools-1.0.4.dist-info → nlpertools-1.0.6.dev0.dist-info}/LICENSE +200 -200
- nlpertools-1.0.6.dev0.dist-info/METADATA +111 -0
- nlpertools-1.0.6.dev0.dist-info/RECORD +43 -0
- {nlpertools-1.0.4.dist-info → nlpertools-1.0.6.dev0.dist-info}/WHEEL +1 -1
- nlpertools-1.0.6.dev0.dist-info/top_level.txt +2 -0
- nlpertools_helper/__init__.py +10 -0
- nlpertools-1.0.4.dist-info/METADATA +0 -42
- nlpertools-1.0.4.dist-info/RECORD +0 -15
- nlpertools-1.0.4.dist-info/top_level.txt +0 -1
@@ -0,0 +1,20 @@
|
|
1
|
+
#!/usr/bin/python3.8
|
2
|
+
# -*- coding: utf-8 -*-
|
3
|
+
# @Author : youshu.Ji
|
4
|
+
"""
|
5
|
+
# 该项目暂时没有日志输出
|
6
|
+
import codecs
|
7
|
+
import logging.config
|
8
|
+
|
9
|
+
import nlpertools
|
10
|
+
import yaml
|
11
|
+
|
12
|
+
nlpertools.j_mkdir("logs")
|
13
|
+
|
14
|
+
with codecs.open('log_config.yml', 'r', 'utf-8') as stream:
|
15
|
+
config = yaml.load(stream, Loader=yaml.FullLoader)
|
16
|
+
|
17
|
+
# logging.basicConfig(level=logging.INFO)
|
18
|
+
logging.config.dictConfig(config)
|
19
|
+
logger = logging.getLogger()
|
20
|
+
"""
|
@@ -0,0 +1,89 @@
|
|
1
|
+
#!/usr/bin/python3.8
|
2
|
+
# -*- coding: utf-8 -*-
|
3
|
+
# @Author : youshu.Ji
|
4
|
+
import importlib
|
5
|
+
from importlib import import_module
|
6
|
+
|
7
|
+
|
8
|
+
def try_import(name, package):
    """
    Import ``name`` and return the module, or ``None`` when the import fails.

    Optional dependencies are loaded through this helper so that a missing
    package does not prevent the rest of the library from being imported.

    :param name: module name passed to ``importlib.import_module``.
    :param package: forwarded as ``import_module``'s ``package`` argument, i.e.
        the anchor used to resolve *relative* names. For an absolute ``name``
        importlib ignores it, so ``try_import("kafka", "KafkaProducer")``
        returns the ``kafka`` module, not the class.
        NOTE(review): call sites in this file look like they expect
        ``from name import package`` semantics -- confirm before relying on it.
    :return: the imported module, or ``None`` if importing raised.
    """
    try:
        return import_module(name, package=package)
    except Exception:
        # Best effort: any failure while importing an optional dependency is
        # treated as "not available". KeyboardInterrupt/SystemExit propagate.
        return None
|
16
|
+
|
17
|
+
|
18
|
+
def lazy_import(importer_name, to_import):
    """Return the importing module and a callable for lazy importing.

    Adapted from an example found online (original author unknown); this
    function is currently not used by the package.

    The module named by importer_name represents the module performing the
    import to help facilitate resolving relative imports.

    to_import is an iterable of the modules to be potentially imported (absolute
    or relative). The `as` form of importing is also supported,
    e.g. `pkg.mod as spam`.

    This function returns a tuple of two items. The first is the importer
    module for easy reference within itself. The second item is a callable to be
    set to `__getattr__`.
    """
    module = importlib.import_module(importer_name)
    # Maps the attribute name exposed on the importer to the module to import.
    import_mapping = {}
    for name in to_import:
        importing, _, binding = name.partition(' as ')
        if not binding:
            # No alias given: bind under the last dotted component.
            _, _, binding = importing.rpartition('.')
        import_mapping[binding] = importing

    def __getattr__(name):
        if name not in import_mapping:
            message = f'module {importer_name!r} has no attribute {name!r}'
            raise AttributeError(message)
        importing = import_mapping[name]
        # importlib.import_module() implicitly sets submodules on this module as
        # appropriate for direct imports.
        imported = importlib.import_module(importing,
                                           module.__spec__.parent)
        # Cache on the importer so later lookups bypass __getattr__.
        setattr(module, name, imported)
        return imported

    return module, __getattr__
|
58
|
+
|
59
|
+
|
60
|
+
# Optional third-party dependencies. Each name is bound to whatever
# try_import returns, which is None when the package cannot be imported.
# jieba, seaborn, torch, torch.nn, transformers (BertTokenizer,
# BertForMaskedLM), elasticsearch, pandas and xgboost are currently
# commented out and therefore not imported by this module.

aioredis = try_import("aioredis", None)
pymysql = try_import("pymysql", None)
zhconv = try_import("zhconv", None)
KafkaProducer = try_import("kafka", "KafkaProducer")
KafkaConsumer = try_import("kafka", "KafkaConsumer")
np = try_import("numpy", None)
plt = try_import("matplotlib", "pyplot")
WordNetLemmatizer = try_import("nltk.stem", "WordNetLemmatizer")
metrics = try_import("sklearn", "metrics")
requests = try_import("requests", None)
pq = try_import("pyquery", None)
CountVectorizer = try_import("sklearn.feature_extraction.text", "CountVectorizer")
precision_recall_fscore_support = try_import("sklearn.metrics", "precision_recall_fscore_support")
tqdm = try_import("tqdm", "tqdm")
# TODO: auto-exporting langid and win32evtlogutil produces buggy output.
langid = try_import("langid", None)
win32evtlogutil = try_import("win32evtlogutil", None)
TfidfTransformer = try_import("sklearn.feature_extraction.text", "TfidfTransformer")
yaml = try_import("yaml", None)
omegaconf = try_import("omegaconf", None)
|
@@ -0,0 +1,94 @@
|
|
1
|
+
#!/usr/bin/python3.8
|
2
|
+
# -*- coding: utf-8 -*-
|
3
|
+
# @Author : youshu.Ji
|
4
|
+
import importlib
|
5
|
+
from importlib import import_module
|
6
|
+
from importlib.util import LazyLoader
|
7
|
+
from .lazy import lazy_module
|
8
|
+
|
9
|
+
EXCLUDE_LAZYIMPORT = {"torch", "torch.nn", "numpy"}
|
10
|
+
|
11
|
+
|
12
|
+
def try_import(name, package):
    """
    Return a (mostly lazy) handle for an optional dependency, or ``None``.

    :param name: module name, e.g. ``"matplotlib"``.
    :param package: optional member/submodule of ``name``, e.g. ``"pyplot"``.
        When given, the dotted path ``"<name>.<package>"`` is handed to
        ``lazy_module``.
        NOTE(review): the previous code built ``"<package>.<name>"``
        (``"pyplot.matplotlib"``), which can never resolve; confirm that
        ``lazy_module`` accepts the corrected dotted path for class names too.
    :return: the result of ``lazy_module``; the eagerly imported module when
        ``name`` is listed in ``EXCLUDE_LAZYIMPORT``; ``None`` if anything raised.
    """
    try:
        if package:
            return lazy_module("{}.{}".format(name, package))
        if name in EXCLUDE_LAZYIMPORT:
            # These modules are imported eagerly instead of lazily.
            return import_module(name, package=package)
        return lazy_module(name)
    except Exception:
        # Best effort: report and fall back to None.
        print("import {} failed".format(name))
        return None
|
27
|
+
|
28
|
+
|
29
|
+
def lazy_import(importer_name, to_import):
    """Build a lazy-import hook for the module named ``importer_name``.

    ``importer_name`` is imported immediately; it is the module on whose behalf
    the imports happen, and its ``__spec__.parent`` anchors relative names.

    ``to_import`` is an iterable of module specs, absolute or relative, each
    optionally using the ``pkg.mod as spam`` form. Without an alias the spec is
    bound under its last dotted component.

    Returns ``(module, getter)``: the importer module and a callable meant to
    be installed as its ``__getattr__``. The callable imports the requested
    module on first access and stores it on the importer module.
    """
    importer = importlib.import_module(importer_name)

    def _binding_for(spec):
        # Split "target as alias"; fall back to the final dotted component.
        target, _, alias = spec.partition(' as ')
        return (alias or target.rpartition('.')[2]), target

    targets = dict(_binding_for(spec) for spec in to_import)

    def __getattr__(name):
        if name in targets:
            # importlib.import_module() implicitly sets submodules on the
            # importer as appropriate for direct imports.
            loaded = importlib.import_module(targets[name], importer.__spec__.parent)
            setattr(importer, name, loaded)
            return loaded
        raise AttributeError(f'module {importer_name!r} has no attribute {name!r}')

    return importer, __getattr__
|
64
|
+
|
65
|
+
|
66
|
+
# Optional third-party dependencies, resolved through try_import (defined
# above); a name is None when try_import could not provide it.

# Storage / messaging clients.
aioredis = try_import("aioredis", None)
happybase = try_import("happybase", None)
pd = try_import("pandas", None)
pymysql = try_import("pymysql", None)
Elasticsearch = try_import("elasticsearch", "Elasticsearch")
KafkaProducer = try_import("kafka", "KafkaProducer")
MongoClient = try_import("pymongo", "MongoClient")
helpers = try_import("elasticsearch", "helpers")
KafkaConsumer = try_import("kafka", "KafkaConsumer")

# Numeric, plotting and ML libraries.
np = try_import("numpy", None)
sns = try_import("seaborn", None)
torch = try_import("torch", None)
nn = try_import("torch.nn", None)
xgb = try_import("xgboost", None)
plt = try_import("matplotlib", "pyplot")
WordNetLemmatizer = try_import("nltk.stem", "WordNetLemmatizer")
metrics = try_import("sklearn", "metrics")
BertTokenizer = try_import("transformers", "BertTokenizer")
BertForMaskedLM = try_import("transformers", "BertForMaskedLM")

# Miscellaneous utilities.
requests = try_import("requests", None)
psutil = try_import("psutil", None)
pq = try_import("pyquery", None)
CountVectorizer = try_import("sklearn.feature_extraction.text", "CountVectorizer")
precision_recall_fscore_support = try_import("sklearn.metrics", "precision_recall_fscore_support")
tqdm = try_import("tqdm", "tqdm")
langid = try_import("langid", None)
# win32evtlogutil is not imported here (left open as a question in the original).
TfidfTransformer = try_import("sklearn.feature_extraction.text", "TfidfTransformer")
yaml = try_import("yaml", None)
|
@@ -0,0 +1,117 @@
|
|
1
|
+
# !/usr/bin/python3.8
|
2
|
+
# -*- coding: utf-8 -*-
|
3
|
+
# @Author : youshu.Ji
|
4
|
+
import importlib
|
5
|
+
from importlib import import_module
|
6
|
+
import os
|
7
|
+
|
8
|
+
|
9
|
+
def try_import(name, package):
    """
    Import ``name`` and return the module, or ``None`` when the import fails.

    :param name: module name passed to ``importlib.import_module``.
    :param package: forwarded as ``import_module``'s ``package`` argument (the
        anchor for *relative* names). importlib ignores it for absolute names,
        so the module itself is returned rather than an attribute of it.
        NOTE(review): ``import_dict`` in this file pairs module names with
        class names, which suggests ``from name import package`` was intended
        -- confirm.
    :return: the imported module, or ``None`` if importing raised.
    """
    try:
        return import_module(name, package=package)
    except Exception:
        # Best effort: an unavailable optional dependency yields None.
        # KeyboardInterrupt/SystemExit are no longer swallowed.
        return None
|
17
|
+
|
18
|
+
|
19
|
+
# Every optional dependency starts out as None so the names always exist,
# whichever branch below runs.
aioredis = None
happybase = None
pd = None
pymysql = None
Elasticsearch = None
KafkaProducer = None
MongoClient = None
helpers = None
KafkaConsumer = None
np = None
sns = None
torch = None
nn = None
xgb = None
plt = None
WordNetLemmatizer = None
metrics = None
BertTokenizer = None
BertForMaskedLM = None
requests = None
psutil = None
pq = None
CountVectorizer = None
precision_recall_fscore_support = None
tqdm = None
langid = None
win32evtlogutil = None
TfidfTransformer = None
yaml = None

# Exported alias -> (first, second) argument for try_import.
import_dict = {
    "aioredis": ("aioredis", None),
    "happybase": ("happybase", None),
    "pd": ("pandas", None),
    "pymysql": ("pymysql", None),
    "Elasticsearch": ("elasticsearch", "Elasticsearch"),
    "KafkaProducer": ("kafka", "KafkaProducer"),
    "MongoClient": ("pymongo", "MongoClient"),
    "helpers": ("elasticsearch", "helpers"),
    "KafkaConsumer": ("kafka", "KafkaConsumer"),
    "np": ("numpy", None),
    "sns": ("seaborn", None),
    "torch": ("torch", None),
    "nn": ("torch.nn", None),
    "xgb": ("xgboost", None),
    "plt": ("matplotlib", "pyplot"),
    "WordNetLemmatizer": ("nltk.stem", "WordNetLemmatizer"),
    "metrics": ("sklearn", "metrics"),
    "BertTokenizer": ("transformers", "BertTokenizer"),
    "BertForMaskedLM": ("transformers", "BertForMaskedLM"),
    "requests": ("requests", None),
    "psutil": ("psutil", None),
    "pq": ("pyquery", None),
    "CountVectorizer": ("sklearn.feature_extraction.text", "CountVectorizer"),
    "precision_recall_fscore_support": ("sklearn.metrics", "precision_recall_fscore_support"),
    "tqdm": ("tqdm", "tqdm"),
    "langid": ("langid", None),
    "win32evtlogutil": ("win32evtlogutil", None),
    "TfidfTransformer": ("sklearn.feature_extraction.text", "TfidfTransformer"),
    "yaml": ("yaml", None),
}
if "nlpertools_helper" in os.environ:
    # TODO: this code path has not been tested.
    # Environment values are strings; iterating one directly yields single
    # characters, which made every import_dict lookup raise KeyError.
    # NOTE(review): the variable's format is not defined in this file; a
    # comma- and/or whitespace-separated list of aliases is assumed -- confirm
    # against the code that sets it.
    import_list = os.environ["nlpertools_helper"].replace(",", " ").split()

    for k in import_list:
        if k not in import_dict:
            # Unknown alias: leave it alone rather than fail the whole import.
            continue
        name, package = import_dict[k]
        globals()[k] = try_import(name, package)
else:
    # No selection given: try every known dependency, in import_dict order
    # (identical to the explicit assignment list this loop replaces).
    # TODO: auto-exporting langid and win32evtlogutil produces buggy output.
    for k, (name, package) in import_dict.items():
        globals()[k] = try_import(name, package)
|
@@ -0,0 +1,93 @@
|
|
1
|
+
import os
|
2
|
+
import shutil
|
3
|
+
from importlib import import_module
|
4
|
+
|
5
|
+
from .io.dir import j_mkdir
|
6
|
+
from .io.file import readtxt_list_all_strip, writetxt_w_list
|
7
|
+
|
8
|
+
|
9
|
+
def try_import(name, package):
    """
    Import ``name`` and return the module; print a notice and return ``None``
    when the import fails.

    :param name: module name passed to ``importlib.import_module``.
    :param package: forwarded as ``import_module``'s ``package`` argument (the
        anchor for relative names); importlib ignores it for absolute names.
    :return: the imported module, or ``None`` if importing raised.
    """
    try:
        return import_module(name, package=package)
    except Exception:
        # Best effort: report the failure and carry on without the module.
        print("import {} failed".format(name))
        return None
|
16
|
+
|
17
|
+
|
18
|
+
def convert_import_to_try_import(from_path, to_path):
    """
    Copy the tree under ``from_path`` to ``to_path``, rewriting the first
    failing import block of each eligible ``.py`` file into ``try_import`` calls.

    For each ``.py`` file not in the exclusion list, the first 30 lines are
    scanned. The block to rewrite starts at the first ``from``/``import`` line
    (leading ``#`` and spaces ignored) whose execution raises, and ends at the
    next blank line. If no such block is found the file is written unchanged.
    All other files are copied verbatim.

    WARNING: candidate import lines are run with ``exec``; only use this on
    trusted source trees.

    :param from_path: root directory of the source tree.
    :param to_path: root directory of the converted copy (created if needed).
    """
    excluded_file = {"wrapper.py", "kmp.py", "__init__.py", "utils_for_nlpertools.py"}
    j_mkdir(to_path)
    for root, dirs, files in os.walk(from_path):
        # Map by relative path; str.replace(from_path, to_path) would rewrite
        # every occurrence of from_path inside root, not just the prefix.
        target_root = os.path.normpath(os.path.join(to_path, os.path.relpath(root, from_path)))
        for sub_dir in dirs:
            j_mkdir(os.path.join(target_root, sub_dir))
        for file in files:
            src = os.path.join(root, file)
            dst = os.path.join(target_root, file)
            if not file.endswith(".py") or file in excluded_file:
                shutil.copy(src=src, dst=dst)
                continue

            raw_code = readtxt_list_all_strip(src)
            # None means "not found"; 0 is a valid line index.
            start_idx, end_idx = None, None
            for idx, each_line in enumerate(raw_code[:30]):
                each_line = each_line.lstrip("# ")
                if start_idx is None and each_line.startswith(("from", "import")):
                    try:
                        exec(each_line)
                    except Exception:
                        # First import that cannot be executed here.
                        start_idx = idx
                if start_idx is not None and not each_line:
                    end_idx = idx
                    break

            if start_idx is not None and end_idx is not None:
                new_code = raw_code[:start_idx] + convert_import_string_to_import_list(
                    "\n".join(raw_code[start_idx:end_idx])) + raw_code[end_idx:]
            else:
                new_code = raw_code
            writetxt_w_list(new_code, dst)
    print("convert over")
|
51
|
+
|
52
|
+
|
53
|
+
def get_import_info(text):
    """Placeholder that is not implemented yet: ignores ``text`` and returns ``None``."""
    return None
|
55
|
+
|
56
|
+
|
57
|
+
def convert_import_string_to_import_list(text):
    """
    Convert ``import`` statements into equivalent ``try_import`` assignments.

    Each line of ``text`` of the form ``import a [as x][, b ...]`` or
    ``from m import a [as x][, b ...]`` yields one
    ``alias = try_import("module", "member" or None)`` line per imported name.
    Lines without an ``import`` keyword (including blank lines) are skipped.
    Every processed line and every generated line is printed.

    :param text: newline-separated import statements (single-line form only).
    :return: list of source lines: a blank line, the ``try_import`` import
        line, a blank line, then one assignment per imported name.
    """
    models_to_import = []
    # Work queue: the extra names of a multi-name import are re-queued as
    # their own single-name statements (appending while iterating is intended).
    import_list = text.split("\n")
    for each in import_list:
        print(each)
        # split() without arguments tolerates indentation and repeated spaces.
        tokens = each.split()
        if "import" not in tokens:
            continue
        keyword_idx = tokens.index("import")
        head = tokens[:keyword_idx]
        clauses = [clause.strip() for clause in " ".join(tokens[keyword_idx + 1:]).split(",")]
        clauses = [clause for clause in clauses if clause]
        if not clauses:
            continue

        # e.g. from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
        # Re-queue every name after the first with the same prefix. The prefix
        # comes from the tokens before the keyword, so a module whose name
        # merely contains "import" (importlib) is handled.
        prefix = "".join("{} ".format(token) for token in head)
        for extra in clauses[1:]:
            import_list.append("{}import {}".format(prefix, extra))

        # To treat "from ... import" and plain "import" uniformly, name and
        # package are held the other way round first and swapped below.
        name = None
        if "from" in head and head.index("from") + 1 < len(head):
            name = head[head.index("from") + 1]
        package, _, as_name = clauses[0].partition(" as ")
        package, as_name = package.strip(), as_name.strip()
        if not as_name:
            as_name = package.split(".")[-1]
        if not name:
            name, package = package, name
        models_to_import.append((name, package, as_name))

    # Render the collected imports.
    all_import_info = ["", "from utils_for_nlpertools import try_import", ""]
    for name, package, as_name in models_to_import:
        import_info = '{} = try_import("{}", {})'.format(as_name, name, '"{}"'.format(package) if package else package)
        all_import_info.append(import_info)
        print(import_info)
    return all_import_info
|
@@ -0,0 +1,108 @@
|
|
1
|
+
#!/usr/bin/python3.8
|
2
|
+
# -*- coding: utf-8 -*-
|
3
|
+
# @Author : youshu.Ji
|
4
|
+
import math
|
5
|
+
|
6
|
+
import faiss
|
7
|
+
import gensim
|
8
|
+
import numpy as np
|
9
|
+
import pandas as pd
|
10
|
+
|
11
|
+
|
12
|
+
def build_index_use(vectors, nlist=100):
    """
    Build, train and fill a faiss ``IndexIVFFlat`` index using the L2 metric.

    :param vectors: 2-D collection of vectors; converted to a contiguous
        float32 array, the dtype faiss expects (same conversion as
        ``build_index``). Presumably shaped (n_vectors, dim) -- the dimension
        is taken from the first row.
    :param nlist: ``nlist`` argument passed to ``faiss.IndexIVFFlat``
        (defaults to the previously hard-coded 100).
    :return: the trained index with all vectors added.
    """
    vectors = np.ascontiguousarray(vectors, dtype=np.float32)
    d = len(vectors[0])
    quantizer = faiss.IndexFlatL2(d)
    index = faiss.IndexIVFFlat(quantizer, d, nlist, faiss.METRIC_L2)
    index.train(vectors)
    index.add(vectors)
    return index
|
20
|
+
|
21
|
+
|
22
|
+
def build_index(vectors, distances="L2", nprobe=10):
|
23
|
+
""" 建立 faiss 索引.
|
24
|
+
|
25
|
+
Args:
|
26
|
+
vectors(numpy.array): 向量矩阵,shape=(向量数, 向量维度)
|
27
|
+
distance(str): 度量距离,支持 L2、COS 和 INNER_PRODUCT.
|
28
|
+
nprobe(int): 向量搜索时需要搜索的聚类数.
|
29
|
+
|
30
|
+
Return: 返回 faiss 索引对象.
|
31
|
+
|
32
|
+
"""
|
33
|
+
metric_type = None
|
34
|
+
if distances == "L2":
|
35
|
+
metric_type = faiss.METRIC_L2
|
36
|
+
elif distances in ("COS", "INNER_PRODUCT"):
|
37
|
+
metric_type = faiss.METRIC_INNER_PRODUCT
|
38
|
+
else:
|
39
|
+
raise NotImplementedError
|
40
|
+
|
41
|
+
index_pipes = []
|
42
|
+
|
43
|
+
if distances == "COS":
|
44
|
+
index_pipes.append("L2norm")
|
45
|
+
|
46
|
+
K = 4 * math.sqrt(vectors.shape[0])
|
47
|
+
use_ivf = False
|
48
|
+
if vectors.shape[0] >= 30 * K:
|
49
|
+
index_pipes.append(f"IVF{K}")
|
50
|
+
use_ivf = True
|
51
|
+
|
52
|
+
index_pipes.append("Flat")
|
53
|
+
|
54
|
+
index = faiss.index_factory(vectors.shape[1], ",".join(index_pipes),
|
55
|
+
metric_type)
|
56
|
+
|
57
|
+
vectors = vectors.astype(np.float32)
|
58
|
+
if not index.is_trained:
|
59
|
+
index.train(vectors)
|
60
|
+
|
61
|
+
index.add(vectors)
|
62
|
+
|
63
|
+
# IVF 使用 reconstruct 时必须执行此函数
|
64
|
+
if use_ivf:
|
65
|
+
ivf_index = faiss.extract_index_ivf(index)
|
66
|
+
ivf_index.make_direct_map()
|
67
|
+
ivf_index.nprobe = nprobe
|
68
|
+
|
69
|
+
return index
|
70
|
+
|
71
|
+
|
72
|
+
def read_index_from_file(filename):
    """ Load a faiss index object from the file ``filename`` and return it. """
    loaded_index = faiss.read_index(filename)
    return loaded_index
|
75
|
+
|
76
|
+
|
77
|
+
def write_index_to_file(index, filename):
    """ Write the faiss index object ``index`` to the file ``filename``. """
    faiss.write_index(index, filename)
    return None
|
80
|
+
|
81
|
+
|
82
|
+
# Demo: build a faiss index and an hnswlib index from the same word vectors.
# NOTE: this runs at import time and writes test.index, test.csv and
# hnswlib.index into the current working directory.
word2vec_path = "glove_vector_path"
wv_from_text = gensim.models.KeyedVectors.load_word2vec_format(word2vec_path, binary=False, no_header=True)
vectors = wv_from_text.vectors

# --- faiss ---
name_example = ["{}.jpg".format((i % 9) + 1) for i in range(len(vectors))]
df = pd.DataFrame({
    "name": name_example,
    # "vector": str(vectors[0]),
    # "text": list(wv_from_text.key_to_index.keys()),
})
test_index = build_index_use(vectors)
write_index_to_file(test_index, "test.index")

df.to_csv("test.csv", index=False)

# --- hnswlib ---
import gensim
import hnswlib

# The vectors loaded above are reused; the file used to be parsed a second
# time here with identical arguments.
labels = list(range(len(vectors)))
index = hnswlib.Index(space="l2", dim=len(vectors[0]))
index.init_index(max_elements=len(vectors), ef_construction=200, M=16)
index.add_items(vectors, labels)
index.save_index("hnswlib.index")
|
nlpertools/wrapper.py
ADDED
@@ -0,0 +1,161 @@
|
|
1
|
+
#!/usr/bin/python3.8
|
2
|
+
# -*- coding: utf-8 -*-
|
3
|
+
# @Author : youshu.Ji
|
4
|
+
# 定义装饰器
|
5
|
+
import logging
|
6
|
+
import time
|
7
|
+
from functools import wraps
|
8
|
+
import asyncio
|
9
|
+
|
10
|
+
def fn_async_timer(function):
    """
    Decorator for coroutine functions: awaits the wrapped call, prints the
    elapsed wall-clock time, and returns the call's result.
    """

    @wraps(function)
    async def function_timer(*args, **kwargs):
        started = time.time()
        outcome = await function(*args, **kwargs)
        finished = time.time()
        print('[finished {func_name} in {time:.2f}s]'.format(func_name=function.__name__, time=finished - started))
        return outcome

    return function_timer
|
23
|
+
|
24
|
+
|
25
|
+
def fn_timer(async_func=False, analyse=False):
    """
    Decorator factory that reports how long the wrapped function takes.

    >>> @fn_timer()
    ... def example():
    ...     time.sleep(2)

    :param async_func: pass ``True`` (exactly) when decorating a coroutine
        function; the call is then awaited and timed. Takes precedence over
        ``analyse``.
    :param analyse: when truthy, profile the call with ``pyinstrument`` and
        print the profile instead of the one-line timing.
    :return: a decorator; the decorated function returns the wrapped
        function's result and keeps its metadata (``functools.wraps``).
    """

    def wrapper(func):
        @wraps(func)
        async def func_time_async(*args, **kwargs):
            t0 = time.time()
            result = await asyncio.create_task(func(*args, **kwargs))
            t1 = time.time()
            print('[finished {func_name} in {time:.2f}s]'.format(func_name=func.__name__, time=t1 - t0))
            return result

        @wraps(func)
        def func_time(*args, **kwargs):
            t0 = time.time()
            result = func(*args, **kwargs)
            t1 = time.time()
            print('[finished {func_name} in {time:.2f}s]'.format(func_name=func.__name__, time=t1 - t0))
            return result

        @wraps(func)
        def func_time_analyse(*args, **kwargs):
            # Imported lazily so pyinstrument is only needed with analyse=True.
            from pyinstrument import Profiler

            profiler = Profiler()
            profiler.start()
            try:
                return func(*args, **kwargs)
            finally:
                # Stop and report even if func raised, so the profiler is
                # never left running.
                profiler.stop()
                profiler.print()

        if async_func is True:
            return func_time_async
        if analyse:
            return func_time_analyse
        return func_time

    return wrapper
|
70
|
+
|
71
|
+
|
72
|
+
def fn_timeout_checker(wait_time, callback):
    """
    Decorator factory that bounds how long the wrapped function may run.

    The call runs inside ``eventlet.Timeout(wait_time, False)``. If it did not
    complete, ``callback()`` is called and its result returned instead.

    eventlet is used because gevent showed bugs here (gevent's ``patch_all``
    clashed with a threaded Flask app, per the original note).
    NOTE: ``eventlet.monkey_patch(time=True)`` is applied every time this
    factory is called.

    :param wait_time: timeout passed to ``eventlet.Timeout``.
    :param callback: zero-argument callable providing the fallback result.
    :return: a decorator.
    """
    from eventlet import Timeout
    from eventlet import monkey_patch

    monkey_patch(time=True)

    def wrapper(func):
        @wraps(func)
        def inner(*args, **kwargs):
            finish_flag = False
            # With exception=False the timeout ends the block silently.
            with Timeout(wait_time, False):
                res = func(*args, **kwargs)
                finish_flag = True
            if not finish_flag:
                res = callback()
            return res

        return inner

    return wrapper
|
100
|
+
|
101
|
+
|
102
|
+
def fn_try(parameter):
    """
    Wrap ``try ... except`` in a decorator.

    When the wrapped function raises, a copy of ``parameter`` is returned whose
    ``"msg"`` entry is the template ``parameter["msg"]`` formatted with the
    error text. The template itself is never modified, so every failure is
    reported with its own message.

    :param parameter: dict such as ``{"msg": "failed: {}", ...}``; ``"msg"``
        is a ``str.format`` template with one positional placeholder.
    :return: a decorator; the decorated function returns the wrapped
        function's result, or the filled-in copy of ``parameter`` on error.
    """

    def wrapper(function):
        @wraps(function)
        def inner(*args, **kwargs):
            try:
                return function(*args, **kwargs)
            except Exception as e:
                msg = "报错!"
                print('[func_name: {func_name} {msg}]'.format(func_name=function.__name__, msg=msg))
                # Copy first: writing into the shared dict would consume the
                # "{}" placeholder and freeze the first error message.
                failure = dict(parameter)
                failure["msg"] = parameter["msg"].format(str(e))
                return failure

        return inner

    return wrapper
|
126
|
+
|
127
|
+
|
128
|
+
def try_log(function):
    """
    Decorator that logs any ``Exception`` raised by ``function`` instead of
    propagating it.

    On failure it logs the call arguments, then the file and line where the
    exception was raised, then ``repr`` of the exception, all at ERROR level
    on the root logger, and returns ``None``.

    :param function: callable to protect.
    :return: wrapper returning ``function``'s result, or ``None`` on error.
    """

    @wraps(function)
    def inner(*args, **kwargs):
        try:
            return function(*args, **kwargs)
        except Exception as e:
            # logging.error(*args) used the call arguments as the format
            # string and raised TypeError when there were none.
            logging.error("%s failed with args=%r kwargs=%r", function.__name__, args, kwargs)
            # Walk to the innermost frame: the first traceback entry is this
            # wrapper, not the place where the error happened.
            tb = e.__traceback__
            while tb.tb_next is not None:
                tb = tb.tb_next
            logging.error(tb.tb_frame.f_globals.get("__file__", "<unknown>"))
            logging.error(tb.tb_lineno)
            logging.error(repr(e))
            return None

    return inner
|
141
|
+
|
142
|
+
|
143
|
+
def example(function):
    """
    Template decorator showing how decorators in this module are written:
    prints a fixed notice, then calls ``function`` and returns its result.
    """

    @wraps(function)
    def function_example(*args, **kwargs):
        print("此方法仅仅用于提示该方法怎么写")
        return function(*args, **kwargs)

    return function_example
|
151
|
+
|
152
|
+
|
153
|
+
def singleton(cls):
    """
    Class decorator: the first call constructs ``cls`` with the given
    arguments; every later call returns that same instance and ignores its
    arguments. The decorated name becomes a factory function, not the class.
    """
    created = {}

    def _singleton(*args, **kwargs):
        try:
            return created[cls]
        except KeyError:
            # First call: build and remember the only instance.
            created[cls] = cls(*args, **kwargs)
            return created[cls]

    return _singleton
|