texta-embedding 2.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- texta_embedding/embedding.py +174 -0
- texta_embedding/exceptions.py +7 -0
- texta_embedding/phraser.py +46 -0
- texta_embedding-2.0.0.data/data/LICENSE +674 -0
- texta_embedding-2.0.0.data/data/README.md +1 -0
- texta_embedding-2.0.0.data/data/VERSION +1 -0
- texta_embedding-2.0.0.data/data/requirements.txt +5 -0
- texta_embedding-2.0.0.dist-info/METADATA +26 -0
- texta_embedding-2.0.0.dist-info/RECORD +12 -0
- texta_embedding-2.0.0.dist-info/WHEEL +5 -0
- texta_embedding-2.0.0.dist-info/licenses/LICENSE +674 -0
- texta_embedding-2.0.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,174 @@
|
|
|
1
|
+
from gensim.models import word2vec, fasttext, KeyedVectors
|
|
2
|
+
import joblib
|
|
3
|
+
import json
|
|
4
|
+
|
|
5
|
+
from texta_tools.text_processor import TextProcessor
|
|
6
|
+
from .phraser import Phraser
|
|
7
|
+
from . import exceptions
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class Embedding:
    """
    Embedding abstraction to work with both Word2Vec & FastText.

    Subclasses (W2VEmbedding, FastTextEmbedding) choose the concrete gensim
    model class and call _train(); saving/loading, vocabulary access and
    similarity queries are shared here.
    """

    def __init__(self, description="My Embedding", workers=1, min_freq=5, num_dimensions=100, window=5, num_epochs=5,
                 text_processor=None):
        """
        :param str description: Model label attached to suggestions returned by get_similar().
        :param int workers: Number of worker threads gensim uses during training.
        :param int min_freq: Minimum corpus frequency for a token to enter the vocabulary.
        :param int num_dimensions: Dimensionality of the trained word vectors.
        :param int window: Maximum distance between the current and the predicted word.
        :param int num_epochs: Number of training epochs.
        :param text_processor: Pre-processing pipeline; when None, a default
            TextProcessor(sentences=True, remove_stop_words=True, words_as_list=True)
            is built per instance.
        """
        self.model = None
        self.phraser = None
        self.description = description
        # params
        self.workers = workers
        self.min_freq = min_freq
        self.window = window
        self.num_epochs = num_epochs
        self.num_dimensions = num_dimensions
        # BUG FIX: the default TextProcessor used to be created once at class
        # definition time and shared mutably by every Embedding instance
        # (_train() assigns .input_texts and .phraser on it, leaking state
        # between instances). Build it lazily, per instance, instead.
        if text_processor is None:
            text_processor = TextProcessor(sentences=True, remove_stop_words=True, words_as_list=True)
        self.text_processor = text_processor

    def _train(self, texts, selected_model, use_phraser):
        """
        Trains the embedding.
        This needs to be called out in train() method of the model implementation (e.g. W2V).

        :param texts: List of texts or an iterator (e.g. Elasticsearcher).
        :param selected_model: gensim model class (word2vec.Word2Vec or fasttext.FastText).
        :param bool use_phraser: Whether to train a Phraser and apply it during processing.
        :raise exceptions.InvalidInputError: If no training texts are provided.
        :return bool: True on success.
        """
        if not texts:
            raise exceptions.InvalidInputError("No training texts provided.")
        # add texts to text processor so we can use it as an iterator
        self.text_processor.input_texts = texts
        # build phraser if asked (idiomatic truthiness check instead of `== True`)
        if use_phraser:
            phraser = Phraser()
            phraser.train(self.text_processor)
            # set phraser and propagate it into the text processor so the
            # model trains on phrased tokens
            self.phraser = phraser
            self.text_processor.phraser = phraser

        # train the actual gensim model (Word2Vec or FastText)
        self.model = selected_model(
            self.text_processor,
            min_count=self.min_freq,
            vector_size=self.num_dimensions,
            epochs=int(self.num_epochs),
            window=self.window,
            workers=self.workers
        )
        return True

    def save(self, file_path):
        """
        Saves embedding with phraser to disk (joblib/pickle format).

        :param str file_path: Destination path on disk.
        :return bool: True on success.
        """
        to_dump = {"phraser": self.phraser, "embedding": self.model}
        joblib.dump(to_dump, file_path)
        return True

    def load(self, file_path):
        """
        Loads embedding with phraser from disk.

        NOTE(review): joblib.load unpickles arbitrary objects — only load
        files from trusted sources.

        :param str file_path: Path of a file produced by save().
        :return bool: True on success.
        """
        to_load = joblib.load(file_path)
        self.model = to_load["embedding"]
        self.phraser = to_load["phraser"]
        return True

    def load_django(self, embedding_object):
        """
        Loads embedding from a Django model object exposing
        `.embedding_model.path` and `.description`.

        :return bool: True on success.
        """
        file_path = embedding_object.embedding_model.path
        to_load = joblib.load(file_path)
        self.model = to_load["embedding"]
        self.phraser = to_load["phraser"]
        self.description = embedding_object.description
        return True

    def get_vector(self, word):
        """
        Returns the vector for the given embedding entry.

        :raise exceptions.OutOfVocabError: If the word or phrase is unknown.
        """
        if word not in self.get_vocabulary():
            raise exceptions.OutOfVocabError("Word or phrase not in vocabulary.")
        # plain subscription instead of calling __getitem__ directly
        return self.model.wv[word]

    def get_vectors(self):
        """
        Returns all vectors as a matrix.
        """
        return self.model.wv.vectors

    def get_vocabulary(self):
        """
        Returns embedding vocabulary (token -> index dict) from KeyedVectors.
        """
        try:
            return self.model.wv.key_to_index
        except AttributeError:
            # Handle older Gensim3 models.
            return {token: token_index for token_index, token in enumerate(self.model.wv.index2word)}

    def get_similarity(self, s1: str, s2: str):
        """
        Get similarity of the input words / phrases.
        Spaces are replaced with underscores to match phrased vocabulary entries.
        """
        s1 = self._format_str(s1)
        s2 = self._format_str(s2)
        return self.model.wv.similarity(s1, s2)

    @staticmethod
    def _format_str(string):
        # Vocabulary entries for phrases use underscores instead of spaces.
        return str(string).replace(' ', '_')

    def get_similar(self, positives_used, negatives_used=None, positives_unused=None, negatives_unused=None, n=20):
        """
        Find up to n words & phrases similar to the positive examples and
        dissimilar to the negative ones.

        :param list positives_used: Positive seed words/phrases.
        :param list negatives_used: Negative seed words/phrases (also excluded from output).
        :param list positives_unused: Words/phrases to exclude from the output.
        :param list negatives_unused: Words/phrases to exclude from the output.
        :param int n: Maximum number of suggestions to return.
        :raise exceptions.InvalidInputError: If any input is not a list.
        :return list: Dicts with keys "phrase", "score" and "model".
        """
        # BUG FIX: defaults were mutable (`=list()`), shared across calls.
        negatives_used = [] if negatives_used is None else negatives_used
        positives_unused = [] if positives_unused is None else positives_unused
        negatives_unused = [] if negatives_unused is None else negatives_unused
        # check if all inputs are lists
        for input_list in (positives_used, negatives_used, positives_unused, negatives_unused):
            if not isinstance(input_list, list):
                raise exceptions.InvalidInputError("Input must be list!")
        vocab = self.get_vocabulary()
        # reformat & filter out words not present in the embedding vocabulary
        positives = [self._format_str(positive) for positive in positives_used if self._format_str(positive) in vocab]
        negatives = [self._format_str(negative) for negative in negatives_used if self._format_str(negative) in vocab]
        # set of the tokens we don't want to see in the output
        not_suggestions = {
            self._format_str(token)
            for token in negatives_used + positives_unused + negatives_unused
            if self._format_str(token) in vocab
        }
        if not positives:
            return []
        # over-request so that excluded tokens don't shrink the result below n
        similar_items = self.model.wv.most_similar(positive=positives, negative=negatives, topn=n + len(not_suggestions))
        results = []
        for token, score in similar_items:
            if token in not_suggestions:
                continue
            results.append({'phrase': token.replace('_', ' '), 'score': score, 'model': self.description})
            # BUG FIX: the original broke on the raw enumeration index
            # (`if i == n`), which counted skipped tokens too — returning
            # n+1 items when nothing was filtered, and fewer than n items
            # when excluded tokens appeared early in the candidates.
            if len(results) == n:
                break
        return results
|
|
151
|
+
|
|
152
|
+
|
|
153
|
+
class FastTextEmbedding(Embedding):
    """
    Concrete, usable embedding backed by gensim's FastText implementation.
    """

    def train(self, texts, use_phraser=True):
        """
        Trains a FastText embedding.

        :param texts: List of texts or an iterator (e.g. Elasticsearcher).
        :param bool use_phraser: Whether to train and apply a Phraser first.
        :return bool: True on success.
        """
        backend = fasttext.FastText
        return self._train(texts, backend, use_phraser=use_phraser)
|
|
163
|
+
|
|
164
|
+
|
|
165
|
+
class W2VEmbedding(Embedding):
    """
    Concrete, usable embedding backed by gensim's Word2Vec implementation.
    """

    def train(self, texts, use_phraser=True):
        """
        Trains a Word2Vec embedding.

        :param texts: List of texts or an iterator (e.g. Elasticsearcher).
        :param bool use_phraser: Whether to train and apply a Phraser first.
        :return bool: True on success.
        """
        backend = word2vec.Word2Vec
        return self._train(texts, backend, use_phraser=use_phraser)
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
from gensim.models.phrases import Phraser as GSPhraser
|
|
2
|
+
from gensim.models.phrases import Phrases
|
|
3
|
+
import logging
|
|
4
|
+
|
|
5
|
+
# Configure root-logger output used by Phraser._phrase error reports.
# NOTE(review): calling logging.basicConfig at import time from a library
# module mutates the application's root logger; a module-level
# logging.getLogger(__name__) would be less intrusive — confirm no caller
# relies on this exact format before changing it.
logging.basicConfig(
    format='%(levelname)s %(asctime)s: %(message)s',
    datefmt='%d.%m.%Y %H:%M:%S',
    level=logging.ERROR
)
|
|
10
|
+
|
|
11
|
+
class Phraser:
    """
    Thin wrapper around gensim's phrase detection.

    After train(), detected collocations in the input are joined into
    single underscore-separated tokens.
    """

    def __init__(self):
        # Underlying gensim phraser; stays None until train() is called.
        self._phraser = None

    def train(self, sentences):
        """
        Fits the phrase model on an iterable of tokenised sentences.
        """
        self._phraser = GSPhraser(Phrases(sentences))

    def _phrase(self, tokens):
        # Apply the trained phraser. Older/incompatible models raise
        # AttributeError; in that case phrasing is skipped and logged.
        try:
            return self._phraser[tokens]
        except AttributeError as e:
            logging.error(f"Phrasing failed. Skipping. Error Message: {e}")
            return tokens

    def phrase(self, text):
        """
        Phrases the input text.

        Accepts either a whitespace-joined string or a token list; strings
        come back as strings, token lists as token lists. An untrained
        phraser returns the input unchanged.
        """
        if not self._phraser:
            return text
        if isinstance(text, str):
            return ' '.join(self._phrase(text.split(' ')))
        return self._phrase(text)
|