texta-embedding 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,174 @@
1
+ from gensim.models import word2vec, fasttext, KeyedVectors
2
+ import joblib
3
+ import json
4
+
5
+ from texta_tools.text_processor import TextProcessor
6
+ from .phraser import Phraser
7
+ from . import exceptions
8
+
9
+
10
class Embedding:
    """
    Embedding abstraction working with both Word2Vec & FastText.

    Concrete subclasses (W2VEmbedding, FastTextEmbedding) supply the gensim
    model class through _train().
    """

    def __init__(self, description="My Embedding", workers=1, min_freq=5, num_dimensions=100, window=5, num_epochs=5,
                 text_processor=None):
        """
        :param str description: Human-readable model name attached to suggestions.
        :param int workers: Number of worker threads used for training.
        :param int min_freq: Minimum corpus frequency for a token to be kept.
        :param int num_dimensions: Dimensionality of the trained vectors.
        :param int window: Context window size.
        :param int num_epochs: Number of training epochs.
        :param text_processor: TextProcessor used to iterate & clean input texts.
            Defaults to a fresh instance per Embedding; the previous shared
            default instance leaked state (input_texts, phraser) across
            Embedding objects because _train() mutates it.
        """
        self.model = None
        self.phraser = None
        self.description = description
        # training params
        self.workers = workers
        self.min_freq = min_freq
        self.window = window
        self.num_epochs = num_epochs
        self.num_dimensions = num_dimensions
        # Build a per-instance default processor instead of sharing one
        # mutable instance across all Embedding objects.
        if text_processor is None:
            text_processor = TextProcessor(sentences=True, remove_stop_words=True, words_as_list=True)
        self.text_processor = text_processor

    def _train(self, texts, selected_model, use_phraser):
        """
        Trains the embedding.
        This needs to be called out in train() method of the model implementation (e.g. W2V).

        :param texts: List of texts or an iterator (e.g. Elasticsearcher).
        :param selected_model: gensim model class to instantiate (e.g. word2vec.Word2Vec).
        :param bool use_phraser: Whether to train a Phraser and apply it while iterating.
        :raises exceptions.InvalidInputError: If no training texts are provided.
        :return: True on success.
        """
        if not texts:
            raise exceptions.InvalidInputError("No training texts provided.")
        # Add texts to text processor so we can use it as an iterator.
        self.text_processor.input_texts = texts
        # Build phraser if asked.
        if use_phraser:
            phraser = Phraser()
            phraser.train(self.text_processor)
            self.phraser = phraser
            # Route further iteration over the texts through the phraser too.
            self.text_processor.phraser = phraser
        # Train the actual embedding model (gensim 4.x keyword names).
        self.model = selected_model(
            self.text_processor,
            min_count=self.min_freq,
            vector_size=self.num_dimensions,
            epochs=int(self.num_epochs),
            window=self.window,
            workers=self.workers
        )
        return True

    def save(self, file_path):
        """
        Saves embedding with phraser to disk.

        :param str file_path: Destination path of the joblib dump.
        :return: True on success.
        """
        to_dump = {"phraser": self.phraser, "embedding": self.model}
        joblib.dump(to_dump, file_path)
        return True

    def load(self, file_path):
        """
        Loads embedding with phraser from disk.

        :param str file_path: Path to a dump produced by save().
        :return: True on success.
        """
        to_load = joblib.load(file_path)
        self.model = to_load["embedding"]
        self.phraser = to_load["phraser"]
        return True

    def load_django(self, embedding_object):
        """
        Loads embedding in Django from a model object whose embedding_model
        file field points at a save() dump.

        :return: True on success.
        """
        file_path = embedding_object.embedding_model.path
        to_load = joblib.load(file_path)
        self.model = to_load["embedding"]
        self.phraser = to_load["phraser"]
        self.description = embedding_object.description
        return True

    def get_vector(self, word):
        """
        Returns the vector for the given embedding entry.

        :raises exceptions.OutOfVocabError: If the word/phrase is unknown.
        """
        if word not in self.get_vocabulary():
            raise exceptions.OutOfVocabError("Word or phrase not in vocabulary.")
        return self.model.wv[word]

    def get_vectors(self):
        """
        Returns all vectors as a matrix.
        """
        return self.model.wv.vectors

    def get_vocabulary(self):
        """
        Returns the embedding vocabulary (dict token -> index) from KeyedVectors.
        """
        try:
            return self.model.wv.key_to_index
        except AttributeError:
            # Handle older Gensim 3 models (index2word list instead of dict).
            return {token: token_index for token_index, token in enumerate(self.model.wv.index2word)}

    def get_similarity(self, s1: str, s2: str):
        """
        Returns cosine similarity of the two input words / phrases.
        """
        return self.model.wv.similarity(self._format_str(s1), self._format_str(s2))

    @staticmethod
    def _format_str(string):
        # Phrases are stored with underscores instead of spaces.
        return str(string).replace(' ', '_')

    def get_similar(self, positives_used, negatives_used=None, positives_unused=None, negatives_unused=None, n=20):
        """
        Finds similar words & phrases for the input lists of strings.

        :param list positives_used: Tokens to steer the query towards.
        :param list negatives_used: Tokens to steer the query away from.
        :param list positives_unused: Tokens excluded from the query but also
            filtered out of the suggestions.
        :param list negatives_unused: Like positives_unused, for negatives.
        :param int n: Maximum number of suggestions to return.
        :raises exceptions.InvalidInputError: If any input is not a list.
        :return: List of {"phrase", "score", "model"} dicts (at most n items).
        """
        # Previously these defaults were shared mutable list() instances.
        negatives_used = [] if negatives_used is None else negatives_used
        positives_unused = [] if positives_unused is None else positives_unused
        negatives_unused = [] if negatives_unused is None else negatives_unused
        # Check if all inputs are lists.
        for input_list in (positives_used, negatives_used, positives_unused, negatives_unused):
            if not isinstance(input_list, list):
                raise exceptions.InvalidInputError("Input must be list!")
        vocab = self.get_vocabulary()
        # Filter out words not present in the embedding vocabulary.
        positives = [self._format_str(positive) for positive in positives_used if self._format_str(positive) in vocab]
        negatives = [self._format_str(negative) for negative in negatives_used if self._format_str(negative) in vocab]
        # Set of the tokens we don't want to see in the output.
        neg_pos_combined = negatives_used + positives_unused + negatives_unused
        not_suggestions = {self._format_str(t) for t in neg_pos_combined if self._format_str(t) in vocab}
        if not positives:
            return []
        # Over-fetch so that filtered-out tokens do not reduce the result count.
        similar_items = self.model.wv.most_similar(positive=positives, negative=negatives, topn=n + len(not_suggestions))
        results = []
        for token, score in similar_items:
            if token in not_suggestions:
                continue
            results.append({'phrase': token.replace('_', ' '), 'score': score, 'model': self.description})
            # Stop once n suggestions are collected (the old index-based
            # break could return n + 1 items).
            if len(results) >= n:
                break
        return results
151
+
152
+
153
class FastTextEmbedding(Embedding):
    """
    Concrete embedding backed by gensim's FastText implementation —
    this is an actual embedding class you want to use.
    """

    def train(self, texts, use_phraser=True):
        """
        Trains a FastText embedding.

        :param texts: List of texts or an iterator (e.g. Elasticsearcher).
        :param bool use_phraser: Train & apply a Phraser before embedding.
        :return: True on success.
        """
        model_class = fasttext.FastText
        return self._train(texts, model_class, use_phraser=use_phraser)
163
+
164
+
165
class W2VEmbedding(Embedding):
    """
    Concrete embedding backed by gensim's Word2Vec implementation —
    this is an actual embedding class you want to use.
    """

    def train(self, texts, use_phraser=True):
        """
        Trains a Word2Vec embedding.

        :param texts: List of texts or an iterator (e.g. Elasticsearcher).
        :param bool use_phraser: Train & apply a Phraser before embedding.
        :return: True on success.
        """
        model_class = word2vec.Word2Vec
        return self._train(texts, model_class, use_phraser=use_phraser)
@@ -0,0 +1,7 @@
1
class InvalidInputError(Exception):
    """Raised when invalid input is handed to the trainers."""
4
+
5
class OutOfVocabError(Exception):
    """Raised when a word is not present in the embedding's vocabulary."""
@@ -0,0 +1,46 @@
1
+ from gensim.models.phrases import Phraser as GSPhraser
2
+ from gensim.models.phrases import Phrases
3
+ import logging
4
+
5
# NOTE(review): basicConfig at import time mutates the process-wide root
# logger configuration; a module-level getLogger(__name__) would be less
# intrusive for a library, but this is kept to preserve behavior.
logging.basicConfig(
    level=logging.ERROR,
    format='%(levelname)s %(asctime)s: %(message)s',
    datefmt='%d.%m.%Y %H:%M:%S',
)
10
+
11
class Phraser:
    """
    Thin wrapper around gensim's phrase detection (Phrases -> Phraser).

    An untrained Phraser passes input through unchanged, so it is always
    safe to call phrase().
    """

    def __init__(self):
        # Trained gensim phraser; stays None until train() is called.
        self._phraser = None

    def train(self, sentences):
        """
        Trains the phraser model using the input sentences (any iterable of
        token lists).
        """
        self._phraser = GSPhraser(Phrases(sentences))

    def _phrase(self, tokens):
        # Apply the trained phraser. Older serialized models can raise
        # AttributeError here — log the error and skip phrasing in that case.
        try:
            return self._phraser[tokens]
        except AttributeError as e:
            logging.error(f"Phrasing failed. Skipping. Error Message: {e}")
            return tokens

    def phrase(self, text):
        """
        Phrases the input text.

        Accepts either a whitespace-delimited string (returns a string) or a
        token list (returns a token list). Returns the input unchanged when
        no model has been trained.
        """
        if not self._phraser:
            return text
        if isinstance(text, str):
            return ' '.join(self._phrase(text.split(' ')))
        return self._phrase(text)