glove 0.1.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,247 @@
1
module Glove
  # Holds the state of a GloVe model: the corpus, the co-occurrence matrix,
  # the word vector matrix and the word biases. Building the co-occurrence
  # matrix and training are delegated to the classes in Glove::Workers.
  class Model
    # Default options (see #initialize)
    DEFAULTS = {
      max_count: 100,
      learning_rate: 0.05,
      alpha: 0.75,
      num_components: 30,
      epochs: 5,
      threads: 4
    }

    # @!attribute [r] opts
    #   @return [Hash] DEFAULTS merged with the options given to #initialize
    # @!attribute [r] corpus
    #   @return [Glove::Corpus] reference to the Corpus instance
    # @!attribute [r] token_index
    #   @return [Hash] reference to corpus.index
    # @!attribute [r] token_pairs
    #   @return [Array<(Glove::TokenPair)>] reference to corpus.pairs
    # @!attribute [rw] word_vec
    #   @return [GSL::Matrix] the word vector matrix
    # @!attribute [rw] word_biases
    #   @return [GSL::Vector] the vector holding the word biases
    #
    # NOTE: :window and :min_count have no entry in DEFAULTS; their readers
    # return nil unless the caller passed them in the options hash.
    attr_reader :opts, :window, :epochs, :num_components, :min_count
    attr_reader :learning_rate, :alpha, :max_count, :threads
    attr_reader :cooc_matrix, :corpus, :token_index, :token_pairs
    attr_accessor :word_vec, :word_biases

    # Create a new {Glove::Model} instance. Accepts options for
    # {Glove::Corpus} and {Glove::Parser} which only get forwarded
    # and not used in this class.
    #
    # Every key of the merged options hash is stored in an instance variable
    # of the same name.
    #
    # @param [Hash] options the options to initialize the instance with.
    # @option options [Integer] :max_count (100) Parameter specifying cutoff in
    #   weighting function
    # @option options [Float] :learning_rate (0.05) Initial learning rate
    # @option options [Float] :alpha (0.75) Exponent of weighting function
    # @option options [Integer] :num_components (30) Column size of the word
    #   vector matrix
    # @option options [Integer] :epochs (5) Number of training iterations
    # @option options [Integer] :threads (4) Number of threads to use in
    #   building the co-occurrence matrix and training iterations. Must be
    #   greater than 0
    # @return [Glove::Model] A GloVe model.
    def initialize(options={})
      @opts = DEFAULTS.merge(options)
      @opts.each do |key, value|
        instance_variable_set(:"@#{key}", value)
      end

      # Assigned after the options loop so options can never pre-populate them
      @cooc_matrix = nil
      @word_vec    = nil
      @word_biases = nil
    end

    # Fit a string or {Glove::Corpus} instance and build the co-occurrence
    # matrix and the initial word vectors
    #
    # @param [String, Glove::Corpus] text The text to train from
    # @example Provide corpus for the model
    #   model = Glove::Model.new
    #   model.fit(File.read('shakespeare.txt'))
    # @example Provide a {Glove::Corpus} instance as text argument
    #   model = Glove::Model.new
    #   corpus = Glove::Corpus.build(File.read('shakespeare.txt'))
    #   model.fit(corpus)
    # @return [Glove::Model] Current instance
    def fit(text)
      fit_corpus(text)
      build_cooc_matrix
      build_word_vectors
      self
    end

    # Train the model. Must call #fit (or #load) prior, since training reads
    # the co-occurrence matrix.
    #
    # @return [Glove::Model] Current instance
    def train
      train_in_epochs(matrix_nnz)
      self
    end

    # Save trained data to files
    #
    # @param [String] corpus_file Filename for corpus
    # @param [String] cooc_file Filename for co-occurrence matrix
    # @param [String] vec_file Filename for Word Vector Matrix
    # @param [String] bias_file Filename for Word Biases Vector
    def save(corpus_file, cooc_file, vec_file, bias_file)
      File.open(corpus_file, 'wb') do |file|
        file.write Marshal.dump(corpus)
      end

      cooc_matrix.fwrite(cooc_file)
      word_vec.fwrite(vec_file)
      word_biases.fwrite(bias_file)
    end

    # Loads training data from already existing files. The matrices are
    # allocated from the loaded corpus' index size and #num_components, so
    # the model must be configured with the same :num_components that was
    # used when the files were saved.
    #
    # @param [String] corpus_file Filename for corpus
    # @param [String] cooc_file Filename for co-occurrence matrix
    # @param [String] vec_file Filename for Word Vector Matrix
    # @param [String] bias_file Filename for Word Biases Vector
    def load(corpus_file, cooc_file, vec_file, bias_file)
      # SECURITY: Marshal.load can instantiate arbitrary objects. Only load
      # corpus files that were produced by #save from a trusted source.
      @corpus = Marshal.load(File.binread(corpus_file))

      @token_index = corpus.index
      @token_pairs = corpus.pairs

      size = token_index.size

      @cooc_matrix = GSL::Matrix.alloc(size, size)
      @word_vec    = GSL::Matrix.alloc(size, num_components)
      @word_biases = GSL::Vector.alloc(size)

      @cooc_matrix.fread(cooc_file)
      @word_vec.fread(vec_file)
      @word_biases.fread(bias_file)
    end

    # @todo create graph of the word vector matrix
    # @raise [RuntimeError] always, the method is not implemented yet
    def visualize
      raise "Not implemented"
    end

    # Get words that relate to :target like :word1 relates to :word2
    #
    # The cosine between word1 and word2 is used as reference distance; the
    # candidates are the words ranked by cosine against target, minus those
    # whose absolute cosine lies within +accuracy+ of the reference distance.
    #
    # @param [String] word1
    # @param [String] word2
    # @param [String] target
    # @param [Integer] num Number of related words to :target
    # @param [Float] accuracy Allowance in difference of target cosine
    #   and related word cosine distances
    # @example What words relate to atom like quantum relates to physics?
    #   model.analogy_words('quantum', 'physics', 'atom')
    #   # => [["electron", 0.98583], ["energi", 0.98151], ["photon",0.96650]]
    # @return [Array] List of related words to target
    def analogy_words(word1, word2, target, num=3, accuracy=0.0001)
      word1  = word1.stem
      word2  = word2.stem # previously stemmed word1 again, ignoring word2
      target = target.stem

      distance = cosine(vector(word1), vector(word2))

      vector_distance(target).reject do |item|
        diff = item[1].to_f.abs - distance
        diff.abs < accuracy
      end.take(num)
    end

    # Get most similar words to :word
    #
    # @param [String] word The word to find similar to
    # @param [Integer] num (3) Number of similar words to :word
    # @example Get 1 most similar word to 'physics'
    #   model.most_similar('physics', 1) # => [["quantum", 0.9967993356234444]]
    # @return [Array] List of most similar words with cosine distance as
    #   values; empty when the word is not in the vocabulary
    def most_similar(word, num=3)
      vector_distance(word.stem).take(num)
    end

    # Prevent token_pairs, matrices and vectors from filling up the terminal
    def inspect
      to_s
    end

    private

    # Perform train iterations. Each epoch reshuffles the indices and
    # replaces @word_vec and @word_biases with the worker's result.
    #
    # @param [Array] indices The non-zero value indices in cooc_matrix
    def train_in_epochs(indices)
      1.upto(epochs) do |_epoch|
        shuffled = indices.shuffle
        @word_vec, @word_biases = Workers::TrainingWorker.new(self, shuffled).run
      end
    end

    # Builds the corpus (unless one is given) and sets @token_index and
    # @token_pairs
    #
    # @param [String, Glove::Corpus] text Raw text or a ready corpus
    def fit_corpus(text)
      @corpus = text.is_a?(Corpus) ? text : Corpus.build(text, opts)

      @token_index = corpus.index
      @token_pairs = corpus.pairs
    end

    # Create initial values for @word_vec and @word_biases, one row/entry
    # per token in the index
    def build_word_vectors
      size = token_index.size
      @word_vec    = GSL::Matrix.rand(size, num_components)
      @word_biases = GSL::Vector.alloc(size)
    end

    # Builds the co-occurrence matrix
    def build_cooc_matrix
      @cooc_matrix = Workers::CooccurrenceWorker.new(self).run
    end

    # Array of all non-zero value coordinates in the cooc_matrix
    #
    # @return [Array<(Integer, Integer)>] [row, column] pairs
    def matrix_nnz
      entries = []
      cooc_matrix.enum_for(:each_col).each_with_index do |col, col_idx|
        col.enum_for(:each).each_with_index do |value, row_idx|
          entries << [row_idx, col_idx] unless value.zero?
        end
      end
      entries
    end

    # Find the vector row of @word_vec for a given word
    #
    # @param [String] word The word to transform into a vector
    # @return [GSL::Vector, nil] The corresponding row of the #word_vec
    #   matrix, or nil when the word is not in the token index
    def vector(word)
      word_index = token_index[word]
      return nil unless word_index

      word_vec.row(word_index)
    end

    # Calculates the cosine distance of all the words in the vocabulary
    # against a given word. Results are then sorted in DESC order
    #
    # @param [String] word The word to compare against
    # @return [Array<(String, Float)>] Array of tokens and their distance;
    #   empty when the word is not in the vocabulary
    def vector_distance(word)
      word_vector = vector(word)
      return [] unless word_vector

      # Rows are looked up through the stored index, the same way #vector
      # does, rather than through the position in the iteration.
      distances = token_index.reject { |token, _| token.eql?(word) }.map do |token, index|
        [token, cosine(word_vector, word_vec.row(index))]
      end

      distances.sort { |a, b| b[1] <=> a[1] }
    end

    # Compute cosine distance between two vectors
    #
    # @param [GSL::Vector] vector1 First vector
    # @param [GSL::Vector] vector2 Second vector
    # @return [Float, Integer] the cosine distance; 0 when either vector is
    #   missing or has zero length
    def cosine(vector1, vector2)
      return 0 if vector1.nil? || vector2.nil?

      magnitude = vector1.norm * vector2.norm
      # A zero-length vector would yield NaN, which cannot be sorted
      return 0 if magnitude.zero?

      vector1.dot(vector2) / magnitude
    end
  end
end
@@ -0,0 +1,90 @@
1
module Glove
  # Takes a string of text and tokenizes it for usage in {Glove::Corpus}
  #
  # The parser keeps its working value in #text: a String until #split is
  # called and an Array of words afterwards. The string passed to the
  # constructor is never modified.
  class Parser
    # Default options (see #initialize)
    DEFAULTS = {
      stem: true,
      min_length: 3,
      max_length: 25,
      alphabetic: true,
      normalize: true,
      stop_words: true
    }

    # @!attribute [r] text
    #   @return [String, Array<String>] the current value of the text
    #     attribute (an Array once #split has run)
    attr_reader :text

    # Create a new {Glove::Parser}, passing the text and options as arguments
    #
    # @param [String] text value for the text attribute
    # @param [Hash] options the options to initialize the instance with.
    # @option options [Boolean] :stem (true) Whether to stem the tokens
    # @option options [Boolean] :alphabetic (true) Remove any non-alphabetic
    #   chars
    # @option options [Boolean] :normalize (true) Normalize the text and keep
    #   words with length between option[:min_length] and option[:max_length]
    # @option options [Boolean] :stop_words (true) Filter stop words
    # @option options [Integer] :min_length (3) the min allowed length of a word
    # @option options [Integer] :max_length (25) the max allowed length of a word
    # @return [Glove::Parser] A new parser.
    def initialize(text, options={})
      @text = text
      @opt  = DEFAULTS.merge(options)
    end

    # Call all parsing methods in the class and return the final text value as
    # array of words
    #
    # @return [Array] The tokens array
    def tokenize
      downcase
      stop_words if @opt[:stop_words]
      alphabetic if @opt[:alphabetic]
      split
      normalize  if @opt[:normalize]
      stem       if @opt[:stem]
      text
    end

    # Downcases the text value. Works on a copy so the caller's string is
    # left untouched and frozen strings are accepted.
    def downcase
      @text = text.downcase
    end

    # Splits the text string into an array of words
    def split
      @text = text.split
    end

    # Replaces every run of non-alphabetic characters, and every word that
    # mixes letters with digits, by a single space
    def alphabetic
      @text = text.gsub(/([^[:alpha:]]+)|((?=\w*[a-z])(?=\w*[0-9])\w+)/, ' ')
    end

    # Stems every member of the text array
    def stem
      text.map!(&:stem)
    end

    # Selects words with length within the :min_length and :max_length boundaries
    def normalize
      text.keep_if do |word|
        word.length.between?(@opt[:min_length], @opt[:max_length])
      end
    end

    # Exclude words that are in the stop words list. Each word is scanned
    # together with the separator that follows it; the separator is optional
    # so the last word of the text is kept even without trailing punctuation
    # or whitespace.
    def stop_words
      kept = text.scan(/(\w+)(\W*)/).reject do |(word, _separator)|
        stop_words_array.include? word
      end
      @text = kept.flatten.join
    end

    # Reads the default stop words file and return array of its entries
    # (memoized per parser instance)
    def stop_words_array
      @stop_words ||= File.read(File.join(Glove.root_path, 'resources', 'en.stop')).split
    end
  end
end
@@ -0,0 +1,15 @@
1
module Glove
  # Value holder pairing a token string with the list of its neighbors
  class TokenPair
    # @!attribute [rw] token
    #   @return [String] The word/token
    # @!attribute [rw] neighbors
    #   @return [Array<(String)>] List of neighboring words
    attr_accessor :token, :neighbors

    # Build a pair from a token and its neighbors
    #
    # @param [String] token ('') The word/token
    # @param [Array<(String)>] neighbors ([]) Neighboring words
    def initialize(token='', neighbors=[])
      @token = token
      @neighbors = neighbors
    end
  end
end
@@ -0,0 +1,3 @@
1
module Glove
  # Version string of the gem
  VERSION = '0.1.1'
end
@@ -0,0 +1,8 @@
1
+ require 'forwardable'
2
+
3
module Glove
  # Namespace of the worker classes; loading it requires each worker file,
  # co-occurrence worker first, training worker second.
  module Workers
    %w[cooccurrence_worker training_worker].each do |worker|
      require "glove/workers/#{worker}"
    end
  end
end
@@ -0,0 +1,55 @@
1
module Glove
  module Workers
    # Constructs the co-occurrence matrix for {Glove::Model}
    class CooccurrenceWorker
      extend ::Forwardable

      # @!attribute [r] token_index
      #   @return [Hash{String=>Integer}] Clone of @caller.token_index
      # @!attribute [r] token_pairs
      #   @return [Array<(Glove::TokenPair)>] Clone of @caller.token_pairs
      attr_reader :token_index, :token_pairs

      def_delegators :@caller, :threads

      # Creates instance of the class, taking shallow copies of the caller's
      # token index and token pairs
      #
      # @param [Glove::Model] caller Caller class
      def initialize(caller)
        @caller = caller
        @token_index = caller.token_index.dup
        @token_pairs = caller.token_pairs.dup
      end

      # Perform the building of the matrix: one vector is computed per entry
      # of the token index (via Parallel, in +threads+ processes) and the
      # resulting arrays are handed to GSL::Matrix.alloc.
      #
      # @return [GSL::Matrix] The co-occurrence matrix
      def run
        columns = Parallel.map(token_index, in_processes: threads) do |entry|
          build_cooc_matrix_col(entry)
        end

        GSL::Matrix.alloc(*columns)
      end

      # Creates a vector column for the cooc_matrix based on given token.
      # For every token pair, adds the number of times the token appears
      # among the pair's neighbors at the position of the pair's own token.
      #
      # @param [Array<(String, Integer)>] slice Token with index
      # @return [Array] GSL::Vector#to_a representation of the column
      def build_cooc_matrix_col(slice)
        token = slice.first
        column = GSL::Vector.alloc(token_index.size)

        token_pairs.each do |pair|
          position = token_index[pair.token]
          column[position] += pair.neighbors.count(token)
        end

        column.to_a
      end
    end
  end
end
@@ -0,0 +1,104 @@
1
module Glove
  module Workers
    # Performs the training process on the word vector matrix, as well as
    # word biases
    class TrainingWorker
      extend ::Forwardable

      # @!attribute [rw] indices
      #   @return [Array<(Integer, Integer)>] Shuffled co-occurrence matrix slots
      # @!attribute [rw] word_vec
      #   @return [GSL::Matrix] Clone of @caller.word_vec
      # @!attribute [rw] word_biases
      #   @return [GSL::Vector] Clone of @caller.word_biases
      attr_accessor :indices, :word_vec, :word_biases

      def_delegators :@caller, :cooc_matrix, :threads, :max_count, :alpha, :learning_rate

      # Create a {Glove::Workers::TrainingWorker} instance
      #
      # @param [Glove::Model] caller Caller class
      # @param [Array<(Integer, Integer)>] indices Shuffled indices of non-zero
      #   elements in the model's co-occurrence matrix
      def initialize(caller, indices)
        @caller = caller
        @indices = indices
        @word_vec = @caller.word_vec.dup
        @word_biases = @caller.word_biases.dup
      end

      # Runs the calculations, splitting the indices over at most +threads+
      # threads.
      #
      # @return [GSL::Matrix, GSL::Vector] Weighted word vectors and word biases
      def run
        mutex = Mutex.new
        # Ceiling division, floored at 1: with fewer indices than threads the
        # plain quotient is 0 and each_slice(0) raises ArgumentError.
        slice_size = [(indices.size + threads - 1) / threads, 1].max

        workers = indices.each_slice(slice_size).map do |slice|
          Thread.new { work(slice, mutex) }
        end
        workers.each(&:join)

        [word_vec, word_biases]
      end

      # Perform a full train iteration on the word vectors and word biases
      #
      # @param [Array] slice Shuffled co-occurrence matrix slots
      # @param [Mutex] mutex Lock held while #apply_weights runs
      def work(slice, mutex)
        slice.each do |slot|
          w1, w2 = slot
          loss, word_a_norm, word_b_norm = calc_weights(w1, w2)

          mutex.synchronize do
            apply_weights(w1, w2, loss, word_a_norm, word_b_norm)
          end
        end
      end

      # Calculates loss, and norms for word1 (row) and word2 (column) by given
      # indices. The prediction is the dot product of the two word vectors
      # plus both biases; the loss is the weighted difference between the
      # prediction and the log of the co-occurrence count.
      #
      # @param [Integer] w1 Row index
      # @param [Integer] w2 Column index
      # @param [Float] prediction (0.0) Initial prediction value
      # @param [Float] word_a_norm (0.0) Initial norm of word at row w1
      # @param [Float] word_b_norm (0.0) Initial norm of word at col w2
      # @return [Float, Float, Float] Array of loss, word_a_norm, word_b_norm
      def calc_weights(w1, w2, prediction=0.0, word_a_norm=0.0, word_b_norm = 0.0)
        count = cooc_matrix[w1, w2]

        word_vec.each_col do |col|
          w1_context = col[w1]
          w2_context = col[w2]

          # Dot product term (was a sum of the components)
          prediction += w1_context * w2_context
          word_a_norm += w1_context * w1_context
          word_b_norm += w2_context * w2_context
        end

        prediction += word_biases[w1] + word_biases[w2]
        word_a_norm = Math.sqrt(word_a_norm)
        word_b_norm = Math.sqrt(word_b_norm)
        # to_f keeps the ratio fractional even if both operands are Integers
        entry_weight = [1.0, count / max_count.to_f].min ** alpha
        loss = entry_weight * (prediction - Math.log(count))

        [loss, word_a_norm, word_b_norm]
      end

      # Applies calculated weights to @word_vec and @word_biases. MUST be called
      # in a Mutex#synchronize block
      #
      # @param [Integer] w1 Row index
      # @param [Integer] w2 Column index
      # @param [Float] loss Loss value
      # @param [Float] word_a_norm Norm of word at row w1
      # @param [Float] word_b_norm Norm of word at col w2
      def apply_weights(w1, w2, loss, word_a_norm, word_b_norm)
        step = learning_rate * loss

        word_vec.each_col do |col|
          # Read both components first so each update uses the other word's
          # value from before this step.
          a = col[w1]
          b = col[w2]

          col[w1] = (a - step * b) / word_a_norm
          col[w2] = (b - step * a) / word_b_norm
        end

        word_biases[w1] -= step
        word_biases[w2] -= step
      end
    end
  end
end