glove 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,247 @@
1
module Glove
  # Word-vector model. Holds the corpus, the co-occurrence matrix, the word
  # vector matrix and the word biases, and exposes fitting, training,
  # persistence and similarity queries on top of them.
  class Model
    # Default options (see #initialize)
    DEFAULTS = {
      max_count: 100,
      learning_rate: 0.05,
      alpha: 0.75,
      num_components: 30,
      epochs: 5,
      threads: 4
    }

    # @!attribute [r] corpus
    #   @return [Glove::Corpus] reference to the Corpus instance
    # @!attribute [r] token_index
    #   @return [Hash] reference to corpus.index
    # @!attribute [r] token_pairs
    #   @return [Array<(Glove::TokenPair)>] reference to corpus.pairs
    # @!attribute [rw] word_vec
    #   @return [GSL::Matrix] the word vector matrix
    # @!attribute [rw] word_biases
    #   @return [GSL::Vector] the vector holding the word biases
    attr_reader :opts, :window, :epochs, :num_components, :min_count
    attr_reader :learning_rate, :alpha, :max_count, :threads
    attr_reader :cooc_matrix, :corpus, :token_index, :token_pairs
    attr_accessor :word_vec, :word_biases

    # Create a new {Glove::Model} instance. Accepts options for
    # {Glove::Corpus} and {Glove::Parser} which only get forwarded
    # and not used in this class.
    #
    # @param [Hash] options the options to initialize the instance with.
    # @option options [Integer] :max_count (100) Parameter specifying cutoff in
    #   weighting function
    # @option options [Float] :learning_rate (0.05) Initial learning rate
    # @option options [Float] :alpha (0.75) Exponent of weighting function
    # @option options [Integer] :num_components (30) Column size of the word vector
    #   matrix
    # @option options [Integer] :epochs (5) Number of training iterations
    # @option options [Integer] :threads (4) Number of threads to use in building
    #   the co-occurrence matrix and training iterations. Must be greater than 0
    # @return [Glove::Model] A GloVe model.
    def initialize(options={})
      @opts = DEFAULTS.dup.merge(options)

      # Every merged option (including the ones that are only forwarded)
      # becomes an instance variable named after its key.
      @opts.each do |key, value|
        instance_variable_set :"@#{key}", value
      end

      @cooc_matrix = nil
      @word_vec    = nil
      @word_biases = nil
    end

    # Fit a string or {Glove::Corpus} instance, build the co-occurrence matrix
    # and create the initial word vectors and biases
    #
    # @param [String, Glove::Corpus] text The text to train from
    # @example Provide corpus for the model
    #   model = Glove::Model.new
    #   model.fit(File.read('shakespeare.txt'))
    # @example Provide a {Glove::Corpus} instance as text argument
    #   model = Glove::Model.new
    #   corpus = Glove::Corpus.build(File.read('shakespeare.txt'))
    #   model.fit(corpus)
    # @return [Glove::Model] Current instance
    def fit(text)
      fit_corpus(text)
      build_cooc_matrix
      build_word_vectors
      self
    end

    # Train the model. Must call #fit prior
    # @return [Glove::Model] Current instance
    def train
      train_in_epochs(matrix_nnz)
      self
    end

    # Save trained data to files
    #
    # @param [String] corpus_file Filename for corpus
    # @param [String] cooc_file Filename for co-occurrence matrix
    # @param [String] vec_file Filename for Word Vector Matrix
    # @param [String] bias_file Filename for Word Biases Vector
    def save(corpus_file, cooc_file, vec_file, bias_file)
      File.open(corpus_file, 'wb') do |file|
        file.write Marshal.dump(corpus)
      end

      cooc_matrix.fwrite(cooc_file)
      word_vec.fwrite(vec_file)
      word_biases.fwrite(bias_file)
    end

    # Loads training data from already existing files
    #
    # @param [String] corpus_file Filename for corpus
    # @param [String] cooc_file Filename for co-occurrence matrix
    # @param [String] vec_file Filename for Word Vector Matrix
    # @param [String] bias_file Filename for Word Biases Vector
    def load(corpus_file, cooc_file, vec_file, bias_file)
      # SECURITY: Marshal.load can instantiate arbitrary objects. Only load
      # corpus files that were produced by #save from a trusted source.
      @corpus = Marshal.load(File.binread(corpus_file))

      @token_index = corpus.index
      @token_pairs = corpus.pairs

      size = token_index.size

      # The containers are allocated first and then filled from disk.
      # NOTE(review): presumably num_components has to match the value the
      # files were saved with - confirm against GSL fread semantics.
      @cooc_matrix = GSL::Matrix.alloc(size, size)
      @word_vec    = GSL::Matrix.alloc(size, num_components)
      @word_biases = GSL::Vector.alloc(size)

      @cooc_matrix.fread(cooc_file)
      @word_vec.fread(vec_file)
      @word_biases.fread(bias_file)
    end

    # @todo create graph of the word vector matrix
    def visualize
      raise "Not implemented"
    end

    # Get words that relate to :target like :word1 relates to :word2.
    #
    # The cosine between the vectors of word1 and word2 is computed first.
    # Then the words ranked by cosine against target are walked in descending
    # order, entries whose absolute cosine lies within +accuracy+ of that
    # reference value are dropped, and the first +num+ remaining are returned.
    #
    # @param [String] word1
    # @param [String] word2
    # @param [String] target
    # @param [Integer] num Number of related words to :target
    # @param [Float] accuracy Allowance in difference of target cosine
    #   and related word cosine distances
    # @example What words relate to atom like quantum relates to physics?
    #   model.analogy_words('quantum', 'physics', 'atom')
    #   # => [["electron", 0.98583], ["energi", 0.98151], ["photon",0.96650]]
    # @return [Array] List of related words to target
    def analogy_words(word1, word2, target, num=3, accuracy=0.0001)
      word1  = word1.stem
      word2  = word2.stem # each word is stemmed from its own argument
      target = target.stem

      distance = cosine(vector(word1), vector(word2))

      vector_distance(target).reject do |item|
        diff = item[1].to_f.abs - distance
        diff.abs < accuracy
      end.take(num)
    end

    # Get most similar words to :word
    #
    # @param [String] word The word to find similar to
    # @param [Integer] num (3) Number of similar words to :word
    # @example Get 1 most similar word to 'physics'
    #   model.most_similar('physics', 1) # => ["quantum", 0.9967993356234444]
    # @return [Array] List of most similar words with cosine distance as values
    def most_similar(word, num=3)
      vector_distance(word.stem).take(num)
    end

    # Prevent token_pairs, matrices and vectors to fill up the terminal
    def inspect
      to_s
    end

    private

    # Perform train iterations. Each epoch reshuffles the indices and replaces
    # @word_vec and @word_biases with the result of a TrainingWorker run.
    #
    # @param [Array] indices The non-zero value indices in cooc_matrix
    def train_in_epochs(indices)
      epochs.times do
        shuffled = indices.shuffle
        @word_vec, @word_biases = Workers::TrainingWorker.new(self, shuffled).run
      end
    end

    # Builds the corpus and sets @token_index and @token_pairs
    #
    # @param [String, Glove::Corpus] text Raw text, or an already built corpus
    def fit_corpus(text)
      @corpus =
        if text.is_a? Corpus
          text
        else
          Corpus.build(text, opts)
        end

      @token_index = corpus.index
      @token_pairs = corpus.pairs
    end

    # Create initial values for @word_vec and @word_biases
    def build_word_vectors
      rows = token_index.size
      @word_vec    = GSL::Matrix.rand(rows, num_components)
      @word_biases = GSL::Vector.alloc(rows)
    end

    # Builds the co-occurrence matrix
    def build_cooc_matrix
      @cooc_matrix = Workers::CooccurrenceWorker.new(self).run
    end

    # Array of all non-zero (both row and col) value coordinates in the
    # cooc_matrix
    #
    # @return [Array<(Integer, Integer)>] [row, col] pairs, column by column
    def matrix_nnz
      entries = []
      cooc_matrix.enum_for(:each_col).each_with_index do |col, col_idx|
        col.enum_for(:each).each_with_index do |_cell, row_idx|
          value = cooc_matrix[row_idx, col_idx]

          entries << [row_idx, col_idx] unless value.zero?
        end
      end
      entries
    end

    # Find the vector row of @word_vec for a given word
    #
    # @param [String] word The word to transform into a vector
    # @return [GSL::Vector, nil] The corresponding row of the #word_vec matrix,
    #   or nil when the word is not in the token index
    def vector(word)
      return nil unless word_index = token_index[word]
      word_vec.row(word_index)
    end

    # Calculates the cosine distance of all the words in the vocabulary
    # against a given word. Results are then sorted in DESC order
    #
    # @param [String] word The word to compare against
    # @return [Array<(String, Float)>] Array of tokens and their distance;
    #   empty when the word is not in the token index
    def vector_distance(word)
      # Always hand back an Array so callers get one consistent type.
      return [] unless word_vector = vector(word)

      token_index.keys.each_with_index.map do |token, idx|
        next if token.eql? word
        [token, cosine(word_vector, word_vec.row(idx))]
      end.compact.sort{ |a,b| b[1] <=> a[1] }
    end

    # Compute cosine distance between two vectors
    #
    # @param [GSL::Vector] vector1 First vector
    # @param [GSL::Vector] vector2 Second vector
    # @return [Float] the cosine distance (0 when either vector is nil)
    def cosine(vector1, vector2)
      return 0 if vector1.nil? || vector2.nil?
      vector1.dot(vector2) / (vector1.norm * vector2.norm)
    end
  end
end
@@ -0,0 +1,90 @@
1
module Glove
  # Takes a string of text and tokenizes it for usage in {Glove::Corpus}
  #
  class Parser
    # Default options (see #initialize)
    DEFAULTS = {
      stem: true,
      min_length: 3,
      max_length: 25,
      alphabetic: true,
      normalize: true,
      stop_words: true
    }

    # @!attribute [r] text
    #   @return [String, Array<String>] the current value of the text
    #     attribute; a String until #split has run, an Array of words after
    #
    attr_reader :text

    # Create a new {Glove::Parser}, passing the text and options as arguments
    #
    # @param [String] text value for the text attribute
    # @param [Hash] options the options to initialize the instance with.
    # @option options [Boolean] :stem (true) Whether to stem the tokens
    # @option options [Boolean] :alphabetic (true) Remove any non-alphabetic chars
    # @option options [Boolean] :normalize (true) Normalize the text and keep
    #   words with length between option[:min_length] and option[:max_length]
    # @option options [Boolean] :stop_words (true) Filter stop words
    # @option options [Integer] :min_length (3) the min allowed length of a word
    # @option options [Integer] :max_length (25) the max allowed length of a word
    # @return [Glove::Parser] A new parser.
    def initialize(text, options={})
      @text, @opt = text, DEFAULTS.dup.merge(options)
    end

    # Call all parsing methods in the class and return the final text value as
    # array of words
    #
    # @return [Array] The tokens array
    def tokenize
      downcase
      stop_words if @opt[:stop_words]
      alphabetic if @opt[:alphabetic]
      split
      normalize  if @opt[:normalize]
      stem       if @opt[:stem]
      text
    end

    # Downcases the text value. A new string is stored, so the string that
    # was handed to the constructor is left untouched (and may be frozen).
    #
    # @return [String] the downcased text
    def downcase
      @text = text.downcase
    end

    # Splits the text string into an array of words
    def split
      @text = text.split
    end

    # Filters out the text leaving only alphabetical characters in words.
    # Runs of non-alphabetic characters and words mixing letters with digits
    # are replaced by a single space. Stores a new string instead of editing
    # the current one in place.
    #
    # @return [String] the filtered text
    def alphabetic
      @text = text.gsub(/([^[:alpha:]]+)|((?=\w*[a-z])(?=\w*[0-9])\w+)/, ' ')
    end

    # Stems every member of the text array
    def stem
      text.map!(&:stem)
    end

    # Selects words with length within the :min_length and :max_length boundaries
    def normalize
      text.keep_if do |word|
        word.length.between?(@opt[:min_length], @opt[:max_length])
      end
    end

    # Exclude words that are in the stop words list.
    #
    # The text is scanned as (word, trailing separator) pairs. The separator
    # is optional (\W*), so the last word is kept even when nothing follows it.
    #
    # @return [String] the text without stop words
    def stop_words
      @text = text.scan(/(\w+)(\W*)/).reject do |(word, _separator)|
        stop_words_array.include? word
      end.flatten.join
    end

    # Reads the default stop words file and return array of its entries.
    # The result is memoized in @stop_words.
    def stop_words_array
      @stop_words ||= File.read(File.join(Glove.root_path, 'resources', 'en.stop')).split
    end
  end
end
@@ -0,0 +1,15 @@
1
module Glove
  # Value holder pairing a token string with the array of its neighbors
  class TokenPair
    # @!attribute [rw] token
    #   @return [String] The word/token
    # @!attribute [rw] neighbors
    #   @return [Array<String>] List of neighboring words
    attr_accessor :token
    attr_accessor :neighbors

    # Build a pair from a token and its neighbors
    #
    # @param [String] token ('') The word/token
    # @param [Array<String>] neighbors ([]) Neighboring words of the token
    def initialize(token='', neighbors=[])
      @token = token
      @neighbors = neighbors
    end
  end
end
@@ -0,0 +1,3 @@
1
# Namespace of the glove gem
module Glove
  # Version string of this release of the gem
  VERSION = "0.1.1"
end
@@ -0,0 +1,8 @@
1
+ require 'forwardable'
2
+
3
module Glove
  # Namespace for the worker classes. Loads the co-occurrence worker first,
  # then the training worker.
  module Workers
    %w[cooccurrence_worker training_worker].each do |worker|
      require "glove/workers/#{worker}"
    end
  end
end
@@ -0,0 +1,55 @@
1
module Glove
  module Workers
    # Builds the co-occurrence matrix on behalf of a {Glove::Model}
    class CooccurrenceWorker
      extend ::Forwardable

      # @!attribute [r] token_index
      #   @return [Hash{String=>Integer}] Copy (dup) of @caller.token_index
      # @!attribute [r] token_pairs
      #   @return [Array<(Glove::TokenPair)>] Copy (dup) of @caller.token_pairs
      attr_reader :token_index, :token_pairs

      def_delegators :@caller, :threads

      # Creates instance of the class, taking shallow copies of the caller's
      # token index and token pairs
      #
      # @param [Glove::Model] caller Caller class
      def initialize(caller)
        @caller      = caller
        @token_index = caller.token_index.dup
        @token_pairs = caller.token_pairs.dup
      end

      # Perform the building of the matrix. One array is computed per entry
      # of the token index via Parallel.map, using +threads+ processes, and
      # all arrays are handed to GSL::Matrix.alloc.
      #
      # @return [GSL::Matrix] The co-occurrence matrix
      def run
        columns = Parallel.map(token_index, in_processes: threads) do |entry|
          build_cooc_matrix_col(entry)
        end

        GSL::Matrix.alloc(*columns)
      end

      # Creates a vector column for the cooc_matrix based on given token.
      # For every token pair, the number of times the token appears among the
      # pair's neighbors is added at the index of the pair's own token.
      #
      # @param [Array<(String, Integer)>] slice Token with index
      # @return [Array] GSL::Vector#to_a representation of the column
      def build_cooc_matrix_col(slice)
        token  = slice[0]
        column = GSL::Vector.alloc(token_index.size)

        token_pairs.each do |pair|
          position = token_index[pair.token]
          column[position] += pair.neighbors.count(token)
        end

        column.to_a
      end
    end
  end
end
@@ -0,0 +1,104 @@
1
module Glove
  module Workers
    # Performs the training process on the word vector matrix, as well as word biases
    class TrainingWorker
      extend ::Forwardable

      # @!attribute [rw] indices
      #   @return [Array<(Integer, Integer)>] Shuffled co-occurrence matrix slots
      # @!attribute [rw] word_vec
      #   @return [GSL::Matrix] Copy (dup) of @caller.word_vec
      # @!attribute [rw] word_biases
      #   @return [GSL::Vector] Copy (dup) of @caller.word_biases
      attr_accessor :indices, :word_vec, :word_biases

      def_delegators :@caller, :cooc_matrix, :threads, :max_count, :alpha, :learning_rate

      # Create a {Glove::Workers::TrainingWorker} instance
      # @param [Glove::Model] caller Caller class
      # @param [Array<(Integer, Integer)>] indices Shuffled indices of non-zero
      #   elements in the model's co-occurrence matrix
      def initialize(caller, indices)
        @caller, @indices = caller, indices
        @word_vec    = @caller.word_vec.dup
        @word_biases = @caller.word_biases.dup
      end

      # Runs the calculations. The indices are cut into at most +threads+
      # slices and every slice is processed in its own Thread.
      #
      # @return [GSL::Matrix, GSL::Vector] Weighted word vectors and word biases
      def run
        mutex = Mutex.new

        workers = indices.each_slice(slice_size).map do |slice|
          Thread.new{ work(slice, mutex) }
        end
        workers.each(&:join)

        [word_vec, word_biases]
      end

      # Perform a full train iteration on the word vectors and word biases
      # @param [Array] slice Shuffled co-occurrence matrix slots
      # @param [Mutex] mutex Lock held while #apply_weights runs
      def work(slice, mutex)
        slice.each do |slot|
          w1, w2 = slot
          loss, word_a_norm, word_b_norm = calc_weights(w1, w2)

          mutex.synchronize do
            apply_weights(w1, w2, loss, word_a_norm, word_b_norm)
          end
        end
      end

      # Calculates loss, and norms for word1 (row) and word2 (column) by given
      # indices.
      #
      # The prediction is the dot product of the two word vectors plus both
      # word biases. The loss is the difference between the prediction and the
      # log of the co-occurrence count, scaled by the entry weight
      # min(1, count / max_count) ** alpha.
      #
      # @param [Integer] w1 Row index
      # @param [Integer] w2 Column index
      # @param [Float] prediction (0.0) Initial prediction value
      # @param [Float] word_a_norm (0.0) Initial norm of word at row w1
      # @param [Float] word_b_norm (0.0) Initial norm of word at col w2
      # @return [Float, Float, Float] Array of loss, word_a_norm, word_b_norm
      def calc_weights(w1, w2, prediction=0.0, word_a_norm=0.0, word_b_norm = 0.0)
        count = cooc_matrix[w1, w2]

        word_vec.each_col do |col|
          w1_context = col[w1]
          w2_context = col[w2]

          # Dot product term: the components are multiplied, not added.
          prediction  += w1_context * w2_context
          word_a_norm += w1_context * w1_context
          word_b_norm += w2_context * w2_context
        end

        prediction   = prediction + word_biases[w1] + word_biases[w2]
        word_a_norm  = Math.sqrt(word_a_norm)
        word_b_norm  = Math.sqrt(word_b_norm)
        # to_f keeps the ratio fractional even if both operands are Integers
        entry_weight = [1.0, (count.to_f / max_count)].min ** alpha
        loss         = entry_weight * (prediction - Math.log(count))

        [loss, word_a_norm, word_b_norm]
      end

      # Applies calculated weights to @word_vec and @word_biases. MUST be called
      # in a Mutex#synchronize block
      #
      # Each word vector component is moved against the gradient, which for one
      # word is the loss times the matching component of the other word, and
      # is then divided by the norm that was passed in for that word.
      #
      # @param [Integer] w1 Row index
      # @param [Integer] w2 Column index
      # @param [Float] loss Loss value
      # @param [Float] word_a_norm Norm of word at row w1
      # @param [Float] word_b_norm Norm of word at col w2
      def apply_weights(w1, w2, loss, word_a_norm, word_b_norm)
        step = learning_rate * loss

        word_vec.each_col do |col|
          # Read both components before writing so that neither update sees
          # the freshly written value of the other word.
          w1_context = col[w1]
          w2_context = col[w2]

          col[w1] = (w1_context - step * w2_context) / word_a_norm
          col[w2] = (w2_context - step * w1_context) / word_b_norm
        end

        word_biases[w1] -= step
        word_biases[w2] -= step
      end

      private

      # Number of indices handed to each thread: the index count divided by
      # the thread count, rounded up, and never less than 1 so that
      # Enumerable#each_slice always receives a valid slice size.
      #
      # @return [Integer] slice size, always >= 1
      def slice_size
        pool = [threads.to_i, 1].max
        [(indices.size + pool - 1) / pool, 1].max
      end
    end
  end
end