glove 0.1.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +19 -0
- data/.rspec +1 -0
- data/.travis.yml +12 -0
- data/Gemfile +7 -0
- data/LICENSE.txt +22 -0
- data/README.md +122 -0
- data/Rakefile +7 -0
- data/benchmark/benchmark.rb +39 -0
- data/benchmark/co-occurrence.rb +85 -0
- data/benchmark/data/quantum-physics.txt +1 -0
- data/benchmark/profile.rb +19 -0
- data/benchmark/results/.keep +0 -0
- data/glove.gemspec +28 -0
- data/lib/glove.rb +18 -0
- data/lib/glove/corpus.rb +103 -0
- data/lib/glove/model.rb +247 -0
- data/lib/glove/parser.rb +90 -0
- data/lib/glove/token_pair.rb +15 -0
- data/lib/glove/version.rb +3 -0
- data/lib/glove/workers.rb +8 -0
- data/lib/glove/workers/cooccurrence_worker.rb +55 -0
- data/lib/glove/workers/training_worker.rb +104 -0
- data/resources/en.stop +711 -0
- data/spec/fixtures/biases-t.bin +0 -0
- data/spec/fixtures/cooc-t.bin +0 -0
- data/spec/fixtures/corpus-t.bin +0 -0
- data/spec/fixtures/words-t.bin +0 -0
- data/spec/lib/glove/corpus_spec.rb +77 -0
- data/spec/lib/glove/model_spec.rb +208 -0
- data/spec/lib/glove/parser_spec.rb +55 -0
- data/spec/lib/glove/token_pair_spec.rb +14 -0
- data/spec/lib/glove/workers/cooccurrence_worker_spec.rb +61 -0
- data/spec/lib/glove/workers/training_worker_spec.rb +84 -0
- data/spec/spec_helper.rb +18 -0
- metadata +177 -0
data/lib/glove/model.rb
ADDED
@@ -0,0 +1,247 @@
|
|
1
|
+
module Glove
  # Builds a co-occurrence matrix from a corpus, trains a word vector matrix
  # and word biases on it, and answers similarity queries against the result.
  class Model
    # Default options (see #initialize)
    DEFAULTS = {
      max_count: 100,
      learning_rate: 0.05,
      alpha: 0.75,
      num_components: 30,
      epochs: 5,
      threads: 4
    }

    # @!attribute [r] corpus
    #   @return [Glove::Corpus] reference to the Corpus instance
    # @!attribute [r] token_index
    #   @return [Hash] reference to corpus.index
    # @!attribute [r] token_pairs
    #   @return [Array<(Glove::TokenPair)>] reference to corpus.pairs
    # @!attribute [rw] word_vec
    #   @return [GSL::Matrix] the word vector matrix
    # @!attribute [rw] word_biases
    #   @return [GSL::Vector] the vector holding the word biases
    attr_reader :opts, :window, :epochs, :num_components, :min_count
    attr_reader :learning_rate, :alpha, :max_count, :threads
    attr_reader :cooc_matrix, :corpus, :token_index, :token_pairs
    attr_accessor :word_vec, :word_biases

    # Create a new {Glove::Model} instance. Accepts options for
    # {Glove::Corpus} and {Glove::Parser} which only get forwarded
    # and not used in this class.
    #
    # @param [Hash] options the options to initialize the instance with.
    # @option options [Integer] :max_count (100) Parameter specifying cutoff in
    #   weighting function
    # @option options [Float] :learning_rate (0.05) Initial learning rate
    # @option options [Float] :alpha (0.75) Exponent of weighting function
    # @option options [Integer] :num_components (30) Column size of the word vector
    #   matrix
    # @option options [Integer] :epochs (5) Number of training iterations
    # @option options [Integer] :threads (4) Number of threads to use in building
    #   the co-occurrence matrix and training iterations. Must be greater than 0
    # @return [Glove::Model] A GloVe model.
    def initialize(options={})
      @opts = DEFAULTS.dup.merge(options)
      # Every option (defaults and caller-supplied) becomes an instance
      # variable of the same name, which the attr_readers above expose.
      @opts.each do |key, value|
        instance_variable_set :"@#{key}", value
      end

      @cooc_matrix = nil
      @word_vec    = nil
      @word_biases = nil
    end

    # Fit a string or {Glove::Corpus} instance and build the co-occurrence
    # matrix
    #
    # @param [String, Glove::Corpus] text The text to train from
    # @example Provide corpus for the model
    #   model = Glove::Model.new
    #   model.fit(File.read('shakespeare.txt'))
    # @example Provide a {Glove::Corpus} instance as text argument
    #   model = Glove::Model.new
    #   corpus = Glove::Corpus.build(File.read('shakespeare.txt'))
    #   model.fit(corpus)
    # @return [Glove::Model] Current instance
    def fit(text)
      fit_corpus(text)
      build_cooc_matrix
      build_word_vectors
      self
    end

    # Train the model. Must call #fit prior
    # @return [Glove::Model] Current instance
    def train
      train_in_epochs(matrix_nnz)
      self
    end

    # Save trained data to files
    #
    # @param [String] corpus_file Filename for corpus
    # @param [String] cooc_file Filename for co-occurrence matrix
    # @param [String] vec_file Filename for Word Vector Matrix
    # @param [String] bias_file Filename for Word Biases Vector
    def save(corpus_file, cooc_file, vec_file, bias_file)
      File.open(corpus_file, 'wb') do |file|
        file.write Marshal.dump(corpus)
      end

      cooc_matrix.fwrite(cooc_file)
      word_vec.fwrite(vec_file)
      word_biases.fwrite(bias_file)
    end

    # Loads training data from already existing files
    #
    # @param [String] corpus_file Filename for corpus
    # @param [String] cooc_file Filename for co-occurrence matrix
    # @param [String] vec_file Filename for Word Vector Matrix
    # @param [String] bias_file Filename for Word Biases Vector
    def load(corpus_file, cooc_file, vec_file, bias_file)
      # NOTE(security): Marshal.load can instantiate arbitrary objects; only
      # load corpus files that were produced by #save from a trusted source.
      @corpus = Marshal.load(File.binread(corpus_file))

      @token_index = corpus.index
      @token_pairs = corpus.pairs

      size = token_index.size

      # Containers are allocated with the dimensions implied by the corpus
      # before their contents are read from disk.
      @cooc_matrix = GSL::Matrix.alloc(size, size)
      @word_vec    = GSL::Matrix.alloc(size, num_components)
      @word_biases = GSL::Vector.alloc(size)

      @cooc_matrix.fread(cooc_file)
      @word_vec.fread(vec_file)
      @word_biases.fread(bias_file)
    end

    # @todo create graph of the word vector matrix
    def visualize
      raise "Not implemented"
    end

    # Get words that relate to :target like :word1 relates to :word2
    #
    # @param [String] word1
    # @param [String] word2
    # @param [String] target
    # @param [Integer] num Number of related words to :target
    # @param [Float] accuracy Allowance in difference of target cosine
    #   and related word cosine distances
    # @example What words relate to atom like quantum relates to physics?
    #   model.analogy_words('quantum', 'physics', 'atom')
    #   # => [["electron", 0.98583], ["energi", 0.98151], ["photon",0.96650]]
    # @return [Array] List of related words to target
    def analogy_words(word1, word2, target, num=3, accuracy=0.0001)
      word1  = word1.stem
      # Stem word2 itself; stemming word1 again made the reference distance
      # cosine(word1, word1) regardless of the word2 argument.
      word2  = word2.stem
      target = target.stem

      distance = cosine(vector(word1), vector(word2))

      vector_distance(target).reject do |item|
        diff = item[1].to_f.abs - distance
        diff.abs < accuracy
      end.take(num)
    end

    # Get most similar words to :word
    #
    # @param [String] word The word to find similar to
    # @param [Integer] num (3) Number of similar words to :word
    # @example Get 1 most similar word to 'physics'
    #   model.most_similar('physics', 1) # => ["quantum", 0.9967993356234444]
    # @return [Array] List of most similar words with cosine distance as values
    def most_similar(word, num=3)
      vector_distance(word.stem).take(num)
    end

    # Prevent token_pairs, matrices and vectors to fill up the terminal
    def inspect
      to_s
    end

    private

    # Perform train iterations
    #
    # @param [Array] indices The non-zero value indices in cooc_matrix
    def train_in_epochs(indices)
      1.upto(epochs) do |epoch|
        shuffled = indices.shuffle
        @word_vec, @word_biases = Workers::TrainingWorker.new(self, shuffled).run
      end
    end

    # Builds the corpus and sets @token_index and @token_pairs
    def fit_corpus(text)
      @corpus =
        if text.is_a? Corpus
          text
        else
          Corpus.build(text, opts)
        end

      @token_index = corpus.index
      @token_pairs = corpus.pairs
    end

    # Create initial values for @word_vec and @word_biases
    def build_word_vectors
      cols = token_index.size
      @word_vec    = GSL::Matrix.rand(cols, num_components)
      @word_biases = GSL::Vector.alloc(cols)
    end

    # Builds the co-occurrence matrix
    def build_cooc_matrix
      @cooc_matrix = Workers::CooccurrenceWorker.new(self).run
    end

    # Array of all non-zero (both row and col) value coordinates in the
    # cooc_matrix
    #
    # @return [Array<(Integer, Integer)>] [row, column] pairs, column by column
    def matrix_nnz
      entries = []
      cooc_matrix.enum_for(:each_col).each_with_index do |col, col_idx|
        # The column iterator already yields each cell value, so there is no
        # need to look the cell up in the matrix a second time.
        col.enum_for(:each).each_with_index do |value, row_idx|
          entries << [row_idx, col_idx] unless value.zero?
        end
      end
      entries
    end

    # Find the vector row of @word_vec for a given word
    #
    # @param [String] word The word to transform into a vector
    # @return [GSL::Vector, nil] The corresponding row of the #word_vec matrix,
    #   or nil when the word is not in the token index
    def vector(word)
      return nil unless word_index = token_index[word]
      word_vec.row(word_index)
    end

    # Calculates the cosine distance of all the words in the vocabulary
    # against a given word. Results are then sorted in DESC order
    #
    # @param [String] word The word to compare against
    # @return [Array<(String, Float)>] Array of tokens and their distance;
    #   empty when the word is not in the token index
    def vector_distance(word)
      word_vector = vector(word)
      return [] if word_vector.nil?

      # Rows are addressed through the index stored in token_index, the same
      # lookup #vector performs, rather than the hash enumeration position.
      distances = token_index.map do |token, index|
        next if token.eql? word
        [token, cosine(word_vector, word_vec.row(index))]
      end

      distances.compact.sort { |a, b| b[1] <=> a[1] }
    end

    # Compute cosine distance between two vectors
    #
    # @param [GSL::Vector] vector1 First vector
    # @param [GSL::Vector] vector2 Second vector
    # @return [Float, Integer] the cosine distance; 0 when either vector is
    #   missing or has zero length
    def cosine(vector1, vector2)
      return 0 if vector1.nil? || vector2.nil?

      magnitude = vector1.norm * vector2.norm
      # Dividing by a zero magnitude yields NaN, which cannot be ordered when
      # the distances are sorted.
      return 0 if magnitude.zero?

      vector1.dot(vector2) / magnitude
    end
  end
end
|
data/lib/glove/parser.rb
ADDED
@@ -0,0 +1,90 @@
|
|
1
|
+
module Glove
  # Takes a string of text and tokenizes it for usage in {Glove::Corpus}
  #
  class Parser
    # Default options (see #initialize)
    DEFAULTS = {
      stem: true,
      min_length: 3,
      max_length: 25,
      alphabetic: true,
      normalize: true,
      stop_words: true
    }

    # @!attribute [r] text
    #   @return [String, Array<String>] the current value of the text
    #     attribute; a String until #split runs, an Array of words afterwards
    #
    attr_reader :text

    # Create a new {Glove::Parser}, passing the text and options as arguments
    #
    # @param [String] text value for the text attribute
    # @param [Hash] options the options to initialize the instance with.
    # @option options [Boolean] :stem (true) Whether to stem the tokens
    # @option options [Boolean] :alphabetic (true) Remove any non-alphabetic chars
    # @option options [Boolean] :normalize (true) Normalize the text and keep
    #   words with length between option[:min_length] and option[:max_length]
    # @option options [Boolean] :stop_words (true) Filter stop words
    # @option options [Integer] :min_length (3) the min allowed length of a word
    # @option options [Integer] :max_length (25) the max allowed length of a word
    # @return [Glove::Parser] A new parser.
    def initialize(text, options={})
      @text, @opt = text, DEFAULTS.dup.merge(options)
    end

    # Call all parsing methods in the class and return the final text value as
    # array of words
    #
    # @return [Array] The tokens array
    def tokenize
      downcase
      stop_words if @opt[:stop_words]
      alphabetic if @opt[:alphabetic]
      split
      normalize  if @opt[:normalize]
      stem       if @opt[:stem]
      text
    end

    # Downcases the text value. Assigns a new string instead of mutating in
    # place, so the string passed to #initialize is left untouched and frozen
    # strings are accepted.
    def downcase
      @text = text.downcase
    end

    # Splits the text string into an array of words
    def split
      @text = text.split
    end

    # Filters out the text leaving only alphabetical characters in words
    # and splits the words. Non-mutating for the same reason as #downcase.
    def alphabetic
      @text = text.gsub(/([^[:alpha:]]+)|((?=\w*[a-z])(?=\w*[0-9])\w+)/, ' ')
    end

    # Stems every member of the text array
    def stem
      text.map!(&:stem)
    end

    # Selects words with length within the :min_length and :max_length boundaries
    def normalize
      text.keep_if do |word|
        word.length.between?(@opt[:min_length], @opt[:max_length])
      end
    end

    # Exclude words that are in the stop words list
    def stop_words
      # The separator group is optional (\W*) so that a final word with no
      # trailing punctuation or whitespace is still captured; requiring \W+
      # silently dropped it.
      @text = text.scan(/(\w+)(\W*)/).reject do |(word, _separator)|
        stop_words_array.include? word
      end.flatten.join
    end

    # Reads the default stop words file and return array of its entries
    def stop_words_array
      @stop_words ||= File.read(File.join(Glove.root_path, 'resources', 'en.stop')).split
    end
  end
end
|
@@ -0,0 +1,15 @@
|
|
1
|
+
module Glove
  # Value holder that couples one token with the list of words found
  # around it.
  class TokenPair
    # @!attribute [rw] token
    #   @return [String] The word/token
    # @!attribute [rw] neighbors
    #   @return [Array<String>] List of neighboring words
    attr_accessor :token, :neighbors

    # Build a pair from a token and its neighbors.
    #
    # @param [String] token The word/token ('' when omitted)
    # @param [Array<String>] neighbors Neighboring words (a new empty array
    #   when omitted)
    def initialize(token='', neighbors=[])
      @token = token
      @neighbors = neighbors
    end
  end
end
|
@@ -0,0 +1,55 @@
|
|
1
|
+
module Glove
  module Workers
    # Constructs the co-occurrence matrix for {Glove::Model}
    class CooccurrenceWorker
      extend ::Forwardable

      # @!attribute [r] token_index
      #   @return [Hash{String=>Integer}] Copy (dup) of @caller.token_index
      # @!attribute [r] token_pairs
      #   @return [Array<(Glove::TokenPair)>] Copy (dup) of @caller.token_pairs
      attr_reader :token_index, :token_pairs

      def_delegators :@caller, :threads

      # Creates instance of the class
      #
      # @param [Glove::Model] caller Caller class
      def initialize(caller)
        @caller = caller
        @token_index = @caller.token_index.dup
        @token_pairs = @caller.token_pairs.dup
      end

      # Perform the building of the matrix
      #
      # @return [GSL::Matrix] The co-occurrence matrix
      def run
        # One array per token_index entry, computed in `threads` processes.
        vectors = Parallel.map(token_index, in_processes: threads) do |slice|
          build_cooc_matrix_col(slice)
        end

        GSL::Matrix.alloc(*vectors)
      end

      # Creates a vector column for the cooc_matrix based on given token.
      # Calculates sum for how many times the word exists in the context of the
      # entire vocabulary
      #
      # @param [Array<(String, Integer)>] slice Token with index
      # @return [Array] GSL::Vector#to_a representation of the column
      def build_cooc_matrix_col(slice)
        token  = slice[0]
        vector = GSL::Vector.alloc(token_index.size)

        token_pairs.each do |pair|
          # Slot of the pair's own token, incremented by how often `token`
          # appears among that pair's neighbors.
          key = token_index[pair.token]
          vector[key] += pair.neighbors.count(token)
        end

        vector.to_a
      end
    end
  end
end
|
@@ -0,0 +1,104 @@
|
|
1
|
+
module Glove
  module Workers
    # Performs the training process on the word vector matrix, as well as word biases
    class TrainingWorker
      extend ::Forwardable

      # @!attribute [rw] indices
      #   @return [Array<(Integer, Integer)>] Shuffled co-occurrence matrix slots
      # @!attribute [rw] word_vec
      #   @return [GSL::Matrix] Copy (dup) of @caller.word_vec
      # @!attribute [rw] word_biases
      #   @return [GSL::Vector] Copy (dup) of @caller.word_biases
      attr_accessor :indices, :word_vec, :word_biases

      def_delegators :@caller, :cooc_matrix, :threads, :max_count, :alpha, :learning_rate

      # Create a {Glove::Workers::TrainingWorker} instance
      # @param [Glove::Model] caller Caller class
      # @param [Array<(Integer, Integer)>] indices Shuffled indices of non-zero
      #   elements in the model's co-occurrence matrix
      def initialize(caller, indices)
        @caller, @indices = caller, indices
        @word_vec    = @caller.word_vec.dup
        @word_biases = @caller.word_biases.dup
      end

      # Runs the calculations
      # @return [GSL::Matrix, GSL::Vector] Weighted word vectors and word biases
      def run
        mutex = Mutex.new
        # Ceiling division keeps the number of slices (and threads) at or
        # below `threads`; the floor of 1 avoids each_slice(0), which raises
        # when there are fewer indices than threads.
        slice_size = [(indices.size + threads - 1) / threads, 1].max

        workers = indices.each_slice(slice_size).map do |slice|
          Thread.new{ work(slice, mutex) }
        end
        workers.each(&:join)

        [word_vec, word_biases]
      end

      # Perform a full train iteration on the word vectors and word biases
      # @param [Array] slice Shuffled co-occurrence matrix slots
      # @param [Mutex] mutex Thread-safe lock on #apply_weights
      def work(slice, mutex)
        slice.each do |slot|
          w1, w2 = slot
          loss, word_a_norm, word_b_norm = calc_weights(w1, w2)

          mutex.synchronize do
            apply_weights(w1, w2, loss, word_a_norm, word_b_norm)
          end
        end
      end

      # Calculates loss, and norms for word1 (row) and word2 (column) by given
      # indices
      #
      # @param [Integer] w1 Row index
      # @param [Integer] w2 Column index
      # @param [Float] prediction (0.0) Initial prediction value
      # @param [Float] word_a_norm (0.0) Initial norm of word at row w1
      # @param [Float] word_b_norm (0.0) Initial norm of word at col w2
      # @return [Float, Float, Float] Array of loss, word_a_norm, word_b_norm
      def calc_weights(w1, w2, prediction=0.0, word_a_norm=0.0, word_b_norm = 0.0)
        count = cooc_matrix[w1, w2]

        word_vec.each_col do |col|
          w1_context = col[w1]
          w2_context = col[w2]

          # The prediction is the dot product of the two word vectors, so each
          # component contributes the product of the pair, not their sum.
          prediction = prediction + w1_context * w2_context
          word_a_norm += w1_context * w1_context
          word_b_norm += w2_context * w2_context
        end

        prediction   = prediction + word_biases[w1] + word_biases[w2]
        word_a_norm  = Math.sqrt(word_a_norm)
        word_b_norm  = Math.sqrt(word_b_norm)
        # fdiv guarantees floating-point division even for an integer count.
        entry_weight = [1.0, count.fdiv(max_count)].min ** alpha
        loss         = entry_weight * (prediction - Math.log(count))

        [loss, word_a_norm, word_b_norm]
      end

      # Applies calculated weights to @word_vec and @word_biases. MUST be called
      # in a Mutex#synchronize block
      #
      # @param [Integer] w1 Row index
      # @param [Integer] w2 Column index
      # @param [Float] loss Loss value
      # @param [Float] word_a_norm Norm of word at row w1
      # @param [Float] word_b_norm Norm of word at col w2
      def apply_weights(w1, w2, loss, word_a_norm, word_b_norm)
        word_vec.each_col do |col|
          # Read both components before writing either: each word's update is
          # driven by the *other* word's current component.
          elem_a = col[w1]
          elem_b = col[w2]

          col[w1] = (elem_a - learning_rate * loss * elem_b) / word_a_norm
          col[w2] = (elem_b - learning_rate * loss * elem_a) / word_b_norm
        end

        word_biases[w1] -= learning_rate * loss
        word_biases[w2] -= learning_rate * loss
      end
    end
  end
end
|