glove 0.1.1

Sign up to get free protection for your applications and to get access to all the features.
Binary file
Binary file
@@ -0,0 +1,77 @@
1
+ require 'spec_helper'
2
+
3
# Specs for Glove::Corpus, driven by one short sentence and an explicit
# option set (window of 3, min_count of 2, stop-word filtering disabled).
describe Glove::Corpus do
  let(:text)   { "the quick brown fox jumped over the lazy dog" }
  let(:opt)    { { window: 3, min_count: 2, stop_words: false } }
  let(:corpus) { described_class.new(text, opt) }

  describe '.build(text, options)' do
    it 'forwards args to #initialize and calls #build_tokens on the instance' do
      expect_any_instance_of(described_class).to receive(:build_tokens)

      described_class.build(text)
    end
  end

  describe '.new(text, options)' do
    it 'gets parsed tokens from Parser class' do
      expect(corpus.tokens).to be_an(Array)
    end

    it 'sets options as instance variables' do
      expect(corpus.window).to eq(opt[:window])
      expect(corpus.min_count).to eq(opt[:min_count])
    end
  end

  describe '#build_tokens' do
    it 'calls #build_count, #build_index, #build_pairs and returns self' do
      # Each build step is mocked, so only the orchestration is verified here.
      %i[build_count build_index build_pairs].each do |step|
        expect(corpus).to receive(step)
      end

      expect(corpus.build_tokens).to be_instance_of(described_class)
    end
  end

  describe '#count' do
    it 'constructs a token count hash' do
      # 'the' is the only token occurring at least min_count (2) times.
      expect(corpus.count).to eq({ 'the' => 2 })
    end
  end

  describe '#index' do
    before { corpus.build_count }

    it 'constructs a token index hash' do
      expect(corpus.index).to eq({ 'the' => 0 })
    end
  end

  describe '#pairs' do
    before { corpus.build_count }

    it 'constructs array of token pairs with neighbors based on window opt' do
      leading  = corpus.pairs.first
      trailing = corpus.pairs.last

      expect(leading.neighbors).to eq(%w[quick brown fox])
      expect(trailing.neighbors).to eq(%w[fox jump over lazi dog])
    end
  end

  describe '#token_neighbors(word, index)' do
    # No :window is passed here, so the corpus uses whatever its default is.
    let(:corpus) { described_class.new(text, stop_words: false, min_count: 1) }

    before { corpus.build_count }

    it "returns window number of neighbors on each side" do
      expect(corpus.token_neighbors('jump', 4)).to eq(%w[brown fox over the])
    end
  end
end
@@ -0,0 +1,208 @@
1
+ require 'spec_helper'
2
+
3
# Specs for Glove::Model: construction defaults, the #fit pipeline,
# persistence (#save / #load), similarity queries and the private
# training helpers (invoked through #send).
describe Glove::Model do
  let(:text) { 'the quick brown fox jumped over the lazy dog' }
  let(:model) { Glove::Model.new }

  describe '.new(options)' do
    it 'sets options as instance variables' do
      expect(model.threads).to eq(Glove::Model::DEFAULTS[:threads])
    end

    it 'sets cooc_matrix, word_vec and word_biases to nil' do
      expect(model.cooc_matrix).to be_nil
      expect(model.word_vec).to be_nil
      expect(model.word_biases).to be_nil
    end
  end

  describe '#fit(text)' do
    # Every pipeline step is stubbed so each example asserts on one step only.
    before do
      allow(model).to receive(:fit_corpus).with(text)
      allow(model).to receive(:build_cooc_matrix)
      allow(model).to receive(:build_word_vectors)
    end

    it 'calls its internal method #fit_corpus to build the corpus obj' do
      expect(model).to receive(:fit_corpus).with(text)
      model.fit(text)
    end

    it 'calls its internal method #build_cooc_matrix to build the co-occurrence matrix' do
      expect(model).to receive(:build_cooc_matrix)
      model.fit(text)
    end

    it 'calls its internal method #build_word_vectors to build the word vectors' do
      expect(model).to receive(:build_word_vectors)
      model.fit(text)
    end
  end

  describe '#fit_corpus(text)' do
    before do
      model.send :fit_corpus, text
    end

    it "builds a corpus object from text string argument" do
      expect(model.corpus).to be_instance_of Glove::Corpus
    end

    it "sets @token_index and @token_pairs vars" do
      expect(model.token_pairs).not_to be_nil
      expect(model.token_index).not_to be_nil
    end
  end

  describe '#build_word_vectors' do
    before do
      allow(model).to receive(:token_index).and_return([0, 1, 2, 3, 4])
      model.send :build_word_vectors
    end

    it 'creates @word_vec matrix with random floats' do
      expect(model.word_vec.isnull?).to eq(false)
    end

    it 'creates @word_biases vector with zeros' do
      expect(model.word_biases.isnull?).to eq(true)
    end
  end

  describe '#train' do
    let(:cooc_matrix) { GSL::Matrix.rand(4, 4) }

    before do
      allow(model).to receive(:cooc_matrix).and_return(cooc_matrix)
      allow(model).to receive(:train_in_epochs)
    end

    it 'calls the #train_in_epochs method' do
      expect(model).to receive(:train_in_epochs)
      model.train
    end
  end

  context "IO" do
    let(:corpus) { Glove::Corpus.build('quick brown fox', min_count: 1, stop_words: false) }
    let(:cooc)   { GSL::Matrix.zeros(3, 3) }
    let(:words)  { GSL::Matrix.zeros(3, Glove::Model::DEFAULTS[:num_components]) }
    let(:biases) { GSL::Vector.alloc([1, 2, 3]) }

    describe '#save' do
      let(:files) do
        %w(corpus.bin cooc.bin words.bin biases.bin).map do |f|
          File.join(fixtures_path, f)
        end
      end

      before(:each) do
        model.instance_variable_set(:@cooc_matrix, cooc)
        model.instance_variable_set(:@corpus, corpus)
        model.instance_variable_set(:@word_vec, words)
        model.instance_variable_set(:@word_biases, biases)
      end

      # Clean up in a hook rather than at the end of the example, so the
      # generated files are removed even when an expectation fails.
      after(:each) do
        files.each { |f| File.delete(f) if File.exist?(f) }
      end

      it "dumps corpus, cooc_matrix, word_vec and word_biases to files" do
        model.save(*files)

        files.each do |file|
          expect(File.size(file)).to be > 0
        end
      end
    end

    describe '#load' do
      # NOTE(review): these *-t.bin files must already exist under
      # fixtures_path; presumably they were dumped from the corpus/cooc/
      # words/biases values defined above -- confirm against the fixtures.
      let(:files) do
        %w(corpus-t.bin cooc-t.bin words-t.bin biases-t.bin).map do |f|
          File.join(fixtures_path, f)
        end
      end

      before(:each) do
        model.load(*files)
      end

      it 'loads corpus data from file as first argument' do
        expect(model.corpus.tokens).to eq(corpus.tokens)
      end

      it 'loads cooc_matrix data from file as second argument' do
        expect(model.cooc_matrix).to eq(cooc)
      end

      it 'loads word_vec data from file as third argument' do
        expect(model.word_vec).to eq(words)
      end

      it 'loads word_biases data from file as fourth argument' do
        expect(model.word_biases).to eq(biases)
      end
    end
  end

  describe '#visualize' do
    pending
  end

  describe '#analogy_words(word1, word2, target, num, accuracy)' do
    let(:distances) { [["electron", 0.98583], ["radiation", 0.99998]] }
    let(:target) { 'atom' }
    let(:pair_cosine) { 0.99999 }

    # Vector lookups and distance maths are stubbed; only the filtering of
    # the canned distances against the pair cosine is under test.
    before do
      allow(model).to receive(:vector).and_return(0)
      allow(model).to receive(:cosine).and_return(pair_cosine)
      allow(model).to receive(:vector_distance).and_return(distances)
    end

    it 'returns the distances whose diff between the pair distance is less than accuracy arg' do
      words = model.analogy_words('quantum', 'physics', target).flatten

      expect(words).to include('electron')
      expect(words).not_to include('radiation')
    end
  end

  describe '#most_similar(word, num)' do
    let(:distances) { [["electron", 0.98583], ["radiation", 0.99998]] }

    before do
      allow(model).to receive(:vector_distance).and_return(distances)
    end

    it 'returns closest vectors to given word' do
      words = model.most_similar('atom', 1).flatten

      expect(words).to include('electron')
      expect(words).not_to include('radiation')
    end
  end

  describe '#train_in_epochs(indices)' do
    let(:worker) { double(:train, run: nil) }
    let(:epochs) { Glove::Model::DEFAULTS[:epochs] }

    before do
      allow(Glove::Workers::TrainingWorker).to receive(:new).and_return(worker)
    end

    it 'calls a training worker exactly @epochs times' do
      expect(worker).to receive(:run).exactly(epochs).times

      model.send :train_in_epochs, []
    end
  end

  describe '#matrix_nnz' do
    let(:matrix) { GSL::Matrix[[0, 9], [3, 0]] }

    before do
      allow(model).to receive(:cooc_matrix).and_return(matrix)
    end

    it 'gets all non-zero value indices in the cooc_matrix' do
      nnz = model.send :matrix_nnz
      expect(nnz).to eq([[1, 0], [0, 1]])
    end
  end
end
@@ -0,0 +1,55 @@
1
+ require 'spec_helper'
2
+
3
# Specs for Glove::Parser. The fixture sentence deliberately contains a
# mixed-case two-letter word ('Fx'), an inflected verb ('jumps'), stop words
# ('the') and a token containing a digit ('d0g') so each filter has a target.
describe Glove::Parser do
  let(:text) { "the quick brown Fx jumps over the lazy d0g" }
  let(:parser) { described_class.new(text) }

  describe '#tokenize' do
    let(:tokens) { %w(quick brown jump lazi) }

    it "tokenizes the text string" do
      expect(parser.tokenize).to eq(tokens)
    end
  end

  describe '#downcase' do
    it "downcases all letters" do
      expect(parser.downcase).to eq text.downcase
    end
  end

  describe '#split' do
    it "splits the text string into an array" do
      expect(parser.split).to be_a Array
    end
  end

  describe '#alphabetic' do
    it "leaves only words that do not contain any numbers" do
      # Assert on 'd0g', the digit-bearing token actually present in the
      # fixture text; a token that never appears would pass vacuously.
      expect(parser.alphabetic).not_to include('d0g')
    end
  end

  describe '#stem' do
    it "stems all words in the text array" do
      parser.split

      expect(parser.stem).not_to include('jumps')
      expect(parser.stem).to include('jump')
    end
  end

  describe '#normalize' do
    it "removes words whose length is not within specified boundary" do
      parser.split

      expect(parser.normalize).not_to include('Fx')
    end
  end

  describe '#stop_words' do
    it "filters all stop words from the text" do
      expect(parser.stop_words).not_to include('the')
    end
  end
end
@@ -0,0 +1,14 @@
1
+ require 'spec_helper'
2
+
3
# Specs for Glove::TokenPair: the constructor must expose the token and its
# neighbor list through readers.
describe Glove::TokenPair do
  let(:token)     { 'fox' }
  let(:neighbors) { %w[brown jump] }
  let(:pair)      { described_class.new(token, neighbors) }

  describe '.new(word, neighbors)' do
    it 'sets the token and neighbors variables' do
      expect(pair.token).to eq(token)
      expect(pair.neighbors).to eq(neighbors)
    end
  end
end
@@ -0,0 +1,61 @@
1
+ require 'spec_helper'
2
+
3
# Specs for Glove::Workers::CooccurrenceWorker. The worker is built around a
# test double exposing #token_index, #token_pairs and #threads.
describe Glove::Workers::CooccurrenceWorker do
  let(:index) { {'quick' => 0, 'brown' => 1, 'fox' => 2} }
  # One TokenPair per indexed word, in index order.
  let(:pairs) do
    index.keys.map { |word| Glove::TokenPair.new(word) }
  end
  let(:threads) { 0 }
  # Named `owner` rather than `caller` so the helper does not shadow
  # Kernel#caller inside the example group.
  let(:owner) do
    double(:caller, token_index: index, token_pairs: pairs, threads: threads)
  end
  let(:worker) { described_class.new(owner) }

  describe '.new' do
    it "keeps reference of the caller class" do
      expect(worker.instance_variable_get(:@caller)).to eq(owner)
    end

    it "dupes token_index off the caller" do
      expect(worker.token_index).to eq(index)
    end

    it "dupes token_pairs off the caller" do
      expect(worker.token_pairs).to eq(pairs)
    end
  end

  describe '#threads' do
    it "delegates method to @caller" do
      expect(worker.threads).to eq(threads)
    end
  end

  describe '#run' do
    before do
      allow(worker).to receive(:build_cooc_matrix_col).and_return([0, 1, 2, 3])
    end

    it 'calls #build_cooc_matrix_col in parallel processes' do
      expect(worker).to receive(:build_cooc_matrix_col).exactly(index.size).times
      worker.run
    end

    it 'converts the vector results into a matrix' do
      expect(worker.run).to be_a GSL::Matrix
    end
  end

  describe '#build_cooc_matrix_col' do
    # Make 'fox' a neighbor of the first pair ('quick') before the worker
    # is instantiated.
    before do
      pairs[0].neighbors << 'fox'
    end

    it 'builds the vector co-occurrence representation of a given token' do
      result = worker.build_cooc_matrix_col(['fox', 2])

      expect(result.size).to eq(index.size)
      expect(result[0]).to eq(1)
    end
  end
end
@@ -0,0 +1,84 @@
1
+ require 'spec_helper'
2
+
3
# Specs for Glove::Workers::TrainingWorker, run against a real Glove::Model
# fitted on a four-word text.
describe Glove::Workers::TrainingWorker do
  let(:text) { 'quick fox brown fox' }
  let(:opt) { {min_count: 1, stop_words: false, threads: 0} }
  let(:model) { Glove::Model.new(opt).fit(text) }
  # First entry returned by the model's private #matrix_nnz; the examples
  # below read it as a two-element pair via index[0] and index[1].
  let(:index) { model.send(:matrix_nnz)[0] }
  let(:worker) { described_class.new(model, [index]) }

  describe '.new' do
    it 'dupes caller\'s :word_vec attribute ' do
      expect(worker.word_vec).to eq(model.word_vec)
    end

    it 'dupes caller\'s :word_biases attribute ' do
      expect(worker.word_biases).to eq(model.word_biases)
    end
  end

  describe '#run' do
    before do
      allow(model).to receive(:threads).and_return(1)
      allow(worker).to receive(:work)
    end

    it 'runs the #work method :threads number of times' do
      expect(worker).to receive(:work).exactly(1).times
      worker.run
    end

    it 'returns array of :word_vec and :word_biases after running the transforms' do
      expect(worker.run).to eq([model.word_vec, model.word_biases])
    end
  end

  describe '#work' do
    let(:loss) { 1 }
    let(:word_a_norm) { 1 }
    let(:word_b_norm) { 1 }

    # The stub returns the triple in the same order that #apply_weights is
    # expected to receive it: loss, norm of word a, norm of word b.
    before do
      allow(worker).to receive(:calc_weights).with(index[0], index[1]).
        and_return([loss, word_a_norm, word_b_norm])
    end

    it 'calculates loss, and norm for each matrix index and applies the new values' do
      expect(worker).to receive(:calc_weights).exactly(1).times
      expect(worker).to receive(:apply_weights).
        with(index[0], index[1], loss, word_a_norm, word_b_norm)

      worker.work([index], Mutex.new)
    end
  end

  describe '#calc_weights' do
    it 'performs the calculation and returns loss and norm' do
      loss, norm1, norm2 = worker.calc_weights(index[0], index[1])

      expect(loss).not_to eq(0)
      expect(loss).not_to eq(norm1)
      expect(loss).not_to eq(norm2)
    end
  end

  describe '#apply_weights' do
    before do
      worker.apply_weights(index[0], index[1], 1, 1, 1)
    end

    it "applies weights on the :word_vec matrix" do
      expect(worker.word_vec[0,0]).not_to eq(model.word_vec[0,0])
    end

    it 'applied loss reduction on :word_biases' do
      bias1 = worker.word_biases[index[0]]
      bias2 = worker.word_biases[index[1]]
      model_bias1 = model.word_biases[index[0]]
      model_bias2 = model.word_biases[index[1]]

      expect(bias1).not_to eq(model_bias1)
      expect(bias2).not_to eq(model_bias2)
    end
  end
end