glove 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Binary file
Binary file
@@ -0,0 +1,77 @@
1
+ require 'spec_helper'
2
+
3
# Specs for Glove::Corpus, exercised against one short sentence.
describe Glove::Corpus do
  let(:text) { "the quick brown fox jumped over the lazy dog" }
  let(:opt) { {window: 3, min_count: 2, stop_words: false} }
  let(:corpus) { described_class.new(text, opt) }

  describe '.build(text, options)' do
    it 'forwards args to #initialize and calls #build_tokens on the instance' do
      # Check the forwarding half of the description too: both arguments must
      # reach .new unchanged, while the real constructor still runs.
      expect(described_class).to receive(:new).with(text, opt).and_call_original
      expect_any_instance_of(described_class).to receive(:build_tokens)

      described_class.build(text, opt)
    end
  end

  describe '.new(text, options)' do
    it 'gets parsed tokens from Parser class' do
      expect(corpus.tokens).to be_a Array
    end

    it 'sets options as instance variables' do
      expect(corpus.window).to eq(opt[:window])
      expect(corpus.min_count).to eq(opt[:min_count])
    end
  end

  describe '#build_tokens' do
    it 'calls #build_count, #build_index, #build_pairs and returns self' do
      # The three builders are replaced by message expectations, so only the
      # orchestration and the return value are verified here.
      expect(corpus).to receive(:build_count)
      expect(corpus).to receive(:build_index)
      expect(corpus).to receive(:build_pairs)
      expect(corpus.build_tokens).to be_instance_of described_class
    end
  end

  describe '#count' do
    it 'constructs a token count hash' do
      # 'the' is the only word that appears twice in the fixture text;
      # presumably it is the sole survivor of min_count: 2.
      expect(corpus.count).to eq({'the' => 2})
    end
  end

  describe '#index' do
    before do
      corpus.build_count
    end

    it 'constructs a token index hash' do
      expect(corpus.index).to eq({'the' => 0})
    end
  end

  describe '#pairs' do
    before do
      corpus.build_count
    end

    it 'constructs array of token pairs with neighbors based on window opt' do
      first_pair = corpus.pairs.first
      last_pair = corpus.pairs.last

      # Expected neighbors are listed in their stemmed form ('jump', 'lazi').
      expect(first_pair.neighbors).to eq %w(quick brown fox)
      expect(last_pair.neighbors).to eq %w(fox jump over lazi dog)
    end
  end

  describe '#token_neighbors(word, index)' do
    # Overrides the outer corpus: no window is passed and min_count is 1.
    let(:corpus) { described_class.new(text, stop_words: false, min_count: 1) }
    before do
      corpus.build_count
    end

    it "returns window number of neighbors on each side" do
      neighbors = corpus.token_neighbors('jump', 4)
      expect(neighbors).to eq(['brown', 'fox', 'over', 'the'])
    end
  end
end
@@ -0,0 +1,208 @@
1
+ require 'spec_helper'
2
+
3
# Specs for Glove::Model: option defaults, the fit/train pipeline,
# save/load round-tripping and the similarity helpers.
describe Glove::Model do
  let(:text) { 'the quick brown fox jumped over the lazy dog' }
  let(:model) { Glove::Model.new }

  describe '.new(options)' do
    it 'sets options as instance variables' do
      expect(model.threads).to eq(Glove::Model::DEFAULTS[:threads])
    end

    it 'sets cooc_matrix, word_vec and word_biases to nil' do
      expect(model.cooc_matrix).to be_nil
      expect(model.word_vec).to be_nil
      expect(model.word_biases).to be_nil
    end
  end

  describe '#fit(text)' do
    # All three pipeline steps are stubbed so each example can assert on one
    # step without running the others.
    before do
      allow(model).to receive(:fit_corpus).with(text)
      allow(model).to receive(:build_cooc_matrix)
      allow(model).to receive(:build_word_vectors)
    end

    it 'calls its internal method #fit_corpus to build the corpus obj' do
      expect(model).to receive(:fit_corpus).with(text)
      model.fit(text)
    end

    it 'calls its internal method #build_cooc_matrix to build the co-occurrence matrix' do
      expect(model).to receive(:build_cooc_matrix)
      model.fit(text)
    end

    it 'calls its internal method #build_word_vectors to build the word vectors' do
      expect(model).to receive(:build_word_vectors)
      model.fit(text)
    end
  end

  describe '#fit_corpus(text)' do
    before do
      # Invoked through #send, as the other internal helpers in this file are.
      model.send :fit_corpus, text
    end

    it "build a corpus object from text string argument" do
      expect(model.corpus).to be_instance_of Glove::Corpus
    end

    it "sets @token_index and @token_pairs vars" do
      expect(model.token_pairs).not_to be_nil
      expect(model.token_index).not_to be_nil
    end
  end

  describe '#build_word_vectors' do
    before do
      allow(model).to receive(:token_index).and_return([0,1,2,3,4])
      model.send :build_word_vectors
    end

    it 'creates @word_vec matrix with random floats' do
      expect(model.word_vec.isnull?).to eq(false)
    end

    it 'creates @word_biases vector with zeros' do
      expect(model.word_biases.isnull?).to eq(true)
    end
  end

  describe '#train' do
    let(:cooc_matrix) { GSL::Matrix.rand(4,4) }
    before do
      allow(model).to receive(:cooc_matrix).and_return(cooc_matrix)
      allow(model).to receive(:train_in_epochs)
    end

    it 'calls the #train_in_epochs method' do
      expect(model).to receive(:train_in_epochs)
      model.train
    end
  end

  context "IO" do
    let(:corpus) { Glove::Corpus.build('quick brown fox', min_count: 1, stop_words: false) }
    let(:cooc) { GSL::Matrix.zeros(3,3) }
    let(:words) { GSL::Matrix.zeros(3, Glove::Model::DEFAULTS[:num_components]) }
    let(:biases) { GSL::Vector.alloc([1,2,3]) }

    describe '#save' do
      let(:files) do
        %w(corpus.bin cooc.bin words.bin biases.bin).map do |f|
          File.join(fixtures_path, f)
        end
      end

      before(:each) do
        model.instance_variable_set(:@cooc_matrix, cooc)
        model.instance_variable_set(:@corpus, corpus)
        model.instance_variable_set(:@word_vec, words)
        model.instance_variable_set(:@word_biases, biases)
      end

      # Delete the dumps in a hook rather than at the end of the example, so
      # they are removed even when an expectation fails part-way through.
      after(:each) do
        files.each { |f| File.delete(f) if File.exist?(f) }
      end

      it "dumps corpus, cooc_matrix, word_vec and word_biases to files" do
        model.save(*files)

        files.each do |file|
          expect(File.size(file)).to be > 0
        end
      end
    end

    describe '#load' do
      # These "-t" files are distinct from the ones #save writes and deletes.
      let(:files) do
        %w(corpus-t.bin cooc-t.bin words-t.bin biases-t.bin).map do |f|
          File.join(fixtures_path, f)
        end
      end

      before(:each) do
        model.load(*files)
      end

      it 'loads corpus data from file as first argument' do
        expect(model.corpus.tokens).to eq(corpus.tokens)
      end

      it 'loads cooc_matrix data from file as second argument' do
        expect(model.cooc_matrix).to eq(cooc)
      end

      it 'loads word_vec data from file as third argument' do
        expect(model.word_vec).to eq(words)
      end

      it 'loads word_biases data from file as fourth argument' do
        expect(model.word_biases).to eq(biases)
      end
    end
  end

  describe '#visualize' do
    pending
  end

  describe '#analogy_words(word1, word2, target, num, accuracy)' do
    let(:distances) { [["electron", 0.98583], ["radiation", 0.99998]] }
    let(:target) { 'atom' }
    let(:pair_cosine) { 0.99999 }

    # Vector lookups and distance calculations are stubbed with fixed values,
    # so the example only exercises how analogy_words filters them.
    before do
      allow(model).to receive(:vector).and_return(0)
      allow(model).to receive(:cosine).and_return(pair_cosine)
      allow(model).to receive(:vector_distance).and_return(distances)
    end

    it 'returns the distances whose diff between the pair distance is less than accuracy arg' do
      words = model.analogy_words('quantum', 'physics', target).flatten

      expect(words).to include('electron')
      expect(words).not_to include('radiation')
    end
  end

  describe '#most_similar(word, num)' do
    let(:distances) { [["electron", 0.98583], ["radiation", 0.99998]] }

    before do
      allow(model).to receive(:vector_distance).and_return(distances)
    end

    it 'returns closest vectors to given word' do
      words = model.most_similar('atom', 1).flatten

      expect(words).to include('electron')
      expect(words).not_to include('radiation')
    end
  end

  describe '#train_in_epochs(indices)' do
    let(:worker) { double(:train, run: nil) }
    let(:epochs) { Glove::Model::DEFAULTS[:epochs] }
    before do
      # Every TrainingWorker.new returns the same double, so #run calls can
      # be counted across epochs.
      allow(Glove::Workers::TrainingWorker).to receive(:new).and_return(worker)
    end
    it 'calls a training worker exactly @epochs times' do
      expect(worker).to receive(:run).exactly(epochs).times

      model.send :train_in_epochs, []
    end
  end

  describe '#matrix_nnz' do
    let(:matrix) { GSL::Matrix[[0,9], [3,0]] }

    before do
      allow(model).to receive(:cooc_matrix).and_return(matrix)
    end

    it 'gets all non-zero value indices in the cooc_matrix' do
      nnz = model.send :matrix_nnz
      expect(nnz).to eq([[1,0], [0,1]])
    end
  end
end
@@ -0,0 +1,55 @@
1
+ require 'spec_helper'
2
+
3
# Specs for Glove::Parser. The fixture text deliberately contains a
# capitalised two-letter word ("Fx"), an inflected verb ("jumps"), a
# digit-bearing token ("d0g") and the word "the", one for each filter below.
describe Glove::Parser do
  let(:text) { "the quick brown Fx jumps over the lazy d0g" }
  let(:parser) { described_class.new(text) }

  describe '#tokenize' do
    let(:tokens) { %w(quick brown jump lazi) }

    it "tokenizes the text string" do
      expect(parser.tokenize).to eq(tokens)
    end
  end

  describe '#downcase' do
    it "downcases all letters" do
      expect(parser.downcase).to eq text.downcase
    end
  end

  describe '#split' do
    it "splits the text string into an array" do
      expect(parser.split).to be_a Array
    end
  end

  describe '#alphabetic' do
    it "leaves only words that do not contain any numbers" do
      # Assert on the token that actually occurs in the fixture: the previous
      # check looked for 'b2b', which is never in the text and so could not fail.
      expect(parser.alphabetic).not_to include('d0g')
    end
  end

  describe '#stem' do
    it "stems all words in the text array" do
      parser.split

      expect(parser.stem).not_to include('jumps')
      expect(parser.stem).to include('jump')
    end
  end

  describe '#normalize' do
    it "removes words whose length is not within specified boundary" do
      parser.split

      expect(parser.normalize).not_to include('Fx')
    end
  end

  describe '#stop_words' do
    it "filters all stop words from the text" do
      expect(parser.stop_words).not_to include('the')
    end
  end
end
@@ -0,0 +1,14 @@
1
+ require 'spec_helper'
2
+
3
# Spec for Glove::TokenPair: the constructor must expose both of its
# arguments through the #token and #neighbors readers.
describe Glove::TokenPair do
  let(:word) { 'fox' }
  let(:adjacent_words) { ['brown', 'jump'] }
  let(:token_pair) { described_class.new(word, adjacent_words) }

  describe '.new(word, neighbors)' do
    it 'sets the token and neighbors variables' do
      expect(token_pair.token).to eq(word)
      expect(token_pair.neighbors).to eq(adjacent_words)
    end
  end
end
@@ -0,0 +1,61 @@
1
+ require 'spec_helper'
2
+
3
# Specs for Glove::Workers::CooccurrenceWorker, driven by a test double that
# stands in for the owning model and exposes token_index, token_pairs and
# threads.
describe Glove::Workers::CooccurrenceWorker do
  let(:index) { {'quick' => 0, 'brown' => 1, 'fox' => 2} }
  let(:pairs) do
    # One pair per vocabulary word; only the keys are needed here.
    index.keys.map { |word| Glove::TokenPair.new(word) }
  end
  let(:threads) { 0 }
  # Named :model rather than :caller so the helper does not shadow
  # Kernel#caller inside the examples.
  let(:model) do
    double(:caller, token_index: index, token_pairs: pairs, threads: threads)
  end
  let(:worker) { described_class.new(model) }

  describe '.new' do
    it "keeps reference of the caller class" do
      expect(worker.instance_variable_get(:@caller)).to eq(model)
    end

    it "dupes token_index off the caller" do
      expect(worker.token_index).to eq(index)
    end

    it "dupes token_pairs off the caller" do
      expect(worker.token_pairs).to eq(pairs)
    end
  end

  describe '#threads' do
    it "delegates method to @caller" do
      expect(worker.threads).to eq(threads)
    end
  end

  describe '#run' do
    before do
      # The column builder is stubbed with a fixed result; these examples
      # cover the fan-out and the return type only.
      allow(worker).to receive(:build_cooc_matrix_col).and_return([0,1,2,3])
    end

    it 'calls #build_cooc_matrix_col in parallel processes' do
      expect(worker).to receive(:build_cooc_matrix_col).exactly(index.size).times
      worker.run
    end

    it 'converts the vector results into a matrix' do
      expect(worker.run).to be_a GSL::Matrix
    end
  end

  describe '#build_cooc_matrix_col' do
    before do
      # Make 'fox' a neighbor of the first pair ('quick') so the column built
      # for 'fox' has a non-zero entry at position 0.
      pairs[0].neighbors << 'fox'
    end

    it 'builds the vector co-occurrence representation of a given token' do
      result = worker.build_cooc_matrix_col(['fox', 2])

      expect(result.size).to eq(index.size)
      expect(result[0]).to eq(1)
    end
  end
end
@@ -0,0 +1,84 @@
1
+ require 'spec_helper'
2
+
3
# Specs for Glove::Workers::TrainingWorker, built on a real model fitted on a
# four-word text and restricted to the first non-zero co-occurrence index.
describe Glove::Workers::TrainingWorker do
  let(:text) { 'quick fox brown fox' }
  let(:opt) { {min_count: 1, stop_words: false, threads: 0} }
  let(:model) { Glove::Model.new(opt).fit(text) }
  # First entry returned by the model's matrix_nnz helper; used below as a
  # two-element index pair (index[0], index[1]).
  let(:index) { model.send(:matrix_nnz)[0] }
  let(:worker) { described_class.new(model, [index]) }

  describe '.new' do
    it 'dupes caller\'s :word_vec attribute' do
      expect(worker.word_vec).to eq(model.word_vec)
    end

    it 'dupes caller\'s :word_biases attribute' do
      expect(worker.word_biases).to eq(model.word_biases)
    end
  end

  describe '#run' do
    before do
      allow(model).to receive(:threads).and_return(1)
      allow(worker).to receive(:work)
    end

    it 'runs the #work method :threads number of times' do
      expect(worker).to receive(:work).exactly(1).times
      worker.run
    end

    it 'returns array of :word_vec and :word_biases after running the transforms' do
      expect(worker.run).to eq([model.word_vec, model.word_biases])
    end
  end

  describe '#work' do
    let(:loss) { 1 }
    let(:word_a_norm) { 1 }
    let(:word_b_norm) { 1 }

    before do
      # Return the norms in (a, b) order so the stub matches what the example
      # expects #apply_weights to receive. The earlier duplicate of
      # word_b_norm was masked only because both values are 1.
      allow(worker).to receive(:calc_weights).with(index[0], index[1]).
        and_return([loss, word_a_norm, word_b_norm])
    end

    it 'calculates loss, and norm for each matrix index and applies the new values' do
      expect(worker).to receive(:calc_weights).exactly(1).times
      expect(worker).to receive(:apply_weights).
        with(index[0], index[1], loss, word_a_norm, word_b_norm)

      worker.work([index], Mutex.new)
    end
  end

  describe '#calc_weights' do
    it 'performs the calculation and returns loss and norm' do
      loss, norm1, norm2 = worker.calc_weights(index[0], index[1])

      expect(loss).not_to eq(0)
      expect(loss).not_to eq(norm1)
      expect(loss).not_to eq(norm2)
    end
  end

  describe '#apply_weights' do
    before do
      worker.apply_weights(index[0], index[1], 1, 1, 1)
    end

    it "applies weights on the :word_vec matrix" do
      expect(worker.word_vec[0,0]).not_to eq(model.word_vec[0,0])
    end

    it 'applies loss reduction on :word_biases' do
      bias1 = worker.word_biases[index[0]]
      bias2 = worker.word_biases[index[1]]
      model_bias1 = model.word_biases[index[0]]
      model_bias2 = model.word_biases[index[1]]

      expect(bias1).not_to eq(model_bias1)
      expect(bias2).not_to eq(model_bias2)
    end
  end
end