glove 0.1.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +19 -0
- data/.rspec +1 -0
- data/.travis.yml +12 -0
- data/Gemfile +7 -0
- data/LICENSE.txt +22 -0
- data/README.md +122 -0
- data/Rakefile +7 -0
- data/benchmark/benchmark.rb +39 -0
- data/benchmark/co-occurrence.rb +85 -0
- data/benchmark/data/quantum-physics.txt +1 -0
- data/benchmark/profile.rb +19 -0
- data/benchmark/results/.keep +0 -0
- data/glove.gemspec +28 -0
- data/lib/glove.rb +18 -0
- data/lib/glove/corpus.rb +103 -0
- data/lib/glove/model.rb +247 -0
- data/lib/glove/parser.rb +90 -0
- data/lib/glove/token_pair.rb +15 -0
- data/lib/glove/version.rb +3 -0
- data/lib/glove/workers.rb +8 -0
- data/lib/glove/workers/cooccurrence_worker.rb +55 -0
- data/lib/glove/workers/training_worker.rb +104 -0
- data/resources/en.stop +711 -0
- data/spec/fixtures/biases-t.bin +0 -0
- data/spec/fixtures/cooc-t.bin +0 -0
- data/spec/fixtures/corpus-t.bin +0 -0
- data/spec/fixtures/words-t.bin +0 -0
- data/spec/lib/glove/corpus_spec.rb +77 -0
- data/spec/lib/glove/model_spec.rb +208 -0
- data/spec/lib/glove/parser_spec.rb +55 -0
- data/spec/lib/glove/token_pair_spec.rb +14 -0
- data/spec/lib/glove/workers/cooccurrence_worker_spec.rb +61 -0
- data/spec/lib/glove/workers/training_worker_spec.rb +84 -0
- data/spec/spec_helper.rb +18 -0
- metadata +177 -0
Binary file
|
Binary file
|
Binary file
|
Binary file
|
@@ -0,0 +1,77 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
# Specs for Glove::Corpus: construction from a text string, option handling,
# and the count / index / pairs structures exposed by the instance.
describe Glove::Corpus do
  let(:text)   { "the quick brown fox jumped over the lazy dog" }
  let(:opt)    { {window: 3, min_count: 2, stop_words: false} }
  let(:corpus) { described_class.new(text, opt) }

  describe '.build(text, options)' do
    it 'forwards args to #initialize and calls #build_tokens on the instance' do
      expect_any_instance_of(described_class).to receive(:build_tokens)

      described_class.build(text)
    end
  end

  describe '.new(text, options)' do
    it 'gets parsed tokens from Parser class' do
      expect(corpus.tokens).to be_an(Array)
    end

    it 'sets options as instance variables' do
      expect(corpus.window).to eq(opt[:window])
      expect(corpus.min_count).to eq(opt[:min_count])
    end
  end

  describe '#build_tokens' do
    it 'calls #build_count, #build_index, #build_pairs and returns self' do
      # All three build steps are replaced by message expectations, so only
      # the orchestration and the return value are exercised here.
      expect(corpus).to receive(:build_count)
      expect(corpus).to receive(:build_index)
      expect(corpus).to receive(:build_pairs)
      expect(corpus.build_tokens).to be_an_instance_of(described_class)
    end
  end

  describe '#count' do
    it 'constructs a token count hash' do
      expect(corpus.count).to eq('the' => 2)
    end
  end

  describe '#index' do
    before { corpus.build_count }

    it 'constructs a token index hash' do
      expect(corpus.index).to eq('the' => 0)
    end
  end

  describe '#pairs' do
    before { corpus.build_count }

    it 'constructs array of token pairs with neighbors based on window opt' do
      expect(corpus.pairs.first.neighbors).to eq(['quick', 'brown', 'fox'])
      expect(corpus.pairs.last.neighbors).to eq(['fox', 'jump', 'over', 'lazi', 'dog'])
    end
  end

  describe '#token_neighbors(word, index)' do
    # This group builds the corpus without an explicit :window option.
    let(:corpus) { described_class.new(text, stop_words: false, min_count: 1) }

    before { corpus.build_count }

    it "returns window number of neighbors on each side" do
      expect(corpus.token_neighbors('jump', 4)).to eq(%w(brown fox over the))
    end
  end
end
|
@@ -0,0 +1,208 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
# Specs for Glove::Model: default options, the #fit / #train entry points
# (with their collaborators stubbed), saving and loading of model state,
# and the word-similarity queries.
describe Glove::Model do
  let(:text) { 'the quick brown fox jumped over the lazy dog' }
  let(:model) { Glove::Model.new }

  describe '.new(options)' do
    it 'sets options as instance variables' do
      expect(model.threads).to eq(Glove::Model::DEFAULTS[:threads])
    end

    it 'sets cooc_matrix, word_vec and word_biases to nil' do
      expect(model.cooc_matrix).to be_nil
      expect(model.word_vec).to be_nil
      expect(model.word_biases).to be_nil
    end
  end

  describe '#fit(text)' do
    # Every step is stubbed so each example only checks that #fit sends the
    # one message it is about.
    before do
      allow(model).to receive(:fit_corpus).with(text)
      allow(model).to receive(:build_cooc_matrix)
      allow(model).to receive(:build_word_vectors)
    end

    it 'calls its internal method #fit_corpus to build the corpus obj' do
      expect(model).to receive(:fit_corpus).with(text)
      model.fit(text)
    end

    it 'calls its internal method #build_cooc_matrix' do
      expect(model).to receive(:build_cooc_matrix)
      model.fit(text)
    end

    it 'calls its internal method #build_word_vectors' do
      expect(model).to receive(:build_word_vectors)
      model.fit(text)
    end
  end

  describe '#fit_corpus(text)' do
    before do
      model.send :fit_corpus, text
    end

    it "build a corpus object from text string argument" do
      expect(model.corpus).to be_instance_of Glove::Corpus
    end

    it "sets @token_index and @token_pairs vars" do
      expect(model.token_pairs).not_to be_nil
      expect(model.token_index).not_to be_nil
    end
  end

  describe '#build_word_vectors' do
    before do
      allow(model).to receive(:token_index).and_return([0,1,2,3,4])
      model.send :build_word_vectors
    end

    it 'creates @word_vec matrix with random floats' do
      expect(model.word_vec.isnull?).to eq(false)
    end

    it 'creates @word_biases vector with zeros' do
      expect(model.word_biases.isnull?).to eq(true)
    end
  end

  describe '#train' do
    let(:cooc_matrix) { GSL::Matrix.rand(4,4) }

    before do
      allow(model).to receive(:cooc_matrix).and_return(cooc_matrix)
      allow(model).to receive(:train_in_epochs)
    end

    it 'calls the #train_in_epochs method' do
      expect(model).to receive(:train_in_epochs)
      model.train
    end
  end

  context "IO" do
    let(:corpus) { Glove::Corpus.build('quick brown fox', min_count: 1, stop_words: false) }
    let(:cooc)   { GSL::Matrix.zeros(3,3) }
    let(:words)  { GSL::Matrix.zeros(3, Glove::Model::DEFAULTS[:num_components]) }
    let(:biases) { GSL::Vector.alloc([1,2,3]) }

    describe '#save' do
      let(:files) do
        %w(corpus.bin cooc.bin words.bin biases.bin).map do |f|
          File.join(fixtures_path, f)
        end
      end

      before(:each) do
        model.instance_variable_set(:@cooc_matrix, cooc)
        model.instance_variable_set(:@corpus, corpus)
        model.instance_variable_set(:@word_vec, words)
        model.instance_variable_set(:@word_biases, biases)
      end

      # Clean up in a hook rather than at the end of the example, so the
      # dumped files are removed even when an expectation fails.
      after(:each) do
        files.each { |f| File.delete(f) if File.exist?(f) }
      end

      it "dumps corpus, cooc_matrix, word_vec and word_biases to files" do
        model.save(*files)

        files.each do |file|
          expect(File.size(file)).to be > 0
        end
      end
    end

    describe '#load' do
      # The "-t" files are the binary fixtures shipped under spec/fixtures.
      let(:files) do
        %w(corpus-t.bin cooc-t.bin words-t.bin biases-t.bin).map do |f|
          File.join(fixtures_path, f)
        end
      end

      before(:each) do
        model.load(*files)
      end

      it 'loads corpus data from file as first argument' do
        expect(model.corpus.tokens).to eq(corpus.tokens)
      end

      it 'loads cooc_matrix data from file as second argument' do
        expect(model.cooc_matrix).to eq(cooc)
      end

      it 'loads word_vec data from file as third argument' do
        expect(model.word_vec).to eq(words)
      end

      it 'loads word_biases data from file as fourth argument' do
        expect(model.word_biases).to eq(biases)
      end
    end
  end

  describe '#visualize' do
    pending
  end

  describe '#analogy_words(word1, word2, target, num, accuracy)' do
    let(:distances) { [["electron", 0.98583], ["radiation", 0.99998]] }
    let(:target) { 'atom' }
    let(:pair_cosine) { 0.99999 }

    # Vector lookup, cosine and distance ranking are all stubbed; only the
    # filtering done by #analogy_words itself is under test.
    before do
      allow(model).to receive(:vector).and_return(0)
      allow(model).to receive(:cosine).and_return(pair_cosine)
      allow(model).to receive(:vector_distance).and_return(distances)
    end

    it 'returns the distances whose diff between the pair distance is less than accuracy arg' do
      words = model.analogy_words('quantum', 'physics', target).flatten

      expect(words).to include('electron')
      expect(words).not_to include('radiation')
    end
  end

  describe '#most_similar(word, num)' do
    let(:distances) { [["electron", 0.98583], ["radiation", 0.99998]] }

    before do
      allow(model).to receive(:vector_distance).and_return(distances)
    end

    it 'returns closest vectors to given word' do
      words = model.most_similar('atom', 1).flatten

      expect(words).to include('electron')
      expect(words).not_to include('radiation')
    end
  end

  describe '#train_in_epochs(indices)' do
    let(:worker) { double(:train, run: nil) }
    let(:epochs) { Glove::Model::DEFAULTS[:epochs] }

    before do
      allow(Glove::Workers::TrainingWorker).to receive(:new).and_return(worker)
    end

    it 'calls a training worker exactly @epochs times' do
      expect(worker).to receive(:run).exactly(epochs).times

      model.send :train_in_epochs, []
    end
  end

  describe '#matrix_nnz' do
    let(:matrix) { GSL::Matrix[[0,9], [3,0]] }

    before do
      allow(model).to receive(:cooc_matrix).and_return(matrix)
    end

    it 'gets all non-zero value indices in the cooc_matrix' do
      nnz = model.send :matrix_nnz
      expect(nnz).to eq([[1,0], [0,1]])
    end
  end
end
|
@@ -0,0 +1,55 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
# Specs for Glove::Parser. The fixture text deliberately contains a
# two-letter capitalised word ("Fx"), an inflected verb ("jumps") and an
# alphanumeric token ("d0g") for the individual filter steps to act on.
describe Glove::Parser do
  let(:text) { "the quick brown Fx jumps over the lazy d0g" }
  let(:parser) { described_class.new(text) }

  describe '#tokenize' do
    let(:tokens) { %w(quick brown jump lazi) }

    it "tokenizes the text string" do
      expect(parser.tokenize).to eq(tokens)
    end
  end

  describe '#downcase' do
    it "downcases all letters" do
      expect(parser.downcase).to eq text.downcase
    end
  end

  describe '#split' do
    it "splits the text string into an array" do
      expect(parser.split).to be_a Array
    end
  end

  describe '#alphabetic' do
    it "leaves only words that do not contain any numbers" do
      # Check the alphanumeric token that is actually present in the fixture
      # text; asserting on a word that never occurs would pass vacuously.
      expect(parser.alphabetic).not_to include('d0g')
    end
  end

  describe '#stem' do
    it "stems all words in the text array" do
      parser.split

      expect(parser.stem).not_to include('jumps')
      expect(parser.stem).to include('jump')
    end
  end

  describe '#normalize' do
    it "removes words whose length is not within specified boundary" do
      parser.split

      expect(parser.normalize).not_to include('Fx')
    end
  end

  describe '#stop_words' do
    it "filters all stop words from the text" do
      expect(parser.stop_words).not_to include('the')
    end
  end
end
|
@@ -0,0 +1,14 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
# Specs for Glove::TokenPair: the constructor exposes the token and its
# neighbors through readers.
describe Glove::TokenPair do
  let(:token)     { 'fox' }
  let(:neighbors) { %w(brown jump) }
  let(:pair)      { described_class.new(token, neighbors) }

  describe '.new(word, neighbors)' do
    it 'sets the token and neighbors variables' do
      expect(pair.token).to eq token
      expect(pair.neighbors).to eq neighbors
    end
  end
end
|
@@ -0,0 +1,61 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
# Specs for Glove::Workers::CooccurrenceWorker, driven by a test double that
# stands in for the object owning token_index, token_pairs and threads.
describe Glove::Workers::CooccurrenceWorker do
  let(:index) { {'quick' => 0, 'brown' => 1, 'fox' => 2} }
  let(:pairs) do
    # Only the words are needed here; the index positions are unused.
    index.keys.map { |word| Glove::TokenPair.new(word) }
  end
  let(:threads) { 0 }
  # Named :owner rather than :caller so the helper does not shadow
  # Kernel#caller inside the examples.
  let(:owner) do
    double(:caller, token_index: index, token_pairs: pairs, threads: threads)
  end
  let(:worker) { described_class.new(owner) }

  describe '.new' do
    it "keeps reference of the caller class" do
      expect(worker.instance_variable_get(:@caller)).to eq(owner)
    end

    it "dupes token_index off the caller" do
      expect(worker.token_index).to eq(index)
    end

    it "dupes token_pairs off the caller" do
      expect(worker.token_pairs).to eq(pairs)
    end
  end

  describe '#threads' do
    it "delegates method to @caller" do
      expect(worker.threads).to eq(threads)
    end
  end

  describe '#run' do
    before do
      allow(worker).to receive(:build_cooc_matrix_col).and_return([0,1,2,3])
    end

    it 'calls #build_cooc_matrix_col in parallel processes' do
      expect(worker).to receive(:build_cooc_matrix_col).exactly(index.size).times
      worker.run
    end

    it 'converts the vector results into a matrix' do
      expect(worker.run).to be_a GSL::Matrix
    end
  end

  describe '#build_cooc_matrix_col' do
    # Give the first pair ('quick') a 'fox' neighbor before the worker is
    # lazily built from the pairs.
    before do
      pairs[0].neighbors << 'fox'
    end

    it 'builds the vector co-occurrence representation of a given token' do
      result = worker.build_cooc_matrix_col(['fox', 2])

      expect(result.size).to eq(index.size)
      expect(result[0]).to eq(1)
    end
  end
end
|
@@ -0,0 +1,84 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
# Specs for Glove::Workers::TrainingWorker, run against a model fitted on a
# tiny text. `index` is the first entry returned by the model's #matrix_nnz
# and is used as a two-element pair (index[0], index[1]).
describe Glove::Workers::TrainingWorker do
  let(:text) { 'quick fox brown fox' }
  let(:opt) { {min_count: 1, stop_words: false, threads: 0} }
  let(:model) { Glove::Model.new(opt).fit(text) }
  let(:index) { model.send(:matrix_nnz)[0] }
  let(:worker) { described_class.new(model, [index]) }

  describe '.new' do
    it "dupes caller's :word_vec attribute" do
      expect(worker.word_vec).to eq(model.word_vec)
    end

    it "dupes caller's :word_biases attribute" do
      expect(worker.word_biases).to eq(model.word_biases)
    end
  end

  describe '#run' do
    before do
      allow(model).to receive(:threads).and_return(1)
      allow(worker).to receive(:work)
    end

    it 'runs the #work method :threads number of times' do
      expect(worker).to receive(:work).once
      worker.run
    end

    it 'returns array of :word_vec and :word_biases after running the transforms' do
      expect(worker.run).to eq([model.word_vec, model.word_biases])
    end
  end

  describe '#work' do
    let(:loss) { 1 }
    let(:word_a_norm) { 1 }
    let(:word_b_norm) { 1 }

    before do
      # Return the triple in the same order #apply_weights is expected to
      # receive it below: loss, then the norm for word a, then for word b.
      allow(worker).to receive(:calc_weights).with(index[0], index[1]).
        and_return([loss, word_a_norm, word_b_norm])
    end

    it 'calculates loss, and norm for each matrix index and applies the new values' do
      expect(worker).to receive(:calc_weights).once
      expect(worker).to receive(:apply_weights).
        with(index[0], index[1], loss, word_a_norm, word_b_norm)

      worker.work([index], Mutex.new)
    end
  end

  describe '#calc_weights' do
    it 'performs the calculation and returns loss and norm' do
      loss, norm1, norm2 = worker.calc_weights(index[0], index[1])

      expect(loss).not_to eq(0)
      expect(loss).not_to eq(norm1)
      expect(loss).not_to eq(norm2)
    end
  end

  describe '#apply_weights' do
    before do
      worker.apply_weights(index[0], index[1], 1, 1, 1)
    end

    it "applies weights on the :word_vec matrix" do
      expect(worker.word_vec[0,0]).not_to eq(model.word_vec[0,0])
    end

    it 'applied loss reduction on :word_biases' do
      bias1 = worker.word_biases[index[0]]
      bias2 = worker.word_biases[index[1]]
      model_bias1 = model.word_biases[index[0]]
      model_bias2 = model.word_biases[index[1]]

      expect(bias1).not_to eq(model_bias1)
      expect(bias2).not_to eq(model_bias2)
    end
  end
end
|