RubyGems - glove - Versions diffs - 0.1.1 - Mend

glove 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (36) hide show

checksums.yaml +7 -0
data/.gitignore +19 -0
data/.rspec +1 -0
data/.travis.yml +12 -0
data/Gemfile +7 -0
data/LICENSE.txt +22 -0
data/README.md +122 -0
data/Rakefile +7 -0
data/benchmark/benchmark.rb +39 -0
data/benchmark/co-occurrence.rb +85 -0
data/benchmark/data/quantum-physics.txt +1 -0
data/benchmark/profile.rb +19 -0
data/benchmark/results/.keep +0 -0
data/glove.gemspec +28 -0
data/lib/glove.rb +18 -0
data/lib/glove/corpus.rb +103 -0
data/lib/glove/model.rb +247 -0
data/lib/glove/parser.rb +90 -0
data/lib/glove/token_pair.rb +15 -0
data/lib/glove/version.rb +3 -0
data/lib/glove/workers.rb +8 -0
data/lib/glove/workers/cooccurrence_worker.rb +55 -0
data/lib/glove/workers/training_worker.rb +104 -0
data/resources/en.stop +711 -0
data/spec/fixtures/biases-t.bin +0 -0
data/spec/fixtures/cooc-t.bin +0 -0
data/spec/fixtures/corpus-t.bin +0 -0
data/spec/fixtures/words-t.bin +0 -0
data/spec/lib/glove/corpus_spec.rb +77 -0
data/spec/lib/glove/model_spec.rb +208 -0
data/spec/lib/glove/parser_spec.rb +55 -0
data/spec/lib/glove/token_pair_spec.rb +14 -0
data/spec/lib/glove/workers/cooccurrence_worker_spec.rb +61 -0
data/spec/lib/glove/workers/training_worker_spec.rb +84 -0
data/spec/spec_helper.rb +18 -0
metadata +177 -0

data/spec/fixtures/biases-t.bin ADDED

Binary file

data/spec/fixtures/cooc-t.bin ADDED

Binary file

data/spec/fixtures/corpus-t.bin ADDED

Binary file

data/spec/fixtures/words-t.bin ADDED

Binary file

data/spec/lib/glove/corpus_spec.rb ADDED

@@ -0,0 +1,77 @@
+require 'spec_helper'
+describe Glove::Corpus do
+  let(:text) { "the quick brown fox jumped over the lazy dog" }
+  let(:opt)  { {window: 3, min_count: 2, stop_words: false} }
+  let(:corpus) { described_class.new(text, opt) }
+  describe '.build(text, options)' do
+    it 'forwards args to #initialize and calls #build_tokens on the instance' do
+      expect_any_instance_of(Glove::Corpus).to receive(:build_tokens)
+      Glove::Corpus.build(text)
+    end
+  end
+  describe '.new(text, options)' do
+    it 'gets parsed tokens from Parser class' do
+      expect(corpus.tokens).to be_a Array
+    end
+    it 'sets options as instance variables' do
+      expect(corpus.window).to    eq(opt[:window])
+      expect(corpus.min_count).to eq(opt[:min_count])
+    end
+  end
+  describe '#build_tokens' do
+    it 'calls #build_count, #build_index, #build_pairs and returns self' do
+      expect(corpus).to receive(:build_count)
+      expect(corpus).to receive(:build_index)
+      expect(corpus).to receive(:build_pairs)
+      expect(corpus.build_tokens).to be_instance_of described_class
+    end
+  end
+  describe '#count' do
+    it 'constructs a token count hash' do
+      expect(corpus.count).to eq({'the' => 2})
+    end
+  end
+  describe '#index' do
+    before do
+      corpus.build_count
+    end
+    it 'constructs a token index hash' do
+      expect(corpus.index).to eq({'the' => 0})
+    end
+  end
+  describe '#pairs' do
+    before do
+      corpus.build_count
+    end
+    it 'constructs array of token pairs with neighbors based on window opt' do
+      first_pair = corpus.pairs.first
+      last_pair  = corpus.pairs.last
+      expect(first_pair.neighbors).to eq %w(quick brown fox)
+      expect(last_pair.neighbors).to  eq %w(fox jump over lazi dog)
+    end
+  end
+  describe '#token_neighbors(word, index)' do
+    let(:corpus) { described_class.new(text, stop_words: false, min_count: 1) }
+    before do
+      corpus.build_count
+    end
+    it "returns window number of neighbors on each side" do
+      neighbors = corpus.token_neighbors('jump', 4)
+      expect(neighbors).to eq(['brown', 'fox', 'over', 'the'])
+    end
+  end
+end

data/spec/lib/glove/model_spec.rb ADDED

@@ -0,0 +1,208 @@
+require 'spec_helper'
+describe Glove::Model do
+  let(:text) { 'the quick brown fox jumped over the lazy dog' }
+  let(:model) { Glove::Model.new }
+  describe '.new(options)' do
+    it 'sets options as instance variables' do
+      expect(model.threads).to eq(Glove::Model::DEFAULTS[:threads])
+    end
+    it 'sets cooc_matrix, word_vec and Word_biases to nil' do
+      expect(model.cooc_matrix).to be_nil
+      expect(model.word_vec).to    be_nil
+      expect(model.word_biases).to be_nil
+    end
+  end
+  describe '#fit(text)' do
+    before do
+      allow(model).to receive(:fit_corpus).with(text)
+      allow(model).to receive(:build_cooc_matrix)
+      allow(model).to receive(:build_word_vectors)
+    end
+    it 'calls its internal methods #fit_corpus to build the corpus obj' do
+      expect(model).to receive(:fit_corpus).with(text)
+      model.fit(text)
+    end
+    it 'calls its internal methods #build_cooc_matrix to build the corpus obj' do
+      expect(model).to receive(:build_cooc_matrix)
+      model.fit(text)
+    end
+    it 'calls its internal methods #build_word_vectors to build the corpus obj' do
+      expect(model).to receive(:build_word_vectors)
+      model.fit(text)
+    end
+  end
+  describe '#fit_corpus(text)' do
+    before do
+      model.send :fit_corpus, text
+    end
+    it "build a corpus object from text string argument" do
+      expect(model.corpus).to be_instance_of Glove::Corpus
+    end
+    it "sets @token_index and @token_pairs vars" do
+      expect(model.token_pairs).not_to be_nil
+      expect(model.token_index).not_to be_nil
+    end
+  end
+  describe '#build_word_vectors' do
+    before do
+      allow(model).to receive(:token_index).and_return([0,1,2,3,4])
+      model.send :build_word_vectors
+    end
+    it 'creates @word_vec matrix with random floats' do
+      expect(model.word_vec.isnull?).to eq(false)
+    end
+    it 'creates @word_biases vector with zeros' do
+      expect(model.word_biases.isnull?).to eq(true)
+    end
+  end
+  describe '#train' do
+    let(:cooc_matrix) { GSL::Matrix.rand(4,4) }
+    before do
+      allow(model).to receive(:cooc_matrix).and_return(cooc_matrix)
+      allow(model).to receive(:train_in_epochs)
+    end
+    it 'calls the #train_in_epochs method' do
+      expect(model).to receive(:train_in_epochs)
+      model.train
+    end
+  end
+  context "IO" do
+    let(:corpus) { Glove::Corpus.build('quick brown fox', min_count: 1, stop_words: false) }
+    let(:cooc)   { GSL::Matrix.zeros(3,3) }
+    let(:words)  { GSL::Matrix.zeros(3, Glove::Model::DEFAULTS[:num_components]) }
+    let(:biases) { GSL::Vector.alloc([1,2,3]) }
+    describe '#save' do
+      let(:files)  do
+        %w(corpus.bin cooc.bin words.bin biases.bin).map do |f|
+          File.join(fixtures_path, f)
+        end
+      end
+      before(:each) do
+        model.instance_variable_set(:@cooc_matrix, cooc)
+        model.instance_variable_set(:@corpus, corpus)
+        model.instance_variable_set(:@word_vec, words)
+        model.instance_variable_set(:@word_biases, biases)
+      end
+      it "dumps corpus, cooc_matrix, word_vec and word_biases to files" do
+        model.save(*files)
+        files.each do |file|
+          expect(File.size(file)).to be > 0
+        end
+        files.each{ |f| File.delete(f) }
+      end
+    end
+    describe '#load' do
+      let(:files)  do
+        %w(corpus-t.bin cooc-t.bin words-t.bin biases-t.bin).map do |f|
+          File.join(fixtures_path, f)
+        end
+      end
+      before(:each) do
+        model.load(*files)
+      end
+      it 'loads corpus data from file as first argument' do
+        expect(model.corpus.tokens).to eq(corpus.tokens)
+      end
+      it 'loads cooc_matrix data from file as second argument' do
+        expect(model.cooc_matrix).to eq(cooc)
+      end
+      it 'loads word_vec data from file as third argument' do
+        expect(model.word_vec).to eq(words)
+      end
+      it 'loads word_biases data from file as fourth argument' do
+        expect(model.word_biases).to eq(biases)
+      end
+    end
+  end
+  describe '#visualize' do
+    pending
+  end
+  describe '#analogy_words(word1, word2, target, num, accuracy)' do
+    let(:distances)   { [["electron", 0.98583], ["radiation", 0.99998]] }
+    let(:target)      { 'atom' }
+    let(:pair_cosine) { 0.99999 }
+    before do
+      allow(model).to receive(:vector).and_return(0)
+      allow(model).to receive(:cosine).and_return(pair_cosine)
+      allow(model).to receive(:vector_distance).and_return(distances)
+    end
+    it 'returns the distances whose diff between the pair distance is less than accuracy arg' do
+      words = model.analogy_words('quantum', 'physics', target).flatten
+      expect(words).to     include('electron')
+      expect(words).not_to include('radiation')
+    end
+  end
+  describe '#most_similar(word, num)' do
+    let(:distances) { [["electron", 0.98583], ["radiation", 0.99998]] }
+    before do
+      allow(model).to receive(:vector_distance).and_return(distances)
+    end
+    it 'returns closest vectors to given word' do
+      words = model.most_similar('atom', 1).flatten
+      expect(words).to     include('electron')
+      expect(words).not_to include('radiation')
+    end
+  end
+  describe '#train_in_epochs(indices)' do
+    let(:worker) { double(:train, run: nil) }
+    let(:epochs) { Glove::Model::DEFAULTS[:epochs] }
+    before do
+      allow(Glove::Workers::TrainingWorker).to receive(:new).and_return(worker)
+    end
+    it 'calls a traing worker exactly @epochs times' do
+      expect(worker).to receive(:run).exactly(epochs).times
+      model.send :train_in_epochs, []
+    end
+  end
+  describe '#matrix_nnz' do
+    let(:matrix) { GSL::Matrix[[0,9], [3,0]] }
+    before do
+      allow(model).to receive(:cooc_matrix).and_return(matrix)
+    end
+    it 'gets all non-zero value indices in the cooc_matrix' do
+      nnz = model.send :matrix_nnz
+      expect(nnz).to eq([[1,0], [0,1]])
+    end
+  end
+end

data/spec/lib/glove/parser_spec.rb ADDED

@@ -0,0 +1,55 @@
+require 'spec_helper'
+describe Glove::Parser do
+  let(:text) { "the quick brown Fx jumps over the lazy d0g" }
+  let(:parser) { described_class.new(text) }
+  describe '#tokenize' do
+    let(:tokens) { %w(quick brown jump lazi) }
+    it "tokenizes the text string" do
+      expect(parser.tokenize).to eq(tokens)
+    end
+  end
+  describe '#downcase' do
+    it "downcases all letters" do
+      expect(parser.downcase).to eq text.downcase
+    end
+  end
+  describe '#split' do
+    it "splits the text string into an array" do
+      expect(parser.split).to be_a Array
+    end
+  end
+  describe '#alphabetic' do
+    it "leaves only words that do not contain any numbers" do
+      expect(parser.alphabetic).not_to include('b2b')
+    end
+  end
+  describe '#stem' do
+    it "stemps all words in the text array" do
+      parser.split
+      expect(parser.stem).not_to include('jumps')
+      expect(parser.stem).to     include('jump')
+    end
+  end
+  describe '#normalize' do
+    it "removes words whose length if not within specified boundary" do
+      parser.split
+      expect(parser.normalize).not_to include('Fx')
+    end
+  end
+  describe '#stop_words' do
+    it "filters all stop words from the text" do
+      expect(parser.stop_words).not_to include('the')
+    end
+  end
+end

data/spec/lib/glove/token_pair_spec.rb ADDED

@@ -0,0 +1,14 @@
+require 'spec_helper'
+describe Glove::TokenPair do
+  let(:token) { 'fox' }
+  let(:neighbors) { ['brown', 'jump'] }
+  let(:pair) { Glove::TokenPair.new(token, neighbors) }
+  describe '.new(word, neighbors)' do
+    it 'sets the token and neighbors variables' do
+      expect(pair.token).to     eq(token)
+      expect(pair.neighbors).to eq(neighbors)
+    end
+  end
+end

data/spec/lib/glove/workers/cooccurrence_worker_spec.rb ADDED

@@ -0,0 +1,61 @@
+require 'spec_helper'
+describe Glove::Workers::CooccurrenceWorker do
+  let(:index) { {'quick' => 0, 'brown' => 1, 'fox' => 2} }
+  let(:pairs) do
+    index.map{ |w,i| Glove::TokenPair.new(w) }
+  end
+  let(:threads) { 0 }
+  let(:caller) do
+    double(:caller, token_index: index, token_pairs: pairs, threads: threads)
+  end
+  let(:worker) { described_class.new(caller) }
+  describe '.new' do
+    it "keeps reference of the caller class" do
+      expect(worker.instance_variable_get(:@caller)).to eq(caller)
+    end
+    it "dupes token_index off the caller" do
+      expect(worker.token_index).to eq(index)
+    end
+    it "dupes token_pairs off the caller" do
+      expect(worker.token_pairs).to eq(pairs)
+    end
+  end
+  describe '#threads' do
+    it "delegates method to @caller" do
+      expect(worker.threads).to eq(threads)
+    end
+  end
+  describe '#run' do
+    before do
+      allow(worker).to receive(:build_cooc_matrix_col).and_return([0,1,2,3])
+    end
+    it 'calls #build_cooc_matrix_col in parallel processes' do
+      expect(worker).to receive(:build_cooc_matrix_col).exactly(index.size).times
+      worker.run
+    end
+    it 'converts the vector results into a matrix' do
+      expect(worker.run).to be_a GSL::Matrix
+    end
+  end
+  describe '#build_cooc_matrix_col' do
+    before do
+      pairs[0].neighbors << 'fox'
+    end
+    it 'builds the vector co-occurrence representation of a given token' do
+      result = worker.build_cooc_matrix_col(['fox', 2])
+      expect(result.size).to eq(index.size)
+      expect(result[0]).to eq(1)
+    end
+  end
+end

data/spec/lib/glove/workers/training_worker_spec.rb ADDED

@@ -0,0 +1,84 @@
+require 'spec_helper'
+describe Glove::Workers::TrainingWorker do
+  let(:text)    { 'quick fox brown fox' }
+  let(:opt)     { {min_count: 1, stop_words: false, threads: 0} }
+  let(:model)   { Glove::Model.new(opt).fit(text) }
+  let(:index)   { model.send(:matrix_nnz)[0] }
+  let(:worker)  { described_class.new(model, [index]) }
+  describe '.new' do
+    it 'dupes caller\'s :word_vec attribute ' do
+      expect(worker.word_vec).to eq(model.word_vec)
+    end
+    it 'dupes caller\'s :word_biases attribute ' do
+      expect(worker.word_biases).to eq(model.word_biases)
+    end
+  end
+  describe '#run' do
+    before do
+      allow(model).to receive(:threads).and_return(1)
+      allow(worker).to receive(:work)
+    end
+    it 'runs the #work method :threads number of times' do
+      expect(worker).to receive(:work).exactly(1).times
+      worker.run
+    end
+    it 'returns array of :word_vec and :word_biases after running the transforms' do
+      expect(worker.run).to eq([model.word_vec, model.word_biases])
+    end
+  end
+  describe '#work' do
+    let(:loss) { 1 }
+    let(:word_a_norm) { 1 }
+    let(:word_b_norm) { 1 }
+    before do
+      allow(worker).to receive(:calc_weights).with(index[0], index[1]).
+                        and_return([loss, word_b_norm, word_b_norm])
+    end
+    it 'calculates loss, and norm for each matrix index and applies the new values' do
+      expect(worker).to receive(:calc_weights).exactly(1).times
+      expect(worker).to receive(:apply_weights).
+        with(index[0], index[1], loss, word_a_norm, word_b_norm)
+      worker.work([index], Mutex.new)
+    end
+  end
+  describe '#calc_weights' do
+    it 'performs the calculation and returns loss and norm' do
+      loss, norm1, norm2 = worker.calc_weights(index[0], index[1])
+      expect(loss).not_to eq(0)
+      expect(loss).not_to eq(norm1)
+      expect(loss).not_to eq(norm2)
+    end
+  end
+  describe '#apply_weights' do
+    before do
+      worker.apply_weights(index[0], index[1], 1, 1, 1)
+    end
+    it "applies weights on the :word_vec matrix" do
+      expect(worker.word_vec[0,0]).not_to eq(model.word_vec[0,0])
+    end
+    it 'applied loss reducation on :word_biases' do
+      bias1 = worker.word_biases[index[0]]
+      bias2 = worker.word_biases[index[1]]
+      model_bias1 = model.word_biases[index[0]]
+      model_bias2 = model.word_biases[index[1]]
+      expect(bias1).not_to eq(model_bias1)
+      expect(bias2).not_to eq(model_bias2)
+    end
+  end
+end