RubyGems - glove - Versions diffs - 0.1.1 - Mend

glove 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (36) hide show

checksums.yaml +7 -0
data/.gitignore +19 -0
data/.rspec +1 -0
data/.travis.yml +12 -0
data/Gemfile +7 -0
data/LICENSE.txt +22 -0
data/README.md +122 -0
data/Rakefile +7 -0
data/benchmark/benchmark.rb +39 -0
data/benchmark/co-occurrence.rb +85 -0
data/benchmark/data/quantum-physics.txt +1 -0
data/benchmark/profile.rb +19 -0
data/benchmark/results/.keep +0 -0
data/glove.gemspec +28 -0
data/lib/glove.rb +18 -0
data/lib/glove/corpus.rb +103 -0
data/lib/glove/model.rb +247 -0
data/lib/glove/parser.rb +90 -0
data/lib/glove/token_pair.rb +15 -0
data/lib/glove/version.rb +3 -0
data/lib/glove/workers.rb +8 -0
data/lib/glove/workers/cooccurrence_worker.rb +55 -0
data/lib/glove/workers/training_worker.rb +104 -0
data/resources/en.stop +711 -0
data/spec/fixtures/biases-t.bin +0 -0
data/spec/fixtures/cooc-t.bin +0 -0
data/spec/fixtures/corpus-t.bin +0 -0
data/spec/fixtures/words-t.bin +0 -0
data/spec/lib/glove/corpus_spec.rb +77 -0
data/spec/lib/glove/model_spec.rb +208 -0
data/spec/lib/glove/parser_spec.rb +55 -0
data/spec/lib/glove/token_pair_spec.rb +14 -0
data/spec/lib/glove/workers/cooccurrence_worker_spec.rb +61 -0
data/spec/lib/glove/workers/training_worker_spec.rb +84 -0
data/spec/spec_helper.rb +18 -0
metadata +177 -0

data/lib/glove/model.rb ADDED

@@ -0,0 +1,247 @@
+module Glove
+  # GloVe model. Builds a co-occurrence matrix from a corpus, trains a word
+  # vector matrix plus word biases on it through the worker classes, and
+  # answers similarity queries using cosine similarity between word vectors.
+  class Model
+    # Default options (see #initialize)
+    DEFAULTS = {
+      max_count:      100,
+      learning_rate:  0.05,
+      alpha:          0.75,
+      num_components: 30,
+      epochs:         5,
+      threads:        4
+    }.freeze
+    # @!attribute [r] opts
+    #   @return [Hash] DEFAULTS merged with the options given to #initialize
+    # @!attribute [r] corpus
+    #   @return [Glove::Corpus] reference to the Corpus instance
+    # @!attribute [r] token_index
+    #   @return [Hash] reference to corpus.index
+    # @!attribute [r] token_pairs
+    #   @return [Array<(Glove::TokenPair)>] reference to corpus.pairs
+    # @!attribute [r] cooc_matrix
+    #   @return [GSL::Matrix, nil] the co-occurrence matrix (nil until #fit or #load)
+    # @!attribute [rw] word_vec
+    #   @return [GSL::Matrix] the word vector matrix
+    # @!attribute [rw] word_biases
+    #   @return [GSL::Vector] the vector holding the word biases
+    attr_reader :opts, :window, :epochs, :num_components, :min_count
+    attr_reader :learning_rate, :alpha, :max_count, :threads
+    attr_reader :cooc_matrix, :corpus, :token_index, :token_pairs
+    attr_accessor :word_vec, :word_biases
+    # Create a new {Glove::Model} instance. Accepts options for
+    # {Glove::Corpus} and {Glove::Parser} which only get forwarded
+    # and not used in this class.
+    #
+    # @param [Hash] options the options to initialize the instance with.
+    # @option options [Integer] :max_count (100) Parameter specifying cutoff in
+    #   weighting function
+    # @option options [Float] :learning_rate (0.05) Initial learning rate
+    # @option options [Float] :alpha (0.75) Exponent of weighting function
+    # @option options [Integer] :num_components (30) Column size of the word vector
+    #   matrix
+    # @option options [Integer] :epochs (5) Number of training iterations
+    # @option options [Integer] :threads (4) Number of threads to use in building
+    #   the co-occurrence matrix and training iterations. Must be greater than 0
+    # @return [Glove::Model] A GloVe model.
+    def initialize(options={})
+      # Hash#merge returns a new, unfrozen hash, so DEFAULTS itself is untouched
+      @opts = DEFAULTS.merge(options)
+      # Every option (including forwarded ones) becomes an instance variable
+      @opts.each do |key, value|
+        instance_variable_set :"@#{key}", value
+      end
+      @cooc_matrix = nil
+      @word_vec    = nil
+      @word_biases = nil
+    end
+    # Fit a string or {Glove::Corpus} instance and build the co-occurrence matrix
+    #
+    # @param [String, Glove::Corpus] text The text to train from
+    # @example Provide corpus for the model
+    #   model = Glove::Model.new
+    #   model.fit(File.read('shakespeare.txt'))
+    # @example Provide a {Glove::Corpus} instance as text argument
+    #   model = Glove::Model.new
+    #   corpus = Glove::Corpus.build(File.read('shakespeare.txt'))
+    #   model.fit(corpus)
+    # @return [Glove::Model] Current instance
+    def fit(text)
+      fit_corpus(text)
+      build_cooc_matrix
+      build_word_vectors
+      self
+    end
+    # Train the model. Must call #fit prior
+    # @return [Glove::Model] Current instance
+    def train
+      train_in_epochs(matrix_nnz)
+      self
+    end
+    # Save trained data to files
+    #
+    # @param [String] corpus_file Filename for corpus
+    # @param [String] cooc_file Filename for co-occurrence matrix
+    # @param [String] vec_file Filename for Word Vector Matrix
+    # @param [String] bias_file Filename for Word Biases Vector
+    def save(corpus_file, cooc_file, vec_file, bias_file)
+      File.open(corpus_file, 'wb') do |file|
+        file.write Marshal.dump(corpus)
+      end
+      cooc_matrix.fwrite(cooc_file)
+      word_vec.fwrite(vec_file)
+      word_biases.fwrite(bias_file)
+    end
+    # Loads training data from already existing files
+    #
+    # @param [String] corpus_file Filename for corpus
+    # @param [String] cooc_file Filename for co-occurrence matrix
+    # @param [String] vec_file Filename for Word Vector Matrix
+    # @param [String] bias_file Filename for Word Biases Vector
+    def load(corpus_file, cooc_file, vec_file, bias_file)
+      # SECURITY: Marshal.load can instantiate arbitrary objects; only load
+      # corpus files that were produced by #save from a trusted source.
+      @corpus = Marshal.load(File.binread(corpus_file))
+      @token_index = corpus.index
+      @token_pairs = corpus.pairs
+      size = token_index.size
+      # Containers are allocated from the vocabulary size and num_components
+      # before reading. NOTE(review): presumably num_components must match the
+      # value used when the files were written - confirm against GSL fread.
+      @cooc_matrix = GSL::Matrix.alloc(size, size)
+      @word_vec    = GSL::Matrix.alloc(size, num_components)
+      @word_biases = GSL::Vector.alloc(size)
+      @cooc_matrix.fread(cooc_file)
+      @word_vec.fread(vec_file)
+      @word_biases.fread(bias_file)
+    end
+    # @todo create graph of the word vector matrix
+    # @raise [RuntimeError] always, the method is not implemented yet
+    def visualize
+      raise "Not implemented"
+    end
+    # Get words that relate to :target like :word1 relates to :word2
+    #
+    # All three words are stemmed before lookup. Candidates are the vocabulary
+    # ordered by cosine similarity to :target; a candidate is dropped when the
+    # absolute value of its similarity lies within :accuracy of the
+    # word1/word2 similarity.
+    #
+    # @param [String] word1
+    # @param [String] word2
+    # @param [String] target The word to find related words for
+    # @param [Integer] num Number of related words to :target
+    # @param [Float] accuracy Allowance in difference of target cosine
+    #   and related word cosine distances
+    # @example What words relate to atom like quantum relates to physics?
+    #   model.analogy_words('quantum', 'physics', 'atom')
+    #   # => [["electron", 0.98583], ["energi", 0.98151], ["photon",0.96650]]
+    # @return [Array] List of related words to target
+    def analogy_words(word1, word2, target, num=3, accuracy=0.0001)
+      word1  = word1.stem
+      # Stem word2 itself; stemming word1 here made the reference similarity
+      # always cosine(word1, word1)
+      word2  = word2.stem
+      target = target.stem
+      distance = cosine(vector(word1), vector(word2))
+      vector_distance(target).reject do |item|
+        diff = item[1].to_f.abs - distance
+        diff.abs < accuracy
+      end.take(num)
+    end
+    # Get most similar words to :word
+    #
+    # @param [String] word The word to find similar to
+    # @param [Integer] num (3) Number of similar words to :word
+    # @example Get 1 most similar word to 'physics'
+    #   model.most_similar('physics', 1) # => ["quantum", 0.9967993356234444]
+    # @return [Array] List of most similar words with cosine distance as values
+    def most_similar(word, num=3)
+      vector_distance(word.stem).take(num)
+    end
+    # Prevent token_pairs, matrices and vectors to fill up the terminal
+    def inspect
+      to_s
+    end
+    private
+    # Perform train iterations. Each epoch reshuffles the indices and replaces
+    # @word_vec and @word_biases with the worker's result.
+    #
+    # @param [Array] indices The non-zero value indices in cooc_matrix
+    def train_in_epochs(indices)
+      epochs.times do
+        shuffled = indices.shuffle
+        @word_vec, @word_biases = Workers::TrainingWorker.new(self, shuffled).run
+      end
+    end
+    # Builds the corpus and sets @token_index and @token_pairs
+    #
+    # @param [String, Glove::Corpus] text Raw text or an already built corpus
+    def fit_corpus(text)
+      @corpus =
+        if text.is_a? Corpus
+          text
+        else
+          Corpus.build(text, opts)
+        end
+      @token_index = corpus.index
+      @token_pairs = corpus.pairs
+    end
+    # Create initial values for @word_vec and @word_biases
+    def build_word_vectors
+      cols          = token_index.size
+      @word_vec     = GSL::Matrix.rand(cols, num_components)
+      @word_biases  = GSL::Vector.alloc(cols)
+    end
+    # Builds the co-occurrence matrix
+    def build_cooc_matrix
+      @cooc_matrix = Workers::CooccurrenceWorker.new(self).run
+    end
+    # Array of all non-zero (both row and col) value coordinates in the
+    # cooc_matrix
+    #
+    # @return [Array<(Integer, Integer)>] [row, col] pairs of non-zero entries
+    def matrix_nnz
+      entries = []
+      cooc_matrix.enum_for(:each_col).each_with_index do |col, col_idx|
+        col.enum_for(:each).each_with_index do |_element, row_idx|
+          value = cooc_matrix[row_idx, col_idx]
+          entries << [row_idx, col_idx] unless value.zero?
+        end
+      end
+      entries
+    end
+    # Find the vector row of @word_vec for a given word
+    #
+    # @param [String] word The word to transform into a vector
+    # @return [GSL::Vector, nil] The corresponding row of the #word_vec matrix,
+    #   or nil when the word is not in the token index
+    def vector(word)
+      return nil unless word_index = token_index[word]
+      word_vec.row(word_index)
+    end
+    # Calculates the cosine distance of all the words in the vocabulary
+    # against a given word. Results are then sorted in DESC order
+    #
+    # @param [String] word The word to compare against
+    # @return [Array<(String, Float)>] Array of tokens and their distance;
+    #   empty when the word is not in the vocabulary
+    def vector_distance(word)
+      # Return an Array (not a Hash) so the result type matches the happy path
+      return [] unless word_vector = vector(word)
+      token_index.map.with_index do |(token, _index), idx|
+        next if token.eql? word
+        [token, cosine(word_vector, word_vec.row(idx))]
+      end.compact.sort{ |a,b| b[1] <=> a[1] }
+    end
+    # Compute cosine distance between two vectors
+    #
+    # @param [GSL::Vector] vector1 First vector
+    # @param [GSL::Vector] vector2 Second vector
+    # @return [Float] the cosine distance, or 0 when either vector is nil
+    def cosine(vector1, vector2)
+      return 0 if vector1.nil? || vector2.nil?
+      vector1.dot(vector2) / (vector1.norm * vector2.norm)
+    end
+  end
+end

data/lib/glove/parser.rb ADDED

@@ -0,0 +1,90 @@
+module Glove
+  # Takes a string of text and tokenizes it for usage in {Glove::Corpus}
+  #
+  # The parser never modifies the string it was given; every step stores its
+  # result in the parser's own text attribute.
+  class Parser
+    # Default options (see #initialize)
+    DEFAULTS = {
+      stem:       true,
+      min_length: 3,
+      max_length: 25,
+      alphabetic: true,
+      normalize:  true,
+      stop_words: true
+    }.freeze
+    # @!attribute [r] text
+    #   @return [String, Array<String>] the current value of the text
+    #     attribute (a String until #split has run, an Array afterwards)
+    #
+    attr_reader :text
+    # Create a new {Glove::Parser}, passing the text and options as arguments
+    #
+    # @param [String] text value for the text attribute
+    # @param [Hash] options the options to initialize the instance with.
+    # @option options [Boolean] :stem (true) Whether to stem the tokens
+    # @option options [Boolean] :alphabetic (true) Remove any non-alphabetic chars
+    # @option options [Boolean] :normalize (true) Normalize the text and keep
+    #   words with length between option[:min_length] and option[:max_length]
+    # @option options [Boolean] :stop_words (true) Filter stop words
+    # @option options [Integer] :min_length (3) the min allowed length of a word
+    # @option options [Integer] :max_length (25) the max allowed length of a word
+    # @return [Glove::Parser] A new parser.
+    def initialize(text, options={})
+      @text, @opt = text, DEFAULTS.merge(options)
+    end
+    # Call all parsing methods in the class and return the final text value as
+    # array of words
+    #
+    # @return [Array] The tokens array
+    def tokenize
+      downcase
+      stop_words  if @opt[:stop_words]
+      alphabetic  if @opt[:alphabetic]
+      split
+      normalize   if @opt[:normalize]
+      stem        if @opt[:stem]
+      text
+    end
+    # Downcases the text value
+    def downcase
+      # Non-destructive so the caller's (possibly frozen) string is left alone
+      @text = text.downcase
+    end
+    # Splits the text string into an array of words
+    def split
+      @text = text.split
+    end
+    # Filters out the text leaving only alphabetical characters in words
+    # and splits the words
+    def alphabetic
+      # Replaces runs of non-alphabetic characters, and whole words that mix
+      # letters with digits, by a single space
+      @text = text.gsub(/([^[:alpha:]]+)|((?=\w*[a-z])(?=\w*[0-9])\w+)/, ' ')
+    end
+    # Stems every member of the text array
+    def stem
+      text.map!(&:stem)
+    end
+    # Selects words with length within the :min_length and :max_length boundaries
+    def normalize
+      text.keep_if do |word|
+        word.length.between?(@opt[:min_length], @opt[:max_length])
+      end
+    end
+    # Exclude words that are in the STOP_WORDS array
+    def stop_words
+      # Each match is a word plus the separator that follows it. The separator
+      # is optional (\W*) so a final word at the very end of the text is kept
+      # instead of being silently dropped.
+      @text = text.scan(/(\w+)(\W*)/).reject do |(word, _separator)|
+        stop_words_array.include? word
+      end.flatten.join
+    end
+    # Reads the default stop words file and return array of its entries
+    #
+    # @return [Array<String>] memoized list of stop words
+    def stop_words_array
+      @stop_words ||= File.read(File.join(Glove.root_path, 'resources', 'en.stop')).split
+    end
+  end
+end

data/lib/glove/token_pair.rb ADDED

@@ -0,0 +1,15 @@
+module Glove
+  # Value holder that couples one token with the list of words found
+  # around it
+  class TokenPair
+    # @!attribute [rw] token
+    #   @return [String] The word/token
+    # @!attribute [rw] neighbors
+    #   @return [Array<String>] List of neighboring words
+    attr_accessor :token, :neighbors
+    # Build a pair from a token and its neighbors
+    #
+    # @param [String] token The word/token ('' when omitted)
+    # @param [Array<String>] neighbors Neighboring words (a fresh empty
+    #   array per call when omitted)
+    def initialize(token='', neighbors=[])
+      @token     = token
+      @neighbors = neighbors
+    end
+  end
+end

data/lib/glove/version.rb ADDED

@@ -0,0 +1,3 @@
+module Glove
+  # Release number of the glove gem
+  VERSION = '0.1.1'
+end

data/lib/glove/workers.rb ADDED

@@ -0,0 +1,8 @@
+require 'forwardable'
+module Glove
+  # Namespace for the worker classes; loading this file pulls in both the
+  # co-occurrence worker and the training worker
+  module Workers
+    require('glove/workers/cooccurrence_worker')
+    require('glove/workers/training_worker')
+  end
+end

data/lib/glove/workers/cooccurrence_worker.rb ADDED

@@ -0,0 +1,55 @@
+module Glove
+  module Workers
+    # Builds the co-occurrence matrix on behalf of a {Glove::Model}
+    class CooccurrenceWorker
+      extend ::Forwardable
+      # @!attribute [r] token_index
+      #   @return [Hash{String=>Integer}] Clone of @caller.token_index
+      # @!attribute [r] token_pairs
+      #   @return [Array<(Glove::TokenPair)>] Clone of @caller.token_pairs
+      attr_reader :token_index, :token_pairs
+      def_delegators :@caller, :threads
+      # Set up the worker with shallow copies of the caller's token data
+      #
+      # @param [Glove::Model] caller Caller class
+      def initialize(caller)
+        @caller      = caller
+        @token_index = caller.token_index.dup
+        @token_pairs = caller.token_pairs.dup
+      end
+      # Computes one count array per token_index entry through Parallel.map
+      # (using `threads` processes) and hands them to GSL::Matrix.alloc
+      #
+      # @return [GSL::Matrix] The co-occurrence matrix
+      def run
+        columns = Parallel.map(token_index, in_processes: threads) do |entry|
+          build_cooc_matrix_col(entry)
+        end
+        GSL::Matrix.alloc(*columns)
+      end
+      # Counts, for a single token, how often it shows up among the neighbors
+      # of every token pair. Counts are accumulated at the index of the pair's
+      # own token.
+      #
+      # @param [Array<(String, Integer)>] slice Token with index
+      # @return [Array] GSL::Vector#to_a representation of the column
+      def build_cooc_matrix_col(slice)
+        token, = slice
+        counts = GSL::Vector.alloc(token_index.size)
+        token_pairs.each do |pair|
+          position = token_index[pair.token]
+          counts[position] += pair.neighbors.count(token)
+        end
+        counts.to_a
+      end
+    end
+  end
+end

data/lib/glove/workers/training_worker.rb ADDED

@@ -0,0 +1,104 @@
+module Glove
+  module Workers
+    # Performs the training process on the word vector matrix, as well as word biases
+    class TrainingWorker
+      extend ::Forwardable
+      # @!attribute [rw] indices
+      #   @return [Array<(Integer, Integer)>] Shuffled co-occurrence matrix slots
+      # @!attribute [rw] word_vec
+      #   @return [GSL::Matrix] Clone of @caller.word_vec
+      # @!attribute [rw] word_biases
+      #   @return [GSL::Vector] Clone of @caller.word_biases
+      attr_accessor :indices, :word_vec, :word_biases
+      def_delegators :@caller, :cooc_matrix, :threads, :max_count, :alpha, :learning_rate
+      # Create a {Glove::Workers::TrainingWorker} instance
+      # @param [Glove::Model] caller Caller class
+      # @param [Array<(Integer, Integer)>] indices Shuffled indices of non-zero
+      #   elements in the model's co-occurrence matrix
+      def initialize(caller, indices)
+        @caller, @indices = caller, indices
+        # Work on copies; the caller receives the trained values from #run
+        @word_vec     = @caller.word_vec.dup
+        @word_biases  = @caller.word_biases.dup
+      end
+      # Runs the calculations, spreading the indices over at most `threads`
+      # Ruby threads
+      # @return [GSL::Matrix, GSL::Vector] Weighted word vectors and word biases
+      def run
+        # Nothing to train on: each_slice would reject a slice size of 0
+        return [word_vec, word_biases] if indices.empty?
+        mutex = Mutex.new
+        pool_size  = [threads.to_i, 1].max
+        # Round up so there are never more than pool_size slices and the
+        # slice size is at least 1 even with fewer indices than threads
+        slice_size = (indices.size.to_f / pool_size).ceil
+        workers = indices.each_slice(slice_size).map do |slice|
+          Thread.new{ work(slice, mutex) }
+        end
+        workers.each(&:join)
+        [word_vec, word_biases]
+      end
+      # Perform a full train iteration on the word vectors and word biases
+      # @param [Array] slice Shuffled co-occurrence matrix slots
+      # @param [Mutex] mutex Thread-safe lock on #apply_weights
+      def work(slice, mutex)
+        slice.each do |slot|
+          w1, w2 = slot
+          loss, word_a_norm, word_b_norm = calc_weights(w1, w2)
+          mutex.synchronize do
+            apply_weights(w1, w2, loss, word_a_norm, word_b_norm)
+          end
+        end
+      end
+      # Calculates loss, and norms for word1 (row) and word2 (column) by given
+      # indices
+      #
+      # The prediction is the dot product of the two word vectors plus both
+      # biases; the loss is the weighted difference between that prediction
+      # and the logarithm of the co-occurrence count.
+      #
+      # @param [Integer] w1 Row index
+      # @param [Integer] w2 Column index
+      # @param [Float] prediction (0.0) Initial prediction value
+      # @param [Float] word_a_norm (0.0) Initial norm of word at row w1
+      # @param [Float] word_b_norm (0.0) Initial norm of word at col w2
+      # @return [Float, Float, Float] Array of loss, word_a_norm, word_b_norm
+      def calc_weights(w1, w2, prediction=0.0, word_a_norm=0.0, word_b_norm = 0.0)
+        count = cooc_matrix[w1, w2]
+        word_vec.each_col do |col|
+          w1_context = col[w1]
+          w2_context = col[w2]
+          # Dot product term (component-wise product, not a sum), matching
+          # the gradient applied in #apply_weights
+          prediction  += w1_context * w2_context
+          word_a_norm += w1_context * w1_context
+          word_b_norm += w2_context * w2_context
+        end
+        prediction += word_biases[w1] + word_biases[w2]
+        word_a_norm = Math.sqrt(word_a_norm)
+        word_b_norm = Math.sqrt(word_b_norm)
+        # to_f guards against integer division should count be an Integer
+        entry_weight = [1.0, (count.to_f / max_count)].min ** alpha
+        loss = entry_weight * (prediction - Math.log(count))
+        [loss, word_a_norm, word_b_norm]
+      end
+      # Applies calculated weights to @word_vec and @word_biases. MUST be called
+      # in a Mutex#synchronize block
+      #
+      # @param [Integer] w1 Row index
+      # @param [Integer] w2 Column index
+      # @param [Float] loss Loss value
+      # @param [Float] word_a_norm Norm of word at row w1
+      # @param [Float] word_b_norm Norm of word at col w2
+      def apply_weights(w1, w2, loss, word_a_norm, word_b_norm)
+        word_vec.each_col do |col|
+          # Read both components before writing so each update uses the
+          # other word's pre-update value as its gradient factor
+          w1_value = col[w1]
+          w2_value = col[w2]
+          col[w1] = (w1_value - learning_rate * loss * w2_value) / word_a_norm
+          col[w2] = (w2_value - learning_rate * loss * w1_value) / word_b_norm
+        end
+        word_biases[w1] -= learning_rate * loss
+        word_biases[w2] -= learning_rate * loss
+      end
+    end
+  end
+end