RubyGems - tf-idf-similarity - Versions diffs - 0.0.9 → 0.1.0 - Mend

tf-idf-similarity 0.0.9 → 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (22) hide show

data/.travis.yml +29 -0
data/Gemfile +4 -0
data/README.md +41 -29
data/lib/tf-idf-similarity.rb +12 -1
data/lib/tf-idf-similarity/document.rb +35 -28
data/lib/tf-idf-similarity/extras/document.rb +2 -125
data/lib/tf-idf-similarity/extras/tf_idf_model.rb +192 -0
data/lib/tf-idf-similarity/matrix_methods.rb +164 -0
data/lib/tf-idf-similarity/term_count_model.rb +78 -0
data/lib/tf-idf-similarity/tf_idf_model.rb +81 -0
data/lib/tf-idf-similarity/token.rb +34 -12
data/lib/tf-idf-similarity/version.rb +1 -1
data/spec/document_spec.rb +136 -0
data/spec/extras/tf_idf_model_spec.rb +269 -0
data/spec/spec_helper.rb +21 -0
data/spec/term_count_model_spec.rb +108 -0
data/spec/tf_idf_model_spec.rb +174 -0
data/spec/token_spec.rb +34 -0
data/td-idf-similarity.gemspec +3 -3
metadata +91 -63
data/lib/tf-idf-similarity/collection.rb +0 -205
data/lib/tf-idf-similarity/extras/collection.rb +0 -110

data/lib/tf-idf-similarity/matrix_methods.rb ADDED Viewed

@@ -0,0 +1,164 @@
+module TfIdfSimilarity::MatrixMethods # matrix helpers that dispatch on @library (:gsl, :narray, :nmatrix, otherwise Ruby's Matrix) and operate on @matrix
+private # every helper below is private to the including class
+  # @return [GSL::Matrix,NArray,NMatrix,Matrix] all document vectors scaled to unit length (all-zero vectors stay zero)
+  #
+  # @note Lucene normalizes document length differently.
+  def normalize
+    case @library
+    when :gsl
+      @matrix.clone.each_col do |column| # operates on a clone rather than on @matrix itself
+        unless column.isnull? # all-zero columns cannot be normalized, so they are skipped
+          column.normalize!
+        end
+      end
+    when :narray # @see https://github.com/masa16/narray/issues/21
+      norm = NMath.sqrt((@matrix ** 2).sum(1).reshape(@matrix.shape[0], 1)) # square root of the sum of squares along dimension 1
+      norm[norm.where2[1]] = 1.0 # avoid division by zero
+      NMatrix.refer(@matrix / norm) # must be NMatrix for matrix multiplication
+    when :nmatrix # @see https://github.com/SciRuby/nmatrix/issues/38
+      normal = NMatrix.new(:dense, @matrix.shape, :float64)
+      (0...@matrix.shape[1]).each do |j|
+        column = @matrix.column(j)
+        norm = Math.sqrt(column.transpose.dot(column)[0, 0]) # Euclidean norm of column j
+        (0...@matrix.shape[0]).each do |i|
+          normal[i, j] = norm.zero? ? 0 : @matrix[i, j] / norm # a zero column stays zero
+        end
+      end
+      normal
+    else
+      Matrix.columns(@matrix.column_vectors.map do |column|
+        if column.to_a.all?(&:zero?) # leave all-zero vectors unchanged
+          column
+        elsif column.respond_to?(:normalize) # use Vector#normalize when this Ruby provides it
+          column.normalize
+        else
+          column * (1 / Math.sqrt(column.inner_product(column))) # multiply by the reciprocal because Ruby 1.8's Vector does not define division
+        end
+      end)
+    end
+  end
+  # @param [Integer] i the row index
+  # @param [Integer] j the column index
+  def get(i, j) # returns the value stored at row i, column j
+    case @library
+    when :narray
+      @matrix[j, i] # NArray is indexed column-first, so the arguments are swapped
+    else
+      @matrix[i, j]
+    end
+  end
+  # @param [Integer] index the row index
+  # @return [GSL::Vector::View,NArray,NMatrix,Vector] a row
+  def row(index)
+    case @library
+    when :narray
+      @matrix[true, index] # every column of the given row
+    else
+      @matrix.row(index)
+    end
+  end
+  # @param [Integer] index the column index
+  # @return [GSL::Vector::View,NArray,NMatrix,Vector] a column
+  def column(index)
+    case @library
+    when :narray
+      @matrix[index, true] # every row of the given column
+    else
+      @matrix.column(index)
+    end
+  end
+  # @return [Integer] the number of rows in the matrix
+  def row_size
+    case @library
+    when :gsl, :nmatrix
+      @matrix.shape[0]
+    when :narray
+      @matrix.shape[1] # NArray reports its shape column-first
+    else
+      @matrix.row_size
+    end
+  end
+  # @return [Integer] the number of columns in the matrix
+  def column_size
+    case @library
+    when :gsl, :nmatrix
+      @matrix.shape[1]
+    when :narray
+      @matrix.shape[0] # NArray reports its shape column-first
+    else
+      @matrix.column_size
+    end
+  end
+  # @return [Array<Numeric>] all of the matrix's values as a flat array
+  def values
+    case @library
+    when :nmatrix
+      @matrix.each.to_a
+    else
+      @matrix.to_a.flatten
+    end
+  end
+  # @return [Numeric] the sum of all values in the matrix
+  def sum
+    case @library
+    when :narray
+      @matrix.sum
+    else
+      values.reduce(0, :+)
+    end
+  end
+  # @param [Array<Array>] array matrix rows
+  # @return [GSL::Matrix,NArray,NMatrix,Matrix] a matrix in the representation selected by @library
+  def initialize_matrix(array)
+    case @library
+    when :gsl
+      GSL::Matrix[*array]
+    when :narray
+      NArray[*array]
+    when :nmatrix # @see https://github.com/SciRuby/nmatrix/issues/91
+      NMatrix.new(:dense, [array.size, array.empty? ? 0 : array[0].size], array.flatten) # built from explicit dimensions and a flat value list
+    else
+      Matrix[*array]
+    end
+  end
+  # @param [GSL::Matrix,NArray,NMatrix,Matrix] matrix a matrix
+  # @return [GSL::Matrix,NArray,NMatrix,Matrix] the product of the matrix's transpose and the matrix
+  def multiply_self(matrix)
+    case @library
+    when :nmatrix
+      matrix.transpose.dot(matrix)
+    else
+      matrix.transpose * matrix
+    end
+  end
+  def log(number) # natural logarithm of number, computed by the selected backend's math library
+    case @library
+    when :gsl
+      GSL::Sf::log(number)
+    when :narray
+      NMath.log(number)
+    else
+      Math.log(number)
+    end
+  end
+  def sqrt(number) # square root of number (NMath for :narray, Math otherwise)
+    case @library
+    when :narray
+      NMath.sqrt(number)
+    else
+      Math.sqrt(number)
+    end
+  end
+end

data/lib/tf-idf-similarity/term_count_model.rb ADDED Viewed

@@ -0,0 +1,78 @@
+# A simple document-term matrix.
+#
+# @see http://lucene.apache.org/core/4_0_0-BETA/core/org/apache/lucene/search/similarities/TFIDFSimilarity.html
+# @see http://lucene.apache.org/core/4_0_0-BETA/core/org/apache/lucene/search/similarities/BM25Similarity.html
+# @see http://en.wikipedia.org/wiki/Okapi_BM25
+class TfIdfSimilarity::TermCountModel
+  include TfIdfSimilarity::MatrixMethods
+  # The documents in the corpus.
+  attr_reader :documents
+  # The set of terms in the corpus.
+  attr_reader :terms
+  # The average number of tokens in a document.
+  attr_reader :average_document_size
+  # @param [Array<TfIdfSimilarity::Document>] documents documents
+  # @param [Hash] opts optional arguments
+  # @option opts [Symbol] :library :gsl, :narray, :nmatrix or :matrix (default)
+  def initialize(documents, opts = {})
+    @documents = documents
+    @terms = Set.new(documents.map(&:terms).flatten).to_a
+    @library = (opts[:library] || :matrix).to_sym
+    array = Array.new(terms.size) do |i|
+      Array.new(documents.size) do |j|
+        documents[j].term_count(terms[i])
+      end
+    end
+    @matrix = initialize_matrix(array)
+    @average_document_size = documents.empty? ? 0 : sum / column_size.to_f
+  end
+  # @param [String] term a term
+  # @return [Integer] the number of documents the term appears in
+  def document_count(term)
+    index = terms.index(term)
+    if index
+      case @library
+      when :gsl, :narray
+        row(index).where.size
+      when :nmatrix
+        row(index).each.count(&:nonzero?)
+      else
+        vector = row(index)
+        unless vector.respond_to?(:count)
+          vector = vector.to_a
+        end
+        vector.count(&:nonzero?)
+      end
+    else
+      0
+    end
+  end
+  # @param [String] term a term
+  # @return [Integer] the number of times the term appears in the corpus
+  def term_count(term)
+    index = terms.index(term)
+    if index
+      case @library
+      when :gsl, :narray
+        row(index).sum
+      when :nmatrix
+        row(index).each.reduce(0, :+)
+      else
+        vector = row(index)
+        unless vector.respond_to?(:reduce)
+          vector = vector.to_a
+        end
+        vector.reduce(0, :+)
+      end
+    else
+      0
+    end
+  end
+end

data/lib/tf-idf-similarity/tf_idf_model.rb ADDED Viewed

@@ -0,0 +1,81 @@
+# A document-term matrix using either the tf*idf or BM25 functions.
+#
+# @see http://lucene.apache.org/core/4_0_0-BETA/core/org/apache/lucene/search/similarities/TFIDFSimilarity.html
+# @see http://lucene.apache.org/core/4_0_0-BETA/core/org/apache/lucene/search/similarities/BM25Similarity.html
+# @see http://en.wikipedia.org/wiki/Okapi_BM25
+class TfIdfSimilarity::TfIdfModel
+  include TfIdfSimilarity::MatrixMethods
+  extend Forwardable
+  def_delegators :@model, :documents, :terms, :document_count
+  # @param [Array<TfIdfSimilarity::Document>] documents documents
+  # @param [Hash] opts optional arguments
+  # @option opts [Symbol] :library :gsl, :narray, :nmatrix or :matrix (default)
+  # @option opts [Symbol] :function :tfidf (default) or :bm25
+  def initialize(documents, opts = {})
+    @model = TfIdfSimilarity::TermCountModel.new(documents, opts)
+    @library = (opts[:library] || :matrix).to_sym
+    @function = (opts[:function] || :tfidf).to_sym
+    array = Array.new(terms.size) do |i|
+      idf = inverse_document_frequency(terms[i])
+      Array.new(documents.size) do |j|
+        term_frequency(documents[j], terms[i]) * idf
+      end
+    end
+    @matrix = initialize_matrix(array)
+  end
+  # Return the term's inverse document frequency.
+  #
+  # @param [String] term a term
+  # @return [Float] the term's inverse document frequency
+  def inverse_document_frequency(term)
+    df = @model.document_count(term)
+    if @function == :bm25
+      log((documents.size - df + 0.5) / (df + 0.5))
+    else
+      1 + log(documents.size / (df + 1.0))
+    end
+  end
+  alias_method :idf, :inverse_document_frequency
+  # Returns the term's frequency in the document.
+  #
+  # @param [Document] document a document
+  # @param [String] term a term
+  # @return [Float] the term's frequency in the document
+  #
+  # @note Like Lucene, we use a b value of 0.75 and a k1 value of 1.2.
+  def term_frequency(document, term)
+    tf = document.term_count(term)
+    if @function == :bm25
+      (tf * 2.2) / (tf + 0.3 + 0.9 * documents.size / @model.average_document_size)
+    else
+      sqrt(tf)
+    end
+  end
+  alias_method :tf, :term_frequency
+  # Return the term frequency–inverse document frequency.
+  #
+  # @param [Document] document a document
+  # @param [String] term a term
+  # @return [Float] the term frequency–inverse document frequency
+  def term_frequency_inverse_document_frequency(document, term)
+    inverse_document_frequency(term) * term_frequency(document, term)
+  end
+  alias_method :tfidf, :term_frequency_inverse_document_frequency
+  # Returns a similarity matrix for the documents in the corpus.
+  #
+  # @return [GSL::Matrix,NMatrix,Matrix] a similarity matrix
+  # @note Columns are normalized to unit vectors, so we can calculate the cosine
+  #   similarity of all document vectors. BM25 doesn't normalize columns, but
+  #   BM25 wasn't written with this use case in mind.
+  def similarity_matrix
+    multiply_self(normalize)
+  end
+end

data/lib/tf-idf-similarity/token.rb CHANGED Viewed

@@ -1,5 +1,7 @@
 # coding: utf-8
+# A token.
+#
 # @note We can add more filters from Solr and stem using Porter's Snowball.
 #
 # @see https://github.com/aurelian/ruby-stemmer
@@ -14,29 +16,49 @@ class TfIdfSimilarity::Token < String
   #
   # @return [Boolean] whether the string is a token
   def valid?
-    !self[%r{
-      \A
-        (
-         \d           | # number
-         \p{Cntrl}    | # control character
-         \p{Punct}    | # punctuation
-         [[:space:]]    # whitespace
-        )+
-      \z
-    }x]
+    if RUBY_VERSION < '1.9'
+      !self[%r{
+        \A
+          (
+           \d           | # number
+           [[:cntrl:]]  | # control character
+           [[:punct:]]  | # punctuation
+           [[:space:]]    # whitespace
+          )+
+        \z
+      }x]
+    else
+      !self[%r{
+        \A
+          (
+           \d           | # number
+           \p{Cntrl}    | # control character
+           \p{Punct}    | # punctuation
+           \p{Space}      # whitespace
+          )+
+        \z
+      }x] # The Ruby 1.8 parser will complain about this regular expression.
+    end
   end
+  # Returns a lowercase string.
+  #
   # @return [Token] a lowercase string
   #
   # @see http://wiki.apache.org/solr/AnalyzersTokenizersTokenFilters#solr.LowerCaseFilterFactory
   def lowercase_filter
-    self.class.new UnicodeUtils.downcase(self, :fr)
+    self.class.new(defined?(UnicodeUtils) ? UnicodeUtils.downcase(self) : tr(
+      "ÀÁÂÃÄÅĀĂĄÇĆĈĊČÐĎĐÈÉÊËĒĔĖĘĚĜĞĠĢĤĦÌÍÎÏĨĪĬĮĴĶĹĻĽĿŁÑŃŅŇŊÒÓÔÕÖØŌŎŐŔŖŘŚŜŞŠŢŤŦÙÚÛÜŨŪŬŮŰŲŴÝŶŸŹŻŽ",
+      "àáâãäåāăąçćĉċčðďđèéêëēĕėęěĝğġģĥħìíîïĩīĭįĵķĺļľŀłñńņňŋòóôõöøōŏőŕŗřśŝşšţťŧùúûüũūŭůűųŵýŷÿźżž"
+    ).downcase)
   end
+  # Returns a string with no English possessive or periods in acronyms.
+  #
   # @return [Token] a string with no English possessive or periods in acronyms
   #
   # @see http://wiki.apache.org/solr/AnalyzersTokenizersTokenFilters#solr.ClassicFilterFactory
   def classic_filter
-    self.class.new self.gsub('.', '').chomp("'s")
+    self.class.new(self.gsub('.', '').chomp("'s"))
   end
 end

data/lib/tf-idf-similarity/version.rb CHANGED Viewed

@@ -1,3 +1,3 @@
 module TfIdfSimilarity
-  VERSION = "0.0.9"
+  VERSION = "0.1.0"
 end

data/spec/document_spec.rb ADDED Viewed

@@ -0,0 +1,136 @@
+require 'spec_helper'
+# @see https://github.com/bbcrd/Similarity/blob/master/test/test_document.rb
+describe TfIdfSimilarity::Document do
+  let :text do
+    "FOO-foo BAR bar \r\n\t 123 !@#"
+  end
+  let :tokens do
+    ['FOO-foo', 'BAR', 'bar', "\r\n\t", '123', '!@#']
+  end
+  let :document_without_text do
+    TfIdfSimilarity::Document.new('')
+  end
+  let :document do
+    TfIdfSimilarity::Document.new(text)
+  end
+  let :document_with_id do
+    TfIdfSimilarity::Document.new(text, :id => 'baz')
+  end
+  let :document_with_tokens do
+    TfIdfSimilarity::Document.new(text, :tokens => tokens)
+  end
+  let :document_with_term_counts do
+    TfIdfSimilarity::Document.new(text, :term_counts => {'bar' => 5, 'baz' => 10})
+  end
+  let :document_with_term_counts_and_size do
+    TfIdfSimilarity::Document.new(text, :term_counts => {'bar' => 5, 'baz' => 10}, :size => 20)
+  end
+  let :document_with_size do
+    TfIdfSimilarity::Document.new(text, :size => 10)
+  end
+  describe '#id' do
+    it 'should return the ID if no ID given' do
+      document.id.should == document.object_id
+    end
+    it 'should return the given ID' do
+      document_with_id.id.should == 'baz'
+    end
+  end
+  describe '#text' do
+    it 'should return the text' do
+      document.text.should == text
+    end
+  end
+  describe '#size' do
+    it 'should return the number of tokens if no tokens given' do
+      document.size.should == 4
+    end
+    it 'should return the number of tokens if tokens given' do
+      document_with_tokens.size.should == 3
+    end
+    it 'should return the number of tokens if no text given' do
+      document_without_text.size.should == 0
+    end
+    it 'should return the number of tokens if term counts given' do
+      document_with_term_counts.size.should == 15
+    end
+    it 'should return the given number of tokens if term counts and size given' do
+      document_with_term_counts_and_size.size.should == 20
+    end
+    it 'should not return the given number of tokens if term counts not given' do
+      document_with_size.size.should_not == 10
+    end
+  end
+  describe '#term_counts' do
+    it 'should return the term counts if no tokens given' do
+      document.term_counts.should == {'foo' => 2, 'bar' => 2}
+    end
+    it 'should return the term counts if tokens given' do
+      document_with_tokens.term_counts.should == {'foo-foo' => 1, 'bar' => 2}
+    end
+    it 'should return no term counts if no text given' do
+      document_without_text.term_counts.should == {}
+    end
+    it 'should return the term counts if term counts given' do
+      document_with_term_counts.term_counts.should == {'bar' => 5, 'baz' => 10}
+    end
+  end
+  describe '#terms' do
+    it 'should return the terms if no tokens given' do
+      document.terms.sort.should == ['bar', 'foo']
+    end
+    it 'should return the terms if tokens given' do
+      document_with_tokens.terms.sort.should == ['bar', 'foo-foo']
+    end
+    it 'should return no terms if no text given' do
+      document_without_text.terms.should == []
+    end
+    it 'should return the terms if term counts given' do
+      document_with_term_counts.terms.sort.should == ['bar', 'baz']
+    end
+  end
+  describe '#term_count' do
+    it 'should return the term count if no tokens given' do
+      document.term_count('foo').should == 2
+    end
+    it 'should return the term count if tokens given' do
+      document_with_tokens.term_count('foo-foo').should == 1
+    end
+    it 'should return no term count if no text given' do
+      document_without_text.term_count('foo').should == 0
+    end
+    it 'should return the term count if term counts given' do
+      document_with_term_counts.term_count('bar').should == 5
+    end
+  end
+end