RubyGems - tf-idf-similarity - Versions diffs - 0.1.3 → 0.1.4 - Mend

tf-idf-similarity 0.1.3 → 0.1.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (23)

checksums.yaml +4 -4
data/.travis.yml +1 -8
data/Gemfile +2 -2
data/README.md +40 -9
data/lib/tf-idf-similarity.rb +1 -0
data/lib/tf-idf-similarity/bm25_model.rb +23 -62
data/lib/tf-idf-similarity/document.rb +69 -67
data/lib/tf-idf-similarity/extras/document.rb +10 -8
data/lib/tf-idf-similarity/extras/tf_idf_model.rb +157 -155
data/lib/tf-idf-similarity/matrix_methods.rb +137 -135
data/lib/tf-idf-similarity/model.rb +66 -0
data/lib/tf-idf-similarity/term_count_model.rb +59 -57
data/lib/tf-idf-similarity/tf_idf_model.rb +21 -60
data/lib/tf-idf-similarity/token.rb +39 -37
data/lib/tf-idf-similarity/version.rb +1 -1
data/spec/bm25_model_spec.rb +200 -0
data/spec/document_spec.rb +98 -96
data/spec/extras/tf_idf_model_spec.rb +224 -222
data/spec/spec_helper.rb +6 -0
data/spec/term_count_model_spec.rb +76 -74
data/spec/tf_idf_model_spec.rb +143 -117
data/spec/token_spec.rb +23 -21
metadata +6 -2

data/lib/tf-idf-similarity/tf_idf_model.rb CHANGED

@@ -1,66 +1,27 @@
 # A document-term matrix using the tf*idf function.
 #
 # @see http://lucene.apache.org/core/4_0_0-BETA/core/org/apache/lucene/search/similarities/TFIDFSimilarity.html
-class TfIdfSimilarity::TfIdfModel
-  include TfIdfSimilarity::MatrixMethods
-  extend Forwardable
-  def_delegators :@model, :documents, :terms, :document_count
-  # @param [Array<TfIdfSimilarity::Document>] documents documents
-  # @param [Hash] opts optional arguments
-  # @option opts [Symbol] :library :gsl, :narray, :nmatrix or :matrix (default)
-  def initialize(documents, opts = {})
-    @model = TfIdfSimilarity::TermCountModel.new(documents, opts)
-    @library = (opts[:library] || :matrix).to_sym
-    array = Array.new(terms.size) do |i|
-      idf = inverse_document_frequency(terms[i])
-      Array.new(documents.size) do |j|
-        term_frequency(documents[j], terms[i]) * idf
-      end
+module TfIdfSimilarity
+  class TfIdfModel < Model
+    # Return the term's inverse document frequency.
+    #
+    # @param [String] term a term
+    # @return [Float] the term's inverse document frequency
+    def inverse_document_frequency(term)
+      df = @model.document_count(term)
+      1 + log(documents.size / (df + 1.0))
     end
-    @matrix = initialize_matrix(array)
-  end
-  # Return the term's inverse document frequency.
-  #
-  # @param [String] term a term
-  # @return [Float] the term's inverse document frequency
-  def inverse_document_frequency(term)
-    df = @model.document_count(term)
-    1 + log(documents.size / (df + 1.0))
-  end
-  alias_method :idf, :inverse_document_frequency
-  # Returns the term's frequency in the document.
-  #
-  # @param [Document] document a document
-  # @param [String] term a term
-  # @return [Float] the term's frequency in the document
-  def term_frequency(document, term)
-    tf = document.term_count(term)
-    sqrt(tf)
-  end
-  alias_method :tf, :term_frequency
-  # Return the term frequency–inverse document frequency.
-  #
-  # @param [Document] document a document
-  # @param [String] term a term
-  # @return [Float] the term frequency–inverse document frequency
-  def term_frequency_inverse_document_frequency(document, term)
-    inverse_document_frequency(term) * term_frequency(document, term)
-  end
-  alias_method :tfidf, :term_frequency_inverse_document_frequency
-  # Returns a similarity matrix for the documents in the corpus.
-  #
-  # @return [GSL::Matrix,NMatrix,Matrix] a similarity matrix
-  # @note Columns are normalized to unit vectors, so we can calculate the cosine
-  #   similarity of all document vectors.
-  def similarity_matrix
-    multiply_self(normalize)
+    alias_method :idf, :inverse_document_frequency
+    # Returns the term's frequency in the document.
+    #
+    # @param [Document] document a document
+    # @param [String] term a term
+    # @return [Float] the term's frequency in the document
+    def term_frequency(document, term)
+      tf = document.term_count(term)
+      sqrt(tf)
+    end
+    alias_method :tf, :term_frequency
   end
 end

data/lib/tf-idf-similarity/token.rb CHANGED

@@ -8,44 +8,46 @@
 # @see http://wiki.apache.org/solr/AnalyzersTokenizersTokenFilters#solr.StopFilterFactory
 # @see http://wiki.apache.org/solr/AnalyzersTokenizersTokenFilters#solr.WordDelimiterFilterFactory
 # @see http://wiki.apache.org/solr/AnalyzersTokenizersTokenFilters#solr.SynonymFilterFactory
-class TfIdfSimilarity::Token < String
-  # Returns a falsy value if all its characters are numbers, punctuation,
-  # whitespace or control characters.
-  #
-  # @note Some implementations ignore one and two-letter words.
-  #
-  # @return [Boolean] whether the string is a token
-  def valid?
-    !self[%r{
-      \A
-        (
-         \d           | # number
-         [[:cntrl:]]  | # control character
-         [[:punct:]]  | # punctuation
-         [[:space:]]    # whitespace
-        )+
-      \z
-    }x]
-  end
+module TfIdfSimilarity
+  class Token < String
+    # Returns a falsy value if all its characters are numbers, punctuation,
+    # whitespace or control characters.
+    #
+    # @note Some implementations ignore one and two-letter words.
+    #
+    # @return [Boolean] whether the string is a token
+    def valid?
+      !self[%r{
+        \A
+          (
+           \d           | # number
+           [[:cntrl:]]  | # control character
+           [[:punct:]]  | # punctuation
+           [[:space:]]    # whitespace
+          )+
+        \z
+      }x]
+    end
-  # Returns a lowercase string.
-  #
-  # @return [Token] a lowercase string
-  #
-  # @see http://wiki.apache.org/solr/AnalyzersTokenizersTokenFilters#solr.LowerCaseFilterFactory
-  def lowercase_filter
-    self.class.new(defined?(UnicodeUtils) ? UnicodeUtils.downcase(self) : tr(
-      "ÀÁÂÃÄÅĀĂĄÇĆĈĊČÐĎĐÈÉÊËĒĔĖĘĚĜĞĠĢĤĦÌÍÎÏĨĪĬĮĴĶĹĻĽĿŁÑŃŅŇŊÒÓÔÕÖØŌŎŐŔŖŘŚŜŞŠŢŤŦÙÚÛÜŨŪŬŮŰŲŴÝŶŸŹŻŽ",
-      "àáâãäåāăąçćĉċčðďđèéêëēĕėęěĝğġģĥħìíîïĩīĭįĵķĺļľŀłñńņňŋòóôõöøōŏőŕŗřśŝşšţťŧùúûüũūŭůűųŵýŷÿźżž"
-    ).downcase)
-  end
+    # Returns a lowercase string.
+    #
+    # @return [Token] a lowercase string
+    #
+    # @see http://wiki.apache.org/solr/AnalyzersTokenizersTokenFilters#solr.LowerCaseFilterFactory
+    def lowercase_filter
+      self.class.new(defined?(UnicodeUtils) ? UnicodeUtils.downcase(self) : tr(
+        "ÀÁÂÃÄÅĀĂĄÇĆĈĊČÐĎĐÈÉÊËĒĔĖĘĚĜĞĠĢĤĦÌÍÎÏĨĪĬĮĴĶĹĻĽĿŁÑŃŅŇŊÒÓÔÕÖØŌŎŐŔŖŘŚŜŞŠŢŤŦÙÚÛÜŨŪŬŮŰŲŴÝŶŸŹŻŽ",
+        "àáâãäåāăąçćĉċčðďđèéêëēĕėęěĝğġģĥħìíîïĩīĭįĵķĺļľŀłñńņňŋòóôõöøōŏőŕŗřśŝşšţťŧùúûüũūŭůűųŵýŷÿźżž"
+      ).downcase)
+    end
-  # Returns a string with no English possessive or periods in acronyms.
-  #
-  # @return [Token] a string with no English possessive or periods in acronyms
-  #
-  # @see http://wiki.apache.org/solr/AnalyzersTokenizersTokenFilters#solr.ClassicFilterFactory
-  def classic_filter
-    self.class.new(self.gsub('.', '').chomp("'s"))
+    # Returns a string with no English possessive or periods in acronyms.
+    #
+    # @return [Token] a string with no English possessive or periods in acronyms
+    #
+    # @see http://wiki.apache.org/solr/AnalyzersTokenizersTokenFilters#solr.ClassicFilterFactory
+    def classic_filter
+      self.class.new(self.gsub('.', '').chomp("'s"))
+    end
   end
 end

data/lib/tf-idf-similarity/version.rb CHANGED

@@ -1,3 +1,3 @@
 module TfIdfSimilarity
-  VERSION = "0.1.3"
+  VERSION = "0.1.4"
 end

data/spec/bm25_model_spec.rb ADDED

@@ -0,0 +1,200 @@
+require 'spec_helper'
+module TfIdfSimilarity
+  describe BM25Model do
+    let :text do
+      "FOO-foo BAR bar \r\n\t 123 !@#"
+    end
+    let :tokens do
+      ['FOO-foo', 'BAR', 'bar', "\r\n\t", '123', '!@#']
+    end
+    let :document_without_text do
+      Document.new('')
+    end
+    let :document do
+      Document.new(text)
+    end
+    let :document_with_tokens do
+      Document.new(text, :tokens => tokens)
+    end
+    let :document_with_term_counts do
+      Document.new(text, :term_counts => {'bar' => 5, 'baz' => 10})
+    end
+    let :non_corpus_document do
+      Document.new('foo foo foo')
+    end
+    def similarity_matrix_values(model)
+      matrix = model.similarity_matrix
+      if MATRIX_LIBRARY == :nmatrix
+        matrix.each.to_a
+      else
+        matrix.to_a.flatten
+      end
+    end
+    context 'without documents', :empty_matrix => true do
+      let :model do
+        BM25Model.new([], :library => MATRIX_LIBRARY)
+      end
+      describe '#documents' do
+        it 'should be empty' do
+          model.documents.should be_empty
+        end
+      end
+      describe '#document_index' do
+        it 'should return nil' do
+          model.document_index(document).should be_nil
+        end
+      end
+      describe '#text_index' do
+        it 'should return nil' do
+          model.text_index(text).should be_nil
+        end
+      end
+      describe '#terms' do
+        it 'should be empty' do
+          model.terms.should be_empty
+        end
+      end
+      describe '#inverse_document_frequency' do
+        it 'should return negative infinity' do
+          model.idf('foo').should == 0.0
+        end
+      end
+      describe '#term_frequency' do
+        it 'should return the term frequency' do
+          model.tf(document, 'foo').should be_nan
+        end
+      end
+      describe '#term_frequency_inverse_document_frequency' do
+        it 'should return negative infinity' do
+          model.tfidf(document, 'foo').should be_nan
+        end
+      end
+      describe '#similarity_matrix' do
+        it 'should be empty' do
+          similarity_matrix_values(model).should be_empty
+        end
+      end
+    end
+    context 'with documents' do
+      let :documents do
+        [
+          document,
+          document_with_tokens,
+          document_without_text,
+          document_with_term_counts,
+        ]
+      end
+      let :model do
+        BM25Model.new(documents, :library => MATRIX_LIBRARY)
+      end
+      describe '#documents' do
+        it 'should return the documents' do
+          model.documents.should == documents
+        end
+      end
+      describe '#document_index' do
+        it 'should return nil' do
+          model.document_index(document).should == 0
+        end
+      end
+      describe '#text_index' do
+        it 'should return the index' do
+          model.text_index(text).should == 0
+        end
+      end
+      describe '#terms' do
+        it 'should return the terms' do
+          model.terms.to_a.sort.should == ['bar', 'baz', 'foo', 'foo-foo']
+        end
+      end
+      describe '#inverse_document_frequency' do
+        it 'should return the inverse document frequency' do
+          model.idf('foo').should be_within(0.001).of(Math.log((4 - 1 + 0.5) / (1 + 0.5)))
+        end
+        it 'should return the inverse document frequency of a non-occurring term' do
+          model.idf('xxx').should be_within(0.001).of(Math.log((4 - 0 + 0.5) / (0 + 0.5)))
+        end
+      end
+      describe '#term_frequency' do
+        it 'should return the term frequency if no tokens given' do
+          model.tf(document, 'foo').should == (2 * 2.2) / (2 + 0.3 + 0.9 * 4 / 5.5)
+        end
+        it 'should return the term frequency if tokens given' do
+          model.tf(document_with_tokens, 'foo-foo').should == (1 * 2.2) / (1 + 0.3 + 0.9 * 4 / 5.5)
+        end
+        it 'should return no term frequency if no text given' do
+          model.tf(document_without_text, 'foo').should == 0
+        end
+        it 'should return the term frequency if term counts given' do
+          model.tf(document_with_term_counts, 'bar').should == (5 * 2.2) / (5 + 0.3 + 0.9 * 4 / 5.5)
+        end
+        it 'should return the term frequency of a non-occurring term' do
+          model.tf(document, 'xxx').should == 0
+        end
+        it 'should return the term frequency in a non-occurring document' do
+          model.tf(non_corpus_document, 'foo').should == (3 * 2.2) / (3 + 0.3 + 0.9 * 4 / 5.5)
+        end
+      end
+      describe '#term_frequency_inverse_document_frequency' do
+        it 'should return the tf*idf' do
+          model.tfidf(document, 'foo').should be_within(0.001).of(Math.log((4 - 1 + 0.5) / (1 + 0.5)) * (2 * 2.2) / (2 + 0.3 + 0.9 * 4 / 5.5))
+        end
+        it 'should return the tf*idf of a non-occurring term' do
+          model.tfidf(document, 'xxx').should == 0
+        end
+        it 'should return the tf*idf in a non-occurring term' do
+          model.tfidf(non_corpus_document, 'foo').should be_within(0.001).of(Math.log((4 - 1 + 0.5) / (1 + 0.5)) * (3 * 2.2) / (3 + 0.3 + 0.9 * 4 / 5.5))
+        end
+      end
+      describe '#similarity_matrix' do
+        it 'should return the similarity matrix' do
+          expected = [
+            1.0,   0.564, 0.0, 0.479,
+            0.564, 1.0,   0.0, 0.540,
+            0.0,   0.0,   0.0, 0.0,
+            0.479, 0.540, 0.0, 1.0,
+          ]
+          similarity_matrix_values(model).each_with_index do |value,i|
+            value.should be_within(0.001).of(expected[i])
+          end
+        end
+      end
+    end
+  end
+end

data/spec/document_spec.rb CHANGED

@@ -1,136 +1,138 @@
 require 'spec_helper'
 # @see https://github.com/bbcrd/Similarity/blob/master/test/test_document.rb
-describe TfIdfSimilarity::Document do
-  let :text do
-    "FOO-foo BAR bar \r\n\t 123 !@#"
-  end
-  let :tokens do
-    ['FOO-foo', 'BAR', 'bar', "\r\n\t", '123', '!@#']
-  end
-  let :document_without_text do
-    TfIdfSimilarity::Document.new('')
-  end
-  let :document do
-    TfIdfSimilarity::Document.new(text)
-  end
-  let :document_with_id do
-    TfIdfSimilarity::Document.new(text, :id => 'baz')
-  end
-  let :document_with_tokens do
-    TfIdfSimilarity::Document.new(text, :tokens => tokens)
-  end
-  let :document_with_term_counts do
-    TfIdfSimilarity::Document.new(text, :term_counts => {'bar' => 5, 'baz' => 10})
-  end
-  let :document_with_term_counts_and_size do
-    TfIdfSimilarity::Document.new(text, :term_counts => {'bar' => 5, 'baz' => 10}, :size => 20)
-  end
-  let :document_with_size do
-    TfIdfSimilarity::Document.new(text, :size => 10)
-  end
-  describe '#id' do
-    it 'should return the ID if no ID given' do
-      document.id.should == document.object_id
+module TfIdfSimilarity
+  describe Document do
+    let :text do
+      "FOO-foo BAR bar \r\n\t 123 !@#"
     end
-    it 'should return the given ID' do
-      document_with_id.id.should == 'baz'
+    let :tokens do
+      ['FOO-foo', 'BAR', 'bar', "\r\n\t", '123', '!@#']
     end
-  end
-  describe '#text' do
-    it 'should return the text' do
-      document.text.should == text
+    let :document_without_text do
+      Document.new('')
     end
-  end
-  describe '#size' do
-    it 'should return the number of tokens if no tokens given' do
-      document.size.should == 4
+    let :document do
+      Document.new(text)
     end
-    it 'should return the number of tokens if tokens given' do
-      document_with_tokens.size.should == 3
+    let :document_with_id do
+      Document.new(text, :id => 'baz')
     end
-    it 'should return the number of tokens if no text given' do
-      document_without_text.size.should == 0
+    let :document_with_tokens do
+      Document.new(text, :tokens => tokens)
     end
-    it 'should return the number of tokens if term counts given' do
-      document_with_term_counts.size.should == 15
+    let :document_with_term_counts do
+      Document.new(text, :term_counts => {'bar' => 5, 'baz' => 10})
     end
-    it 'should return the given number of tokens if term counts and size given' do
-      document_with_term_counts_and_size.size.should == 20
+    let :document_with_term_counts_and_size do
+      Document.new(text, :term_counts => {'bar' => 5, 'baz' => 10}, :size => 20)
     end
-    it 'should not return the given number of tokens if term counts not given' do
-      document_with_size.size.should_not == 10
+    let :document_with_size do
+      Document.new(text, :size => 10)
     end
-  end
-  describe '#term_counts' do
-    it 'should return the term counts if no tokens given' do
-      document.term_counts.should == {'foo' => 2, 'bar' => 2}
-    end
+    describe '#id' do
+      it 'should return the ID if no ID given' do
+        document.id.should == document.object_id
+      end
-    it 'should return the term counts if tokens given' do
-      document_with_tokens.term_counts.should == {'foo-foo' => 1, 'bar' => 2}
+      it 'should return the given ID' do
+        document_with_id.id.should == 'baz'
+      end
     end
-    it 'should return no term counts if no text given' do
-      document_without_text.term_counts.should == {}
+    describe '#text' do
+      it 'should return the text' do
+        document.text.should == text
+      end
     end
-    it 'should return the term counts if term counts given' do
-      document_with_term_counts.term_counts.should == {'bar' => 5, 'baz' => 10}
-    end
-  end
+    describe '#size' do
+      it 'should return the number of tokens if no tokens given' do
+        document.size.should == 4
+      end
-  describe '#terms' do
-    it 'should return the terms if no tokens given' do
-      document.terms.sort.should == ['bar', 'foo']
-    end
+      it 'should return the number of tokens if tokens given' do
+        document_with_tokens.size.should == 3
+      end
-    it 'should return the terms if tokens given' do
-      document_with_tokens.terms.sort.should == ['bar', 'foo-foo']
-    end
+      it 'should return the number of tokens if no text given' do
+        document_without_text.size.should == 0
+      end
-    it 'should return no terms if no text given' do
-      document_without_text.terms.should == []
-    end
+      it 'should return the number of tokens if term counts given' do
+        document_with_term_counts.size.should == 15
+      end
-    it 'should return the terms if term counts given' do
-      document_with_term_counts.terms.sort.should == ['bar', 'baz']
-    end
-  end
+      it 'should return the given number of tokens if term counts and size given' do
+        document_with_term_counts_and_size.size.should == 20
+      end
-  describe '#term_count' do
-    it 'should return the term count if no tokens given' do
-      document.term_count('foo').should == 2
+      it 'should not return the given number of tokens if term counts not given' do
+        document_with_size.size.should_not == 10
+      end
     end
-    it 'should return the term count if tokens given' do
-      document_with_tokens.term_count('foo-foo').should == 1
+    describe '#term_counts' do
+      it 'should return the term counts if no tokens given' do
+        document.term_counts.should == {'foo' => 2, 'bar' => 2}
+      end
+      it 'should return the term counts if tokens given' do
+        document_with_tokens.term_counts.should == {'foo-foo' => 1, 'bar' => 2}
+      end
+      it 'should return no term counts if no text given' do
+        document_without_text.term_counts.should == {}
+      end
+      it 'should return the term counts if term counts given' do
+        document_with_term_counts.term_counts.should == {'bar' => 5, 'baz' => 10}
+      end
     end
-    it 'should return no term count if no text given' do
-      document_without_text.term_count('foo').should == 0
+    describe '#terms' do
+      it 'should return the terms if no tokens given' do
+        document.terms.sort.should == ['bar', 'foo']
+      end
+      it 'should return the terms if tokens given' do
+        document_with_tokens.terms.sort.should == ['bar', 'foo-foo']
+      end
+      it 'should return no terms if no text given' do
+        document_without_text.terms.should == []
+      end
+      it 'should return the terms if term counts given' do
+        document_with_term_counts.terms.sort.should == ['bar', 'baz']
+      end
     end
-    it 'should return the term count if term counts given' do
-      document_with_term_counts.term_count('bar').should == 5
+    describe '#term_count' do
+      it 'should return the term count if no tokens given' do
+        document.term_count('foo').should == 2
+      end
+      it 'should return the term count if tokens given' do
+        document_with_tokens.term_count('foo-foo').should == 1
+      end
+      it 'should return no term count if no text given' do
+        document_without_text.term_count('foo').should == 0
+      end
+      it 'should return the term count if term counts given' do
+        document_with_term_counts.term_count('bar').should == 5
+      end
     end
   end
 end