RubyGems - tf-idf-similarity - Versions diffs - 0.0.9 → 0.1.0 - Mend

tf-idf-similarity 0.0.9 → 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (22) hide show

data/.travis.yml +29 -0
data/Gemfile +4 -0
data/README.md +41 -29
data/lib/tf-idf-similarity.rb +12 -1
data/lib/tf-idf-similarity/document.rb +35 -28
data/lib/tf-idf-similarity/extras/document.rb +2 -125
data/lib/tf-idf-similarity/extras/tf_idf_model.rb +192 -0
data/lib/tf-idf-similarity/matrix_methods.rb +164 -0
data/lib/tf-idf-similarity/term_count_model.rb +78 -0
data/lib/tf-idf-similarity/tf_idf_model.rb +81 -0
data/lib/tf-idf-similarity/token.rb +34 -12
data/lib/tf-idf-similarity/version.rb +1 -1
data/spec/document_spec.rb +136 -0
data/spec/extras/tf_idf_model_spec.rb +269 -0
data/spec/spec_helper.rb +21 -0
data/spec/term_count_model_spec.rb +108 -0
data/spec/tf_idf_model_spec.rb +174 -0
data/spec/token_spec.rb +34 -0
data/td-idf-similarity.gemspec +3 -3
metadata +91 -63
data/lib/tf-idf-similarity/collection.rb +0 -205
data/lib/tf-idf-similarity/extras/collection.rb +0 -110

data/spec/extras/tf_idf_model_spec.rb ADDED Viewed

@@ -0,0 +1,269 @@
+require 'spec_helper'
+require 'tf-idf-similarity/extras/document'
+require 'tf-idf-similarity/extras/tf_idf_model'
+describe TfIdfSimilarity::TfIdfModel do
+  # Builds a TfIdfSimilarity::Document from +text+, forwarding +opts+ unchanged
+  # to the constructor (the examples below use it to pass :tokens).
+  def build_document(text, opts = {})
+    TfIdfSimilarity::Document.new(text, opts)
+  end
+  # Builds a TfIdfSimilarity::TfIdfModel over +documents+, backed by the matrix
+  # library named by the MATRIX_LIBRARY constant.
+  def build_model(documents)
+    TfIdfSimilarity::TfIdfModel.new(documents, :library => MATRIX_LIBRARY)
+  end
+  # @see https://github.com/josephwilk/rsemantic/blob/master/spec/semantic/transform/tf_idf_transform_spec.rb
+  # No relevant tests to reproduce.
+  # @see https://github.com/mkdynamic/vss/blob/master/test/test.rb
+  context 'comparing to vss gem' do
+    let :documents do
+      [ "I'm not even going to mention any TV series.",
+        "The Wire is the best thing ever. Fact.",
+        "Some would argue that Lost got a bit too wierd after season 2.",
+        "Lost is surely not in the same league as The Wire.",
+        "You cannot compare the The Wire and Lost.",
+      ].map do |text|
+        build_document(text)
+      end
+    end
+    let :model do
+      build_model(documents)
+    end
+    pending "Add TfIdfSimilarity::TfIdfModel#search"
+  end
+  # @see https://github.com/bbcrd/Similarity/blob/master/test/test_corpus.rb
+  # @see https://github.com/bbcrd/Similarity/blob/master/test/test_document.rb
+  # @see https://github.com/bbcrd/Similarity/blob/master/test/test_term_document_matrix.rb
+  context 'comparing to similarity gem' do
+    let :document do
+      TfIdfSimilarity::Document.new('cow cow cow horse horse elephant')
+    end
+    def build_model_from_text(*texts)
+      build_model(texts.map{|text| build_document(text)})
+    end
+    let :model_a do
+      build_model_from_text("cow horse sheep", "horse bird dog")
+    end
+    let :model_b do
+      build_model_from_text("cow cow cow bird", "horse horse horse bird")
+    end
+    let :model_c do
+      build_model_from_text("cow cow cow", "horse horse horse")
+    end
+    # Normalizes to the number of tokens in the document.
+    # @see https://github.com/bbcrd/Similarity/blob/master/lib/similarity/document.rb#L42
+    def tf(term)
+      document.term_count(term) / document.size.to_f
+    end
+    # Does not add one to the inverse document frequency.
+    # @see https://github.com/bbcrd/Similarity/blob/master/lib/similarity/corpus.rb#L44
+    def idf(model, term)
+      model.plain_idf(term, 0, 1)
+    end
+    it 'should return the terms' do
+      [ "the quick brown fox",
+        "the quick     brown   fox",
+        "The Quick Brown Fox",
+        'The, Quick! Brown. "Fox"',
+      ].each do |text|
+        build_document(text).terms.sort.should == ["brown", "fox", "quick", "the"]
+      end
+    end
+    it 'should return the number of documents' do
+      model_a.documents.size.should == 2
+    end
+    it 'should return the number of terms' do
+      document.terms.size.should == 3
+      model_a.terms.size.should == 5
+    end
+    it 'should return the term frequency' do
+      tf('cow').should == 0.5
+      tf('horse').should be_within(0.001).of(0.333)
+      tf('sheep').should == 0
+    end
+    it 'should return the similarity matrix' do
+      pending "Calculate the tf*idf matrix like the similarity gem does"
+    end
+    it 'should return the number of documents in which a term appears' do
+      model_b.document_count('cow').should == 1
+      model_b.document_count('horse').should == 1
+      model_b.document_count('bird').should == 2
+    end
+    it 'should return the inverse document frequency' do
+      idf(model_c, 'cow').should be_within(0.001).of(0.0)
+      idf(model_c, 'bird').should be_within(0.001).of(0.693)
+    end
+    it 'should return the document vector' do
+      pending "Calculate the tf*idf matrix like the similarity gem does"
+    end
+  end
+  # @see https://github.com/mchung/tf-idf/blob/master/spec/tf-idf_spec.rb
+  context 'comparing to tf-idf gem' do
+    # Normalizes to the number of unique tokens (terms) in the document.
+    # @see https://github.com/mchung/tf-idf/blob/master/lib/tf-idf.rb#L172
+    # Fifty documents; document n contains a word only while n is at or below
+    # that word's threshold, so 'the' is in 23 documents, 'a' in 17, 'said' in 5,
+    # 'phone' in 2, and 'girl' and 'moon' in 1 each.
+    let :corpus_a do
+      1.upto(50).map do |n|
+        text = []
+        text << 'the' if n <= 23
+        text << 'a' if n <= 17
+        text << 'said' if n <= 5
+        text << 'phone' if n <= 2
+        text << 'girl' if n <= 1
+        text << 'moon' if n <= 1
+        build_document(text * ' ')
+      end
+    end
+    # Same as corpus_a but without 'moon' in any document.
+    let :corpus_b do
+      1.upto(50).map do |n|
+        text = []
+        text << 'the' if n <= 23
+        text << 'a' if n <= 17
+        text << 'said' if n <= 5
+        text << 'phone' if n <= 2
+        text << 'girl' if n <= 1
+        build_document(text * ' ')
+      end
+    end
+    let :model_a do
+      build_model(corpus_a)
+    end
+    let :model_b do
+      build_model(corpus_b)
+    end
+    it 'should return the number of documents' do
+      model_a.documents.size.should == 50
+    end
+    it 'should return the number of terms' do
+      model_a.terms.size.should == 6
+    end
+    # Adds one to the numerator when calculating inverse document frequency.
+    # Sets a default inverse document frequency for non-occurring terms.
+    # @note The tf-idf gem has a #doc_keywords method for non-corpus documents.
+    # @see https://github.com/mchung/tf-idf/blob/master/lib/tf-idf.rb#L153
+    # NOTE(review): plain_idf is defined outside this file; the expected values
+    # below are consistent with log((documents + 1) / (document_count + 1)),
+    # e.g. log(51 / 2) for 'moon' in model_a -- confirm against the model.
+    # `default` is reassigned for each model, so the statement order matters.
+    it 'should return the inverse document frequency' do
+      # should query IDF for nonexistent terms
+      default = model_a.plain_idf('xxx', 1, 1)
+      model_a.plain_idf('nonexistent', 1, 1).should == default
+      model_a.plain_idf('THE', 1, 1).should == default
+      # should query IDF for existent terms
+      model_a.plain_idf('a', 1, 1).should > model_a.plain_idf('the', 1, 1)
+      model_a.plain_idf('girl', 1, 1).should == model_a.plain_idf('moon', 1, 1)
+      # should add input documents to an existing corpus
+      model_a.plain_idf('water', 1, 1).should == default
+      model_a.plain_idf('moon', 1, 1).should be_within(0.001).of(3.238) # 3.23867845216438
+      model_a.plain_idf('said', 1, 1).should be_within(0.001).of(2.140) # 2.14006616349627
+      # A new model is built over 51 documents rather than extending model_a.
+      model = build_model(corpus_a + [build_document('water moon')])
+      model.plain_idf('water', 1, 1).should be_within(0.001).of(3.258) # 3.25809653802148
+      model.plain_idf('moon', 1, 1).should be_within(0.001).of(2.852) # 2.85263142991332
+      model.plain_idf('said', 1, 1).should be_within(0.001).of(2.159) # 2.15948424935337
+      # should add input documents to an empty corpus
+      # The empty-corpus checks are skipped when MATRIX_LIBRARY is :gsl
+      # (presumably that backend cannot build an empty model -- confirm).
+      unless MATRIX_LIBRARY == :gsl
+        model_c = build_model([])
+        default = model_c.plain_idf('xxx', 1, 1)
+        model_c.plain_idf('moon', 1, 1).should == default
+        model_c.plain_idf('water', 1, 1).should == default
+        model_c.plain_idf('said', 1, 1).should == default
+      end
+      model_d = build_model([
+        build_document('moon'),
+        build_document('moon said hello'),
+      ])
+      default = model_d.plain_idf('xxx', 1, 1)
+      model_d.plain_idf('water', 1, 1).should == default
+      model_d.plain_idf('said', 1, 1).should be_within(0.001).of(0.405) # 0.405465108108164
+      model_d.plain_idf('moon', 1, 1).should == 0 # 0
+      # should observe stopwords list
+      # 'moon' plays the stopword here: corpus_b never contains it, and the two
+      # extra documents below pass :tokens lists that leave it out, so the
+      # assertions expect it to get the same value as a non-occurring term.
+      default = model_b.plain_idf('xxx', 1, 1)
+      model_b.plain_idf('water', 1, 1).should == default
+      model_b.plain_idf('moon', 1, 1).should == default # returns 0 for stopwords
+      model_b.plain_idf('said', 1, 1).should be_within(0.001).of(2.140) # 2.14006616349627
+      model_e = build_model(corpus_b + [
+        build_document('moon', :tokens => %w()),
+        build_document('moon and water', :tokens => %w(and water)),
+      ])
+      default = model_e.plain_idf('xxx', 1, 1)
+      model_e.plain_idf('water', 1, 1).should be_within(0.001).of(3.277) # 3.27714473299218
+      model_e.plain_idf('moon', 1, 1).should == default # returns 0 for stopwords
+      model_e.plain_idf('said', 1, 1).should be_within(0.001).of(2.178) # 2.17853244432407
+    end
+  end
+  # @see https://github.com/reddavis/TF-IDF/blob/master/spec/tf_idf_spec.rb
+  context 'comparing to tf_idf gem' do
+    let :one do
+      build_document('a a a a a a a a b b')
+    end
+    let :two do
+      build_document('a a')
+    end
+    let :model do
+      build_model([one, two])
+    end
+    # Normalizes to the number of tokens in the document.
+    # @see https://github.com/reddavis/TF-IDF/blob/master/lib/tf_idf.rb#L76
+    def tf
+      one.term_count('b') / one.size.to_f
+    end
+    # Performs plain inverse document frequency with base 10.
+    # @see https://github.com/reddavis/TF-IDF/blob/master/lib/tf_idf.rb#L50
+    def idf
+      model.plain_idf('b') / Math.log(10)
+    end
+    it 'should return the term frequency' do
+      tf.should == 0.2
+      model.tf(one, 'b').should be_within(0.001).of(1.414)
+    end
+    it 'should return the inverse document frequency' do
+      idf.should be_within(0.001).of(0.301) # 0.30102999
+      model.idf('b').should == 1
+    end
+    it 'should return the tf*idf' do
+      (tf * idf).should be_within(0.001).of(0.060) # 0.0602
+      model.tfidf(one, 'b').should be_within(0.001).of(1.414)
+    end
+  end
+end

data/spec/spec_helper.rb ADDED Viewed

@@ -0,0 +1,21 @@
+require 'rubygems'
+require 'coveralls'
+Coveralls.wear!
+require 'rspec'
+require File.dirname(__FILE__) + '/../lib/tf-idf-similarity'
+MATRIX_LIBRARY = (ENV['MATRIX_LIBRARY'] || :matrix).to_sym
+puts "\n==> Running specs with #{MATRIX_LIBRARY}"
+case MATRIX_LIBRARY
+when :gsl
+  require 'gsl'
+when :narray
+  require 'narray'
+when :nmatrix
+  require 'nmatrix'
+else
+  require 'matrix'
+end

data/spec/term_count_model_spec.rb ADDED Viewed

@@ -0,0 +1,108 @@
+require 'spec_helper'
+describe TfIdfSimilarity::TermCountModel do
+  let :text do
+    "FOO-foo BAR bar \r\n\t 123 !@#"
+  end
+  let :tokens do
+    ['FOO-foo', 'BAR', 'bar', "\r\n\t", '123', '!@#']
+  end
+  let :document_without_text do
+    TfIdfSimilarity::Document.new('')
+  end
+  let :document do
+    TfIdfSimilarity::Document.new(text)
+  end
+  let :document_with_tokens do
+    TfIdfSimilarity::Document.new(text, :tokens => tokens)
+  end
+  let :document_with_term_counts do
+    TfIdfSimilarity::Document.new(text, :term_counts => {'bar' => 5, 'baz' => 10})
+  end
+  context 'without documents', :unless => lambda{MATRIX_LIBRARY == :gsl} do
+    let :model do
+      TfIdfSimilarity::TermCountModel.new([], :library => MATRIX_LIBRARY)
+    end
+    describe '#documents' do
+      it 'should be empty' do
+        model.documents.should be_empty
+      end
+    end
+    describe '#terms' do
+      it 'should be empty' do
+        model.terms.should be_empty
+      end
+    end
+    describe '#average_document_size' do
+      it 'should be zero' do
+        model.average_document_size.should == 0
+      end
+    end
+    describe '#document_count' do
+      it 'should be zero' do
+        model.document_count('xxx').should == 0
+      end
+    end
+    describe '#term_count' do
+      it 'should be zero' do
+        model.term_count('xxx').should == 0
+      end
+    end
+  end
+  context 'with documents' do
+    let :documents do
+      [
+        document, # 4 tokens
+        document_with_tokens, # 3 tokens
+        document_without_text, # 0 tokens
+        document_with_term_counts, # 15 tokens
+      ]
+    end
+    let :model do
+      TfIdfSimilarity::TermCountModel.new(documents, :library => MATRIX_LIBRARY)
+    end
+    describe '#documents' do
+      it 'should return the documents' do
+        model.documents.should == documents
+      end
+    end
+    describe '#terms' do
+      it 'should return the terms' do
+        model.terms.to_a.sort.should == ['bar', 'baz', 'foo', 'foo-foo']
+      end
+    end
+    describe '#average_document_size' do
+      it 'should return the average number of tokens in a document' do
+        model.average_document_size.should == 5.5
+      end
+    end
+    describe '#document_count' do
+      it 'should return the number of documents the term appears in' do
+        model.document_count('bar').should == 3
+      end
+    end
+    describe '#term_count' do
+      it 'should return the number of times the term appears in the corpus' do
+        model.term_count('bar').should == 9
+      end
+    end
+  end
+end

data/spec/tf_idf_model_spec.rb ADDED Viewed

@@ -0,0 +1,174 @@
+require 'spec_helper'
+describe TfIdfSimilarity::TfIdfModel do
+  let :text do
+    "FOO-foo BAR bar \r\n\t 123 !@#"
+  end
+  let :tokens do
+    ['FOO-foo', 'BAR', 'bar', "\r\n\t", '123', '!@#']
+  end
+  let :document_without_text do
+    TfIdfSimilarity::Document.new('')
+  end
+  let :document do
+    TfIdfSimilarity::Document.new(text)
+  end
+  let :document_with_tokens do
+    TfIdfSimilarity::Document.new(text, :tokens => tokens)
+  end
+  let :document_with_term_counts do
+    TfIdfSimilarity::Document.new(text, :term_counts => {'bar' => 5, 'baz' => 10})
+  end
+  let :non_corpus_document do
+    TfIdfSimilarity::Document.new('foo foo foo')
+  end
+  def similarity_matrix_values(model)
+    matrix = model.similarity_matrix
+    if MATRIX_LIBRARY == :nmatrix
+      matrix.each.to_a
+    else
+      matrix.to_a.flatten
+    end
+  end
+  context 'without documents', :unless => lambda{MATRIX_LIBRARY == :gsl} do
+    let :model do
+      TfIdfSimilarity::TfIdfModel.new([], :library => MATRIX_LIBRARY)
+    end
+    describe '#documents' do
+      it 'should be empty' do
+        model.documents.should be_empty
+      end
+    end
+    describe '#terms' do
+      it 'should be empty' do
+        model.terms.should be_empty
+      end
+    end
+    describe '#inverse_document_frequency' do
+      it 'should return negative infinity' do
+        model.idf('foo').should == -1/0.0 # -Infinity
+      end
+    end
+    describe '#term_frequency' do
+      it 'should return the term frequency' do
+        model.tf(document, 'foo').should == Math.sqrt(2)
+      end
+    end
+    describe '#term_frequency_inverse_document_frequency' do
+      it 'should return negative infinity' do
+        model.tfidf(document, 'foo').should == -1/0.0 # -Infinity
+      end
+    end
+    describe '#similarity_matrix' do
+      it 'should be empty' do
+        similarity_matrix_values(model).should be_empty
+      end
+    end
+  end
+  context 'with documents' do
+    let :documents do
+      [
+        document,
+        document_with_tokens,
+        document_without_text,
+        document_with_term_counts,
+      ]
+    end
+    let :model do
+      TfIdfSimilarity::TfIdfModel.new(documents, :library => MATRIX_LIBRARY)
+    end
+    describe '#documents' do
+      it 'should return the documents' do
+        model.documents.should == documents
+      end
+    end
+    describe '#terms' do
+      it 'should return the terms' do
+        model.terms.to_a.sort.should == ['bar', 'baz', 'foo', 'foo-foo']
+      end
+    end
+    describe '#inverse_document_frequency' do
+      it 'should return the inverse document frequency' do
+        model.idf('foo').should be_within(0.001).of(1 + Math.log(2))
+      end
+      it 'should return the inverse document frequency of a non-occurring term' do
+        model.idf('xxx').should be_within(0.001).of(1 + Math.log(4))
+      end
+    end
+    describe '#term_frequency' do
+      it 'should return the term frequency if no tokens given' do
+        model.tf(document, 'foo').should == Math.sqrt(2)
+      end
+      it 'should return the term frequency if tokens given' do
+        model.tf(document_with_tokens, 'foo-foo').should == 1
+      end
+      it 'should return no term frequency if no text given' do
+        model.tf(document_without_text, 'foo').should == 0
+      end
+      it 'should return the term frequency if term counts given' do
+        model.tf(document_with_term_counts, 'bar').should == Math.sqrt(5)
+      end
+      it 'should return the term frequency of a non-occurring term' do
+        model.tf(document, 'xxx').should == 0
+      end
+      it 'should return the term frequency in a non-occurring document' do
+        model.tf(non_corpus_document, 'foo').should == Math.sqrt(3)
+      end
+    end
+    describe '#term_frequency_inverse_document_frequency' do
+      it 'should return the tf*idf' do
+        model.tfidf(document, 'foo').should be_within(0.001).of((1 + Math.log(2)) * Math.sqrt(2))
+      end
+      it 'should return the tf*idf of a non-occurring term' do
+        model.tfidf(document, 'xxx').should == 0
+      end
+      it 'should return the tf*idf in a non-occurring term' do
+        model.tfidf(non_corpus_document, 'foo').should be_within(0.001).of((1 + Math.log(2)) * Math.sqrt(3))
+      end
+    end
+    describe '#similarity_matrix' do
+      it 'should return the similarity matrix' do
+        expected = [
+          1.0,   0.326, 0.0, 0.195,
+          0.326, 1.0,   0.0, 0.247,
+          0.0,   0.0,   0.0, 0.0,
+          0.195, 0.247, 0.0, 1.0,
+        ]
+        similarity_matrix_values(model).each_with_index do |value,i|
+          value.should be_within(0.001).of(expected[i])
+        end
+      end
+    end
+  end
+end