RubyGems - tf-idf-similarity - Versions diffs - 0.1.3 → 0.1.4 - Mend

tf-idf-similarity 0.1.3 → 0.1.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (23)

checksums.yaml +4 -4
data/.travis.yml +1 -8
data/Gemfile +2 -2
data/README.md +40 -9
data/lib/tf-idf-similarity.rb +1 -0
data/lib/tf-idf-similarity/bm25_model.rb +23 -62
data/lib/tf-idf-similarity/document.rb +69 -67
data/lib/tf-idf-similarity/extras/document.rb +10 -8
data/lib/tf-idf-similarity/extras/tf_idf_model.rb +157 -155
data/lib/tf-idf-similarity/matrix_methods.rb +137 -135
data/lib/tf-idf-similarity/model.rb +66 -0
data/lib/tf-idf-similarity/term_count_model.rb +59 -57
data/lib/tf-idf-similarity/tf_idf_model.rb +21 -60
data/lib/tf-idf-similarity/token.rb +39 -37
data/lib/tf-idf-similarity/version.rb +1 -1
data/spec/bm25_model_spec.rb +200 -0
data/spec/document_spec.rb +98 -96
data/spec/extras/tf_idf_model_spec.rb +224 -222
data/spec/spec_helper.rb +6 -0
data/spec/term_count_model_spec.rb +76 -74
data/spec/tf_idf_model_spec.rb +143 -117
data/spec/token_spec.rb +23 -21
metadata +6 -2

data/spec/extras/tf_idf_model_spec.rb CHANGED

@@ -3,267 +3,269 @@ require 'spec_helper'
 require 'tf-idf-similarity/extras/document'
 require 'tf-idf-similarity/extras/tf_idf_model'
-describe TfIdfSimilarity::TfIdfModel do
-  def build_document(text, opts = {})
-    TfIdfSimilarity::Document.new(text, opts)
-  end
-  def build_model(documents)
-    TfIdfSimilarity::TfIdfModel.new(documents, :library => MATRIX_LIBRARY)
-  end
-  # @see https://github.com/josephwilk/rsemantic/blob/master/spec/semantic/transform/tf_idf_transform_spec.rb
-  # No relevant tests to reproduce.
+module TfIdfSimilarity
+  describe TfIdfModel do
+    def build_document(text, opts = {})
+      Document.new(text, opts)
+    end
+    def build_model(documents)
+      TfIdfModel.new(documents, :library => MATRIX_LIBRARY)
+    end
+    # @see https://github.com/josephwilk/rsemantic/blob/master/spec/semantic/transform/tf_idf_transform_spec.rb
+    # No relevant tests to reproduce.
+    # @see https://github.com/mkdynamic/vss/blob/master/test/test.rb
+    context 'comparing to vss gem' do
+      let :documents do
+        [ "I'm not even going to mention any TV series.",
+          "The Wire is the best thing ever. Fact.",
+          "Some would argue that Lost got a bit too wierd after season 2.",
+          "Lost is surely not in the same league as The Wire.",
+          "You cannot compare the The Wire and Lost.",
+        ].map do |text|
+          build_document(text)
+        end
+      end
-  # @see https://github.com/mkdynamic/vss/blob/master/test/test.rb
-  context 'comparing to vss gem' do
-    let :documents do
-      [ "I'm not even going to mention any TV series.",
-        "The Wire is the best thing ever. Fact.",
-        "Some would argue that Lost got a bit too wierd after season 2.",
-        "Lost is surely not in the same league as The Wire.",
-        "You cannot compare the The Wire and Lost.",
-      ].map do |text|
-        build_document(text)
+      let :model do
+        build_model(documents)
       end
-    end
-    let :model do
-      build_model(documents)
+      pending "Add #search"
     end
-    pending "Add TfIdfSimilarity::TfIdfModel#search"
-  end
-  # @see https://github.com/bbcrd/Similarity/blob/master/test/test_corpus.rb
-  # @see https://github.com/bbcrd/Similarity/blob/master/test/test_document.rb
-  # @see https://github.com/bbcrd/Similarity/blob/master/test/test_term_document_matrix.rb
-  context 'comparing to similarity gem' do
-    let :document do
-      TfIdfSimilarity::Document.new('cow cow cow horse horse elephant')
-    end
+    # @see https://github.com/bbcrd/Similarity/blob/master/test/test_corpus.rb
+    # @see https://github.com/bbcrd/Similarity/blob/master/test/test_document.rb
+    # @see https://github.com/bbcrd/Similarity/blob/master/test/test_term_document_matrix.rb
+    context 'comparing to similarity gem' do
+      let :document do
+        Document.new('cow cow cow horse horse elephant')
+      end
-    def build_model_from_text(*texts)
-      build_model(texts.map{|text| build_document(text)})
-    end
+      def build_model_from_text(*texts)
+        build_model(texts.map{|text| build_document(text)})
+      end
-    let :model_a do
-      build_model_from_text("cow horse sheep", "horse bird dog")
-    end
+      let :model_a do
+        build_model_from_text("cow horse sheep", "horse bird dog")
+      end
-    let :model_b do
-      build_model_from_text("cow cow cow bird", "horse horse horse bird")
-    end
+      let :model_b do
+        build_model_from_text("cow cow cow bird", "horse horse horse bird")
+      end
-    let :model_c do
-      build_model_from_text("cow cow cow", "horse horse horse")
-    end
+      let :model_c do
+        build_model_from_text("cow cow cow", "horse horse horse")
+      end
-    # Normalizes to the number of tokens in the document.
-    # @see https://github.com/bbcrd/Similarity/blob/master/lib/similarity/document.rb#L42
-    def tf(term)
-      document.term_count(term) / document.size.to_f
-    end
+      # Normalizes to the number of tokens in the document.
+      # @see https://github.com/bbcrd/Similarity/blob/master/lib/similarity/document.rb#L42
+      def tf(term)
+        document.term_count(term) / document.size.to_f
+      end
-    # Does not add one to the inverse document frequency.
-    # @see https://github.com/bbcrd/Similarity/blob/master/lib/similarity/corpus.rb#L44
-    def idf(model, term)
-      model.plain_idf(term, 0, 1)
-    end
+      # Does not add one to the inverse document frequency.
+      # @see https://github.com/bbcrd/Similarity/blob/master/lib/similarity/corpus.rb#L44
+      def idf(model, term)
+        model.plain_idf(term, 0, 1)
+      end
-    it 'should return the terms' do
-      [ "the quick brown fox",
-        "the quick     brown   fox",
-        "The Quick Brown Fox",
-        'The, Quick! Brown. "Fox"',
-      ].each do |text|
-        build_document(text).terms.sort.should == ["brown", "fox", "quick", "the"]
+      it 'should return the terms' do
+        [ "the quick brown fox",
+          "the quick     brown   fox",
+          "The Quick Brown Fox",
+          'The, Quick! Brown. "Fox"',
+        ].each do |text|
+          build_document(text).terms.sort.should == ["brown", "fox", "quick", "the"]
+        end
       end
-    end
-    it 'should return the number of documents' do
-      model_a.documents.size.should == 2
-    end
+      it 'should return the number of documents' do
+        model_a.documents.size.should == 2
+      end
-    it 'should return the number of terms' do
-      document.terms.size.should == 3
-      model_a.terms.size.should == 5
-    end
+      it 'should return the number of terms' do
+        document.terms.size.should == 3
+        model_a.terms.size.should == 5
+      end
-    it 'should return the term frequency' do
-      tf('cow').should == 0.5
-      tf('horse').should be_within(0.001).of(0.333)
-      tf('sheep').should == 0
-    end
+      it 'should return the term frequency' do
+        tf('cow').should == 0.5
+        tf('horse').should be_within(0.001).of(0.333)
+        tf('sheep').should == 0
+      end
-    it 'should return the similarity matrix' do
-      pending "Calculate the tf*idf matrix like the similarity gem does"
-    end
+      it 'should return the similarity matrix' do
+        pending "Calculate the tf*idf matrix like the similarity gem does"
+      end
-    it 'should return the number of documents in which a term appears' do
-      model_b.document_count('cow').should == 1
-      model_b.document_count('horse').should == 1
-      model_b.document_count('bird').should == 2
-    end
+      it 'should return the number of documents in which a term appears' do
+        model_b.document_count('cow').should == 1
+        model_b.document_count('horse').should == 1
+        model_b.document_count('bird').should == 2
+      end
-    it 'should return the inverse document frequency' do
-      idf(model_c, 'cow').should be_within(0.001).of(0.0)
-      idf(model_c, 'bird').should be_within(0.001).of(0.693)
-    end
+      it 'should return the inverse document frequency' do
+        idf(model_c, 'cow').should be_within(0.001).of(0.0)
+        idf(model_c, 'bird').should be_within(0.001).of(0.693)
+      end
-    it 'should return the document vector' do
-      pending "Calculate the tf*idf matrix like the similarity gem does"
+      it 'should return the document vector' do
+        pending "Calculate the tf*idf matrix like the similarity gem does"
+      end
     end
-  end
-  # @see https://github.com/mchung/tf-idf/blob/master/spec/tf-idf_spec.rb
-  context 'comparing to tf-idf gem' do
-    # Normalizes to the number of unique tokens (terms) in the document.
-    # @see https://github.com/mchung/tf-idf/blob/master/lib/tf-idf.rb#L172
-    let :corpus_a do
-      1.upto(50).map do |n|
-        text = []
-        text << 'the' if n <= 23
-        text << 'a' if n <= 17
-        text << 'said' if n <= 5
-        text << 'phone' if n <= 2
-        text << 'girl' if n <= 1
-        text << 'moon' if n <= 1
-        build_document(text * ' ')
+    # @see https://github.com/mchung/tf-idf/blob/master/spec/tf-idf_spec.rb
+    context 'comparing to tf-idf gem' do
+      # Normalizes to the number of unique tokens (terms) in the document.
+      # @see https://github.com/mchung/tf-idf/blob/master/lib/tf-idf.rb#L172
+      let :corpus_a do
+        1.upto(50).map do |n|
+          text = []
+          text << 'the' if n <= 23
+          text << 'a' if n <= 17
+          text << 'said' if n <= 5
+          text << 'phone' if n <= 2
+          text << 'girl' if n <= 1
+          text << 'moon' if n <= 1
+          build_document(text * ' ')
+        end
       end
-    end
-    let :corpus_b do
-      1.upto(50).map do |n|
-        text = []
-        text << 'the' if n <= 23
-        text << 'a' if n <= 17
-        text << 'said' if n <= 5
-        text << 'phone' if n <= 2
-        text << 'girl' if n <= 1
-        build_document(text * ' ')
+      let :corpus_b do
+        1.upto(50).map do |n|
+          text = []
+          text << 'the' if n <= 23
+          text << 'a' if n <= 17
+          text << 'said' if n <= 5
+          text << 'phone' if n <= 2
+          text << 'girl' if n <= 1
+          build_document(text * ' ')
+        end
       end
-    end
-    let :model_a do
-      build_model(corpus_a)
-    end
+      let :model_a do
+        build_model(corpus_a)
+      end
-    let :model_b do
-      build_model(corpus_b)
-    end
+      let :model_b do
+        build_model(corpus_b)
+      end
-    it 'should return the number of documents' do
-      model_a.documents.size.should == 50
-    end
+      it 'should return the number of documents' do
+        model_a.documents.size.should == 50
+      end
-    it 'should return the number of terms' do
-      model_a.terms.size.should == 6
-    end
+      it 'should return the number of terms' do
+        model_a.terms.size.should == 6
+      end
-    # Adds one to the numerator when calculating inverse document frequency.
-    # Sets a default inverse document frequency for non-occurring terms.
-    # @note The tf-idf gem has a #doc_keywords method for non-corpus documents.
-    # @see https://github.com/mchung/tf-idf/blob/master/lib/tf-idf.rb#L153
-    it 'should return the inverse document frequency' do
-      # should query IDF for nonexistent terms
-      default = model_a.plain_idf('xxx', 1, 1)
-      model_a.plain_idf('nonexistent', 1, 1).should == default
-      model_a.plain_idf('THE', 1, 1).should == default
-      # should query IDF for existent terms
-      model_a.plain_idf('a', 1, 1).should > model_a.plain_idf('the', 1, 1)
-      model_a.plain_idf('girl', 1, 1).should == model_a.plain_idf('moon', 1, 1)
-      # should add input documents to an existing corpus
-      model_a.plain_idf('water', 1, 1).should == default
-      model_a.plain_idf('moon', 1, 1).should be_within(0.001).of(3.238) # 3.23867845216438
-      model_a.plain_idf('said', 1, 1).should be_within(0.001).of(2.140) # 2.14006616349627
-      model = build_model(corpus_a + [build_document('water moon')])
-      model.plain_idf('water', 1, 1).should be_within(0.001).of(3.258) # 3.25809653802148
-      model.plain_idf('moon', 1, 1).should be_within(0.001).of(2.852) # 2.85263142991332
-      model.plain_idf('said', 1, 1).should be_within(0.001).of(2.159) # 2.15948424935337
-      # should add input documents to an empty corpus
-      unless MATRIX_LIBRARY == :gsl
-        model_c = build_model([])
-        default = model_c.plain_idf('xxx', 1, 1)
-        model_c.plain_idf('moon', 1, 1).should == default
-        model_c.plain_idf('water', 1, 1).should == default
-        model_c.plain_idf('said', 1, 1).should == default
-      end
-      model_d = build_model([
-        build_document('moon'),
-        build_document('moon said hello'),
-      ])
-      default = model_d.plain_idf('xxx', 1, 1)
-      model_d.plain_idf('water', 1, 1).should == default
-      model_d.plain_idf('said', 1, 1).should be_within(0.001).of(0.405) # 0.405465108108164
-      model_d.plain_idf('moon', 1, 1).should == 0 # 0
-      # should observe stopwords list
-      default = model_b.plain_idf('xxx', 1, 1)
-      model_b.plain_idf('water', 1, 1).should == default
-      model_b.plain_idf('moon', 1, 1).should == default # returns 0 for stopwords
-      model_b.plain_idf('said', 1, 1).should be_within(0.001).of(2.140) # 2.14006616349627
-      model_e = build_model(corpus_b + [
-        build_document('moon', :tokens => %w()),
-        build_document('moon and water', :tokens => %w(and water)),
-      ])
-      default = model_e.plain_idf('xxx', 1, 1)
-      model_e.plain_idf('water', 1, 1).should be_within(0.001).of(3.277) # 3.27714473299218
-      model_e.plain_idf('moon', 1, 1).should == default # returns 0 for stopwords
-      model_e.plain_idf('said', 1, 1).should be_within(0.001).of(2.178) # 2.17853244432407
+      # Adds one to the numerator when calculating inverse document frequency.
+      # Sets a default inverse document frequency for non-occurring terms.
+      # @note The tf-idf gem has a #doc_keywords method for non-corpus documents.
+      # @see https://github.com/mchung/tf-idf/blob/master/lib/tf-idf.rb#L153
+      it 'should return the inverse document frequency' do
+        # should query IDF for nonexistent terms
+        default = model_a.plain_idf('xxx', 1, 1)
+        model_a.plain_idf('nonexistent', 1, 1).should == default
+        model_a.plain_idf('THE', 1, 1).should == default
+        # should query IDF for existent terms
+        model_a.plain_idf('a', 1, 1).should > model_a.plain_idf('the', 1, 1)
+        model_a.plain_idf('girl', 1, 1).should == model_a.plain_idf('moon', 1, 1)
+        # should add input documents to an existing corpus
+        model_a.plain_idf('water', 1, 1).should == default
+        model_a.plain_idf('moon', 1, 1).should be_within(0.001).of(3.238) # 3.23867845216438
+        model_a.plain_idf('said', 1, 1).should be_within(0.001).of(2.140) # 2.14006616349627
+        model = build_model(corpus_a + [build_document('water moon')])
+        model.plain_idf('water', 1, 1).should be_within(0.001).of(3.258) # 3.25809653802148
+        model.plain_idf('moon', 1, 1).should be_within(0.001).of(2.852) # 2.85263142991332
+        model.plain_idf('said', 1, 1).should be_within(0.001).of(2.159) # 2.15948424935337
+        # should add input documents to an empty corpus
+        unless MATRIX_LIBRARY == :gsl
+          model_c = build_model([])
+          default = model_c.plain_idf('xxx', 1, 1)
+          model_c.plain_idf('moon', 1, 1).should == default
+          model_c.plain_idf('water', 1, 1).should == default
+          model_c.plain_idf('said', 1, 1).should == default
+        end
+        model_d = build_model([
+          build_document('moon'),
+          build_document('moon said hello'),
+        ])
+        default = model_d.plain_idf('xxx', 1, 1)
+        model_d.plain_idf('water', 1, 1).should == default
+        model_d.plain_idf('said', 1, 1).should be_within(0.001).of(0.405) # 0.405465108108164
+        model_d.plain_idf('moon', 1, 1).should == 0 # 0
+        # should observe stopwords list
+        default = model_b.plain_idf('xxx', 1, 1)
+        model_b.plain_idf('water', 1, 1).should == default
+        model_b.plain_idf('moon', 1, 1).should == default # returns 0 for stopwords
+        model_b.plain_idf('said', 1, 1).should be_within(0.001).of(2.140) # 2.14006616349627
+        model_e = build_model(corpus_b + [
+          build_document('moon', :tokens => %w()),
+          build_document('moon and water', :tokens => %w(and water)),
+        ])
+        default = model_e.plain_idf('xxx', 1, 1)
+        model_e.plain_idf('water', 1, 1).should be_within(0.001).of(3.277) # 3.27714473299218
+        model_e.plain_idf('moon', 1, 1).should == default # returns 0 for stopwords
+        model_e.plain_idf('said', 1, 1).should be_within(0.001).of(2.178) # 2.17853244432407
+      end
     end
-  end
-  # @see https://github.com/reddavis/TF-IDF/blob/master/spec/tf_idf_spec.rb
-  context 'comparing to tf_idf gem' do
-    let :one do
-      build_document('a a a a a a a a b b')
-    end
+    # @see https://github.com/reddavis/TF-IDF/blob/master/spec/tf_idf_spec.rb
+    context 'comparing to tf_idf gem' do
+      let :one do
+        build_document('a a a a a a a a b b')
+      end
-    let :two do
-      build_document('a a')
-    end
+      let :two do
+        build_document('a a')
+      end
-    let :model do
-      build_model([one, two])
-    end
+      let :model do
+        build_model([one, two])
+      end
-    # Normalizes to the number of tokens in the document.
-    # @see https://github.com/reddavis/TF-IDF/blob/master/lib/tf_idf.rb#L76
-    def tf
-      one.term_count('b') / one.size.to_f
-    end
+      # Normalizes to the number of tokens in the document.
+      # @see https://github.com/reddavis/TF-IDF/blob/master/lib/tf_idf.rb#L76
+      def tf
+        one.term_count('b') / one.size.to_f
+      end
-    # Performs plain inverse document frequency with base 10.
-    # @see https://github.com/reddavis/TF-IDF/blob/master/lib/tf_idf.rb#L50
-    def idf
-      model.plain_idf('b') / Math.log(10)
-    end
+      # Performs plain inverse document frequency with base 10.
+      # @see https://github.com/reddavis/TF-IDF/blob/master/lib/tf_idf.rb#L50
+      def idf
+        model.plain_idf('b') / Math.log(10)
+      end
-    it 'should return the term frequency' do
-      tf.should == 0.2
-      model.tf(one, 'b').should be_within(0.001).of(1.414)
-    end
+      it 'should return the term frequency' do
+        tf.should == 0.2
+        model.tf(one, 'b').should be_within(0.001).of(1.414)
+      end
-    it 'should return the inverse document frequency' do
-      idf.should be_within(0.001).of(0.301) # 0.30102999
-      model.idf('b').should == 1
-    end
+      it 'should return the inverse document frequency' do
+        idf.should be_within(0.001).of(0.301) # 0.30102999
+        model.idf('b').should == 1
+      end
-    it 'should return the tf*idf' do
-      (tf * idf).should be_within(0.001).of(0.060) # 0.0602
-      model.tfidf(one, 'b').should be_within(0.001).of(1.414)
+      it 'should return the tf*idf' do
+        (tf * idf).should be_within(0.001).of(0.060) # 0.0602
+        model.tfidf(one, 'b').should be_within(0.001).of(1.414)
+      end
     end
   end
 end