RubyGems - tf-idf-similarity - Versions diffs - 0.1.3 → 0.1.4 - Mend

tf-idf-similarity 0.1.3 → 0.1.4

This diff shows the changes between two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the differences between the package versions as they appear in their respective public registries.

Files changed (23)

checksums.yaml +4 -4
data/.travis.yml +1 -8
data/Gemfile +2 -2
data/README.md +40 -9
data/lib/tf-idf-similarity.rb +1 -0
data/lib/tf-idf-similarity/bm25_model.rb +23 -62
data/lib/tf-idf-similarity/document.rb +69 -67
data/lib/tf-idf-similarity/extras/document.rb +10 -8
data/lib/tf-idf-similarity/extras/tf_idf_model.rb +157 -155
data/lib/tf-idf-similarity/matrix_methods.rb +137 -135
data/lib/tf-idf-similarity/model.rb +66 -0
data/lib/tf-idf-similarity/term_count_model.rb +59 -57
data/lib/tf-idf-similarity/tf_idf_model.rb +21 -60
data/lib/tf-idf-similarity/token.rb +39 -37
data/lib/tf-idf-similarity/version.rb +1 -1
data/spec/bm25_model_spec.rb +200 -0
data/spec/document_spec.rb +98 -96
data/spec/extras/tf_idf_model_spec.rb +224 -222
data/spec/spec_helper.rb +6 -0
data/spec/term_count_model_spec.rb +76 -74
data/spec/tf_idf_model_spec.rb +143 -117
data/spec/token_spec.rb +23 -21
metadata +6 -2

data/spec/spec_helper.rb CHANGED

@@ -19,3 +19,9 @@ when :nmatrix
 else
   require 'matrix'
 end
+RSpec.configure do |c|
+  if MATRIX_LIBRARY == :gsl # GSL can't initialize an empty matrix
+    c.filter_run_excluding :empty_matrix => true
+  end
+end

data/spec/term_count_model_spec.rb CHANGED

@@ -1,107 +1,109 @@
 require 'spec_helper'
-describe TfIdfSimilarity::TermCountModel do
-  let :text do
-    "FOO-foo BAR bar \r\n\t 123 !@#"
-  end
-  let :tokens do
-    ['FOO-foo', 'BAR', 'bar', "\r\n\t", '123', '!@#']
-  end
+module TfIdfSimilarity
+  describe TermCountModel do
+    let :text do
+      "FOO-foo BAR bar \r\n\t 123 !@#"
+    end
-  let :document_without_text do
-    TfIdfSimilarity::Document.new('')
-  end
+    let :tokens do
+      ['FOO-foo', 'BAR', 'bar', "\r\n\t", '123', '!@#']
+    end
-  let :document do
-    TfIdfSimilarity::Document.new(text)
-  end
+    let :document_without_text do
+      Document.new('')
+    end
-  let :document_with_tokens do
-    TfIdfSimilarity::Document.new(text, :tokens => tokens)
-  end
+    let :document do
+      Document.new(text)
+    end
-  let :document_with_term_counts do
-    TfIdfSimilarity::Document.new(text, :term_counts => {'bar' => 5, 'baz' => 10})
-  end
+    let :document_with_tokens do
+      Document.new(text, :tokens => tokens)
+    end
-  context 'without documents', :unless => lambda{MATRIX_LIBRARY == :gsl} do
-    let :model do
-      TfIdfSimilarity::TermCountModel.new([], :library => MATRIX_LIBRARY)
+    let :document_with_term_counts do
+      Document.new(text, :term_counts => {'bar' => 5, 'baz' => 10})
     end
-    describe '#documents' do
-      it 'should be empty' do
-        model.documents.should be_empty
+    context 'without documents', :empty_matrix => true do
+      let :model do
+        TermCountModel.new([], :library => MATRIX_LIBRARY)
       end
-    end
-    describe '#terms' do
-      it 'should be empty' do
-        model.terms.should be_empty
+      describe '#documents' do
+        it 'should be empty' do
+          model.documents.should be_empty
+        end
       end
-    end
-    describe '#average_document_size' do
-      it 'should be zero' do
-        model.average_document_size.should == 0
+      describe '#terms' do
+        it 'should be empty' do
+          model.terms.should be_empty
+        end
       end
-    end
-    describe '#document_count' do
-      it 'should be zero' do
-        model.document_count('xxx').should == 0
+      describe '#average_document_size' do
+        it 'should be zero' do
+          model.average_document_size.should == 0
+        end
       end
-    end
-    describe '#term_count' do
-      it 'should be zero' do
-        model.term_count('xxx').should == 0
+      describe '#document_count' do
+        it 'should be zero' do
+          model.document_count('xxx').should == 0
+        end
       end
-    end
-  end
-  context 'with documents' do
-    let :documents do
-      [
-        document, # 4 tokens
-        document_with_tokens, # 3 tokens
-        document_without_text, # 0 tokens
-        document_with_term_counts, # 15 tokens
-      ]
+      describe '#term_count' do
+        it 'should be zero' do
+          model.term_count('xxx').should == 0
+        end
+      end
     end
-    let :model do
-      TfIdfSimilarity::TermCountModel.new(documents, :library => MATRIX_LIBRARY)
-    end
+    context 'with documents' do
+      let :documents do
+        [
+          document, # 4 tokens
+          document_with_tokens, # 3 tokens
+          document_without_text, # 0 tokens
+          document_with_term_counts, # 15 tokens
+        ]
+      end
-    describe '#documents' do
-      it 'should return the documents' do
-        model.documents.should == documents
+      let :model do
+        TermCountModel.new(documents, :library => MATRIX_LIBRARY)
       end
-    end
-    describe '#terms' do
-      it 'should return the terms' do
-        model.terms.to_a.sort.should == ['bar', 'baz', 'foo', 'foo-foo']
+      describe '#documents' do
+        it 'should return the documents' do
+          model.documents.should == documents
+        end
       end
-    end
-    describe '#average_document_size' do
-      it 'should return the average number of tokens in a document' do
-        model.average_document_size.should == 5.5
+      describe '#terms' do
+        it 'should return the terms' do
+          model.terms.to_a.sort.should == ['bar', 'baz', 'foo', 'foo-foo']
+        end
       end
-    end
-    describe '#document_count' do
-      it 'should return the number of documents the term appears in' do
-        model.document_count('bar').should == 3
+      describe '#average_document_size' do
+        it 'should return the average number of tokens in a document' do
+          model.average_document_size.should == 5.5
+        end
+      end
+      describe '#document_count' do
+        it 'should return the number of documents the term appears in' do
+          model.document_count('bar').should == 3
+        end
       end
-    end
-    describe '#term_count' do
-      it 'should return the number of times the term appears in the corpus' do
-        model.term_count('bar').should == 9
+      describe '#term_count' do
+        it 'should return the number of times the term appears in the corpus' do
+          model.term_count('bar').should == 9
+        end
       end
     end
   end

data/spec/tf_idf_model_spec.rb CHANGED

@@ -1,172 +1,198 @@
 require 'spec_helper'
-describe TfIdfSimilarity::TfIdfModel do
-  let :text do
-    "FOO-foo BAR bar \r\n\t 123 !@#"
-  end
-  let :tokens do
-    ['FOO-foo', 'BAR', 'bar', "\r\n\t", '123', '!@#']
-  end
-  let :document_without_text do
-    TfIdfSimilarity::Document.new('')
-  end
-  let :document do
-    TfIdfSimilarity::Document.new(text)
-  end
-  let :document_with_tokens do
-    TfIdfSimilarity::Document.new(text, :tokens => tokens)
-  end
+module TfIdfSimilarity
+  describe TfIdfModel do
+    let :text do
+      "FOO-foo BAR bar \r\n\t 123 !@#"
+    end
-  let :document_with_term_counts do
-    TfIdfSimilarity::Document.new(text, :term_counts => {'bar' => 5, 'baz' => 10})
-  end
+    let :tokens do
+      ['FOO-foo', 'BAR', 'bar', "\r\n\t", '123', '!@#']
+    end
-  let :non_corpus_document do
-    TfIdfSimilarity::Document.new('foo foo foo')
-  end
+    let :document_without_text do
+      Document.new('')
+    end
-  def similarity_matrix_values(model)
-    matrix = model.similarity_matrix
-    if MATRIX_LIBRARY == :nmatrix
-      matrix.each.to_a
-    else
-      matrix.to_a.flatten
+    let :document do
+      Document.new(text)
     end
-  end
-  context 'without documents', :unless => lambda{MATRIX_LIBRARY == :gsl} do
-    let :model do
-      TfIdfSimilarity::TfIdfModel.new([], :library => MATRIX_LIBRARY)
+    let :document_with_tokens do
+      Document.new(text, :tokens => tokens)
     end
-    describe '#documents' do
-      it 'should be empty' do
-        model.documents.should be_empty
-      end
+    let :document_with_term_counts do
+      Document.new(text, :term_counts => {'bar' => 5, 'baz' => 10})
     end
-    describe '#terms' do
-      it 'should be empty' do
-        model.terms.should be_empty
-      end
+    let :non_corpus_document do
+      Document.new('foo foo foo')
     end
-    describe '#inverse_document_frequency' do
-      it 'should return negative infinity' do
-        model.idf('foo').should == -1/0.0 # -Infinity
+    def similarity_matrix_values(model)
+      matrix = model.similarity_matrix
+      if MATRIX_LIBRARY == :nmatrix
+        matrix.each.to_a
+      else
+        matrix.to_a.flatten
       end
     end
-    describe '#term_frequency' do
-      it 'should return the term frequency' do
-        model.tf(document, 'foo').should == Math.sqrt(2)
+    context 'without documents', :empty_matrix => true do
+      let :model do
+        TfIdfModel.new([], :library => MATRIX_LIBRARY)
       end
-    end
-    describe '#term_frequency_inverse_document_frequency' do
-      it 'should return negative infinity' do
-        model.tfidf(document, 'foo').should == -1/0.0 # -Infinity
+      describe '#documents' do
+        it 'should be empty' do
+          model.documents.should be_empty
+        end
       end
-    end
-    describe '#similarity_matrix' do
-      it 'should be empty' do
-        similarity_matrix_values(model).should be_empty
+      describe '#document_index' do
+        it 'should return nil' do
+          model.document_index(document).should be_nil
+        end
       end
-    end
-  end
-  context 'with documents' do
-    let :documents do
-      [
-        document,
-        document_with_tokens,
-        document_without_text,
-        document_with_term_counts,
-      ]
-    end
+      describe '#text_index' do
+        it 'should return nil' do
+          model.text_index(text).should be_nil
+        end
+      end
-    let :model do
-      TfIdfSimilarity::TfIdfModel.new(documents, :library => MATRIX_LIBRARY)
-    end
+      describe '#terms' do
+        it 'should be empty' do
+          model.terms.should be_empty
+        end
+      end
-    describe '#documents' do
-      it 'should return the documents' do
-        model.documents.should == documents
+      describe '#inverse_document_frequency' do
+        it 'should return negative infinity' do
+          model.idf('foo').should == -1/0.0 # -Infinity
+        end
       end
-    end
-    describe '#terms' do
-      it 'should return the terms' do
-        model.terms.to_a.sort.should == ['bar', 'baz', 'foo', 'foo-foo']
+      describe '#term_frequency' do
+        it 'should return the term frequency' do
+          model.tf(document, 'foo').should == Math.sqrt(2)
+        end
       end
-    end
-    describe '#inverse_document_frequency' do
-      it 'should return the inverse document frequency' do
-        model.idf('foo').should be_within(0.001).of(1 + Math.log(2))
+      describe '#term_frequency_inverse_document_frequency' do
+        it 'should return negative infinity' do
+          model.tfidf(document, 'foo').should == -1/0.0 # -Infinity
+        end
       end
-      it 'should return the inverse document frequency of a non-occurring term' do
-        model.idf('xxx').should be_within(0.001).of(1 + Math.log(4))
+      describe '#similarity_matrix' do
+        it 'should be empty' do
+          similarity_matrix_values(model).should be_empty
+        end
       end
     end
-    describe '#term_frequency' do
-      it 'should return the term frequency if no tokens given' do
-        model.tf(document, 'foo').should == Math.sqrt(2)
+    context 'with documents' do
+      let :documents do
+        [
+          document,
+          document_with_tokens,
+          document_without_text,
+          document_with_term_counts,
+        ]
       end
-      it 'should return the term frequency if tokens given' do
-        model.tf(document_with_tokens, 'foo-foo').should == 1
+      let :model do
+        TfIdfModel.new(documents, :library => MATRIX_LIBRARY)
       end
-      it 'should return no term frequency if no text given' do
-        model.tf(document_without_text, 'foo').should == 0
+      describe '#documents' do
+        it 'should return the documents' do
+          model.documents.should == documents
+        end
       end
-      it 'should return the term frequency if term counts given' do
-        model.tf(document_with_term_counts, 'bar').should == Math.sqrt(5)
+      describe '#document_index' do
+        it 'should return the index' do
+          model.document_index(document).should == 0
+        end
       end
-      it 'should return the term frequency of a non-occurring term' do
-        model.tf(document, 'xxx').should == 0
+      describe '#text_index' do
+        it 'should return the index' do
+          model.text_index(text).should == 0
+        end
       end
-      it 'should return the term frequency in a non-occurring document' do
-        model.tf(non_corpus_document, 'foo').should == Math.sqrt(3)
+      describe '#terms' do
+        it 'should return the terms' do
+          model.terms.to_a.sort.should == ['bar', 'baz', 'foo', 'foo-foo']
+        end
       end
-    end
-    describe '#term_frequency_inverse_document_frequency' do
-      it 'should return the tf*idf' do
-        model.tfidf(document, 'foo').should be_within(0.001).of((1 + Math.log(2)) * Math.sqrt(2))
+      describe '#inverse_document_frequency' do
+        it 'should return the inverse document frequency' do
+          model.idf('foo').should be_within(0.001).of(1 + Math.log(4 / (1 + 1.0)))
+        end
+        it 'should return the inverse document frequency of a non-occurring term' do
+          model.idf('xxx').should be_within(0.001).of(1 + Math.log(4 / (0 + 1.0)))
+        end
       end
-      it 'should return the tf*idf of a non-occurring term' do
-        model.tfidf(document, 'xxx').should == 0
+      describe '#term_frequency' do
+        it 'should return the term frequency if no tokens given' do
+          model.tf(document, 'foo').should == Math.sqrt(2)
+        end
+        it 'should return the term frequency if tokens given' do
+          model.tf(document_with_tokens, 'foo-foo').should == 1
+        end
+        it 'should return no term frequency if no text given' do
+          model.tf(document_without_text, 'foo').should == 0
+        end
+        it 'should return the term frequency if term counts given' do
+          model.tf(document_with_term_counts, 'bar').should == Math.sqrt(5)
+        end
+        it 'should return the term frequency of a non-occurring term' do
+          model.tf(document, 'xxx').should == 0
+        end
+        it 'should return the term frequency in a non-occurring document' do
+          model.tf(non_corpus_document, 'foo').should == Math.sqrt(3)
+        end
       end
-      it 'should return the tf*idf in a non-occurring term' do
-        model.tfidf(non_corpus_document, 'foo').should be_within(0.001).of((1 + Math.log(2)) * Math.sqrt(3))
+      describe '#term_frequency_inverse_document_frequency' do
+        it 'should return the tf*idf' do
+          model.tfidf(document, 'foo').should be_within(0.001).of((1 + Math.log(4 / (1 + 1.0))) * Math.sqrt(2))
+        end
+        it 'should return the tf*idf of a non-occurring term' do
+          model.tfidf(document, 'xxx').should == 0
+        end
+        it 'should return the tf*idf in a non-occurring term' do
+          model.tfidf(non_corpus_document, 'foo').should be_within(0.001).of((1 + Math.log(4 / (1 + 1.0))) * Math.sqrt(3))
+        end
       end
-    end
-    describe '#similarity_matrix' do
-      it 'should return the similarity matrix' do
-        expected = [
-          1.0,   0.326, 0.0, 0.195,
-          0.326, 1.0,   0.0, 0.247,
-          0.0,   0.0,   0.0, 0.0,
-          0.195, 0.247, 0.0, 1.0,
-        ]
+      describe '#similarity_matrix' do
+        it 'should return the similarity matrix' do
+          expected = [
+            1.0,   0.326, 0.0, 0.195,
+            0.326, 1.0,   0.0, 0.247,
+            0.0,   0.0,   0.0, 0.0,
+            0.195, 0.247, 0.0, 1.0,
+          ]
-        similarity_matrix_values(model).each_with_index do |value,i|
-          value.should be_within(0.001).of(expected[i])
+          similarity_matrix_values(model).each_with_index do |value,i|
+            value.should be_within(0.001).of(expected[i])
+          end
         end
       end
     end