RubyGems - tf-idf-similarity - Versions diffs - 0.1.6 → 0.3.0 - Mend

tf-idf-similarity 0.1.6 → 0.3.0

Files changed (18)

checksums.yaml +5 -5
data/.gitignore +1 -0
data/.travis.yml +32 -4
data/Gemfile +3 -1
data/README.md +7 -6
data/lib/tf-idf-similarity/bm25_model.rb +6 -2
data/lib/tf-idf-similarity/document.rb +7 -5
data/lib/tf-idf-similarity/matrix_methods.rb +18 -6
data/lib/tf-idf-similarity/model.rb +1 -1
data/lib/tf-idf-similarity/term_count_model.rb +3 -1
data/lib/tf-idf-similarity/token.rb +7 -0
data/lib/tf-idf-similarity/tokenizer.rb +19 -0
data/lib/tf-idf-similarity/version.rb +1 -1
data/lib/tf-idf-similarity.rb +0 -3
data/spec/bm25_model_spec.rb +13 -8
data/spec/spec_helper.rb +2 -0
data/td-idf-similarity.gemspec +3 -2
metadata +11 -11

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
-SHA1:
-  metadata.gz: 03431fb16064caa54fe9cbfc17a151acb1a25fa5
-  data.tar.gz: be2e97b63e14244925937ee71fc8dc60c88dfce4
+SHA256:
+  metadata.gz: 6264c3854dae2c9b405e6880fc35161c7e7eb05521b88fa206d7d5f7cad53d73
+  data.tar.gz: d7d6c7f38723e35c256e14edecdfd7d4d993f58163a9c533e12f591051ed80dd
 SHA512:
-  metadata.gz: f615fae6cfad994fa25c85b1f3d6882742944e7bb5894ae3fcf6b4c9d7b34647b0da1b3914f127eb26e46a299c0f8a4e9d64bc05a7cb1c429663beaf657704eb
-  data.tar.gz: 317ea7c5a1a72e53419f2eadb5b4789bccbe29f0f7bf742f89e9ed9ffb210b43a78180ebef818baf497a48911e0f25897e6906251c45cd787d61c5da43cbbb92
+  metadata.gz: 4f97120bc10adc73e504503969c4ee2c486ca5909def7fc69a6ed35c4ceb3ba03294fb60a583f334a8fd83b06fe37d5b13258fa85fd699c47a812d8bdc2ce23f
+  data.tar.gz: a84fd8950933635fadd85caeb6a2684dfa989664553c8971f8f1f6d1ec7f7a299388d8aec186facedf6ab9174d125182fdcdfee16facd7317b27af5b462399b9

data/.gitignore CHANGED Viewed

@@ -4,3 +4,4 @@
 Gemfile.lock
 doc/*
 pkg/*
+coverage/*

data/.travis.yml CHANGED Viewed

@@ -2,14 +2,42 @@ sudo: false
 language: ruby
 cache: bundler
 rvm:
-  - 2.0.0
-  - 2.1.0
-  - 2.2.0
+  - 2.4
+  - 2.5
+  - 2.6
+  - 2.7
+  - 3.0
+  - 3.1
+  - 3.2
+  - ruby-head
+matrix:
+  exclude:
+    # No gem releases since 2017 and failing on new versions.
+    # https://rubygems.org/gems/gsl
+    # https://rubygems.org/gems/nmatrix
+    - rvm: 3.0
+      env: MATRIX_LIBRARY=gsl
+    - rvm: 3.1
+      env: MATRIX_LIBRARY=gsl
+    - rvm: 3.2
+      env: MATRIX_LIBRARY=gsl
+    - rvm: ruby-head
+      env: MATRIX_LIBRARY=gsl
+    - rvm: 3.2
+      env: MATRIX_LIBRARY=nmatrix
+    - rvm: ruby-head
+      env: MATRIX_LIBRARY=nmatrix
+  allow_failures:
+    - rvm: ruby-head
+      env: MATRIX_LIBRARY=matrix
+    - rvm: ruby-head
+      env: MATRIX_LIBRARY=narray
 env:
   - MATRIX_LIBRARY=gsl
   - MATRIX_LIBRARY=narray
   - MATRIX_LIBRARY=nmatrix
   - MATRIX_LIBRARY=matrix
+  - MATRIX_LIBRARY=numo
 addons:
   apt:
     packages:
@@ -18,7 +46,7 @@ addons:
     # Installing ATLAS will install BLAS.
     - libatlas-dev
     - libatlas-base-dev
-    - libatlas3gf-base
+    - libatlas3-base
 before_install:
   - bundle config build.nmatrix --with-lapacklib
   - export CPLUS_INCLUDE_PATH=$CPLUS_INCLUDE_PATH:/usr/include/atlas

data/Gemfile CHANGED Viewed

@@ -1,8 +1,10 @@
 source 'https://rubygems.org'
 gem 'rb-gsl', '~> 1.16.0.2' if ENV['MATRIX_LIBRARY'] == 'gsl'
+gem 'matrix' if ENV['MATRIX_LIBRARY'] == 'matrix'
 gem 'narray', '~> 0.6.0.0' if ENV['MATRIX_LIBRARY'] == 'narray'
-gem 'nmatrix', '~> 0.1.0.rc5' if ENV['MATRIX_LIBRARY'] == 'nmatrix' && RUBY_VERSION >= '1.9'
+gem 'nmatrix', '~> 0.2' if ENV['MATRIX_LIBRARY'] == 'nmatrix'
+gem 'numo-narray', '~> 0.9.2.1' if ENV['MATRIX_LIBRARY'] == 'numo'
 # Specify your gem's dependencies in the gemspec
 gemspec

data/README.md CHANGED Viewed

@@ -1,12 +1,11 @@
-# Ruby Vector Space Model (VSM) with tf*idf weights
+# Ruby Vector Space Model (VSM) with tf\*idf weights
 [![Gem Version](https://badge.fury.io/rb/tf-idf-similarity.svg)](https://badge.fury.io/rb/tf-idf-similarity)
 [![Build Status](https://secure.travis-ci.org/jpmckinney/tf-idf-similarity.png)](https://travis-ci.org/jpmckinney/tf-idf-similarity)
-[![Dependency Status](https://gemnasium.com/jpmckinney/tf-idf-similarity.png)](https://gemnasium.com/jpmckinney/tf-idf-similarity)
 [![Coverage Status](https://coveralls.io/repos/jpmckinney/tf-idf-similarity/badge.png)](https://coveralls.io/r/jpmckinney/tf-idf-similarity)
 [![Code Climate](https://codeclimate.com/github/jpmckinney/tf-idf-similarity.png)](https://codeclimate.com/github/jpmckinney/tf-idf-similarity)
-Calculates the similarity between texts using a [bag-of-words](https://en.wikipedia.org/wiki/Bag_of_words_model) [Vector Space Model](https://en.wikipedia.org/wiki/Vector_space_model) with [Term Frequency-Inverse Document Frequency (tf*idf)](https://en.wikipedia.org/wiki/Tf–idf) weights. If your use case demands performance, use [Lucene](http://lucene.apache.org/core/) (see below).
+Calculates the similarity between texts using a [bag-of-words](https://en.wikipedia.org/wiki/Bag_of_words_model) [Vector Space Model](https://en.wikipedia.org/wiki/Vector_space_model) with [Term Frequency-Inverse Document Frequency (tf\*idf)](https://en.wikipedia.org/wiki/Tf–idf) weights. If your use case demands performance, use [Lucene](http://lucene.apache.org/core/) (see below).
 ## Usage
@@ -48,7 +47,7 @@ Find the similarity of two documents in the matrix:
 matrix[model.document_index(document1), model.document_index(document2)]
 ```
-Print the tf*idf values for terms in a document:
+Print the tf\*idf values for terms in a document:
 ```ruby
 tfidf_by_term = {}
@@ -86,6 +85,8 @@ end
 document1 = TfIdfSimilarity::Document.new(text, :term_counts => term_counts, :size => size)
 ```
+Or, use your own classes for the tokenizer and tokens, like in [this example](https://gist.github.com/satoryu/0183a4eba365cc67e28988a09f3035b3).
 [Read the documentation at RubyDoc.info.](http://rubydoc.info/gems/tf-idf-similarity)
 ## Troubleshooting
@@ -114,11 +115,11 @@ You can access more term frequency, document frequency, and normalization formul
     require 'tf-idf-similarity/extras/document'
     require 'tf-idf-similarity/extras/tf_idf_model'
-The default tf*idf formula follows the [Lucene Conceptual Scoring Formula](http://lucene.apache.org/core/4_0_0/core/org/apache/lucene/search/similarities/TFIDFSimilarity.html).
+The default tf\*idf formula follows the [Lucene Conceptual Scoring Formula](http://lucene.apache.org/core/4_0_0/core/org/apache/lucene/search/similarities/TFIDFSimilarity.html).
 ## Why?
-At the time of writing, no other Ruby gem implemented the tf*idf formula used by Lucene, Sphinx and Ferret.
+At the time of writing, no other Ruby gem implemented the tf\*idf formula used by Lucene, Sphinx and Ferret.
 * [rsemantic](https://github.com/josephwilk/rsemantic) now uses the same [term frequency](https://github.com/josephwilk/rsemantic/blob/master/lib/semantic/transform/tf_idf_transform.rb#L14) and [document frequency](https://github.com/josephwilk/rsemantic/blob/master/lib/semantic/transform/tf_idf_transform.rb#L13) formulas as Lucene.
 * [treat](https://github.com/louismullie/treat) offers many term frequency formulas, [one of which](https://github.com/louismullie/treat/blob/master/lib/treat/workers/extractors/tf_idf/native.rb#L13) is the same as Lucene.

data/lib/tf-idf-similarity/bm25_model.rb CHANGED Viewed

@@ -22,8 +22,12 @@ module TfIdfSimilarity
     #
     # @note Like Lucene, we use a b value of 0.75 and a k1 value of 1.2.
     def term_frequency(document, term)
-      tf = document.term_count(term)
-      (tf * 2.2) / (tf + 0.3 + 0.9 * documents.size / @model.average_document_size)
+      if @model.average_document_size.zero?
+        Float::NAN
+      else
+        tf = document.term_count(term)
+        (tf * 2.2) / (tf + 0.3 + 0.9 * document.size / @model.average_document_size)
+      end
     end
     alias_method :tf, :term_frequency
   end

data/lib/tf-idf-similarity/document.rb CHANGED Viewed

@@ -1,3 +1,5 @@
+require 'tf-idf-similarity/tokenizer'
 # A document.
 module TfIdfSimilarity
   class Document
@@ -19,7 +21,8 @@ module TfIdfSimilarity
     def initialize(text, opts = {})
       @text   = text
       @id     = opts[:id] || object_id
-      @tokens = opts[:tokens]
+      @tokens = Array(opts[:tokens]).map { |t| Token.new(t) } if opts[:tokens]
+      @tokenizer = opts[:tokenizer] || Tokenizer.new
       if opts[:term_counts]
         @term_counts = opts[:term_counts]
@@ -51,10 +54,9 @@ module TfIdfSimilarity
     # Tokenizes the text and counts terms and total tokens.
     def set_term_counts_and_size
-      tokenize(text).each do |word|
-        token = Token.new(word)
+      tokenize(text).each do |token|
         if token.valid?
-          term = token.lowercase_filter.classic_filter.to_s
+          term = token.to_s
           @term_counts[term] += 1
           @size += 1
         end
@@ -76,7 +78,7 @@ module TfIdfSimilarity
     # @see http://unicode.org/reports/tr29/#Default_Word_Boundaries
     # @see http://wiki.apache.org/solr/AnalyzersTokenizersTokenFilters#solr.StandardTokenizerFactory
     def tokenize(text)
-      @tokens || UnicodeUtils.each_word(text)
+      @tokens || @tokenizer.tokenize(text)
     end
   end
 end

data/lib/tf-idf-similarity/matrix_methods.rb CHANGED Viewed

@@ -17,6 +17,10 @@ module TfIdfSimilarity
         norm = NMath.sqrt((@matrix ** 2).sum(1).reshape(@matrix.shape[0], 1))
         norm[norm.where2[1]] = 1.0 # avoid division by zero
         NMatrix.refer(@matrix / norm) # must be NMatrix for matrix multiplication
+      when :numo
+        norm = Numo::NMath.sqrt((@matrix ** 2).sum(0).reshape(1, @matrix.shape[1]))
+        norm[(norm.eq 0).where] = 1.0 # avoid division by zero
+        (@matrix / norm)
       when :nmatrix # @see https://github.com/SciRuby/nmatrix/issues/38
         normal = NMatrix.new(:dense, @matrix.shape, 0, :float64)
         (0...@matrix.shape[1]).each do |j|
@@ -44,7 +48,7 @@ module TfIdfSimilarity
     # @param [Integer] column index
     def get(i, j)
       case @library
-      when :narray
+      when :narray, :numo
         @matrix[j, i]
       else
         @matrix[i, j]
@@ -57,6 +61,8 @@ module TfIdfSimilarity
       case @library
       when :narray
         @matrix[true, index]
+      when :numo
+        @matrix[index, true]
       else
         @matrix.row(index)
       end
@@ -66,7 +72,7 @@ module TfIdfSimilarity
     # @return [GSL::Vector::View,NArray,NMatrix,Vector] a column
     def column(index)
       case @library
-      when :narray
+      when :narray, :numo
         @matrix[index, true]
       else
         @matrix.column(index)
@@ -78,7 +84,7 @@ module TfIdfSimilarity
       case @library
       when :gsl, :nmatrix
         @matrix.shape[0]
-      when :narray
+      when :narray, :numo
         @matrix.shape[1]
       else
         @matrix.row_size
@@ -90,7 +96,7 @@ module TfIdfSimilarity
       case @library
       when :gsl, :nmatrix
         @matrix.shape[1]
-      when :narray
+      when :narray, :numo
         @matrix.shape[0]
       else
         @matrix.column_size
@@ -110,7 +116,7 @@ module TfIdfSimilarity
     # @return [Float] the sum of all values in the matrix
     def sum
       case @library
-      when :narray
+      when :narray, :numo
         @matrix.sum
       else
         values.reduce(0, :+)
@@ -125,6 +131,8 @@ module TfIdfSimilarity
         GSL::Matrix[*array]
       when :narray
         NArray[*array]
+      when :numo
+        Numo::DFloat[*array]
       when :nmatrix # @see https://github.com/SciRuby/nmatrix/issues/91#issuecomment-18870619
         NMatrix.new(:dense, [array.size, array.empty? ? 0 : array[0].size], array.flatten, :float64)
       else
@@ -136,7 +144,7 @@ module TfIdfSimilarity
     # @return [GSL::Matrix,NArray,NMatrix,Matrix] the product
     def multiply_self(matrix)
       case @library
-      when :nmatrix
+      when :nmatrix, :numo
         matrix.transpose.dot(matrix)
       else
         matrix.transpose * matrix
@@ -149,6 +157,8 @@ module TfIdfSimilarity
         GSL::Sf::log(number)
       when :narray
         NMath.log(number)
+      when :numo
+        Numo::NMath.log(number)
       else
         Math.log(number)
       end
@@ -158,6 +168,8 @@ module TfIdfSimilarity
       case @library
       when :narray
         NMath.sqrt(number)
+      when :numo
+        Numo::NMath.sqrt(number)
       else
         Math.sqrt(number)
       end

data/lib/tf-idf-similarity/model.rb CHANGED Viewed

@@ -15,7 +15,7 @@ module TfIdfSimilarity
       array = Array.new(terms.size) do |i|
         idf = inverse_document_frequency(terms[i])
         Array.new(documents.size) do |j|
-          term_frequency(documents[j], terms[i]) * idf
+          (term_frequency(documents[j], terms[i]) * idf).to_f
         end
       end

data/lib/tf-idf-similarity/term_count_model.rb CHANGED Viewed

@@ -37,6 +37,8 @@ module TfIdfSimilarity
         case @library
         when :gsl, :narray
           row(index).where.size
+        when :numo
+          (row(index).ne 0).where.size
         when :nmatrix
           row(index).each.count(&:nonzero?)
         else
@@ -57,7 +59,7 @@ module TfIdfSimilarity
       index = terms.index(term)
       if index
         case @library
-        when :gsl, :narray
+        when :gsl, :narray, :numo
           row(index).sum
         when :nmatrix
           row(index).each.reduce(0, :+) # NMatrix's `sum` method is slower

data/lib/tf-idf-similarity/token.rb CHANGED Viewed

@@ -1,5 +1,7 @@
 # coding: utf-8
 require 'delegate'
+require 'unicode_utils/downcase'
+require 'unicode_utils/each_word'
 # A token.
 #
@@ -47,5 +49,10 @@ module TfIdfSimilarity
     def classic_filter
       self.class.new(self.gsub('.', '').sub(/['`’]s\z/, ''))
     end
+    def to_s
+      # Don't call #lowercase_filter and #classic_filter to avoid creating unnecessary objects.
+      UnicodeUtils.downcase(self).gsub('.', '').sub(/['`’]s\z/, '')
+    end
   end
 end

data/lib/tf-idf-similarity/tokenizer.rb ADDED Viewed

@@ -0,0 +1,19 @@
+require 'unicode_utils/each_word'
+require 'tf-idf-similarity/token'
+# A tokenizer using UnicodeUtils to tokenize a text.
+#
+# @see https://github.com/lang/unicode_utils
+module TfIdfSimilarity
+  class Tokenizer
+    # Tokenizes a text.
+    #
+    # @param [String] text
+    # @return [Enumerator] an enumerator of Token objects
+    def tokenize(text)
+      UnicodeUtils.each_word(text).map do |word|
+        Token.new(word)
+      end
+    end
+  end
+end

data/lib/tf-idf-similarity/version.rb CHANGED Viewed

@@ -1,3 +1,3 @@
 module TfIdfSimilarity
-  VERSION = "0.1.6"
+  VERSION = "0.3.0"
 end

data/lib/tf-idf-similarity.rb CHANGED Viewed

@@ -1,9 +1,6 @@
 require 'forwardable'
 require 'set'
-require 'unicode_utils/downcase'
-require 'unicode_utils/each_word'
 module TfIdfSimilarity
 end

data/spec/bm25_model_spec.rb CHANGED Viewed

@@ -82,7 +82,12 @@ module TfIdfSimilarity
       describe '#term_frequency_inverse_document_frequency' do
         it 'should return negative infinity' do
-          model.tfidf(document, 'foo').should be_nan
+          case MATRIX_LIBRARY
+          when :numo
+            model.tfidf(document, 'foo').isnan.should eq 1
+          else
+            model.tfidf(document, 'foo').should be_nan
+          end
         end
       end
@@ -147,7 +152,7 @@ module TfIdfSimilarity
         end
         it 'should return the term frequency if tokens given' do
-          model.tf(document_with_tokens, 'foo-foo').should == (1 * 2.2) / (1 + 0.3 + 0.9 * 4 / 5.5)
+          model.tf(document_with_tokens, 'foo-foo').should == (1 * 2.2) / (1 + 0.3 + 0.9 * 3 / 5.5)
         end
         it 'should return no term frequency if no text given' do
@@ -155,7 +160,7 @@ module TfIdfSimilarity
         end
         it 'should return the term frequency if term counts given' do
-          model.tf(document_with_term_counts, 'bar').should == (5 * 2.2) / (5 + 0.3 + 0.9 * 4 / 5.5)
+          model.tf(document_with_term_counts, 'bar').should == (5 * 2.2) / (5 + 0.3 + 0.9 * 15 / 5.5)
         end
         it 'should return the term frequency of a non-occurring term' do
@@ -163,7 +168,7 @@ module TfIdfSimilarity
         end
         it 'should return the term frequency in a non-occurring document' do
-          model.tf(non_corpus_document, 'foo').should == (3 * 2.2) / (3 + 0.3 + 0.9 * 4 / 5.5)
+          model.tf(non_corpus_document, 'foo').should == (3 * 2.2) / (3 + 0.3 + 0.9 * 3 / 5.5)
         end
       end
@@ -177,17 +182,17 @@ module TfIdfSimilarity
         end
         it 'should return the tf*idf in a non-occurring term' do
-          model.tfidf(non_corpus_document, 'foo').should be_within(0.001).of(Math.log((4 - 1 + 0.5) / (1 + 0.5)) * (3 * 2.2) / (3 + 0.3 + 0.9 * 4 / 5.5))
+          model.tfidf(non_corpus_document, 'foo').should be_within(0.001).of(Math.log((4 - 1 + 0.5) / (1 + 0.5)) * (3 * 2.2) / (3 + 0.3 + 0.9 * 3 / 5.5))
         end
       end
       describe '#similarity_matrix' do
         it 'should return the similarity matrix' do
           expected = [
-            1.0,   0.564, 0.0, 0.479,
-            0.564, 1.0,   0.0, 0.540,
+            1.0,   0.558, 0.0, 0.449,
+            0.558, 1.0,   0.0, 0.501,
             0.0,   0.0,   0.0, 0.0,
-            0.479, 0.540, 0.0, 1.0,
+            0.449, 0.501, 0.0, 1.0,
           ]
           similarity_matrix_values(model).each_with_index do |value,i|

data/spec/spec_helper.rb CHANGED Viewed

@@ -18,6 +18,8 @@ when :gsl
   require 'gsl'
 when :narray
   require 'narray'
+when :numo
+  require 'numo/narray'
 when :nmatrix
   require 'nmatrix'
 else

data/td-idf-similarity.gemspec CHANGED Viewed

@@ -9,6 +9,7 @@ Gem::Specification.new do |s|
   s.homepage    = "https://github.com/jpmckinney/tf-idf-similarity"
   s.summary     = %q{Calculates the similarity between texts using tf*idf}
   s.license     = 'MIT'
+  s.required_ruby_version = '>= 2.4.0'
   s.files         = `git ls-files`.split("\n")
   s.test_files    = `git ls-files -- {test,spec,features}/*`.split("\n")
@@ -19,6 +20,6 @@ Gem::Specification.new do |s|
   s.add_development_dependency('coveralls')
   s.add_development_dependency('json', '< 2')
-  s.add_development_dependency('rake', '< 12')
-  s.add_development_dependency('rspec', '~> 2.10')
+  s.add_development_dependency('rake')
+  s.add_development_dependency('rspec', '~> 3.0')
 end

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: tf-idf-similarity
 version: !ruby/object:Gem::Version
-  version: 0.1.6
+  version: 0.3.0
 platform: ruby
 authors:
 - James McKinney
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2017-03-07 00:00:00.000000000 Z
+date: 2024-02-26 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: unicode_utils
@@ -56,30 +56,30 @@ dependencies:
   name: rake
   requirement: !ruby/object:Gem::Requirement
     requirements:
-    - - "<"
+    - - ">="
       - !ruby/object:Gem::Version
-        version: '12'
+        version: '0'
   type: :development
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
-    - - "<"
+    - - ">="
       - !ruby/object:Gem::Version
-        version: '12'
+        version: '0'
 - !ruby/object:Gem::Dependency
   name: rspec
   requirement: !ruby/object:Gem::Requirement
     requirements:
     - - "~>"
       - !ruby/object:Gem::Version
-        version: '2.10'
+        version: '3.0'
   type: :development
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
     - - "~>"
       - !ruby/object:Gem::Version
-        version: '2.10'
+        version: '3.0'
 description:
 email:
 executables: []
@@ -104,6 +104,7 @@ files:
 - lib/tf-idf-similarity/term_count_model.rb
 - lib/tf-idf-similarity/tf_idf_model.rb
 - lib/tf-idf-similarity/token.rb
+- lib/tf-idf-similarity/tokenizer.rb
 - lib/tf-idf-similarity/version.rb
 - spec/bm25_model_spec.rb
 - spec/document_spec.rb
@@ -125,15 +126,14 @@ required_ruby_version: !ruby/object:Gem::Requirement
   requirements:
   - - ">="
     - !ruby/object:Gem::Version
-      version: '0'
+      version: 2.4.0
 required_rubygems_version: !ruby/object:Gem::Requirement
   requirements:
   - - ">="
     - !ruby/object:Gem::Version
       version: '0'
 requirements: []
-rubyforge_project:
-rubygems_version: 2.4.5
+rubygems_version: 3.0.3.1
 signing_key:
 specification_version: 4
 summary: Calculates the similarity between texts using tf*idf