RubyGems - tf-idf-similarity - Versions diffs - 0.1.6 → 0.3.0 - Mend

tf-idf-similarity 0.1.6 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (18) hide show

checksums.yaml +5 -5
data/.gitignore +1 -0
data/.travis.yml +32 -4
data/Gemfile +3 -1
data/README.md +7 -6
data/lib/tf-idf-similarity/bm25_model.rb +6 -2
data/lib/tf-idf-similarity/document.rb +7 -5
data/lib/tf-idf-similarity/matrix_methods.rb +18 -6
data/lib/tf-idf-similarity/model.rb +1 -1
data/lib/tf-idf-similarity/term_count_model.rb +3 -1
data/lib/tf-idf-similarity/token.rb +7 -0
data/lib/tf-idf-similarity/tokenizer.rb +19 -0
data/lib/tf-idf-similarity/version.rb +1 -1
data/lib/tf-idf-similarity.rb +0 -3
data/spec/bm25_model_spec.rb +13 -8
data/spec/spec_helper.rb +2 -0
data/td-idf-similarity.gemspec +3 -2
metadata +11 -11

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
-SHA1:
-  metadata.gz: 03431fb16064caa54fe9cbfc17a151acb1a25fa5
-  data.tar.gz: be2e97b63e14244925937ee71fc8dc60c88dfce4
+SHA256:
+  metadata.gz: 6264c3854dae2c9b405e6880fc35161c7e7eb05521b88fa206d7d5f7cad53d73
+  data.tar.gz: d7d6c7f38723e35c256e14edecdfd7d4d993f58163a9c533e12f591051ed80dd
 SHA512:
-  metadata.gz: f615fae6cfad994fa25c85b1f3d6882742944e7bb5894ae3fcf6b4c9d7b34647b0da1b3914f127eb26e46a299c0f8a4e9d64bc05a7cb1c429663beaf657704eb
-  data.tar.gz: 317ea7c5a1a72e53419f2eadb5b4789bccbe29f0f7bf742f89e9ed9ffb210b43a78180ebef818baf497a48911e0f25897e6906251c45cd787d61c5da43cbbb92
+  metadata.gz: 4f97120bc10adc73e504503969c4ee2c486ca5909def7fc69a6ed35c4ceb3ba03294fb60a583f334a8fd83b06fe37d5b13258fa85fd699c47a812d8bdc2ce23f
+  data.tar.gz: a84fd8950933635fadd85caeb6a2684dfa989664553c8971f8f1f6d1ec7f7a299388d8aec186facedf6ab9174d125182fdcdfee16facd7317b27af5b462399b9

data/.gitignore CHANGED Viewed

@@ -4,3 +4,4 @@
 Gemfile.lock
 doc/*
 pkg/*
+coverage/*

data/.travis.yml CHANGED Viewed

@@ -2,14 +2,42 @@ sudo: false
 language: ruby
 cache: bundler
 rvm:
-  - 2.0.0
-  - 2.1.0
-  - 2.2.0
+  - 2.4
+  - 2.5
+  - 2.6
+  - 2.7
+  - 3.0
+  - 3.1
+  - 3.2
+  - ruby-head
+matrix:
+  exclude:
+    # No gem releases since 2017 and failing on new versions.
+    # https://rubygems.org/gems/gsl
+    # https://rubygems.org/gems/nmatrix
+    - rvm: 3.0
+      env: MATRIX_LIBRARY=gsl
+    - rvm: 3.1
+      env: MATRIX_LIBRARY=gsl
+    - rvm: 3.2
+      env: MATRIX_LIBRARY=gsl
+    - rvm: ruby-head
+      env: MATRIX_LIBRARY=gsl
+    - rvm: 3.2
+      env: MATRIX_LIBRARY=nmatrix
+    - rvm: ruby-head
+      env: MATRIX_LIBRARY=nmatrix
+  allow_failures:
+    - rvm: ruby-head
+      env: MATRIX_LIBRARY=matrix
+    - rvm: ruby-head
+      env: MATRIX_LIBRARY=narray
 env:
   - MATRIX_LIBRARY=gsl
   - MATRIX_LIBRARY=narray
   - MATRIX_LIBRARY=nmatrix
   - MATRIX_LIBRARY=matrix
+  - MATRIX_LIBRARY=numo
 addons:
   apt:
     packages:
@@ -18,7 +46,7 @@ addons:
     # Installing ATLAS will install BLAS.
     - libatlas-dev
     - libatlas-base-dev
-    - libatlas3gf-base
+    - libatlas3-base
 before_install:
   - bundle config build.nmatrix --with-lapacklib
   - export CPLUS_INCLUDE_PATH=$CPLUS_INCLUDE_PATH:/usr/include/atlas

data/Gemfile CHANGED Viewed

@@ -1,8 +1,10 @@
 source 'https://rubygems.org'
 gem 'rb-gsl', '~> 1.16.0.2' if ENV['MATRIX_LIBRARY'] == 'gsl'
+gem 'matrix' if ENV['MATRIX_LIBRARY'] == 'matrix'
 gem 'narray', '~> 0.6.0.0' if ENV['MATRIX_LIBRARY'] == 'narray'
-gem 'nmatrix', '~> 0.1.0.rc5' if ENV['MATRIX_LIBRARY'] == 'nmatrix' && RUBY_VERSION >= '1.9'
+gem 'nmatrix', '~> 0.2' if ENV['MATRIX_LIBRARY'] == 'nmatrix'
+gem 'numo-narray', '~> 0.9.2.1' if ENV['MATRIX_LIBRARY'] == 'numo'
 # Specify your gem's dependencies in the gemspec
 gemspec

data/README.md CHANGED Viewed

@@ -1,12 +1,11 @@
-# Ruby Vector Space Model (VSM) with tf*idf weights
+# Ruby Vector Space Model (VSM) with tf\*idf weights
 [![Gem Version](https://badge.fury.io/rb/tf-idf-similarity.svg)](https://badge.fury.io/rb/tf-idf-similarity)
 [![Build Status](https://secure.travis-ci.org/jpmckinney/tf-idf-similarity.png)](https://travis-ci.org/jpmckinney/tf-idf-similarity)
-[![Dependency Status](https://gemnasium.com/jpmckinney/tf-idf-similarity.png)](https://gemnasium.com/jpmckinney/tf-idf-similarity)
 [![Coverage Status](https://coveralls.io/repos/jpmckinney/tf-idf-similarity/badge.png)](https://coveralls.io/r/jpmckinney/tf-idf-similarity)
 [![Code Climate](https://codeclimate.com/github/jpmckinney/tf-idf-similarity.png)](https://codeclimate.com/github/jpmckinney/tf-idf-similarity)
-Calculates the similarity between texts using a [bag-of-words](https://en.wikipedia.org/wiki/Bag_of_words_model) [Vector Space Model](https://en.wikipedia.org/wiki/Vector_space_model) with [Term Frequency-Inverse Document Frequency (tf*idf)](https://en.wikipedia.org/wiki/Tf–idf) weights. If your use case demands performance, use [Lucene](http://lucene.apache.org/core/) (see below).
+Calculates the similarity between texts using a [bag-of-words](https://en.wikipedia.org/wiki/Bag_of_words_model) [Vector Space Model](https://en.wikipedia.org/wiki/Vector_space_model) with [Term Frequency-Inverse Document Frequency (tf\*idf)](https://en.wikipedia.org/wiki/Tf–idf) weights. If your use case demands performance, use [Lucene](http://lucene.apache.org/core/) (see below).
 ## Usage
@@ -48,7 +47,7 @@ Find the similarity of two documents in the matrix:
 matrix[model.document_index(document1), model.document_index(document2)]
 ```
-Print the tf*idf values for terms in a document:
+Print the tf\*idf values for terms in a document:
 ```ruby
 tfidf_by_term = {}
@@ -86,6 +85,8 @@ end
 document1 = TfIdfSimilarity::Document.new(text, :term_counts => term_counts, :size => size)
 ```
+Or, use your own classes for the tokenizer and tokens, like in [this example](https://gist.github.com/satoryu/0183a4eba365cc67e28988a09f3035b3).
 [Read the documentation at RubyDoc.info.](http://rubydoc.info/gems/tf-idf-similarity)
 ## Troubleshooting
@@ -114,11 +115,11 @@ You can access more term frequency, document frequency, and normalization formul
     require 'tf-idf-similarity/extras/document'
     require 'tf-idf-similarity/extras/tf_idf_model'
-The default tf*idf formula follows the [Lucene Conceptual Scoring Formula](http://lucene.apache.org/core/4_0_0/core/org/apache/lucene/search/similarities/TFIDFSimilarity.html).
+The default tf\*idf formula follows the [Lucene Conceptual Scoring Formula](http://lucene.apache.org/core/4_0_0/core/org/apache/lucene/search/similarities/TFIDFSimilarity.html).
 ## Why?
-At the time of writing, no other Ruby gem implemented the tf*idf formula used by Lucene, Sphinx and Ferret.
+At the time of writing, no other Ruby gem implemented the tf\*idf formula used by Lucene, Sphinx and Ferret.
 * [rsemantic](https://github.com/josephwilk/rsemantic) now uses the same [term frequency](https://github.com/josephwilk/rsemantic/blob/master/lib/semantic/transform/tf_idf_transform.rb#L14) and [document frequency](https://github.com/josephwilk/rsemantic/blob/master/lib/semantic/transform/tf_idf_transform.rb#L13) formulas as Lucene.
 * [treat](https://github.com/louismullie/treat) offers many term frequency formulas, [one of which](https://github.com/louismullie/treat/blob/master/lib/treat/workers/extractors/tf_idf/native.rb#L13) is the same as Lucene.

data/lib/tf-idf-similarity/bm25_model.rb CHANGED Viewed

@@ -22,8 +22,12 @@ module TfIdfSimilarity
     #
     # @note Like Lucene, we use a b value of 0.75 and a k1 value of 1.2.
     def term_frequency(document, term)
-      tf = document.term_count(term)
-      (tf * 2.2) / (tf + 0.3 + 0.9 * documents.size / @model.average_document_size)
+      if @model.average_document_size.zero?
+        Float::NAN
+      else
+        tf = document.term_count(term)
+        (tf * 2.2) / (tf + 0.3 + 0.9 * document.size / @model.average_document_size)
+      end
     end
     alias_method :tf, :term_frequency
   end

data/lib/tf-idf-similarity/document.rb CHANGED Viewed

@@ -1,3 +1,5 @@
+require 'tf-idf-similarity/tokenizer'
 # A document.
 module TfIdfSimilarity
   class Document
@@ -19,7 +21,8 @@ module TfIdfSimilarity
     def initialize(text, opts = {})
       @text   = text
       @id     = opts[:id] || object_id
-      @tokens = opts[:tokens]
+      @tokens = Array(opts[:tokens]).map { |t| Token.new(t) } if opts[:tokens]
+      @tokenizer = opts[:tokenizer] || Tokenizer.new
       if opts[:term_counts]
         @term_counts = opts[:term_counts]
@@ -51,10 +54,9 @@ module TfIdfSimilarity
     # Tokenizes the text and counts terms and total tokens.
     def set_term_counts_and_size
-      tokenize(text).each do |word|
-        token = Token.new(word)
+      tokenize(text).each do |token|
         if token.valid?
-          term = token.lowercase_filter.classic_filter.to_s
+          term = token.to_s
           @term_counts[term] += 1
           @size += 1
         end
@@ -76,7 +78,7 @@ module TfIdfSimilarity
     # @see http://unicode.org/reports/tr29/#Default_Word_Boundaries
     # @see http://wiki.apache.org/solr/AnalyzersTokenizersTokenFilters#solr.StandardTokenizerFactory
     def tokenize(text)
-      @tokens || UnicodeUtils.each_word(text)
+      @tokens || @tokenizer.tokenize(text)
     end
   end
 end

data/lib/tf-idf-similarity/matrix_methods.rb CHANGED Viewed

@@ -17,6 +17,10 @@ module TfIdfSimilarity
         norm = NMath.sqrt((@matrix ** 2).sum(1).reshape(@matrix.shape[0], 1))
         norm[norm.where2[1]] = 1.0 # avoid division by zero
         NMatrix.refer(@matrix / norm) # must be NMatrix for matrix multiplication
+      when :numo
+        norm = Numo::NMath.sqrt((@matrix ** 2).sum(0).reshape(1, @matrix.shape[1]))
+        norm[(norm.eq 0).where] = 1.0 # avoid division by zero
+        (@matrix / norm)
       when :nmatrix # @see https://github.com/SciRuby/nmatrix/issues/38
         normal = NMatrix.new(:dense, @matrix.shape, 0, :float64)
         (0...@matrix.shape[1]).each do |j|
@@ -44,7 +48,7 @@ module TfIdfSimilarity
     # @param [Integer] column index
     def get(i, j)
       case @library
-      when :narray
+      when :narray, :numo
         @matrix[j, i]
       else
         @matrix[i, j]
@@ -57,6 +61,8 @@ module TfIdfSimilarity
       case @library
       when :narray
         @matrix[true, index]
+      when :numo
+        @matrix[index, true]
       else
         @matrix.row(index)
       end
@@ -66,7 +72,7 @@ module TfIdfSimilarity
     # @return [GSL::Vector::View,NArray,NMatrix,Vector] a column
     def column(index)
       case @library
-      when :narray
+      when :narray, :numo
         @matrix[index, true]
       else
         @matrix.column(index)
@@ -78,7 +84,7 @@ module TfIdfSimilarity
       case @library
       when :gsl, :nmatrix
         @matrix.shape[0]
-      when :narray
+      when :narray, :numo
         @matrix.shape[1]
       else
         @matrix.row_size
@@ -90,7 +96,7 @@ module TfIdfSimilarity
       case @library
       when :gsl, :nmatrix
         @matrix.shape[1]
-      when :narray
+      when :narray, :numo
         @matrix.shape[0]
       else
         @matrix.column_size
@@ -110,7 +116,7 @@ module TfIdfSimilarity
     # @return [Float] the sum of all values in the matrix
     def sum
       case @library
-      when :narray
+      when :narray, :numo
         @matrix.sum
       else
         values.reduce(0, :+)
@@ -125,6 +131,8 @@ module TfIdfSimilarity
         GSL::Matrix[*array]
       when :narray
         NArray[*array]
+      when :numo
+        Numo::DFloat[*array]
       when :nmatrix # @see https://github.com/SciRuby/nmatrix/issues/91#issuecomment-18870619
         NMatrix.new(:dense, [array.size, array.empty? ? 0 : array[0].size], array.flatten, :float64)
       else
@@ -136,7 +144,7 @@ module TfIdfSimilarity
     # @return [GSL::Matrix,NArray,NMatrix,Matrix] the product
     def multiply_self(matrix)
       case @library
-      when :nmatrix
+      when :nmatrix, :numo
         matrix.transpose.dot(matrix)
       else
         matrix.transpose * matrix
@@ -149,6 +157,8 @@ module TfIdfSimilarity
         GSL::Sf::log(number)
       when :narray
         NMath.log(number)
+      when :numo
+        Numo::NMath.log(number)
       else
         Math.log(number)
       end
@@ -158,6 +168,8 @@ module TfIdfSimilarity
       case @library
       when :narray
         NMath.sqrt(number)
+      when :numo
+        Numo::NMath.sqrt(number)
       else
         Math.sqrt(number)
       end

data/lib/tf-idf-similarity/model.rb CHANGED Viewed

@@ -15,7 +15,7 @@ module TfIdfSimilarity
       array = Array.new(terms.size) do |i|
         idf = inverse_document_frequency(terms[i])
         Array.new(documents.size) do |j|
-          term_frequency(documents[j], terms[i]) * idf
+          (term_frequency(documents[j], terms[i]) * idf).to_f
         end
       end

data/lib/tf-idf-similarity/term_count_model.rb CHANGED Viewed

@@ -37,6 +37,8 @@ module TfIdfSimilarity
         case @library
         when :gsl, :narray
           row(index).where.size
+        when :numo
+          (row(index).ne 0).where.size
         when :nmatrix
           row(index).each.count(&:nonzero?)
         else
@@ -57,7 +59,7 @@ module TfIdfSimilarity
       index = terms.index(term)
       if index
         case @library
-        when :gsl, :narray
+        when :gsl, :narray, :numo
           row(index).sum
         when :nmatrix
           row(index).each.reduce(0, :+) # NMatrix's `sum` method is slower

data/lib/tf-idf-similarity/token.rb CHANGED Viewed

@@ -1,5 +1,7 @@
 # coding: utf-8
 require 'delegate'
+require 'unicode_utils/downcase'
+require 'unicode_utils/each_word'
 # A token.
 #
@@ -47,5 +49,10 @@ module TfIdfSimilarity
     def classic_filter
       self.class.new(self.gsub('.', '').sub(/['`’]s\z/, ''))
     end
+    def to_s
+      # Don't call #lowercase_filter and #classic_filter to avoid creating unnecessary objects.
+      UnicodeUtils.downcase(self).gsub('.', '').sub(/['`’]s\z/, '')
+    end
   end
 end

data/lib/tf-idf-similarity/tokenizer.rb ADDED Viewed

@@ -0,0 +1,19 @@
+require 'unicode_utils/each_word'
+require 'tf-idf-similarity/token'
+# A tokenizer using UnicodeUtils to tokenize a text.
+#
+# @see https://github.com/lang/unicode_utils
+module TfIdfSimilarity
+  class Tokenizer
+    # Tokenizes a text.
+    #
+    # @param [String] text
+    # @return [Array<Token>] an array of Token objects
+    def tokenize(text)
+      UnicodeUtils.each_word(text).map do |word|
+        Token.new(word)
+      end
+    end
+  end
+end

data/lib/tf-idf-similarity/version.rb CHANGED Viewed

@@ -1,3 +1,3 @@
 module TfIdfSimilarity
-  VERSION = "0.1.6"
+  VERSION = "0.3.0"
 end

data/lib/tf-idf-similarity.rb CHANGED Viewed

@@ -1,9 +1,6 @@
 require 'forwardable'
 require 'set'
-require 'unicode_utils/downcase'
-require 'unicode_utils/each_word'
 module TfIdfSimilarity
 end

data/spec/bm25_model_spec.rb CHANGED Viewed

@@ -82,7 +82,12 @@ module TfIdfSimilarity
       describe '#term_frequency_inverse_document_frequency' do
         it 'should return negative infinity' do
-          model.tfidf(document, 'foo').should be_nan
+          case MATRIX_LIBRARY
+          when :numo
+            model.tfidf(document, 'foo').isnan.should eq 1
+          else
+            model.tfidf(document, 'foo').should be_nan
+          end
         end
       end
@@ -147,7 +152,7 @@ module TfIdfSimilarity
         end
         it 'should return the term frequency if tokens given' do
-          model.tf(document_with_tokens, 'foo-foo').should == (1 * 2.2) / (1 + 0.3 + 0.9 * 4 / 5.5)
+          model.tf(document_with_tokens, 'foo-foo').should == (1 * 2.2) / (1 + 0.3 + 0.9 * 3 / 5.5)
         end
         it 'should return no term frequency if no text given' do
@@ -155,7 +160,7 @@ module TfIdfSimilarity
         end
         it 'should return the term frequency if term counts given' do
-          model.tf(document_with_term_counts, 'bar').should == (5 * 2.2) / (5 + 0.3 + 0.9 * 4 / 5.5)
+          model.tf(document_with_term_counts, 'bar').should == (5 * 2.2) / (5 + 0.3 + 0.9 * 15 / 5.5)
         end
         it 'should return the term frequency of a non-occurring term' do
@@ -163,7 +168,7 @@ module TfIdfSimilarity
         end
         it 'should return the term frequency in a non-occurring document' do
-          model.tf(non_corpus_document, 'foo').should == (3 * 2.2) / (3 + 0.3 + 0.9 * 4 / 5.5)
+          model.tf(non_corpus_document, 'foo').should == (3 * 2.2) / (3 + 0.3 + 0.9 * 3 / 5.5)
         end
       end
@@ -177,17 +182,17 @@ module TfIdfSimilarity
         end
         it 'should return the tf*idf in a non-occurring term' do
-          model.tfidf(non_corpus_document, 'foo').should be_within(0.001).of(Math.log((4 - 1 + 0.5) / (1 + 0.5)) * (3 * 2.2) / (3 + 0.3 + 0.9 * 4 / 5.5))
+          model.tfidf(non_corpus_document, 'foo').should be_within(0.001).of(Math.log((4 - 1 + 0.5) / (1 + 0.5)) * (3 * 2.2) / (3 + 0.3 + 0.9 * 3 / 5.5))
         end
       end
       describe '#similarity_matrix' do
         it 'should return the similarity matrix' do
           expected = [
-            1.0,   0.564, 0.0, 0.479,
-            0.564, 1.0,   0.0, 0.540,
+            1.0,   0.558, 0.0, 0.449,
+            0.558, 1.0,   0.0, 0.501,
             0.0,   0.0,   0.0, 0.0,
-            0.479, 0.540, 0.0, 1.0,
+            0.449, 0.501, 0.0, 1.0,
           ]
           similarity_matrix_values(model).each_with_index do |value,i|

data/spec/spec_helper.rb CHANGED Viewed

@@ -18,6 +18,8 @@ when :gsl
   require 'gsl'
 when :narray
   require 'narray'
+when :numo
+  require 'numo/narray'
 when :nmatrix
   require 'nmatrix'
 else

data/td-idf-similarity.gemspec CHANGED Viewed

@@ -9,6 +9,7 @@ Gem::Specification.new do |s|
   s.homepage    = "https://github.com/jpmckinney/tf-idf-similarity"
   s.summary     = %q{Calculates the similarity between texts using tf*idf}
   s.license     = 'MIT'
+  s.required_ruby_version = '>= 2.4.0'
   s.files         = `git ls-files`.split("\n")
   s.test_files    = `git ls-files -- {test,spec,features}/*`.split("\n")
@@ -19,6 +20,6 @@ Gem::Specification.new do |s|
   s.add_development_dependency('coveralls')
   s.add_development_dependency('json', '< 2')
-  s.add_development_dependency('rake', '< 12')
-  s.add_development_dependency('rspec', '~> 2.10')
+  s.add_development_dependency('rake')
+  s.add_development_dependency('rspec', '~> 3.0')
 end

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: tf-idf-similarity
 version: !ruby/object:Gem::Version
-  version: 0.1.6
+  version: 0.3.0
 platform: ruby
 authors:
 - James McKinney
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2017-03-07 00:00:00.000000000 Z
+date: 2024-02-26 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: unicode_utils
@@ -56,30 +56,30 @@ dependencies:
   name: rake
   requirement: !ruby/object:Gem::Requirement
     requirements:
-    - - "<"
+    - - ">="
       - !ruby/object:Gem::Version
-        version: '12'
+        version: '0'
   type: :development
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
-    - - "<"
+    - - ">="
       - !ruby/object:Gem::Version
-        version: '12'
+        version: '0'
 - !ruby/object:Gem::Dependency
   name: rspec
   requirement: !ruby/object:Gem::Requirement
     requirements:
     - - "~>"
       - !ruby/object:Gem::Version
-        version: '2.10'
+        version: '3.0'
   type: :development
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
     - - "~>"
       - !ruby/object:Gem::Version
-        version: '2.10'
+        version: '3.0'
 description:
 email:
 executables: []
@@ -104,6 +104,7 @@ files:
 - lib/tf-idf-similarity/term_count_model.rb
 - lib/tf-idf-similarity/tf_idf_model.rb
 - lib/tf-idf-similarity/token.rb
+- lib/tf-idf-similarity/tokenizer.rb
 - lib/tf-idf-similarity/version.rb
 - spec/bm25_model_spec.rb
 - spec/document_spec.rb
@@ -125,15 +126,14 @@ required_ruby_version: !ruby/object:Gem::Requirement
   requirements:
   - - ">="
     - !ruby/object:Gem::Version
-      version: '0'
+      version: 2.4.0
 required_rubygems_version: !ruby/object:Gem::Requirement
   requirements:
   - - ">="
     - !ruby/object:Gem::Version
       version: '0'
 requirements: []
-rubyforge_project:
-rubygems_version: 2.4.5
+rubygems_version: 3.0.3.1
 signing_key:
 specification_version: 4
 summary: Calculates the similarity between texts using tf*idf