RubyGems - tf-idf-similarity - Versions diffs - 0.1.3 → 0.1.4 - Mend

tf-idf-similarity 0.1.3 → 0.1.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (23)

checksums.yaml +4 -4
data/.travis.yml +1 -8
data/Gemfile +2 -2
data/README.md +40 -9
data/lib/tf-idf-similarity.rb +1 -0
data/lib/tf-idf-similarity/bm25_model.rb +23 -62
data/lib/tf-idf-similarity/document.rb +69 -67
data/lib/tf-idf-similarity/extras/document.rb +10 -8
data/lib/tf-idf-similarity/extras/tf_idf_model.rb +157 -155
data/lib/tf-idf-similarity/matrix_methods.rb +137 -135
data/lib/tf-idf-similarity/model.rb +66 -0
data/lib/tf-idf-similarity/term_count_model.rb +59 -57
data/lib/tf-idf-similarity/tf_idf_model.rb +21 -60
data/lib/tf-idf-similarity/token.rb +39 -37
data/lib/tf-idf-similarity/version.rb +1 -1
data/spec/bm25_model_spec.rb +200 -0
data/spec/document_spec.rb +98 -96
data/spec/extras/tf_idf_model_spec.rb +224 -222
data/spec/spec_helper.rb +6 -0
data/spec/term_count_model_spec.rb +76 -74
data/spec/tf_idf_model_spec.rb +143 -117
data/spec/token_spec.rb +23 -21
metadata +6 -2

checksums.yaml CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: cdc8ab3c1938db619adf75d74fd77a5647b4e29c
-  data.tar.gz: dfbc6eaaa16328c30bdb2dc56481ec93265ec4aa
+  metadata.gz: c0ba1f941db96541f035a283df336907bf941439
+  data.tar.gz: 22bbec24681023e880e1e4e3fa14d26356630021
 SHA512:
-  metadata.gz: b96cd6bdf856430fa2caad8c3f1284dd4a7842395058778aa282f8d1352c7f092ff299e32440102a871519e329e692761b4bff9f7043b82bead490802c77dc75
-  data.tar.gz: c72cb027c925ca35e3d85eb6604f034f827146aca28b2e6e853736cd8f18b9c4229a92475989983ea7b43deb741f4c66ba6cb08aadf65d8bf38bf52399c2ff1b
+  metadata.gz: 9e7cca8d705d8080dff857d2d953a6f0091e361bb0693f0ce650e64a2f4633ad5db386fa41ea8b73ae1cfe839db8e4e9f56592c98b36cdc6ab756699ecfaa5f7
+  data.tar.gz: 3bcb9dcb07c9eb00c234920ff8d6340aac815c8181510d7ae65183e9b1d528001247439a86c6b603d973362bcee020eb0340558a9318693713cdaaa4b62a2ffd

data/.travis.yml CHANGED

@@ -1,21 +1,14 @@
 language: ruby
 rvm:
-  - 1.8.7
   - 1.9.2
   - 1.9.3
   - 2.0.0
-  - ree
+  - 2.1.0
 env:
   - MATRIX_LIBRARY=gsl
   - MATRIX_LIBRARY=narray
   - MATRIX_LIBRARY=nmatrix
   - MATRIX_LIBRARY=matrix
-matrix:
-  exclude:
-    - rvm: 1.8.7
-      env: MATRIX_LIBRARY=nmatrix
-    - rvm: ree
-      env: MATRIX_LIBRARY=nmatrix
 before_install:
   - bundle config build.nmatrix --with-lapacklib
   - if [ $MATRIX_LIBRARY = 'nmatrix' -o $MATRIX_LIBRARY = 'gsl' ]; then sudo apt-get update -qq; fi

data/Gemfile CHANGED

@@ -1,8 +1,8 @@
 source "http://rubygems.org"
-gem 'gsl', '~> 1.15.3'     if ENV['MATRIX_LIBRARY'] == 'gsl'
+gem 'rb-gsl', '~> 1.16.0.2'     if ENV['MATRIX_LIBRARY'] == 'gsl'
 gem 'narray', '~> 0.6.0.0' if ENV['MATRIX_LIBRARY'] == 'narray'
-gem 'nmatrix', '~> 0.0.9'  if ENV['MATRIX_LIBRARY'] == 'nmatrix' && RUBY_VERSION >= '1.9'
+gem 'nmatrix', '~> 0.1.0.rc5'  if ENV['MATRIX_LIBRARY'] == 'nmatrix' && RUBY_VERSION >= '1.9'
 # Specify your gem's dependencies in the gemspec
 gemspec

data/README.md CHANGED

@@ -1,5 +1,6 @@
 # Ruby Vector Space Model (VSM) with tf*idf weights
+[![Gem Version](https://badge.fury.io/rb/tf-idf-similarity.svg)](http://badge.fury.io/rb/tf-idf-similarity)
 [![Build Status](https://secure.travis-ci.org/opennorth/tf-idf-similarity.png)](http://travis-ci.org/opennorth/tf-idf-similarity)
 [![Dependency Status](https://gemnasium.com/opennorth/tf-idf-similarity.png)](https://gemnasium.com/opennorth/tf-idf-similarity)
 [![Coverage Status](https://coveralls.io/repos/opennorth/tf-idf-similarity/badge.png?branch=master)](https://coveralls.io/r/opennorth/tf-idf-similarity)
@@ -9,23 +10,53 @@ Calculates the similarity between texts using a [bag-of-words](http://en.wikiped
 ## Usage
-    require 'matrix'
-    require 'tf-idf-similarity'
+```ruby
+require 'matrix'
+require 'tf-idf-similarity'
+```
 Create a set of documents:
-    corpus = []
-    corpus << TfIdfSimilarity::Document.new("Lorem ipsum dolor sit amet...")
-    corpus << TfIdfSimilarity::Document.new("Pellentesque sed ipsum dui...")
-    corpus << TfIdfSimilarity::Document.new("Nam scelerisque dui sed leo...")
+```ruby
+document1 = TfIdfSimilarity::Document.new("Lorem ipsum dolor sit amet...")
+document2 = TfIdfSimilarity::Document.new("Pellentesque sed ipsum dui...")
+document3 = TfIdfSimilarity::Document.new("Nam scelerisque dui sed leo...")
+corpus = [document1, document2, document3]
+```
 Create a document-term matrix using [Term Frequency-Inverse Document Frequency function](http://en.wikipedia.org/wiki/Tf*idf):
-    model = TfIdfSimilarity::TfIdfModel.new(corpus)
+```ruby
+model = TfIdfSimilarity::TfIdfModel.new(corpus)
+```
-Create a document-term matrix using the [Okapi BM25 ranking function](http://en.wikipedia.org/wiki/Okapi_BM25):
+Or, create a document-term matrix using the [Okapi BM25 ranking function](http://en.wikipedia.org/wiki/Okapi_BM25):
-    model = TfIdfSimilarity::BM25Model.new(corpus)
+```ruby
+model = TfIdfSimilarity::BM25Model.new(corpus)
+```
+Create a similarity matrix:
+```ruby
+matrix = model.similarity_matrix
+```
+Find the similarity of two documents in the matrix:
+```ruby
+matrix[model.document_index(document1), model.document_index(document2)]
+```
+Print the tf*idf values for terms in a document:
+```ruby
+tfidf_by_term = {}
+document1.terms.each do |term|
+  tfidf_by_term[term] = model.tfidf(document1, term)
+end
+puts tfidf_by_term.sort_by{|_,tfidf| -tfidf}
+```
 [Read the documentation at RubyDoc.info.](http://rubydoc.info/gems/tf-idf-similarity)

data/lib/tf-idf-similarity.rb CHANGED

@@ -13,6 +13,7 @@ end
 require 'tf-idf-similarity/matrix_methods'
 require 'tf-idf-similarity/term_count_model'
+require 'tf-idf-similarity/model'
 require 'tf-idf-similarity/tf_idf_model'
 require 'tf-idf-similarity/bm25_model'
 require 'tf-idf-similarity/document'

data/lib/tf-idf-similarity/bm25_model.rb CHANGED

@@ -2,68 +2,29 @@
 #
 # @see http://lucene.apache.org/core/4_0_0-BETA/core/org/apache/lucene/search/similarities/BM25Similarity.html
 # @see http://en.wikipedia.org/wiki/Okapi_BM25
-class TfIdfSimilarity::BM25Model
-  include TfIdfSimilarity::MatrixMethods
-  extend Forwardable
-  def_delegators :@model, :documents, :terms, :document_count
-  # @param [Array<TfIdfSimilarity::Document>] documents documents
-  # @param [Hash] opts optional arguments
-  # @option opts [Symbol] :library :gsl, :narray, :nmatrix or :matrix (default)
-  def initialize(documents, opts = {})
-    @model = TfIdfSimilarity::TermCountModel.new(documents, opts)
-    @library = (opts[:library] || :matrix).to_sym
-    array = Array.new(terms.size) do |i|
-      idf = inverse_document_frequency(terms[i])
-      Array.new(documents.size) do |j|
-        term_frequency(documents[j], terms[i]) * idf
-      end
+module TfIdfSimilarity
+  class BM25Model < Model
+    # Return the term's inverse document frequency.
+    #
+    # @param [String] term a term
+    # @return [Float] the term's inverse document frequency
+    def inverse_document_frequency(term)
+      df = @model.document_count(term)
+      log((documents.size - df + 0.5) / (df + 0.5))
     end
-    @matrix = initialize_matrix(array)
-  end
-  # Return the term's inverse document frequency.
-  #
-  # @param [String] term a term
-  # @return [Float] the term's inverse document frequency
-  def inverse_document_frequency(term)
-    df = @model.document_count(term)
-    log((documents.size - df + 0.5) / (df + 0.5))
-  end
-  alias_method :idf, :inverse_document_frequency
-  # Returns the term's frequency in the document.
-  #
-  # @param [Document] document a document
-  # @param [String] term a term
-  # @return [Float] the term's frequency in the document
-  #
-  # @note Like Lucene, we use a b value of 0.75 and a k1 value of 1.2.
-  def term_frequency(document, term)
-    tf = document.term_count(term)
-    (tf * 2.2) / (tf + 0.3 + 0.9 * documents.size / @model.average_document_size)
-  end
-  alias_method :tf, :term_frequency
-  # Return the term frequency–inverse document frequency.
-  #
-  # @param [Document] document a document
-  # @param [String] term a term
-  # @return [Float] the term frequency–inverse document frequency
-  def term_frequency_inverse_document_frequency(document, term)
-    inverse_document_frequency(term) * term_frequency(document, term)
-  end
-  alias_method :tfidf, :term_frequency_inverse_document_frequency
-  # Returns a similarity matrix for the documents in the corpus.
-  #
-  # @return [GSL::Matrix,NMatrix,Matrix] a similarity matrix
-  # @note Columns are normalized to unit vectors, so we can calculate the cosine
-  #   similarity of all document vectors.
-  def similarity_matrix
-    multiply_self(normalize)
+    alias_method :idf, :inverse_document_frequency
+    # Returns the term's frequency in the document.
+    #
+    # @param [Document] document a document
+    # @param [String] term a term
+    # @return [Float] the term's frequency in the document
+    #
+    # @note Like Lucene, we use a b value of 0.75 and a k1 value of 1.2.
+    def term_frequency(document, term)
+      tf = document.term_count(term)
+      (tf * 2.2) / (tf + 0.3 + 0.9 * documents.size / @model.average_document_size)
+    end
+    alias_method :tf, :term_frequency
   end
 end

data/lib/tf-idf-similarity/document.rb CHANGED

@@ -1,80 +1,82 @@
 # A document.
-class TfIdfSimilarity::Document
-  # The document's identifier.
-  attr_reader :id
-  # The document's text.
-  attr_reader :text
-  # The number of times each term appears in the document.
-  attr_reader :term_counts
-  # The number of tokens in the document.
-  attr_reader :size
+module TfIdfSimilarity
+  class Document
+    # The document's identifier.
+    attr_reader :id
+    # The document's text.
+    attr_reader :text
+    # The number of times each term appears in the document.
+    attr_reader :term_counts
+    # The number of tokens in the document.
+    attr_reader :size
-  # @param [String] text the document's text
-  # @param [Hash] opts optional arguments
-  # @option opts [String] :id the document's identifier
-  # @option opts [Array] :tokens the document's tokenized text
-  # @option opts [Hash] :term_counts the number of times each term appears
-  # @option opts [Integer] :size the number of tokens in the document
-  def initialize(text, opts = {})
-    @text   = text
-    @id     = opts[:id] || object_id
-    @tokens = opts[:tokens]
+    # @param [String] text the document's text
+    # @param [Hash] opts optional arguments
+    # @option opts [String] :id the document's identifier
+    # @option opts [Array] :tokens the document's tokenized text
+    # @option opts [Hash] :term_counts the number of times each term appears
+    # @option opts [Integer] :size the number of tokens in the document
+    def initialize(text, opts = {})
+      @text   = text
+      @id     = opts[:id] || object_id
+      @tokens = opts[:tokens]
-    if opts[:term_counts]
-      @term_counts = opts[:term_counts]
-      @size = opts[:size] || term_counts.values.reduce(0, :+)
-      # Nothing to do.
-    else
-      @term_counts = Hash.new(0)
-      @size = 0
-      set_term_counts_and_size
+      if opts[:term_counts]
+        @term_counts = opts[:term_counts]
+        @size = opts[:size] || term_counts.values.reduce(0, :+)
+        # Nothing to do.
+      else
+        @term_counts = Hash.new(0)
+        @size = 0
+        set_term_counts_and_size
+      end
     end
-  end
-  # Returns the set of terms in the document.
-  #
-  # @return [Array<String>] the unique terms in the document
-  def terms
-    term_counts.keys
-  end
+    # Returns the set of terms in the document.
+    #
+    # @return [Array<String>] the unique terms in the document
+    def terms
+      term_counts.keys
+    end
-  # Returns the number of occurrences of the term in the document.
-  #
-  # @param [String] term a term
-  # @return [Integer] the number of times the term appears in the document
-  def term_count(term)
-    term_counts[term].to_i # need #to_i if unmarshalled
-  end
+    # Returns the number of occurrences of the term in the document.
+    #
+    # @param [String] term a term
+    # @return [Integer] the number of times the term appears in the document
+    def term_count(term)
+      term_counts[term].to_i # need #to_i if unmarshalled
+    end
-private
+  private
-  # Tokenizes the text and counts terms and total tokens.
-  def set_term_counts_and_size
-    tokenize(text).each do |word|
-      token = TfIdfSimilarity::Token.new(word)
-      if token.valid?
-        term = token.lowercase_filter.classic_filter.to_s
-        @term_counts[term] += 1
-        @size += 1
+    # Tokenizes the text and counts terms and total tokens.
+    def set_term_counts_and_size
+      tokenize(text).each do |word|
+        token = Token.new(word)
+        if token.valid?
+          term = token.lowercase_filter.classic_filter.to_s
+          @term_counts[term] += 1
+          @size += 1
+        end
       end
     end
-  end
-  # Tokenizes a text, respecting the word boundary rules from Unicode’s Default
-  # Word Boundary Specification.
-  #
-  # If a tokenized text was provided at the document's initialization, those
-  # tokens will be returned without additional processing.
-  #
-  # @param [String] text a text
-  # @return [Enumerator] a token enumerator
-  #
-  # @note We should evaluate the tokenizers by {http://www.sciencemag.org/content/suppl/2010/12/16/science.1199644.DC1/Michel.SOM.revision.2.pdf Google}
-  #   or {http://wiki.apache.org/solr/AnalyzersTokenizersTokenFilters#solr.UAX29URLEmailTokenizerFactory Solr}.
-  #
-  # @see http://unicode.org/reports/tr29/#Default_Word_Boundaries
-  # @see http://wiki.apache.org/solr/AnalyzersTokenizersTokenFilters#solr.StandardTokenizerFactory
-  def tokenize(text)
-    @tokens || defined?(UnicodeUtils) && UnicodeUtils.each_word(text) || text.split(/\b/) # @todo Ruby 1.8 has no good word boundary code
+    # Tokenizes a text, respecting the word boundary rules from Unicode’s Default
+    # Word Boundary Specification.
+    #
+    # If a tokenized text was provided at the document's initialization, those
+    # tokens will be returned without additional processing.
+    #
+    # @param [String] text a text
+    # @return [Enumerator] a token enumerator
+    #
+    # @note We should evaluate the tokenizers by {http://www.sciencemag.org/content/suppl/2010/12/16/science.1199644.DC1/Michel.SOM.revision.2.pdf Google}
+    #   or {http://wiki.apache.org/solr/AnalyzersTokenizersTokenFilters#solr.UAX29URLEmailTokenizerFactory Solr}.
+    #
+    # @see http://unicode.org/reports/tr29/#Default_Word_Boundaries
+    # @see http://wiki.apache.org/solr/AnalyzersTokenizersTokenFilters#solr.StandardTokenizerFactory
+    def tokenize(text)
+      @tokens || defined?(UnicodeUtils) && UnicodeUtils.each_word(text) || text.split(/\b/) # @todo Ruby 1.8 has no good word boundary code
+    end
   end
 end

data/lib/tf-idf-similarity/extras/document.rb CHANGED

@@ -1,11 +1,13 @@
-class TfIdfSimilarity::Document
-  # @return [Float] the maximum term count of any term in the document
-  def maximum_term_count
-    @maximum_term_count ||= term_counts.values.max.to_f
-  end
+module TfIdfSimilarity
+  class Document
+    # @return [Float] the maximum term count of any term in the document
+    def maximum_term_count
+      @maximum_term_count ||= term_counts.values.max.to_f
+    end
-  # @return [Float] the average term count of all terms in the document
-  def average_term_count
-    @average_term_count ||= term_counts.values.reduce(0, :+) / term_counts.size.to_f
+    # @return [Float] the average term count of all terms in the document
+    def average_term_count
+      @average_term_count ||= term_counts.values.reduce(0, :+) / term_counts.size.to_f
+    end
   end
 end

data/lib/tf-idf-similarity/extras/tf_idf_model.rb CHANGED

@@ -10,183 +10,185 @@
 # @see http://nlp.stanford.edu/IR-book/html/htmledition/document-and-query-weighting-schemes-1.html
 # @see http://www.cs.odu.edu/~jbollen/IR04/readings/article1-29-03.pdf
 # @see http://www.sandia.gov/~tgkolda/pubs/bibtgkfiles/ornl-tm-13756.pdf
-class TfIdfSimilarity::TfIdfModel
-  # @see https://github.com/louismullie/treat/blob/master/lib/treat/workers/extractors/tf_idf/native.rb#L17
-  #
-  # SMART n, Salton x, Chisholm NONE
-  def no_collection_frequency(term)
-    1.0
-  end
-  # @see https://github.com/reddavis/TF-IDF/blob/master/lib/tf_idf.rb#L50
-  #
-  # SMART t, Salton f, Chisholm IDFB
-  def plain_inverse_document_frequency(term, numerator = 0, denominator = 0)
-    log((documents.size + numerator) / (@model.document_count(term).to_f + denominator))
-  end
-  alias_method :plain_idf, :plain_inverse_document_frequency
+module TfIdfSimilarity
+  class TfIdfModel
+    # @see https://github.com/louismullie/treat/blob/master/lib/treat/workers/extractors/tf_idf/native.rb#L17
+    #
+    # SMART n, Salton x, Chisholm NONE
+    def no_collection_frequency(term)
+      1.0
+    end
-  # SMART p, Salton p, Chisholm IDFP
-  def probabilistic_inverse_document_frequency(term)
-    count = @model.document_count(term).to_f
-    log((documents.size - count) / count)
-  end
-  alias_method :probabilistic_idf, :probabilistic_inverse_document_frequency
+    # @see https://github.com/reddavis/TF-IDF/blob/master/lib/tf_idf.rb#L50
+    #
+    # SMART t, Salton f, Chisholm IDFB
+    def plain_inverse_document_frequency(term, numerator = 0, denominator = 0)
+      log((documents.size + numerator) / (@model.document_count(term).to_f + denominator))
+    end
+    alias_method :plain_idf, :plain_inverse_document_frequency
-  # Chisholm IGFF
-  def global_frequency_inverse_document_frequency(term)
-    @model.term_count(term) / @model.document_count(term).to_f
-  end
-  alias_method :gfidf, :global_frequency_inverse_document_frequency
+    # SMART p, Salton p, Chisholm IDFP
+    def probabilistic_inverse_document_frequency(term)
+      count = @model.document_count(term).to_f
+      log((documents.size - count) / count)
+    end
+    alias_method :probabilistic_idf, :probabilistic_inverse_document_frequency
-  # Chisholm IGFL
-  def log_global_frequency_inverse_document_frequency(term)
-    log(global_frequency_inverse_document_frequency(term) + 1)
-  end
-  alias_method :log_gfidf, :log_global_frequency_inverse_document_frequency
+    # Chisholm IGFF
+    def global_frequency_inverse_document_frequency(term)
+      @model.term_count(term) / @model.document_count(term).to_f
+    end
+    alias_method :gfidf, :global_frequency_inverse_document_frequency
-  # Chisholm IGFI
-  def incremented_global_frequency_inverse_document_frequency(term)
-    global_frequency_inverse_document_frequency(term) + 1
-  end
-  alias_method :incremented_gfidf, :incremented_global_frequency_inverse_document_frequency
+    # Chisholm IGFL
+    def log_global_frequency_inverse_document_frequency(term)
+      log(global_frequency_inverse_document_frequency(term) + 1)
+    end
+    alias_method :log_gfidf, :log_global_frequency_inverse_document_frequency
-  # Chisholm IGFS
-  def square_root_global_frequency_inverse_document_frequency(term)
-    sqrt(global_frequency_inverse_document_frequency(term) - 0.9)
-  end
-  alias_method :square_root_gfidf, :square_root_global_frequency_inverse_document_frequency
+    # Chisholm IGFI
+    def incremented_global_frequency_inverse_document_frequency(term)
+      global_frequency_inverse_document_frequency(term) + 1
+    end
+    alias_method :incremented_gfidf, :incremented_global_frequency_inverse_document_frequency
-  # Chisholm ENPY
-  def entropy(term)
-    denominator = @model.term_count(term).to_f
-    logN = log(documents.size)
-    1 + documents.reduce(0) do |sum,document|
-      quotient = document.term_count(term) / denominator
-      sum += quotient * log(quotient) / logN
+    # Chisholm IGFS
+    def square_root_global_frequency_inverse_document_frequency(term)
+      sqrt(global_frequency_inverse_document_frequency(term) - 0.9)
+    end
+    alias_method :square_root_gfidf, :square_root_global_frequency_inverse_document_frequency
+    # Chisholm ENPY
+    def entropy(term)
+      denominator = @model.term_count(term).to_f
+      logN = log(documents.size)
+      1 + documents.reduce(0) do |sum,document|
+        quotient = document.term_count(term) / denominator
+        sum += quotient * log(quotient) / logN
+      end
     end
-  end
-  # @see https://github.com/mkdynamic/vss/blob/master/lib/vss/engine.rb
-  # @see https://github.com/louismullie/treat/blob/master/lib/treat/workers/extractors/tf_idf/native.rb
-  # @see https://github.com/reddavis/TF-IDF/blob/master/lib/tf_idf.rb
-  # @see https://github.com/mchung/tf-idf/blob/master/lib/tf-idf.rb
-  # @see https://github.com/josephwilk/rsemantic/blob/master/lib/semantic/transform/tf_idf_transform.rb
-  #
-  # SMART n, Salton x, Chisholm NONE
-  def no_normalization(matrix)
-    matrix
-  end
+    # @see https://github.com/mkdynamic/vss/blob/master/lib/vss/engine.rb
+    # @see https://github.com/louismullie/treat/blob/master/lib/treat/workers/extractors/tf_idf/native.rb
+    # @see https://github.com/reddavis/TF-IDF/blob/master/lib/tf_idf.rb
+    # @see https://github.com/mchung/tf-idf/blob/master/lib/tf-idf.rb
+    # @see https://github.com/josephwilk/rsemantic/blob/master/lib/semantic/transform/tf_idf_transform.rb
+    #
+    # SMART n, Salton x, Chisholm NONE
+    def no_normalization(matrix)
+      matrix
+    end
-  # @see http://nlp.stanford.edu/IR-book/html/htmledition/pivoted-normalized-document-length-1.html
-  #
-  # SMART u, Chisholm PUQN
-  def pivoted_unique_normalization(matrix)
-    raise NotImplementedError
-  end
+    # @see http://nlp.stanford.edu/IR-book/html/htmledition/pivoted-normalized-document-length-1.html
+    #
+    # SMART u, Chisholm PUQN
+    def pivoted_unique_normalization(matrix)
+      raise NotImplementedError
+    end
-  # Cosine normalization is implemented as TfIdfSimilarity::MatrixMethods#normalize.
-  #
-  # SMART c, Salton c, Chisholm COSN
+    # Cosine normalization is implemented as MatrixMethods#normalize.
+    #
+    # SMART c, Salton c, Chisholm COSN
-  # The plain term frequency is implemented as TfIdfSimilarity::Document#term_count.
-  #
-  # @see https://github.com/mkdynamic/vss/blob/master/lib/vss/engine.rb#L75
-  # @see https://github.com/louismullie/treat/blob/master/lib/treat/workers/extractors/tf_idf/native.rb#L11
-  #
-  # SMART n, Salton t, Chisholm FREQ
+    # The plain term frequency is implemented as Document#term_count.
+    #
+    # @see https://github.com/mkdynamic/vss/blob/master/lib/vss/engine.rb#L75
+    # @see https://github.com/louismullie/treat/blob/master/lib/treat/workers/extractors/tf_idf/native.rb#L11
+    #
+    # SMART n, Salton t, Chisholm FREQ
-  # SMART b, Salton b, Chisholm BNRY
-  def binary_term_frequency(document, term)
-    count = document.term_count(term)
-    if count > 0
-      1
-    else
-      0
+    # SMART b, Salton b, Chisholm BNRY
+    def binary_term_frequency(document, term)
+      count = document.term_count(term)
+      if count > 0
+        1
+      else
+        0
+      end
     end
-  end
-  alias_method :binary_tf, :binary_term_frequency
-  # @see http://en.wikipedia.org/wiki/Tf*idf
-  # @see http://nlp.stanford.edu/IR-book/html/htmledition/maximum-tf-normalization-1.html
-  def normalized_term_frequency(document, term, a = 0)
-    a + (1 - a) * document.term_count(term) / document.maximum_term_count
-  end
-  alias_method :normalized_tf, :normalized_term_frequency
+    alias_method :binary_tf, :binary_term_frequency
-  # SMART a, Salton n, Chisholm ATF1
-  def augmented_normalized_term_frequency(document, term)
-    0.5 + 0.5 * normalized_term_frequency(document, term)
-  end
-  alias_method :augmented_normalized_tf, :augmented_normalized_term_frequency
-  # Chisholm ATFA
-  def augmented_average_term_frequency(document, term)
-    count = document.term_count(term)
-    if count > 0
-      0.9 + 0.1 * count / document.average_term_count
-    else
-      0
+    # @see http://en.wikipedia.org/wiki/Tf*idf
+    # @see http://nlp.stanford.edu/IR-book/html/htmledition/maximum-tf-normalization-1.html
+    def normalized_term_frequency(document, term, a = 0)
+      a + (1 - a) * document.term_count(term) / document.maximum_term_count
     end
-  end
-  alias_method :augmented_average_tf, :augmented_average_term_frequency
+    alias_method :normalized_tf, :normalized_term_frequency
-  # Chisholm ATFC
-  def changed_coefficient_augmented_normalized_term_frequency(document, term)
-    count = document.term_count(term)
-    if count > 0
-      0.2 + 0.8 * count / document.maximum_term_count
-    else
-      0
+    # SMART a, Salton n, Chisholm ATF1
+    def augmented_normalized_term_frequency(document, term)
+      0.5 + 0.5 * normalized_term_frequency(document, term)
     end
-  end
-  alias_method :changed_coefficient_augmented_normalized_tf, :changed_coefficient_augmented_normalized_term_frequency
-  # @see https://github.com/louismullie/treat/blob/master/lib/treat/workers/extractors/tf_idf/native.rb#L12
-  #
-  # SMART l, Chisholm LOGA
-  def log_term_frequency(document, term)
-    count = document.term_count(term)
-    if count > 0
-      1 + log(count)
-    else
-      0
+    alias_method :augmented_normalized_tf, :augmented_normalized_term_frequency
+    # Chisholm ATFA
+    def augmented_average_term_frequency(document, term)
+      count = document.term_count(term)
+      if count > 0
+        0.9 + 0.1 * count / document.average_term_count
+      else
+        0
+      end
     end
-  end
-  alias_method :log_tf, :log_term_frequency
-  # SMART L, Chisholm LOGN
-  def normalized_log_term_frequency(document, term)
-    count = document.term_count(term)
-    if count > 0
-      (1 + log(count)) / (1 + log(document.average_term_count))
-    else
-      0
+    alias_method :augmented_average_tf, :augmented_average_term_frequency
+    # Chisholm ATFC
+    def changed_coefficient_augmented_normalized_term_frequency(document, term)
+      count = document.term_count(term)
+      if count > 0
+        0.2 + 0.8 * count / document.maximum_term_count
+      else
+        0
+      end
     end
-  end
-  alias_method :normalized_log_tf, :normalized_log_term_frequency
-  # Chisholm LOGG
-  def augmented_log_term_frequency(document, term)
-    count = document.term_count(term)
-    if count > 0
-      0.2 + 0.8 * log(count + 1)
-    else
-      0
+    alias_method :changed_coefficient_augmented_normalized_tf, :changed_coefficient_augmented_normalized_term_frequency
+    # @see https://github.com/louismullie/treat/blob/master/lib/treat/workers/extractors/tf_idf/native.rb#L12
+    #
+    # SMART l, Chisholm LOGA
+    def log_term_frequency(document, term)
+      count = document.term_count(term)
+      if count > 0
+        1 + log(count)
+      else
+        0
+      end
     end
-  end
-  alias_method :augmented_log_tf, :augmented_log_term_frequency
-  # Chisholm SQRT
-  def square_root_term_frequency(document, term)
-    count = document.term_count(term)
-    if count > 0
-      sqrt(count - 0.5) + 1
-    else
-      0
+    alias_method :log_tf, :log_term_frequency
+    # SMART L, Chisholm LOGN
+    def normalized_log_term_frequency(document, term)
+      count = document.term_count(term)
+      if count > 0
+        (1 + log(count)) / (1 + log(document.average_term_count))
+      else
+        0
+      end
+    end
+    alias_method :normalized_log_tf, :normalized_log_term_frequency
+    # Chisholm LOGG
+    def augmented_log_term_frequency(document, term)
+      count = document.term_count(term)
+      if count > 0
+        0.2 + 0.8 * log(count + 1)
+      else
+        0
+      end
+    end
+    alias_method :augmented_log_tf, :augmented_log_term_frequency
+    # Chisholm SQRT
+    def square_root_term_frequency(document, term)
+      count = document.term_count(term)
+      if count > 0
+        sqrt(count - 0.5) + 1
+      else
+        0
+      end
     end
+    alias_method :square_root_tf, :square_root_term_frequency
   end
-  alias_method :square_root_tf, :square_root_term_frequency
 end