tf-idf-similarity 0.1.3 → 0.1.4

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: cdc8ab3c1938db619adf75d74fd77a5647b4e29c
4
- data.tar.gz: dfbc6eaaa16328c30bdb2dc56481ec93265ec4aa
3
+ metadata.gz: c0ba1f941db96541f035a283df336907bf941439
4
+ data.tar.gz: 22bbec24681023e880e1e4e3fa14d26356630021
5
5
  SHA512:
6
- metadata.gz: b96cd6bdf856430fa2caad8c3f1284dd4a7842395058778aa282f8d1352c7f092ff299e32440102a871519e329e692761b4bff9f7043b82bead490802c77dc75
7
- data.tar.gz: c72cb027c925ca35e3d85eb6604f034f827146aca28b2e6e853736cd8f18b9c4229a92475989983ea7b43deb741f4c66ba6cb08aadf65d8bf38bf52399c2ff1b
6
+ metadata.gz: 9e7cca8d705d8080dff857d2d953a6f0091e361bb0693f0ce650e64a2f4633ad5db386fa41ea8b73ae1cfe839db8e4e9f56592c98b36cdc6ab756699ecfaa5f7
7
+ data.tar.gz: 3bcb9dcb07c9eb00c234920ff8d6340aac815c8181510d7ae65183e9b1d528001247439a86c6b603d973362bcee020eb0340558a9318693713cdaaa4b62a2ffd
@@ -1,21 +1,14 @@
1
1
  language: ruby
2
2
  rvm:
3
- - 1.8.7
4
3
  - 1.9.2
5
4
  - 1.9.3
6
5
  - 2.0.0
7
- - ree
6
+ - 2.1.0
8
7
  env:
9
8
  - MATRIX_LIBRARY=gsl
10
9
  - MATRIX_LIBRARY=narray
11
10
  - MATRIX_LIBRARY=nmatrix
12
11
  - MATRIX_LIBRARY=matrix
13
- matrix:
14
- exclude:
15
- - rvm: 1.8.7
16
- env: MATRIX_LIBRARY=nmatrix
17
- - rvm: ree
18
- env: MATRIX_LIBRARY=nmatrix
19
12
  before_install:
20
13
  - bundle config build.nmatrix --with-lapacklib
21
14
  - if [ $MATRIX_LIBRARY = 'nmatrix' -o $MATRIX_LIBRARY = 'gsl' ]; then sudo apt-get update -qq; fi
data/Gemfile CHANGED
@@ -1,8 +1,8 @@
1
1
  source "http://rubygems.org"
2
2
 
3
- gem 'gsl', '~> 1.15.3' if ENV['MATRIX_LIBRARY'] == 'gsl'
3
+ gem 'rb-gsl', '~> 1.16.0.2' if ENV['MATRIX_LIBRARY'] == 'gsl'
4
4
  gem 'narray', '~> 0.6.0.0' if ENV['MATRIX_LIBRARY'] == 'narray'
5
- gem 'nmatrix', '~> 0.0.9' if ENV['MATRIX_LIBRARY'] == 'nmatrix' && RUBY_VERSION >= '1.9'
5
+ gem 'nmatrix', '~> 0.1.0.rc5' if ENV['MATRIX_LIBRARY'] == 'nmatrix' && RUBY_VERSION >= '1.9'
6
6
 
7
7
  # Specify your gem's dependencies in the gemspec
8
8
  gemspec
data/README.md CHANGED
@@ -1,5 +1,6 @@
1
1
  # Ruby Vector Space Model (VSM) with tf*idf weights
2
2
 
3
+ [![Gem Version](https://badge.fury.io/rb/tf-idf-similarity.svg)](http://badge.fury.io/rb/tf-idf-similarity)
3
4
  [![Build Status](https://secure.travis-ci.org/opennorth/tf-idf-similarity.png)](http://travis-ci.org/opennorth/tf-idf-similarity)
4
5
  [![Dependency Status](https://gemnasium.com/opennorth/tf-idf-similarity.png)](https://gemnasium.com/opennorth/tf-idf-similarity)
5
6
  [![Coverage Status](https://coveralls.io/repos/opennorth/tf-idf-similarity/badge.png?branch=master)](https://coveralls.io/r/opennorth/tf-idf-similarity)
@@ -9,23 +10,53 @@ Calculates the similarity between texts using a [bag-of-words](http://en.wikiped
9
10
 
10
11
  ## Usage
11
12
 
12
- require 'matrix'
13
- require 'tf-idf-similarity'
13
+ ```ruby
14
+ require 'matrix'
15
+ require 'tf-idf-similarity'
16
+ ```
14
17
 
15
18
  Create a set of documents:
16
19
 
17
- corpus = []
18
- corpus << TfIdfSimilarity::Document.new("Lorem ipsum dolor sit amet...")
19
- corpus << TfIdfSimilarity::Document.new("Pellentesque sed ipsum dui...")
20
- corpus << TfIdfSimilarity::Document.new("Nam scelerisque dui sed leo...")
20
+ ```ruby
21
+ document1 = TfIdfSimilarity::Document.new("Lorem ipsum dolor sit amet...")
22
+ document2 = TfIdfSimilarity::Document.new("Pellentesque sed ipsum dui...")
23
+ document3 = TfIdfSimilarity::Document.new("Nam scelerisque dui sed leo...")
24
+ corpus = [document1, document2, document3]
25
+ ```
21
26
 
22
27
  Create a document-term matrix using [Term Frequency-Inverse Document Frequency function](http://en.wikipedia.org/wiki/Tf%E2%80%93idf):
23
28
 
24
- model = TfIdfSimilarity::TfIdfModel.new(corpus)
29
+ ```ruby
30
+ model = TfIdfSimilarity::TfIdfModel.new(corpus)
31
+ ```
25
32
 
26
- Create a document-term matrix using the [Okapi BM25 ranking function](http://en.wikipedia.org/wiki/Okapi_BM25):
33
+ Or, create a document-term matrix using the [Okapi BM25 ranking function](http://en.wikipedia.org/wiki/Okapi_BM25):
27
34
 
28
- model = TfIdfSimilarity::BM25Model.new(corpus)
35
+ ```ruby
36
+ model = TfIdfSimilarity::BM25Model.new(corpus)
37
+ ```
38
+
39
+ Create a similarity matrix:
40
+
41
+ ```ruby
42
+ matrix = model.similarity_matrix
43
+ ```
44
+
45
+ Find the similarity of two documents in the matrix:
46
+
47
+ ```ruby
48
+ matrix[model.document_index(document1), model.document_index(document2)]
49
+ ```
50
+
51
+ Print the tf*idf values for terms in a document:
52
+
53
+ ```ruby
54
+ tfidf_by_term = {}
55
+ document1.terms.each do |term|
56
+ tfidf_by_term[term] = model.tfidf(document1, term)
57
+ end
58
+ puts tfidf_by_term.sort_by{|_,tfidf| -tfidf}
59
+ ```
29
60
 
30
61
  [Read the documentation at RubyDoc.info.](http://rubydoc.info/gems/tf-idf-similarity)
31
62
 
@@ -13,6 +13,7 @@ end
13
13
 
14
14
  require 'tf-idf-similarity/matrix_methods'
15
15
  require 'tf-idf-similarity/term_count_model'
16
+ require 'tf-idf-similarity/model'
16
17
  require 'tf-idf-similarity/tf_idf_model'
17
18
  require 'tf-idf-similarity/bm25_model'
18
19
  require 'tf-idf-similarity/document'
@@ -2,68 +2,29 @@
2
2
  #
3
3
  # @see http://lucene.apache.org/core/4_0_0-BETA/core/org/apache/lucene/search/similarities/BM25Similarity.html
4
4
  # @see http://en.wikipedia.org/wiki/Okapi_BM25
5
- class TfIdfSimilarity::BM25Model
6
- include TfIdfSimilarity::MatrixMethods
7
-
8
- extend Forwardable
9
- def_delegators :@model, :documents, :terms, :document_count
10
-
11
- # @param [Array<TfIdfSimilarity::Document>] documents documents
12
- # @param [Hash] opts optional arguments
13
- # @option opts [Symbol] :library :gsl, :narray, :nmatrix or :matrix (default)
14
- def initialize(documents, opts = {})
15
- @model = TfIdfSimilarity::TermCountModel.new(documents, opts)
16
- @library = (opts[:library] || :matrix).to_sym
17
-
18
- array = Array.new(terms.size) do |i|
19
- idf = inverse_document_frequency(terms[i])
20
- Array.new(documents.size) do |j|
21
- term_frequency(documents[j], terms[i]) * idf
22
- end
5
+ module TfIdfSimilarity
6
+ class BM25Model < Model
7
+ # Return the term's inverse document frequency.
8
+ #
9
+ # @param [String] term a term
10
+ # @return [Float] the term's inverse document frequency
11
+ def inverse_document_frequency(term)
12
+ df = @model.document_count(term)
13
+ log((documents.size - df + 0.5) / (df + 0.5))
23
14
  end
24
-
25
- @matrix = initialize_matrix(array)
26
- end
27
-
28
- # Return the term's inverse document frequency.
29
- #
30
- # @param [String] term a term
31
- # @return [Float] the term's inverse document frequency
32
- def inverse_document_frequency(term)
33
- df = @model.document_count(term)
34
- log((documents.size - df + 0.5) / (df + 0.5))
35
- end
36
- alias_method :idf, :inverse_document_frequency
37
-
38
- # Returns the term's frequency in the document.
39
- #
40
- # @param [Document] document a document
41
- # @param [String] term a term
42
- # @return [Float] the term's frequency in the document
43
- #
44
- # @note Like Lucene, we use a b value of 0.75 and a k1 value of 1.2.
45
- def term_frequency(document, term)
46
- tf = document.term_count(term)
47
- (tf * 2.2) / (tf + 0.3 + 0.9 * documents.size / @model.average_document_size)
48
- end
49
- alias_method :tf, :term_frequency
50
-
51
- # Return the term frequency–inverse document frequency.
52
- #
53
- # @param [Document] document a document
54
- # @param [String] term a term
55
- # @return [Float] the term frequency–inverse document frequency
56
- def term_frequency_inverse_document_frequency(document, term)
57
- inverse_document_frequency(term) * term_frequency(document, term)
58
- end
59
- alias_method :tfidf, :term_frequency_inverse_document_frequency
60
-
61
- # Returns a similarity matrix for the documents in the corpus.
62
- #
63
- # @return [GSL::Matrix,NMatrix,Matrix] a similarity matrix
64
- # @note Columns are normalized to unit vectors, so we can calculate the cosine
65
- # similarity of all document vectors.
66
- def similarity_matrix
67
- multiply_self(normalize)
15
+ alias_method :idf, :inverse_document_frequency
16
+
17
+ # Returns the term's frequency in the document.
18
+ #
19
+ # @param [Document] document a document
20
+ # @param [String] term a term
21
+ # @return [Float] the term's frequency in the document
22
+ #
23
+ # @note Like Lucene, we use a b value of 0.75 and a k1 value of 1.2.
24
+ def term_frequency(document, term)
25
+ tf = document.term_count(term)
26
+ (tf * 2.2) / (tf + 0.3 + 0.9 * documents.size / @model.average_document_size)
27
+ end
28
+ alias_method :tf, :term_frequency
68
29
  end
69
30
  end
@@ -1,80 +1,82 @@
1
1
  # A document.
2
- class TfIdfSimilarity::Document
3
- # The document's identifier.
4
- attr_reader :id
5
- # The document's text.
6
- attr_reader :text
7
- # The number of times each term appears in the document.
8
- attr_reader :term_counts
9
- # The number of tokens in the document.
10
- attr_reader :size
2
+ module TfIdfSimilarity
3
+ class Document
4
+ # The document's identifier.
5
+ attr_reader :id
6
+ # The document's text.
7
+ attr_reader :text
8
+ # The number of times each term appears in the document.
9
+ attr_reader :term_counts
10
+ # The number of tokens in the document.
11
+ attr_reader :size
11
12
 
12
- # @param [String] text the document's text
13
- # @param [Hash] opts optional arguments
14
- # @option opts [String] :id the document's identifier
15
- # @option opts [Array] :tokens the document's tokenized text
16
- # @option opts [Hash] :term_counts the number of times each term appears
17
- # @option opts [Integer] :size the number of tokens in the document
18
- def initialize(text, opts = {})
19
- @text = text
20
- @id = opts[:id] || object_id
21
- @tokens = opts[:tokens]
13
+ # @param [String] text the document's text
14
+ # @param [Hash] opts optional arguments
15
+ # @option opts [String] :id the document's identifier
16
+ # @option opts [Array] :tokens the document's tokenized text
17
+ # @option opts [Hash] :term_counts the number of times each term appears
18
+ # @option opts [Integer] :size the number of tokens in the document
19
+ def initialize(text, opts = {})
20
+ @text = text
21
+ @id = opts[:id] || object_id
22
+ @tokens = opts[:tokens]
22
23
 
23
- if opts[:term_counts]
24
- @term_counts = opts[:term_counts]
25
- @size = opts[:size] || term_counts.values.reduce(0, :+)
26
- # Nothing to do.
27
- else
28
- @term_counts = Hash.new(0)
29
- @size = 0
30
- set_term_counts_and_size
24
+ if opts[:term_counts]
25
+ @term_counts = opts[:term_counts]
26
+ @size = opts[:size] || term_counts.values.reduce(0, :+)
27
+ # Nothing to do.
28
+ else
29
+ @term_counts = Hash.new(0)
30
+ @size = 0
31
+ set_term_counts_and_size
32
+ end
31
33
  end
32
- end
33
34
 
34
- # Returns the set of terms in the document.
35
- #
36
- # @return [Array<String>] the unique terms in the document
37
- def terms
38
- term_counts.keys
39
- end
35
+ # Returns the set of terms in the document.
36
+ #
37
+ # @return [Array<String>] the unique terms in the document
38
+ def terms
39
+ term_counts.keys
40
+ end
40
41
 
41
- # Returns the number of occurrences of the term in the document.
42
- #
43
- # @param [String] term a term
44
- # @return [Integer] the number of times the term appears in the document
45
- def term_count(term)
46
- term_counts[term].to_i # need #to_i if unmarshalled
47
- end
42
+ # Returns the number of occurrences of the term in the document.
43
+ #
44
+ # @param [String] term a term
45
+ # @return [Integer] the number of times the term appears in the document
46
+ def term_count(term)
47
+ term_counts[term].to_i # need #to_i if unmarshalled
48
+ end
48
49
 
49
- private
50
+ private
50
51
 
51
- # Tokenizes the text and counts terms and total tokens.
52
- def set_term_counts_and_size
53
- tokenize(text).each do |word|
54
- token = TfIdfSimilarity::Token.new(word)
55
- if token.valid?
56
- term = token.lowercase_filter.classic_filter.to_s
57
- @term_counts[term] += 1
58
- @size += 1
52
+ # Tokenizes the text and counts terms and total tokens.
53
+ def set_term_counts_and_size
54
+ tokenize(text).each do |word|
55
+ token = Token.new(word)
56
+ if token.valid?
57
+ term = token.lowercase_filter.classic_filter.to_s
58
+ @term_counts[term] += 1
59
+ @size += 1
60
+ end
59
61
  end
60
62
  end
61
- end
62
63
 
63
- # Tokenizes a text, respecting the word boundary rules from Unicode’s Default
64
- # Word Boundary Specification.
65
- #
66
- # If a tokenized text was provided at the document's initialization, those
67
- # tokens will be returned without additional processing.
68
- #
69
- # @param [String] text a text
70
- # @return [Enumerator] a token enumerator
71
- #
72
- # @note We should evaluate the tokenizers by {http://www.sciencemag.org/content/suppl/2010/12/16/science.1199644.DC1/Michel.SOM.revision.2.pdf Google}
73
- # or {http://wiki.apache.org/solr/AnalyzersTokenizersTokenFilters#solr.UAX29URLEmailTokenizerFactory Solr}.
74
- #
75
- # @see http://unicode.org/reports/tr29/#Default_Word_Boundaries
76
- # @see http://wiki.apache.org/solr/AnalyzersTokenizersTokenFilters#solr.StandardTokenizerFactory
77
- def tokenize(text)
78
- @tokens || defined?(UnicodeUtils) && UnicodeUtils.each_word(text) || text.split(/\b/) # @todo Ruby 1.8 has no good word boundary code
64
+ # Tokenizes a text, respecting the word boundary rules from Unicode’s Default
65
+ # Word Boundary Specification.
66
+ #
67
+ # If a tokenized text was provided at the document's initialization, those
68
+ # tokens will be returned without additional processing.
69
+ #
70
+ # @param [String] text a text
71
+ # @return [Enumerator] a token enumerator
72
+ #
73
+ # @note We should evaluate the tokenizers by {http://www.sciencemag.org/content/suppl/2010/12/16/science.1199644.DC1/Michel.SOM.revision.2.pdf Google}
74
+ # or {http://wiki.apache.org/solr/AnalyzersTokenizersTokenFilters#solr.UAX29URLEmailTokenizerFactory Solr}.
75
+ #
76
+ # @see http://unicode.org/reports/tr29/#Default_Word_Boundaries
77
+ # @see http://wiki.apache.org/solr/AnalyzersTokenizersTokenFilters#solr.StandardTokenizerFactory
78
+ def tokenize(text)
79
+ @tokens || defined?(UnicodeUtils) && UnicodeUtils.each_word(text) || text.split(/\b/) # @todo Ruby 1.8 has no good word boundary code
80
+ end
79
81
  end
80
82
  end
@@ -1,11 +1,13 @@
1
- class TfIdfSimilarity::Document
2
- # @return [Float] the maximum term count of any term in the document
3
- def maximum_term_count
4
- @maximum_term_count ||= term_counts.values.max.to_f
5
- end
1
+ module TfIdfSimilarity
2
+ class Document
3
+ # @return [Float] the maximum term count of any term in the document
4
+ def maximum_term_count
5
+ @maximum_term_count ||= term_counts.values.max.to_f
6
+ end
6
7
 
7
- # @return [Float] the average term count of all terms in the document
8
- def average_term_count
9
- @average_term_count ||= term_counts.values.reduce(0, :+) / term_counts.size.to_f
8
+ # @return [Float] the average term count of all terms in the document
9
+ def average_term_count
10
+ @average_term_count ||= term_counts.values.reduce(0, :+) / term_counts.size.to_f
11
+ end
10
12
  end
11
13
  end
@@ -10,183 +10,185 @@
10
10
  # @see http://nlp.stanford.edu/IR-book/html/htmledition/document-and-query-weighting-schemes-1.html
11
11
  # @see http://www.cs.odu.edu/~jbollen/IR04/readings/article1-29-03.pdf
12
12
  # @see http://www.sandia.gov/~tgkolda/pubs/bibtgkfiles/ornl-tm-13756.pdf
13
- class TfIdfSimilarity::TfIdfModel
14
- # @see https://github.com/louismullie/treat/blob/master/lib/treat/workers/extractors/tf_idf/native.rb#L17
15
- #
16
- # SMART n, Salton x, Chisholm NONE
17
- def no_collection_frequency(term)
18
- 1.0
19
- end
20
-
21
- # @see https://github.com/reddavis/TF-IDF/blob/master/lib/tf_idf.rb#L50
22
- #
23
- # SMART t, Salton f, Chisholm IDFB
24
- def plain_inverse_document_frequency(term, numerator = 0, denominator = 0)
25
- log((documents.size + numerator) / (@model.document_count(term).to_f + denominator))
26
- end
27
- alias_method :plain_idf, :plain_inverse_document_frequency
13
+ module TfIdfSimilarity
14
+ class TfIdfModel
15
+ # @see https://github.com/louismullie/treat/blob/master/lib/treat/workers/extractors/tf_idf/native.rb#L17
16
+ #
17
+ # SMART n, Salton x, Chisholm NONE
18
+ def no_collection_frequency(term)
19
+ 1.0
20
+ end
28
21
 
29
- # SMART p, Salton p, Chisholm IDFP
30
- def probabilistic_inverse_document_frequency(term)
31
- count = @model.document_count(term).to_f
32
- log((documents.size - count) / count)
33
- end
34
- alias_method :probabilistic_idf, :probabilistic_inverse_document_frequency
22
+ # @see https://github.com/reddavis/TF-IDF/blob/master/lib/tf_idf.rb#L50
23
+ #
24
+ # SMART t, Salton f, Chisholm IDFB
25
+ def plain_inverse_document_frequency(term, numerator = 0, denominator = 0)
26
+ log((documents.size + numerator) / (@model.document_count(term).to_f + denominator))
27
+ end
28
+ alias_method :plain_idf, :plain_inverse_document_frequency
35
29
 
36
- # Chisholm IGFF
37
- def global_frequency_inverse_document_frequency(term)
38
- @model.term_count(term) / @model.document_count(term).to_f
39
- end
40
- alias_method :gfidf, :global_frequency_inverse_document_frequency
30
+ # SMART p, Salton p, Chisholm IDFP
31
+ def probabilistic_inverse_document_frequency(term)
32
+ count = @model.document_count(term).to_f
33
+ log((documents.size - count) / count)
34
+ end
35
+ alias_method :probabilistic_idf, :probabilistic_inverse_document_frequency
41
36
 
42
- # Chisholm IGFL
43
- def log_global_frequency_inverse_document_frequency(term)
44
- log(global_frequency_inverse_document_frequency(term) + 1)
45
- end
46
- alias_method :log_gfidf, :log_global_frequency_inverse_document_frequency
37
+ # Chisholm IGFF
38
+ def global_frequency_inverse_document_frequency(term)
39
+ @model.term_count(term) / @model.document_count(term).to_f
40
+ end
41
+ alias_method :gfidf, :global_frequency_inverse_document_frequency
47
42
 
48
- # Chisholm IGFI
49
- def incremented_global_frequency_inverse_document_frequency(term)
50
- global_frequency_inverse_document_frequency(term) + 1
51
- end
52
- alias_method :incremented_gfidf, :incremented_global_frequency_inverse_document_frequency
43
+ # Chisholm IGFL
44
+ def log_global_frequency_inverse_document_frequency(term)
45
+ log(global_frequency_inverse_document_frequency(term) + 1)
46
+ end
47
+ alias_method :log_gfidf, :log_global_frequency_inverse_document_frequency
53
48
 
54
- # Chisholm IGFS
55
- def square_root_global_frequency_inverse_document_frequency(term)
56
- sqrt(global_frequency_inverse_document_frequency(term) - 0.9)
57
- end
58
- alias_method :square_root_gfidf, :square_root_global_frequency_inverse_document_frequency
49
+ # Chisholm IGFI
50
+ def incremented_global_frequency_inverse_document_frequency(term)
51
+ global_frequency_inverse_document_frequency(term) + 1
52
+ end
53
+ alias_method :incremented_gfidf, :incremented_global_frequency_inverse_document_frequency
59
54
 
60
- # Chisholm ENPY
61
- def entropy(term)
62
- denominator = @model.term_count(term).to_f
63
- logN = log(documents.size)
64
- 1 + documents.reduce(0) do |sum,document|
65
- quotient = document.term_count(term) / denominator
66
- sum += quotient * log(quotient) / logN
55
+ # Chisholm IGFS
56
+ def square_root_global_frequency_inverse_document_frequency(term)
57
+ sqrt(global_frequency_inverse_document_frequency(term) - 0.9)
58
+ end
59
+ alias_method :square_root_gfidf, :square_root_global_frequency_inverse_document_frequency
60
+
61
+ # Chisholm ENPY
62
+ def entropy(term)
63
+ denominator = @model.term_count(term).to_f
64
+ logN = log(documents.size)
65
+ 1 + documents.reduce(0) do |sum,document|
66
+ quotient = document.term_count(term) / denominator
67
+ sum += quotient * log(quotient) / logN
68
+ end
67
69
  end
68
- end
69
70
 
70
- # @see https://github.com/mkdynamic/vss/blob/master/lib/vss/engine.rb
71
- # @see https://github.com/louismullie/treat/blob/master/lib/treat/workers/extractors/tf_idf/native.rb
72
- # @see https://github.com/reddavis/TF-IDF/blob/master/lib/tf_idf.rb
73
- # @see https://github.com/mchung/tf-idf/blob/master/lib/tf-idf.rb
74
- # @see https://github.com/josephwilk/rsemantic/blob/master/lib/semantic/transform/tf_idf_transform.rb
75
- #
76
- # SMART n, Salton x, Chisholm NONE
77
- def no_normalization(matrix)
78
- matrix
79
- end
71
+ # @see https://github.com/mkdynamic/vss/blob/master/lib/vss/engine.rb
72
+ # @see https://github.com/louismullie/treat/blob/master/lib/treat/workers/extractors/tf_idf/native.rb
73
+ # @see https://github.com/reddavis/TF-IDF/blob/master/lib/tf_idf.rb
74
+ # @see https://github.com/mchung/tf-idf/blob/master/lib/tf-idf.rb
75
+ # @see https://github.com/josephwilk/rsemantic/blob/master/lib/semantic/transform/tf_idf_transform.rb
76
+ #
77
+ # SMART n, Salton x, Chisholm NONE
78
+ def no_normalization(matrix)
79
+ matrix
80
+ end
80
81
 
81
- # @see http://nlp.stanford.edu/IR-book/html/htmledition/pivoted-normalized-document-length-1.html
82
- #
83
- # SMART u, Chisholm PUQN
84
- def pivoted_unique_normalization(matrix)
85
- raise NotImplementedError
86
- end
82
+ # @see http://nlp.stanford.edu/IR-book/html/htmledition/pivoted-normalized-document-length-1.html
83
+ #
84
+ # SMART u, Chisholm PUQN
85
+ def pivoted_unique_normalization(matrix)
86
+ raise NotImplementedError
87
+ end
87
88
 
88
- # Cosine normalization is implemented as TfIdfSimilarity::MatrixMethods#normalize.
89
- #
90
- # SMART c, Salton c, Chisholm COSN
89
+ # Cosine normalization is implemented as MatrixMethods#normalize.
90
+ #
91
+ # SMART c, Salton c, Chisholm COSN
91
92
 
92
93
 
93
94
 
94
- # The plain term frequency is implemented as TfIdfSimilarity::Document#term_count.
95
- #
96
- # @see https://github.com/mkdynamic/vss/blob/master/lib/vss/engine.rb#L75
97
- # @see https://github.com/louismullie/treat/blob/master/lib/treat/workers/extractors/tf_idf/native.rb#L11
98
- #
99
- # SMART n, Salton t, Chisholm FREQ
95
+ # The plain term frequency is implemented as Document#term_count.
96
+ #
97
+ # @see https://github.com/mkdynamic/vss/blob/master/lib/vss/engine.rb#L75
98
+ # @see https://github.com/louismullie/treat/blob/master/lib/treat/workers/extractors/tf_idf/native.rb#L11
99
+ #
100
+ # SMART n, Salton t, Chisholm FREQ
100
101
 
101
- # SMART b, Salton b, Chisholm BNRY
102
- def binary_term_frequency(document, term)
103
- count = document.term_count(term)
104
- if count > 0
105
- 1
106
- else
107
- 0
102
+ # SMART b, Salton b, Chisholm BNRY
103
+ def binary_term_frequency(document, term)
104
+ count = document.term_count(term)
105
+ if count > 0
106
+ 1
107
+ else
108
+ 0
109
+ end
108
110
  end
109
- end
110
- alias_method :binary_tf, :binary_term_frequency
111
-
112
- # @see http://en.wikipedia.org/wiki/Tf*idf
113
- # @see http://nlp.stanford.edu/IR-book/html/htmledition/maximum-tf-normalization-1.html
114
- def normalized_term_frequency(document, term, a = 0)
115
- a + (1 - a) * document.term_count(term) / document.maximum_term_count
116
- end
117
- alias_method :normalized_tf, :normalized_term_frequency
111
+ alias_method :binary_tf, :binary_term_frequency
118
112
 
119
- # SMART a, Salton n, Chisholm ATF1
120
- def augmented_normalized_term_frequency(document, term)
121
- 0.5 + 0.5 * normalized_term_frequency(document, term)
122
- end
123
- alias_method :augmented_normalized_tf, :augmented_normalized_term_frequency
124
-
125
- # Chisholm ATFA
126
- def augmented_average_term_frequency(document, term)
127
- count = document.term_count(term)
128
- if count > 0
129
- 0.9 + 0.1 * count / document.average_term_count
130
- else
131
- 0
113
+ # @see http://en.wikipedia.org/wiki/Tf*idf
114
+ # @see http://nlp.stanford.edu/IR-book/html/htmledition/maximum-tf-normalization-1.html
115
+ def normalized_term_frequency(document, term, a = 0)
116
+ a + (1 - a) * document.term_count(term) / document.maximum_term_count
132
117
  end
133
- end
134
- alias_method :augmented_average_tf, :augmented_average_term_frequency
118
+ alias_method :normalized_tf, :normalized_term_frequency
135
119
 
136
- # Chisholm ATFC
137
- def changed_coefficient_augmented_normalized_term_frequency(document, term)
138
- count = document.term_count(term)
139
- if count > 0
140
- 0.2 + 0.8 * count / document.maximum_term_count
141
- else
142
- 0
120
+ # SMART a, Salton n, Chisholm ATF1
121
+ def augmented_normalized_term_frequency(document, term)
122
+ 0.5 + 0.5 * normalized_term_frequency(document, term)
143
123
  end
144
- end
145
- alias_method :changed_coefficient_augmented_normalized_tf, :changed_coefficient_augmented_normalized_term_frequency
146
-
147
- # @see https://github.com/louismullie/treat/blob/master/lib/treat/workers/extractors/tf_idf/native.rb#L12
148
- #
149
- # SMART l, Chisholm LOGA
150
- def log_term_frequency(document, term)
151
- count = document.term_count(term)
152
- if count > 0
153
- 1 + log(count)
154
- else
155
- 0
124
+ alias_method :augmented_normalized_tf, :augmented_normalized_term_frequency
125
+
126
+ # Chisholm ATFA
127
+ def augmented_average_term_frequency(document, term)
128
+ count = document.term_count(term)
129
+ if count > 0
130
+ 0.9 + 0.1 * count / document.average_term_count
131
+ else
132
+ 0
133
+ end
156
134
  end
157
- end
158
- alias_method :log_tf, :log_term_frequency
159
-
160
- # SMART L, Chisholm LOGN
161
- def normalized_log_term_frequency(document, term)
162
- count = document.term_count(term)
163
- if count > 0
164
- (1 + log(count)) / (1 + log(document.average_term_count))
165
- else
166
- 0
135
+ alias_method :augmented_average_tf, :augmented_average_term_frequency
136
+
137
+ # Chisholm ATFC
138
+ def changed_coefficient_augmented_normalized_term_frequency(document, term)
139
+ count = document.term_count(term)
140
+ if count > 0
141
+ 0.2 + 0.8 * count / document.maximum_term_count
142
+ else
143
+ 0
144
+ end
167
145
  end
168
- end
169
- alias_method :normalized_log_tf, :normalized_log_term_frequency
170
-
171
- # Chisholm LOGG
172
- def augmented_log_term_frequency(document, term)
173
- count = document.term_count(term)
174
- if count > 0
175
- 0.2 + 0.8 * log(count + 1)
176
- else
177
- 0
146
+ alias_method :changed_coefficient_augmented_normalized_tf, :changed_coefficient_augmented_normalized_term_frequency
147
+
148
+ # @see https://github.com/louismullie/treat/blob/master/lib/treat/workers/extractors/tf_idf/native.rb#L12
149
+ #
150
+ # SMART l, Chisholm LOGA
151
+ def log_term_frequency(document, term)
152
+ count = document.term_count(term)
153
+ if count > 0
154
+ 1 + log(count)
155
+ else
156
+ 0
157
+ end
178
158
  end
179
- end
180
- alias_method :augmented_log_tf, :augmented_log_term_frequency
181
-
182
- # Chisholm SQRT
183
- def square_root_term_frequency(document, term)
184
- count = document.term_count(term)
185
- if count > 0
186
- sqrt(count - 0.5) + 1
187
- else
188
- 0
159
+ alias_method :log_tf, :log_term_frequency
160
+
161
+ # SMART L, Chisholm LOGN
162
+ def normalized_log_term_frequency(document, term)
163
+ count = document.term_count(term)
164
+ if count > 0
165
+ (1 + log(count)) / (1 + log(document.average_term_count))
166
+ else
167
+ 0
168
+ end
169
+ end
170
+ alias_method :normalized_log_tf, :normalized_log_term_frequency
171
+
172
+ # Chisholm LOGG
173
+ def augmented_log_term_frequency(document, term)
174
+ count = document.term_count(term)
175
+ if count > 0
176
+ 0.2 + 0.8 * log(count + 1)
177
+ else
178
+ 0
179
+ end
180
+ end
181
+ alias_method :augmented_log_tf, :augmented_log_term_frequency
182
+
183
+ # Chisholm SQRT
184
+ def square_root_term_frequency(document, term)
185
+ count = document.term_count(term)
186
+ if count > 0
187
+ sqrt(count - 0.5) + 1
188
+ else
189
+ 0
190
+ end
189
191
  end
192
+ alias_method :square_root_tf, :square_root_term_frequency
190
193
  end
191
- alias_method :square_root_tf, :square_root_term_frequency
192
194
  end