tf-idf-similarity 0.1.3 → 0.1.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: cdc8ab3c1938db619adf75d74fd77a5647b4e29c
4
- data.tar.gz: dfbc6eaaa16328c30bdb2dc56481ec93265ec4aa
3
+ metadata.gz: c0ba1f941db96541f035a283df336907bf941439
4
+ data.tar.gz: 22bbec24681023e880e1e4e3fa14d26356630021
5
5
  SHA512:
6
- metadata.gz: b96cd6bdf856430fa2caad8c3f1284dd4a7842395058778aa282f8d1352c7f092ff299e32440102a871519e329e692761b4bff9f7043b82bead490802c77dc75
7
- data.tar.gz: c72cb027c925ca35e3d85eb6604f034f827146aca28b2e6e853736cd8f18b9c4229a92475989983ea7b43deb741f4c66ba6cb08aadf65d8bf38bf52399c2ff1b
6
+ metadata.gz: 9e7cca8d705d8080dff857d2d953a6f0091e361bb0693f0ce650e64a2f4633ad5db386fa41ea8b73ae1cfe839db8e4e9f56592c98b36cdc6ab756699ecfaa5f7
7
+ data.tar.gz: 3bcb9dcb07c9eb00c234920ff8d6340aac815c8181510d7ae65183e9b1d528001247439a86c6b603d973362bcee020eb0340558a9318693713cdaaa4b62a2ffd
@@ -1,21 +1,14 @@
1
1
  language: ruby
2
2
  rvm:
3
- - 1.8.7
4
3
  - 1.9.2
5
4
  - 1.9.3
6
5
  - 2.0.0
7
- - ree
6
+ - 2.1.0
8
7
  env:
9
8
  - MATRIX_LIBRARY=gsl
10
9
  - MATRIX_LIBRARY=narray
11
10
  - MATRIX_LIBRARY=nmatrix
12
11
  - MATRIX_LIBRARY=matrix
13
- matrix:
14
- exclude:
15
- - rvm: 1.8.7
16
- env: MATRIX_LIBRARY=nmatrix
17
- - rvm: ree
18
- env: MATRIX_LIBRARY=nmatrix
19
12
  before_install:
20
13
  - bundle config build.nmatrix --with-lapacklib
21
14
  - if [ $MATRIX_LIBRARY = 'nmatrix' -o $MATRIX_LIBRARY = 'gsl' ]; then sudo apt-get update -qq; fi
data/Gemfile CHANGED
@@ -1,8 +1,8 @@
1
1
  source "http://rubygems.org"
2
2
 
3
- gem 'gsl', '~> 1.15.3' if ENV['MATRIX_LIBRARY'] == 'gsl'
3
+ gem 'rb-gsl', '~> 1.16.0.2' if ENV['MATRIX_LIBRARY'] == 'gsl'
4
4
  gem 'narray', '~> 0.6.0.0' if ENV['MATRIX_LIBRARY'] == 'narray'
5
- gem 'nmatrix', '~> 0.0.9' if ENV['MATRIX_LIBRARY'] == 'nmatrix' && RUBY_VERSION >= '1.9'
5
+ gem 'nmatrix', '~> 0.1.0.rc5' if ENV['MATRIX_LIBRARY'] == 'nmatrix' && RUBY_VERSION >= '1.9'
6
6
 
7
7
  # Specify your gem's dependencies in the gemspec
8
8
  gemspec
data/README.md CHANGED
@@ -1,5 +1,6 @@
1
1
  # Ruby Vector Space Model (VSM) with tf*idf weights
2
2
 
3
+ [![Gem Version](https://badge.fury.io/rb/tf-idf-similarity.svg)](http://badge.fury.io/rb/tf-idf-similarity)
3
4
  [![Build Status](https://secure.travis-ci.org/opennorth/tf-idf-similarity.png)](http://travis-ci.org/opennorth/tf-idf-similarity)
4
5
  [![Dependency Status](https://gemnasium.com/opennorth/tf-idf-similarity.png)](https://gemnasium.com/opennorth/tf-idf-similarity)
5
6
  [![Coverage Status](https://coveralls.io/repos/opennorth/tf-idf-similarity/badge.png?branch=master)](https://coveralls.io/r/opennorth/tf-idf-similarity)
@@ -9,23 +10,53 @@ Calculates the similarity between texts using a [bag-of-words](http://en.wikiped
9
10
 
10
11
  ## Usage
11
12
 
12
- require 'matrix'
13
- require 'tf-idf-similarity'
13
+ ```ruby
14
+ require 'matrix'
15
+ require 'tf-idf-similarity'
16
+ ```
14
17
 
15
18
  Create a set of documents:
16
19
 
17
- corpus = []
18
- corpus << TfIdfSimilarity::Document.new("Lorem ipsum dolor sit amet...")
19
- corpus << TfIdfSimilarity::Document.new("Pellentesque sed ipsum dui...")
20
- corpus << TfIdfSimilarity::Document.new("Nam scelerisque dui sed leo...")
20
+ ```ruby
21
+ document1 = TfIdfSimilarity::Document.new("Lorem ipsum dolor sit amet...")
22
+ document2 = TfIdfSimilarity::Document.new("Pellentesque sed ipsum dui...")
23
+ document3 = TfIdfSimilarity::Document.new("Nam scelerisque dui sed leo...")
24
+ corpus = [document1, document2, document3]
25
+ ```
21
26
 
22
27
  Create a document-term matrix using [Term Frequency-Inverse Document Frequency function](http://en.wikipedia.org/wiki/Tf*idf):
23
28
 
24
- model = TfIdfSimilarity::TfIdfModel.new(corpus)
29
+ ```ruby
30
+ model = TfIdfSimilarity::TfIdfModel.new(corpus)
31
+ ```
25
32
 
26
- Create a document-term matrix using the [Okapi BM25 ranking function](http://en.wikipedia.org/wiki/Okapi_BM25):
33
+ Or, create a document-term matrix using the [Okapi BM25 ranking function](http://en.wikipedia.org/wiki/Okapi_BM25):
27
34
 
28
- model = TfIdfSimilarity::BM25Model.new(corpus)
35
+ ```ruby
36
+ model = TfIdfSimilarity::BM25Model.new(corpus)
37
+ ```
38
+
39
+ Create a similarity matrix:
40
+
41
+ ```ruby
42
+ matrix = model.similarity_matrix
43
+ ```
44
+
45
+ Find the similarity of two documents in the matrix:
46
+
47
+ ```ruby
48
+ matrix[model.document_index(document1), model.document_index(document2)]
49
+ ```
50
+
51
+ Print the tf*idf values for terms in a document:
52
+
53
+ ```ruby
54
+ tfidf_by_term = {}
55
+ document1.terms.each do |term|
56
+ tfidf_by_term[term] = model.tfidf(document1, term)
57
+ end
58
+ puts tfidf_by_term.sort_by{|_,tfidf| -tfidf}
59
+ ```
29
60
 
30
61
  [Read the documentation at RubyDoc.info.](http://rubydoc.info/gems/tf-idf-similarity)
31
62
 
@@ -13,6 +13,7 @@ end
13
13
 
14
14
  require 'tf-idf-similarity/matrix_methods'
15
15
  require 'tf-idf-similarity/term_count_model'
16
+ require 'tf-idf-similarity/model'
16
17
  require 'tf-idf-similarity/tf_idf_model'
17
18
  require 'tf-idf-similarity/bm25_model'
18
19
  require 'tf-idf-similarity/document'
@@ -2,68 +2,29 @@
2
2
  #
3
3
  # @see http://lucene.apache.org/core/4_0_0-BETA/core/org/apache/lucene/search/similarities/BM25Similarity.html
4
4
  # @see http://en.wikipedia.org/wiki/Okapi_BM25
5
- class TfIdfSimilarity::BM25Model
6
- include TfIdfSimilarity::MatrixMethods
7
-
8
- extend Forwardable
9
- def_delegators :@model, :documents, :terms, :document_count
10
-
11
- # @param [Array<TfIdfSimilarity::Document>] documents documents
12
- # @param [Hash] opts optional arguments
13
- # @option opts [Symbol] :library :gsl, :narray, :nmatrix or :matrix (default)
14
- def initialize(documents, opts = {})
15
- @model = TfIdfSimilarity::TermCountModel.new(documents, opts)
16
- @library = (opts[:library] || :matrix).to_sym
17
-
18
- array = Array.new(terms.size) do |i|
19
- idf = inverse_document_frequency(terms[i])
20
- Array.new(documents.size) do |j|
21
- term_frequency(documents[j], terms[i]) * idf
22
- end
5
+ module TfIdfSimilarity
6
+ class BM25Model < Model
7
+ # Return the term's inverse document frequency.
8
+ #
9
+ # @param [String] term a term
10
+ # @return [Float] the term's inverse document frequency
11
+ def inverse_document_frequency(term)
12
+ df = @model.document_count(term)
13
+ log((documents.size - df + 0.5) / (df + 0.5))
23
14
  end
24
-
25
- @matrix = initialize_matrix(array)
26
- end
27
-
28
- # Return the term's inverse document frequency.
29
- #
30
- # @param [String] term a term
31
- # @return [Float] the term's inverse document frequency
32
- def inverse_document_frequency(term)
33
- df = @model.document_count(term)
34
- log((documents.size - df + 0.5) / (df + 0.5))
35
- end
36
- alias_method :idf, :inverse_document_frequency
37
-
38
- # Returns the term's frequency in the document.
39
- #
40
- # @param [Document] document a document
41
- # @param [String] term a term
42
- # @return [Float] the term's frequency in the document
43
- #
44
- # @note Like Lucene, we use a b value of 0.75 and a k1 value of 1.2.
45
- def term_frequency(document, term)
46
- tf = document.term_count(term)
47
- (tf * 2.2) / (tf + 0.3 + 0.9 * documents.size / @model.average_document_size)
48
- end
49
- alias_method :tf, :term_frequency
50
-
51
- # Return the term frequency–inverse document frequency.
52
- #
53
- # @param [Document] document a document
54
- # @param [String] term a term
55
- # @return [Float] the term frequency–inverse document frequency
56
- def term_frequency_inverse_document_frequency(document, term)
57
- inverse_document_frequency(term) * term_frequency(document, term)
58
- end
59
- alias_method :tfidf, :term_frequency_inverse_document_frequency
60
-
61
- # Returns a similarity matrix for the documents in the corpus.
62
- #
63
- # @return [GSL::Matrix,NMatrix,Matrix] a similarity matrix
64
- # @note Columns are normalized to unit vectors, so we can calculate the cosine
65
- # similarity of all document vectors.
66
- def similarity_matrix
67
- multiply_self(normalize)
15
+ alias_method :idf, :inverse_document_frequency
16
+
17
+ # Returns the term's frequency in the document.
18
+ #
19
+ # @param [Document] document a document
20
+ # @param [String] term a term
21
+ # @return [Float] the term's frequency in the document
22
+ #
23
+ # @note Like Lucene, we use a b value of 0.75 and a k1 value of 1.2.
24
+ def term_frequency(document, term)
25
+ tf = document.term_count(term)
26
+ (tf * 2.2) / (tf + 0.3 + 0.9 * documents.size / @model.average_document_size)
27
+ end
28
+ alias_method :tf, :term_frequency
68
29
  end
69
30
  end
@@ -1,80 +1,82 @@
1
1
  # A document.
2
- class TfIdfSimilarity::Document
3
- # The document's identifier.
4
- attr_reader :id
5
- # The document's text.
6
- attr_reader :text
7
- # The number of times each term appears in the document.
8
- attr_reader :term_counts
9
- # The number of tokens in the document.
10
- attr_reader :size
2
+ module TfIdfSimilarity
3
+ class Document
4
+ # The document's identifier.
5
+ attr_reader :id
6
+ # The document's text.
7
+ attr_reader :text
8
+ # The number of times each term appears in the document.
9
+ attr_reader :term_counts
10
+ # The number of tokens in the document.
11
+ attr_reader :size
11
12
 
12
- # @param [String] text the document's text
13
- # @param [Hash] opts optional arguments
14
- # @option opts [String] :id the document's identifier
15
- # @option opts [Array] :tokens the document's tokenized text
16
- # @option opts [Hash] :term_counts the number of times each term appears
17
- # @option opts [Integer] :size the number of tokens in the document
18
- def initialize(text, opts = {})
19
- @text = text
20
- @id = opts[:id] || object_id
21
- @tokens = opts[:tokens]
13
+ # @param [String] text the document's text
14
+ # @param [Hash] opts optional arguments
15
+ # @option opts [String] :id the document's identifier
16
+ # @option opts [Array] :tokens the document's tokenized text
17
+ # @option opts [Hash] :term_counts the number of times each term appears
18
+ # @option opts [Integer] :size the number of tokens in the document
19
+ def initialize(text, opts = {})
20
+ @text = text
21
+ @id = opts[:id] || object_id
22
+ @tokens = opts[:tokens]
22
23
 
23
- if opts[:term_counts]
24
- @term_counts = opts[:term_counts]
25
- @size = opts[:size] || term_counts.values.reduce(0, :+)
26
- # Nothing to do.
27
- else
28
- @term_counts = Hash.new(0)
29
- @size = 0
30
- set_term_counts_and_size
24
+ if opts[:term_counts]
25
+ @term_counts = opts[:term_counts]
26
+ @size = opts[:size] || term_counts.values.reduce(0, :+)
27
+ # Nothing to do.
28
+ else
29
+ @term_counts = Hash.new(0)
30
+ @size = 0
31
+ set_term_counts_and_size
32
+ end
31
33
  end
32
- end
33
34
 
34
- # Returns the set of terms in the document.
35
- #
36
- # @return [Array<String>] the unique terms in the document
37
- def terms
38
- term_counts.keys
39
- end
35
+ # Returns the set of terms in the document.
36
+ #
37
+ # @return [Array<String>] the unique terms in the document
38
+ def terms
39
+ term_counts.keys
40
+ end
40
41
 
41
- # Returns the number of occurrences of the term in the document.
42
- #
43
- # @param [String] term a term
44
- # @return [Integer] the number of times the term appears in the document
45
- def term_count(term)
46
- term_counts[term].to_i # need #to_i if unmarshalled
47
- end
42
+ # Returns the number of occurrences of the term in the document.
43
+ #
44
+ # @param [String] term a term
45
+ # @return [Integer] the number of times the term appears in the document
46
+ def term_count(term)
47
+ term_counts[term].to_i # need #to_i if unmarshalled
48
+ end
48
49
 
49
- private
50
+ private
50
51
 
51
- # Tokenizes the text and counts terms and total tokens.
52
- def set_term_counts_and_size
53
- tokenize(text).each do |word|
54
- token = TfIdfSimilarity::Token.new(word)
55
- if token.valid?
56
- term = token.lowercase_filter.classic_filter.to_s
57
- @term_counts[term] += 1
58
- @size += 1
52
+ # Tokenizes the text and counts terms and total tokens.
53
+ def set_term_counts_and_size
54
+ tokenize(text).each do |word|
55
+ token = Token.new(word)
56
+ if token.valid?
57
+ term = token.lowercase_filter.classic_filter.to_s
58
+ @term_counts[term] += 1
59
+ @size += 1
60
+ end
59
61
  end
60
62
  end
61
- end
62
63
 
63
- # Tokenizes a text, respecting the word boundary rules from Unicode’s Default
64
- # Word Boundary Specification.
65
- #
66
- # If a tokenized text was provided at the document's initialization, those
67
- # tokens will be returned without additional processing.
68
- #
69
- # @param [String] text a text
70
- # @return [Enumerator] a token enumerator
71
- #
72
- # @note We should evaluate the tokenizers by {http://www.sciencemag.org/content/suppl/2010/12/16/science.1199644.DC1/Michel.SOM.revision.2.pdf Google}
73
- # or {http://wiki.apache.org/solr/AnalyzersTokenizersTokenFilters#solr.UAX29URLEmailTokenizerFactory Solr}.
74
- #
75
- # @see http://unicode.org/reports/tr29/#Default_Word_Boundaries
76
- # @see http://wiki.apache.org/solr/AnalyzersTokenizersTokenFilters#solr.StandardTokenizerFactory
77
- def tokenize(text)
78
- @tokens || defined?(UnicodeUtils) && UnicodeUtils.each_word(text) || text.split(/\b/) # @todo Ruby 1.8 has no good word boundary code
64
+ # Tokenizes a text, respecting the word boundary rules from Unicode’s Default
65
+ # Word Boundary Specification.
66
+ #
67
+ # If a tokenized text was provided at the document's initialization, those
68
+ # tokens will be returned without additional processing.
69
+ #
70
+ # @param [String] text a text
71
+ # @return [Enumerator] a token enumerator
72
+ #
73
+ # @note We should evaluate the tokenizers by {http://www.sciencemag.org/content/suppl/2010/12/16/science.1199644.DC1/Michel.SOM.revision.2.pdf Google}
74
+ # or {http://wiki.apache.org/solr/AnalyzersTokenizersTokenFilters#solr.UAX29URLEmailTokenizerFactory Solr}.
75
+ #
76
+ # @see http://unicode.org/reports/tr29/#Default_Word_Boundaries
77
+ # @see http://wiki.apache.org/solr/AnalyzersTokenizersTokenFilters#solr.StandardTokenizerFactory
78
+ def tokenize(text)
79
+ @tokens || defined?(UnicodeUtils) && UnicodeUtils.each_word(text) || text.split(/\b/) # @todo Ruby 1.8 has no good word boundary code
80
+ end
79
81
  end
80
82
  end
@@ -1,11 +1,13 @@
1
- class TfIdfSimilarity::Document
2
- # @return [Float] the maximum term count of any term in the document
3
- def maximum_term_count
4
- @maximum_term_count ||= term_counts.values.max.to_f
5
- end
1
+ module TfIdfSimilarity
2
+ class Document
3
+ # @return [Float] the maximum term count of any term in the document
4
+ def maximum_term_count
5
+ @maximum_term_count ||= term_counts.values.max.to_f
6
+ end
6
7
 
7
- # @return [Float] the average term count of all terms in the document
8
- def average_term_count
9
- @average_term_count ||= term_counts.values.reduce(0, :+) / term_counts.size.to_f
8
+ # @return [Float] the average term count of all terms in the document
9
+ def average_term_count
10
+ @average_term_count ||= term_counts.values.reduce(0, :+) / term_counts.size.to_f
11
+ end
10
12
  end
11
13
  end
@@ -10,183 +10,185 @@
10
10
  # @see http://nlp.stanford.edu/IR-book/html/htmledition/document-and-query-weighting-schemes-1.html
11
11
  # @see http://www.cs.odu.edu/~jbollen/IR04/readings/article1-29-03.pdf
12
12
  # @see http://www.sandia.gov/~tgkolda/pubs/bibtgkfiles/ornl-tm-13756.pdf
13
- class TfIdfSimilarity::TfIdfModel
14
- # @see https://github.com/louismullie/treat/blob/master/lib/treat/workers/extractors/tf_idf/native.rb#L17
15
- #
16
- # SMART n, Salton x, Chisholm NONE
17
- def no_collection_frequency(term)
18
- 1.0
19
- end
20
-
21
- # @see https://github.com/reddavis/TF-IDF/blob/master/lib/tf_idf.rb#L50
22
- #
23
- # SMART t, Salton f, Chisholm IDFB
24
- def plain_inverse_document_frequency(term, numerator = 0, denominator = 0)
25
- log((documents.size + numerator) / (@model.document_count(term).to_f + denominator))
26
- end
27
- alias_method :plain_idf, :plain_inverse_document_frequency
13
+ module TfIdfSimilarity
14
+ class TfIdfModel
15
+ # @see https://github.com/louismullie/treat/blob/master/lib/treat/workers/extractors/tf_idf/native.rb#L17
16
+ #
17
+ # SMART n, Salton x, Chisholm NONE
18
+ def no_collection_frequency(term)
19
+ 1.0
20
+ end
28
21
 
29
- # SMART p, Salton p, Chisholm IDFP
30
- def probabilistic_inverse_document_frequency(term)
31
- count = @model.document_count(term).to_f
32
- log((documents.size - count) / count)
33
- end
34
- alias_method :probabilistic_idf, :probabilistic_inverse_document_frequency
22
+ # @see https://github.com/reddavis/TF-IDF/blob/master/lib/tf_idf.rb#L50
23
+ #
24
+ # SMART t, Salton f, Chisholm IDFB
25
+ def plain_inverse_document_frequency(term, numerator = 0, denominator = 0)
26
+ log((documents.size + numerator) / (@model.document_count(term).to_f + denominator))
27
+ end
28
+ alias_method :plain_idf, :plain_inverse_document_frequency
35
29
 
36
- # Chisholm IGFF
37
- def global_frequency_inverse_document_frequency(term)
38
- @model.term_count(term) / @model.document_count(term).to_f
39
- end
40
- alias_method :gfidf, :global_frequency_inverse_document_frequency
30
+ # SMART p, Salton p, Chisholm IDFP
31
+ def probabilistic_inverse_document_frequency(term)
32
+ count = @model.document_count(term).to_f
33
+ log((documents.size - count) / count)
34
+ end
35
+ alias_method :probabilistic_idf, :probabilistic_inverse_document_frequency
41
36
 
42
- # Chisholm IGFL
43
- def log_global_frequency_inverse_document_frequency(term)
44
- log(global_frequency_inverse_document_frequency(term) + 1)
45
- end
46
- alias_method :log_gfidf, :log_global_frequency_inverse_document_frequency
37
+ # Chisholm IGFF
38
+ def global_frequency_inverse_document_frequency(term)
39
+ @model.term_count(term) / @model.document_count(term).to_f
40
+ end
41
+ alias_method :gfidf, :global_frequency_inverse_document_frequency
47
42
 
48
- # Chisholm IGFI
49
- def incremented_global_frequency_inverse_document_frequency(term)
50
- global_frequency_inverse_document_frequency(term) + 1
51
- end
52
- alias_method :incremented_gfidf, :incremented_global_frequency_inverse_document_frequency
43
+ # Chisholm IGFL
44
+ def log_global_frequency_inverse_document_frequency(term)
45
+ log(global_frequency_inverse_document_frequency(term) + 1)
46
+ end
47
+ alias_method :log_gfidf, :log_global_frequency_inverse_document_frequency
53
48
 
54
- # Chisholm IGFS
55
- def square_root_global_frequency_inverse_document_frequency(term)
56
- sqrt(global_frequency_inverse_document_frequency(term) - 0.9)
57
- end
58
- alias_method :square_root_gfidf, :square_root_global_frequency_inverse_document_frequency
49
+ # Chisholm IGFI
50
+ def incremented_global_frequency_inverse_document_frequency(term)
51
+ global_frequency_inverse_document_frequency(term) + 1
52
+ end
53
+ alias_method :incremented_gfidf, :incremented_global_frequency_inverse_document_frequency
59
54
 
60
- # Chisholm ENPY
61
- def entropy(term)
62
- denominator = @model.term_count(term).to_f
63
- logN = log(documents.size)
64
- 1 + documents.reduce(0) do |sum,document|
65
- quotient = document.term_count(term) / denominator
66
- sum += quotient * log(quotient) / logN
55
+ # Chisholm IGFS
56
+ def square_root_global_frequency_inverse_document_frequency(term)
57
+ sqrt(global_frequency_inverse_document_frequency(term) - 0.9)
58
+ end
59
+ alias_method :square_root_gfidf, :square_root_global_frequency_inverse_document_frequency
60
+
61
+ # Chisholm ENPY
62
+ def entropy(term)
63
+ denominator = @model.term_count(term).to_f
64
+ logN = log(documents.size)
65
+ 1 + documents.reduce(0) do |sum,document|
66
+ quotient = document.term_count(term) / denominator
67
+ sum += quotient * log(quotient) / logN
68
+ end
67
69
  end
68
- end
69
70
 
70
- # @see https://github.com/mkdynamic/vss/blob/master/lib/vss/engine.rb
71
- # @see https://github.com/louismullie/treat/blob/master/lib/treat/workers/extractors/tf_idf/native.rb
72
- # @see https://github.com/reddavis/TF-IDF/blob/master/lib/tf_idf.rb
73
- # @see https://github.com/mchung/tf-idf/blob/master/lib/tf-idf.rb
74
- # @see https://github.com/josephwilk/rsemantic/blob/master/lib/semantic/transform/tf_idf_transform.rb
75
- #
76
- # SMART n, Salton x, Chisholm NONE
77
- def no_normalization(matrix)
78
- matrix
79
- end
71
+ # @see https://github.com/mkdynamic/vss/blob/master/lib/vss/engine.rb
72
+ # @see https://github.com/louismullie/treat/blob/master/lib/treat/workers/extractors/tf_idf/native.rb
73
+ # @see https://github.com/reddavis/TF-IDF/blob/master/lib/tf_idf.rb
74
+ # @see https://github.com/mchung/tf-idf/blob/master/lib/tf-idf.rb
75
+ # @see https://github.com/josephwilk/rsemantic/blob/master/lib/semantic/transform/tf_idf_transform.rb
76
+ #
77
+ # SMART n, Salton x, Chisholm NONE
78
+ def no_normalization(matrix)
79
+ matrix
80
+ end
80
81
 
81
- # @see http://nlp.stanford.edu/IR-book/html/htmledition/pivoted-normalized-document-length-1.html
82
- #
83
- # SMART u, Chisholm PUQN
84
- def pivoted_unique_normalization(matrix)
85
- raise NotImplementedError
86
- end
82
+ # @see http://nlp.stanford.edu/IR-book/html/htmledition/pivoted-normalized-document-length-1.html
83
+ #
84
+ # SMART u, Chisholm PUQN
85
+ def pivoted_unique_normalization(matrix)
86
+ raise NotImplementedError
87
+ end
87
88
 
88
- # Cosine normalization is implemented as TfIdfSimilarity::MatrixMethods#normalize.
89
- #
90
- # SMART c, Salton c, Chisholm COSN
89
+ # Cosine normalization is implemented as MatrixMethods#normalize.
90
+ #
91
+ # SMART c, Salton c, Chisholm COSN
91
92
 
92
93
 
93
94
 
94
- # The plain term frequency is implemented as TfIdfSimilarity::Document#term_count.
95
- #
96
- # @see https://github.com/mkdynamic/vss/blob/master/lib/vss/engine.rb#L75
97
- # @see https://github.com/louismullie/treat/blob/master/lib/treat/workers/extractors/tf_idf/native.rb#L11
98
- #
99
- # SMART n, Salton t, Chisholm FREQ
95
+ # The plain term frequency is implemented as Document#term_count.
96
+ #
97
+ # @see https://github.com/mkdynamic/vss/blob/master/lib/vss/engine.rb#L75
98
+ # @see https://github.com/louismullie/treat/blob/master/lib/treat/workers/extractors/tf_idf/native.rb#L11
99
+ #
100
+ # SMART n, Salton t, Chisholm FREQ
100
101
 
101
- # SMART b, Salton b, Chisholm BNRY
102
- def binary_term_frequency(document, term)
103
- count = document.term_count(term)
104
- if count > 0
105
- 1
106
- else
107
- 0
102
+ # SMART b, Salton b, Chisholm BNRY
103
+ def binary_term_frequency(document, term)
104
+ count = document.term_count(term)
105
+ if count > 0
106
+ 1
107
+ else
108
+ 0
109
+ end
108
110
  end
109
- end
110
- alias_method :binary_tf, :binary_term_frequency
111
-
112
- # @see http://en.wikipedia.org/wiki/Tf*idf
113
- # @see http://nlp.stanford.edu/IR-book/html/htmledition/maximum-tf-normalization-1.html
114
- def normalized_term_frequency(document, term, a = 0)
115
- a + (1 - a) * document.term_count(term) / document.maximum_term_count
116
- end
117
- alias_method :normalized_tf, :normalized_term_frequency
111
+ alias_method :binary_tf, :binary_term_frequency
118
112
 
119
- # SMART a, Salton n, Chisholm ATF1
120
- def augmented_normalized_term_frequency(document, term)
121
- 0.5 + 0.5 * normalized_term_frequency(document, term)
122
- end
123
- alias_method :augmented_normalized_tf, :augmented_normalized_term_frequency
124
-
125
- # Chisholm ATFA
126
- def augmented_average_term_frequency(document, term)
127
- count = document.term_count(term)
128
- if count > 0
129
- 0.9 + 0.1 * count / document.average_term_count
130
- else
131
- 0
113
+ # @see http://en.wikipedia.org/wiki/Tf*idf
114
+ # @see http://nlp.stanford.edu/IR-book/html/htmledition/maximum-tf-normalization-1.html
115
+ def normalized_term_frequency(document, term, a = 0)
116
+ a + (1 - a) * document.term_count(term) / document.maximum_term_count
132
117
  end
133
- end
134
- alias_method :augmented_average_tf, :augmented_average_term_frequency
118
+ alias_method :normalized_tf, :normalized_term_frequency
135
119
 
136
- # Chisholm ATFC
137
- def changed_coefficient_augmented_normalized_term_frequency(document, term)
138
- count = document.term_count(term)
139
- if count > 0
140
- 0.2 + 0.8 * count / document.maximum_term_count
141
- else
142
- 0
120
+ # SMART a, Salton n, Chisholm ATF1
121
+ def augmented_normalized_term_frequency(document, term)
122
+ 0.5 + 0.5 * normalized_term_frequency(document, term)
143
123
  end
144
- end
145
- alias_method :changed_coefficient_augmented_normalized_tf, :changed_coefficient_augmented_normalized_term_frequency
146
-
147
- # @see https://github.com/louismullie/treat/blob/master/lib/treat/workers/extractors/tf_idf/native.rb#L12
148
- #
149
- # SMART l, Chisholm LOGA
150
- def log_term_frequency(document, term)
151
- count = document.term_count(term)
152
- if count > 0
153
- 1 + log(count)
154
- else
155
- 0
124
+ alias_method :augmented_normalized_tf, :augmented_normalized_term_frequency
125
+
126
+ # Chisholm ATFA
127
+ def augmented_average_term_frequency(document, term)
128
+ count = document.term_count(term)
129
+ if count > 0
130
+ 0.9 + 0.1 * count / document.average_term_count
131
+ else
132
+ 0
133
+ end
156
134
  end
157
- end
158
- alias_method :log_tf, :log_term_frequency
159
-
160
- # SMART L, Chisholm LOGN
161
- def normalized_log_term_frequency(document, term)
162
- count = document.term_count(term)
163
- if count > 0
164
- (1 + log(count)) / (1 + log(document.average_term_count))
165
- else
166
- 0
135
+ alias_method :augmented_average_tf, :augmented_average_term_frequency
136
+
137
+ # Chisholm ATFC
138
+ def changed_coefficient_augmented_normalized_term_frequency(document, term)
139
+ count = document.term_count(term)
140
+ if count > 0
141
+ 0.2 + 0.8 * count / document.maximum_term_count
142
+ else
143
+ 0
144
+ end
167
145
  end
168
- end
169
- alias_method :normalized_log_tf, :normalized_log_term_frequency
170
-
171
- # Chisholm LOGG
172
- def augmented_log_term_frequency(document, term)
173
- count = document.term_count(term)
174
- if count > 0
175
- 0.2 + 0.8 * log(count + 1)
176
- else
177
- 0
146
+ alias_method :changed_coefficient_augmented_normalized_tf, :changed_coefficient_augmented_normalized_term_frequency
147
+
148
+ # @see https://github.com/louismullie/treat/blob/master/lib/treat/workers/extractors/tf_idf/native.rb#L12
149
+ #
150
+ # SMART l, Chisholm LOGA
151
+ def log_term_frequency(document, term)
152
+ count = document.term_count(term)
153
+ if count > 0
154
+ 1 + log(count)
155
+ else
156
+ 0
157
+ end
178
158
  end
179
- end
180
- alias_method :augmented_log_tf, :augmented_log_term_frequency
181
-
182
- # Chisholm SQRT
183
- def square_root_term_frequency(document, term)
184
- count = document.term_count(term)
185
- if count > 0
186
- sqrt(count - 0.5) + 1
187
- else
188
- 0
159
+ alias_method :log_tf, :log_term_frequency
160
+
161
+ # SMART L, Chisholm LOGN
162
+ def normalized_log_term_frequency(document, term)
163
+ count = document.term_count(term)
164
+ if count > 0
165
+ (1 + log(count)) / (1 + log(document.average_term_count))
166
+ else
167
+ 0
168
+ end
169
+ end
170
+ alias_method :normalized_log_tf, :normalized_log_term_frequency
171
+
172
+ # Chisholm LOGG
173
+ def augmented_log_term_frequency(document, term)
174
+ count = document.term_count(term)
175
+ if count > 0
176
+ 0.2 + 0.8 * log(count + 1)
177
+ else
178
+ 0
179
+ end
180
+ end
181
+ alias_method :augmented_log_tf, :augmented_log_term_frequency
182
+
183
+ # Chisholm SQRT
184
+ def square_root_term_frequency(document, term)
185
+ count = document.term_count(term)
186
+ if count > 0
187
+ sqrt(count - 0.5) + 1
188
+ else
189
+ 0
190
+ end
189
191
  end
192
+ alias_method :square_root_tf, :square_root_term_frequency
190
193
  end
191
- alias_method :square_root_tf, :square_root_term_frequency
192
194
  end