tf-idf-similarity 0.1.3 → 0.1.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.travis.yml +1 -8
- data/Gemfile +2 -2
- data/README.md +40 -9
- data/lib/tf-idf-similarity.rb +1 -0
- data/lib/tf-idf-similarity/bm25_model.rb +23 -62
- data/lib/tf-idf-similarity/document.rb +69 -67
- data/lib/tf-idf-similarity/extras/document.rb +10 -8
- data/lib/tf-idf-similarity/extras/tf_idf_model.rb +157 -155
- data/lib/tf-idf-similarity/matrix_methods.rb +137 -135
- data/lib/tf-idf-similarity/model.rb +66 -0
- data/lib/tf-idf-similarity/term_count_model.rb +59 -57
- data/lib/tf-idf-similarity/tf_idf_model.rb +21 -60
- data/lib/tf-idf-similarity/token.rb +39 -37
- data/lib/tf-idf-similarity/version.rb +1 -1
- data/spec/bm25_model_spec.rb +200 -0
- data/spec/document_spec.rb +98 -96
- data/spec/extras/tf_idf_model_spec.rb +224 -222
- data/spec/spec_helper.rb +6 -0
- data/spec/term_count_model_spec.rb +76 -74
- data/spec/tf_idf_model_spec.rb +143 -117
- data/spec/token_spec.rb +23 -21
- metadata +6 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: c0ba1f941db96541f035a283df336907bf941439
+  data.tar.gz: 22bbec24681023e880e1e4e3fa14d26356630021
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 9e7cca8d705d8080dff857d2d953a6f0091e361bb0693f0ce650e64a2f4633ad5db386fa41ea8b73ae1cfe839db8e4e9f56592c98b36cdc6ab756699ecfaa5f7
+  data.tar.gz: 3bcb9dcb07c9eb00c234920ff8d6340aac815c8181510d7ae65183e9b1d528001247439a86c6b603d973362bcee020eb0340558a9318693713cdaaa4b62a2ffd
data/.travis.yml
CHANGED
@@ -1,21 +1,14 @@
 language: ruby
 rvm:
-  - 1.8.7
   - 1.9.2
   - 1.9.3
   - 2.0.0
-  -
+  - 2.1.0
 env:
   - MATRIX_LIBRARY=gsl
   - MATRIX_LIBRARY=narray
   - MATRIX_LIBRARY=nmatrix
   - MATRIX_LIBRARY=matrix
-matrix:
-  exclude:
-    - rvm: 1.8.7
-      env: MATRIX_LIBRARY=nmatrix
-    - rvm: ree
-      env: MATRIX_LIBRARY=nmatrix
 before_install:
   - bundle config build.nmatrix --with-lapacklib
   - if [ $MATRIX_LIBRARY = 'nmatrix' -o $MATRIX_LIBRARY = 'gsl' ]; then sudo apt-get update -qq; fi
data/Gemfile
CHANGED
@@ -1,8 +1,8 @@
 source "http://rubygems.org"

-gem 'gsl', '~> 1.
+gem 'rb-gsl', '~> 1.16.0.2' if ENV['MATRIX_LIBRARY'] == 'gsl'
 gem 'narray', '~> 0.6.0.0' if ENV['MATRIX_LIBRARY'] == 'narray'
-gem 'nmatrix', '~> 0.0.
+gem 'nmatrix', '~> 0.1.0.rc5' if ENV['MATRIX_LIBRARY'] == 'nmatrix' && RUBY_VERSION >= '1.9'

 # Specify your gem's dependencies in the gemspec
 gemspec
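The Gemfile selects the linear-algebra backend through the same MATRIX_LIBRARY environment variable that the .travis.yml build matrix sets; at runtime the backend is chosen with the model's :library option, which is visible in the BM25Model#initialize code removed further down. A minimal sketch, assuming that option is carried over unchanged into the new shared Model class introduced in this release:

```ruby
# Hypothetical usage sketch: :narray is one of the backends exercised by the
# build matrix (gsl, narray, nmatrix, matrix).
require 'narray'
require 'tf-idf-similarity'

corpus = [TfIdfSimilarity::Document.new("Lorem ipsum dolor sit amet...")]

# :library defaults to :matrix, the implementation from Ruby's standard library.
model = TfIdfSimilarity::TfIdfModel.new(corpus, :library => :narray)
```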
data/README.md
CHANGED
@@ -1,5 +1,6 @@
 # Ruby Vector Space Model (VSM) with tf*idf weights

+[](http://badge.fury.io/rb/tf-idf-similarity)
 [](http://travis-ci.org/opennorth/tf-idf-similarity)
 [](https://gemnasium.com/opennorth/tf-idf-similarity)
 [](https://coveralls.io/r/opennorth/tf-idf-similarity)
@@ -9,23 +10,53 @@ Calculates the similarity between texts using a [bag-of-words](http://en.wikiped

 ## Usage

+```ruby
+require 'matrix'
+require 'tf-idf-similarity'
+```

 Create a set of documents:

+```ruby
+document1 = TfIdfSimilarity::Document.new("Lorem ipsum dolor sit amet...")
+document2 = TfIdfSimilarity::Document.new("Pellentesque sed ipsum dui...")
+document3 = TfIdfSimilarity::Document.new("Nam scelerisque dui sed leo...")
+corpus = [document1, document2, document3]
+```

 Create a document-term matrix using [Term Frequency-Inverse Document Frequency function](http://en.wikipedia.org/wiki/):

+```ruby
+model = TfIdfSimilarity::TfIdfModel.new(corpus)
+```

+Or, create a document-term matrix using the [Okapi BM25 ranking function](http://en.wikipedia.org/wiki/Okapi_BM25):

+```ruby
+model = TfIdfSimilarity::BM25Model.new(corpus)
+```
+
+Create a similarity matrix:
+
+```ruby
+matrix = model.similarity_matrix
+```
+
+Find the similarity of two documents in the matrix:
+
+```ruby
+matrix[model.document_index(document1), model.document_index(document2)]
+```
+
+Print the tf*idf values for terms in a document:
+
+```ruby
+tfidf_by_term = {}
+document1.terms.each do |term|
+  tfidf_by_term[term] = model.tfidf(document1, term)
+end
+puts tfidf_by_term.sort_by{|_,tfidf| -tfidf}
+```

 [Read the documentation at RubyDoc.info.](http://rubydoc.info/gems/tf-idf-similarity)

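Taken together, the README snippets added above amount to a short end-to-end script. A minimal sketch assembled from those steps; the corpus strings are the README's own placeholders, and Array#combination is used here only to iterate document pairs (it is not part of the gem's API):

```ruby
require 'matrix'
require 'tf-idf-similarity'

corpus = [
  TfIdfSimilarity::Document.new("Lorem ipsum dolor sit amet..."),
  TfIdfSimilarity::Document.new("Pellentesque sed ipsum dui..."),
  TfIdfSimilarity::Document.new("Nam scelerisque dui sed leo...")
]

model  = TfIdfSimilarity::TfIdfModel.new(corpus)  # or TfIdfSimilarity::BM25Model.new(corpus)
matrix = model.similarity_matrix

# Each entry is the cosine similarity of two document vectors, so the matrix is
# symmetric and its diagonal is (approximately) 1.0.
corpus.combination(2).each do |a, b|
  puts matrix[model.document_index(a), model.document_index(b)]
end
```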
data/lib/tf-idf-similarity.rb
CHANGED
@@ -13,6 +13,7 @@ end

 require 'tf-idf-similarity/matrix_methods'
 require 'tf-idf-similarity/term_count_model'
+require 'tf-idf-similarity/model'
 require 'tf-idf-similarity/tf_idf_model'
 require 'tf-idf-similarity/bm25_model'
 require 'tf-idf-similarity/document'
data/lib/tf-idf-similarity/bm25_model.rb
CHANGED
@@ -2,68 +2,29 @@
 #
 # @see http://lucene.apache.org/core/4_0_0-BETA/core/org/apache/lucene/search/similarities/BM25Similarity.html
 # @see http://en.wikipedia.org/wiki/Okapi_BM25
-  def initialize(documents, opts = {})
-    @model = TfIdfSimilarity::TermCountModel.new(documents, opts)
-    @library = (opts[:library] || :matrix).to_sym
-    array = Array.new(terms.size) do |i|
-      idf = inverse_document_frequency(terms[i])
-      Array.new(documents.size) do |j|
-        term_frequency(documents[j], terms[i]) * idf
-      end
+module TfIdfSimilarity
+  class BM25Model < Model
+    # Return the term's inverse document frequency.
+    #
+    # @param [String] term a term
+    # @return [Float] the term's inverse document frequency
+    def inverse_document_frequency(term)
+      df = @model.document_count(term)
+      log((documents.size - df + 0.5) / (df + 0.5))
     end
-  # Returns the term's frequency in the document.
-  #
-  # @param [Document] document a document
-  # @param [String] term a term
-  # @return [Float] the term's frequency in the document
-  #
-  # @note Like Lucene, we use a b value of 0.75 and a k1 value of 1.2.
-  def term_frequency(document, term)
-    tf = document.term_count(term)
-    (tf * 2.2) / (tf + 0.3 + 0.9 * documents.size / @model.average_document_size)
-  end
-  alias_method :tf, :term_frequency
-  # Return the term frequency–inverse document frequency.
-  #
-  # @param [Document] document a document
-  # @param [String] term a term
-  # @return [Float] the term frequency–inverse document frequency
-  def term_frequency_inverse_document_frequency(document, term)
-    inverse_document_frequency(term) * term_frequency(document, term)
-  end
-  alias_method :tfidf, :term_frequency_inverse_document_frequency
-  # Returns a similarity matrix for the documents in the corpus.
-  #
-  # @return [GSL::Matrix,NMatrix,Matrix] a similarity matrix
-  # @note Columns are normalized to unit vectors, so we can calculate the cosine
-  #   similarity of all document vectors.
-  def similarity_matrix
-    multiply_self(normalize)
+    alias_method :idf, :inverse_document_frequency
+
+    # Returns the term's frequency in the document.
+    #
+    # @param [Document] document a document
+    # @param [String] term a term
+    # @return [Float] the term's frequency in the document
+    #
+    # @note Like Lucene, we use a b value of 0.75 and a k1 value of 1.2.
+    def term_frequency(document, term)
+      tf = document.term_count(term)
+      (tf * 2.2) / (tf + 0.3 + 0.9 * documents.size / @model.average_document_size)
+    end
+    alias_method :tf, :term_frequency
   end
 end
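For reference, the constants 2.2, 0.3 and 0.9 in term_frequency above are the Lucene-style BM25 parameters folded together. With the k1 = 1.2 and b = 0.75 cited in the @note, the textbook BM25 weights are:

```latex
\mathrm{idf}(t) = \log\frac{N - \mathrm{df}(t) + 0.5}{\mathrm{df}(t) + 0.5}
\qquad
\mathrm{tf}(t, d) = \frac{(k_1 + 1)\,f_{t,d}}{f_{t,d} + k_1\left(1 - b + b\,\frac{\mathrm{dl}}{\mathrm{avgdl}}\right)}
                  = \frac{2.2\,f_{t,d}}{f_{t,d} + 0.3 + 0.9\,\frac{\mathrm{dl}}{\mathrm{avgdl}}}
```

Here N is the corpus size, df(t) the number of documents containing the term, f_{t,d} the raw term count, dl the document length and avgdl the average document length. Note that in the code the length ratio is `documents.size / @model.average_document_size` (corpus size over average document size) rather than the per-document length dl.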
data/lib/tf-idf-similarity/document.rb
CHANGED
@@ -1,80 +1,82 @@
 # A document.
+module TfIdfSimilarity
+  class Document
+    # The document's identifier.
+    attr_reader :id
+    # The document's text.
+    attr_reader :text
+    # The number of times each term appears in the document.
+    attr_reader :term_counts
+    # The number of tokens in the document.
+    attr_reader :size

+    # @param [String] text the document's text
+    # @param [Hash] opts optional arguments
+    # @option opts [String] :id the document's identifier
+    # @option opts [Array] :tokens the document's tokenized text
+    # @option opts [Hash] :term_counts the number of times each term appears
+    # @option opts [Integer] :size the number of tokens in the document
+    def initialize(text, opts = {})
+      @text = text
+      @id = opts[:id] || object_id
+      @tokens = opts[:tokens]

+      if opts[:term_counts]
+        @term_counts = opts[:term_counts]
+        @size = opts[:size] || term_counts.values.reduce(0, :+)
+        # Nothing to do.
+      else
+        @term_counts = Hash.new(0)
+        @size = 0
+        set_term_counts_and_size
+      end
     end
-end

+    # Returns the set of terms in the document.
+    #
+    # @return [Array<String>] the unique terms in the document
+    def terms
+      term_counts.keys
+    end

+    # Returns the number of occurrences of the term in the document.
+    #
+    # @param [String] term a term
+    # @return [Integer] the number of times the term appears in the document
+    def term_count(term)
+      term_counts[term].to_i # need #to_i if unmarshalled
+    end

-private
+    private

+    # Tokenizes the text and counts terms and total tokens.
+    def set_term_counts_and_size
+      tokenize(text).each do |word|
+        token = Token.new(word)
+        if token.valid?
+          term = token.lowercase_filter.classic_filter.to_s
+          @term_counts[term] += 1
+          @size += 1
+        end
       end
     end
-end

+    # Tokenizes a text, respecting the word boundary rules from Unicode’s Default
+    # Word Boundary Specification.
+    #
+    # If a tokenized text was provided at the document's initialization, those
+    # tokens will be returned without additional processing.
+    #
+    # @param [String] text a text
+    # @return [Enumerator] a token enumerator
+    #
+    # @note We should evaluate the tokenizers by {http://www.sciencemag.org/content/suppl/2010/12/16/science.1199644.DC1/Michel.SOM.revision.2.pdf Google}
+    #   or {http://wiki.apache.org/solr/AnalyzersTokenizersTokenFilters#solr.UAX29URLEmailTokenizerFactory Solr}.
+    #
+    # @see http://unicode.org/reports/tr29/#Default_Word_Boundaries
+    # @see http://wiki.apache.org/solr/AnalyzersTokenizersTokenFilters#solr.StandardTokenizerFactory
+    def tokenize(text)
+      @tokens || defined?(UnicodeUtils) && UnicodeUtils.each_word(text) || text.split(/\b/) # @todo Ruby 1.8 has no good word boundary code
+    end
   end
 end
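The rewritten constructor above lets callers hand the Document pre-tokenized or pre-counted input instead of having it tokenize the text itself. A minimal sketch based only on the options and readers shown in the diff; the strings and counts are illustrative:

```ruby
require 'matrix'
require 'tf-idf-similarity'

# Let the document tokenize and count for itself...
plain = TfIdfSimilarity::Document.new("cats and dogs and cats")

# ...or supply the tokens...
tokenized = TfIdfSimilarity::Document.new("cats and dogs", :tokens => %w(cats and dogs))

# ...or supply the term counts directly; :size defaults to the sum of the counts.
counted = TfIdfSimilarity::Document.new("", :id => "doc-1",
  :term_counts => { "cats" => 2, "dogs" => 1 })

counted.terms              #=> ["cats", "dogs"]
counted.term_count("cats") #=> 2
counted.size               #=> 3
```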
data/lib/tf-idf-similarity/extras/document.rb
CHANGED
@@ -1,11 +1,13 @@
+module TfIdfSimilarity
+  class Document
+    # @return [Float] the maximum term count of any term in the document
+    def maximum_term_count
+      @maximum_term_count ||= term_counts.values.max.to_f
+    end

+    # @return [Float] the average term count of all terms in the document
+    def average_term_count
+      @average_term_count ||= term_counts.values.reduce(0, :+) / term_counts.size.to_f
+    end
   end
 end
data/lib/tf-idf-similarity/extras/tf_idf_model.rb
CHANGED
@@ -10,183 +10,185 @@
 # @see http://nlp.stanford.edu/IR-book/html/htmledition/document-and-query-weighting-schemes-1.html
 # @see http://www.cs.odu.edu/~jbollen/IR04/readings/article1-29-03.pdf
 # @see http://www.sandia.gov/~tgkolda/pubs/bibtgkfiles/ornl-tm-13756.pdf
-  # @see https://github.com/reddavis/TF-IDF/blob/master/lib/tf_idf.rb#L50
-  #
-  # SMART t, Salton f, Chisholm IDFB
-  def plain_inverse_document_frequency(term, numerator = 0, denominator = 0)
-    log((documents.size + numerator) / (@model.document_count(term).to_f + denominator))
-  end
-  alias_method :plain_idf, :plain_inverse_document_frequency
+module TfIdfSimilarity
+  class TfIdfModel
+    # @see https://github.com/louismullie/treat/blob/master/lib/treat/workers/extractors/tf_idf/native.rb#L17
+    #
+    # SMART n, Salton x, Chisholm NONE
+    def no_collection_frequency(term)
+      1.0
+    end

+    # @see https://github.com/reddavis/TF-IDF/blob/master/lib/tf_idf.rb#L50
+    #
+    # SMART t, Salton f, Chisholm IDFB
+    def plain_inverse_document_frequency(term, numerator = 0, denominator = 0)
+      log((documents.size + numerator) / (@model.document_count(term).to_f + denominator))
+    end
+    alias_method :plain_idf, :plain_inverse_document_frequency

+    # SMART p, Salton p, Chisholm IDFP
+    def probabilistic_inverse_document_frequency(term)
+      count = @model.document_count(term).to_f
+      log((documents.size - count) / count)
+    end
+    alias_method :probabilistic_idf, :probabilistic_inverse_document_frequency

+    # Chisholm IGFF
+    def global_frequency_inverse_document_frequency(term)
+      @model.term_count(term) / @model.document_count(term).to_f
+    end
+    alias_method :gfidf, :global_frequency_inverse_document_frequency

+    # Chisholm IGFL
+    def log_global_frequency_inverse_document_frequency(term)
+      log(global_frequency_inverse_document_frequency(term) + 1)
+    end
+    alias_method :log_gfidf, :log_global_frequency_inverse_document_frequency

+    # Chisholm IGFI
+    def incremented_global_frequency_inverse_document_frequency(term)
+      global_frequency_inverse_document_frequency(term) + 1
+    end
+    alias_method :incremented_gfidf, :incremented_global_frequency_inverse_document_frequency

+    # Chisholm IGFS
+    def square_root_global_frequency_inverse_document_frequency(term)
+      sqrt(global_frequency_inverse_document_frequency(term) - 0.9)
+    end
+    alias_method :square_root_gfidf, :square_root_global_frequency_inverse_document_frequency
+
+    # Chisholm ENPY
+    def entropy(term)
+      denominator = @model.term_count(term).to_f
+      logN = log(documents.size)
+      1 + documents.reduce(0) do |sum,document|
+        quotient = document.term_count(term) / denominator
+        sum += quotient * log(quotient) / logN
+      end
     end
-  end

+    # @see https://github.com/mkdynamic/vss/blob/master/lib/vss/engine.rb
+    # @see https://github.com/louismullie/treat/blob/master/lib/treat/workers/extractors/tf_idf/native.rb
+    # @see https://github.com/reddavis/TF-IDF/blob/master/lib/tf_idf.rb
+    # @see https://github.com/mchung/tf-idf/blob/master/lib/tf-idf.rb
+    # @see https://github.com/josephwilk/rsemantic/blob/master/lib/semantic/transform/tf_idf_transform.rb
+    #
+    # SMART n, Salton x, Chisholm NONE
+    def no_normalization(matrix)
+      matrix
+    end

+    # @see http://nlp.stanford.edu/IR-book/html/htmledition/pivoted-normalized-document-length-1.html
+    #
+    # SMART u, Chisholm PUQN
+    def pivoted_unique_normalization(matrix)
+      raise NotImplementedError
+    end

+    # Cosine normalization is implemented as MatrixMethods#normalize.
+    #
+    # SMART c, Salton c, Chisholm COSN



+    # The plain term frequency is implemented as Document#term_count.
+    #
+    # @see https://github.com/mkdynamic/vss/blob/master/lib/vss/engine.rb#L75
+    # @see https://github.com/louismullie/treat/blob/master/lib/treat/workers/extractors/tf_idf/native.rb#L11
+    #
+    # SMART n, Salton t, Chisholm FREQ

+    # SMART b, Salton b, Chisholm BNRY
+    def binary_term_frequency(document, term)
+      count = document.term_count(term)
+      if count > 0
+        1
+      else
+        0
+      end
     end
-  alias_method :binary_tf, :binary_term_frequency
-  # @see http://en.wikipedia.org/wiki/Tf*idf
-  # @see http://nlp.stanford.edu/IR-book/html/htmledition/maximum-tf-normalization-1.html
-  def normalized_term_frequency(document, term, a = 0)
-    a + (1 - a) * document.term_count(term) / document.maximum_term_count
-  end
-  alias_method :normalized_tf, :normalized_term_frequency
+    alias_method :binary_tf, :binary_term_frequency

-  alias_method :augmented_normalized_tf, :augmented_normalized_term_frequency
-  # Chisholm ATFA
-  def augmented_average_term_frequency(document, term)
-    count = document.term_count(term)
-    if count > 0
-      0.9 + 0.1 * count / document.average_term_count
-    else
-      0
+    # @see http://en.wikipedia.org/wiki/Tf*idf
+    # @see http://nlp.stanford.edu/IR-book/html/htmledition/maximum-tf-normalization-1.html
+    def normalized_term_frequency(document, term, a = 0)
+      a + (1 - a) * document.term_count(term) / document.maximum_term_count
     end
-  alias_method :augmented_average_tf, :augmented_average_term_frequency
+    alias_method :normalized_tf, :normalized_term_frequency

-    if count > 0
-      0.2 + 0.8 * count / document.maximum_term_count
-    else
-      0
+    # SMART a, Salton n, Chisholm ATF1
+    def augmented_normalized_term_frequency(document, term)
+      0.5 + 0.5 * normalized_term_frequency(document, term)
     end
-    else
-      0
+    alias_method :augmented_normalized_tf, :augmented_normalized_term_frequency
+
+    # Chisholm ATFA
+    def augmented_average_term_frequency(document, term)
+      count = document.term_count(term)
+      if count > 0
+        0.9 + 0.1 * count / document.average_term_count
+      else
+        0
+      end
     end
+    alias_method :augmented_average_tf, :augmented_average_term_frequency
+
+    # Chisholm ATFC
+    def changed_coefficient_augmented_normalized_term_frequency(document, term)
+      count = document.term_count(term)
+      if count > 0
+        0.2 + 0.8 * count / document.maximum_term_count
+      else
+        0
+      end
     end
+    alias_method :changed_coefficient_augmented_normalized_tf, :changed_coefficient_augmented_normalized_term_frequency
+
+    # @see https://github.com/louismullie/treat/blob/master/lib/treat/workers/extractors/tf_idf/native.rb#L12
+    #
+    # SMART l, Chisholm LOGA
+    def log_term_frequency(document, term)
+      count = document.term_count(term)
+      if count > 0
+        1 + log(count)
+      else
+        0
+      end
     end
+    alias_method :log_tf, :log_term_frequency
+
+    # SMART L, Chisholm LOGN
+    def normalized_log_term_frequency(document, term)
+      count = document.term_count(term)
+      if count > 0
+        (1 + log(count)) / (1 + log(document.average_term_count))
+      else
+        0
+      end
+    end
+    alias_method :normalized_log_tf, :normalized_log_term_frequency
+
+    # Chisholm LOGG
+    def augmented_log_term_frequency(document, term)
+      count = document.term_count(term)
+      if count > 0
+        0.2 + 0.8 * log(count + 1)
+      else
+        0
+      end
+    end
+    alias_method :augmented_log_tf, :augmented_log_term_frequency
+
+    # Chisholm SQRT
+    def square_root_term_frequency(document, term)
+      count = document.term_count(term)
+      if count > 0
+        sqrt(count - 0.5) + 1
+      else
+        0
+      end
     end
+    alias_method :square_root_tf, :square_root_term_frequency
   end
-  alias_method :square_root_tf, :square_root_term_frequency
 end
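A few of the extra weightings above, written out as formulas (tf is the raw term count in a document, max tf the document's maximum term count, N the corpus size, df the term's document frequency and gf its global frequency):

```latex
\text{IDFB: } \log\tfrac{N}{\mathrm{df}} \qquad
\text{IDFP: } \log\tfrac{N - \mathrm{df}}{\mathrm{df}} \qquad
\text{IGFF: } \tfrac{\mathrm{gf}}{\mathrm{df}} \qquad
\text{BNRY: } [\,\mathrm{tf} > 0\,] \\[4pt]
\text{ATF1: } 0.5 + 0.5\,\tfrac{\mathrm{tf}}{\max \mathrm{tf}} \qquad
\text{LOGA: } 1 + \log \mathrm{tf} \qquad
\text{SQRT: } \sqrt{\mathrm{tf} - 0.5} + 1
```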