tf-idf-similarity 0.1.3 → 0.1.4
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.travis.yml +1 -8
- data/Gemfile +2 -2
- data/README.md +40 -9
- data/lib/tf-idf-similarity.rb +1 -0
- data/lib/tf-idf-similarity/bm25_model.rb +23 -62
- data/lib/tf-idf-similarity/document.rb +69 -67
- data/lib/tf-idf-similarity/extras/document.rb +10 -8
- data/lib/tf-idf-similarity/extras/tf_idf_model.rb +157 -155
- data/lib/tf-idf-similarity/matrix_methods.rb +137 -135
- data/lib/tf-idf-similarity/model.rb +66 -0
- data/lib/tf-idf-similarity/term_count_model.rb +59 -57
- data/lib/tf-idf-similarity/tf_idf_model.rb +21 -60
- data/lib/tf-idf-similarity/token.rb +39 -37
- data/lib/tf-idf-similarity/version.rb +1 -1
- data/spec/bm25_model_spec.rb +200 -0
- data/spec/document_spec.rb +98 -96
- data/spec/extras/tf_idf_model_spec.rb +224 -222
- data/spec/spec_helper.rb +6 -0
- data/spec/term_count_model_spec.rb +76 -74
- data/spec/tf_idf_model_spec.rb +143 -117
- data/spec/token_spec.rb +23 -21
- metadata +6 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: c0ba1f941db96541f035a283df336907bf941439
|
4
|
+
data.tar.gz: 22bbec24681023e880e1e4e3fa14d26356630021
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 9e7cca8d705d8080dff857d2d953a6f0091e361bb0693f0ce650e64a2f4633ad5db386fa41ea8b73ae1cfe839db8e4e9f56592c98b36cdc6ab756699ecfaa5f7
|
7
|
+
data.tar.gz: 3bcb9dcb07c9eb00c234920ff8d6340aac815c8181510d7ae65183e9b1d528001247439a86c6b603d973362bcee020eb0340558a9318693713cdaaa4b62a2ffd
|
data/.travis.yml
CHANGED
@@ -1,21 +1,14 @@
|
|
1
1
|
language: ruby
|
2
2
|
rvm:
|
3
|
-
- 1.8.7
|
4
3
|
- 1.9.2
|
5
4
|
- 1.9.3
|
6
5
|
- 2.0.0
|
7
|
-
-
|
6
|
+
- 2.1.0
|
8
7
|
env:
|
9
8
|
- MATRIX_LIBRARY=gsl
|
10
9
|
- MATRIX_LIBRARY=narray
|
11
10
|
- MATRIX_LIBRARY=nmatrix
|
12
11
|
- MATRIX_LIBRARY=matrix
|
13
|
-
matrix:
|
14
|
-
exclude:
|
15
|
-
- rvm: 1.8.7
|
16
|
-
env: MATRIX_LIBRARY=nmatrix
|
17
|
-
- rvm: ree
|
18
|
-
env: MATRIX_LIBRARY=nmatrix
|
19
12
|
before_install:
|
20
13
|
- bundle config build.nmatrix --with-lapacklib
|
21
14
|
- if [ $MATRIX_LIBRARY = 'nmatrix' -o $MATRIX_LIBRARY = 'gsl' ]; then sudo apt-get update -qq; fi
|
data/Gemfile
CHANGED
@@ -1,8 +1,8 @@
|
|
1
1
|
source "http://rubygems.org"
|
2
2
|
|
3
|
-
gem 'gsl', '~> 1.
|
3
|
+
gem 'rb-gsl', '~> 1.16.0.2' if ENV['MATRIX_LIBRARY'] == 'gsl'
|
4
4
|
gem 'narray', '~> 0.6.0.0' if ENV['MATRIX_LIBRARY'] == 'narray'
|
5
|
-
gem 'nmatrix', '~> 0.0.
|
5
|
+
gem 'nmatrix', '~> 0.1.0.rc5' if ENV['MATRIX_LIBRARY'] == 'nmatrix' && RUBY_VERSION >= '1.9'
|
6
6
|
|
7
7
|
# Specify your gem's dependencies in the gemspec
|
8
8
|
gemspec
|
data/README.md
CHANGED
@@ -1,5 +1,6 @@
|
|
1
1
|
# Ruby Vector Space Model (VSM) with tf*idf weights
|
2
2
|
|
3
|
+
[![Gem Version](https://badge.fury.io/rb/tf-idf-similarity.svg)](http://badge.fury.io/rb/tf-idf-similarity)
|
3
4
|
[![Build Status](https://secure.travis-ci.org/opennorth/tf-idf-similarity.png)](http://travis-ci.org/opennorth/tf-idf-similarity)
|
4
5
|
[![Dependency Status](https://gemnasium.com/opennorth/tf-idf-similarity.png)](https://gemnasium.com/opennorth/tf-idf-similarity)
|
5
6
|
[![Coverage Status](https://coveralls.io/repos/opennorth/tf-idf-similarity/badge.png?branch=master)](https://coveralls.io/r/opennorth/tf-idf-similarity)
|
@@ -9,23 +10,53 @@ Calculates the similarity between texts using a [bag-of-words](http://en.wikiped
|
|
9
10
|
|
10
11
|
## Usage
|
11
12
|
|
12
|
-
|
13
|
-
|
13
|
+
```ruby
|
14
|
+
require 'matrix'
|
15
|
+
require 'tf-idf-similarity'
|
16
|
+
```
|
14
17
|
|
15
18
|
Create a set of documents:
|
16
19
|
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
20
|
+
```ruby
|
21
|
+
document1 = TfIdfSimilarity::Document.new("Lorem ipsum dolor sit amet...")
|
22
|
+
document2 = TfIdfSimilarity::Document.new("Pellentesque sed ipsum dui...")
|
23
|
+
document3 = TfIdfSimilarity::Document.new("Nam scelerisque dui sed leo...")
|
24
|
+
corpus = [document1, document2, document3]
|
25
|
+
```
|
21
26
|
|
22
27
|
Create a document-term matrix using [Term Frequency-Inverse Document Frequency function](http://en.wikipedia.org/wiki/Tf%E2%80%93idf):
|
23
28
|
|
24
|
-
|
29
|
+
```ruby
|
30
|
+
model = TfIdfSimilarity::TfIdfModel.new(corpus)
|
31
|
+
```
|
25
32
|
|
26
|
-
|
33
|
+
Or, create a document-term matrix using the [Okapi BM25 ranking function](http://en.wikipedia.org/wiki/Okapi_BM25):
|
27
34
|
|
28
|
-
|
35
|
+
```ruby
|
36
|
+
model = TfIdfSimilarity::BM25Model.new(corpus)
|
37
|
+
```
|
38
|
+
|
39
|
+
Create a similarity matrix:
|
40
|
+
|
41
|
+
```ruby
|
42
|
+
matrix = model.similarity_matrix
|
43
|
+
```
|
44
|
+
|
45
|
+
Find the similarity of two documents in the matrix:
|
46
|
+
|
47
|
+
```ruby
|
48
|
+
matrix[model.document_index(document1), model.document_index(document2)]
|
49
|
+
```
|
50
|
+
|
51
|
+
Print the tf*idf values for terms in a document:
|
52
|
+
|
53
|
+
```ruby
|
54
|
+
tfidf_by_term = {}
|
55
|
+
document1.terms.each do |term|
|
56
|
+
tfidf_by_term[term] = model.tfidf(document1, term)
|
57
|
+
end
|
58
|
+
puts tfidf_by_term.sort_by{|_,tfidf| -tfidf}
|
59
|
+
```
|
29
60
|
|
30
61
|
[Read the documentation at RubyDoc.info.](http://rubydoc.info/gems/tf-idf-similarity)
|
31
62
|
|
data/lib/tf-idf-similarity.rb
CHANGED
@@ -13,6 +13,7 @@ end
|
|
13
13
|
|
14
14
|
require 'tf-idf-similarity/matrix_methods'
|
15
15
|
require 'tf-idf-similarity/term_count_model'
|
16
|
+
require 'tf-idf-similarity/model'
|
16
17
|
require 'tf-idf-similarity/tf_idf_model'
|
17
18
|
require 'tf-idf-similarity/bm25_model'
|
18
19
|
require 'tf-idf-similarity/document'
|
@@ -2,68 +2,29 @@
|
|
2
2
|
#
|
3
3
|
# @see http://lucene.apache.org/core/4_0_0-BETA/core/org/apache/lucene/search/similarities/BM25Similarity.html
|
4
4
|
# @see http://en.wikipedia.org/wiki/Okapi_BM25
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
def initialize(documents, opts = {})
|
15
|
-
@model = TfIdfSimilarity::TermCountModel.new(documents, opts)
|
16
|
-
@library = (opts[:library] || :matrix).to_sym
|
17
|
-
|
18
|
-
array = Array.new(terms.size) do |i|
|
19
|
-
idf = inverse_document_frequency(terms[i])
|
20
|
-
Array.new(documents.size) do |j|
|
21
|
-
term_frequency(documents[j], terms[i]) * idf
|
22
|
-
end
|
5
|
+
module TfIdfSimilarity
|
6
|
+
class BM25Model < Model
|
7
|
+
# Return the term's inverse document frequency.
|
8
|
+
#
|
9
|
+
# @param [String] term a term
|
10
|
+
# @return [Float] the term's inverse document frequency
|
11
|
+
def inverse_document_frequency(term)
|
12
|
+
df = @model.document_count(term)
|
13
|
+
log((documents.size - df + 0.5) / (df + 0.5))
|
23
14
|
end
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
# Returns the term's frequency in the document.
|
39
|
-
#
|
40
|
-
# @param [Document] document a document
|
41
|
-
# @param [String] term a term
|
42
|
-
# @return [Float] the term's frequency in the document
|
43
|
-
#
|
44
|
-
# @note Like Lucene, we use a b value of 0.75 and a k1 value of 1.2.
|
45
|
-
def term_frequency(document, term)
|
46
|
-
tf = document.term_count(term)
|
47
|
-
(tf * 2.2) / (tf + 0.3 + 0.9 * documents.size / @model.average_document_size)
|
48
|
-
end
|
49
|
-
alias_method :tf, :term_frequency
|
50
|
-
|
51
|
-
# Return the term frequency–inverse document frequency.
|
52
|
-
#
|
53
|
-
# @param [Document] document a document
|
54
|
-
# @param [String] term a term
|
55
|
-
# @return [Float] the term frequency–inverse document frequency
|
56
|
-
def term_frequency_inverse_document_frequency(document, term)
|
57
|
-
inverse_document_frequency(term) * term_frequency(document, term)
|
58
|
-
end
|
59
|
-
alias_method :tfidf, :term_frequency_inverse_document_frequency
|
60
|
-
|
61
|
-
# Returns a similarity matrix for the documents in the corpus.
|
62
|
-
#
|
63
|
-
# @return [GSL::Matrix,NMatrix,Matrix] a similarity matrix
|
64
|
-
# @note Columns are normalized to unit vectors, so we can calculate the cosine
|
65
|
-
# similarity of all document vectors.
|
66
|
-
def similarity_matrix
|
67
|
-
multiply_self(normalize)
|
15
|
+
alias_method :idf, :inverse_document_frequency
|
16
|
+
|
17
|
+
# Returns the term's frequency in the document.
|
18
|
+
#
|
19
|
+
# @param [Document] document a document
|
20
|
+
# @param [String] term a term
|
21
|
+
# @return [Float] the term's frequency in the document
|
22
|
+
#
|
23
|
+
# @note Like Lucene, we use a b value of 0.75 and a k1 value of 1.2.
|
24
|
+
def term_frequency(document, term)
|
25
|
+
tf = document.term_count(term)
|
26
|
+
(tf * 2.2) / (tf + 0.3 + 0.9 * documents.size / @model.average_document_size)
|
27
|
+
end
|
28
|
+
alias_method :tf, :term_frequency
|
68
29
|
end
|
69
30
|
end
|
@@ -1,80 +1,82 @@
|
|
1
1
|
# A document.
|
2
|
-
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
2
|
+
module TfIdfSimilarity
|
3
|
+
class Document
|
4
|
+
# The document's identifier.
|
5
|
+
attr_reader :id
|
6
|
+
# The document's text.
|
7
|
+
attr_reader :text
|
8
|
+
# The number of times each term appears in the document.
|
9
|
+
attr_reader :term_counts
|
10
|
+
# The number of tokens in the document.
|
11
|
+
attr_reader :size
|
11
12
|
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
13
|
+
# @param [String] text the document's text
|
14
|
+
# @param [Hash] opts optional arguments
|
15
|
+
# @option opts [String] :id the document's identifier
|
16
|
+
# @option opts [Array] :tokens the document's tokenized text
|
17
|
+
# @option opts [Hash] :term_counts the number of times each term appears
|
18
|
+
# @option opts [Integer] :size the number of tokens in the document
|
19
|
+
def initialize(text, opts = {})
|
20
|
+
@text = text
|
21
|
+
@id = opts[:id] || object_id
|
22
|
+
@tokens = opts[:tokens]
|
22
23
|
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
24
|
+
if opts[:term_counts]
|
25
|
+
@term_counts = opts[:term_counts]
|
26
|
+
@size = opts[:size] || term_counts.values.reduce(0, :+)
|
27
|
+
# Nothing to do.
|
28
|
+
else
|
29
|
+
@term_counts = Hash.new(0)
|
30
|
+
@size = 0
|
31
|
+
set_term_counts_and_size
|
32
|
+
end
|
31
33
|
end
|
32
|
-
end
|
33
34
|
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
35
|
+
# Returns the set of terms in the document.
|
36
|
+
#
|
37
|
+
# @return [Array<String>] the unique terms in the document
|
38
|
+
def terms
|
39
|
+
term_counts.keys
|
40
|
+
end
|
40
41
|
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
42
|
+
# Returns the number of occurrences of the term in the document.
|
43
|
+
#
|
44
|
+
# @param [String] term a term
|
45
|
+
# @return [Integer] the number of times the term appears in the document
|
46
|
+
def term_count(term)
|
47
|
+
term_counts[term].to_i # need #to_i if unmarshalled
|
48
|
+
end
|
48
49
|
|
49
|
-
private
|
50
|
+
private
|
50
51
|
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
52
|
+
# Tokenizes the text and counts terms and total tokens.
|
53
|
+
def set_term_counts_and_size
|
54
|
+
tokenize(text).each do |word|
|
55
|
+
token = Token.new(word)
|
56
|
+
if token.valid?
|
57
|
+
term = token.lowercase_filter.classic_filter.to_s
|
58
|
+
@term_counts[term] += 1
|
59
|
+
@size += 1
|
60
|
+
end
|
59
61
|
end
|
60
62
|
end
|
61
|
-
end
|
62
63
|
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
64
|
+
# Tokenizes a text, respecting the word boundary rules from Unicode’s Default
|
65
|
+
# Word Boundary Specification.
|
66
|
+
#
|
67
|
+
# If a tokenized text was provided at the document's initialization, those
|
68
|
+
# tokens will be returned without additional processing.
|
69
|
+
#
|
70
|
+
# @param [String] text a text
|
71
|
+
# @return [Enumerator] a token enumerator
|
72
|
+
#
|
73
|
+
# @note We should evaluate the tokenizers by {http://www.sciencemag.org/content/suppl/2010/12/16/science.1199644.DC1/Michel.SOM.revision.2.pdf Google}
|
74
|
+
# or {http://wiki.apache.org/solr/AnalyzersTokenizersTokenFilters#solr.UAX29URLEmailTokenizerFactory Solr}.
|
75
|
+
#
|
76
|
+
# @see http://unicode.org/reports/tr29/#Default_Word_Boundaries
|
77
|
+
# @see http://wiki.apache.org/solr/AnalyzersTokenizersTokenFilters#solr.StandardTokenizerFactory
|
78
|
+
def tokenize(text)
|
79
|
+
@tokens || defined?(UnicodeUtils) && UnicodeUtils.each_word(text) || text.split(/\b/) # @todo Ruby 1.8 has no good word boundary code
|
80
|
+
end
|
79
81
|
end
|
80
82
|
end
|
@@ -1,11 +1,13 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
|
4
|
-
|
5
|
-
|
1
|
+
module TfIdfSimilarity
|
2
|
+
class Document
|
3
|
+
# @return [Float] the maximum term count of any term in the document
|
4
|
+
def maximum_term_count
|
5
|
+
@maximum_term_count ||= term_counts.values.max.to_f
|
6
|
+
end
|
6
7
|
|
7
|
-
|
8
|
-
|
9
|
-
|
8
|
+
# @return [Float] the average term count of all terms in the document
|
9
|
+
def average_term_count
|
10
|
+
@average_term_count ||= term_counts.values.reduce(0, :+) / term_counts.size.to_f
|
11
|
+
end
|
10
12
|
end
|
11
13
|
end
|
@@ -10,183 +10,185 @@
|
|
10
10
|
# @see http://nlp.stanford.edu/IR-book/html/htmledition/document-and-query-weighting-schemes-1.html
|
11
11
|
# @see http://www.cs.odu.edu/~jbollen/IR04/readings/article1-29-03.pdf
|
12
12
|
# @see http://www.sandia.gov/~tgkolda/pubs/bibtgkfiles/ornl-tm-13756.pdf
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
# @see https://github.com/reddavis/TF-IDF/blob/master/lib/tf_idf.rb#L50
|
22
|
-
#
|
23
|
-
# SMART t, Salton f, Chisholm IDFB
|
24
|
-
def plain_inverse_document_frequency(term, numerator = 0, denominator = 0)
|
25
|
-
log((documents.size + numerator) / (@model.document_count(term).to_f + denominator))
|
26
|
-
end
|
27
|
-
alias_method :plain_idf, :plain_inverse_document_frequency
|
13
|
+
module TfIdfSimilarity
|
14
|
+
class TfIdfModel
|
15
|
+
# @see https://github.com/louismullie/treat/blob/master/lib/treat/workers/extractors/tf_idf/native.rb#L17
|
16
|
+
#
|
17
|
+
# SMART n, Salton x, Chisholm NONE
|
18
|
+
def no_collection_frequency(term)
|
19
|
+
1.0
|
20
|
+
end
|
28
21
|
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
22
|
+
# @see https://github.com/reddavis/TF-IDF/blob/master/lib/tf_idf.rb#L50
|
23
|
+
#
|
24
|
+
# SMART t, Salton f, Chisholm IDFB
|
25
|
+
def plain_inverse_document_frequency(term, numerator = 0, denominator = 0)
|
26
|
+
log((documents.size + numerator) / (@model.document_count(term).to_f + denominator))
|
27
|
+
end
|
28
|
+
alias_method :plain_idf, :plain_inverse_document_frequency
|
35
29
|
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
30
|
+
# SMART p, Salton p, Chisholm IDFP
|
31
|
+
def probabilistic_inverse_document_frequency(term)
|
32
|
+
count = @model.document_count(term).to_f
|
33
|
+
log((documents.size - count) / count)
|
34
|
+
end
|
35
|
+
alias_method :probabilistic_idf, :probabilistic_inverse_document_frequency
|
41
36
|
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
37
|
+
# Chisholm IGFF
|
38
|
+
def global_frequency_inverse_document_frequency(term)
|
39
|
+
@model.term_count(term) / @model.document_count(term).to_f
|
40
|
+
end
|
41
|
+
alias_method :gfidf, :global_frequency_inverse_document_frequency
|
47
42
|
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
43
|
+
# Chisholm IGFL
|
44
|
+
def log_global_frequency_inverse_document_frequency(term)
|
45
|
+
log(global_frequency_inverse_document_frequency(term) + 1)
|
46
|
+
end
|
47
|
+
alias_method :log_gfidf, :log_global_frequency_inverse_document_frequency
|
53
48
|
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
49
|
+
# Chisholm IGFI
|
50
|
+
def incremented_global_frequency_inverse_document_frequency(term)
|
51
|
+
global_frequency_inverse_document_frequency(term) + 1
|
52
|
+
end
|
53
|
+
alias_method :incremented_gfidf, :incremented_global_frequency_inverse_document_frequency
|
59
54
|
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
55
|
+
# Chisholm IGFS
|
56
|
+
def square_root_global_frequency_inverse_document_frequency(term)
|
57
|
+
sqrt(global_frequency_inverse_document_frequency(term) - 0.9)
|
58
|
+
end
|
59
|
+
alias_method :square_root_gfidf, :square_root_global_frequency_inverse_document_frequency
|
60
|
+
|
61
|
+
# Chisholm ENPY
|
62
|
+
def entropy(term)
|
63
|
+
denominator = @model.term_count(term).to_f
|
64
|
+
logN = log(documents.size)
|
65
|
+
1 + documents.reduce(0) do |sum,document|
|
66
|
+
quotient = document.term_count(term) / denominator
|
67
|
+
sum += quotient * log(quotient) / logN
|
68
|
+
end
|
67
69
|
end
|
68
|
-
end
|
69
70
|
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
71
|
+
# @see https://github.com/mkdynamic/vss/blob/master/lib/vss/engine.rb
|
72
|
+
# @see https://github.com/louismullie/treat/blob/master/lib/treat/workers/extractors/tf_idf/native.rb
|
73
|
+
# @see https://github.com/reddavis/TF-IDF/blob/master/lib/tf_idf.rb
|
74
|
+
# @see https://github.com/mchung/tf-idf/blob/master/lib/tf-idf.rb
|
75
|
+
# @see https://github.com/josephwilk/rsemantic/blob/master/lib/semantic/transform/tf_idf_transform.rb
|
76
|
+
#
|
77
|
+
# SMART n, Salton x, Chisholm NONE
|
78
|
+
def no_normalization(matrix)
|
79
|
+
matrix
|
80
|
+
end
|
80
81
|
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
82
|
+
# @see http://nlp.stanford.edu/IR-book/html/htmledition/pivoted-normalized-document-length-1.html
|
83
|
+
#
|
84
|
+
# SMART u, Chisholm PUQN
|
85
|
+
def pivoted_unique_normalization(matrix)
|
86
|
+
raise NotImplementedError
|
87
|
+
end
|
87
88
|
|
88
|
-
|
89
|
-
|
90
|
-
|
89
|
+
# Cosine normalization is implemented as MatrixMethods#normalize.
|
90
|
+
#
|
91
|
+
# SMART c, Salton c, Chisholm COSN
|
91
92
|
|
92
93
|
|
93
94
|
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
95
|
+
# The plain term frequency is implemented as Document#term_count.
|
96
|
+
#
|
97
|
+
# @see https://github.com/mkdynamic/vss/blob/master/lib/vss/engine.rb#L75
|
98
|
+
# @see https://github.com/louismullie/treat/blob/master/lib/treat/workers/extractors/tf_idf/native.rb#L11
|
99
|
+
#
|
100
|
+
# SMART n, Salton t, Chisholm FREQ
|
100
101
|
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
102
|
+
# SMART b, Salton b, Chisholm BNRY
|
103
|
+
def binary_term_frequency(document, term)
|
104
|
+
count = document.term_count(term)
|
105
|
+
if count > 0
|
106
|
+
1
|
107
|
+
else
|
108
|
+
0
|
109
|
+
end
|
108
110
|
end
|
109
|
-
|
110
|
-
alias_method :binary_tf, :binary_term_frequency
|
111
|
-
|
112
|
-
# @see http://en.wikipedia.org/wiki/Tf*idf
|
113
|
-
# @see http://nlp.stanford.edu/IR-book/html/htmledition/maximum-tf-normalization-1.html
|
114
|
-
def normalized_term_frequency(document, term, a = 0)
|
115
|
-
a + (1 - a) * document.term_count(term) / document.maximum_term_count
|
116
|
-
end
|
117
|
-
alias_method :normalized_tf, :normalized_term_frequency
|
111
|
+
alias_method :binary_tf, :binary_term_frequency
|
118
112
|
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
alias_method :augmented_normalized_tf, :augmented_normalized_term_frequency
|
124
|
-
|
125
|
-
# Chisholm ATFA
|
126
|
-
def augmented_average_term_frequency(document, term)
|
127
|
-
count = document.term_count(term)
|
128
|
-
if count > 0
|
129
|
-
0.9 + 0.1 * count / document.average_term_count
|
130
|
-
else
|
131
|
-
0
|
113
|
+
# @see http://en.wikipedia.org/wiki/Tf*idf
|
114
|
+
# @see http://nlp.stanford.edu/IR-book/html/htmledition/maximum-tf-normalization-1.html
|
115
|
+
def normalized_term_frequency(document, term, a = 0)
|
116
|
+
a + (1 - a) * document.term_count(term) / document.maximum_term_count
|
132
117
|
end
|
133
|
-
|
134
|
-
alias_method :augmented_average_tf, :augmented_average_term_frequency
|
118
|
+
alias_method :normalized_tf, :normalized_term_frequency
|
135
119
|
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
if count > 0
|
140
|
-
0.2 + 0.8 * count / document.maximum_term_count
|
141
|
-
else
|
142
|
-
0
|
120
|
+
# SMART a, Salton n, Chisholm ATF1
|
121
|
+
def augmented_normalized_term_frequency(document, term)
|
122
|
+
0.5 + 0.5 * normalized_term_frequency(document, term)
|
143
123
|
end
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
else
|
155
|
-
0
|
124
|
+
alias_method :augmented_normalized_tf, :augmented_normalized_term_frequency
|
125
|
+
|
126
|
+
# Chisholm ATFA
|
127
|
+
def augmented_average_term_frequency(document, term)
|
128
|
+
count = document.term_count(term)
|
129
|
+
if count > 0
|
130
|
+
0.9 + 0.1 * count / document.average_term_count
|
131
|
+
else
|
132
|
+
0
|
133
|
+
end
|
156
134
|
end
|
157
|
-
|
158
|
-
|
159
|
-
|
160
|
-
|
161
|
-
|
162
|
-
|
163
|
-
|
164
|
-
|
165
|
-
|
166
|
-
|
135
|
+
alias_method :augmented_average_tf, :augmented_average_term_frequency
|
136
|
+
|
137
|
+
# Chisholm ATFC
|
138
|
+
def changed_coefficient_augmented_normalized_term_frequency(document, term)
|
139
|
+
count = document.term_count(term)
|
140
|
+
if count > 0
|
141
|
+
0.2 + 0.8 * count / document.maximum_term_count
|
142
|
+
else
|
143
|
+
0
|
144
|
+
end
|
167
145
|
end
|
168
|
-
|
169
|
-
|
170
|
-
|
171
|
-
|
172
|
-
|
173
|
-
|
174
|
-
|
175
|
-
|
176
|
-
|
177
|
-
|
146
|
+
alias_method :changed_coefficient_augmented_normalized_tf, :changed_coefficient_augmented_normalized_term_frequency
|
147
|
+
|
148
|
+
# @see https://github.com/louismullie/treat/blob/master/lib/treat/workers/extractors/tf_idf/native.rb#L12
|
149
|
+
#
|
150
|
+
# SMART l, Chisholm LOGA
|
151
|
+
def log_term_frequency(document, term)
|
152
|
+
count = document.term_count(term)
|
153
|
+
if count > 0
|
154
|
+
1 + log(count)
|
155
|
+
else
|
156
|
+
0
|
157
|
+
end
|
178
158
|
end
|
179
|
-
|
180
|
-
|
181
|
-
|
182
|
-
|
183
|
-
|
184
|
-
|
185
|
-
|
186
|
-
|
187
|
-
|
188
|
-
|
159
|
+
alias_method :log_tf, :log_term_frequency
|
160
|
+
|
161
|
+
# SMART L, Chisholm LOGN
|
162
|
+
def normalized_log_term_frequency(document, term)
|
163
|
+
count = document.term_count(term)
|
164
|
+
if count > 0
|
165
|
+
(1 + log(count)) / (1 + log(document.average_term_count))
|
166
|
+
else
|
167
|
+
0
|
168
|
+
end
|
169
|
+
end
|
170
|
+
alias_method :normalized_log_tf, :normalized_log_term_frequency
|
171
|
+
|
172
|
+
# Chisholm LOGG
|
173
|
+
def augmented_log_term_frequency(document, term)
|
174
|
+
count = document.term_count(term)
|
175
|
+
if count > 0
|
176
|
+
0.2 + 0.8 * log(count + 1)
|
177
|
+
else
|
178
|
+
0
|
179
|
+
end
|
180
|
+
end
|
181
|
+
alias_method :augmented_log_tf, :augmented_log_term_frequency
|
182
|
+
|
183
|
+
# Chisholm SQRT
|
184
|
+
def square_root_term_frequency(document, term)
|
185
|
+
count = document.term_count(term)
|
186
|
+
if count > 0
|
187
|
+
sqrt(count - 0.5) + 1
|
188
|
+
else
|
189
|
+
0
|
190
|
+
end
|
189
191
|
end
|
192
|
+
alias_method :square_root_tf, :square_root_term_frequency
|
190
193
|
end
|
191
|
-
alias_method :square_root_tf, :square_root_term_frequency
|
192
194
|
end
|