tf-idf-similarity 0.0.9 → 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/.travis.yml +29 -0
- data/Gemfile +4 -0
- data/README.md +41 -29
- data/lib/tf-idf-similarity.rb +12 -1
- data/lib/tf-idf-similarity/document.rb +35 -28
- data/lib/tf-idf-similarity/extras/document.rb +2 -125
- data/lib/tf-idf-similarity/extras/tf_idf_model.rb +192 -0
- data/lib/tf-idf-similarity/matrix_methods.rb +164 -0
- data/lib/tf-idf-similarity/term_count_model.rb +78 -0
- data/lib/tf-idf-similarity/tf_idf_model.rb +81 -0
- data/lib/tf-idf-similarity/token.rb +34 -12
- data/lib/tf-idf-similarity/version.rb +1 -1
- data/spec/document_spec.rb +136 -0
- data/spec/extras/tf_idf_model_spec.rb +269 -0
- data/spec/spec_helper.rb +21 -0
- data/spec/term_count_model_spec.rb +108 -0
- data/spec/tf_idf_model_spec.rb +174 -0
- data/spec/token_spec.rb +34 -0
- data/td-idf-similarity.gemspec +3 -3
- metadata +91 -63
- data/lib/tf-idf-similarity/collection.rb +0 -205
- data/lib/tf-idf-similarity/extras/collection.rb +0 -110
data/.travis.yml
CHANGED
@@ -1,3 +1,32 @@
|
|
1
1
|
language: ruby
|
2
2
|
rvm:
|
3
|
+
- 1.8.7
|
4
|
+
- 1.9.2
|
3
5
|
- 1.9.3
|
6
|
+
- 2.0.0
|
7
|
+
- ree
|
8
|
+
env:
|
9
|
+
- MATRIX_LIBRARY=gsl
|
10
|
+
- MATRIX_LIBRARY=narray
|
11
|
+
- MATRIX_LIBRARY=nmatrix
|
12
|
+
- MATRIX_LIBRARY=matrix
|
13
|
+
matrix:
|
14
|
+
exclude:
|
15
|
+
- rvm: 1.8.7
|
16
|
+
env: MATRIX_LIBRARY=nmatrix
|
17
|
+
- rvm: ree
|
18
|
+
env: MATRIX_LIBRARY=nmatrix
|
19
|
+
before_install:
|
20
|
+
- bundle config build.nmatrix --with-lapacklib
|
21
|
+
- if [ $MATRIX_LIBRARY = 'nmatrix' -o $MATRIX_LIBRARY = 'gsl' ]; then sudo apt-get update -qq; fi
|
22
|
+
- if [ $MATRIX_LIBRARY = 'gsl' ]; then sudo apt-get install gsl-bin libgsl0-dev; fi
|
23
|
+
# Installing ATLAS will install BLAS.
|
24
|
+
- if [ $MATRIX_LIBRARY = 'nmatrix' ]; then sudo apt-get install -qq libatlas-dev libatlas-base-dev libatlas3gf-base; fi
|
25
|
+
- if [ $MATRIX_LIBRARY = 'nmatrix' ]; then export CPLUS_INCLUDE_PATH=$CPLUS_INCLUDE_PATH:/usr/include/atlas; fi
|
26
|
+
- if [ $MATRIX_LIBRARY = 'nmatrix' ]; then git clone git://github.com/SciRuby/nmatrix.git; fi
|
27
|
+
- if [ $MATRIX_LIBRARY = 'nmatrix' ]; then cd nmatrix && ORIGINAL_BUNDLE_GEMFILE=$BUNDLE_GEMFILE; fi
|
28
|
+
- if [ $MATRIX_LIBRARY = 'nmatrix' ]; then BUNDLE_GEMFILE=`pwd`/Gemfile && bundle && bundle exec rake install; fi
|
29
|
+
- if [ $MATRIX_LIBRARY = 'nmatrix' ]; then cd .. && BUNDLE_GEMFILE=$ORIGINAL_BUNDLE_GEMFILE; fi
|
30
|
+
# Travis sometimes runs without Bundler.
|
31
|
+
install: bundle
|
32
|
+
script: bundle exec rake --trace
|
data/Gemfile
CHANGED
@@ -1,4 +1,8 @@
|
|
1
1
|
source "http://rubygems.org"
|
2
2
|
|
3
|
+
gem 'gsl', '~> 1.15.3' if ENV['MATRIX_LIBRARY'] == 'gsl'
|
4
|
+
gem 'narray', '~> 0.6.0.0' if ENV['MATRIX_LIBRARY'] == 'narray'
|
5
|
+
gem 'nmatrix', :git => 'git://github.com/SciRuby/nmatrix.git' if ENV['MATRIX_LIBRARY'] == 'nmatrix' && RUBY_VERSION >= '1.9'
|
6
|
+
|
3
7
|
# Specify your gem's dependencies in the gemspec
|
4
8
|
gemspec
|
data/README.md
CHANGED
@@ -1,70 +1,90 @@
|
|
1
1
|
# Ruby Vector Space Model (VSM) with tf*idf weights
|
2
2
|
|
3
|
+
[![Build Status](https://secure.travis-ci.org/opennorth/tf-idf-similarity.png)](http://travis-ci.org/opennorth/tf-idf-similarity)
|
3
4
|
[![Dependency Status](https://gemnasium.com/opennorth/tf-idf-similarity.png)](https://gemnasium.com/opennorth/tf-idf-similarity)
|
4
|
-
[![
|
5
|
+
[![Coverage Status](https://coveralls.io/repos/opennorth/tf-idf-similarity/badge.png?branch=master)](https://coveralls.io/r/opennorth/tf-idf-similarity)
|
6
|
+
[![Code Climate](https://codeclimate.com/github/opennorth/tf-idf-similarity.png)](https://codeclimate.com/github/opennorth/tf-idf-similarity)
|
5
7
|
|
6
|
-
Calculates the similarity between texts using a [bag-of-words](http://en.wikipedia.org/wiki/Bag_of_words_model) [Vector Space Model](http://en.wikipedia.org/wiki/Vector_space_model) with [Term Frequency-Inverse Document Frequency
|
8
|
+
Calculates the similarity between texts using a [bag-of-words](http://en.wikipedia.org/wiki/Bag_of_words_model) [Vector Space Model](http://en.wikipedia.org/wiki/Vector_space_model) with [Term Frequency-Inverse Document Frequency (tf*idf)](http://en.wikipedia.org/wiki/
|
9
|
+
) weights. If your use case demands performance, use [Lucene](http://lucene.apache.org/core/) or similar (see below).
|
7
10
|
|
8
11
|
## Usage
|
9
12
|
|
13
|
+
require 'matrix'
|
10
14
|
require 'tf-idf-similarity'
|
11
15
|
|
12
|
-
|
16
|
+
Create a set of documents:
|
17
|
+
|
18
|
+
corpus = []
|
13
19
|
corpus << TfIdfSimilarity::Document.new("Lorem ipsum dolor sit amet...")
|
14
20
|
corpus << TfIdfSimilarity::Document.new("Pellentesque sed ipsum dui...")
|
15
21
|
corpus << TfIdfSimilarity::Document.new("Nam scelerisque dui sed leo...")
|
16
22
|
|
17
|
-
|
23
|
+
Create a document-term matrix using [Term Frequency-Inverse Document Frequency function](http://en.wikipedia.org/wiki/) (default:
|
18
24
|
|
19
|
-
|
25
|
+
model = TfIdfSimilarity::TfIdfModel.new(corpus, :function => :tf_idf)
|
20
26
|
|
21
|
-
|
27
|
+
Create a document-term matrix using the [Okapi BM25 ranking function](http://en.wikipedia.org/wiki/Okapi_BM25):
|
22
28
|
|
23
|
-
|
29
|
+
model = TfIdfSimilarity::TfIdfModel.new(corpus, :function => :bm25)
|
30
|
+
|
31
|
+
[Read the documentation at RubyDoc.info.](http://rubydoc.info/gems/tf-idf-similarity)
|
32
|
+
|
33
|
+
## Speed
|
24
34
|
|
25
|
-
|
35
|
+
Instead of using the Ruby Standard Library's [Matrix](http://www.ruby-doc.org/stdlib-2.0/libdoc/matrix/rdoc/Matrix.html) class, you can use one of the `gsl`, `narray` or `nmatrix` gems for faster matrix operations, e.g.:
|
26
36
|
|
27
|
-
|
28
|
-
|
29
|
-
git checkout -b gsl-1.14 83ed49411f076e30ced04c2cbebb054b2645a431
|
30
|
-
brew install gsl
|
31
|
-
git checkout master
|
32
|
-
git branch -d gsl-1.14
|
33
|
-
```
|
37
|
+
require 'gsl'
|
38
|
+
model = TfIdfSimilarity::TfIdfModel.new(corpus, :library => :gsl)
|
34
39
|
|
35
|
-
|
40
|
+
### [GNU Scientific Library (GSL)](http://www.gnu.org/software/gsl/)
|
36
41
|
|
37
|
-
gem install gsl
|
42
|
+
gem install gsl
|
38
43
|
|
39
44
|
### [NArray](http://narray.rubyforge.org/)
|
40
45
|
|
41
46
|
gem install narray
|
42
47
|
|
48
|
+
### [NMatrix](https://github.com/SciRuby/nmatrix)
|
49
|
+
|
50
|
+
The nmatrix gem gives access to [Automatically Tuned Linear Algebra Software (ATLAS)](http://math-atlas.sourceforge.net/), which you may know of through [Linear Algebra PACKage (LAPACK)](http://www.netlib.org/lapack/) or [Basic Linear Algebra Subprograms (BLAS)](http://www.netlib.org/blas/). Follow [these instructions](https://github.com/SciRuby/nmatrix#synopsis) to install the nmatrix gem. You may need [additional instructions for Mac OS X Lion](https://github.com/SciRuby/nmatrix/wiki/Installation).
|
51
|
+
|
43
52
|
## Extras
|
44
53
|
|
45
54
|
You can access more term frequency, document frequency, and normalization formulas with:
|
46
55
|
|
47
|
-
require 'tf-idf-similarity/extras/collection'
|
48
56
|
require 'tf-idf-similarity/extras/document'
|
57
|
+
require 'tf-idf-similarity/extras/tf_idf_model'
|
49
58
|
|
50
59
|
The default tf*idf formula follows the [Lucene Conceptual Scoring Formula](http://lucene.apache.org/core/4_0_0-BETA/core/org/apache/lucene/search/similarities/TFIDFSimilarity.html).
|
51
60
|
|
52
61
|
## Why?
|
53
62
|
|
54
|
-
|
63
|
+
At the time of writing, no other Ruby gem implemented the tf*idf formula used by Lucene, Sphinx and Ferret.
|
64
|
+
|
65
|
+
* [rsemantic](https://github.com/josephwilk/rsemantic) now uses the same [term frequency](https://github.com/josephwilk/rsemantic/blob/master/lib/semantic/transform/tf_idf_transform.rb#L14) and [document frequency](https://github.com/josephwilk/rsemantic/blob/master/lib/semantic/transform/tf_idf_transform.rb#L13) formulas as Lucene.
|
66
|
+
* [treat](https://github.com/louismullie/treat) offers many term frequency formulas, [one of which](https://github.com/louismullie/treat/blob/master/lib/treat/workers/extractors/tf_idf/native.rb#L13) is the same as Lucene.
|
67
|
+
* [similarity](https://github.com/bbcrd/Similarity) uses [cosine normalization](https://github.com/bbcrd/Similarity/blob/master/lib/similarity/term_document_matrix.rb#L23), which corresponds roughly to Lucene.
|
55
68
|
|
56
69
|
### Term frequencies
|
57
70
|
|
58
|
-
The [vss](https://github.com/mkdynamic/vss) gem does not normalize the frequency of a term in a document; this occurs frequently in the academic literature, but only to demonstrate why normalization is important. The [
|
71
|
+
The [vss](https://github.com/mkdynamic/vss) gem does not normalize the frequency of a term in a document; this occurs frequently in the academic literature, but only to demonstrate why normalization is important. The [tf_idf](https://github.com/reddavis/TF-IDF) and similarity gems normalize the frequency of a term in a document to the number of terms in that document, which never occurs in the literature. The [tf-idf](https://github.com/mchung/tf-idf) gem normalizes the frequency of a term in a document to the number of *unique* terms in that document, which never occurs in the literature.
|
59
72
|
|
60
73
|
### Document frequencies
|
61
74
|
|
62
|
-
The vss gem does not normalize the inverse document frequency. The tf_idf, tf-idf
|
75
|
+
The vss gem does not normalize the inverse document frequency. The treat, tf_idf, tf-idf and similarity gems use variants of the typical inverse document frequency formula.
|
63
76
|
|
64
77
|
### Normalization
|
65
78
|
|
66
79
|
The treat, tf_idf, tf-idf, rsemantic and vss gems have no normalization component.
|
67
80
|
|
81
|
+
## Additional adapters
|
82
|
+
|
83
|
+
Adapters for the following projects were also considered:
|
84
|
+
|
85
|
+
* [Ruby-LAPACK](http://ruby.gfd-dennou.org/products/ruby-lapack/) is a very thin wrapper around LAPACK, which has an opaque Fortran-style naming scheme.
|
86
|
+
* [Linalg](https://github.com/quix/linalg) and [RNum](http://rnum.rubyforge.org/) give access to LAPACK from Ruby, but are old and unavailable as gems.
|
87
|
+
|
68
88
|
## Reference
|
69
89
|
|
70
90
|
* [G. Salton and C. Buckley. "Term Weighting Approaches in Automatic Text Retrieval." Technical Report. Cornell University, Ithaca, NY, USA. 1987.](http://www.cs.odu.edu/~jbollen/IR04/readings/article1-29-03.pdf)
|
@@ -81,14 +101,6 @@ Lucene implements many more [similarity functions](http://lucene.apache.org/core
|
|
81
101
|
|
82
102
|
Lucene can even [combine similarity measures](http://lucene.apache.org/core/4_0_0-BETA/core/org/apache/lucene/search/similarities/MultiSimilarity.html).
|
83
103
|
|
84
|
-
## Other optimizations
|
85
|
-
|
86
|
-
[Automatically Tuned Linear Algebra Software (ATLAS)](http://math-atlas.sourceforge.net/) is available through [Linear Algebra PACKage (LAPACK)](http://www.netlib.org/lapack/) or [Basic Linear Algebra Subprograms (BLAS)](http://www.netlib.org/blas/). You can use it through the next release (after `0.0.2`) of the [nmatrix gem](https://github.com/SciRuby/nmatrix). Follow [these instructions](https://github.com/SciRuby/nmatrix#synopsis) to install it. You may need [additional instructions for Mac OS X Lion](https://github.com/SciRuby/nmatrix/wiki/NMatrix-Installation).
|
87
|
-
|
88
|
-
### Other Options
|
89
|
-
|
90
|
-
[Ruby-LAPACK](http://ruby.gfd-dennou.org/products/ruby-lapack/) is a very thin wrapper around LAPACK, which has an opaque Fortran-style naming scheme. [Linalg](https://github.com/quix/linalg) and [RNum](http://rnum.rubyforge.org/) are old and not available as gems.
|
91
|
-
|
92
104
|
## Bugs? Questions?
|
93
105
|
|
94
106
|
This gem's main repository is on GitHub: [http://github.com/opennorth/tf-idf-similarity](http://github.com/opennorth/tf-idf-similarity), where your contributions, forks, bug reports, feature requests, and feedback are greatly welcomed.
|
data/lib/tf-idf-similarity.rb
CHANGED
@@ -1,6 +1,17 @@
|
|
1
|
+
require 'forwardable'
|
2
|
+
require 'set'
|
3
|
+
|
4
|
+
begin
|
5
|
+
require 'unicode_utils'
|
6
|
+
rescue LoadError
|
7
|
+
# Ruby 1.8
|
8
|
+
end
|
9
|
+
|
1
10
|
module TfIdfSimilarity
|
2
11
|
end
|
3
12
|
|
4
|
-
require 'tf-idf-similarity/
|
13
|
+
require 'tf-idf-similarity/matrix_methods'
|
14
|
+
require 'tf-idf-similarity/term_count_model'
|
15
|
+
require 'tf-idf-similarity/tf_idf_model'
|
5
16
|
require 'tf-idf-similarity/document'
|
6
17
|
require 'tf-idf-similarity/token'
|
@@ -1,56 +1,63 @@
|
|
1
|
-
#
|
2
|
-
require 'unicode_utils'
|
3
|
-
|
1
|
+
# A document.
|
4
2
|
class TfIdfSimilarity::Document
|
5
|
-
#
|
3
|
+
# The document's identifier.
|
6
4
|
attr_reader :id
|
7
5
|
# The document's text.
|
8
6
|
attr_reader :text
|
9
|
-
# The document's tokenized text.
|
10
|
-
attr_reader :tokens
|
11
7
|
# The number of times each term appears in the document.
|
12
8
|
attr_reader :term_counts
|
13
|
-
# The
|
9
|
+
# The number of tokens in the document.
|
14
10
|
attr_reader :size
|
15
11
|
|
16
12
|
# @param [String] text the document's text
|
17
13
|
# @param [Hash] opts optional arguments
|
18
|
-
# @option opts [String] :id
|
14
|
+
# @option opts [String] :id the document's identifier
|
19
15
|
# @option opts [Array] :tokens the document's tokenized text
|
16
|
+
# @option opts [Hash] :term_counts the number of times each term appears
|
17
|
+
# @option opts [Integer] :size the number of tokens in the document
|
20
18
|
def initialize(text, opts = {})
|
21
|
-
@text
|
22
|
-
@id
|
23
|
-
@tokens
|
24
|
-
|
25
|
-
|
19
|
+
@text = text
|
20
|
+
@id = opts[:id] || object_id
|
21
|
+
@tokens = opts[:tokens]
|
22
|
+
|
23
|
+
if opts[:term_counts]
|
24
|
+
@term_counts = opts[:term_counts]
|
25
|
+
@size = opts[:size] || term_counts.values.reduce(0, :+)
|
26
|
+
# Nothing to do.
|
27
|
+
else
|
28
|
+
@term_counts = Hash.new(0)
|
29
|
+
@size = 0
|
30
|
+
set_term_counts_and_size
|
31
|
+
end
|
26
32
|
end
|
27
33
|
|
28
|
-
#
|
34
|
+
# Returns the set of terms in the document.
|
35
|
+
#
|
36
|
+
# @return [Array<String>] the unique terms in the document
|
29
37
|
def terms
|
30
38
|
term_counts.keys
|
31
39
|
end
|
32
|
-
|
33
|
-
#
|
34
|
-
# @return [Float] the square root of the term count
|
40
|
+
|
41
|
+
# Returns the number of occurrences of the term in the document.
|
35
42
|
#
|
36
|
-
# @
|
37
|
-
# @
|
38
|
-
def
|
39
|
-
|
43
|
+
# @param [String] term a term
|
44
|
+
# @return [Integer] the number of times the term appears in the document
|
45
|
+
def term_count(term)
|
46
|
+
term_counts[term].to_i # need #to_i if unmarshalled
|
40
47
|
end
|
41
|
-
alias_method :tf, :term_frequency
|
42
48
|
|
43
49
|
private
|
44
50
|
|
45
|
-
#
|
46
|
-
def
|
51
|
+
# Tokenizes the text and counts terms and total tokens.
|
52
|
+
def set_term_counts_and_size
|
47
53
|
tokenize(text).each do |word|
|
48
|
-
token = TfIdfSimilarity::Token.new
|
54
|
+
token = TfIdfSimilarity::Token.new(word)
|
49
55
|
if token.valid?
|
50
|
-
|
56
|
+
term = token.lowercase_filter.classic_filter.to_s
|
57
|
+
@term_counts[term] += 1
|
58
|
+
@size += 1
|
51
59
|
end
|
52
60
|
end
|
53
|
-
@size = term_counts.values.reduce(:+)
|
54
61
|
end
|
55
62
|
|
56
63
|
# Tokenizes a text, respecting the word boundary rules from Unicode’s Default
|
@@ -68,6 +75,6 @@ private
|
|
68
75
|
# @see http://unicode.org/reports/tr29/#Default_Word_Boundaries
|
69
76
|
# @see http://wiki.apache.org/solr/AnalyzersTokenizersTokenFilters#solr.StandardTokenizerFactory
|
70
77
|
def tokenize(text)
|
71
|
-
@tokens || UnicodeUtils.each_word(text)
|
78
|
+
@tokens || defined?(UnicodeUtils) && UnicodeUtils.each_word(text) || text.split(/\b/) # @todo Ruby 1.8 has no good word boundary code
|
72
79
|
end
|
73
80
|
end
|
@@ -1,134 +1,11 @@
|
|
1
|
-
require 'tf-idf-similarity/document'
|
2
|
-
|
3
|
-
# @todo http://nlp.stanford.edu/IR-book/html/htmledition/maximum-tf-normalization-1.html
|
4
|
-
#
|
5
|
-
# @note The treat, tf_idf, similarity and rsemantic gems normalizes to the number of terms in the document.
|
6
|
-
# @see https://github.com/louismullie/treat/blob/master/lib/treat/workers/extractors/tf_idf/native.rb#L77
|
7
|
-
# @see https://github.com/reddavis/TF-IDF/blob/master/lib/tf_idf.rb#L76
|
8
|
-
# @see https://github.com/bbcrd/Similarity/blob/master/lib/similarity/document.rb#L42
|
9
|
-
# @see https://github.com/josephwilk/rsemantic/blob/master/lib/semantic/transform/tf_idf_transform.rb#L17
|
10
|
-
#
|
11
|
-
# @note The tf-idf gem normalizes to the number of unique terms in the document.
|
12
|
-
# @see https://github.com/mchung/tf-idf/blob/master/lib/tf-idf.rb#L172
|
13
|
-
#
|
14
|
-
# @see http://nlp.stanford.edu/IR-book/html/htmledition/document-and-query-weighting-schemes-1.html
|
15
|
-
# @see http://www.cs.odu.edu/~jbollen/IR04/readings/article1-29-03.pdf
|
16
|
-
# @see http://www.sandia.gov/~tgkolda/pubs/bibtgkfiles/ornl-tm-13756.pdf
|
17
1
|
class TfIdfSimilarity::Document
|
18
2
|
# @return [Float] the maximum term count of any term in the document
|
19
3
|
def maximum_term_count
|
20
|
-
@maximum_term_count ||=
|
4
|
+
@maximum_term_count ||= term_counts.values.max.to_f
|
21
5
|
end
|
22
6
|
|
23
7
|
# @return [Float] the average term count of all terms in the document
|
24
8
|
def average_term_count
|
25
|
-
@average_term_count ||=
|
26
|
-
end
|
27
|
-
|
28
|
-
# Returns the term count.
|
29
|
-
# @see https://github.com/mkdynamic/vss/blob/master/lib/vss/engine.rb#L75
|
30
|
-
# @see https://github.com/louismullie/treat/blob/master/lib/treat/workers/extractors/tf_idf/native.rb#L11
|
31
|
-
#
|
32
|
-
# SMART n, Salton t, Chisholm FREQ
|
33
|
-
def plain_term_frequency(term)
|
34
|
-
term_counts[term]
|
35
|
-
end
|
36
|
-
alias :plain_tf, :plain_term_frequency
|
37
|
-
|
38
|
-
# Returns 1 if the term is present, 0 otherwise.
|
39
|
-
#
|
40
|
-
# SMART b, Salton b, Chisholm BNRY
|
41
|
-
def binary_term_frequency(term)
|
42
|
-
count = term_counts[term]
|
43
|
-
if count > 0
|
44
|
-
1
|
45
|
-
else
|
46
|
-
0
|
47
|
-
end
|
48
|
-
end
|
49
|
-
alias_method :binary_tf, :binary_term_frequency
|
50
|
-
|
51
|
-
# Normalizes the term count by the maximum term count.
|
52
|
-
#
|
53
|
-
# @see http://en.wikipedia.org/wiki/Tf*idf
|
54
|
-
def normalized_term_frequency(term)
|
55
|
-
term_counts[term] / maximum_term_count
|
56
|
-
end
|
57
|
-
alias_method :normalized_tf, :normalized_term_frequency
|
58
|
-
|
59
|
-
# Further normalizes the normalized term frequency to lie between 0.5 and 1.
|
60
|
-
#
|
61
|
-
# SMART a, Salton n, Chisholm ATF1
|
62
|
-
def augmented_normalized_term_frequency(term)
|
63
|
-
0.5 + 0.5 * normalized_term_frequency(term)
|
64
|
-
end
|
65
|
-
alias_method :augmented_normalized_tf, :augmented_normalized_term_frequency
|
66
|
-
|
67
|
-
# Chisholm ATFA
|
68
|
-
def augmented_average_term_frequency(term)
|
69
|
-
count = term_counts[term]
|
70
|
-
if count > 0
|
71
|
-
0.9 + 0.1 * count / average_term_count
|
72
|
-
else
|
73
|
-
0
|
74
|
-
end
|
75
|
-
end
|
76
|
-
alias_method :augmented_average_tf, :augmented_average_term_frequency
|
77
|
-
|
78
|
-
# Chisholm ATFC
|
79
|
-
def changed_coefficient_augmented_normalized_term_frequency(term)
|
80
|
-
count = term_counts[term]
|
81
|
-
if count > 0
|
82
|
-
0.2 + 0.8 * count / maximum_term_count
|
83
|
-
else
|
84
|
-
0
|
85
|
-
end
|
86
|
-
end
|
87
|
-
alias_method :changed_coefficient_augmented_normalized_tf, :changed_coefficient_augmented_normalized_term_frequency
|
88
|
-
|
89
|
-
# @see https://github.com/louismullie/treat/blob/master/lib/treat/workers/extractors/tf_idf/native.rb#L12
|
90
|
-
#
|
91
|
-
# SMART l, Chisholm LOGA
|
92
|
-
def log_term_frequency(term)
|
93
|
-
count = term_counts[term]
|
94
|
-
if count > 0
|
95
|
-
1 + Math.log(count)
|
96
|
-
else
|
97
|
-
0
|
98
|
-
end
|
99
|
-
end
|
100
|
-
alias_method :log_tf, :log_term_frequency
|
101
|
-
|
102
|
-
# SMART L, Chisholm LOGN
|
103
|
-
def normalized_log_term_frequency(term)
|
104
|
-
count = term_counts[term]
|
105
|
-
if count > 0
|
106
|
-
(1 + Math.log(count)) / (1 + Math.log(average_term_count))
|
107
|
-
else
|
108
|
-
0
|
109
|
-
end
|
110
|
-
end
|
111
|
-
alias_method :normalized_log_tf, :normalized_log_term_frequency
|
112
|
-
|
113
|
-
# Chisholm LOGG
|
114
|
-
def augmented_log_term_frequency(term)
|
115
|
-
count = term_counts[term]
|
116
|
-
if count > 0
|
117
|
-
0.2 + 0.8 * Math.log(count + 1)
|
118
|
-
else
|
119
|
-
0
|
120
|
-
end
|
121
|
-
end
|
122
|
-
alias_method :augmented_log_tf, :augmented_log_term_frequency
|
123
|
-
|
124
|
-
# Chisholm SQRT
|
125
|
-
def square_root_term_frequency(term)
|
126
|
-
count = term_counts[term]
|
127
|
-
if count > 0
|
128
|
-
Math.sqrt(count - 0.5) + 1
|
129
|
-
else
|
130
|
-
0
|
131
|
-
end
|
9
|
+
@average_term_count ||= term_counts.values.reduce(0, :+) / term_counts.size.to_f
|
132
10
|
end
|
133
|
-
alias_method :square_root_tf, :square_root_term_frequency
|
134
11
|
end
|
@@ -0,0 +1,192 @@
|
|
1
|
+
# @note The vss gem does not take the logarithm of the inverse document frequency.
|
2
|
+
# @see https://github.com/mkdynamic/vss/blob/master/lib/vss/engine.rb#L79
|
3
|
+
|
4
|
+
# @note The treat gem does not add one to the inverse document frequency.
|
5
|
+
# @see https://github.com/louismullie/treat/blob/master/lib/treat/workers/extractors/tf_idf/native.rb#L16
|
6
|
+
|
7
|
+
# @note The similarity gem normalizes to the number of tokens in the document.
|
8
|
+
# @see https://github.com/bbcrd/Similarity/blob/master/lib/similarity/document.rb#L42
|
9
|
+
|
10
|
+
# @see http://nlp.stanford.edu/IR-book/html/htmledition/document-and-query-weighting-schemes-1.html
|
11
|
+
# @see http://www.cs.odu.edu/~jbollen/IR04/readings/article1-29-03.pdf
|
12
|
+
# @see http://www.sandia.gov/~tgkolda/pubs/bibtgkfiles/ornl-tm-13756.pdf
|
13
|
+
class TfIdfSimilarity::TfIdfModel
|
14
|
+
# @see https://github.com/louismullie/treat/blob/master/lib/treat/workers/extractors/tf_idf/native.rb#L17
|
15
|
+
#
|
16
|
+
# SMART n, Salton x, Chisholm NONE
|
17
|
+
def no_collection_frequency(term)
|
18
|
+
1.0
|
19
|
+
end
|
20
|
+
|
21
|
+
# @see https://github.com/reddavis/TF-IDF/blob/master/lib/tf_idf.rb#L50
|
22
|
+
#
|
23
|
+
# SMART t, Salton f, Chisholm IDFB
|
24
|
+
def plain_inverse_document_frequency(term, numerator = 0, denominator = 0)
|
25
|
+
log((documents.size + numerator) / (@model.document_count(term).to_f + denominator))
|
26
|
+
end
|
27
|
+
alias_method :plain_idf, :plain_inverse_document_frequency
|
28
|
+
|
29
|
+
# SMART p, Salton p, Chisholm IDFP
|
30
|
+
def probabilistic_inverse_document_frequency(term)
|
31
|
+
count = @model.document_count(term).to_f
|
32
|
+
log((documents.size - count) / count)
|
33
|
+
end
|
34
|
+
alias_method :probabilistic_idf, :probabilistic_inverse_document_frequency
|
35
|
+
|
36
|
+
# Chisholm IGFF
|
37
|
+
def global_frequency_inverse_document_frequency(term)
|
38
|
+
@model.term_count(term) / @model.document_count(term).to_f
|
39
|
+
end
|
40
|
+
alias_method :gfidf, :global_frequency_inverse_document_frequency
|
41
|
+
|
42
|
+
# Chisholm IGFL
|
43
|
+
def log_global_frequency_inverse_document_frequency(term)
|
44
|
+
log(global_frequency_inverse_document_frequency(term) + 1)
|
45
|
+
end
|
46
|
+
alias_method :log_gfidf, :log_global_frequency_inverse_document_frequency
|
47
|
+
|
48
|
+
# Chisholm IGFI
|
49
|
+
def incremented_global_frequency_inverse_document_frequency(term)
|
50
|
+
global_frequency_inverse_document_frequency(term) + 1
|
51
|
+
end
|
52
|
+
alias_method :incremented_gfidf, :incremented_global_frequency_inverse_document_frequency
|
53
|
+
|
54
|
+
# Chisholm IGFS
|
55
|
+
def square_root_global_frequency_inverse_document_frequency(term)
|
56
|
+
sqrt(global_frequency_inverse_document_frequency(term) - 0.9)
|
57
|
+
end
|
58
|
+
alias_method :square_root_gfidf, :square_root_global_frequency_inverse_document_frequency
|
59
|
+
|
60
|
+
# Chisholm ENPY
|
61
|
+
def entropy(term)
|
62
|
+
denominator = @model.term_count(term).to_f
|
63
|
+
logN = log(documents.size)
|
64
|
+
1 + documents.reduce(0) do |sum,document|
|
65
|
+
quotient = document.term_count(term) / denominator
|
66
|
+
sum += quotient * log(quotient) / logN
|
67
|
+
end
|
68
|
+
end
|
69
|
+
|
70
|
+
# @see https://github.com/mkdynamic/vss/blob/master/lib/vss/engine.rb
|
71
|
+
# @see https://github.com/louismullie/treat/blob/master/lib/treat/workers/extractors/tf_idf/native.rb
|
72
|
+
# @see https://github.com/reddavis/TF-IDF/blob/master/lib/tf_idf.rb
|
73
|
+
# @see https://github.com/mchung/tf-idf/blob/master/lib/tf-idf.rb
|
74
|
+
# @see https://github.com/josephwilk/rsemantic/blob/master/lib/semantic/transform/tf_idf_transform.rb
|
75
|
+
#
|
76
|
+
# SMART n, Salton x, Chisholm NONE
|
77
|
+
def no_normalization(matrix)
|
78
|
+
matrix
|
79
|
+
end
|
80
|
+
|
81
|
+
# @see http://nlp.stanford.edu/IR-book/html/htmledition/pivoted-normalized-document-length-1.html
|
82
|
+
#
|
83
|
+
# SMART u, Chisholm PUQN
|
84
|
+
def pivoted_unique_normalization(matrix)
|
85
|
+
raise NotImplementedError
|
86
|
+
end
|
87
|
+
|
88
|
+
# Cosine normalization is implemented as TfIdfSimilarity::MatrixMethods#normalize.
|
89
|
+
#
|
90
|
+
# SMART c, Salton c, Chisholm COSN
|
91
|
+
|
92
|
+
|
93
|
+
|
94
|
+
# The plain term frequency is implemented as TfIdfSimilarity::Document#term_count.
|
95
|
+
#
|
96
|
+
# @see https://github.com/mkdynamic/vss/blob/master/lib/vss/engine.rb#L75
|
97
|
+
# @see https://github.com/louismullie/treat/blob/master/lib/treat/workers/extractors/tf_idf/native.rb#L11
|
98
|
+
#
|
99
|
+
# SMART n, Salton t, Chisholm FREQ
|
100
|
+
|
101
|
+
# SMART b, Salton b, Chisholm BNRY
|
102
|
+
def binary_term_frequency(document, term)
|
103
|
+
count = document.term_count(term)
|
104
|
+
if count > 0
|
105
|
+
1
|
106
|
+
else
|
107
|
+
0
|
108
|
+
end
|
109
|
+
end
|
110
|
+
alias_method :binary_tf, :binary_term_frequency
|
111
|
+
|
112
|
+
# @see http://en.wikipedia.org/wiki/Tf*idf
|
113
|
+
# @see http://nlp.stanford.edu/IR-book/html/htmledition/maximum-tf-normalization-1.html
|
114
|
+
def normalized_term_frequency(document, term, a = 0)
|
115
|
+
a + (1 - a) * document.term_count(term) / document.maximum_term_count
|
116
|
+
end
|
117
|
+
alias_method :normalized_tf, :normalized_term_frequency
|
118
|
+
|
119
|
+
# SMART a, Salton n, Chisholm ATF1
|
120
|
+
def augmented_normalized_term_frequency(document, term)
|
121
|
+
0.5 + 0.5 * normalized_term_frequency(document, term)
|
122
|
+
end
|
123
|
+
alias_method :augmented_normalized_tf, :augmented_normalized_term_frequency
|
124
|
+
|
125
|
+
# Chisholm ATFA
|
126
|
+
def augmented_average_term_frequency(document, term)
|
127
|
+
count = document.term_count(term)
|
128
|
+
if count > 0
|
129
|
+
0.9 + 0.1 * count / document.average_term_count
|
130
|
+
else
|
131
|
+
0
|
132
|
+
end
|
133
|
+
end
|
134
|
+
alias_method :augmented_average_tf, :augmented_average_term_frequency
|
135
|
+
|
136
|
+
# Chisholm ATFC
|
137
|
+
def changed_coefficient_augmented_normalized_term_frequency(document, term)
|
138
|
+
count = document.term_count(term)
|
139
|
+
if count > 0
|
140
|
+
0.2 + 0.8 * count / document.maximum_term_count
|
141
|
+
else
|
142
|
+
0
|
143
|
+
end
|
144
|
+
end
|
145
|
+
alias_method :changed_coefficient_augmented_normalized_tf, :changed_coefficient_augmented_normalized_term_frequency
|
146
|
+
|
147
|
+
# @see https://github.com/louismullie/treat/blob/master/lib/treat/workers/extractors/tf_idf/native.rb#L12
|
148
|
+
#
|
149
|
+
# SMART l, Chisholm LOGA
|
150
|
+
def log_term_frequency(document, term)
|
151
|
+
count = document.term_count(term)
|
152
|
+
if count > 0
|
153
|
+
1 + log(count)
|
154
|
+
else
|
155
|
+
0
|
156
|
+
end
|
157
|
+
end
|
158
|
+
alias_method :log_tf, :log_term_frequency
|
159
|
+
|
160
|
+
# SMART L, Chisholm LOGN
|
161
|
+
def normalized_log_term_frequency(document, term)
|
162
|
+
count = document.term_count(term)
|
163
|
+
if count > 0
|
164
|
+
(1 + log(count)) / (1 + log(document.average_term_count))
|
165
|
+
else
|
166
|
+
0
|
167
|
+
end
|
168
|
+
end
|
169
|
+
alias_method :normalized_log_tf, :normalized_log_term_frequency
|
170
|
+
|
171
|
+
# Chisholm LOGG
|
172
|
+
def augmented_log_term_frequency(document, term)
|
173
|
+
count = document.term_count(term)
|
174
|
+
if count > 0
|
175
|
+
0.2 + 0.8 * log(count + 1)
|
176
|
+
else
|
177
|
+
0
|
178
|
+
end
|
179
|
+
end
|
180
|
+
alias_method :augmented_log_tf, :augmented_log_term_frequency
|
181
|
+
|
182
|
+
# Chisholm SQRT
|
183
|
+
def square_root_term_frequency(document, term)
|
184
|
+
count = document.term_count(term)
|
185
|
+
if count > 0
|
186
|
+
sqrt(count - 0.5) + 1
|
187
|
+
else
|
188
|
+
0
|
189
|
+
end
|
190
|
+
end
|
191
|
+
alias_method :square_root_tf, :square_root_term_frequency
|
192
|
+
end
|