tf-idf-similarity 0.0.9 → 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.travis.yml +29 -0
- data/Gemfile +4 -0
- data/README.md +41 -29
- data/lib/tf-idf-similarity.rb +12 -1
- data/lib/tf-idf-similarity/document.rb +35 -28
- data/lib/tf-idf-similarity/extras/document.rb +2 -125
- data/lib/tf-idf-similarity/extras/tf_idf_model.rb +192 -0
- data/lib/tf-idf-similarity/matrix_methods.rb +164 -0
- data/lib/tf-idf-similarity/term_count_model.rb +78 -0
- data/lib/tf-idf-similarity/tf_idf_model.rb +81 -0
- data/lib/tf-idf-similarity/token.rb +34 -12
- data/lib/tf-idf-similarity/version.rb +1 -1
- data/spec/document_spec.rb +136 -0
- data/spec/extras/tf_idf_model_spec.rb +269 -0
- data/spec/spec_helper.rb +21 -0
- data/spec/term_count_model_spec.rb +108 -0
- data/spec/tf_idf_model_spec.rb +174 -0
- data/spec/token_spec.rb +34 -0
- data/td-idf-similarity.gemspec +3 -3
- metadata +91 -63
- data/lib/tf-idf-similarity/collection.rb +0 -205
- data/lib/tf-idf-similarity/extras/collection.rb +0 -110
data/.travis.yml
CHANGED
@@ -1,3 +1,32 @@
|
|
1
1
|
language: ruby
|
2
2
|
rvm:
|
3
|
+
- 1.8.7
|
4
|
+
- 1.9.2
|
3
5
|
- 1.9.3
|
6
|
+
- 2.0.0
|
7
|
+
- ree
|
8
|
+
env:
|
9
|
+
- MATRIX_LIBRARY=gsl
|
10
|
+
- MATRIX_LIBRARY=narray
|
11
|
+
- MATRIX_LIBRARY=nmatrix
|
12
|
+
- MATRIX_LIBRARY=matrix
|
13
|
+
matrix:
|
14
|
+
exclude:
|
15
|
+
- rvm: 1.8.7
|
16
|
+
env: MATRIX_LIBRARY=nmatrix
|
17
|
+
- rvm: ree
|
18
|
+
env: MATRIX_LIBRARY=nmatrix
|
19
|
+
before_install:
|
20
|
+
- bundle config build.nmatrix --with-lapacklib
|
21
|
+
- if [ $MATRIX_LIBRARY = 'nmatrix' -o $MATRIX_LIBRARY = 'gsl' ]; then sudo apt-get update -qq; fi
|
22
|
+
- if [ $MATRIX_LIBRARY = 'gsl' ]; then sudo apt-get install gsl-bin libgsl0-dev; fi
|
23
|
+
# Installing ATLAS will install BLAS.
|
24
|
+
- if [ $MATRIX_LIBRARY = 'nmatrix' ]; then sudo apt-get install -qq libatlas-dev libatlas-base-dev libatlas3gf-base; fi
|
25
|
+
- if [ $MATRIX_LIBRARY = 'nmatrix' ]; then export CPLUS_INCLUDE_PATH=$CPLUS_INCLUDE_PATH:/usr/include/atlas; fi
|
26
|
+
- if [ $MATRIX_LIBRARY = 'nmatrix' ]; then git clone git://github.com/SciRuby/nmatrix.git; fi
|
27
|
+
- if [ $MATRIX_LIBRARY = 'nmatrix' ]; then cd nmatrix && ORIGINAL_BUNDLE_GEMFILE=$BUNDLE_GEMFILE; fi
|
28
|
+
- if [ $MATRIX_LIBRARY = 'nmatrix' ]; then BUNDLE_GEMFILE=`pwd`/Gemfile && bundle && bundle exec rake install; fi
|
29
|
+
- if [ $MATRIX_LIBRARY = 'nmatrix' ]; then cd .. && BUNDLE_GEMFILE=$ORIGINAL_BUNDLE_GEMFILE; fi
|
30
|
+
# Travis sometimes runs without Bundler.
|
31
|
+
install: bundle
|
32
|
+
script: bundle exec rake --trace
|
data/Gemfile
CHANGED
@@ -1,4 +1,8 @@
|
|
1
1
|
source "http://rubygems.org"
|
2
2
|
|
3
|
+
gem 'gsl', '~> 1.15.3' if ENV['MATRIX_LIBRARY'] == 'gsl'
|
4
|
+
gem 'narray', '~> 0.6.0.0' if ENV['MATRIX_LIBRARY'] == 'narray'
|
5
|
+
gem 'nmatrix', :git => 'git://github.com/SciRuby/nmatrix.git' if ENV['MATRIX_LIBRARY'] == 'nmatrix' && RUBY_VERSION >= '1.9'
|
6
|
+
|
3
7
|
# Specify your gem's dependencies in the gemspec
|
4
8
|
gemspec
|
data/README.md
CHANGED
@@ -1,70 +1,90 @@
|
|
1
1
|
# Ruby Vector Space Model (VSM) with tf*idf weights
|
2
2
|
|
3
|
+
[](http://travis-ci.org/opennorth/tf-idf-similarity)
|
3
4
|
[](https://gemnasium.com/opennorth/tf-idf-similarity)
|
4
|
-
[](https://coveralls.io/r/opennorth/tf-idf-similarity)
|
6
|
+
[](https://codeclimate.com/github/opennorth/tf-idf-similarity)
|
5
7
|
|
6
|
-
Calculates the similarity between texts using a [bag-of-words](http://en.wikipedia.org/wiki/Bag_of_words_model) [Vector Space Model](http://en.wikipedia.org/wiki/Vector_space_model) with [Term Frequency-Inverse Document Frequency
|
8
|
+
Calculates the similarity between texts using a [bag-of-words](http://en.wikipedia.org/wiki/Bag_of_words_model) [Vector Space Model](http://en.wikipedia.org/wiki/Vector_space_model) with [Term Frequency-Inverse Document Frequency (tf*idf)](http://en.wikipedia.org/wiki/Tf*idf
|
9
|
+
) weights. If your use case demands performance, use [Lucene](http://lucene.apache.org/core/) or similar (see below).
|
7
10
|
|
8
11
|
## Usage
|
9
12
|
|
13
|
+
require 'matrix'
|
10
14
|
require 'tf-idf-similarity'
|
11
15
|
|
12
|
-
|
16
|
+
Create a set of documents:
|
17
|
+
|
18
|
+
corpus = []
|
13
19
|
corpus << TfIdfSimilarity::Document.new("Lorem ipsum dolor sit amet...")
|
14
20
|
corpus << TfIdfSimilarity::Document.new("Pellentesque sed ipsum dui...")
|
15
21
|
corpus << TfIdfSimilarity::Document.new("Nam scelerisque dui sed leo...")
|
16
22
|
|
17
|
-
|
23
|
+
Create a document-term matrix using the [Term Frequency-Inverse Document Frequency function](http://en.wikipedia.org/wiki/Tf*idf) (the default):
|
18
24
|
|
19
|
-
|
25
|
+
model = TfIdfSimilarity::TfIdfModel.new(corpus, :function => :tf_idf)
|
20
26
|
|
21
|
-
|
27
|
+
Create a document-term matrix using the [Okapi BM25 ranking function](http://en.wikipedia.org/wiki/Okapi_BM25):
|
22
28
|
|
23
|
-
|
29
|
+
model = TfIdfSimilarity::TfIdfModel.new(corpus, :function => :bm25)
|
30
|
+
|
31
|
+
[Read the documentation at RubyDoc.info.](http://rubydoc.info/gems/tf-idf-similarity)
|
32
|
+
|
33
|
+
## Speed
|
24
34
|
|
25
|
-
|
35
|
+
Instead of using the Ruby Standard Library's [Matrix](http://www.ruby-doc.org/stdlib-2.0/libdoc/matrix/rdoc/Matrix.html) class, you can use one of the `gsl`, `narray` or `nmatrix` gems for faster matrix operations, e.g.:
|
26
36
|
|
27
|
-
|
28
|
-
|
29
|
-
git checkout -b gsl-1.14 83ed49411f076e30ced04c2cbebb054b2645a431
|
30
|
-
brew install gsl
|
31
|
-
git checkout master
|
32
|
-
git branch -d gsl-1.14
|
33
|
-
```
|
37
|
+
require 'gsl'
|
38
|
+
model = TfIdfSimilarity::TfIdfModel.new(corpus, :library => :gsl)
|
34
39
|
|
35
|
-
|
40
|
+
### [GNU Scientific Library (GSL)](http://www.gnu.org/software/gsl/)
|
36
41
|
|
37
|
-
gem install gsl
|
42
|
+
gem install gsl
|
38
43
|
|
39
44
|
### [NArray](http://narray.rubyforge.org/)
|
40
45
|
|
41
46
|
gem install narray
|
42
47
|
|
48
|
+
### [NMatrix](https://github.com/SciRuby/nmatrix)
|
49
|
+
|
50
|
+
The nmatrix gem gives access to [Automatically Tuned Linear Algebra Software (ATLAS)](http://math-atlas.sourceforge.net/), which you may know of through [Linear Algebra PACKage (LAPACK)](http://www.netlib.org/lapack/) or [Basic Linear Algebra Subprograms (BLAS)](http://www.netlib.org/blas/). Follow [these instructions](https://github.com/SciRuby/nmatrix#synopsis) to install the nmatrix gem. You may need [additional instructions for Mac OS X Lion](https://github.com/SciRuby/nmatrix/wiki/Installation).
|
51
|
+
|
43
52
|
## Extras
|
44
53
|
|
45
54
|
You can access more term frequency, document frequency, and normalization formulas with:
|
46
55
|
|
47
|
-
require 'tf-idf-similarity/extras/collection'
|
48
56
|
require 'tf-idf-similarity/extras/document'
|
57
|
+
require 'tf-idf-similarity/extras/tf_idf_model'
|
49
58
|
|
50
59
|
The default tf*idf formula follows the [Lucene Conceptual Scoring Formula](http://lucene.apache.org/core/4_0_0-BETA/core/org/apache/lucene/search/similarities/TFIDFSimilarity.html).
|
51
60
|
|
52
61
|
## Why?
|
53
62
|
|
54
|
-
|
63
|
+
At the time of writing, no other Ruby gem implemented the tf*idf formula used by Lucene, Sphinx and Ferret.
|
64
|
+
|
65
|
+
* [rsemantic](https://github.com/josephwilk/rsemantic) now uses the same [term frequency](https://github.com/josephwilk/rsemantic/blob/master/lib/semantic/transform/tf_idf_transform.rb#L14) and [document frequency](https://github.com/josephwilk/rsemantic/blob/master/lib/semantic/transform/tf_idf_transform.rb#L13) formulas as Lucene.
|
66
|
+
* [treat](https://github.com/louismullie/treat) offers many term frequency formulas, [one of which](https://github.com/louismullie/treat/blob/master/lib/treat/workers/extractors/tf_idf/native.rb#L13) is the same as Lucene.
|
67
|
+
* [similarity](https://github.com/bbcrd/Similarity) uses [cosine normalization](https://github.com/bbcrd/Similarity/blob/master/lib/similarity/term_document_matrix.rb#L23), which corresponds roughly to Lucene.
|
55
68
|
|
56
69
|
### Term frequencies
|
57
70
|
|
58
|
-
The [vss](https://github.com/mkdynamic/vss) gem does not normalize the frequency of a term in a document; this occurs frequently in the academic literature, but only to demonstrate why normalization is important. The [
|
71
|
+
The [vss](https://github.com/mkdynamic/vss) gem does not normalize the frequency of a term in a document; this occurs frequently in the academic literature, but only to demonstrate why normalization is important. The [tf_idf](https://github.com/reddavis/TF-IDF) and similarity gems normalize the frequency of a term in a document to the number of terms in that document, which never occurs in the literature. The [tf-idf](https://github.com/mchung/tf-idf) gem normalizes the frequency of a term in a document to the number of *unique* terms in that document, which never occurs in the literature.
|
59
72
|
|
60
73
|
### Document frequencies
|
61
74
|
|
62
|
-
The vss gem does not normalize the inverse document frequency. The tf_idf, tf-idf
|
75
|
+
The vss gem does not normalize the inverse document frequency. The treat, tf_idf, tf-idf and similarity gems use variants of the typical inverse document frequency formula.
|
63
76
|
|
64
77
|
### Normalization
|
65
78
|
|
66
79
|
The treat, tf_idf, tf-idf, rsemantic and vss gems have no normalization component.
|
67
80
|
|
81
|
+
## Additional adapters
|
82
|
+
|
83
|
+
Adapters for the following projects were also considered:
|
84
|
+
|
85
|
+
* [Ruby-LAPACK](http://ruby.gfd-dennou.org/products/ruby-lapack/) is a very thin wrapper around LAPACK, which has an opaque Fortran-style naming scheme.
|
86
|
+
* [Linalg](https://github.com/quix/linalg) and [RNum](http://rnum.rubyforge.org/) give access to LAPACK from Ruby, but are old and unavailable as gems.
|
87
|
+
|
68
88
|
## Reference
|
69
89
|
|
70
90
|
* [G. Salton and C. Buckley. "Term Weighting Approaches in Automatic Text Retrieval." Technical Report. Cornell University, Ithaca, NY, USA. 1987.](http://www.cs.odu.edu/~jbollen/IR04/readings/article1-29-03.pdf)
|
@@ -81,14 +101,6 @@ Lucene implements many more [similarity functions](http://lucene.apache.org/core
|
|
81
101
|
|
82
102
|
Lucene can even [combine similarity measures](http://lucene.apache.org/core/4_0_0-BETA/core/org/apache/lucene/search/similarities/MultiSimilarity.html).
|
83
103
|
|
84
|
-
## Other optimizations
|
85
|
-
|
86
|
-
[Automatically Tuned Linear Algebra Software (ATLAS)](http://math-atlas.sourceforge.net/) is available through [Linear Algebra PACKage (LAPACK)](http://www.netlib.org/lapack/) or [Basic Linear Algebra Subprograms (BLAS)](http://www.netlib.org/blas/). You can use it through the next release (after `0.0.2`) of the [nmatrix gem](https://github.com/SciRuby/nmatrix). Follow [these instructions](https://github.com/SciRuby/nmatrix#synopsis) to install it. You may need [additional instructions for Mac OS X Lion](https://github.com/SciRuby/nmatrix/wiki/NMatrix-Installation).
|
87
|
-
|
88
|
-
### Other Options
|
89
|
-
|
90
|
-
[Ruby-LAPACK](http://ruby.gfd-dennou.org/products/ruby-lapack/) is a very thin wrapper around LAPACK, which has an opaque Fortran-style naming scheme. [Linalg](https://github.com/quix/linalg) and [RNum](http://rnum.rubyforge.org/) are old and not available as gems.
|
91
|
-
|
92
104
|
## Bugs? Questions?
|
93
105
|
|
94
106
|
This gem's main repository is on GitHub: [http://github.com/opennorth/tf-idf-similarity](http://github.com/opennorth/tf-idf-similarity), where your contributions, forks, bug reports, feature requests, and feedback are greatly welcomed.
|
data/lib/tf-idf-similarity.rb
CHANGED
@@ -1,6 +1,17 @@
|
|
1
|
+
require 'forwardable'
|
2
|
+
require 'set'
|
3
|
+
|
4
|
+
begin
|
5
|
+
require 'unicode_utils'
|
6
|
+
rescue LoadError
|
7
|
+
# Ruby 1.8
|
8
|
+
end
|
9
|
+
|
1
10
|
module TfIdfSimilarity
|
2
11
|
end
|
3
12
|
|
4
|
-
require 'tf-idf-similarity/
|
13
|
+
require 'tf-idf-similarity/matrix_methods'
|
14
|
+
require 'tf-idf-similarity/term_count_model'
|
15
|
+
require 'tf-idf-similarity/tf_idf_model'
|
5
16
|
require 'tf-idf-similarity/document'
|
6
17
|
require 'tf-idf-similarity/token'
|
@@ -1,56 +1,63 @@
|
|
1
|
-
#
|
2
|
-
require 'unicode_utils'
|
3
|
-
|
1
|
+
# A document.
|
4
2
|
class TfIdfSimilarity::Document
|
5
|
-
#
|
3
|
+
# The document's identifier.
|
6
4
|
attr_reader :id
|
7
5
|
# The document's text.
|
8
6
|
attr_reader :text
|
9
|
-
# The document's tokenized text.
|
10
|
-
attr_reader :tokens
|
11
7
|
# The number of times each term appears in the document.
|
12
8
|
attr_reader :term_counts
|
13
|
-
# The
|
9
|
+
# The number of tokens in the document.
|
14
10
|
attr_reader :size
|
15
11
|
|
16
12
|
# @param [String] text the document's text
|
17
13
|
# @param [Hash] opts optional arguments
|
18
|
-
# @option opts [String] :id
|
14
|
+
# @option opts [String] :id the document's identifier
|
19
15
|
# @option opts [Array] :tokens the document's tokenized text
|
16
|
+
# @option opts [Hash] :term_counts the number of times each term appears
|
17
|
+
# @option opts [Integer] :size the number of tokens in the document
|
20
18
|
def initialize(text, opts = {})
|
21
|
-
@text
|
22
|
-
@id
|
23
|
-
@tokens
|
24
|
-
|
25
|
-
|
19
|
+
@text = text
|
20
|
+
@id = opts[:id] || object_id
|
21
|
+
@tokens = opts[:tokens]
|
22
|
+
|
23
|
+
if opts[:term_counts]
|
24
|
+
@term_counts = opts[:term_counts]
|
25
|
+
@size = opts[:size] || term_counts.values.reduce(0, :+)
|
26
|
+
# Nothing to do.
|
27
|
+
else
|
28
|
+
@term_counts = Hash.new(0)
|
29
|
+
@size = 0
|
30
|
+
set_term_counts_and_size
|
31
|
+
end
|
26
32
|
end
|
27
33
|
|
28
|
-
#
|
34
|
+
# Returns the set of terms in the document.
|
35
|
+
#
|
36
|
+
# @return [Array<String>] the unique terms in the document
|
29
37
|
def terms
|
30
38
|
term_counts.keys
|
31
39
|
end
|
32
|
-
|
33
|
-
#
|
34
|
-
# @return [Float] the square root of the term count
|
40
|
+
|
41
|
+
# Returns the number of occurrences of the term in the document.
|
35
42
|
#
|
36
|
-
# @
|
37
|
-
# @
|
38
|
-
def
|
39
|
-
|
43
|
+
# @param [String] term a term
|
44
|
+
# @return [Integer] the number of times the term appears in the document
|
45
|
+
def term_count(term)
|
46
|
+
term_counts[term].to_i # need #to_i if unmarshalled
|
40
47
|
end
|
41
|
-
alias_method :tf, :term_frequency
|
42
48
|
|
43
49
|
private
|
44
50
|
|
45
|
-
#
|
46
|
-
def
|
51
|
+
# Tokenizes the text and counts terms and total tokens.
|
52
|
+
def set_term_counts_and_size
|
47
53
|
tokenize(text).each do |word|
|
48
|
-
token = TfIdfSimilarity::Token.new
|
54
|
+
token = TfIdfSimilarity::Token.new(word)
|
49
55
|
if token.valid?
|
50
|
-
|
56
|
+
term = token.lowercase_filter.classic_filter.to_s
|
57
|
+
@term_counts[term] += 1
|
58
|
+
@size += 1
|
51
59
|
end
|
52
60
|
end
|
53
|
-
@size = term_counts.values.reduce(:+)
|
54
61
|
end
|
55
62
|
|
56
63
|
# Tokenizes a text, respecting the word boundary rules from Unicode’s Default
|
@@ -68,6 +75,6 @@ private
|
|
68
75
|
# @see http://unicode.org/reports/tr29/#Default_Word_Boundaries
|
69
76
|
# @see http://wiki.apache.org/solr/AnalyzersTokenizersTokenFilters#solr.StandardTokenizerFactory
|
70
77
|
def tokenize(text)
|
71
|
-
@tokens || UnicodeUtils.each_word(text)
|
78
|
+
@tokens || defined?(UnicodeUtils) && UnicodeUtils.each_word(text) || text.split(/\b/) # @todo Ruby 1.8 has no good word boundary code
|
72
79
|
end
|
73
80
|
end
|
@@ -1,134 +1,11 @@
|
|
1
|
-
require 'tf-idf-similarity/document'
|
2
|
-
|
3
|
-
# @todo http://nlp.stanford.edu/IR-book/html/htmledition/maximum-tf-normalization-1.html
|
4
|
-
#
|
5
|
-
# @note The treat, tf_idf, similarity and rsemantic gems normalizes to the number of terms in the document.
|
6
|
-
# @see https://github.com/louismullie/treat/blob/master/lib/treat/workers/extractors/tf_idf/native.rb#L77
|
7
|
-
# @see https://github.com/reddavis/TF-IDF/blob/master/lib/tf_idf.rb#L76
|
8
|
-
# @see https://github.com/bbcrd/Similarity/blob/master/lib/similarity/document.rb#L42
|
9
|
-
# @see https://github.com/josephwilk/rsemantic/blob/master/lib/semantic/transform/tf_idf_transform.rb#L17
|
10
|
-
#
|
11
|
-
# @note The tf-idf gem normalizes to the number of unique terms in the document.
|
12
|
-
# @see https://github.com/mchung/tf-idf/blob/master/lib/tf-idf.rb#L172
|
13
|
-
#
|
14
|
-
# @see http://nlp.stanford.edu/IR-book/html/htmledition/document-and-query-weighting-schemes-1.html
|
15
|
-
# @see http://www.cs.odu.edu/~jbollen/IR04/readings/article1-29-03.pdf
|
16
|
-
# @see http://www.sandia.gov/~tgkolda/pubs/bibtgkfiles/ornl-tm-13756.pdf
|
17
1
|
class TfIdfSimilarity::Document
|
18
2
|
# @return [Float] the maximum term count of any term in the document
|
19
3
|
def maximum_term_count
|
20
|
-
@maximum_term_count ||=
|
4
|
+
@maximum_term_count ||= term_counts.values.max.to_f
|
21
5
|
end
|
22
6
|
|
23
7
|
# @return [Float] the average term count of all terms in the document
|
24
8
|
def average_term_count
|
25
|
-
@average_term_count ||=
|
26
|
-
end
|
27
|
-
|
28
|
-
# Returns the term count.
|
29
|
-
# @see https://github.com/mkdynamic/vss/blob/master/lib/vss/engine.rb#L75
|
30
|
-
# @see https://github.com/louismullie/treat/blob/master/lib/treat/workers/extractors/tf_idf/native.rb#L11
|
31
|
-
#
|
32
|
-
# SMART n, Salton t, Chisholm FREQ
|
33
|
-
def plain_term_frequency(term)
|
34
|
-
term_counts[term]
|
35
|
-
end
|
36
|
-
alias :plain_tf, :plain_term_frequency
|
37
|
-
|
38
|
-
# Returns 1 if the term is present, 0 otherwise.
|
39
|
-
#
|
40
|
-
# SMART b, Salton b, Chisholm BNRY
|
41
|
-
def binary_term_frequency(term)
|
42
|
-
count = term_counts[term]
|
43
|
-
if count > 0
|
44
|
-
1
|
45
|
-
else
|
46
|
-
0
|
47
|
-
end
|
48
|
-
end
|
49
|
-
alias_method :binary_tf, :binary_term_frequency
|
50
|
-
|
51
|
-
# Normalizes the term count by the maximum term count.
|
52
|
-
#
|
53
|
-
# @see http://en.wikipedia.org/wiki/Tf*idf
|
54
|
-
def normalized_term_frequency(term)
|
55
|
-
term_counts[term] / maximum_term_count
|
56
|
-
end
|
57
|
-
alias_method :normalized_tf, :normalized_term_frequency
|
58
|
-
|
59
|
-
# Further normalizes the normalized term frequency to lie between 0.5 and 1.
|
60
|
-
#
|
61
|
-
# SMART a, Salton n, Chisholm ATF1
|
62
|
-
def augmented_normalized_term_frequency(term)
|
63
|
-
0.5 + 0.5 * normalized_term_frequency(term)
|
64
|
-
end
|
65
|
-
alias_method :augmented_normalized_tf, :augmented_normalized_term_frequency
|
66
|
-
|
67
|
-
# Chisholm ATFA
|
68
|
-
def augmented_average_term_frequency(term)
|
69
|
-
count = term_counts[term]
|
70
|
-
if count > 0
|
71
|
-
0.9 + 0.1 * count / average_term_count
|
72
|
-
else
|
73
|
-
0
|
74
|
-
end
|
75
|
-
end
|
76
|
-
alias_method :augmented_average_tf, :augmented_average_term_frequency
|
77
|
-
|
78
|
-
# Chisholm ATFC
|
79
|
-
def changed_coefficient_augmented_normalized_term_frequency(term)
|
80
|
-
count = term_counts[term]
|
81
|
-
if count > 0
|
82
|
-
0.2 + 0.8 * count / maximum_term_count
|
83
|
-
else
|
84
|
-
0
|
85
|
-
end
|
86
|
-
end
|
87
|
-
alias_method :changed_coefficient_augmented_normalized_tf, :changed_coefficient_augmented_normalized_term_frequency
|
88
|
-
|
89
|
-
# @see https://github.com/louismullie/treat/blob/master/lib/treat/workers/extractors/tf_idf/native.rb#L12
|
90
|
-
#
|
91
|
-
# SMART l, Chisholm LOGA
|
92
|
-
def log_term_frequency(term)
|
93
|
-
count = term_counts[term]
|
94
|
-
if count > 0
|
95
|
-
1 + Math.log(count)
|
96
|
-
else
|
97
|
-
0
|
98
|
-
end
|
99
|
-
end
|
100
|
-
alias_method :log_tf, :log_term_frequency
|
101
|
-
|
102
|
-
# SMART L, Chisholm LOGN
|
103
|
-
def normalized_log_term_frequency(term)
|
104
|
-
count = term_counts[term]
|
105
|
-
if count > 0
|
106
|
-
(1 + Math.log(count)) / (1 + Math.log(average_term_count))
|
107
|
-
else
|
108
|
-
0
|
109
|
-
end
|
110
|
-
end
|
111
|
-
alias_method :normalized_log_tf, :normalized_log_term_frequency
|
112
|
-
|
113
|
-
# Chisholm LOGG
|
114
|
-
def augmented_log_term_frequency(term)
|
115
|
-
count = term_counts[term]
|
116
|
-
if count > 0
|
117
|
-
0.2 + 0.8 * Math.log(count + 1)
|
118
|
-
else
|
119
|
-
0
|
120
|
-
end
|
121
|
-
end
|
122
|
-
alias_method :augmented_log_tf, :augmented_log_term_frequency
|
123
|
-
|
124
|
-
# Chisholm SQRT
|
125
|
-
def square_root_term_frequency(term)
|
126
|
-
count = term_counts[term]
|
127
|
-
if count > 0
|
128
|
-
Math.sqrt(count - 0.5) + 1
|
129
|
-
else
|
130
|
-
0
|
131
|
-
end
|
9
|
+
@average_term_count ||= term_counts.values.reduce(0, :+) / term_counts.size.to_f
|
132
10
|
end
|
133
|
-
alias_method :square_root_tf, :square_root_term_frequency
|
134
11
|
end
|
@@ -0,0 +1,192 @@
|
|
1
|
+
# @note The vss gem does not take the logarithm of the inverse document frequency.
|
2
|
+
# @see https://github.com/mkdynamic/vss/blob/master/lib/vss/engine.rb#L79
|
3
|
+
|
4
|
+
# @note The treat gem does not add one to the inverse document frequency.
|
5
|
+
# @see https://github.com/louismullie/treat/blob/master/lib/treat/workers/extractors/tf_idf/native.rb#L16
|
6
|
+
|
7
|
+
# @note The similarity gem normalizes to the number of tokens in the document.
|
8
|
+
# @see https://github.com/bbcrd/Similarity/blob/master/lib/similarity/document.rb#L42
|
9
|
+
|
10
|
+
# @see http://nlp.stanford.edu/IR-book/html/htmledition/document-and-query-weighting-schemes-1.html
|
11
|
+
# @see http://www.cs.odu.edu/~jbollen/IR04/readings/article1-29-03.pdf
|
12
|
+
# @see http://www.sandia.gov/~tgkolda/pubs/bibtgkfiles/ornl-tm-13756.pdf
|
13
|
+
class TfIdfSimilarity::TfIdfModel
|
14
|
+
# @see https://github.com/louismullie/treat/blob/master/lib/treat/workers/extractors/tf_idf/native.rb#L17
|
15
|
+
#
|
16
|
+
# SMART n, Salton x, Chisholm NONE
|
17
|
+
def no_collection_frequency(term)
|
18
|
+
1.0
|
19
|
+
end
|
20
|
+
|
21
|
+
# @see https://github.com/reddavis/TF-IDF/blob/master/lib/tf_idf.rb#L50
|
22
|
+
#
|
23
|
+
# SMART t, Salton f, Chisholm IDFB
|
24
|
+
def plain_inverse_document_frequency(term, numerator = 0, denominator = 0)
|
25
|
+
log((documents.size + numerator) / (@model.document_count(term).to_f + denominator))
|
26
|
+
end
|
27
|
+
alias_method :plain_idf, :plain_inverse_document_frequency
|
28
|
+
|
29
|
+
# SMART p, Salton p, Chisholm IDFP
|
30
|
+
def probabilistic_inverse_document_frequency(term)
|
31
|
+
count = @model.document_count(term).to_f
|
32
|
+
log((documents.size - count) / count)
|
33
|
+
end
|
34
|
+
alias_method :probabilistic_idf, :probabilistic_inverse_document_frequency
|
35
|
+
|
36
|
+
# Chisholm IGFF
|
37
|
+
def global_frequency_inverse_document_frequency(term)
|
38
|
+
@model.term_count(term) / @model.document_count(term).to_f
|
39
|
+
end
|
40
|
+
alias_method :gfidf, :global_frequency_inverse_document_frequency
|
41
|
+
|
42
|
+
# Chisholm IGFL
|
43
|
+
def log_global_frequency_inverse_document_frequency(term)
|
44
|
+
log(global_frequency_inverse_document_frequency(term) + 1)
|
45
|
+
end
|
46
|
+
alias_method :log_gfidf, :log_global_frequency_inverse_document_frequency
|
47
|
+
|
48
|
+
# Chisholm IGFI
|
49
|
+
def incremented_global_frequency_inverse_document_frequency(term)
|
50
|
+
global_frequency_inverse_document_frequency(term) + 1
|
51
|
+
end
|
52
|
+
alias_method :incremented_gfidf, :incremented_global_frequency_inverse_document_frequency
|
53
|
+
|
54
|
+
# Chisholm IGFS
|
55
|
+
def square_root_global_frequency_inverse_document_frequency(term)
|
56
|
+
sqrt(global_frequency_inverse_document_frequency(term) - 0.9)
|
57
|
+
end
|
58
|
+
alias_method :square_root_gfidf, :square_root_global_frequency_inverse_document_frequency
|
59
|
+
|
60
|
+
# Chisholm ENPY
|
61
|
+
def entropy(term)
|
62
|
+
denominator = @model.term_count(term).to_f
|
63
|
+
logN = log(documents.size)
|
64
|
+
1 + documents.reduce(0) do |sum,document|
|
65
|
+
quotient = document.term_count(term) / denominator
|
66
|
+
sum += quotient * log(quotient) / logN
|
67
|
+
end
|
68
|
+
end
|
69
|
+
|
70
|
+
# @see https://github.com/mkdynamic/vss/blob/master/lib/vss/engine.rb
|
71
|
+
# @see https://github.com/louismullie/treat/blob/master/lib/treat/workers/extractors/tf_idf/native.rb
|
72
|
+
# @see https://github.com/reddavis/TF-IDF/blob/master/lib/tf_idf.rb
|
73
|
+
# @see https://github.com/mchung/tf-idf/blob/master/lib/tf-idf.rb
|
74
|
+
# @see https://github.com/josephwilk/rsemantic/blob/master/lib/semantic/transform/tf_idf_transform.rb
|
75
|
+
#
|
76
|
+
# SMART n, Salton x, Chisholm NONE
|
77
|
+
def no_normalization(matrix)
|
78
|
+
matrix
|
79
|
+
end
|
80
|
+
|
81
|
+
# @see http://nlp.stanford.edu/IR-book/html/htmledition/pivoted-normalized-document-length-1.html
|
82
|
+
#
|
83
|
+
# SMART u, Chisholm PUQN
|
84
|
+
def pivoted_unique_normalization(matrix)
|
85
|
+
raise NotImplementedError
|
86
|
+
end
|
87
|
+
|
88
|
+
# Cosine normalization is implemented as TfIdfSimilarity::MatrixMethods#normalize.
|
89
|
+
#
|
90
|
+
# SMART c, Salton c, Chisholm COSN
|
91
|
+
|
92
|
+
|
93
|
+
|
94
|
+
# The plain term frequency is implemented as TfIdfSimilarity::Document#term_count.
|
95
|
+
#
|
96
|
+
# @see https://github.com/mkdynamic/vss/blob/master/lib/vss/engine.rb#L75
|
97
|
+
# @see https://github.com/louismullie/treat/blob/master/lib/treat/workers/extractors/tf_idf/native.rb#L11
|
98
|
+
#
|
99
|
+
# SMART n, Salton t, Chisholm FREQ
|
100
|
+
|
101
|
+
# SMART b, Salton b, Chisholm BNRY
|
102
|
+
def binary_term_frequency(document, term)
|
103
|
+
count = document.term_count(term)
|
104
|
+
if count > 0
|
105
|
+
1
|
106
|
+
else
|
107
|
+
0
|
108
|
+
end
|
109
|
+
end
|
110
|
+
alias_method :binary_tf, :binary_term_frequency
|
111
|
+
|
112
|
+
# @see http://en.wikipedia.org/wiki/Tf*idf
|
113
|
+
# @see http://nlp.stanford.edu/IR-book/html/htmledition/maximum-tf-normalization-1.html
|
114
|
+
def normalized_term_frequency(document, term, a = 0)
|
115
|
+
a + (1 - a) * document.term_count(term) / document.maximum_term_count
|
116
|
+
end
|
117
|
+
alias_method :normalized_tf, :normalized_term_frequency
|
118
|
+
|
119
|
+
# SMART a, Salton n, Chisholm ATF1
|
120
|
+
def augmented_normalized_term_frequency(document, term)
|
121
|
+
0.5 + 0.5 * normalized_term_frequency(document, term)
|
122
|
+
end
|
123
|
+
alias_method :augmented_normalized_tf, :augmented_normalized_term_frequency
|
124
|
+
|
125
|
+
# Chisholm ATFA
|
126
|
+
def augmented_average_term_frequency(document, term)
|
127
|
+
count = document.term_count(term)
|
128
|
+
if count > 0
|
129
|
+
0.9 + 0.1 * count / document.average_term_count
|
130
|
+
else
|
131
|
+
0
|
132
|
+
end
|
133
|
+
end
|
134
|
+
alias_method :augmented_average_tf, :augmented_average_term_frequency
|
135
|
+
|
136
|
+
# Chisholm ATFC
|
137
|
+
def changed_coefficient_augmented_normalized_term_frequency(document, term)
|
138
|
+
count = document.term_count(term)
|
139
|
+
if count > 0
|
140
|
+
0.2 + 0.8 * count / document.maximum_term_count
|
141
|
+
else
|
142
|
+
0
|
143
|
+
end
|
144
|
+
end
|
145
|
+
alias_method :changed_coefficient_augmented_normalized_tf, :changed_coefficient_augmented_normalized_term_frequency
|
146
|
+
|
147
|
+
# @see https://github.com/louismullie/treat/blob/master/lib/treat/workers/extractors/tf_idf/native.rb#L12
|
148
|
+
#
|
149
|
+
# SMART l, Chisholm LOGA
|
150
|
+
def log_term_frequency(document, term)
|
151
|
+
count = document.term_count(term)
|
152
|
+
if count > 0
|
153
|
+
1 + log(count)
|
154
|
+
else
|
155
|
+
0
|
156
|
+
end
|
157
|
+
end
|
158
|
+
alias_method :log_tf, :log_term_frequency
|
159
|
+
|
160
|
+
# SMART L, Chisholm LOGN
|
161
|
+
def normalized_log_term_frequency(document, term)
|
162
|
+
count = document.term_count(term)
|
163
|
+
if count > 0
|
164
|
+
(1 + log(count)) / (1 + log(document.average_term_count))
|
165
|
+
else
|
166
|
+
0
|
167
|
+
end
|
168
|
+
end
|
169
|
+
alias_method :normalized_log_tf, :normalized_log_term_frequency
|
170
|
+
|
171
|
+
# Chisholm LOGG
|
172
|
+
def augmented_log_term_frequency(document, term)
|
173
|
+
count = document.term_count(term)
|
174
|
+
if count > 0
|
175
|
+
0.2 + 0.8 * log(count + 1)
|
176
|
+
else
|
177
|
+
0
|
178
|
+
end
|
179
|
+
end
|
180
|
+
alias_method :augmented_log_tf, :augmented_log_term_frequency
|
181
|
+
|
182
|
+
# Chisholm SQRT
|
183
|
+
def square_root_term_frequency(document, term)
|
184
|
+
count = document.term_count(term)
|
185
|
+
if count > 0
|
186
|
+
sqrt(count - 0.5) + 1
|
187
|
+
else
|
188
|
+
0
|
189
|
+
end
|
190
|
+
end
|
191
|
+
alias_method :square_root_tf, :square_root_term_frequency
|
192
|
+
end
|