tf-idf-similarity 0.0.9 → 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/.travis.yml +29 -0
- data/Gemfile +4 -0
- data/README.md +41 -29
- data/lib/tf-idf-similarity.rb +12 -1
- data/lib/tf-idf-similarity/document.rb +35 -28
- data/lib/tf-idf-similarity/extras/document.rb +2 -125
- data/lib/tf-idf-similarity/extras/tf_idf_model.rb +192 -0
- data/lib/tf-idf-similarity/matrix_methods.rb +164 -0
- data/lib/tf-idf-similarity/term_count_model.rb +78 -0
- data/lib/tf-idf-similarity/tf_idf_model.rb +81 -0
- data/lib/tf-idf-similarity/token.rb +34 -12
- data/lib/tf-idf-similarity/version.rb +1 -1
- data/spec/document_spec.rb +136 -0
- data/spec/extras/tf_idf_model_spec.rb +269 -0
- data/spec/spec_helper.rb +21 -0
- data/spec/term_count_model_spec.rb +108 -0
- data/spec/tf_idf_model_spec.rb +174 -0
- data/spec/token_spec.rb +34 -0
- data/td-idf-similarity.gemspec +3 -3
- metadata +91 -63
- data/lib/tf-idf-similarity/collection.rb +0 -205
- data/lib/tf-idf-similarity/extras/collection.rb +0 -110
@@ -1,110 +0,0 @@
|
|
1
|
-
require 'tf-idf-similarity/collection'
|
2
|
-
|
3
|
-
# @note The treat and similarity gems do not add one to the inverse document frequency.
|
4
|
-
# @see https://github.com/louismullie/treat/blob/master/lib/treat/workers/extractors/tf_idf/native.rb#L16
|
5
|
-
# @see https://github.com/bbcrd/Similarity/blob/master/lib/similarity/corpus.rb#L44
|
6
|
-
#
|
7
|
-
# @note The tf-idf gem adds one to the numerator when calculating inverse document frequency.
|
8
|
-
# @see https://github.com/mchung/tf-idf/blob/master/lib/tf-idf.rb#L153
|
9
|
-
#
|
10
|
-
# @note The vss gem does not take the logarithm of the inverse document frequency.
|
11
|
-
# @see https://github.com/mkdynamic/vss/blob/master/lib/vss/engine.rb#L79
|
12
|
-
#
|
13
|
-
# @see http://nlp.stanford.edu/IR-book/html/htmledition/document-and-query-weighting-schemes-1.html
|
14
|
-
# @see http://www.cs.odu.edu/~jbollen/IR04/readings/article1-29-03.pdf
|
15
|
-
# @see http://www.sandia.gov/~tgkolda/pubs/bibtgkfiles/ornl-tm-13756.pdf
|
16
|
-
class TfIdfSimilarity::Collection
  # Collection-frequency weight that ignores the collection entirely.
  #
  # @param term the term to weight (unused)
  # @return [Float] always 1.0
  # @see https://github.com/louismullie/treat/blob/master/lib/treat/workers/extractors/tf_idf/native.rb#L17
  #
  # SMART n, Salton x, Chisholm NONE
  def no_collection_frequency(term)
    1.0
  end

  # Natural log of `documents.size / document_counts[term]`, with no
  # smoothing term added.
  #
  # @param term the term to weight
  # @return [Float] the unsmoothed inverse document frequency
  # @see https://github.com/reddavis/TF-IDF/blob/master/lib/tf_idf.rb#L50
  # @see https://github.com/josephwilk/rsemantic/blob/master/lib/semantic/transform/tf_idf_transform.rb#L15
  #
  # SMART t, Salton f, Chisholm IDFB
  def plain_inverse_document_frequency(term)
    Math.log(documents.size / document_counts[term].to_f)
  end
  alias_method :plain_idf, :plain_inverse_document_frequency

  # Natural log of `(documents.size - count) / count`, where `count` is
  # `document_counts[term]`.
  #
  # @param term the term to weight
  # @return [Float] the probabilistic inverse document frequency
  #
  # SMART p, Salton p, Chisholm IDFP
  def probabilistic_inverse_document_frequency(term)
    count = document_counts[term].to_f
    Math.log((documents.size - count) / count)
  end
  alias_method :probabilistic_idf, :probabilistic_inverse_document_frequency

  # Ratio `term_counts[term] / document_counts[term]`.
  #
  # @param term the term to weight
  # @return [Float] the global frequency inverse document frequency
  #
  # Chisholm IGFF
  def global_frequency_inverse_document_frequency(term)
    term_counts[term] / document_counts[term].to_f
  end
  alias_method :gfidf, :global_frequency_inverse_document_frequency

  # Natural log of (GFIDF + 1); the increment is applied before the log.
  #
  # @param term the term to weight
  # @return [Float] the log-scaled GFIDF
  #
  # Chisholm IGFL
  def log_global_frequency_inverse_document_frequency(term)
    Math.log(global_frequency_inverse_document_frequency(term) + 1)
  end
  alias_method :log_gfidf, :log_global_frequency_inverse_document_frequency

  # GFIDF + 1.
  #
  # @param term the term to weight
  # @return [Float] the incremented GFIDF
  #
  # Chisholm IGFI
  def incremented_global_frequency_inverse_document_frequency(term)
    global_frequency_inverse_document_frequency(term) + 1
  end
  alias_method :incremented_gfidf, :incremented_global_frequency_inverse_document_frequency

  # Square root of (GFIDF - 0.9); the subtraction is applied before the root.
  #
  # @param term the term to weight
  # @return [Float] the square-root-scaled GFIDF
  #
  # Chisholm IGFS
  def square_root_global_frequency_inverse_document_frequency(term)
    Math.sqrt(global_frequency_inverse_document_frequency(term) - 0.9)
  end
  alias_method :square_root_gfidf, :square_root_global_frequency_inverse_document_frequency

  # Entropy weight: 1 plus the sum, over all documents, of
  # `q * log(q) / log(N)`, where `q` is the document's count of the term
  # divided by `term_counts[term]` and `N` is `documents.size`.
  #
  # @param term the term to weight
  # @return [Numeric] the entropy weight of the term
  #
  # Chisholm ENPY
  def entropy(term)
    total = term_counts[term].to_f
    log_n = Math.log(documents.size)

    # With exactly one document log(N) is 0, so every summand would divide by
    # zero and the result would be NaN or infinite.
    # NOTE(review): 1.0 is returned on the reasoning that a term cannot be
    # spread across documents when there is only one - confirm this is the
    # weight callers want.
    return 1.0 if log_n.zero?

    spread = documents.reduce(0) do |sum, document|
      quotient = document.term_counts[term] / total
      # A document without the term contributes nothing (0 * log 0 is taken
      # as 0); evaluating it literally gives 0 * -Infinity = NaN, which would
      # poison the whole sum.
      next sum if quotient.zero?

      sum + quotient * Math.log(quotient) / log_n
    end

    1 + spread
  end

  # Leaves the matrix untouched.
  #
  # @param [Matrix] matrix a term-document matrix
  # @return [Matrix] the same matrix
  # @see https://github.com/mkdynamic/vss/blob/master/lib/vss/engine.rb
  # @see https://github.com/louismullie/treat/blob/master/lib/treat/workers/extractors/tf_idf/native.rb
  # @see https://github.com/reddavis/TF-IDF/blob/master/lib/tf_idf.rb
  # @see https://github.com/mchung/tf-idf/blob/master/lib/tf-idf.rb
  # @see https://github.com/josephwilk/rsemantic/blob/master/lib/semantic/transform/tf_idf_transform.rb
  #
  # SMART n, Salton x, Chisholm NONE
  def no_normalization(matrix)
    matrix
  end

  # Normalizes every column of the matrix. When `gsl?` is true each column is
  # normalized in place via `normalize!`; otherwise a new Matrix is built
  # from the normalized column vectors.
  #
  # @param [Matrix] matrix a term-document matrix
  # @return [Matrix] a matrix in which all document vectors are unit vectors
  # @see https://github.com/bbcrd/Similarity/blob/master/lib/similarity/term_document_matrix.rb#L23
  #
  # SMART c, Salton c, Chisholm COSN
  def cosine_normalization(matrix)
    if gsl?
      matrix.each_col(&:normalize!)
    else
      Matrix.columns(matrix.column_vectors.map(&:normalize))
    end
  end

  # Placeholder for pivoted unique normalization; not implemented.
  #
  # @param [Matrix] matrix a term-document matrix
  # @return [Matrix] a matrix
  # @raise [NotImplementedError] always
  # @todo http://nlp.stanford.edu/IR-book/html/htmledition/pivoted-normalized-document-length-1.html
  #
  # SMART u, Chisholm PUQN
  def pivoted_unique_normalization(matrix)
    raise NotImplementedError
  end
end
|