tf-idf-similarity 0.0.9 → 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.travis.yml +29 -0
- data/Gemfile +4 -0
- data/README.md +41 -29
- data/lib/tf-idf-similarity.rb +12 -1
- data/lib/tf-idf-similarity/document.rb +35 -28
- data/lib/tf-idf-similarity/extras/document.rb +2 -125
- data/lib/tf-idf-similarity/extras/tf_idf_model.rb +192 -0
- data/lib/tf-idf-similarity/matrix_methods.rb +164 -0
- data/lib/tf-idf-similarity/term_count_model.rb +78 -0
- data/lib/tf-idf-similarity/tf_idf_model.rb +81 -0
- data/lib/tf-idf-similarity/token.rb +34 -12
- data/lib/tf-idf-similarity/version.rb +1 -1
- data/spec/document_spec.rb +136 -0
- data/spec/extras/tf_idf_model_spec.rb +269 -0
- data/spec/spec_helper.rb +21 -0
- data/spec/term_count_model_spec.rb +108 -0
- data/spec/tf_idf_model_spec.rb +174 -0
- data/spec/token_spec.rb +34 -0
- data/td-idf-similarity.gemspec +3 -3
- metadata +91 -63
- data/lib/tf-idf-similarity/collection.rb +0 -205
- data/lib/tf-idf-similarity/extras/collection.rb +0 -110
@@ -1,110 +0,0 @@
|
|
1
|
-
require 'tf-idf-similarity/collection'
|
2
|
-
|
3
|
-
module TfIdfSimilarity
  # Extra collection-frequency (inverse document frequency) weights and
  # document-length normalizations for a collection of documents.
  #
  # The methods below call +documents+, +document_counts+, +term_counts+ and
  # +gsl?+, none of which is defined in this file; they are presumably
  # provided by the base class loaded from 'tf-idf-similarity/collection'
  # (TODO confirm). Counts are assumed to be numeric.
  #
  # @note The treat and similarity gems do not add one to the inverse document frequency.
  # @see https://github.com/louismullie/treat/blob/master/lib/treat/workers/extractors/tf_idf/native.rb#L16
  # @see https://github.com/bbcrd/Similarity/blob/master/lib/similarity/corpus.rb#L44
  #
  # @note The tf-idf gem adds one to the numerator when calculating inverse document frequency.
  # @see https://github.com/mchung/tf-idf/blob/master/lib/tf-idf.rb#L153
  #
  # @note The vss gem does not take the logarithm of the inverse document frequency.
  # @see https://github.com/mkdynamic/vss/blob/master/lib/vss/engine.rb#L79
  #
  # @see http://nlp.stanford.edu/IR-book/html/htmledition/document-and-query-weighting-schemes-1.html
  # @see http://www.cs.odu.edu/~jbollen/IR04/readings/article1-29-03.pdf
  # @see http://www.sandia.gov/~tgkolda/pubs/bibtgkfiles/ornl-tm-13756.pdf
  class Collection
    # Applies no collection-frequency weighting at all.
    #
    # SMART n, Salton x, Chisholm NONE
    #
    # @param term a term (ignored)
    # @return [Float] always 1.0
    # @see https://github.com/louismullie/treat/blob/master/lib/treat/workers/extractors/tf_idf/native.rb#L17
    def no_collection_frequency(term)
      1.0
    end

    # Natural logarithm of the number of documents divided by the number of
    # documents containing the term.
    #
    # SMART t, Salton f, Chisholm IDFB
    #
    # @param term a term
    # @return [Float] log(N / n), where N is +documents.size+ and n is
    #   +document_counts[term]+; Infinity when n is zero
    # @see https://github.com/reddavis/TF-IDF/blob/master/lib/tf_idf.rb#L50
    # @see https://github.com/josephwilk/rsemantic/blob/master/lib/semantic/transform/tf_idf_transform.rb#L15
    def plain_inverse_document_frequency(term)
      Math.log(documents.size / document_counts[term].to_f)
    end
    alias_method :plain_idf, :plain_inverse_document_frequency

    # Natural logarithm of the ratio between the number of documents that do
    # not contain the term and the number that do.
    #
    # SMART p, Salton p, Chisholm IDFP
    #
    # @param term a term
    # @return [Float] log((N - n) / n); -Infinity when every document
    #   contains the term (the ratio is then zero)
    def probabilistic_inverse_document_frequency(term)
      count = document_counts[term].to_f
      Math.log((documents.size - count) / count)
    end
    alias_method :probabilistic_idf, :probabilistic_inverse_document_frequency

    # Collection-wide count of the term divided by the number of documents
    # containing it.
    #
    # Chisholm IGFF
    #
    # @param term a term
    # @return [Float] +term_counts[term]+ / +document_counts[term]+
    def global_frequency_inverse_document_frequency(term)
      term_counts[term] / document_counts[term].to_f
    end
    alias_method :gfidf, :global_frequency_inverse_document_frequency

    # Natural logarithm of the global frequency IDF plus one.
    #
    # Chisholm IGFL
    #
    # @param term a term
    # @return [Float] log(gfidf + 1)
    def log_global_frequency_inverse_document_frequency(term)
      Math.log(global_frequency_inverse_document_frequency(term) + 1)
    end
    alias_method :log_gfidf, :log_global_frequency_inverse_document_frequency

    # Global frequency IDF plus one.
    #
    # Chisholm IGFI
    #
    # @param term a term
    # @return [Float] gfidf + 1
    def incremented_global_frequency_inverse_document_frequency(term)
      global_frequency_inverse_document_frequency(term) + 1
    end
    alias_method :incremented_gfidf, :incremented_global_frequency_inverse_document_frequency

    # Square root of the global frequency IDF minus 0.9.
    #
    # Chisholm IGFS
    #
    # @param term a term
    # @return [Float] sqrt(gfidf - 0.9)
    def square_root_global_frequency_inverse_document_frequency(term)
      Math.sqrt(global_frequency_inverse_document_frequency(term) - 0.9)
    end
    alias_method :square_root_gfidf, :square_root_global_frequency_inverse_document_frequency

    # Entropy weight: 1 + sum over documents of (p * log(p) / log(N)), where
    # p is the document's count of the term divided by the collection-wide
    # count and N is +documents.size+.
    #
    # Chisholm ENPY
    #
    # @param term a term
    # @return [Float] the entropy weight; 1.0 when no document contains the
    #   term
    # @note With a single document log(N) is zero, so the result is NaN.
    def entropy(term)
      total = term_counts[term].to_f
      log_n = Math.log(documents.size)

      weighted_sum = documents.reduce(0.0) do |sum, document|
        count = document.term_counts[term]
        # A document without the term contributes nothing: p * log(p) tends
        # to 0 as p tends to 0. Evaluating it literally yields
        # 0 * -Infinity = NaN, which would turn the whole sum into NaN.
        next sum unless count && count > 0

        quotient = count / total
        sum + quotient * Math.log(quotient) / log_n
      end

      1 + weighted_sum
    end

    # Leaves the matrix untouched.
    #
    # SMART n, Salton x, Chisholm NONE
    #
    # @param [Matrix] matrix a term-document matrix
    # @return [Matrix] the same matrix
    # @see https://github.com/mkdynamic/vss/blob/master/lib/vss/engine.rb
    # @see https://github.com/louismullie/treat/blob/master/lib/treat/workers/extractors/tf_idf/native.rb
    # @see https://github.com/reddavis/TF-IDF/blob/master/lib/tf_idf.rb
    # @see https://github.com/mchung/tf-idf/blob/master/lib/tf-idf.rb
    # @see https://github.com/josephwilk/rsemantic/blob/master/lib/semantic/transform/tf_idf_transform.rb
    def no_normalization(matrix)
      matrix
    end

    # Normalizes every column (document vector) of the matrix.
    #
    # SMART c, Salton c, Chisholm COSN
    #
    # @param [Matrix] matrix a term-document matrix; presumably a GSL matrix
    #   when +gsl?+ is true (TODO confirm against the base class)
    # @return [Matrix] a matrix in which all document vectors are unit vectors
    # @see https://github.com/bbcrd/Similarity/blob/master/lib/similarity/term_document_matrix.rb#L23
    def cosine_normalization(matrix)
      if gsl?
        # Normalizes each column via the bang method; the return value is
        # whatever +each_col+ returns.
        matrix.each_col(&:normalize!)
      else
        # Builds a new matrix from normalized copies of the columns.
        Matrix.columns(matrix.column_vectors.map(&:normalize))
      end
    end

    # Placeholder for pivoted unique normalization; not implemented.
    #
    # SMART u, Chisholm PUQN
    #
    # @param [Matrix] matrix a term-document matrix
    # @return [Matrix] a matrix
    # @raise [NotImplementedError] always
    # @todo http://nlp.stanford.edu/IR-book/html/htmledition/pivoted-normalized-document-length-1.html
    def pivoted_unique_normalization(matrix)
      raise NotImplementedError
    end
  end
end
|