tf-idf-similarity 0.0.9 → 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,110 +0,0 @@
1
- require 'tf-idf-similarity/collection'
2
-
3
- # @note The treat and similarity gems do not add one to the inverse document frequency.
4
- # @see https://github.com/louismullie/treat/blob/master/lib/treat/workers/extractors/tf_idf/native.rb#L16
5
- # @see https://github.com/bbcrd/Similarity/blob/master/lib/similarity/corpus.rb#L44
6
- #
7
- # @note The tf-idf gem adds one to the numerator when calculating inverse document frequency.
8
- # @see https://github.com/mchung/tf-idf/blob/master/lib/tf-idf.rb#L153
9
- #
10
- # @note The vss gem does not take the logarithm of the inverse document frequency.
11
- # @see https://github.com/mkdynamic/vss/blob/master/lib/vss/engine.rb#L79
12
- #
13
- # @see http://nlp.stanford.edu/IR-book/html/htmledition/document-and-query-weighting-schemes-1.html
14
- # @see http://www.cs.odu.edu/~jbollen/IR04/readings/article1-29-03.pdf
15
- # @see http://www.sandia.gov/~tgkolda/pubs/bibtgkfiles/ornl-tm-13756.pdf
16
class TfIdfSimilarity::Collection
  # Collection-frequency component that ignores the collection: every term
  # receives the same weight.
  #
  # @param term a term (unused)
  # @return [Float] always 1.0
  # @see https://github.com/louismullie/treat/blob/master/lib/treat/workers/extractors/tf_idf/native.rb#L17
  #
  # SMART n, Salton x, Chisholm NONE
  def no_collection_frequency(term)
    1.0
  end

  # Logarithm of the number of documents divided by the number of documents
  # in which the term occurs.
  #
  # @param term a term
  # @return [Float] log(N / n), N being +documents.size+ and n being
  #   +document_counts[term]+
  # @see https://github.com/reddavis/TF-IDF/blob/master/lib/tf_idf.rb#L50
  # @see https://github.com/josephwilk/rsemantic/blob/master/lib/semantic/transform/tf_idf_transform.rb#L15
  #
  # SMART t, Salton f, Chisholm IDFB
  def plain_inverse_document_frequency(term)
    Math.log(documents.size / document_counts[term].to_f)
  end
  alias_method :plain_idf, :plain_inverse_document_frequency

  # Logarithm of the ratio between the number of documents that do not
  # contain the term and the number of documents that do.
  #
  # @param term a term
  # @return [Float] log((N - n) / n), N being +documents.size+ and n being
  #   +document_counts[term]+
  #
  # SMART p, Salton p, Chisholm IDFP
  def probabilistic_inverse_document_frequency(term)
    count = document_counts[term].to_f
    # Explicit parentheses: the whole quotient is the argument of the logarithm.
    Math.log((documents.size - count) / count)
  end
  alias_method :probabilistic_idf, :probabilistic_inverse_document_frequency

  # Ratio of +term_counts[term]+ to +document_counts[term]+.
  #
  # @param term a term
  # @return [Float] the global frequency divided by the document frequency
  #
  # Chisholm IGFF
  def global_frequency_inverse_document_frequency(term)
    term_counts[term] / document_counts[term].to_f
  end
  alias_method :gfidf, :global_frequency_inverse_document_frequency

  # Logarithm of the incremented GFIDF.
  #
  # @param term a term
  # @return [Float] log(gfidf + 1)
  #
  # Chisholm IGFL
  def log_global_frequency_inverse_document_frequency(term)
    Math.log(global_frequency_inverse_document_frequency(term) + 1)
  end
  alias_method :log_gfidf, :log_global_frequency_inverse_document_frequency

  # GFIDF plus one.
  #
  # @param term a term
  # @return [Float] gfidf + 1
  #
  # Chisholm IGFI
  def incremented_global_frequency_inverse_document_frequency(term)
    global_frequency_inverse_document_frequency(term) + 1
  end
  alias_method :incremented_gfidf, :incremented_global_frequency_inverse_document_frequency

  # Square root of the GFIDF less 0.9.
  #
  # @param term a term
  # @return [Float] sqrt(gfidf - 0.9)
  #
  # Chisholm IGFS
  def square_root_global_frequency_inverse_document_frequency(term)
    Math.sqrt(global_frequency_inverse_document_frequency(term) - 0.9)
  end
  alias_method :square_root_gfidf, :square_root_global_frequency_inverse_document_frequency

  # Entropy weight of a term: one plus the sum, over all documents, of
  # p * log(p) / log(N), where p is the share of the term's occurrences that
  # fall in the document and N is +documents.size+.
  #
  # @param term a term
  # @return [Float] the entropy weight
  # @note When the collection holds a single document, log(N) is zero and the
  #   result is still NaN, as before.
  #
  # Chisholm ENPY
  def entropy(term)
    denominator = term_counts[term].to_f
    log_n = Math.log(documents.size)
    1 + documents.reduce(0.0) do |sum, document|
      quotient = document.term_counts[term] / denominator
      # A document without the term contributes nothing (0 * log 0 is taken
      # as 0). Evaluating it literally gives 0.0 * -Infinity = NaN, which
      # would turn the whole weight into NaN.
      next sum if quotient.zero?
      sum + quotient * Math.log(quotient) / log_n
    end
  end



  # Leaves the matrix as it is.
  #
  # @param [Matrix] matrix a term-document matrix
  # @return [Matrix] the same matrix
  # @see https://github.com/mkdynamic/vss/blob/master/lib/vss/engine.rb
  # @see https://github.com/louismullie/treat/blob/master/lib/treat/workers/extractors/tf_idf/native.rb
  # @see https://github.com/reddavis/TF-IDF/blob/master/lib/tf_idf.rb
  # @see https://github.com/mchung/tf-idf/blob/master/lib/tf-idf.rb
  # @see https://github.com/josephwilk/rsemantic/blob/master/lib/semantic/transform/tf_idf_transform.rb
  #
  # SMART n, Salton x, Chisholm NONE
  def no_normalization(matrix)
    matrix
  end

  # Normalizes every column (document vector) of the matrix.
  #
  # @param [Matrix] matrix a term-document matrix
  # @return [Matrix] a matrix in which all document vectors are unit vectors
  # @see https://github.com/bbcrd/Similarity/blob/master/lib/similarity/term_document_matrix.rb#L23
  #
  # SMART c, Salton c, Chisholm COSN
  def cosine_normalization(matrix)
    if gsl?
      # The bang method suggests the columns are normalized in place -- TODO confirm.
      matrix.each_col(&:normalize!)
    else
      # Builds a new matrix from the normalized column vectors.
      Matrix.columns(matrix.column_vectors.map(&:normalize))
    end
  end

  # Not implemented yet.
  #
  # @param [Matrix] matrix a term-document matrix
  # @return [Matrix] a matrix
  # @raise [NotImplementedError] always
  # @todo http://nlp.stanford.edu/IR-book/html/htmledition/pivoted-normalized-document-length-1.html
  #
  # SMART u, Chisholm PUQN
  def pivoted_unique_normalization(matrix)
    raise NotImplementedError
  end
end