tf-idf-similarity 0.0.9 → 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,110 +0,0 @@
1
- require 'tf-idf-similarity/collection'
2
-
3
- # @note The treat and similarity gems do not add one to the inverse document frequency.
4
- # @see https://github.com/louismullie/treat/blob/master/lib/treat/workers/extractors/tf_idf/native.rb#L16
5
- # @see https://github.com/bbcrd/Similarity/blob/master/lib/similarity/corpus.rb#L44
6
- #
7
- # @note The tf-idf gem adds one to the numerator when calculating inverse document frequency.
8
- # @see https://github.com/mchung/tf-idf/blob/master/lib/tf-idf.rb#L153
9
- #
10
- # @note The vss gem does not take the logarithm of the inverse document frequency.
11
- # @see https://github.com/mkdynamic/vss/blob/master/lib/vss/engine.rb#L79
12
- #
13
- # @see http://nlp.stanford.edu/IR-book/html/htmledition/document-and-query-weighting-schemes-1.html
14
- # @see http://www.cs.odu.edu/~jbollen/IR04/readings/article1-29-03.pdf
15
- # @see http://www.sandia.gov/~tgkolda/pubs/bibtgkfiles/ornl-tm-13756.pdf
16
- class TfIdfSimilarity::Collection
17
# Collection-frequency weight that ignores the collection: every term gets the
# same constant weight, so only the term-frequency component matters.
#
# SMART n, Salton x, Chisholm NONE
#
# @param term the term being weighted (not consulted)
# @return [Float] always 1.0
# @see https://github.com/louismullie/treat/blob/master/lib/treat/workers/extractors/tf_idf/native.rb#L17
def no_collection_frequency(term)
  1.0
end
23
-
24
# Natural logarithm of +documents.size+ divided by +document_counts[term]+.
#
# SMART t, Salton f, Chisholm IDFB
#
# @param term the term to weight; used as a key into +document_counts+
# @return [Float] log(documents.size / document_counts[term]); Infinity when
#   the count converts to zero
# @see https://github.com/reddavis/TF-IDF/blob/master/lib/tf_idf.rb#L50
# @see https://github.com/josephwilk/rsemantic/blob/master/lib/semantic/transform/tf_idf_transform.rb#L15
def plain_inverse_document_frequency(term)
  containing = document_counts[term].to_f
  Math.log(documents.size / containing)
end
31
- alias_method :plain_idf, :plain_inverse_document_frequency
32
-
33
# Natural logarithm of the ratio between the documents not counted for the
# term and the documents counted for it: log((N - n) / n), where N is
# +documents.size+ and n is +document_counts[term]+.
#
# The value is not clamped: it is negative once n exceeds half of N and
# -Infinity when n equals N.
#
# SMART p, Salton p, Chisholm IDFP
#
# @param term the term to weight; used as a key into +document_counts+
# @return [Float] log((documents.size - n) / n)
def probabilistic_inverse_document_frequency(term)
  count = document_counts[term].to_f
  # The whole quotient is the argument of the logarithm; the parentheses make
  # that explicit instead of relying on how `Math.log (...) / count` parses.
  Math.log((documents.size - count) / count)
end
38
- alias_method :probabilistic_idf, :probabilistic_inverse_document_frequency
39
-
40
# Ratio of +term_counts[term]+ to +document_counts[term]+, computed as a
# floating-point division.
#
# Chisholm IGFF
#
# @param term the term to weight; used as a key into both count tables
# @return [Float] term_counts[term] / document_counts[term]
def global_frequency_inverse_document_frequency(term)
  occurrences = term_counts[term]
  occurrences / document_counts[term].to_f
end
44
- alias_method :gfidf, :global_frequency_inverse_document_frequency
45
-
46
# Natural logarithm of one plus the global-frequency IDF of the term.
#
# Chisholm IGFL
#
# @param term the term to weight
# @return [Float] log(global_frequency_inverse_document_frequency(term) + 1)
def log_global_frequency_inverse_document_frequency(term)
  ratio = global_frequency_inverse_document_frequency(term)
  Math.log(ratio + 1)
end
50
- alias_method :log_gfidf, :log_global_frequency_inverse_document_frequency
51
-
52
# Global-frequency IDF of the term, increased by one.
#
# Chisholm IGFI
#
# @param term the term to weight
# @return [Float] global_frequency_inverse_document_frequency(term) + 1
def incremented_global_frequency_inverse_document_frequency(term)
  ratio = global_frequency_inverse_document_frequency(term)
  ratio + 1
end
56
- alias_method :incremented_gfidf, :incremented_global_frequency_inverse_document_frequency
57
-
58
# Square root of the global-frequency IDF of the term after subtracting 0.9.
#
# Chisholm IGFS
#
# @param term the term to weight
# @return [Float] sqrt(global_frequency_inverse_document_frequency(term) - 0.9)
def square_root_global_frequency_inverse_document_frequency(term)
  ratio = global_frequency_inverse_document_frequency(term)
  Math.sqrt(ratio - 0.9)
end
62
- alias_method :square_root_gfidf, :square_root_global_frequency_inverse_document_frequency
63
-
64
# Entropy weight of a term: 1 + sum(p * log(p)) / log(N), where p is a
# document's count for the term divided by +term_counts[term]+ and N is
# +documents.size+.
#
# The weight is 0 when the term's occurrences are spread evenly over all
# documents and 1.0 when they all fall in a single document. A term with no
# occurrences in any document also yields 1.0.
#
# Chisholm ENPY
#
# @param term the term to weight; used as a key into +term_counts+ and into
#   each document's +term_counts+
# @return [Float] the entropy weight of the term
def entropy(term)
  total = term_counts[term].to_f
  weighted_logs = documents.reduce(0.0) do |sum, document|
    count = document.term_counts[term]
    # By convention 0 * log(0) is 0; evaluating it literally gives
    # 0.0 * -Infinity, which is NaN and would poison the whole sum.
    next sum if count.zero?

    share = count / total
    sum + share * Math.log(share)
  end

  # A zero sum needs no scaling. Returning early also avoids 0.0 / 0.0 for a
  # one-document collection, where log(N) is zero.
  return 1.0 if weighted_logs.zero?

  1 + weighted_logs / Math.log(documents.size)
end
73
-
74
-
75
-
76
# Normalization step that leaves the matrix untouched.
#
# SMART n, Salton x, Chisholm NONE
#
# @param matrix a term-document matrix
# @return the very same object that was passed in, unmodified
# @see https://github.com/mkdynamic/vss/blob/master/lib/vss/engine.rb
# @see https://github.com/louismullie/treat/blob/master/lib/treat/workers/extractors/tf_idf/native.rb
# @see https://github.com/reddavis/TF-IDF/blob/master/lib/tf_idf.rb
# @see https://github.com/mchung/tf-idf/blob/master/lib/tf-idf.rb
# @see https://github.com/josephwilk/rsemantic/blob/master/lib/semantic/transform/tf_idf_transform.rb
def no_normalization(matrix)
  matrix
end
88
-
89
# Scales every column (document vector) of the matrix to unit length.
#
# When +gsl?+ is true, each column is normalized through +each_col+ and
# +normalize!+ and the result of +each_col+ is returned (presumably the same
# matrix, modified in place -- confirm against the GSL bindings). Otherwise a
# new +Matrix+ is built from the normalized column vectors.
#
# SMART c, Salton c, Chisholm COSN
#
# @param matrix a term-document matrix
# @return a matrix in which all document vectors are unit vectors
# @see https://github.com/bbcrd/Similarity/blob/master/lib/similarity/term_document_matrix.rb#L23
def cosine_normalization(matrix)
  return matrix.each_col(&:normalize!) if gsl?

  unit_columns = matrix.column_vectors.map { |column| column.normalize }
  Matrix.columns(unit_columns)
end
101
-
102
# Placeholder for pivoted unique normalization; not implemented yet.
#
# SMART u, Chisholm PUQN
#
# @param matrix a term-document matrix (currently ignored)
# @return [Matrix] a matrix (once implemented)
# @raise [NotImplementedError] always
# @todo http://nlp.stanford.edu/IR-book/html/htmledition/pivoted-normalized-document-length-1.html
def pivoted_unique_normalization(matrix)
  raise(NotImplementedError)
end
110
- end