tf-idf-similarity 0.0.9 → 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,269 @@
1
+ require 'spec_helper'
2
+
3
+ require 'tf-idf-similarity/extras/document'
4
+ require 'tf-idf-similarity/extras/tf_idf_model'
5
+
6
# Cross-checks TfIdfSimilarity::TfIdfModel against expectations borrowed from
# the test suites of other Ruby tf*idf gems.
describe TfIdfSimilarity::TfIdfModel do
  # Wraps raw text (plus optional document options) in a Document.
  def build_document(text, opts = {})
    TfIdfSimilarity::Document.new(text, opts)
  end

  # Builds a model over the documents with the matrix library under test.
  def build_model(documents)
    TfIdfSimilarity::TfIdfModel.new(documents, :library => MATRIX_LIBRARY)
  end

  # @see https://github.com/josephwilk/rsemantic/blob/master/spec/semantic/transform/tf_idf_transform_spec.rb
  # No relevant tests to reproduce.

  # @see https://github.com/mkdynamic/vss/blob/master/test/test.rb
  context 'comparing to vss gem' do
    let(:documents) do
      texts = [
        "I'm not even going to mention any TV series.",
        "The Wire is the best thing ever. Fact.",
        "Some would argue that Lost got a bit too wierd after season 2.",
        "Lost is surely not in the same league as The Wire.",
        "You cannot compare the The Wire and Lost.",
      ]
      texts.map { |text| build_document(text) }
    end

    let(:model) { build_model(documents) }

    pending "Add TfIdfSimilarity::TfIdfModel#search"
  end

  # @see https://github.com/bbcrd/Similarity/blob/master/test/test_corpus.rb
  # @see https://github.com/bbcrd/Similarity/blob/master/test/test_document.rb
  # @see https://github.com/bbcrd/Similarity/blob/master/test/test_term_document_matrix.rb
  context 'comparing to similarity gem' do
    let(:document) { TfIdfSimilarity::Document.new('cow cow cow horse horse elephant') }

    # Turns each text into a document and builds a model over all of them.
    def build_model_from_text(*texts)
      documents = texts.map { |text| build_document(text) }
      build_model(documents)
    end

    let(:model_a) { build_model_from_text("cow horse sheep", "horse bird dog") }
    let(:model_b) { build_model_from_text("cow cow cow bird", "horse horse horse bird") }
    let(:model_c) { build_model_from_text("cow cow cow", "horse horse horse") }

    # Normalizes to the number of tokens in the document.
    # @see https://github.com/bbcrd/Similarity/blob/master/lib/similarity/document.rb#L42
    def tf(term)
      occurrences = document.term_count(term)
      occurrences / document.size.to_f
    end

    # Does not add one to the inverse document frequency.
    # @see https://github.com/bbcrd/Similarity/blob/master/lib/similarity/corpus.rb#L44
    def idf(model, term)
      model.plain_idf(term, 0, 1)
    end

    it 'should return the terms' do
      expected_terms = ["brown", "fox", "quick", "the"]
      texts = [
        "the quick brown fox",
        "the quick brown fox",
        "The Quick Brown Fox",
        'The, Quick! Brown. "Fox"',
      ]
      texts.each { |text| build_document(text).terms.sort.should == expected_terms }
    end

    it 'should return the number of documents' do
      model_a.documents.size.should == 2
    end

    it 'should return the number of terms' do
      document.terms.size.should == 3
      model_a.terms.size.should == 5
    end

    it 'should return the term frequency' do
      tf('cow').should == 0.5
      tf('horse').should be_within(0.001).of(0.333)
      tf('sheep').should == 0
    end

    it 'should return the similarity matrix' do
      pending "Calculate the tf*idf matrix like the similarity gem does"
    end

    it 'should return the number of documents in which a term appears' do
      model_b.document_count('cow').should == 1
      model_b.document_count('horse').should == 1
      model_b.document_count('bird').should == 2
    end

    it 'should return the inverse document frequency' do
      idf(model_c, 'cow').should be_within(0.001).of(0.0)
      idf(model_c, 'bird').should be_within(0.001).of(0.693)
    end

    it 'should return the document vector' do
      pending "Calculate the tf*idf matrix like the similarity gem does"
    end
  end

  # @see https://github.com/mchung/tf-idf/blob/master/spec/tf-idf_spec.rb
  context 'comparing to tf-idf gem' do
    # Normalizes to the number of unique tokens (terms) in the document.
    # @see https://github.com/mchung/tf-idf/blob/master/lib/tf-idf.rb#L172

    # Builds 50 documents; document n (1-based) contains, in the listed order,
    # every word whose cutoff is at least n. Documents past the largest cutoff
    # are built from an empty string.
    def build_corpus(cutoffs)
      (1..50).map do |n|
        words = cutoffs.select { |_, cutoff| n <= cutoff }.map { |word, _| word }
        build_document(words.join(' '))
      end
    end

    # Calls plain_idf with the (1, 1) arguments used throughout this context.
    def smoothed_idf(model, term)
      model.plain_idf(term, 1, 1)
    end

    let(:corpus_a) do
      build_corpus([['the', 23], ['a', 17], ['said', 5], ['phone', 2], ['girl', 1], ['moon', 1]])
    end

    # Same as corpus_a, but without 'moon'.
    let(:corpus_b) do
      build_corpus([['the', 23], ['a', 17], ['said', 5], ['phone', 2], ['girl', 1]])
    end

    let(:model_a) { build_model(corpus_a) }
    let(:model_b) { build_model(corpus_b) }

    it 'should return the number of documents' do
      model_a.documents.size.should == 50
    end

    it 'should return the number of terms' do
      model_a.terms.size.should == 6
    end

    # Adds one to the numerator when calculating inverse document frequency.
    # Sets a default inverse document frequency for non-occurring terms.
    # @note The tf-idf gem has a #doc_keywords method for non-corpus documents.
    # @see https://github.com/mchung/tf-idf/blob/master/lib/tf-idf.rb#L153
    it 'should return the inverse document frequency' do
      # should query IDF for nonexistent terms
      missing_in_a = smoothed_idf(model_a, 'xxx')
      smoothed_idf(model_a, 'nonexistent').should == missing_in_a
      smoothed_idf(model_a, 'THE').should == missing_in_a

      # should query IDF for existent terms
      smoothed_idf(model_a, 'a').should > smoothed_idf(model_a, 'the')
      smoothed_idf(model_a, 'girl').should == smoothed_idf(model_a, 'moon')

      # should add input documents to an existing corpus
      smoothed_idf(model_a, 'water').should == missing_in_a
      smoothed_idf(model_a, 'moon').should be_within(0.001).of(3.238) # 3.23867845216438
      smoothed_idf(model_a, 'said').should be_within(0.001).of(2.140) # 2.14006616349627

      extended = build_model(corpus_a + [build_document('water moon')])

      smoothed_idf(extended, 'water').should be_within(0.001).of(3.258) # 3.25809653802148
      smoothed_idf(extended, 'moon').should be_within(0.001).of(2.852) # 2.85263142991332
      smoothed_idf(extended, 'said').should be_within(0.001).of(2.159) # 2.15948424935337

      # should add input documents to an empty corpus
      if MATRIX_LIBRARY != :gsl
        empty_model = build_model([])

        missing_in_empty = smoothed_idf(empty_model, 'xxx')
        %w[moon water said].each do |term|
          smoothed_idf(empty_model, term).should == missing_in_empty
        end
      end

      two_documents = build_model([
        build_document('moon'),
        build_document('moon said hello'),
      ])

      missing_in_two = smoothed_idf(two_documents, 'xxx')
      smoothed_idf(two_documents, 'water').should == missing_in_two
      smoothed_idf(two_documents, 'said').should be_within(0.001).of(0.405) # 0.405465108108164
      smoothed_idf(two_documents, 'moon').should == 0 # 0

      # should observe stopwords list
      missing_in_b = smoothed_idf(model_b, 'xxx')
      smoothed_idf(model_b, 'water').should == missing_in_b
      smoothed_idf(model_b, 'moon').should == missing_in_b # returns 0 for stopwords
      smoothed_idf(model_b, 'said').should be_within(0.001).of(2.140) # 2.14006616349627

      # 'moon' is left out of the explicit token lists below.
      with_stopwords = build_model(corpus_b + [
        build_document('moon', :tokens => []),
        build_document('moon and water', :tokens => ['and', 'water']),
      ])

      missing_with_stopwords = smoothed_idf(with_stopwords, 'xxx')
      smoothed_idf(with_stopwords, 'water').should be_within(0.001).of(3.277) # 3.27714473299218
      smoothed_idf(with_stopwords, 'moon').should == missing_with_stopwords # returns 0 for stopwords
      smoothed_idf(with_stopwords, 'said').should be_within(0.001).of(2.178) # 2.17853244432407
    end
  end

  # @see https://github.com/reddavis/TF-IDF/blob/master/spec/tf_idf_spec.rb
  context 'comparing to tf_idf gem' do
    let(:one) { build_document('a a a a a a a a b b') }
    let(:two) { build_document('a a') }
    let(:model) { build_model([one, two]) }

    # Normalizes to the number of tokens in the document.
    # @see https://github.com/reddavis/TF-IDF/blob/master/lib/tf_idf.rb#L76
    def tf
      occurrences = one.term_count('b')
      occurrences / one.size.to_f
    end

    # Performs plain inverse document frequency with base 10.
    # @see https://github.com/reddavis/TF-IDF/blob/master/lib/tf_idf.rb#L50
    def idf
      natural_log_idf = model.plain_idf('b')
      natural_log_idf / Math.log(10)
    end

    it 'should return the term frequency' do
      tf.should == 0.2
      model.tf(one, 'b').should be_within(0.001).of(1.414)
    end

    it 'should return the inverse document frequency' do
      idf.should be_within(0.001).of(0.301) # 0.30102999
      model.idf('b').should == 1
    end

    it 'should return the tf*idf' do
      (tf * idf).should be_within(0.001).of(0.060) # 0.0602
      model.tfidf(one, 'b').should be_within(0.001).of(1.414)
    end
  end
end
@@ -0,0 +1,21 @@
1
+ require 'rubygems'
2
+
3
+ require 'coveralls'
4
+ Coveralls.wear!
5
+
6
+ require 'rspec'
7
+ require File.dirname(__FILE__) + '/../lib/tf-idf-similarity'
8
+
9
# Selects the matrix backend for this spec run from the MATRIX_LIBRARY
# environment variable, defaulting to :matrix.
#
# An exported-but-empty variable (e.g. `MATRIX_LIBRARY= rspec`) is treated as
# unset. Otherwise it would become the symbol :"", which would be printed in
# the banner and handed to the models while the `else` branch below loads
# 'matrix'.
requested_library = ENV['MATRIX_LIBRARY'].to_s
MATRIX_LIBRARY = requested_library.empty? ? :matrix : requested_library.to_sym
puts "\n==> Running specs with #{MATRIX_LIBRARY}"

# Load the backing library; unrecognized names fall back to 'matrix'.
case MATRIX_LIBRARY
when :gsl
  require 'gsl'
when :narray
  require 'narray'
when :nmatrix
  require 'nmatrix'
else
  require 'matrix'
end
@@ -0,0 +1,108 @@
1
+ require 'spec_helper'
2
+
3
# Specs for TfIdfSimilarity::TermCountModel: corpus-level document, term and
# count queries, with and without documents.
describe TfIdfSimilarity::TermCountModel do
  let :text do
    "FOO-foo BAR bar \r\n\t 123 !@#"
  end

  let :tokens do
    ['FOO-foo', 'BAR', 'bar', "\r\n\t", '123', '!@#']
  end

  let :document_without_text do
    TfIdfSimilarity::Document.new('')
  end

  let :document do
    TfIdfSimilarity::Document.new(text)
  end

  let :document_with_tokens do
    TfIdfSimilarity::Document.new(text, :tokens => tokens)
  end

  let :document_with_term_counts do
    TfIdfSimilarity::Document.new(text, :term_counts => {'bar' => 5, 'baz' => 10})
  end

  # The :unless value must be a plain boolean. RSpec's conditional filter only
  # tests the metadata value for truthiness and never calls it, so a lambda
  # (always truthy) would exclude this context for every matrix library.
  # NOTE(review): presumably GSL is skipped because it cannot build a model
  # over an empty corpus - confirm.
  context 'without documents', :unless => (MATRIX_LIBRARY == :gsl) do
    let :model do
      TfIdfSimilarity::TermCountModel.new([], :library => MATRIX_LIBRARY)
    end

    describe '#documents' do
      it 'should be empty' do
        model.documents.should be_empty
      end
    end

    describe '#terms' do
      it 'should be empty' do
        model.terms.should be_empty
      end
    end

    describe '#average_document_size' do
      it 'should be zero' do
        model.average_document_size.should == 0
      end
    end

    describe '#document_count' do
      it 'should be zero' do
        model.document_count('xxx').should == 0
      end
    end

    describe '#term_count' do
      it 'should be zero' do
        model.term_count('xxx').should == 0
      end
    end
  end

  context 'with documents' do
    # Token totals per document; they average to the 5.5 expected below.
    let :documents do
      [
        document, # 4 tokens
        document_with_tokens, # 3 tokens
        document_without_text, # 0 tokens
        document_with_term_counts, # 15 tokens
      ]
    end

    let :model do
      TfIdfSimilarity::TermCountModel.new(documents, :library => MATRIX_LIBRARY)
    end

    describe '#documents' do
      it 'should return the documents' do
        model.documents.should == documents
      end
    end

    describe '#terms' do
      it 'should return the terms' do
        model.terms.to_a.sort.should == ['bar', 'baz', 'foo', 'foo-foo']
      end
    end

    describe '#average_document_size' do
      it 'should return the average number of tokens in a document' do
        model.average_document_size.should == 5.5
      end
    end

    describe '#document_count' do
      it 'should return the number of documents the term appears in' do
        model.document_count('bar').should == 3
      end
    end

    describe '#term_count' do
      it 'should return the number of times the term appears in the corpus' do
        model.term_count('bar').should == 9
      end
    end
  end
end
@@ -0,0 +1,174 @@
1
+ require 'spec_helper'
2
+
3
# Specs for TfIdfSimilarity::TfIdfModel: tf, idf, tf*idf and the similarity
# matrix, with and without documents.
describe TfIdfSimilarity::TfIdfModel do
  let :text do
    "FOO-foo BAR bar \r\n\t 123 !@#"
  end

  let :tokens do
    ['FOO-foo', 'BAR', 'bar', "\r\n\t", '123', '!@#']
  end

  let :document_without_text do
    TfIdfSimilarity::Document.new('')
  end

  let :document do
    TfIdfSimilarity::Document.new(text)
  end

  let :document_with_tokens do
    TfIdfSimilarity::Document.new(text, :tokens => tokens)
  end

  let :document_with_term_counts do
    TfIdfSimilarity::Document.new(text, :term_counts => {'bar' => 5, 'baz' => 10})
  end

  # A document that is never added to any model's corpus in this spec.
  let :non_corpus_document do
    TfIdfSimilarity::Document.new('foo foo foo')
  end

  # Returns the model's similarity matrix as a flat array of values.
  # The :nmatrix library is flattened via #each; all others via #to_a.
  def similarity_matrix_values(model)
    matrix = model.similarity_matrix
    if MATRIX_LIBRARY == :nmatrix
      matrix.each.to_a
    else
      matrix.to_a.flatten
    end
  end

  # The :unless value must be a plain boolean. RSpec's conditional filter only
  # tests the metadata value for truthiness and never calls it, so a lambda
  # (always truthy) would exclude this context for every matrix library.
  # NOTE(review): presumably GSL is skipped because it cannot build a model
  # over an empty corpus - confirm.
  context 'without documents', :unless => (MATRIX_LIBRARY == :gsl) do
    let :model do
      TfIdfSimilarity::TfIdfModel.new([], :library => MATRIX_LIBRARY)
    end

    describe '#documents' do
      it 'should be empty' do
        model.documents.should be_empty
      end
    end

    describe '#terms' do
      it 'should be empty' do
        model.terms.should be_empty
      end
    end

    describe '#inverse_document_frequency' do
      it 'should return negative infinity' do
        model.idf('foo').should == -1/0.0 # -Infinity
      end
    end

    describe '#term_frequency' do
      it 'should return the term frequency' do
        model.tf(document, 'foo').should == Math.sqrt(2)
      end
    end

    describe '#term_frequency_inverse_document_frequency' do
      it 'should return negative infinity' do
        model.tfidf(document, 'foo').should == -1/0.0 # -Infinity
      end
    end

    describe '#similarity_matrix' do
      it 'should be empty' do
        similarity_matrix_values(model).should be_empty
      end
    end
  end

  context 'with documents' do
    let :documents do
      [
        document,
        document_with_tokens,
        document_without_text,
        document_with_term_counts,
      ]
    end

    let :model do
      TfIdfSimilarity::TfIdfModel.new(documents, :library => MATRIX_LIBRARY)
    end

    describe '#documents' do
      it 'should return the documents' do
        model.documents.should == documents
      end
    end

    describe '#terms' do
      it 'should return the terms' do
        model.terms.to_a.sort.should == ['bar', 'baz', 'foo', 'foo-foo']
      end
    end

    describe '#inverse_document_frequency' do
      it 'should return the inverse document frequency' do
        model.idf('foo').should be_within(0.001).of(1 + Math.log(2))
      end

      it 'should return the inverse document frequency of a non-occurring term' do
        model.idf('xxx').should be_within(0.001).of(1 + Math.log(4))
      end
    end

    describe '#term_frequency' do
      it 'should return the term frequency if no tokens given' do
        model.tf(document, 'foo').should == Math.sqrt(2)
      end

      it 'should return the term frequency if tokens given' do
        model.tf(document_with_tokens, 'foo-foo').should == 1
      end

      it 'should return no term frequency if no text given' do
        model.tf(document_without_text, 'foo').should == 0
      end

      it 'should return the term frequency if term counts given' do
        model.tf(document_with_term_counts, 'bar').should == Math.sqrt(5)
      end

      it 'should return the term frequency of a non-occurring term' do
        model.tf(document, 'xxx').should == 0
      end

      it 'should return the term frequency in a non-occurring document' do
        model.tf(non_corpus_document, 'foo').should == Math.sqrt(3)
      end
    end

    describe '#term_frequency_inverse_document_frequency' do
      it 'should return the tf*idf' do
        model.tfidf(document, 'foo').should be_within(0.001).of((1 + Math.log(2)) * Math.sqrt(2))
      end

      it 'should return the tf*idf of a non-occurring term' do
        model.tfidf(document, 'xxx').should == 0
      end

      # This example exercises a document outside the corpus, mirroring the
      # matching #term_frequency example above.
      it 'should return the tf*idf in a non-occurring document' do
        model.tfidf(non_corpus_document, 'foo').should be_within(0.001).of((1 + Math.log(2)) * Math.sqrt(3))
      end
    end

    describe '#similarity_matrix' do
      it 'should return the similarity matrix' do
        # Row-major 4x4 matrix, one row/column per entry in `documents`.
        expected = [
          1.0, 0.326, 0.0, 0.195,
          0.326, 1.0, 0.0, 0.247,
          0.0, 0.0, 0.0, 0.0,
          0.195, 0.247, 0.0, 1.0,
        ]

        actual = similarity_matrix_values(model)

        # Without this check a short or empty matrix would pass vacuously,
        # because the loop below only visits the values that are present.
        actual.size.should == expected.size

        actual.each_with_index do |value, i|
          value.should be_within(0.001).of(expected[i])
        end
      end
    end
  end
end