tf-idf-similarity 0.1.3 → 0.1.4

Sign up to get free protection for your applications and to get access to all the features.
@@ -3,267 +3,269 @@ require 'spec_helper'
3
3
  require 'tf-idf-similarity/extras/document'
4
4
  require 'tf-idf-similarity/extras/tf_idf_model'
5
5
 
6
- describe TfIdfSimilarity::TfIdfModel do
7
- def build_document(text, opts = {})
8
- TfIdfSimilarity::Document.new(text, opts)
9
- end
10
-
11
- def build_model(documents)
12
- TfIdfSimilarity::TfIdfModel.new(documents, :library => MATRIX_LIBRARY)
13
- end
14
-
15
- # @see https://github.com/josephwilk/rsemantic/blob/master/spec/semantic/transform/tf_idf_transform_spec.rb
16
- # No relevant tests to reproduce.
6
+ module TfIdfSimilarity
7
+ describe TfIdfModel do
8
+ def build_document(text, opts = {})
9
+ Document.new(text, opts)
10
+ end
11
+
12
+ def build_model(documents)
13
+ TfIdfModel.new(documents, :library => MATRIX_LIBRARY)
14
+ end
15
+
16
+ # @see https://github.com/josephwilk/rsemantic/blob/master/spec/semantic/transform/tf_idf_transform_spec.rb
17
+ # No relevant tests to reproduce.
18
+
19
+ # @see https://github.com/mkdynamic/vss/blob/master/test/test.rb
20
+ context 'comparing to vss gem' do
21
+ let :documents do
22
+ [ "I'm not even going to mention any TV series.",
23
+ "The Wire is the best thing ever. Fact.",
24
+ "Some would argue that Lost got a bit too wierd after season 2.",
25
+ "Lost is surely not in the same league as The Wire.",
26
+ "You cannot compare the The Wire and Lost.",
27
+ ].map do |text|
28
+ build_document(text)
29
+ end
30
+ end
17
31
 
18
- # @see https://github.com/mkdynamic/vss/blob/master/test/test.rb
19
- context 'comparing to vss gem' do
20
- let :documents do
21
- [ "I'm not even going to mention any TV series.",
22
- "The Wire is the best thing ever. Fact.",
23
- "Some would argue that Lost got a bit too wierd after season 2.",
24
- "Lost is surely not in the same league as The Wire.",
25
- "You cannot compare the The Wire and Lost.",
26
- ].map do |text|
27
- build_document(text)
32
+ let :model do
33
+ build_model(documents)
28
34
  end
29
- end
30
35
 
31
- let :model do
32
- build_model(documents)
36
+ pending "Add #search"
33
37
  end
34
38
 
35
- pending "Add TfIdfSimilarity::TfIdfModel#search"
36
- end
37
-
38
- # @see https://github.com/bbcrd/Similarity/blob/master/test/test_corpus.rb
39
- # @see https://github.com/bbcrd/Similarity/blob/master/test/test_document.rb
40
- # @see https://github.com/bbcrd/Similarity/blob/master/test/test_term_document_matrix.rb
41
- context 'comparing to similarity gem' do
42
- let :document do
43
- TfIdfSimilarity::Document.new('cow cow cow horse horse elephant')
44
- end
39
+ # @see https://github.com/bbcrd/Similarity/blob/master/test/test_corpus.rb
40
+ # @see https://github.com/bbcrd/Similarity/blob/master/test/test_document.rb
41
+ # @see https://github.com/bbcrd/Similarity/blob/master/test/test_term_document_matrix.rb
42
+ context 'comparing to similarity gem' do
43
+ let :document do
44
+ Document.new('cow cow cow horse horse elephant')
45
+ end
45
46
 
46
- def build_model_from_text(*texts)
47
- build_model(texts.map{|text| build_document(text)})
48
- end
47
+ def build_model_from_text(*texts)
48
+ build_model(texts.map{|text| build_document(text)})
49
+ end
49
50
 
50
- let :model_a do
51
- build_model_from_text("cow horse sheep", "horse bird dog")
52
- end
51
+ let :model_a do
52
+ build_model_from_text("cow horse sheep", "horse bird dog")
53
+ end
53
54
 
54
- let :model_b do
55
- build_model_from_text("cow cow cow bird", "horse horse horse bird")
56
- end
55
+ let :model_b do
56
+ build_model_from_text("cow cow cow bird", "horse horse horse bird")
57
+ end
57
58
 
58
- let :model_c do
59
- build_model_from_text("cow cow cow", "horse horse horse")
60
- end
59
+ let :model_c do
60
+ build_model_from_text("cow cow cow", "horse horse horse")
61
+ end
61
62
 
62
- # Normalizes to the number of tokens in the document.
63
- # @see https://github.com/bbcrd/Similarity/blob/master/lib/similarity/document.rb#L42
64
- def tf(term)
65
- document.term_count(term) / document.size.to_f
66
- end
63
+ # Normalizes to the number of tokens in the document.
64
+ # @see https://github.com/bbcrd/Similarity/blob/master/lib/similarity/document.rb#L42
65
+ def tf(term)
66
+ document.term_count(term) / document.size.to_f
67
+ end
67
68
 
68
- # Does not add one to the inverse document frequency.
69
- # @see https://github.com/bbcrd/Similarity/blob/master/lib/similarity/corpus.rb#L44
70
- def idf(model, term)
71
- model.plain_idf(term, 0, 1)
72
- end
69
+ # Does not add one to the inverse document frequency.
70
+ # @see https://github.com/bbcrd/Similarity/blob/master/lib/similarity/corpus.rb#L44
71
+ def idf(model, term)
72
+ model.plain_idf(term, 0, 1)
73
+ end
73
74
 
74
- it 'should return the terms' do
75
- [ "the quick brown fox",
76
- "the quick brown fox",
77
- "The Quick Brown Fox",
78
- 'The, Quick! Brown. "Fox"',
79
- ].each do |text|
80
- build_document(text).terms.sort.should == ["brown", "fox", "quick", "the"]
75
+ it 'should return the terms' do
76
+ [ "the quick brown fox",
77
+ "the quick brown fox",
78
+ "The Quick Brown Fox",
79
+ 'The, Quick! Brown. "Fox"',
80
+ ].each do |text|
81
+ build_document(text).terms.sort.should == ["brown", "fox", "quick", "the"]
82
+ end
81
83
  end
82
- end
83
84
 
84
- it 'should return the number of documents' do
85
- model_a.documents.size.should == 2
86
- end
85
+ it 'should return the number of documents' do
86
+ model_a.documents.size.should == 2
87
+ end
87
88
 
88
- it 'should return the number of terms' do
89
- document.terms.size.should == 3
90
- model_a.terms.size.should == 5
91
- end
89
+ it 'should return the number of terms' do
90
+ document.terms.size.should == 3
91
+ model_a.terms.size.should == 5
92
+ end
92
93
 
93
- it 'should return the term frequency' do
94
- tf('cow').should == 0.5
95
- tf('horse').should be_within(0.001).of(0.333)
96
- tf('sheep').should == 0
97
- end
94
+ it 'should return the term frequency' do
95
+ tf('cow').should == 0.5
96
+ tf('horse').should be_within(0.001).of(0.333)
97
+ tf('sheep').should == 0
98
+ end
98
99
 
99
- it 'should return the similarity matrix' do
100
- pending "Calculate the tf*idf matrix like the similarity gem does"
101
- end
100
+ it 'should return the similarity matrix' do
101
+ pending "Calculate the tf*idf matrix like the similarity gem does"
102
+ end
102
103
 
103
- it 'should return the number of documents in which a term appears' do
104
- model_b.document_count('cow').should == 1
105
- model_b.document_count('horse').should == 1
106
- model_b.document_count('bird').should == 2
107
- end
104
+ it 'should return the number of documents in which a term appears' do
105
+ model_b.document_count('cow').should == 1
106
+ model_b.document_count('horse').should == 1
107
+ model_b.document_count('bird').should == 2
108
+ end
108
109
 
109
- it 'should return the inverse document frequency' do
110
- idf(model_c, 'cow').should be_within(0.001).of(0.0)
111
- idf(model_c, 'bird').should be_within(0.001).of(0.693)
112
- end
110
+ it 'should return the inverse document frequency' do
111
+ idf(model_c, 'cow').should be_within(0.001).of(0.0)
112
+ idf(model_c, 'bird').should be_within(0.001).of(0.693)
113
+ end
113
114
 
114
- it 'should return the document vector' do
115
- pending "Calculate the tf*idf matrix like the similarity gem does"
115
+ it 'should return the document vector' do
116
+ pending "Calculate the tf*idf matrix like the similarity gem does"
117
+ end
116
118
  end
117
- end
118
119
 
119
- # @see https://github.com/mchung/tf-idf/blob/master/spec/tf-idf_spec.rb
120
- context 'comparing to tf-idf gem' do
121
- # Normalizes to the number of unique tokens (terms) in the document.
122
- # @see https://github.com/mchung/tf-idf/blob/master/lib/tf-idf.rb#L172
123
-
124
- let :corpus_a do
125
- 1.upto(50).map do |n|
126
- text = []
127
- text << 'the' if n <= 23
128
- text << 'a' if n <= 17
129
- text << 'said' if n <= 5
130
- text << 'phone' if n <= 2
131
- text << 'girl' if n <= 1
132
- text << 'moon' if n <= 1
133
- build_document(text * ' ')
120
+ # @see https://github.com/mchung/tf-idf/blob/master/spec/tf-idf_spec.rb
121
+ context 'comparing to tf-idf gem' do
122
+ # Normalizes to the number of unique tokens (terms) in the document.
123
+ # @see https://github.com/mchung/tf-idf/blob/master/lib/tf-idf.rb#L172
124
+
125
+ let :corpus_a do
126
+ 1.upto(50).map do |n|
127
+ text = []
128
+ text << 'the' if n <= 23
129
+ text << 'a' if n <= 17
130
+ text << 'said' if n <= 5
131
+ text << 'phone' if n <= 2
132
+ text << 'girl' if n <= 1
133
+ text << 'moon' if n <= 1
134
+ build_document(text * ' ')
135
+ end
134
136
  end
135
- end
136
137
 
137
- let :corpus_b do
138
- 1.upto(50).map do |n|
139
- text = []
140
- text << 'the' if n <= 23
141
- text << 'a' if n <= 17
142
- text << 'said' if n <= 5
143
- text << 'phone' if n <= 2
144
- text << 'girl' if n <= 1
145
- build_document(text * ' ')
138
+ let :corpus_b do
139
+ 1.upto(50).map do |n|
140
+ text = []
141
+ text << 'the' if n <= 23
142
+ text << 'a' if n <= 17
143
+ text << 'said' if n <= 5
144
+ text << 'phone' if n <= 2
145
+ text << 'girl' if n <= 1
146
+ build_document(text * ' ')
147
+ end
146
148
  end
147
- end
148
149
 
149
- let :model_a do
150
- build_model(corpus_a)
151
- end
150
+ let :model_a do
151
+ build_model(corpus_a)
152
+ end
152
153
 
153
- let :model_b do
154
- build_model(corpus_b)
155
- end
154
+ let :model_b do
155
+ build_model(corpus_b)
156
+ end
156
157
 
157
- it 'should return the number of documents' do
158
- model_a.documents.size.should == 50
159
- end
158
+ it 'should return the number of documents' do
159
+ model_a.documents.size.should == 50
160
+ end
160
161
 
161
- it 'should return the number of terms' do
162
- model_a.terms.size.should == 6
163
- end
162
+ it 'should return the number of terms' do
163
+ model_a.terms.size.should == 6
164
+ end
164
165
 
165
- # Adds one to the numerator when calculating inverse document frequency.
166
- # Sets a default inverse document frequency for non-occurring terms.
167
- # @note The tf-idf gem has a #doc_keywords method for non-corpus documents.
168
- # @see https://github.com/mchung/tf-idf/blob/master/lib/tf-idf.rb#L153
169
- it 'should return the inverse document frequency' do
170
- # should query IDF for nonexistent terms
171
- default = model_a.plain_idf('xxx', 1, 1)
172
- model_a.plain_idf('nonexistent', 1, 1).should == default
173
- model_a.plain_idf('THE', 1, 1).should == default
174
-
175
- # should query IDF for existent terms
176
- model_a.plain_idf('a', 1, 1).should > model_a.plain_idf('the', 1, 1)
177
- model_a.plain_idf('girl', 1, 1).should == model_a.plain_idf('moon', 1, 1)
178
-
179
- # should add input documents to an existing corpus
180
- model_a.plain_idf('water', 1, 1).should == default
181
- model_a.plain_idf('moon', 1, 1).should be_within(0.001).of(3.238) # 3.23867845216438
182
- model_a.plain_idf('said', 1, 1).should be_within(0.001).of(2.140) # 2.14006616349627
183
-
184
- model = build_model(corpus_a + [build_document('water moon')])
185
-
186
- model.plain_idf('water', 1, 1).should be_within(0.001).of(3.258) # 3.25809653802148
187
- model.plain_idf('moon', 1, 1).should be_within(0.001).of(2.852) # 2.85263142991332
188
- model.plain_idf('said', 1, 1).should be_within(0.001).of(2.159) # 2.15948424935337
189
-
190
- # should add input documents to an empty corpus
191
- unless MATRIX_LIBRARY == :gsl
192
- model_c = build_model([])
193
-
194
- default = model_c.plain_idf('xxx', 1, 1)
195
- model_c.plain_idf('moon', 1, 1).should == default
196
- model_c.plain_idf('water', 1, 1).should == default
197
- model_c.plain_idf('said', 1, 1).should == default
198
- end
199
-
200
- model_d = build_model([
201
- build_document('moon'),
202
- build_document('moon said hello'),
203
- ])
204
-
205
- default = model_d.plain_idf('xxx', 1, 1)
206
- model_d.plain_idf('water', 1, 1).should == default
207
- model_d.plain_idf('said', 1, 1).should be_within(0.001).of(0.405) # 0.405465108108164
208
- model_d.plain_idf('moon', 1, 1).should == 0 # 0
209
-
210
- # should observe stopwords list
211
- default = model_b.plain_idf('xxx', 1, 1)
212
- model_b.plain_idf('water', 1, 1).should == default
213
- model_b.plain_idf('moon', 1, 1).should == default # returns 0 for stopwords
214
- model_b.plain_idf('said', 1, 1).should be_within(0.001).of(2.140) # 2.14006616349627
215
-
216
- model_e = build_model(corpus_b + [
217
- build_document('moon', :tokens => %w()),
218
- build_document('moon and water', :tokens => %w(and water)),
219
- ])
220
-
221
- default = model_e.plain_idf('xxx', 1, 1)
222
- model_e.plain_idf('water', 1, 1).should be_within(0.001).of(3.277) # 3.27714473299218
223
- model_e.plain_idf('moon', 1, 1).should == default # returns 0 for stopwords
224
- model_e.plain_idf('said', 1, 1).should be_within(0.001).of(2.178) # 2.17853244432407
166
+ # Adds one to the numerator when calculating inverse document frequency.
167
+ # Sets a default inverse document frequency for non-occurring terms.
168
+ # @note The tf-idf gem has a #doc_keywords method for non-corpus documents.
169
+ # @see https://github.com/mchung/tf-idf/blob/master/lib/tf-idf.rb#L153
170
+ it 'should return the inverse document frequency' do
171
+ # should query IDF for nonexistent terms
172
+ default = model_a.plain_idf('xxx', 1, 1)
173
+ model_a.plain_idf('nonexistent', 1, 1).should == default
174
+ model_a.plain_idf('THE', 1, 1).should == default
175
+
176
+ # should query IDF for existent terms
177
+ model_a.plain_idf('a', 1, 1).should > model_a.plain_idf('the', 1, 1)
178
+ model_a.plain_idf('girl', 1, 1).should == model_a.plain_idf('moon', 1, 1)
179
+
180
+ # should add input documents to an existing corpus
181
+ model_a.plain_idf('water', 1, 1).should == default
182
+ model_a.plain_idf('moon', 1, 1).should be_within(0.001).of(3.238) # 3.23867845216438
183
+ model_a.plain_idf('said', 1, 1).should be_within(0.001).of(2.140) # 2.14006616349627
184
+
185
+ model = build_model(corpus_a + [build_document('water moon')])
186
+
187
+ model.plain_idf('water', 1, 1).should be_within(0.001).of(3.258) # 3.25809653802148
188
+ model.plain_idf('moon', 1, 1).should be_within(0.001).of(2.852) # 2.85263142991332
189
+ model.plain_idf('said', 1, 1).should be_within(0.001).of(2.159) # 2.15948424935337
190
+
191
+ # should add input documents to an empty corpus
192
+ unless MATRIX_LIBRARY == :gsl
193
+ model_c = build_model([])
194
+
195
+ default = model_c.plain_idf('xxx', 1, 1)
196
+ model_c.plain_idf('moon', 1, 1).should == default
197
+ model_c.plain_idf('water', 1, 1).should == default
198
+ model_c.plain_idf('said', 1, 1).should == default
199
+ end
200
+
201
+ model_d = build_model([
202
+ build_document('moon'),
203
+ build_document('moon said hello'),
204
+ ])
205
+
206
+ default = model_d.plain_idf('xxx', 1, 1)
207
+ model_d.plain_idf('water', 1, 1).should == default
208
+ model_d.plain_idf('said', 1, 1).should be_within(0.001).of(0.405) # 0.405465108108164
209
+ model_d.plain_idf('moon', 1, 1).should == 0 # 0
210
+
211
+ # should observe stopwords list
212
+ default = model_b.plain_idf('xxx', 1, 1)
213
+ model_b.plain_idf('water', 1, 1).should == default
214
+ model_b.plain_idf('moon', 1, 1).should == default # returns 0 for stopwords
215
+ model_b.plain_idf('said', 1, 1).should be_within(0.001).of(2.140) # 2.14006616349627
216
+
217
+ model_e = build_model(corpus_b + [
218
+ build_document('moon', :tokens => %w()),
219
+ build_document('moon and water', :tokens => %w(and water)),
220
+ ])
221
+
222
+ default = model_e.plain_idf('xxx', 1, 1)
223
+ model_e.plain_idf('water', 1, 1).should be_within(0.001).of(3.277) # 3.27714473299218
224
+ model_e.plain_idf('moon', 1, 1).should == default # returns 0 for stopwords
225
+ model_e.plain_idf('said', 1, 1).should be_within(0.001).of(2.178) # 2.17853244432407
226
+ end
225
227
  end
226
- end
227
228
 
228
- # @see https://github.com/reddavis/TF-IDF/blob/master/spec/tf_idf_spec.rb
229
- context 'comparing to tf_idf gem' do
230
- let :one do
231
- build_document('a a a a a a a a b b')
232
- end
229
+ # @see https://github.com/reddavis/TF-IDF/blob/master/spec/tf_idf_spec.rb
230
+ context 'comparing to tf_idf gem' do
231
+ let :one do
232
+ build_document('a a a a a a a a b b')
233
+ end
233
234
 
234
- let :two do
235
- build_document('a a')
236
- end
235
+ let :two do
236
+ build_document('a a')
237
+ end
237
238
 
238
- let :model do
239
- build_model([one, two])
240
- end
239
+ let :model do
240
+ build_model([one, two])
241
+ end
241
242
 
242
- # Normalizes to the number of tokens in the document.
243
- # @see https://github.com/reddavis/TF-IDF/blob/master/lib/tf_idf.rb#L76
244
- def tf
245
- one.term_count('b') / one.size.to_f
246
- end
243
+ # Normalizes to the number of tokens in the document.
244
+ # @see https://github.com/reddavis/TF-IDF/blob/master/lib/tf_idf.rb#L76
245
+ def tf
246
+ one.term_count('b') / one.size.to_f
247
+ end
247
248
 
248
- # Performs plain inverse document frequency with base 10.
249
- # @see https://github.com/reddavis/TF-IDF/blob/master/lib/tf_idf.rb#L50
250
- def idf
251
- model.plain_idf('b') / Math.log(10)
252
- end
249
+ # Performs plain inverse document frequency with base 10.
250
+ # @see https://github.com/reddavis/TF-IDF/blob/master/lib/tf_idf.rb#L50
251
+ def idf
252
+ model.plain_idf('b') / Math.log(10)
253
+ end
253
254
 
254
- it 'should return the term frequency' do
255
- tf.should == 0.2
256
- model.tf(one, 'b').should be_within(0.001).of(1.414)
257
- end
255
+ it 'should return the term frequency' do
256
+ tf.should == 0.2
257
+ model.tf(one, 'b').should be_within(0.001).of(1.414)
258
+ end
258
259
 
259
- it 'should return the inverse document frequency' do
260
- idf.should be_within(0.001).of(0.301) # 0.30102999
261
- model.idf('b').should == 1
262
- end
260
+ it 'should return the inverse document frequency' do
261
+ idf.should be_within(0.001).of(0.301) # 0.30102999
262
+ model.idf('b').should == 1
263
+ end
263
264
 
264
- it 'should return the tf*idf' do
265
- (tf * idf).should be_within(0.001).of(0.060) # 0.0602
266
- model.tfidf(one, 'b').should be_within(0.001).of(1.414)
265
+ it 'should return the tf*idf' do
266
+ (tf * idf).should be_within(0.001).of(0.060) # 0.0602
267
+ model.tfidf(one, 'b').should be_within(0.001).of(1.414)
268
+ end
267
269
  end
268
270
  end
269
271
  end