tf-idf-similarity 0.1.3 → 0.1.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -3,267 +3,269 @@ require 'spec_helper'
3
3
  require 'tf-idf-similarity/extras/document'
4
4
  require 'tf-idf-similarity/extras/tf_idf_model'
5
5
 
6
- describe TfIdfSimilarity::TfIdfModel do
7
- def build_document(text, opts = {})
8
- TfIdfSimilarity::Document.new(text, opts)
9
- end
10
-
11
- def build_model(documents)
12
- TfIdfSimilarity::TfIdfModel.new(documents, :library => MATRIX_LIBRARY)
13
- end
14
-
15
- # @see https://github.com/josephwilk/rsemantic/blob/master/spec/semantic/transform/tf_idf_transform_spec.rb
16
- # No relevant tests to reproduce.
6
+ module TfIdfSimilarity
7
+ describe TfIdfModel do
8
+ def build_document(text, opts = {})
9
+ Document.new(text, opts)
10
+ end
11
+
12
+ def build_model(documents)
13
+ TfIdfModel.new(documents, :library => MATRIX_LIBRARY)
14
+ end
15
+
16
+ # @see https://github.com/josephwilk/rsemantic/blob/master/spec/semantic/transform/tf_idf_transform_spec.rb
17
+ # No relevant tests to reproduce.
18
+
19
+ # @see https://github.com/mkdynamic/vss/blob/master/test/test.rb
20
+ context 'comparing to vss gem' do
21
+ let :documents do
22
+ [ "I'm not even going to mention any TV series.",
23
+ "The Wire is the best thing ever. Fact.",
24
+ "Some would argue that Lost got a bit too wierd after season 2.",
25
+ "Lost is surely not in the same league as The Wire.",
26
+ "You cannot compare the The Wire and Lost.",
27
+ ].map do |text|
28
+ build_document(text)
29
+ end
30
+ end
17
31
 
18
- # @see https://github.com/mkdynamic/vss/blob/master/test/test.rb
19
- context 'comparing to vss gem' do
20
- let :documents do
21
- [ "I'm not even going to mention any TV series.",
22
- "The Wire is the best thing ever. Fact.",
23
- "Some would argue that Lost got a bit too wierd after season 2.",
24
- "Lost is surely not in the same league as The Wire.",
25
- "You cannot compare the The Wire and Lost.",
26
- ].map do |text|
27
- build_document(text)
32
+ let :model do
33
+ build_model(documents)
28
34
  end
29
- end
30
35
 
31
- let :model do
32
- build_model(documents)
36
+ pending "Add #search"
33
37
  end
34
38
 
35
- pending "Add TfIdfSimilarity::TfIdfModel#search"
36
- end
37
-
38
- # @see https://github.com/bbcrd/Similarity/blob/master/test/test_corpus.rb
39
- # @see https://github.com/bbcrd/Similarity/blob/master/test/test_document.rb
40
- # @see https://github.com/bbcrd/Similarity/blob/master/test/test_term_document_matrix.rb
41
- context 'comparing to similarity gem' do
42
- let :document do
43
- TfIdfSimilarity::Document.new('cow cow cow horse horse elephant')
44
- end
39
+ # @see https://github.com/bbcrd/Similarity/blob/master/test/test_corpus.rb
40
+ # @see https://github.com/bbcrd/Similarity/blob/master/test/test_document.rb
41
+ # @see https://github.com/bbcrd/Similarity/blob/master/test/test_term_document_matrix.rb
42
+ context 'comparing to similarity gem' do
43
+ let :document do
44
+ Document.new('cow cow cow horse horse elephant')
45
+ end
45
46
 
46
- def build_model_from_text(*texts)
47
- build_model(texts.map{|text| build_document(text)})
48
- end
47
+ def build_model_from_text(*texts)
48
+ build_model(texts.map{|text| build_document(text)})
49
+ end
49
50
 
50
- let :model_a do
51
- build_model_from_text("cow horse sheep", "horse bird dog")
52
- end
51
+ let :model_a do
52
+ build_model_from_text("cow horse sheep", "horse bird dog")
53
+ end
53
54
 
54
- let :model_b do
55
- build_model_from_text("cow cow cow bird", "horse horse horse bird")
56
- end
55
+ let :model_b do
56
+ build_model_from_text("cow cow cow bird", "horse horse horse bird")
57
+ end
57
58
 
58
- let :model_c do
59
- build_model_from_text("cow cow cow", "horse horse horse")
60
- end
59
+ let :model_c do
60
+ build_model_from_text("cow cow cow", "horse horse horse")
61
+ end
61
62
 
62
- # Normalizes to the number of tokens in the document.
63
- # @see https://github.com/bbcrd/Similarity/blob/master/lib/similarity/document.rb#L42
64
- def tf(term)
65
- document.term_count(term) / document.size.to_f
66
- end
63
+ # Normalizes to the number of tokens in the document.
64
+ # @see https://github.com/bbcrd/Similarity/blob/master/lib/similarity/document.rb#L42
65
+ def tf(term)
66
+ document.term_count(term) / document.size.to_f
67
+ end
67
68
 
68
- # Does not add one to the inverse document frequency.
69
- # @see https://github.com/bbcrd/Similarity/blob/master/lib/similarity/corpus.rb#L44
70
- def idf(model, term)
71
- model.plain_idf(term, 0, 1)
72
- end
69
+ # Does not add one to the inverse document frequency.
70
+ # @see https://github.com/bbcrd/Similarity/blob/master/lib/similarity/corpus.rb#L44
71
+ def idf(model, term)
72
+ model.plain_idf(term, 0, 1)
73
+ end
73
74
 
74
- it 'should return the terms' do
75
- [ "the quick brown fox",
76
- "the quick brown fox",
77
- "The Quick Brown Fox",
78
- 'The, Quick! Brown. "Fox"',
79
- ].each do |text|
80
- build_document(text).terms.sort.should == ["brown", "fox", "quick", "the"]
75
+ it 'should return the terms' do
76
+ [ "the quick brown fox",
77
+ "the quick brown fox",
78
+ "The Quick Brown Fox",
79
+ 'The, Quick! Brown. "Fox"',
80
+ ].each do |text|
81
+ build_document(text).terms.sort.should == ["brown", "fox", "quick", "the"]
82
+ end
81
83
  end
82
- end
83
84
 
84
- it 'should return the number of documents' do
85
- model_a.documents.size.should == 2
86
- end
85
+ it 'should return the number of documents' do
86
+ model_a.documents.size.should == 2
87
+ end
87
88
 
88
- it 'should return the number of terms' do
89
- document.terms.size.should == 3
90
- model_a.terms.size.should == 5
91
- end
89
+ it 'should return the number of terms' do
90
+ document.terms.size.should == 3
91
+ model_a.terms.size.should == 5
92
+ end
92
93
 
93
- it 'should return the term frequency' do
94
- tf('cow').should == 0.5
95
- tf('horse').should be_within(0.001).of(0.333)
96
- tf('sheep').should == 0
97
- end
94
+ it 'should return the term frequency' do
95
+ tf('cow').should == 0.5
96
+ tf('horse').should be_within(0.001).of(0.333)
97
+ tf('sheep').should == 0
98
+ end
98
99
 
99
- it 'should return the similarity matrix' do
100
- pending "Calculate the tf*idf matrix like the similarity gem does"
101
- end
100
+ it 'should return the similarity matrix' do
101
+ pending "Calculate the tf*idf matrix like the similarity gem does"
102
+ end
102
103
 
103
- it 'should return the number of documents in which a term appears' do
104
- model_b.document_count('cow').should == 1
105
- model_b.document_count('horse').should == 1
106
- model_b.document_count('bird').should == 2
107
- end
104
+ it 'should return the number of documents in which a term appears' do
105
+ model_b.document_count('cow').should == 1
106
+ model_b.document_count('horse').should == 1
107
+ model_b.document_count('bird').should == 2
108
+ end
108
109
 
109
- it 'should return the inverse document frequency' do
110
- idf(model_c, 'cow').should be_within(0.001).of(0.0)
111
- idf(model_c, 'bird').should be_within(0.001).of(0.693)
112
- end
110
+ it 'should return the inverse document frequency' do
111
+ idf(model_c, 'cow').should be_within(0.001).of(0.0)
112
+ idf(model_c, 'bird').should be_within(0.001).of(0.693)
113
+ end
113
114
 
114
- it 'should return the document vector' do
115
- pending "Calculate the tf*idf matrix like the similarity gem does"
115
+ it 'should return the document vector' do
116
+ pending "Calculate the tf*idf matrix like the similarity gem does"
117
+ end
116
118
  end
117
- end
118
119
 
119
- # @see https://github.com/mchung/tf-idf/blob/master/spec/tf-idf_spec.rb
120
- context 'comparing to tf-idf gem' do
121
- # Normalizes to the number of unique tokens (terms) in the document.
122
- # @see https://github.com/mchung/tf-idf/blob/master/lib/tf-idf.rb#L172
123
-
124
- let :corpus_a do
125
- 1.upto(50).map do |n|
126
- text = []
127
- text << 'the' if n <= 23
128
- text << 'a' if n <= 17
129
- text << 'said' if n <= 5
130
- text << 'phone' if n <= 2
131
- text << 'girl' if n <= 1
132
- text << 'moon' if n <= 1
133
- build_document(text * ' ')
120
+ # @see https://github.com/mchung/tf-idf/blob/master/spec/tf-idf_spec.rb
121
+ context 'comparing to tf-idf gem' do
122
+ # Normalizes to the number of unique tokens (terms) in the document.
123
+ # @see https://github.com/mchung/tf-idf/blob/master/lib/tf-idf.rb#L172
124
+
125
+ let :corpus_a do
126
+ 1.upto(50).map do |n|
127
+ text = []
128
+ text << 'the' if n <= 23
129
+ text << 'a' if n <= 17
130
+ text << 'said' if n <= 5
131
+ text << 'phone' if n <= 2
132
+ text << 'girl' if n <= 1
133
+ text << 'moon' if n <= 1
134
+ build_document(text * ' ')
135
+ end
134
136
  end
135
- end
136
137
 
137
- let :corpus_b do
138
- 1.upto(50).map do |n|
139
- text = []
140
- text << 'the' if n <= 23
141
- text << 'a' if n <= 17
142
- text << 'said' if n <= 5
143
- text << 'phone' if n <= 2
144
- text << 'girl' if n <= 1
145
- build_document(text * ' ')
138
+ let :corpus_b do
139
+ 1.upto(50).map do |n|
140
+ text = []
141
+ text << 'the' if n <= 23
142
+ text << 'a' if n <= 17
143
+ text << 'said' if n <= 5
144
+ text << 'phone' if n <= 2
145
+ text << 'girl' if n <= 1
146
+ build_document(text * ' ')
147
+ end
146
148
  end
147
- end
148
149
 
149
- let :model_a do
150
- build_model(corpus_a)
151
- end
150
+ let :model_a do
151
+ build_model(corpus_a)
152
+ end
152
153
 
153
- let :model_b do
154
- build_model(corpus_b)
155
- end
154
+ let :model_b do
155
+ build_model(corpus_b)
156
+ end
156
157
 
157
- it 'should return the number of documents' do
158
- model_a.documents.size.should == 50
159
- end
158
+ it 'should return the number of documents' do
159
+ model_a.documents.size.should == 50
160
+ end
160
161
 
161
- it 'should return the number of terms' do
162
- model_a.terms.size.should == 6
163
- end
162
+ it 'should return the number of terms' do
163
+ model_a.terms.size.should == 6
164
+ end
164
165
 
165
- # Adds one to the numerator when calculating inverse document frequency.
166
- # Sets a default inverse document frequency for non-occurring terms.
167
- # @note The tf-idf gem has a #doc_keywords method for non-corpus documents.
168
- # @see https://github.com/mchung/tf-idf/blob/master/lib/tf-idf.rb#L153
169
- it 'should return the inverse document frequency' do
170
- # should query IDF for nonexistent terms
171
- default = model_a.plain_idf('xxx', 1, 1)
172
- model_a.plain_idf('nonexistent', 1, 1).should == default
173
- model_a.plain_idf('THE', 1, 1).should == default
174
-
175
- # should query IDF for existent terms
176
- model_a.plain_idf('a', 1, 1).should > model_a.plain_idf('the', 1, 1)
177
- model_a.plain_idf('girl', 1, 1).should == model_a.plain_idf('moon', 1, 1)
178
-
179
- # should add input documents to an existing corpus
180
- model_a.plain_idf('water', 1, 1).should == default
181
- model_a.plain_idf('moon', 1, 1).should be_within(0.001).of(3.238) # 3.23867845216438
182
- model_a.plain_idf('said', 1, 1).should be_within(0.001).of(2.140) # 2.14006616349627
183
-
184
- model = build_model(corpus_a + [build_document('water moon')])
185
-
186
- model.plain_idf('water', 1, 1).should be_within(0.001).of(3.258) # 3.25809653802148
187
- model.plain_idf('moon', 1, 1).should be_within(0.001).of(2.852) # 2.85263142991332
188
- model.plain_idf('said', 1, 1).should be_within(0.001).of(2.159) # 2.15948424935337
189
-
190
- # should add input documents to an empty corpus
191
- unless MATRIX_LIBRARY == :gsl
192
- model_c = build_model([])
193
-
194
- default = model_c.plain_idf('xxx', 1, 1)
195
- model_c.plain_idf('moon', 1, 1).should == default
196
- model_c.plain_idf('water', 1, 1).should == default
197
- model_c.plain_idf('said', 1, 1).should == default
198
- end
199
-
200
- model_d = build_model([
201
- build_document('moon'),
202
- build_document('moon said hello'),
203
- ])
204
-
205
- default = model_d.plain_idf('xxx', 1, 1)
206
- model_d.plain_idf('water', 1, 1).should == default
207
- model_d.plain_idf('said', 1, 1).should be_within(0.001).of(0.405) # 0.405465108108164
208
- model_d.plain_idf('moon', 1, 1).should == 0 # 0
209
-
210
- # should observe stopwords list
211
- default = model_b.plain_idf('xxx', 1, 1)
212
- model_b.plain_idf('water', 1, 1).should == default
213
- model_b.plain_idf('moon', 1, 1).should == default # returns 0 for stopwords
214
- model_b.plain_idf('said', 1, 1).should be_within(0.001).of(2.140) # 2.14006616349627
215
-
216
- model_e = build_model(corpus_b + [
217
- build_document('moon', :tokens => %w()),
218
- build_document('moon and water', :tokens => %w(and water)),
219
- ])
220
-
221
- default = model_e.plain_idf('xxx', 1, 1)
222
- model_e.plain_idf('water', 1, 1).should be_within(0.001).of(3.277) # 3.27714473299218
223
- model_e.plain_idf('moon', 1, 1).should == default # returns 0 for stopwords
224
- model_e.plain_idf('said', 1, 1).should be_within(0.001).of(2.178) # 2.17853244432407
166
+ # Adds one to the numerator when calculating inverse document frequency.
167
+ # Sets a default inverse document frequency for non-occurring terms.
168
+ # @note The tf-idf gem has a #doc_keywords method for non-corpus documents.
169
+ # @see https://github.com/mchung/tf-idf/blob/master/lib/tf-idf.rb#L153
170
+ it 'should return the inverse document frequency' do
171
+ # should query IDF for nonexistent terms
172
+ default = model_a.plain_idf('xxx', 1, 1)
173
+ model_a.plain_idf('nonexistent', 1, 1).should == default
174
+ model_a.plain_idf('THE', 1, 1).should == default
175
+
176
+ # should query IDF for existent terms
177
+ model_a.plain_idf('a', 1, 1).should > model_a.plain_idf('the', 1, 1)
178
+ model_a.plain_idf('girl', 1, 1).should == model_a.plain_idf('moon', 1, 1)
179
+
180
+ # should add input documents to an existing corpus
181
+ model_a.plain_idf('water', 1, 1).should == default
182
+ model_a.plain_idf('moon', 1, 1).should be_within(0.001).of(3.238) # 3.23867845216438
183
+ model_a.plain_idf('said', 1, 1).should be_within(0.001).of(2.140) # 2.14006616349627
184
+
185
+ model = build_model(corpus_a + [build_document('water moon')])
186
+
187
+ model.plain_idf('water', 1, 1).should be_within(0.001).of(3.258) # 3.25809653802148
188
+ model.plain_idf('moon', 1, 1).should be_within(0.001).of(2.852) # 2.85263142991332
189
+ model.plain_idf('said', 1, 1).should be_within(0.001).of(2.159) # 2.15948424935337
190
+
191
+ # should add input documents to an empty corpus
192
+ unless MATRIX_LIBRARY == :gsl
193
+ model_c = build_model([])
194
+
195
+ default = model_c.plain_idf('xxx', 1, 1)
196
+ model_c.plain_idf('moon', 1, 1).should == default
197
+ model_c.plain_idf('water', 1, 1).should == default
198
+ model_c.plain_idf('said', 1, 1).should == default
199
+ end
200
+
201
+ model_d = build_model([
202
+ build_document('moon'),
203
+ build_document('moon said hello'),
204
+ ])
205
+
206
+ default = model_d.plain_idf('xxx', 1, 1)
207
+ model_d.plain_idf('water', 1, 1).should == default
208
+ model_d.plain_idf('said', 1, 1).should be_within(0.001).of(0.405) # 0.405465108108164
209
+ model_d.plain_idf('moon', 1, 1).should == 0 # 0
210
+
211
+ # should observe stopwords list
212
+ default = model_b.plain_idf('xxx', 1, 1)
213
+ model_b.plain_idf('water', 1, 1).should == default
214
+ model_b.plain_idf('moon', 1, 1).should == default # returns 0 for stopwords
215
+ model_b.plain_idf('said', 1, 1).should be_within(0.001).of(2.140) # 2.14006616349627
216
+
217
+ model_e = build_model(corpus_b + [
218
+ build_document('moon', :tokens => %w()),
219
+ build_document('moon and water', :tokens => %w(and water)),
220
+ ])
221
+
222
+ default = model_e.plain_idf('xxx', 1, 1)
223
+ model_e.plain_idf('water', 1, 1).should be_within(0.001).of(3.277) # 3.27714473299218
224
+ model_e.plain_idf('moon', 1, 1).should == default # returns 0 for stopwords
225
+ model_e.plain_idf('said', 1, 1).should be_within(0.001).of(2.178) # 2.17853244432407
226
+ end
225
227
  end
226
- end
227
228
 
228
- # @see https://github.com/reddavis/TF-IDF/blob/master/spec/tf_idf_spec.rb
229
- context 'comparing to tf_idf gem' do
230
- let :one do
231
- build_document('a a a a a a a a b b')
232
- end
229
+ # @see https://github.com/reddavis/TF-IDF/blob/master/spec/tf_idf_spec.rb
230
+ context 'comparing to tf_idf gem' do
231
+ let :one do
232
+ build_document('a a a a a a a a b b')
233
+ end
233
234
 
234
- let :two do
235
- build_document('a a')
236
- end
235
+ let :two do
236
+ build_document('a a')
237
+ end
237
238
 
238
- let :model do
239
- build_model([one, two])
240
- end
239
+ let :model do
240
+ build_model([one, two])
241
+ end
241
242
 
242
- # Normalizes to the number of tokens in the document.
243
- # @see https://github.com/reddavis/TF-IDF/blob/master/lib/tf_idf.rb#L76
244
- def tf
245
- one.term_count('b') / one.size.to_f
246
- end
243
+ # Normalizes to the number of tokens in the document.
244
+ # @see https://github.com/reddavis/TF-IDF/blob/master/lib/tf_idf.rb#L76
245
+ def tf
246
+ one.term_count('b') / one.size.to_f
247
+ end
247
248
 
248
- # Performs plain inverse document frequency with base 10.
249
- # @see https://github.com/reddavis/TF-IDF/blob/master/lib/tf_idf.rb#L50
250
- def idf
251
- model.plain_idf('b') / Math.log(10)
252
- end
249
+ # Performs plain inverse document frequency with base 10.
250
+ # @see https://github.com/reddavis/TF-IDF/blob/master/lib/tf_idf.rb#L50
251
+ def idf
252
+ model.plain_idf('b') / Math.log(10)
253
+ end
253
254
 
254
- it 'should return the term frequency' do
255
- tf.should == 0.2
256
- model.tf(one, 'b').should be_within(0.001).of(1.414)
257
- end
255
+ it 'should return the term frequency' do
256
+ tf.should == 0.2
257
+ model.tf(one, 'b').should be_within(0.001).of(1.414)
258
+ end
258
259
 
259
- it 'should return the inverse document frequency' do
260
- idf.should be_within(0.001).of(0.301) # 0.30102999
261
- model.idf('b').should == 1
262
- end
260
+ it 'should return the inverse document frequency' do
261
+ idf.should be_within(0.001).of(0.301) # 0.30102999
262
+ model.idf('b').should == 1
263
+ end
263
264
 
264
- it 'should return the tf*idf' do
265
- (tf * idf).should be_within(0.001).of(0.060) # 0.0602
266
- model.tfidf(one, 'b').should be_within(0.001).of(1.414)
265
+ it 'should return the tf*idf' do
266
+ (tf * idf).should be_within(0.001).of(0.060) # 0.0602
267
+ model.tfidf(one, 'b').should be_within(0.001).of(1.414)
268
+ end
267
269
  end
268
270
  end
269
271
  end