tf-idf-similarity 0.0.9 → 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,269 @@
1
+ require 'spec_helper'
2
+
3
+ require 'tf-idf-similarity/extras/document'
4
+ require 'tf-idf-similarity/extras/tf_idf_model'
5
+
6
+ describe TfIdfSimilarity::TfIdfModel do
7
+ def build_document(text, opts = {}) # Wraps text in a TfIdfSimilarity::Document, forwarding opts (e.g. :tokens) to the constructor.
8
+ TfIdfSimilarity::Document.new(text, opts)
9
+ end
10
+
11
+ def build_model(documents) # Builds a TfIdfModel over documents, passing MATRIX_LIBRARY as the :library option.
12
+ TfIdfSimilarity::TfIdfModel.new(documents, :library => MATRIX_LIBRARY)
13
+ end
14
+
15
+ # @see https://github.com/josephwilk/rsemantic/blob/master/spec/semantic/transform/tf_idf_transform_spec.rb
16
+ # No relevant tests to reproduce.
17
+
18
+ # @see https://github.com/mkdynamic/vss/blob/master/test/test.rb
19
+ context 'comparing to vss gem' do
20
+ let :documents do
21
+ [ "I'm not even going to mention any TV series.",
22
+ "The Wire is the best thing ever. Fact.",
23
+ "Some would argue that Lost got a bit too wierd after season 2.",
24
+ "Lost is surely not in the same league as The Wire.",
25
+ "You cannot compare the The Wire and Lost.",
26
+ ].map do |text|
27
+ build_document(text)
28
+ end
29
+ end
30
+
31
+ let :model do
32
+ build_model(documents)
33
+ end
34
+
35
+ pending "Add TfIdfSimilarity::TfIdfModel#search"
36
+ end
37
+
38
+ # @see https://github.com/bbcrd/Similarity/blob/master/test/test_corpus.rb
39
+ # @see https://github.com/bbcrd/Similarity/blob/master/test/test_document.rb
40
+ # @see https://github.com/bbcrd/Similarity/blob/master/test/test_term_document_matrix.rb
41
+ context 'comparing to similarity gem' do
42
+ let :document do
43
+ TfIdfSimilarity::Document.new('cow cow cow horse horse elephant')
44
+ end
45
+
46
+ def build_model_from_text(*texts) # Convenience: turns each text argument into a document, then builds one model over all of them.
47
+ build_model(texts.map{|text| build_document(text)})
48
+ end
49
+
50
+ let :model_a do
51
+ build_model_from_text("cow horse sheep", "horse bird dog")
52
+ end
53
+
54
+ let :model_b do
55
+ build_model_from_text("cow cow cow bird", "horse horse horse bird")
56
+ end
57
+
58
+ let :model_c do
59
+ build_model_from_text("cow cow cow", "horse horse horse")
60
+ end
61
+
62
+ # Normalizes to the number of tokens in the document.
63
+ # @see https://github.com/bbcrd/Similarity/blob/master/lib/similarity/document.rb#L42
64
+ def tf(term) # Term frequency of term in the `document` fixture: its count divided by the document's size.
65
+ document.term_count(term) / document.size.to_f # to_f forces float division
66
+ end
67
+
68
+ # Does not add one to the inverse document frequency.
69
+ # @see https://github.com/bbcrd/Similarity/blob/master/lib/similarity/corpus.rb#L44
70
+ def idf(model, term) # Inverse document frequency of term in model, delegating to plain_idf so that no 1 is added to the result.
71
+ model.plain_idf(term, 0, 1) # NOTE(review): 0 and 1 look like numerator/denominator smoothing terms (consistent with the values asserted in this file) - confirm against plain_idf
72
+ end
73
+
74
+ it 'should return the terms' do
75
+ [ "the quick brown fox",
76
+ "the quick brown fox",
77
+ "The Quick Brown Fox",
78
+ 'The, Quick! Brown. "Fox"',
79
+ ].each do |text|
80
+ build_document(text).terms.sort.should == ["brown", "fox", "quick", "the"]
81
+ end
82
+ end
83
+
84
+ it 'should return the number of documents' do
85
+ model_a.documents.size.should == 2
86
+ end
87
+
88
+ it 'should return the number of terms' do
89
+ document.terms.size.should == 3
90
+ model_a.terms.size.should == 5
91
+ end
92
+
93
+ it 'should return the term frequency' do
94
+ tf('cow').should == 0.5
95
+ tf('horse').should be_within(0.001).of(0.333)
96
+ tf('sheep').should == 0
97
+ end
98
+
99
+ it 'should return the similarity matrix' do
100
+ pending "Calculate the tf*idf matrix like the similarity gem does"
101
+ end
102
+
103
+ it 'should return the number of documents in which a term appears' do
104
+ model_b.document_count('cow').should == 1
105
+ model_b.document_count('horse').should == 1
106
+ model_b.document_count('bird').should == 2
107
+ end
108
+
109
+ it 'should return the inverse document frequency' do
110
+ idf(model_c, 'cow').should be_within(0.001).of(0.0)
111
+ idf(model_c, 'bird').should be_within(0.001).of(0.693)
112
+ end
113
+
114
+ it 'should return the document vector' do
115
+ pending "Calculate the tf*idf matrix like the similarity gem does"
116
+ end
117
+ end
118
+
119
+ # @see https://github.com/mchung/tf-idf/blob/master/spec/tf-idf_spec.rb
120
+ context 'comparing to tf-idf gem' do
121
+ # Normalizes to the number of unique tokens (terms) in the document.
122
+ # @see https://github.com/mchung/tf-idf/blob/master/lib/tf-idf.rb#L172
123
+
124
+ let :corpus_a do
125
+ 1.upto(50).map do |n|
126
+ text = []
127
+ text << 'the' if n <= 23
128
+ text << 'a' if n <= 17
129
+ text << 'said' if n <= 5
130
+ text << 'phone' if n <= 2
131
+ text << 'girl' if n <= 1
132
+ text << 'moon' if n <= 1
133
+ build_document(text * ' ')
134
+ end
135
+ end
136
+
137
+ let :corpus_b do
138
+ 1.upto(50).map do |n|
139
+ text = []
140
+ text << 'the' if n <= 23
141
+ text << 'a' if n <= 17
142
+ text << 'said' if n <= 5
143
+ text << 'phone' if n <= 2
144
+ text << 'girl' if n <= 1
145
+ build_document(text * ' ')
146
+ end
147
+ end
148
+
149
+ let :model_a do
150
+ build_model(corpus_a)
151
+ end
152
+
153
+ let :model_b do
154
+ build_model(corpus_b)
155
+ end
156
+
157
+ it 'should return the number of documents' do
158
+ model_a.documents.size.should == 50
159
+ end
160
+
161
+ it 'should return the number of terms' do
162
+ model_a.terms.size.should == 6
163
+ end
164
+
165
+ # Adds one to the numerator when calculating inverse document frequency.
166
+ # Sets a default inverse document frequency for non-occurring terms.
167
+ # @note The tf-idf gem has a #doc_keywords method for non-corpus documents.
168
+ # @see https://github.com/mchung/tf-idf/blob/master/lib/tf-idf.rb#L153
169
+ it 'should return the inverse document frequency' do
170
+ # should query IDF for nonexistent terms
171
+ default = model_a.plain_idf('xxx', 1, 1)
172
+ model_a.plain_idf('nonexistent', 1, 1).should == default
173
+ model_a.plain_idf('THE', 1, 1).should == default
174
+
175
+ # should query IDF for existent terms
176
+ model_a.plain_idf('a', 1, 1).should > model_a.plain_idf('the', 1, 1)
177
+ model_a.plain_idf('girl', 1, 1).should == model_a.plain_idf('moon', 1, 1)
178
+
179
+ # should add input documents to an existing corpus
180
+ model_a.plain_idf('water', 1, 1).should == default
181
+ model_a.plain_idf('moon', 1, 1).should be_within(0.001).of(3.238) # 3.23867845216438
182
+ model_a.plain_idf('said', 1, 1).should be_within(0.001).of(2.140) # 2.14006616349627
183
+
184
+ model = build_model(corpus_a + [build_document('water moon')])
185
+
186
+ model.plain_idf('water', 1, 1).should be_within(0.001).of(3.258) # 3.25809653802148
187
+ model.plain_idf('moon', 1, 1).should be_within(0.001).of(2.852) # 2.85263142991332
188
+ model.plain_idf('said', 1, 1).should be_within(0.001).of(2.159) # 2.15948424935337
189
+
190
+ # should add input documents to an empty corpus
191
+ unless MATRIX_LIBRARY == :gsl
192
+ model_c = build_model([])
193
+
194
+ default = model_c.plain_idf('xxx', 1, 1)
195
+ model_c.plain_idf('moon', 1, 1).should == default
196
+ model_c.plain_idf('water', 1, 1).should == default
197
+ model_c.plain_idf('said', 1, 1).should == default
198
+ end
199
+
200
+ model_d = build_model([
201
+ build_document('moon'),
202
+ build_document('moon said hello'),
203
+ ])
204
+
205
+ default = model_d.plain_idf('xxx', 1, 1)
206
+ model_d.plain_idf('water', 1, 1).should == default
207
+ model_d.plain_idf('said', 1, 1).should be_within(0.001).of(0.405) # 0.405465108108164
208
+ model_d.plain_idf('moon', 1, 1).should == 0 # 0
209
+
210
+ # should observe stopwords list
211
+ default = model_b.plain_idf('xxx', 1, 1)
212
+ model_b.plain_idf('water', 1, 1).should == default
213
+ model_b.plain_idf('moon', 1, 1).should == default # returns 0 for stopwords
214
+ model_b.plain_idf('said', 1, 1).should be_within(0.001).of(2.140) # 2.14006616349627
215
+
216
+ model_e = build_model(corpus_b + [
217
+ build_document('moon', :tokens => %w()),
218
+ build_document('moon and water', :tokens => %w(and water)),
219
+ ])
220
+
221
+ default = model_e.plain_idf('xxx', 1, 1)
222
+ model_e.plain_idf('water', 1, 1).should be_within(0.001).of(3.277) # 3.27714473299218
223
+ model_e.plain_idf('moon', 1, 1).should == default # returns 0 for stopwords
224
+ model_e.plain_idf('said', 1, 1).should be_within(0.001).of(2.178) # 2.17853244432407
225
+ end
226
+ end
227
+
228
+ # @see https://github.com/reddavis/TF-IDF/blob/master/spec/tf_idf_spec.rb
229
+ context 'comparing to tf_idf gem' do
230
+ let :one do
231
+ build_document('a a a a a a a a b b')
232
+ end
233
+
234
+ let :two do
235
+ build_document('a a')
236
+ end
237
+
238
+ let :model do
239
+ build_model([one, two])
240
+ end
241
+
242
+ # Normalizes to the number of tokens in the document.
243
+ # @see https://github.com/reddavis/TF-IDF/blob/master/lib/tf_idf.rb#L76
244
+ def tf # Count of 'b' in document `one` divided by that document's size.
245
+ one.term_count('b') / one.size.to_f # to_f forces float division
246
+ end
247
+
248
+ # Performs plain inverse document frequency with base 10.
249
+ # @see https://github.com/reddavis/TF-IDF/blob/master/lib/tf_idf.rb#L50
250
+ def idf # plain_idf of 'b' in `model`, rescaled to base 10.
251
+ model.plain_idf('b') / Math.log(10) # dividing by ln(10) converts a natural-log value to base 10 - assumes plain_idf uses the natural log; confirm
252
+ end
253
+
254
+ it 'should return the term frequency' do
255
+ tf.should == 0.2
256
+ model.tf(one, 'b').should be_within(0.001).of(1.414)
257
+ end
258
+
259
+ it 'should return the inverse document frequency' do
260
+ idf.should be_within(0.001).of(0.301) # 0.30102999
261
+ model.idf('b').should == 1
262
+ end
263
+
264
+ it 'should return the tf*idf' do
265
+ (tf * idf).should be_within(0.001).of(0.060) # 0.0602
266
+ model.tfidf(one, 'b').should be_within(0.001).of(1.414)
267
+ end
268
+ end
269
+ end
@@ -0,0 +1,21 @@
1
+ require 'rubygems'
2
+
3
+ require 'coveralls'
4
+ Coveralls.wear!
5
+
6
+ require 'rspec'
7
+ require File.dirname(__FILE__) + '/../lib/tf-idf-similarity'
8
+
9
+ MATRIX_LIBRARY = (ENV['MATRIX_LIBRARY'] || :matrix).to_sym # backend name read from the MATRIX_LIBRARY environment variable; defaults to :matrix
10
+ puts "\n==> Running specs with #{MATRIX_LIBRARY}"
11
+
12
+ case MATRIX_LIBRARY # require only the library that matches the selected backend
13
+ when :gsl
14
+ require 'gsl'
15
+ when :narray
16
+ require 'narray'
17
+ when :nmatrix
18
+ require 'nmatrix'
19
+ else
20
+ require 'matrix' # any other value (including the :matrix default) loads 'matrix'
21
+ end
@@ -0,0 +1,108 @@
1
+ require 'spec_helper'
2
+
3
+ describe TfIdfSimilarity::TermCountModel do
4
+ let :text do
5
+ "FOO-foo BAR bar \r\n\t 123 !@#"
6
+ end
7
+
8
+ let :tokens do
9
+ ['FOO-foo', 'BAR', 'bar', "\r\n\t", '123', '!@#']
10
+ end
11
+
12
+ let :document_without_text do
13
+ TfIdfSimilarity::Document.new('')
14
+ end
15
+
16
+ let :document do
17
+ TfIdfSimilarity::Document.new(text)
18
+ end
19
+
20
+ let :document_with_tokens do
21
+ TfIdfSimilarity::Document.new(text, :tokens => tokens)
22
+ end
23
+
24
+ let :document_with_term_counts do
25
+ TfIdfSimilarity::Document.new(text, :term_counts => {'bar' => 5, 'baz' => 10})
26
+ end
27
+
28
+ context 'without documents', :unless => lambda{MATRIX_LIBRARY == :gsl} do
29
+ let :model do
30
+ TfIdfSimilarity::TermCountModel.new([], :library => MATRIX_LIBRARY)
31
+ end
32
+
33
+ describe '#documents' do
34
+ it 'should be empty' do
35
+ model.documents.should be_empty
36
+ end
37
+ end
38
+
39
+ describe '#terms' do
40
+ it 'should be empty' do
41
+ model.terms.should be_empty
42
+ end
43
+ end
44
+
45
+ describe '#average_document_size' do
46
+ it 'should be zero' do
47
+ model.average_document_size.should == 0
48
+ end
49
+ end
50
+
51
+ describe '#document_count' do
52
+ it 'should be zero' do
53
+ model.document_count('xxx').should == 0
54
+ end
55
+ end
56
+
57
+ describe '#term_count' do
58
+ it 'should be zero' do
59
+ model.term_count('xxx').should == 0
60
+ end
61
+ end
62
+ end
63
+
64
+ context 'with documents' do
65
+ let :documents do
66
+ [
67
+ document, # 4 tokens
68
+ document_with_tokens, # 3 tokens
69
+ document_without_text, # 0 tokens
70
+ document_with_term_counts, # 15 tokens
71
+ ]
72
+ end
73
+
74
+ let :model do
75
+ TfIdfSimilarity::TermCountModel.new(documents, :library => MATRIX_LIBRARY)
76
+ end
77
+
78
+ describe '#documents' do
79
+ it 'should return the documents' do
80
+ model.documents.should == documents
81
+ end
82
+ end
83
+
84
+ describe '#terms' do
85
+ it 'should return the terms' do
86
+ model.terms.to_a.sort.should == ['bar', 'baz', 'foo', 'foo-foo']
87
+ end
88
+ end
89
+
90
+ describe '#average_document_size' do
91
+ it 'should return the average number of tokens in a document' do
92
+ model.average_document_size.should == 5.5
93
+ end
94
+ end
95
+
96
+ describe '#document_count' do
97
+ it 'should return the number of documents the term appears in' do
98
+ model.document_count('bar').should == 3
99
+ end
100
+ end
101
+
102
+ describe '#term_count' do
103
+ it 'should return the number of times the term appears in the corpus' do
104
+ model.term_count('bar').should == 9
105
+ end
106
+ end
107
+ end
108
+ end
@@ -0,0 +1,174 @@
1
+ require 'spec_helper'
2
+
3
+ describe TfIdfSimilarity::TfIdfModel do
4
+ let :text do
5
+ "FOO-foo BAR bar \r\n\t 123 !@#"
6
+ end
7
+
8
+ let :tokens do
9
+ ['FOO-foo', 'BAR', 'bar', "\r\n\t", '123', '!@#']
10
+ end
11
+
12
+ let :document_without_text do
13
+ TfIdfSimilarity::Document.new('')
14
+ end
15
+
16
+ let :document do
17
+ TfIdfSimilarity::Document.new(text)
18
+ end
19
+
20
+ let :document_with_tokens do
21
+ TfIdfSimilarity::Document.new(text, :tokens => tokens)
22
+ end
23
+
24
+ let :document_with_term_counts do
25
+ TfIdfSimilarity::Document.new(text, :term_counts => {'bar' => 5, 'baz' => 10})
26
+ end
27
+
28
+ let :non_corpus_document do
29
+ TfIdfSimilarity::Document.new('foo foo foo')
30
+ end
31
+
32
+ def similarity_matrix_values(model) # Returns the values of model.similarity_matrix as one array, whatever the matrix backend.
33
+ matrix = model.similarity_matrix
34
+ if MATRIX_LIBRARY == :nmatrix
35
+ matrix.each.to_a # NOTE(review): presumably the nmatrix result enumerates individual elements via #each - confirm
36
+ else
37
+ matrix.to_a.flatten # other backends: convert to nested arrays, then flatten into a single list
38
+ end
39
+ end
40
+
41
+ context 'without documents', :unless => lambda{MATRIX_LIBRARY == :gsl} do
42
+ let :model do
43
+ TfIdfSimilarity::TfIdfModel.new([], :library => MATRIX_LIBRARY)
44
+ end
45
+
46
+ describe '#documents' do
47
+ it 'should be empty' do
48
+ model.documents.should be_empty
49
+ end
50
+ end
51
+
52
+ describe '#terms' do
53
+ it 'should be empty' do
54
+ model.terms.should be_empty
55
+ end
56
+ end
57
+
58
+ describe '#inverse_document_frequency' do
59
+ it 'should return negative infinity' do
60
+ model.idf('foo').should == -1/0.0 # -Infinity
61
+ end
62
+ end
63
+
64
+ describe '#term_frequency' do
65
+ it 'should return the term frequency' do
66
+ model.tf(document, 'foo').should == Math.sqrt(2)
67
+ end
68
+ end
69
+
70
+ describe '#term_frequency_inverse_document_frequency' do
71
+ it 'should return negative infinity' do
72
+ model.tfidf(document, 'foo').should == -1/0.0 # -Infinity
73
+ end
74
+ end
75
+
76
+ describe '#similarity_matrix' do
77
+ it 'should be empty' do
78
+ similarity_matrix_values(model).should be_empty
79
+ end
80
+ end
81
+ end
82
+
83
+ context 'with documents' do
84
+ let :documents do
85
+ [
86
+ document,
87
+ document_with_tokens,
88
+ document_without_text,
89
+ document_with_term_counts,
90
+ ]
91
+ end
92
+
93
+ let :model do
94
+ TfIdfSimilarity::TfIdfModel.new(documents, :library => MATRIX_LIBRARY)
95
+ end
96
+
97
+ describe '#documents' do
98
+ it 'should return the documents' do
99
+ model.documents.should == documents
100
+ end
101
+ end
102
+
103
+ describe '#terms' do
104
+ it 'should return the terms' do
105
+ model.terms.to_a.sort.should == ['bar', 'baz', 'foo', 'foo-foo']
106
+ end
107
+ end
108
+
109
+ describe '#inverse_document_frequency' do
110
+ it 'should return the inverse document frequency' do
111
+ model.idf('foo').should be_within(0.001).of(1 + Math.log(2))
112
+ end
113
+
114
+ it 'should return the inverse document frequency of a non-occurring term' do
115
+ model.idf('xxx').should be_within(0.001).of(1 + Math.log(4))
116
+ end
117
+ end
118
+
119
+ describe '#term_frequency' do
120
+ it 'should return the term frequency if no tokens given' do
121
+ model.tf(document, 'foo').should == Math.sqrt(2)
122
+ end
123
+
124
+ it 'should return the term frequency if tokens given' do
125
+ model.tf(document_with_tokens, 'foo-foo').should == 1
126
+ end
127
+
128
+ it 'should return no term frequency if no text given' do
129
+ model.tf(document_without_text, 'foo').should == 0
130
+ end
131
+
132
+ it 'should return the term frequency if term counts given' do
133
+ model.tf(document_with_term_counts, 'bar').should == Math.sqrt(5)
134
+ end
135
+
136
+ it 'should return the term frequency of a non-occurring term' do
137
+ model.tf(document, 'xxx').should == 0
138
+ end
139
+
140
+ it 'should return the term frequency in a non-occurring document' do
141
+ model.tf(non_corpus_document, 'foo').should == Math.sqrt(3)
142
+ end
143
+ end
144
+
145
+ describe '#term_frequency_inverse_document_frequency' do
146
+ it 'should return the tf*idf' do
147
+ model.tfidf(document, 'foo').should be_within(0.001).of((1 + Math.log(2)) * Math.sqrt(2))
148
+ end
149
+
150
+ it 'should return the tf*idf of a non-occurring term' do
151
+ model.tfidf(document, 'xxx').should == 0
152
+ end
153
+
154
+ it 'should return the tf*idf in a non-occurring term' do
155
+ model.tfidf(non_corpus_document, 'foo').should be_within(0.001).of((1 + Math.log(2)) * Math.sqrt(3))
156
+ end
157
+ end
158
+
159
+ describe '#similarity_matrix' do
160
+ it 'should return the similarity matrix' do
161
+ expected = [
162
+ 1.0, 0.326, 0.0, 0.195,
163
+ 0.326, 1.0, 0.0, 0.247,
164
+ 0.0, 0.0, 0.0, 0.0,
165
+ 0.195, 0.247, 0.0, 1.0,
166
+ ]
167
+
168
+ similarity_matrix_values(model).each_with_index do |value,i|
169
+ value.should be_within(0.001).of(expected[i])
170
+ end
171
+ end
172
+ end
173
+ end
174
+ end