tf-idf-similarity 0.1.3 → 0.1.4
This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the differences between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.travis.yml +1 -8
- data/Gemfile +2 -2
- data/README.md +40 -9
- data/lib/tf-idf-similarity.rb +1 -0
- data/lib/tf-idf-similarity/bm25_model.rb +23 -62
- data/lib/tf-idf-similarity/document.rb +69 -67
- data/lib/tf-idf-similarity/extras/document.rb +10 -8
- data/lib/tf-idf-similarity/extras/tf_idf_model.rb +157 -155
- data/lib/tf-idf-similarity/matrix_methods.rb +137 -135
- data/lib/tf-idf-similarity/model.rb +66 -0
- data/lib/tf-idf-similarity/term_count_model.rb +59 -57
- data/lib/tf-idf-similarity/tf_idf_model.rb +21 -60
- data/lib/tf-idf-similarity/token.rb +39 -37
- data/lib/tf-idf-similarity/version.rb +1 -1
- data/spec/bm25_model_spec.rb +200 -0
- data/spec/document_spec.rb +98 -96
- data/spec/extras/tf_idf_model_spec.rb +224 -222
- data/spec/spec_helper.rb +6 -0
- data/spec/term_count_model_spec.rb +76 -74
- data/spec/tf_idf_model_spec.rb +143 -117
- data/spec/token_spec.rb +23 -21
- metadata +6 -2
@@ -3,267 +3,269 @@ require 'spec_helper'
|
|
3
3
|
require 'tf-idf-similarity/extras/document'
|
4
4
|
require 'tf-idf-similarity/extras/tf_idf_model'
|
5
5
|
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
6
|
+
module TfIdfSimilarity
|
7
|
+
describe TfIdfModel do
|
8
|
+
def build_document(text, opts = {})
|
9
|
+
Document.new(text, opts)
|
10
|
+
end
|
11
|
+
|
12
|
+
def build_model(documents)
|
13
|
+
TfIdfModel.new(documents, :library => MATRIX_LIBRARY)
|
14
|
+
end
|
15
|
+
|
16
|
+
# @see https://github.com/josephwilk/rsemantic/blob/master/spec/semantic/transform/tf_idf_transform_spec.rb
|
17
|
+
# No relevant tests to reproduce.
|
18
|
+
|
19
|
+
# @see https://github.com/mkdynamic/vss/blob/master/test/test.rb
|
20
|
+
context 'comparing to vss gem' do
|
21
|
+
let :documents do
|
22
|
+
[ "I'm not even going to mention any TV series.",
|
23
|
+
"The Wire is the best thing ever. Fact.",
|
24
|
+
"Some would argue that Lost got a bit too wierd after season 2.",
|
25
|
+
"Lost is surely not in the same league as The Wire.",
|
26
|
+
"You cannot compare the The Wire and Lost.",
|
27
|
+
].map do |text|
|
28
|
+
build_document(text)
|
29
|
+
end
|
30
|
+
end
|
17
31
|
|
18
|
-
|
19
|
-
|
20
|
-
let :documents do
|
21
|
-
[ "I'm not even going to mention any TV series.",
|
22
|
-
"The Wire is the best thing ever. Fact.",
|
23
|
-
"Some would argue that Lost got a bit too wierd after season 2.",
|
24
|
-
"Lost is surely not in the same league as The Wire.",
|
25
|
-
"You cannot compare the The Wire and Lost.",
|
26
|
-
].map do |text|
|
27
|
-
build_document(text)
|
32
|
+
let :model do
|
33
|
+
build_model(documents)
|
28
34
|
end
|
29
|
-
end
|
30
35
|
|
31
|
-
|
32
|
-
build_model(documents)
|
36
|
+
pending "Add #search"
|
33
37
|
end
|
34
38
|
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
let :document do
|
43
|
-
TfIdfSimilarity::Document.new('cow cow cow horse horse elephant')
|
44
|
-
end
|
39
|
+
# @see https://github.com/bbcrd/Similarity/blob/master/test/test_corpus.rb
|
40
|
+
# @see https://github.com/bbcrd/Similarity/blob/master/test/test_document.rb
|
41
|
+
# @see https://github.com/bbcrd/Similarity/blob/master/test/test_term_document_matrix.rb
|
42
|
+
context 'comparing to similarity gem' do
|
43
|
+
let :document do
|
44
|
+
Document.new('cow cow cow horse horse elephant')
|
45
|
+
end
|
45
46
|
|
46
|
-
|
47
|
-
|
48
|
-
|
47
|
+
def build_model_from_text(*texts)
|
48
|
+
build_model(texts.map{|text| build_document(text)})
|
49
|
+
end
|
49
50
|
|
50
|
-
|
51
|
-
|
52
|
-
|
51
|
+
let :model_a do
|
52
|
+
build_model_from_text("cow horse sheep", "horse bird dog")
|
53
|
+
end
|
53
54
|
|
54
|
-
|
55
|
-
|
56
|
-
|
55
|
+
let :model_b do
|
56
|
+
build_model_from_text("cow cow cow bird", "horse horse horse bird")
|
57
|
+
end
|
57
58
|
|
58
|
-
|
59
|
-
|
60
|
-
|
59
|
+
let :model_c do
|
60
|
+
build_model_from_text("cow cow cow", "horse horse horse")
|
61
|
+
end
|
61
62
|
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
63
|
+
# Normalizes to the number of tokens in the document.
|
64
|
+
# @see https://github.com/bbcrd/Similarity/blob/master/lib/similarity/document.rb#L42
|
65
|
+
def tf(term)
|
66
|
+
document.term_count(term) / document.size.to_f
|
67
|
+
end
|
67
68
|
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
69
|
+
# Does not add one to the inverse document frequency.
|
70
|
+
# @see https://github.com/bbcrd/Similarity/blob/master/lib/similarity/corpus.rb#L44
|
71
|
+
def idf(model, term)
|
72
|
+
model.plain_idf(term, 0, 1)
|
73
|
+
end
|
73
74
|
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
75
|
+
it 'should return the terms' do
|
76
|
+
[ "the quick brown fox",
|
77
|
+
"the quick brown fox",
|
78
|
+
"The Quick Brown Fox",
|
79
|
+
'The, Quick! Brown. "Fox"',
|
80
|
+
].each do |text|
|
81
|
+
build_document(text).terms.sort.should == ["brown", "fox", "quick", "the"]
|
82
|
+
end
|
81
83
|
end
|
82
|
-
end
|
83
84
|
|
84
|
-
|
85
|
-
|
86
|
-
|
85
|
+
it 'should return the number of documents' do
|
86
|
+
model_a.documents.size.should == 2
|
87
|
+
end
|
87
88
|
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
89
|
+
it 'should return the number of terms' do
|
90
|
+
document.terms.size.should == 3
|
91
|
+
model_a.terms.size.should == 5
|
92
|
+
end
|
92
93
|
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
94
|
+
it 'should return the term frequency' do
|
95
|
+
tf('cow').should == 0.5
|
96
|
+
tf('horse').should be_within(0.001).of(0.333)
|
97
|
+
tf('sheep').should == 0
|
98
|
+
end
|
98
99
|
|
99
|
-
|
100
|
-
|
101
|
-
|
100
|
+
it 'should return the similarity matrix' do
|
101
|
+
pending "Calculate the tf*idf matrix like the similarity gem does"
|
102
|
+
end
|
102
103
|
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
104
|
+
it 'should return the number of documents in which a term appears' do
|
105
|
+
model_b.document_count('cow').should == 1
|
106
|
+
model_b.document_count('horse').should == 1
|
107
|
+
model_b.document_count('bird').should == 2
|
108
|
+
end
|
108
109
|
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
110
|
+
it 'should return the inverse document frequency' do
|
111
|
+
idf(model_c, 'cow').should be_within(0.001).of(0.0)
|
112
|
+
idf(model_c, 'bird').should be_within(0.001).of(0.693)
|
113
|
+
end
|
113
114
|
|
114
|
-
|
115
|
-
|
115
|
+
it 'should return the document vector' do
|
116
|
+
pending "Calculate the tf*idf matrix like the similarity gem does"
|
117
|
+
end
|
116
118
|
end
|
117
|
-
end
|
118
119
|
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
120
|
+
# @see https://github.com/mchung/tf-idf/blob/master/spec/tf-idf_spec.rb
|
121
|
+
context 'comparing to tf-idf gem' do
|
122
|
+
# Normalizes to the number of unique tokens (terms) in the document.
|
123
|
+
# @see https://github.com/mchung/tf-idf/blob/master/lib/tf-idf.rb#L172
|
124
|
+
|
125
|
+
let :corpus_a do
|
126
|
+
1.upto(50).map do |n|
|
127
|
+
text = []
|
128
|
+
text << 'the' if n <= 23
|
129
|
+
text << 'a' if n <= 17
|
130
|
+
text << 'said' if n <= 5
|
131
|
+
text << 'phone' if n <= 2
|
132
|
+
text << 'girl' if n <= 1
|
133
|
+
text << 'moon' if n <= 1
|
134
|
+
build_document(text * ' ')
|
135
|
+
end
|
134
136
|
end
|
135
|
-
end
|
136
137
|
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
|
138
|
+
let :corpus_b do
|
139
|
+
1.upto(50).map do |n|
|
140
|
+
text = []
|
141
|
+
text << 'the' if n <= 23
|
142
|
+
text << 'a' if n <= 17
|
143
|
+
text << 'said' if n <= 5
|
144
|
+
text << 'phone' if n <= 2
|
145
|
+
text << 'girl' if n <= 1
|
146
|
+
build_document(text * ' ')
|
147
|
+
end
|
146
148
|
end
|
147
|
-
end
|
148
149
|
|
149
|
-
|
150
|
-
|
151
|
-
|
150
|
+
let :model_a do
|
151
|
+
build_model(corpus_a)
|
152
|
+
end
|
152
153
|
|
153
|
-
|
154
|
-
|
155
|
-
|
154
|
+
let :model_b do
|
155
|
+
build_model(corpus_b)
|
156
|
+
end
|
156
157
|
|
157
|
-
|
158
|
-
|
159
|
-
|
158
|
+
it 'should return the number of documents' do
|
159
|
+
model_a.documents.size.should == 50
|
160
|
+
end
|
160
161
|
|
161
|
-
|
162
|
-
|
163
|
-
|
162
|
+
it 'should return the number of terms' do
|
163
|
+
model_a.terms.size.should == 6
|
164
|
+
end
|
164
165
|
|
165
|
-
|
166
|
-
|
167
|
-
|
168
|
-
|
169
|
-
|
170
|
-
|
171
|
-
|
172
|
-
|
173
|
-
|
174
|
-
|
175
|
-
|
176
|
-
|
177
|
-
|
178
|
-
|
179
|
-
|
180
|
-
|
181
|
-
|
182
|
-
|
183
|
-
|
184
|
-
|
185
|
-
|
186
|
-
|
187
|
-
|
188
|
-
|
189
|
-
|
190
|
-
|
191
|
-
|
192
|
-
|
193
|
-
|
194
|
-
|
195
|
-
|
196
|
-
|
197
|
-
|
198
|
-
|
199
|
-
|
200
|
-
|
201
|
-
|
202
|
-
|
203
|
-
|
204
|
-
|
205
|
-
|
206
|
-
|
207
|
-
|
208
|
-
|
209
|
-
|
210
|
-
|
211
|
-
|
212
|
-
|
213
|
-
|
214
|
-
|
215
|
-
|
216
|
-
|
217
|
-
|
218
|
-
|
219
|
-
|
220
|
-
|
221
|
-
|
222
|
-
|
223
|
-
|
224
|
-
|
166
|
+
# Adds one to the numerator when calculating inverse document frequency.
|
167
|
+
# Sets a default inverse document frequency for non-occurring terms.
|
168
|
+
# @note The tf-idf gem has a #doc_keywords method for non-corpus documents.
|
169
|
+
# @see https://github.com/mchung/tf-idf/blob/master/lib/tf-idf.rb#L153
|
170
|
+
it 'should return the inverse document frequency' do
|
171
|
+
# should query IDF for nonexistent terms
|
172
|
+
default = model_a.plain_idf('xxx', 1, 1)
|
173
|
+
model_a.plain_idf('nonexistent', 1, 1).should == default
|
174
|
+
model_a.plain_idf('THE', 1, 1).should == default
|
175
|
+
|
176
|
+
# should query IDF for existent terms
|
177
|
+
model_a.plain_idf('a', 1, 1).should > model_a.plain_idf('the', 1, 1)
|
178
|
+
model_a.plain_idf('girl', 1, 1).should == model_a.plain_idf('moon', 1, 1)
|
179
|
+
|
180
|
+
# should add input documents to an existing corpus
|
181
|
+
model_a.plain_idf('water', 1, 1).should == default
|
182
|
+
model_a.plain_idf('moon', 1, 1).should be_within(0.001).of(3.238) # 3.23867845216438
|
183
|
+
model_a.plain_idf('said', 1, 1).should be_within(0.001).of(2.140) # 2.14006616349627
|
184
|
+
|
185
|
+
model = build_model(corpus_a + [build_document('water moon')])
|
186
|
+
|
187
|
+
model.plain_idf('water', 1, 1).should be_within(0.001).of(3.258) # 3.25809653802148
|
188
|
+
model.plain_idf('moon', 1, 1).should be_within(0.001).of(2.852) # 2.85263142991332
|
189
|
+
model.plain_idf('said', 1, 1).should be_within(0.001).of(2.159) # 2.15948424935337
|
190
|
+
|
191
|
+
# should add input documents to an empty corpus
|
192
|
+
unless MATRIX_LIBRARY == :gsl
|
193
|
+
model_c = build_model([])
|
194
|
+
|
195
|
+
default = model_c.plain_idf('xxx', 1, 1)
|
196
|
+
model_c.plain_idf('moon', 1, 1).should == default
|
197
|
+
model_c.plain_idf('water', 1, 1).should == default
|
198
|
+
model_c.plain_idf('said', 1, 1).should == default
|
199
|
+
end
|
200
|
+
|
201
|
+
model_d = build_model([
|
202
|
+
build_document('moon'),
|
203
|
+
build_document('moon said hello'),
|
204
|
+
])
|
205
|
+
|
206
|
+
default = model_d.plain_idf('xxx', 1, 1)
|
207
|
+
model_d.plain_idf('water', 1, 1).should == default
|
208
|
+
model_d.plain_idf('said', 1, 1).should be_within(0.001).of(0.405) # 0.405465108108164
|
209
|
+
model_d.plain_idf('moon', 1, 1).should == 0 # 0
|
210
|
+
|
211
|
+
# should observe stopwords list
|
212
|
+
default = model_b.plain_idf('xxx', 1, 1)
|
213
|
+
model_b.plain_idf('water', 1, 1).should == default
|
214
|
+
model_b.plain_idf('moon', 1, 1).should == default # returns 0 for stopwords
|
215
|
+
model_b.plain_idf('said', 1, 1).should be_within(0.001).of(2.140) # 2.14006616349627
|
216
|
+
|
217
|
+
model_e = build_model(corpus_b + [
|
218
|
+
build_document('moon', :tokens => %w()),
|
219
|
+
build_document('moon and water', :tokens => %w(and water)),
|
220
|
+
])
|
221
|
+
|
222
|
+
default = model_e.plain_idf('xxx', 1, 1)
|
223
|
+
model_e.plain_idf('water', 1, 1).should be_within(0.001).of(3.277) # 3.27714473299218
|
224
|
+
model_e.plain_idf('moon', 1, 1).should == default # returns 0 for stopwords
|
225
|
+
model_e.plain_idf('said', 1, 1).should be_within(0.001).of(2.178) # 2.17853244432407
|
226
|
+
end
|
225
227
|
end
|
226
|
-
end
|
227
228
|
|
228
|
-
|
229
|
-
|
230
|
-
|
231
|
-
|
232
|
-
|
229
|
+
# @see https://github.com/reddavis/TF-IDF/blob/master/spec/tf_idf_spec.rb
|
230
|
+
context 'comparing to tf_idf gem' do
|
231
|
+
let :one do
|
232
|
+
build_document('a a a a a a a a b b')
|
233
|
+
end
|
233
234
|
|
234
|
-
|
235
|
-
|
236
|
-
|
235
|
+
let :two do
|
236
|
+
build_document('a a')
|
237
|
+
end
|
237
238
|
|
238
|
-
|
239
|
-
|
240
|
-
|
239
|
+
let :model do
|
240
|
+
build_model([one, two])
|
241
|
+
end
|
241
242
|
|
242
|
-
|
243
|
-
|
244
|
-
|
245
|
-
|
246
|
-
|
243
|
+
# Normalizes to the number of tokens in the document.
|
244
|
+
# @see https://github.com/reddavis/TF-IDF/blob/master/lib/tf_idf.rb#L76
|
245
|
+
def tf
|
246
|
+
one.term_count('b') / one.size.to_f
|
247
|
+
end
|
247
248
|
|
248
|
-
|
249
|
-
|
250
|
-
|
251
|
-
|
252
|
-
|
249
|
+
# Performs plain inverse document frequency with base 10.
|
250
|
+
# @see https://github.com/reddavis/TF-IDF/blob/master/lib/tf_idf.rb#L50
|
251
|
+
def idf
|
252
|
+
model.plain_idf('b') / Math.log(10)
|
253
|
+
end
|
253
254
|
|
254
|
-
|
255
|
-
|
256
|
-
|
257
|
-
|
255
|
+
it 'should return the term frequency' do
|
256
|
+
tf.should == 0.2
|
257
|
+
model.tf(one, 'b').should be_within(0.001).of(1.414)
|
258
|
+
end
|
258
259
|
|
259
|
-
|
260
|
-
|
261
|
-
|
262
|
-
|
260
|
+
it 'should return the inverse document frequency' do
|
261
|
+
idf.should be_within(0.001).of(0.301) # 0.30102999
|
262
|
+
model.idf('b').should == 1
|
263
|
+
end
|
263
264
|
|
264
|
-
|
265
|
-
|
266
|
-
|
265
|
+
it 'should return the tf*idf' do
|
266
|
+
(tf * idf).should be_within(0.001).of(0.060) # 0.0602
|
267
|
+
model.tfidf(one, 'b').should be_within(0.001).of(1.414)
|
268
|
+
end
|
267
269
|
end
|
268
270
|
end
|
269
271
|
end
|