tf-idf-similarity 0.1.3 → 0.1.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.travis.yml +1 -8
- data/Gemfile +2 -2
- data/README.md +40 -9
- data/lib/tf-idf-similarity.rb +1 -0
- data/lib/tf-idf-similarity/bm25_model.rb +23 -62
- data/lib/tf-idf-similarity/document.rb +69 -67
- data/lib/tf-idf-similarity/extras/document.rb +10 -8
- data/lib/tf-idf-similarity/extras/tf_idf_model.rb +157 -155
- data/lib/tf-idf-similarity/matrix_methods.rb +137 -135
- data/lib/tf-idf-similarity/model.rb +66 -0
- data/lib/tf-idf-similarity/term_count_model.rb +59 -57
- data/lib/tf-idf-similarity/tf_idf_model.rb +21 -60
- data/lib/tf-idf-similarity/token.rb +39 -37
- data/lib/tf-idf-similarity/version.rb +1 -1
- data/spec/bm25_model_spec.rb +200 -0
- data/spec/document_spec.rb +98 -96
- data/spec/extras/tf_idf_model_spec.rb +224 -222
- data/spec/spec_helper.rb +6 -0
- data/spec/term_count_model_spec.rb +76 -74
- data/spec/tf_idf_model_spec.rb +143 -117
- data/spec/token_spec.rb +23 -21
- metadata +6 -2
data/spec/term_count_model_spec.rb
CHANGED
@@ -1,107 +1,109 @@
|
|
1
1
|
require 'spec_helper'
|
2
2
|
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
let :tokens do
|
9
|
-
['FOO-foo', 'BAR', 'bar', "\r\n\t", '123', '!@#']
|
10
|
-
end
|
3
|
+
module TfIdfSimilarity
|
4
|
+
describe TermCountModel do
|
5
|
+
let :text do
|
6
|
+
"FOO-foo BAR bar \r\n\t 123 !@#"
|
7
|
+
end
|
11
8
|
|
12
|
-
|
13
|
-
|
14
|
-
|
9
|
+
let :tokens do
|
10
|
+
['FOO-foo', 'BAR', 'bar', "\r\n\t", '123', '!@#']
|
11
|
+
end
|
15
12
|
|
16
|
-
|
17
|
-
|
18
|
-
|
13
|
+
let :document_without_text do
|
14
|
+
Document.new('')
|
15
|
+
end
|
19
16
|
|
20
|
-
|
21
|
-
|
22
|
-
|
17
|
+
let :document do
|
18
|
+
Document.new(text)
|
19
|
+
end
|
23
20
|
|
24
|
-
|
25
|
-
|
26
|
-
|
21
|
+
let :document_with_tokens do
|
22
|
+
Document.new(text, :tokens => tokens)
|
23
|
+
end
|
27
24
|
|
28
|
-
|
29
|
-
|
30
|
-
TfIdfSimilarity::TermCountModel.new([], :library => MATRIX_LIBRARY)
|
25
|
+
let :document_with_term_counts do
|
26
|
+
Document.new(text, :term_counts => {'bar' => 5, 'baz' => 10})
|
31
27
|
end
|
32
28
|
|
33
|
-
|
34
|
-
|
35
|
-
|
29
|
+
context 'without documents', :empty_matrix => true do
|
30
|
+
let :model do
|
31
|
+
TermCountModel.new([], :library => MATRIX_LIBRARY)
|
36
32
|
end
|
37
|
-
end
|
38
33
|
|
39
|
-
|
40
|
-
|
41
|
-
|
34
|
+
describe '#documents' do
|
35
|
+
it 'should be empty' do
|
36
|
+
model.documents.should be_empty
|
37
|
+
end
|
42
38
|
end
|
43
|
-
end
|
44
39
|
|
45
|
-
|
46
|
-
|
47
|
-
|
40
|
+
describe '#terms' do
|
41
|
+
it 'should be empty' do
|
42
|
+
model.terms.should be_empty
|
43
|
+
end
|
48
44
|
end
|
49
|
-
end
|
50
45
|
|
51
|
-
|
52
|
-
|
53
|
-
|
46
|
+
describe '#average_document_size' do
|
47
|
+
it 'should be zero' do
|
48
|
+
model.average_document_size.should == 0
|
49
|
+
end
|
54
50
|
end
|
55
|
-
end
|
56
51
|
|
57
|
-
|
58
|
-
|
59
|
-
|
52
|
+
describe '#document_count' do
|
53
|
+
it 'should be zero' do
|
54
|
+
model.document_count('xxx').should == 0
|
55
|
+
end
|
60
56
|
end
|
61
|
-
end
|
62
|
-
end
|
63
57
|
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
document_without_text, # 0 tokens
|
70
|
-
document_with_term_counts, # 15 tokens
|
71
|
-
]
|
58
|
+
describe '#term_count' do
|
59
|
+
it 'should be zero' do
|
60
|
+
model.term_count('xxx').should == 0
|
61
|
+
end
|
62
|
+
end
|
72
63
|
end
|
73
64
|
|
74
|
-
|
75
|
-
|
76
|
-
|
65
|
+
context 'with documents' do
|
66
|
+
let :documents do
|
67
|
+
[
|
68
|
+
document, # 4 tokens
|
69
|
+
document_with_tokens, # 3 tokens
|
70
|
+
document_without_text, # 0 tokens
|
71
|
+
document_with_term_counts, # 15 tokens
|
72
|
+
]
|
73
|
+
end
|
77
74
|
|
78
|
-
|
79
|
-
|
80
|
-
model.documents.should == documents
|
75
|
+
let :model do
|
76
|
+
TermCountModel.new(documents, :library => MATRIX_LIBRARY)
|
81
77
|
end
|
82
|
-
end
|
83
78
|
|
84
|
-
|
85
|
-
|
86
|
-
|
79
|
+
describe '#documents' do
|
80
|
+
it 'should return the documents' do
|
81
|
+
model.documents.should == documents
|
82
|
+
end
|
87
83
|
end
|
88
|
-
end
|
89
84
|
|
90
|
-
|
91
|
-
|
92
|
-
|
85
|
+
describe '#terms' do
|
86
|
+
it 'should return the terms' do
|
87
|
+
model.terms.to_a.sort.should == ['bar', 'baz', 'foo', 'foo-foo']
|
88
|
+
end
|
93
89
|
end
|
94
|
-
end
|
95
90
|
|
96
|
-
|
97
|
-
|
98
|
-
|
91
|
+
describe '#average_document_size' do
|
92
|
+
it 'should return the average number of tokens in a document' do
|
93
|
+
model.average_document_size.should == 5.5
|
94
|
+
end
|
95
|
+
end
|
96
|
+
|
97
|
+
describe '#document_count' do
|
98
|
+
it 'should return the number of documents the term appears in' do
|
99
|
+
model.document_count('bar').should == 3
|
100
|
+
end
|
99
101
|
end
|
100
|
-
end
|
101
102
|
|
102
|
-
|
103
|
-
|
104
|
-
|
103
|
+
describe '#term_count' do
|
104
|
+
it 'should return the number of times the term appears in the corpus' do
|
105
|
+
model.term_count('bar').should == 9
|
106
|
+
end
|
105
107
|
end
|
106
108
|
end
|
107
109
|
end
|
data/spec/tf_idf_model_spec.rb
CHANGED
@@ -1,172 +1,198 @@
|
|
1
1
|
require 'spec_helper'
|
2
2
|
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
let :tokens do
|
9
|
-
['FOO-foo', 'BAR', 'bar', "\r\n\t", '123', '!@#']
|
10
|
-
end
|
11
|
-
|
12
|
-
let :document_without_text do
|
13
|
-
TfIdfSimilarity::Document.new('')
|
14
|
-
end
|
15
|
-
|
16
|
-
let :document do
|
17
|
-
TfIdfSimilarity::Document.new(text)
|
18
|
-
end
|
19
|
-
|
20
|
-
let :document_with_tokens do
|
21
|
-
TfIdfSimilarity::Document.new(text, :tokens => tokens)
|
22
|
-
end
|
3
|
+
module TfIdfSimilarity
|
4
|
+
describe TfIdfModel do
|
5
|
+
let :text do
|
6
|
+
"FOO-foo BAR bar \r\n\t 123 !@#"
|
7
|
+
end
|
23
8
|
|
24
|
-
|
25
|
-
|
26
|
-
|
9
|
+
let :tokens do
|
10
|
+
['FOO-foo', 'BAR', 'bar', "\r\n\t", '123', '!@#']
|
11
|
+
end
|
27
12
|
|
28
|
-
|
29
|
-
|
30
|
-
|
13
|
+
let :document_without_text do
|
14
|
+
Document.new('')
|
15
|
+
end
|
31
16
|
|
32
|
-
|
33
|
-
|
34
|
-
if MATRIX_LIBRARY == :nmatrix
|
35
|
-
matrix.each.to_a
|
36
|
-
else
|
37
|
-
matrix.to_a.flatten
|
17
|
+
let :document do
|
18
|
+
Document.new(text)
|
38
19
|
end
|
39
|
-
end
|
40
20
|
|
41
|
-
|
42
|
-
|
43
|
-
TfIdfSimilarity::TfIdfModel.new([], :library => MATRIX_LIBRARY)
|
21
|
+
let :document_with_tokens do
|
22
|
+
Document.new(text, :tokens => tokens)
|
44
23
|
end
|
45
24
|
|
46
|
-
|
47
|
-
|
48
|
-
model.documents.should be_empty
|
49
|
-
end
|
25
|
+
let :document_with_term_counts do
|
26
|
+
Document.new(text, :term_counts => {'bar' => 5, 'baz' => 10})
|
50
27
|
end
|
51
28
|
|
52
|
-
|
53
|
-
|
54
|
-
model.terms.should be_empty
|
55
|
-
end
|
29
|
+
let :non_corpus_document do
|
30
|
+
Document.new('foo foo foo')
|
56
31
|
end
|
57
32
|
|
58
|
-
|
59
|
-
|
60
|
-
|
33
|
+
def similarity_matrix_values(model)
|
34
|
+
matrix = model.similarity_matrix
|
35
|
+
if MATRIX_LIBRARY == :nmatrix
|
36
|
+
matrix.each.to_a
|
37
|
+
else
|
38
|
+
matrix.to_a.flatten
|
61
39
|
end
|
62
40
|
end
|
63
41
|
|
64
|
-
|
65
|
-
|
66
|
-
|
42
|
+
context 'without documents', :empty_matrix => true do
|
43
|
+
let :model do
|
44
|
+
TfIdfModel.new([], :library => MATRIX_LIBRARY)
|
67
45
|
end
|
68
|
-
end
|
69
46
|
|
70
|
-
|
71
|
-
|
72
|
-
|
47
|
+
describe '#documents' do
|
48
|
+
it 'should be empty' do
|
49
|
+
model.documents.should be_empty
|
50
|
+
end
|
73
51
|
end
|
74
|
-
end
|
75
52
|
|
76
|
-
|
77
|
-
|
78
|
-
|
53
|
+
describe '#document_index' do
|
54
|
+
it 'should return nil' do
|
55
|
+
model.document_index(document).should be_nil
|
56
|
+
end
|
79
57
|
end
|
80
|
-
end
|
81
|
-
end
|
82
58
|
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
document_without_text,
|
89
|
-
document_with_term_counts,
|
90
|
-
]
|
91
|
-
end
|
59
|
+
describe '#text_index' do
|
60
|
+
it 'should return nil' do
|
61
|
+
model.text_index(text).should be_nil
|
62
|
+
end
|
63
|
+
end
|
92
64
|
|
93
|
-
|
94
|
-
|
95
|
-
|
65
|
+
describe '#terms' do
|
66
|
+
it 'should be empty' do
|
67
|
+
model.terms.should be_empty
|
68
|
+
end
|
69
|
+
end
|
96
70
|
|
97
|
-
|
98
|
-
|
99
|
-
|
71
|
+
describe '#inverse_document_frequency' do
|
72
|
+
it 'should return negative infinity' do
|
73
|
+
model.idf('foo').should == -1/0.0 # -Infinity
|
74
|
+
end
|
100
75
|
end
|
101
|
-
end
|
102
76
|
|
103
|
-
|
104
|
-
|
105
|
-
|
77
|
+
describe '#term_frequency' do
|
78
|
+
it 'should return the term frequency' do
|
79
|
+
model.tf(document, 'foo').should == Math.sqrt(2)
|
80
|
+
end
|
106
81
|
end
|
107
|
-
end
|
108
82
|
|
109
|
-
|
110
|
-
|
111
|
-
|
83
|
+
describe '#term_frequency_inverse_document_frequency' do
|
84
|
+
it 'should return negative infinity' do
|
85
|
+
model.tfidf(document, 'foo').should == -1/0.0 # -Infinity
|
86
|
+
end
|
112
87
|
end
|
113
88
|
|
114
|
-
|
115
|
-
|
89
|
+
describe '#similarity_matrix' do
|
90
|
+
it 'should be empty' do
|
91
|
+
similarity_matrix_values(model).should be_empty
|
92
|
+
end
|
116
93
|
end
|
117
94
|
end
|
118
95
|
|
119
|
-
|
120
|
-
|
121
|
-
|
96
|
+
context 'with documents' do
|
97
|
+
let :documents do
|
98
|
+
[
|
99
|
+
document,
|
100
|
+
document_with_tokens,
|
101
|
+
document_without_text,
|
102
|
+
document_with_term_counts,
|
103
|
+
]
|
122
104
|
end
|
123
105
|
|
124
|
-
|
125
|
-
|
106
|
+
let :model do
|
107
|
+
TfIdfModel.new(documents, :library => MATRIX_LIBRARY)
|
126
108
|
end
|
127
109
|
|
128
|
-
|
129
|
-
|
110
|
+
describe '#documents' do
|
111
|
+
it 'should return the documents' do
|
112
|
+
model.documents.should == documents
|
113
|
+
end
|
130
114
|
end
|
131
115
|
|
132
|
-
|
133
|
-
|
116
|
+
describe '#document_index' do
|
117
|
+
it 'should return the index' do
|
118
|
+
model.document_index(document).should == 0
|
119
|
+
end
|
134
120
|
end
|
135
121
|
|
136
|
-
|
137
|
-
|
122
|
+
describe '#text_index' do
|
123
|
+
it 'should return the index' do
|
124
|
+
model.text_index(text).should == 0
|
125
|
+
end
|
138
126
|
end
|
139
127
|
|
140
|
-
|
141
|
-
|
128
|
+
describe '#terms' do
|
129
|
+
it 'should return the terms' do
|
130
|
+
model.terms.to_a.sort.should == ['bar', 'baz', 'foo', 'foo-foo']
|
131
|
+
end
|
142
132
|
end
|
143
|
-
end
|
144
133
|
|
145
|
-
|
146
|
-
|
147
|
-
|
134
|
+
describe '#inverse_document_frequency' do
|
135
|
+
it 'should return the inverse document frequency' do
|
136
|
+
model.idf('foo').should be_within(0.001).of(1 + Math.log(4 / (1 + 1.0)))
|
137
|
+
end
|
138
|
+
|
139
|
+
it 'should return the inverse document frequency of a non-occurring term' do
|
140
|
+
model.idf('xxx').should be_within(0.001).of(1 + Math.log(4 / (0 + 1.0)))
|
141
|
+
end
|
148
142
|
end
|
149
143
|
|
150
|
-
|
151
|
-
|
144
|
+
describe '#term_frequency' do
|
145
|
+
it 'should return the term frequency if no tokens given' do
|
146
|
+
model.tf(document, 'foo').should == Math.sqrt(2)
|
147
|
+
end
|
148
|
+
|
149
|
+
it 'should return the term frequency if tokens given' do
|
150
|
+
model.tf(document_with_tokens, 'foo-foo').should == 1
|
151
|
+
end
|
152
|
+
|
153
|
+
it 'should return no term frequency if no text given' do
|
154
|
+
model.tf(document_without_text, 'foo').should == 0
|
155
|
+
end
|
156
|
+
|
157
|
+
it 'should return the term frequency if term counts given' do
|
158
|
+
model.tf(document_with_term_counts, 'bar').should == Math.sqrt(5)
|
159
|
+
end
|
160
|
+
|
161
|
+
it 'should return the term frequency of a non-occurring term' do
|
162
|
+
model.tf(document, 'xxx').should == 0
|
163
|
+
end
|
164
|
+
|
165
|
+
it 'should return the term frequency in a non-occurring document' do
|
166
|
+
model.tf(non_corpus_document, 'foo').should == Math.sqrt(3)
|
167
|
+
end
|
152
168
|
end
|
153
169
|
|
154
|
-
|
155
|
-
|
170
|
+
describe '#term_frequency_inverse_document_frequency' do
|
171
|
+
it 'should return the tf*idf' do
|
172
|
+
model.tfidf(document, 'foo').should be_within(0.001).of((1 + Math.log(4 / (1 + 1.0))) * Math.sqrt(2))
|
173
|
+
end
|
174
|
+
|
175
|
+
it 'should return the tf*idf of a non-occurring term' do
|
176
|
+
model.tfidf(document, 'xxx').should == 0
|
177
|
+
end
|
178
|
+
|
179
|
+
it 'should return the tf*idf in a non-occurring term' do
|
180
|
+
model.tfidf(non_corpus_document, 'foo').should be_within(0.001).of((1 + Math.log(4 / (1 + 1.0))) * Math.sqrt(3))
|
181
|
+
end
|
156
182
|
end
|
157
|
-
end
|
158
183
|
|
159
|
-
|
160
|
-
|
161
|
-
|
162
|
-
|
163
|
-
|
164
|
-
|
165
|
-
|
166
|
-
|
184
|
+
describe '#similarity_matrix' do
|
185
|
+
it 'should return the similarity matrix' do
|
186
|
+
expected = [
|
187
|
+
1.0, 0.326, 0.0, 0.195,
|
188
|
+
0.326, 1.0, 0.0, 0.247,
|
189
|
+
0.0, 0.0, 0.0, 0.0,
|
190
|
+
0.195, 0.247, 0.0, 1.0,
|
191
|
+
]
|
167
192
|
|
168
|
-
|
169
|
-
|
193
|
+
similarity_matrix_values(model).each_with_index do |value,i|
|
194
|
+
value.should be_within(0.001).of(expected[i])
|
195
|
+
end
|
170
196
|
end
|
171
197
|
end
|
172
198
|
end
|