tf-idf-similarity 0.1.3 → 0.1.4

Sign up to get free protection for your applications and to get access to all the features.
@@ -19,3 +19,9 @@ when :nmatrix
19
19
  else
20
20
  require 'matrix'
21
21
  end
22
+
23
+ RSpec.configure do |c|
24
+ if MATRIX_LIBRARY == :gsl # GSL can't initialize an empty matrix
25
+ c.filter_run_excluding :empty_matrix => true
26
+ end
27
+ end
@@ -1,107 +1,109 @@
1
1
  require 'spec_helper'
2
2
 
3
- describe TfIdfSimilarity::TermCountModel do
4
- let :text do
5
- "FOO-foo BAR bar \r\n\t 123 !@#"
6
- end
7
-
8
- let :tokens do
9
- ['FOO-foo', 'BAR', 'bar', "\r\n\t", '123', '!@#']
10
- end
3
+ module TfIdfSimilarity
4
+ describe TermCountModel do
5
+ let :text do
6
+ "FOO-foo BAR bar \r\n\t 123 !@#"
7
+ end
11
8
 
12
- let :document_without_text do
13
- TfIdfSimilarity::Document.new('')
14
- end
9
+ let :tokens do
10
+ ['FOO-foo', 'BAR', 'bar', "\r\n\t", '123', '!@#']
11
+ end
15
12
 
16
- let :document do
17
- TfIdfSimilarity::Document.new(text)
18
- end
13
+ let :document_without_text do
14
+ Document.new('')
15
+ end
19
16
 
20
- let :document_with_tokens do
21
- TfIdfSimilarity::Document.new(text, :tokens => tokens)
22
- end
17
+ let :document do
18
+ Document.new(text)
19
+ end
23
20
 
24
- let :document_with_term_counts do
25
- TfIdfSimilarity::Document.new(text, :term_counts => {'bar' => 5, 'baz' => 10})
26
- end
21
+ let :document_with_tokens do
22
+ Document.new(text, :tokens => tokens)
23
+ end
27
24
 
28
- context 'without documents', :unless => lambda{MATRIX_LIBRARY == :gsl} do
29
- let :model do
30
- TfIdfSimilarity::TermCountModel.new([], :library => MATRIX_LIBRARY)
25
+ let :document_with_term_counts do
26
+ Document.new(text, :term_counts => {'bar' => 5, 'baz' => 10})
31
27
  end
32
28
 
33
- describe '#documents' do
34
- it 'should be empty' do
35
- model.documents.should be_empty
29
+ context 'without documents', :empty_matrix => true do
30
+ let :model do
31
+ TermCountModel.new([], :library => MATRIX_LIBRARY)
36
32
  end
37
- end
38
33
 
39
- describe '#terms' do
40
- it 'should be empty' do
41
- model.terms.should be_empty
34
+ describe '#documents' do
35
+ it 'should be empty' do
36
+ model.documents.should be_empty
37
+ end
42
38
  end
43
- end
44
39
 
45
- describe '#average_document_size' do
46
- it 'should be zero' do
47
- model.average_document_size.should == 0
40
+ describe '#terms' do
41
+ it 'should be empty' do
42
+ model.terms.should be_empty
43
+ end
48
44
  end
49
- end
50
45
 
51
- describe '#document_count' do
52
- it 'should be zero' do
53
- model.document_count('xxx').should == 0
46
+ describe '#average_document_size' do
47
+ it 'should be zero' do
48
+ model.average_document_size.should == 0
49
+ end
54
50
  end
55
- end
56
51
 
57
- describe '#term_count' do
58
- it 'should be zero' do
59
- model.term_count('xxx').should == 0
52
+ describe '#document_count' do
53
+ it 'should be zero' do
54
+ model.document_count('xxx').should == 0
55
+ end
60
56
  end
61
- end
62
- end
63
57
 
64
- context 'with documents' do
65
- let :documents do
66
- [
67
- document, # 4 tokens
68
- document_with_tokens, # 3 tokens
69
- document_without_text, # 0 tokens
70
- document_with_term_counts, # 15 tokens
71
- ]
58
+ describe '#term_count' do
59
+ it 'should be zero' do
60
+ model.term_count('xxx').should == 0
61
+ end
62
+ end
72
63
  end
73
64
 
74
- let :model do
75
- TfIdfSimilarity::TermCountModel.new(documents, :library => MATRIX_LIBRARY)
76
- end
65
+ context 'with documents' do
66
+ let :documents do
67
+ [
68
+ document, # 4 tokens
69
+ document_with_tokens, # 3 tokens
70
+ document_without_text, # 0 tokens
71
+ document_with_term_counts, # 15 tokens
72
+ ]
73
+ end
77
74
 
78
- describe '#documents' do
79
- it 'should return the documents' do
80
- model.documents.should == documents
75
+ let :model do
76
+ TermCountModel.new(documents, :library => MATRIX_LIBRARY)
81
77
  end
82
- end
83
78
 
84
- describe '#terms' do
85
- it 'should return the terms' do
86
- model.terms.to_a.sort.should == ['bar', 'baz', 'foo', 'foo-foo']
79
+ describe '#documents' do
80
+ it 'should return the documents' do
81
+ model.documents.should == documents
82
+ end
87
83
  end
88
- end
89
84
 
90
- describe '#average_document_size' do
91
- it 'should return the average number of tokens in a document' do
92
- model.average_document_size.should == 5.5
85
+ describe '#terms' do
86
+ it 'should return the terms' do
87
+ model.terms.to_a.sort.should == ['bar', 'baz', 'foo', 'foo-foo']
88
+ end
93
89
  end
94
- end
95
90
 
96
- describe '#document_count' do
97
- it 'should return the number of documents the term appears in' do
98
- model.document_count('bar').should == 3
91
+ describe '#average_document_size' do
92
+ it 'should return the average number of tokens in a document' do
93
+ model.average_document_size.should == 5.5
94
+ end
95
+ end
96
+
97
+ describe '#document_count' do
98
+ it 'should return the number of documents the term appears in' do
99
+ model.document_count('bar').should == 3
100
+ end
99
101
  end
100
- end
101
102
 
102
- describe '#term_count' do
103
- it 'should return the number of times the term appears in the corpus' do
104
- model.term_count('bar').should == 9
103
+ describe '#term_count' do
104
+ it 'should return the number of times the term appears in the corpus' do
105
+ model.term_count('bar').should == 9
106
+ end
105
107
  end
106
108
  end
107
109
  end
@@ -1,172 +1,198 @@
1
1
  require 'spec_helper'
2
2
 
3
- describe TfIdfSimilarity::TfIdfModel do
4
- let :text do
5
- "FOO-foo BAR bar \r\n\t 123 !@#"
6
- end
7
-
8
- let :tokens do
9
- ['FOO-foo', 'BAR', 'bar', "\r\n\t", '123', '!@#']
10
- end
11
-
12
- let :document_without_text do
13
- TfIdfSimilarity::Document.new('')
14
- end
15
-
16
- let :document do
17
- TfIdfSimilarity::Document.new(text)
18
- end
19
-
20
- let :document_with_tokens do
21
- TfIdfSimilarity::Document.new(text, :tokens => tokens)
22
- end
3
+ module TfIdfSimilarity
4
+ describe TfIdfModel do
5
+ let :text do
6
+ "FOO-foo BAR bar \r\n\t 123 !@#"
7
+ end
23
8
 
24
- let :document_with_term_counts do
25
- TfIdfSimilarity::Document.new(text, :term_counts => {'bar' => 5, 'baz' => 10})
26
- end
9
+ let :tokens do
10
+ ['FOO-foo', 'BAR', 'bar', "\r\n\t", '123', '!@#']
11
+ end
27
12
 
28
- let :non_corpus_document do
29
- TfIdfSimilarity::Document.new('foo foo foo')
30
- end
13
+ let :document_without_text do
14
+ Document.new('')
15
+ end
31
16
 
32
- def similarity_matrix_values(model)
33
- matrix = model.similarity_matrix
34
- if MATRIX_LIBRARY == :nmatrix
35
- matrix.each.to_a
36
- else
37
- matrix.to_a.flatten
17
+ let :document do
18
+ Document.new(text)
38
19
  end
39
- end
40
20
 
41
- context 'without documents', :unless => lambda{MATRIX_LIBRARY == :gsl} do
42
- let :model do
43
- TfIdfSimilarity::TfIdfModel.new([], :library => MATRIX_LIBRARY)
21
+ let :document_with_tokens do
22
+ Document.new(text, :tokens => tokens)
44
23
  end
45
24
 
46
- describe '#documents' do
47
- it 'should be empty' do
48
- model.documents.should be_empty
49
- end
25
+ let :document_with_term_counts do
26
+ Document.new(text, :term_counts => {'bar' => 5, 'baz' => 10})
50
27
  end
51
28
 
52
- describe '#terms' do
53
- it 'should be empty' do
54
- model.terms.should be_empty
55
- end
29
+ let :non_corpus_document do
30
+ Document.new('foo foo foo')
56
31
  end
57
32
 
58
- describe '#inverse_document_frequency' do
59
- it 'should return negative infinity' do
60
- model.idf('foo').should == -1/0.0 # -Infinity
33
+ def similarity_matrix_values(model)
34
+ matrix = model.similarity_matrix
35
+ if MATRIX_LIBRARY == :nmatrix
36
+ matrix.each.to_a
37
+ else
38
+ matrix.to_a.flatten
61
39
  end
62
40
  end
63
41
 
64
- describe '#term_frequency' do
65
- it 'should return the term frequency' do
66
- model.tf(document, 'foo').should == Math.sqrt(2)
42
+ context 'without documents', :empty_matrix => true do
43
+ let :model do
44
+ TfIdfModel.new([], :library => MATRIX_LIBRARY)
67
45
  end
68
- end
69
46
 
70
- describe '#term_frequency_inverse_document_frequency' do
71
- it 'should return negative infinity' do
72
- model.tfidf(document, 'foo').should == -1/0.0 # -Infinity
47
+ describe '#documents' do
48
+ it 'should be empty' do
49
+ model.documents.should be_empty
50
+ end
73
51
  end
74
- end
75
52
 
76
- describe '#similarity_matrix' do
77
- it 'should be empty' do
78
- similarity_matrix_values(model).should be_empty
53
+ describe '#document_index' do
54
+ it 'should return nil' do
55
+ model.document_index(document).should be_nil
56
+ end
79
57
  end
80
- end
81
- end
82
58
 
83
- context 'with documents' do
84
- let :documents do
85
- [
86
- document,
87
- document_with_tokens,
88
- document_without_text,
89
- document_with_term_counts,
90
- ]
91
- end
59
+ describe '#text_index' do
60
+ it 'should return nil' do
61
+ model.text_index(text).should be_nil
62
+ end
63
+ end
92
64
 
93
- let :model do
94
- TfIdfSimilarity::TfIdfModel.new(documents, :library => MATRIX_LIBRARY)
95
- end
65
+ describe '#terms' do
66
+ it 'should be empty' do
67
+ model.terms.should be_empty
68
+ end
69
+ end
96
70
 
97
- describe '#documents' do
98
- it 'should return the documents' do
99
- model.documents.should == documents
71
+ describe '#inverse_document_frequency' do
72
+ it 'should return negative infinity' do
73
+ model.idf('foo').should == -1/0.0 # -Infinity
74
+ end
100
75
  end
101
- end
102
76
 
103
- describe '#terms' do
104
- it 'should return the terms' do
105
- model.terms.to_a.sort.should == ['bar', 'baz', 'foo', 'foo-foo']
77
+ describe '#term_frequency' do
78
+ it 'should return the term frequency' do
79
+ model.tf(document, 'foo').should == Math.sqrt(2)
80
+ end
106
81
  end
107
- end
108
82
 
109
- describe '#inverse_document_frequency' do
110
- it 'should return the inverse document frequency' do
111
- model.idf('foo').should be_within(0.001).of(1 + Math.log(2))
83
+ describe '#term_frequency_inverse_document_frequency' do
84
+ it 'should return negative infinity' do
85
+ model.tfidf(document, 'foo').should == -1/0.0 # -Infinity
86
+ end
112
87
  end
113
88
 
114
- it 'should return the inverse document frequency of a non-occurring term' do
115
- model.idf('xxx').should be_within(0.001).of(1 + Math.log(4))
89
+ describe '#similarity_matrix' do
90
+ it 'should be empty' do
91
+ similarity_matrix_values(model).should be_empty
92
+ end
116
93
  end
117
94
  end
118
95
 
119
- describe '#term_frequency' do
120
- it 'should return the term frequency if no tokens given' do
121
- model.tf(document, 'foo').should == Math.sqrt(2)
96
+ context 'with documents' do
97
+ let :documents do
98
+ [
99
+ document,
100
+ document_with_tokens,
101
+ document_without_text,
102
+ document_with_term_counts,
103
+ ]
122
104
  end
123
105
 
124
- it 'should return the term frequency if tokens given' do
125
- model.tf(document_with_tokens, 'foo-foo').should == 1
106
+ let :model do
107
+ TfIdfModel.new(documents, :library => MATRIX_LIBRARY)
126
108
  end
127
109
 
128
- it 'should return no term frequency if no text given' do
129
- model.tf(document_without_text, 'foo').should == 0
110
+ describe '#documents' do
111
+ it 'should return the documents' do
112
+ model.documents.should == documents
113
+ end
130
114
  end
131
115
 
132
- it 'should return the term frequency if term counts given' do
133
- model.tf(document_with_term_counts, 'bar').should == Math.sqrt(5)
116
+ describe '#document_index' do
117
+ it 'should return the index' do
118
+ model.document_index(document).should == 0
119
+ end
134
120
  end
135
121
 
136
- it 'should return the term frequency of a non-occurring term' do
137
- model.tf(document, 'xxx').should == 0
122
+ describe '#text_index' do
123
+ it 'should return the index' do
124
+ model.text_index(text).should == 0
125
+ end
138
126
  end
139
127
 
140
- it 'should return the term frequency in a non-occurring document' do
141
- model.tf(non_corpus_document, 'foo').should == Math.sqrt(3)
128
+ describe '#terms' do
129
+ it 'should return the terms' do
130
+ model.terms.to_a.sort.should == ['bar', 'baz', 'foo', 'foo-foo']
131
+ end
142
132
  end
143
- end
144
133
 
145
- describe '#term_frequency_inverse_document_frequency' do
146
- it 'should return the tf*idf' do
147
- model.tfidf(document, 'foo').should be_within(0.001).of((1 + Math.log(2)) * Math.sqrt(2))
134
+ describe '#inverse_document_frequency' do
135
+ it 'should return the inverse document frequency' do
136
+ model.idf('foo').should be_within(0.001).of(1 + Math.log(4 / (1 + 1.0)))
137
+ end
138
+
139
+ it 'should return the inverse document frequency of a non-occurring term' do
140
+ model.idf('xxx').should be_within(0.001).of(1 + Math.log(4 / (0 + 1.0)))
141
+ end
148
142
  end
149
143
 
150
- it 'should return the tf*idf of a non-occurring term' do
151
- model.tfidf(document, 'xxx').should == 0
144
+ describe '#term_frequency' do
145
+ it 'should return the term frequency if no tokens given' do
146
+ model.tf(document, 'foo').should == Math.sqrt(2)
147
+ end
148
+
149
+ it 'should return the term frequency if tokens given' do
150
+ model.tf(document_with_tokens, 'foo-foo').should == 1
151
+ end
152
+
153
+ it 'should return no term frequency if no text given' do
154
+ model.tf(document_without_text, 'foo').should == 0
155
+ end
156
+
157
+ it 'should return the term frequency if term counts given' do
158
+ model.tf(document_with_term_counts, 'bar').should == Math.sqrt(5)
159
+ end
160
+
161
+ it 'should return the term frequency of a non-occurring term' do
162
+ model.tf(document, 'xxx').should == 0
163
+ end
164
+
165
+ it 'should return the term frequency in a non-occurring document' do
166
+ model.tf(non_corpus_document, 'foo').should == Math.sqrt(3)
167
+ end
152
168
  end
153
169
 
154
- it 'should return the tf*idf in a non-occurring term' do
155
- model.tfidf(non_corpus_document, 'foo').should be_within(0.001).of((1 + Math.log(2)) * Math.sqrt(3))
170
+ describe '#term_frequency_inverse_document_frequency' do
171
+ it 'should return the tf*idf' do
172
+ model.tfidf(document, 'foo').should be_within(0.001).of((1 + Math.log(4 / (1 + 1.0))) * Math.sqrt(2))
173
+ end
174
+
175
+ it 'should return the tf*idf of a non-occurring term' do
176
+ model.tfidf(document, 'xxx').should == 0
177
+ end
178
+
179
+ it 'should return the tf*idf in a non-occurring term' do
180
+ model.tfidf(non_corpus_document, 'foo').should be_within(0.001).of((1 + Math.log(4 / (1 + 1.0))) * Math.sqrt(3))
181
+ end
156
182
  end
157
- end
158
183
 
159
- describe '#similarity_matrix' do
160
- it 'should return the similarity matrix' do
161
- expected = [
162
- 1.0, 0.326, 0.0, 0.195,
163
- 0.326, 1.0, 0.0, 0.247,
164
- 0.0, 0.0, 0.0, 0.0,
165
- 0.195, 0.247, 0.0, 1.0,
166
- ]
184
+ describe '#similarity_matrix' do
185
+ it 'should return the similarity matrix' do
186
+ expected = [
187
+ 1.0, 0.326, 0.0, 0.195,
188
+ 0.326, 1.0, 0.0, 0.247,
189
+ 0.0, 0.0, 0.0, 0.0,
190
+ 0.195, 0.247, 0.0, 1.0,
191
+ ]
167
192
 
168
- similarity_matrix_values(model).each_with_index do |value,i|
169
- value.should be_within(0.001).of(expected[i])
193
+ similarity_matrix_values(model).each_with_index do |value,i|
194
+ value.should be_within(0.001).of(expected[i])
195
+ end
170
196
  end
171
197
  end
172
198
  end