tf-idf-similarity 0.1.3 → 0.1.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -19,3 +19,9 @@ when :nmatrix
19
19
  else
20
20
  require 'matrix'
21
21
  end
22
+
23
+ RSpec.configure do |c|
24
+ if MATRIX_LIBRARY == :gsl # GSL can't initialize an empty matrix
25
+ c.filter_run_excluding :empty_matrix => true
26
+ end
27
+ end
@@ -1,107 +1,109 @@
1
1
  require 'spec_helper'
2
2
 
3
- describe TfIdfSimilarity::TermCountModel do
4
- let :text do
5
- "FOO-foo BAR bar \r\n\t 123 !@#"
6
- end
7
-
8
- let :tokens do
9
- ['FOO-foo', 'BAR', 'bar', "\r\n\t", '123', '!@#']
10
- end
3
+ module TfIdfSimilarity
4
+ describe TermCountModel do
5
+ let :text do
6
+ "FOO-foo BAR bar \r\n\t 123 !@#"
7
+ end
11
8
 
12
- let :document_without_text do
13
- TfIdfSimilarity::Document.new('')
14
- end
9
+ let :tokens do
10
+ ['FOO-foo', 'BAR', 'bar', "\r\n\t", '123', '!@#']
11
+ end
15
12
 
16
- let :document do
17
- TfIdfSimilarity::Document.new(text)
18
- end
13
+ let :document_without_text do
14
+ Document.new('')
15
+ end
19
16
 
20
- let :document_with_tokens do
21
- TfIdfSimilarity::Document.new(text, :tokens => tokens)
22
- end
17
+ let :document do
18
+ Document.new(text)
19
+ end
23
20
 
24
- let :document_with_term_counts do
25
- TfIdfSimilarity::Document.new(text, :term_counts => {'bar' => 5, 'baz' => 10})
26
- end
21
+ let :document_with_tokens do
22
+ Document.new(text, :tokens => tokens)
23
+ end
27
24
 
28
- context 'without documents', :unless => lambda{MATRIX_LIBRARY == :gsl} do
29
- let :model do
30
- TfIdfSimilarity::TermCountModel.new([], :library => MATRIX_LIBRARY)
25
+ let :document_with_term_counts do
26
+ Document.new(text, :term_counts => {'bar' => 5, 'baz' => 10})
31
27
  end
32
28
 
33
- describe '#documents' do
34
- it 'should be empty' do
35
- model.documents.should be_empty
29
+ context 'without documents', :empty_matrix => true do
30
+ let :model do
31
+ TermCountModel.new([], :library => MATRIX_LIBRARY)
36
32
  end
37
- end
38
33
 
39
- describe '#terms' do
40
- it 'should be empty' do
41
- model.terms.should be_empty
34
+ describe '#documents' do
35
+ it 'should be empty' do
36
+ model.documents.should be_empty
37
+ end
42
38
  end
43
- end
44
39
 
45
- describe '#average_document_size' do
46
- it 'should be zero' do
47
- model.average_document_size.should == 0
40
+ describe '#terms' do
41
+ it 'should be empty' do
42
+ model.terms.should be_empty
43
+ end
48
44
  end
49
- end
50
45
 
51
- describe '#document_count' do
52
- it 'should be zero' do
53
- model.document_count('xxx').should == 0
46
+ describe '#average_document_size' do
47
+ it 'should be zero' do
48
+ model.average_document_size.should == 0
49
+ end
54
50
  end
55
- end
56
51
 
57
- describe '#term_count' do
58
- it 'should be zero' do
59
- model.term_count('xxx').should == 0
52
+ describe '#document_count' do
53
+ it 'should be zero' do
54
+ model.document_count('xxx').should == 0
55
+ end
60
56
  end
61
- end
62
- end
63
57
 
64
- context 'with documents' do
65
- let :documents do
66
- [
67
- document, # 4 tokens
68
- document_with_tokens, # 3 tokens
69
- document_without_text, # 0 tokens
70
- document_with_term_counts, # 15 tokens
71
- ]
58
+ describe '#term_count' do
59
+ it 'should be zero' do
60
+ model.term_count('xxx').should == 0
61
+ end
62
+ end
72
63
  end
73
64
 
74
- let :model do
75
- TfIdfSimilarity::TermCountModel.new(documents, :library => MATRIX_LIBRARY)
76
- end
65
+ context 'with documents' do
66
+ let :documents do
67
+ [
68
+ document, # 4 tokens
69
+ document_with_tokens, # 3 tokens
70
+ document_without_text, # 0 tokens
71
+ document_with_term_counts, # 15 tokens
72
+ ]
73
+ end
77
74
 
78
- describe '#documents' do
79
- it 'should return the documents' do
80
- model.documents.should == documents
75
+ let :model do
76
+ TermCountModel.new(documents, :library => MATRIX_LIBRARY)
81
77
  end
82
- end
83
78
 
84
- describe '#terms' do
85
- it 'should return the terms' do
86
- model.terms.to_a.sort.should == ['bar', 'baz', 'foo', 'foo-foo']
79
+ describe '#documents' do
80
+ it 'should return the documents' do
81
+ model.documents.should == documents
82
+ end
87
83
  end
88
- end
89
84
 
90
- describe '#average_document_size' do
91
- it 'should return the average number of tokens in a document' do
92
- model.average_document_size.should == 5.5
85
+ describe '#terms' do
86
+ it 'should return the terms' do
87
+ model.terms.to_a.sort.should == ['bar', 'baz', 'foo', 'foo-foo']
88
+ end
93
89
  end
94
- end
95
90
 
96
- describe '#document_count' do
97
- it 'should return the number of documents the term appears in' do
98
- model.document_count('bar').should == 3
91
+ describe '#average_document_size' do
92
+ it 'should return the average number of tokens in a document' do
93
+ model.average_document_size.should == 5.5
94
+ end
95
+ end
96
+
97
+ describe '#document_count' do
98
+ it 'should return the number of documents the term appears in' do
99
+ model.document_count('bar').should == 3
100
+ end
99
101
  end
100
- end
101
102
 
102
- describe '#term_count' do
103
- it 'should return the number of times the term appears in the corpus' do
104
- model.term_count('bar').should == 9
103
+ describe '#term_count' do
104
+ it 'should return the number of times the term appears in the corpus' do
105
+ model.term_count('bar').should == 9
106
+ end
105
107
  end
106
108
  end
107
109
  end
@@ -1,172 +1,198 @@
1
1
  require 'spec_helper'
2
2
 
3
- describe TfIdfSimilarity::TfIdfModel do
4
- let :text do
5
- "FOO-foo BAR bar \r\n\t 123 !@#"
6
- end
7
-
8
- let :tokens do
9
- ['FOO-foo', 'BAR', 'bar', "\r\n\t", '123', '!@#']
10
- end
11
-
12
- let :document_without_text do
13
- TfIdfSimilarity::Document.new('')
14
- end
15
-
16
- let :document do
17
- TfIdfSimilarity::Document.new(text)
18
- end
19
-
20
- let :document_with_tokens do
21
- TfIdfSimilarity::Document.new(text, :tokens => tokens)
22
- end
3
+ module TfIdfSimilarity
4
+ describe TfIdfModel do
5
+ let :text do
6
+ "FOO-foo BAR bar \r\n\t 123 !@#"
7
+ end
23
8
 
24
- let :document_with_term_counts do
25
- TfIdfSimilarity::Document.new(text, :term_counts => {'bar' => 5, 'baz' => 10})
26
- end
9
+ let :tokens do
10
+ ['FOO-foo', 'BAR', 'bar', "\r\n\t", '123', '!@#']
11
+ end
27
12
 
28
- let :non_corpus_document do
29
- TfIdfSimilarity::Document.new('foo foo foo')
30
- end
13
+ let :document_without_text do
14
+ Document.new('')
15
+ end
31
16
 
32
- def similarity_matrix_values(model)
33
- matrix = model.similarity_matrix
34
- if MATRIX_LIBRARY == :nmatrix
35
- matrix.each.to_a
36
- else
37
- matrix.to_a.flatten
17
+ let :document do
18
+ Document.new(text)
38
19
  end
39
- end
40
20
 
41
- context 'without documents', :unless => lambda{MATRIX_LIBRARY == :gsl} do
42
- let :model do
43
- TfIdfSimilarity::TfIdfModel.new([], :library => MATRIX_LIBRARY)
21
+ let :document_with_tokens do
22
+ Document.new(text, :tokens => tokens)
44
23
  end
45
24
 
46
- describe '#documents' do
47
- it 'should be empty' do
48
- model.documents.should be_empty
49
- end
25
+ let :document_with_term_counts do
26
+ Document.new(text, :term_counts => {'bar' => 5, 'baz' => 10})
50
27
  end
51
28
 
52
- describe '#terms' do
53
- it 'should be empty' do
54
- model.terms.should be_empty
55
- end
29
+ let :non_corpus_document do
30
+ Document.new('foo foo foo')
56
31
  end
57
32
 
58
- describe '#inverse_document_frequency' do
59
- it 'should return negative infinity' do
60
- model.idf('foo').should == -1/0.0 # -Infinity
33
+ def similarity_matrix_values(model)
34
+ matrix = model.similarity_matrix
35
+ if MATRIX_LIBRARY == :nmatrix
36
+ matrix.each.to_a
37
+ else
38
+ matrix.to_a.flatten
61
39
  end
62
40
  end
63
41
 
64
- describe '#term_frequency' do
65
- it 'should return the term frequency' do
66
- model.tf(document, 'foo').should == Math.sqrt(2)
42
+ context 'without documents', :empty_matrix => true do
43
+ let :model do
44
+ TfIdfModel.new([], :library => MATRIX_LIBRARY)
67
45
  end
68
- end
69
46
 
70
- describe '#term_frequency_inverse_document_frequency' do
71
- it 'should return negative infinity' do
72
- model.tfidf(document, 'foo').should == -1/0.0 # -Infinity
47
+ describe '#documents' do
48
+ it 'should be empty' do
49
+ model.documents.should be_empty
50
+ end
73
51
  end
74
- end
75
52
 
76
- describe '#similarity_matrix' do
77
- it 'should be empty' do
78
- similarity_matrix_values(model).should be_empty
53
+ describe '#document_index' do
54
+ it 'should return nil' do
55
+ model.document_index(document).should be_nil
56
+ end
79
57
  end
80
- end
81
- end
82
58
 
83
- context 'with documents' do
84
- let :documents do
85
- [
86
- document,
87
- document_with_tokens,
88
- document_without_text,
89
- document_with_term_counts,
90
- ]
91
- end
59
+ describe '#text_index' do
60
+ it 'should return nil' do
61
+ model.text_index(text).should be_nil
62
+ end
63
+ end
92
64
 
93
- let :model do
94
- TfIdfSimilarity::TfIdfModel.new(documents, :library => MATRIX_LIBRARY)
95
- end
65
+ describe '#terms' do
66
+ it 'should be empty' do
67
+ model.terms.should be_empty
68
+ end
69
+ end
96
70
 
97
- describe '#documents' do
98
- it 'should return the documents' do
99
- model.documents.should == documents
71
+ describe '#inverse_document_frequency' do
72
+ it 'should return negative infinity' do
73
+ model.idf('foo').should == -1/0.0 # -Infinity
74
+ end
100
75
  end
101
- end
102
76
 
103
- describe '#terms' do
104
- it 'should return the terms' do
105
- model.terms.to_a.sort.should == ['bar', 'baz', 'foo', 'foo-foo']
77
+ describe '#term_frequency' do
78
+ it 'should return the term frequency' do
79
+ model.tf(document, 'foo').should == Math.sqrt(2)
80
+ end
106
81
  end
107
- end
108
82
 
109
- describe '#inverse_document_frequency' do
110
- it 'should return the inverse document frequency' do
111
- model.idf('foo').should be_within(0.001).of(1 + Math.log(2))
83
+ describe '#term_frequency_inverse_document_frequency' do
84
+ it 'should return negative infinity' do
85
+ model.tfidf(document, 'foo').should == -1/0.0 # -Infinity
86
+ end
112
87
  end
113
88
 
114
- it 'should return the inverse document frequency of a non-occurring term' do
115
- model.idf('xxx').should be_within(0.001).of(1 + Math.log(4))
89
+ describe '#similarity_matrix' do
90
+ it 'should be empty' do
91
+ similarity_matrix_values(model).should be_empty
92
+ end
116
93
  end
117
94
  end
118
95
 
119
- describe '#term_frequency' do
120
- it 'should return the term frequency if no tokens given' do
121
- model.tf(document, 'foo').should == Math.sqrt(2)
96
+ context 'with documents' do
97
+ let :documents do
98
+ [
99
+ document,
100
+ document_with_tokens,
101
+ document_without_text,
102
+ document_with_term_counts,
103
+ ]
122
104
  end
123
105
 
124
- it 'should return the term frequency if tokens given' do
125
- model.tf(document_with_tokens, 'foo-foo').should == 1
106
+ let :model do
107
+ TfIdfModel.new(documents, :library => MATRIX_LIBRARY)
126
108
  end
127
109
 
128
- it 'should return no term frequency if no text given' do
129
- model.tf(document_without_text, 'foo').should == 0
110
+ describe '#documents' do
111
+ it 'should return the documents' do
112
+ model.documents.should == documents
113
+ end
130
114
  end
131
115
 
132
- it 'should return the term frequency if term counts given' do
133
- model.tf(document_with_term_counts, 'bar').should == Math.sqrt(5)
116
+ describe '#document_index' do
117
+ it 'should return the index' do
118
+ model.document_index(document).should == 0
119
+ end
134
120
  end
135
121
 
136
- it 'should return the term frequency of a non-occurring term' do
137
- model.tf(document, 'xxx').should == 0
122
+ describe '#text_index' do
123
+ it 'should return the index' do
124
+ model.text_index(text).should == 0
125
+ end
138
126
  end
139
127
 
140
- it 'should return the term frequency in a non-occurring document' do
141
- model.tf(non_corpus_document, 'foo').should == Math.sqrt(3)
128
+ describe '#terms' do
129
+ it 'should return the terms' do
130
+ model.terms.to_a.sort.should == ['bar', 'baz', 'foo', 'foo-foo']
131
+ end
142
132
  end
143
- end
144
133
 
145
- describe '#term_frequency_inverse_document_frequency' do
146
- it 'should return the tf*idf' do
147
- model.tfidf(document, 'foo').should be_within(0.001).of((1 + Math.log(2)) * Math.sqrt(2))
134
+ describe '#inverse_document_frequency' do
135
+ it 'should return the inverse document frequency' do
136
+ model.idf('foo').should be_within(0.001).of(1 + Math.log(4 / (1 + 1.0)))
137
+ end
138
+
139
+ it 'should return the inverse document frequency of a non-occurring term' do
140
+ model.idf('xxx').should be_within(0.001).of(1 + Math.log(4 / (0 + 1.0)))
141
+ end
148
142
  end
149
143
 
150
- it 'should return the tf*idf of a non-occurring term' do
151
- model.tfidf(document, 'xxx').should == 0
144
+ describe '#term_frequency' do
145
+ it 'should return the term frequency if no tokens given' do
146
+ model.tf(document, 'foo').should == Math.sqrt(2)
147
+ end
148
+
149
+ it 'should return the term frequency if tokens given' do
150
+ model.tf(document_with_tokens, 'foo-foo').should == 1
151
+ end
152
+
153
+ it 'should return no term frequency if no text given' do
154
+ model.tf(document_without_text, 'foo').should == 0
155
+ end
156
+
157
+ it 'should return the term frequency if term counts given' do
158
+ model.tf(document_with_term_counts, 'bar').should == Math.sqrt(5)
159
+ end
160
+
161
+ it 'should return the term frequency of a non-occurring term' do
162
+ model.tf(document, 'xxx').should == 0
163
+ end
164
+
165
+ it 'should return the term frequency in a non-occurring document' do
166
+ model.tf(non_corpus_document, 'foo').should == Math.sqrt(3)
167
+ end
152
168
  end
153
169
 
154
- it 'should return the tf*idf in a non-occurring term' do
155
- model.tfidf(non_corpus_document, 'foo').should be_within(0.001).of((1 + Math.log(2)) * Math.sqrt(3))
170
+ describe '#term_frequency_inverse_document_frequency' do
171
+ it 'should return the tf*idf' do
172
+ model.tfidf(document, 'foo').should be_within(0.001).of((1 + Math.log(4 / (1 + 1.0))) * Math.sqrt(2))
173
+ end
174
+
175
+ it 'should return the tf*idf of a non-occurring term' do
176
+ model.tfidf(document, 'xxx').should == 0
177
+ end
178
+
179
+ it 'should return the tf*idf in a non-occurring term' do
180
+ model.tfidf(non_corpus_document, 'foo').should be_within(0.001).of((1 + Math.log(4 / (1 + 1.0))) * Math.sqrt(3))
181
+ end
156
182
  end
157
- end
158
183
 
159
- describe '#similarity_matrix' do
160
- it 'should return the similarity matrix' do
161
- expected = [
162
- 1.0, 0.326, 0.0, 0.195,
163
- 0.326, 1.0, 0.0, 0.247,
164
- 0.0, 0.0, 0.0, 0.0,
165
- 0.195, 0.247, 0.0, 1.0,
166
- ]
184
+ describe '#similarity_matrix' do
185
+ it 'should return the similarity matrix' do
186
+ expected = [
187
+ 1.0, 0.326, 0.0, 0.195,
188
+ 0.326, 1.0, 0.0, 0.247,
189
+ 0.0, 0.0, 0.0, 0.0,
190
+ 0.195, 0.247, 0.0, 1.0,
191
+ ]
167
192
 
168
- similarity_matrix_values(model).each_with_index do |value,i|
169
- value.should be_within(0.001).of(expected[i])
193
+ similarity_matrix_values(model).each_with_index do |value,i|
194
+ value.should be_within(0.001).of(expected[i])
195
+ end
170
196
  end
171
197
  end
172
198
  end