tf-idf-similarity 0.1.3 → 0.1.4

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,66 +1,27 @@
1
1
  # A document-term matrix using the tf*idf function.
2
2
  #
3
3
  # @see http://lucene.apache.org/core/4_0_0-BETA/core/org/apache/lucene/search/similarities/TFIDFSimilarity.html
4
- class TfIdfSimilarity::TfIdfModel
5
- include TfIdfSimilarity::MatrixMethods
6
-
7
- extend Forwardable
8
- def_delegators :@model, :documents, :terms, :document_count
9
-
10
- # @param [Array<TfIdfSimilarity::Document>] documents documents
11
- # @param [Hash] opts optional arguments
12
- # @option opts [Symbol] :library :gsl, :narray, :nmatrix or :matrix (default)
13
- def initialize(documents, opts = {})
14
- @model = TfIdfSimilarity::TermCountModel.new(documents, opts)
15
- @library = (opts[:library] || :matrix).to_sym
16
-
17
- array = Array.new(terms.size) do |i|
18
- idf = inverse_document_frequency(terms[i])
19
- Array.new(documents.size) do |j|
20
- term_frequency(documents[j], terms[i]) * idf
21
- end
4
+ module TfIdfSimilarity
5
+ class TfIdfModel < Model
6
+ # Returns the term's inverse document frequency.
7
+ #
8
+ # @param [String] term a term
9
+ # @return [Float] the term's inverse document frequency
10
+ def inverse_document_frequency(term)
11
+ df = @model.document_count(term)
12
+ 1 + log(documents.size / (df + 1.0))
22
13
  end
23
-
24
- @matrix = initialize_matrix(array)
25
- end
26
-
27
- # Return the term's inverse document frequency.
28
- #
29
- # @param [String] term a term
30
- # @return [Float] the term's inverse document frequency
31
- def inverse_document_frequency(term)
32
- df = @model.document_count(term)
33
- 1 + log(documents.size / (df + 1.0))
34
- end
35
- alias_method :idf, :inverse_document_frequency
36
-
37
- # Returns the term's frequency in the document.
38
- #
39
- # @param [Document] document a document
40
- # @param [String] term a term
41
- # @return [Float] the term's frequency in the document
42
- def term_frequency(document, term)
43
- tf = document.term_count(term)
44
- sqrt(tf)
45
- end
46
- alias_method :tf, :term_frequency
47
-
48
- # Return the term frequency–inverse document frequency.
49
- #
50
- # @param [Document] document a document
51
- # @param [String] term a term
52
- # @return [Float] the term frequency–inverse document frequency
53
- def term_frequency_inverse_document_frequency(document, term)
54
- inverse_document_frequency(term) * term_frequency(document, term)
55
- end
56
- alias_method :tfidf, :term_frequency_inverse_document_frequency
57
-
58
- # Returns a similarity matrix for the documents in the corpus.
59
- #
60
- # @return [GSL::Matrix,NMatrix,Matrix] a similarity matrix
61
- # @note Columns are normalized to unit vectors, so we can calculate the cosine
62
- # similarity of all document vectors.
63
- def similarity_matrix
64
- multiply_self(normalize)
14
+ alias_method :idf, :inverse_document_frequency
15
+
16
+ # Returns the term's frequency in the document.
17
+ #
18
+ # @param [Document] document a document
19
+ # @param [String] term a term
20
+ # @return [Float] the term's frequency in the document
21
+ def term_frequency(document, term)
22
+ tf = document.term_count(term)
23
+ sqrt(tf)
24
+ end
25
+ alias_method :tf, :term_frequency
65
26
  end
66
27
  end
@@ -8,44 +8,46 @@
8
8
  # @see http://wiki.apache.org/solr/AnalyzersTokenizersTokenFilters#solr.StopFilterFactory
9
9
  # @see http://wiki.apache.org/solr/AnalyzersTokenizersTokenFilters#solr.WordDelimiterFilterFactory
10
10
  # @see http://wiki.apache.org/solr/AnalyzersTokenizersTokenFilters#solr.SynonymFilterFactory
11
- class TfIdfSimilarity::Token < String
12
- # Returns a falsy value if all its characters are numbers, punctuation,
13
- # whitespace or control characters.
14
- #
15
- # @note Some implementations ignore one and two-letter words.
16
- #
17
- # @return [Boolean] whether the string is a token
18
- def valid?
19
- !self[%r{
20
- \A
21
- (
22
- \d | # number
23
- [[:cntrl:]] | # control character
24
- [[:punct:]] | # punctuation
25
- [[:space:]] # whitespace
26
- )+
27
- \z
28
- }x]
29
- end
11
+ module TfIdfSimilarity
12
+ class Token < String
13
+ # Returns a falsy value if all its characters are numbers, punctuation,
14
+ # whitespace or control characters.
15
+ #
16
+ # @note Some implementations ignore one- and two-letter words.
17
+ #
18
+ # @return [Boolean] whether the string is a token
19
+ def valid?
20
+ !self[%r{
21
+ \A
22
+ (
23
+ \d | # number
24
+ [[:cntrl:]] | # control character
25
+ [[:punct:]] | # punctuation
26
+ [[:space:]] # whitespace
27
+ )+
28
+ \z
29
+ }x]
30
+ end
30
31
 
31
- # Returns a lowercase string.
32
- #
33
- # @return [Token] a lowercase string
34
- #
35
- # @see http://wiki.apache.org/solr/AnalyzersTokenizersTokenFilters#solr.LowerCaseFilterFactory
36
- def lowercase_filter
37
- self.class.new(defined?(UnicodeUtils) ? UnicodeUtils.downcase(self) : tr(
38
- "ÀÁÂÃÄÅĀĂĄÇĆĈĊČÐĎĐÈÉÊËĒĔĖĘĚĜĞĠĢĤĦÌÍÎÏĨĪĬĮĴĶĹĻĽĿŁÑŃŅŇŊÒÓÔÕÖØŌŎŐŔŖŘŚŜŞŠŢŤŦÙÚÛÜŨŪŬŮŰŲŴÝŶŸŹŻŽ",
39
- "àáâãäåāăąçćĉċčðďđèéêëēĕėęěĝğġģĥħìíîïĩīĭįĵķĺļľŀłñńņňŋòóôõöøōŏőŕŗřśŝşšţťŧùúûüũūŭůűųŵýŷÿźżž"
40
- ).downcase)
41
- end
32
+ # Returns a lowercase string.
33
+ #
34
+ # @return [Token] a lowercase string
35
+ #
36
+ # @see http://wiki.apache.org/solr/AnalyzersTokenizersTokenFilters#solr.LowerCaseFilterFactory
37
+ def lowercase_filter
38
+ self.class.new(defined?(UnicodeUtils) ? UnicodeUtils.downcase(self) : tr(
39
+ "ÀÁÂÃÄÅĀĂĄÇĆĈĊČÐĎĐÈÉÊËĒĔĖĘĚĜĞĠĢĤĦÌÍÎÏĨĪĬĮĴĶĹĻĽĿŁÑŃŅŇŊÒÓÔÕÖØŌŎŐŔŖŘŚŜŞŠŢŤŦÙÚÛÜŨŪŬŮŰŲŴÝŶŸŹŻŽ",
40
+ "àáâãäåāăąçćĉċčðďđèéêëēĕėęěĝğġģĥħìíîïĩīĭįĵķĺļľŀłñńņňŋòóôõöøōŏőŕŗřśŝşšţťŧùúûüũūŭůűųŵýŷÿźżž"
41
+ ).downcase)
42
+ end
42
43
 
43
- # Returns a string with no English possessive or periods in acronyms.
44
- #
45
- # @return [Token] a string with no English possessive or periods in acronyms
46
- #
47
- # @see http://wiki.apache.org/solr/AnalyzersTokenizersTokenFilters#solr.ClassicFilterFactory
48
- def classic_filter
49
- self.class.new(self.gsub('.', '').chomp("'s"))
44
+ # Returns a string with no English possessive or periods in acronyms.
45
+ #
46
+ # @return [Token] a string with no English possessive or periods in acronyms
47
+ #
48
+ # @see http://wiki.apache.org/solr/AnalyzersTokenizersTokenFilters#solr.ClassicFilterFactory
49
+ def classic_filter
50
+ self.class.new(self.gsub('.', '').chomp("'s"))
51
+ end
50
52
  end
51
53
  end
@@ -1,3 +1,3 @@
1
1
  module TfIdfSimilarity
2
- VERSION = "0.1.3"
2
+ VERSION = "0.1.4"
3
3
  end
@@ -0,0 +1,200 @@
1
+ require 'spec_helper'
2
+
3
+ module TfIdfSimilarity
4
+ describe BM25Model do
5
+ let :text do
6
+ "FOO-foo BAR bar \r\n\t 123 !@#"
7
+ end
8
+
9
+ let :tokens do
10
+ ['FOO-foo', 'BAR', 'bar', "\r\n\t", '123', '!@#']
11
+ end
12
+
13
+ let :document_without_text do
14
+ Document.new('')
15
+ end
16
+
17
+ let :document do
18
+ Document.new(text)
19
+ end
20
+
21
+ let :document_with_tokens do
22
+ Document.new(text, :tokens => tokens)
23
+ end
24
+
25
+ let :document_with_term_counts do
26
+ Document.new(text, :term_counts => {'bar' => 5, 'baz' => 10})
27
+ end
28
+
29
+ let :non_corpus_document do
30
+ Document.new('foo foo foo')
31
+ end
32
+
33
+ def similarity_matrix_values(model)
34
+ matrix = model.similarity_matrix
35
+ if MATRIX_LIBRARY == :nmatrix
36
+ matrix.each.to_a
37
+ else
38
+ matrix.to_a.flatten
39
+ end
40
+ end
41
+
42
+ context 'without documents', :empty_matrix => true do
43
+ let :model do
44
+ BM25Model.new([], :library => MATRIX_LIBRARY)
45
+ end
46
+
47
+ describe '#documents' do
48
+ it 'should be empty' do
49
+ model.documents.should be_empty
50
+ end
51
+ end
52
+
53
+ describe '#document_index' do
54
+ it 'should return nil' do
55
+ model.document_index(document).should be_nil
56
+ end
57
+ end
58
+
59
+ describe '#text_index' do
60
+ it 'should return nil' do
61
+ model.text_index(text).should be_nil
62
+ end
63
+ end
64
+
65
+ describe '#terms' do
66
+ it 'should be empty' do
67
+ model.terms.should be_empty
68
+ end
69
+ end
70
+
71
+ describe '#inverse_document_frequency' do
72
+ it 'should return zero' do
73
+ model.idf('foo').should == 0.0
74
+ end
75
+ end
76
+
77
+ describe '#term_frequency' do
78
+ it 'should return NaN' do
79
+ model.tf(document, 'foo').should be_nan
80
+ end
81
+ end
82
+
83
+ describe '#term_frequency_inverse_document_frequency' do
84
+ it 'should return NaN' do
85
+ model.tfidf(document, 'foo').should be_nan
86
+ end
87
+ end
88
+
89
+ describe '#similarity_matrix' do
90
+ it 'should be empty' do
91
+ similarity_matrix_values(model).should be_empty
92
+ end
93
+ end
94
+ end
95
+
96
+ context 'with documents' do
97
+ let :documents do
98
+ [
99
+ document,
100
+ document_with_tokens,
101
+ document_without_text,
102
+ document_with_term_counts,
103
+ ]
104
+ end
105
+
106
+ let :model do
107
+ BM25Model.new(documents, :library => MATRIX_LIBRARY)
108
+ end
109
+
110
+ describe '#documents' do
111
+ it 'should return the documents' do
112
+ model.documents.should == documents
113
+ end
114
+ end
115
+
116
+ describe '#document_index' do
117
+ it 'should return the index' do
118
+ model.document_index(document).should == 0
119
+ end
120
+ end
121
+
122
+ describe '#text_index' do
123
+ it 'should return the index' do
124
+ model.text_index(text).should == 0
125
+ end
126
+ end
127
+
128
+ describe '#terms' do
129
+ it 'should return the terms' do
130
+ model.terms.to_a.sort.should == ['bar', 'baz', 'foo', 'foo-foo']
131
+ end
132
+ end
133
+
134
+ describe '#inverse_document_frequency' do
135
+ it 'should return the inverse document frequency' do
136
+ model.idf('foo').should be_within(0.001).of(Math.log((4 - 1 + 0.5) / (1 + 0.5)))
137
+ end
138
+
139
+ it 'should return the inverse document frequency of a non-occurring term' do
140
+ model.idf('xxx').should be_within(0.001).of(Math.log((4 - 0 + 0.5) / (0 + 0.5)))
141
+ end
142
+ end
143
+
144
+ describe '#term_frequency' do
145
+ it 'should return the term frequency if no tokens given' do
146
+ model.tf(document, 'foo').should == (2 * 2.2) / (2 + 0.3 + 0.9 * 4 / 5.5)
147
+ end
148
+
149
+ it 'should return the term frequency if tokens given' do
150
+ model.tf(document_with_tokens, 'foo-foo').should == (1 * 2.2) / (1 + 0.3 + 0.9 * 4 / 5.5)
151
+ end
152
+
153
+ it 'should return no term frequency if no text given' do
154
+ model.tf(document_without_text, 'foo').should == 0
155
+ end
156
+
157
+ it 'should return the term frequency if term counts given' do
158
+ model.tf(document_with_term_counts, 'bar').should == (5 * 2.2) / (5 + 0.3 + 0.9 * 4 / 5.5)
159
+ end
160
+
161
+ it 'should return the term frequency of a non-occurring term' do
162
+ model.tf(document, 'xxx').should == 0
163
+ end
164
+
165
+ it 'should return the term frequency in a non-occurring document' do
166
+ model.tf(non_corpus_document, 'foo').should == (3 * 2.2) / (3 + 0.3 + 0.9 * 4 / 5.5)
167
+ end
168
+ end
169
+
170
+ describe '#term_frequency_inverse_document_frequency' do
171
+ it 'should return the tf*idf' do
172
+ model.tfidf(document, 'foo').should be_within(0.001).of(Math.log((4 - 1 + 0.5) / (1 + 0.5)) * (2 * 2.2) / (2 + 0.3 + 0.9 * 4 / 5.5))
173
+ end
174
+
175
+ it 'should return the tf*idf of a non-occurring term' do
176
+ model.tfidf(document, 'xxx').should == 0
177
+ end
178
+
179
+ it 'should return the tf*idf in a non-occurring document' do
180
+ model.tfidf(non_corpus_document, 'foo').should be_within(0.001).of(Math.log((4 - 1 + 0.5) / (1 + 0.5)) * (3 * 2.2) / (3 + 0.3 + 0.9 * 4 / 5.5))
181
+ end
182
+ end
183
+
184
+ describe '#similarity_matrix' do
185
+ it 'should return the similarity matrix' do
186
+ expected = [
187
+ 1.0, 0.564, 0.0, 0.479,
188
+ 0.564, 1.0, 0.0, 0.540,
189
+ 0.0, 0.0, 0.0, 0.0,
190
+ 0.479, 0.540, 0.0, 1.0,
191
+ ]
192
+
193
+ similarity_matrix_values(model).each_with_index do |value,i|
194
+ value.should be_within(0.001).of(expected[i])
195
+ end
196
+ end
197
+ end
198
+ end
199
+ end
200
+ end
@@ -1,136 +1,138 @@
1
1
  require 'spec_helper'
2
2
 
3
3
  # @see https://github.com/bbcrd/Similarity/blob/master/test/test_document.rb
4
- describe TfIdfSimilarity::Document do
5
- let :text do
6
- "FOO-foo BAR bar \r\n\t 123 !@#"
7
- end
8
-
9
- let :tokens do
10
- ['FOO-foo', 'BAR', 'bar', "\r\n\t", '123', '!@#']
11
- end
12
-
13
- let :document_without_text do
14
- TfIdfSimilarity::Document.new('')
15
- end
16
-
17
- let :document do
18
- TfIdfSimilarity::Document.new(text)
19
- end
20
-
21
- let :document_with_id do
22
- TfIdfSimilarity::Document.new(text, :id => 'baz')
23
- end
24
-
25
- let :document_with_tokens do
26
- TfIdfSimilarity::Document.new(text, :tokens => tokens)
27
- end
28
-
29
- let :document_with_term_counts do
30
- TfIdfSimilarity::Document.new(text, :term_counts => {'bar' => 5, 'baz' => 10})
31
- end
32
-
33
- let :document_with_term_counts_and_size do
34
- TfIdfSimilarity::Document.new(text, :term_counts => {'bar' => 5, 'baz' => 10}, :size => 20)
35
- end
36
-
37
- let :document_with_size do
38
- TfIdfSimilarity::Document.new(text, :size => 10)
39
- end
40
-
41
- describe '#id' do
42
- it 'should return the ID if no ID given' do
43
- document.id.should == document.object_id
4
+ module TfIdfSimilarity
5
+ describe Document do
6
+ let :text do
7
+ "FOO-foo BAR bar \r\n\t 123 !@#"
44
8
  end
45
9
 
46
- it 'should return the given ID' do
47
- document_with_id.id.should == 'baz'
10
+ let :tokens do
11
+ ['FOO-foo', 'BAR', 'bar', "\r\n\t", '123', '!@#']
48
12
  end
49
- end
50
13
 
51
- describe '#text' do
52
- it 'should return the text' do
53
- document.text.should == text
14
+ let :document_without_text do
15
+ Document.new('')
54
16
  end
55
- end
56
17
 
57
- describe '#size' do
58
- it 'should return the number of tokens if no tokens given' do
59
- document.size.should == 4
18
+ let :document do
19
+ Document.new(text)
60
20
  end
61
21
 
62
- it 'should return the number of tokens if tokens given' do
63
- document_with_tokens.size.should == 3
22
+ let :document_with_id do
23
+ Document.new(text, :id => 'baz')
64
24
  end
65
25
 
66
- it 'should return the number of tokens if no text given' do
67
- document_without_text.size.should == 0
26
+ let :document_with_tokens do
27
+ Document.new(text, :tokens => tokens)
68
28
  end
69
29
 
70
- it 'should return the number of tokens if term counts given' do
71
- document_with_term_counts.size.should == 15
30
+ let :document_with_term_counts do
31
+ Document.new(text, :term_counts => {'bar' => 5, 'baz' => 10})
72
32
  end
73
33
 
74
- it 'should return the given number of tokens if term counts and size given' do
75
- document_with_term_counts_and_size.size.should == 20
34
+ let :document_with_term_counts_and_size do
35
+ Document.new(text, :term_counts => {'bar' => 5, 'baz' => 10}, :size => 20)
76
36
  end
77
37
 
78
- it 'should not return the given number of tokens if term counts not given' do
79
- document_with_size.size.should_not == 10
38
+ let :document_with_size do
39
+ Document.new(text, :size => 10)
80
40
  end
81
- end
82
41
 
83
- describe '#term_counts' do
84
- it 'should return the term counts if no tokens given' do
85
- document.term_counts.should == {'foo' => 2, 'bar' => 2}
86
- end
42
+ describe '#id' do
43
+ it 'should return the ID if no ID given' do
44
+ document.id.should == document.object_id
45
+ end
87
46
 
88
- it 'should return the term counts if tokens given' do
89
- document_with_tokens.term_counts.should == {'foo-foo' => 1, 'bar' => 2}
47
+ it 'should return the given ID' do
48
+ document_with_id.id.should == 'baz'
49
+ end
90
50
  end
91
51
 
92
- it 'should return no term counts if no text given' do
93
- document_without_text.term_counts.should == {}
52
+ describe '#text' do
53
+ it 'should return the text' do
54
+ document.text.should == text
55
+ end
94
56
  end
95
57
 
96
- it 'should return the term counts if term counts given' do
97
- document_with_term_counts.term_counts.should == {'bar' => 5, 'baz' => 10}
98
- end
99
- end
58
+ describe '#size' do
59
+ it 'should return the number of tokens if no tokens given' do
60
+ document.size.should == 4
61
+ end
100
62
 
101
- describe '#terms' do
102
- it 'should return the terms if no tokens given' do
103
- document.terms.sort.should == ['bar', 'foo']
104
- end
63
+ it 'should return the number of tokens if tokens given' do
64
+ document_with_tokens.size.should == 3
65
+ end
105
66
 
106
- it 'should return the terms if tokens given' do
107
- document_with_tokens.terms.sort.should == ['bar', 'foo-foo']
108
- end
67
+ it 'should return the number of tokens if no text given' do
68
+ document_without_text.size.should == 0
69
+ end
109
70
 
110
- it 'should return no terms if no text given' do
111
- document_without_text.terms.should == []
112
- end
71
+ it 'should return the number of tokens if term counts given' do
72
+ document_with_term_counts.size.should == 15
73
+ end
113
74
 
114
- it 'should return the terms if term counts given' do
115
- document_with_term_counts.terms.sort.should == ['bar', 'baz']
116
- end
117
- end
75
+ it 'should return the given number of tokens if term counts and size given' do
76
+ document_with_term_counts_and_size.size.should == 20
77
+ end
118
78
 
119
- describe '#term_count' do
120
- it 'should return the term count if no tokens given' do
121
- document.term_count('foo').should == 2
79
+ it 'should not return the given number of tokens if term counts not given' do
80
+ document_with_size.size.should_not == 10
81
+ end
122
82
  end
123
83
 
124
- it 'should return the term count if tokens given' do
125
- document_with_tokens.term_count('foo-foo').should == 1
84
+ describe '#term_counts' do
85
+ it 'should return the term counts if no tokens given' do
86
+ document.term_counts.should == {'foo' => 2, 'bar' => 2}
87
+ end
88
+
89
+ it 'should return the term counts if tokens given' do
90
+ document_with_tokens.term_counts.should == {'foo-foo' => 1, 'bar' => 2}
91
+ end
92
+
93
+ it 'should return no term counts if no text given' do
94
+ document_without_text.term_counts.should == {}
95
+ end
96
+
97
+ it 'should return the term counts if term counts given' do
98
+ document_with_term_counts.term_counts.should == {'bar' => 5, 'baz' => 10}
99
+ end
126
100
  end
127
101
 
128
- it 'should return no term count if no text given' do
129
- document_without_text.term_count('foo').should == 0
102
+ describe '#terms' do
103
+ it 'should return the terms if no tokens given' do
104
+ document.terms.sort.should == ['bar', 'foo']
105
+ end
106
+
107
+ it 'should return the terms if tokens given' do
108
+ document_with_tokens.terms.sort.should == ['bar', 'foo-foo']
109
+ end
110
+
111
+ it 'should return no terms if no text given' do
112
+ document_without_text.terms.should == []
113
+ end
114
+
115
+ it 'should return the terms if term counts given' do
116
+ document_with_term_counts.terms.sort.should == ['bar', 'baz']
117
+ end
130
118
  end
131
119
 
132
- it 'should return the term count if term counts given' do
133
- document_with_term_counts.term_count('bar').should == 5
120
+ describe '#term_count' do
121
+ it 'should return the term count if no tokens given' do
122
+ document.term_count('foo').should == 2
123
+ end
124
+
125
+ it 'should return the term count if tokens given' do
126
+ document_with_tokens.term_count('foo-foo').should == 1
127
+ end
128
+
129
+ it 'should return no term count if no text given' do
130
+ document_without_text.term_count('foo').should == 0
131
+ end
132
+
133
+ it 'should return the term count if term counts given' do
134
+ document_with_term_counts.term_count('bar').should == 5
135
+ end
134
136
  end
135
137
  end
136
138
  end