tf-idf-similarity 0.1.3 → 0.1.4

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
@@ -1,66 +1,27 @@
1
1
  # A document-term matrix using the tf*idf function.
2
2
  #
3
3
  # @see http://lucene.apache.org/core/4_0_0-BETA/core/org/apache/lucene/search/similarities/TFIDFSimilarity.html
4
- class TfIdfSimilarity::TfIdfModel
5
- include TfIdfSimilarity::MatrixMethods
6
-
7
- extend Forwardable
8
- def_delegators :@model, :documents, :terms, :document_count
9
-
10
- # @param [Array<TfIdfSimilarity::Document>] documents documents
11
- # @param [Hash] opts optional arguments
12
- # @option opts [Symbol] :library :gsl, :narray, :nmatrix or :matrix (default)
13
- def initialize(documents, opts = {})
14
- @model = TfIdfSimilarity::TermCountModel.new(documents, opts)
15
- @library = (opts[:library] || :matrix).to_sym
16
-
17
- array = Array.new(terms.size) do |i|
18
- idf = inverse_document_frequency(terms[i])
19
- Array.new(documents.size) do |j|
20
- term_frequency(documents[j], terms[i]) * idf
21
- end
4
+ module TfIdfSimilarity
5
+ class TfIdfModel < Model
6
+ # Return the term's inverse document frequency.
7
+ #
8
+ # @param [String] term a term
9
+ # @return [Float] the term's inverse document frequency
10
+ def inverse_document_frequency(term)
11
+ df = @model.document_count(term)
12
+ 1 + log(documents.size / (df + 1.0))
22
13
  end
23
-
24
- @matrix = initialize_matrix(array)
25
- end
26
-
27
- # Return the term's inverse document frequency.
28
- #
29
- # @param [String] term a term
30
- # @return [Float] the term's inverse document frequency
31
- def inverse_document_frequency(term)
32
- df = @model.document_count(term)
33
- 1 + log(documents.size / (df + 1.0))
34
- end
35
- alias_method :idf, :inverse_document_frequency
36
-
37
- # Returns the term's frequency in the document.
38
- #
39
- # @param [Document] document a document
40
- # @param [String] term a term
41
- # @return [Float] the term's frequency in the document
42
- def term_frequency(document, term)
43
- tf = document.term_count(term)
44
- sqrt(tf)
45
- end
46
- alias_method :tf, :term_frequency
47
-
48
- # Return the term frequency–inverse document frequency.
49
- #
50
- # @param [Document] document a document
51
- # @param [String] term a term
52
- # @return [Float] the term frequency–inverse document frequency
53
- def term_frequency_inverse_document_frequency(document, term)
54
- inverse_document_frequency(term) * term_frequency(document, term)
55
- end
56
- alias_method :tfidf, :term_frequency_inverse_document_frequency
57
-
58
- # Returns a similarity matrix for the documents in the corpus.
59
- #
60
- # @return [GSL::Matrix,NMatrix,Matrix] a similarity matrix
61
- # @note Columns are normalized to unit vectors, so we can calculate the cosine
62
- # similarity of all document vectors.
63
- def similarity_matrix
64
- multiply_self(normalize)
14
+ alias_method :idf, :inverse_document_frequency
15
+
16
+ # Returns the term's frequency in the document.
17
+ #
18
+ # @param [Document] document a document
19
+ # @param [String] term a term
20
+ # @return [Float] the term's frequency in the document
21
+ def term_frequency(document, term)
22
+ tf = document.term_count(term)
23
+ sqrt(tf)
24
+ end
25
+ alias_method :tf, :term_frequency
65
26
  end
66
27
  end
@@ -8,44 +8,46 @@
8
8
  # @see http://wiki.apache.org/solr/AnalyzersTokenizersTokenFilters#solr.StopFilterFactory
9
9
  # @see http://wiki.apache.org/solr/AnalyzersTokenizersTokenFilters#solr.WordDelimiterFilterFactory
10
10
  # @see http://wiki.apache.org/solr/AnalyzersTokenizersTokenFilters#solr.SynonymFilterFactory
11
- class TfIdfSimilarity::Token < String
12
- # Returns a falsy value if all its characters are numbers, punctuation,
13
- # whitespace or control characters.
14
- #
15
- # @note Some implementations ignore one and two-letter words.
16
- #
17
- # @return [Boolean] whether the string is a token
18
- def valid?
19
- !self[%r{
20
- \A
21
- (
22
- \d | # number
23
- [[:cntrl:]] | # control character
24
- [[:punct:]] | # punctuation
25
- [[:space:]] # whitespace
26
- )+
27
- \z
28
- }x]
29
- end
11
+ module TfIdfSimilarity
12
+ class Token < String
13
+ # Returns a falsy value if all its characters are numbers, punctuation,
14
+ # whitespace or control characters.
15
+ #
16
+ # @note Some implementations ignore one and two-letter words.
17
+ #
18
+ # @return [Boolean] whether the string is a token
19
+ def valid?
20
+ !self[%r{
21
+ \A
22
+ (
23
+ \d | # number
24
+ [[:cntrl:]] | # control character
25
+ [[:punct:]] | # punctuation
26
+ [[:space:]] # whitespace
27
+ )+
28
+ \z
29
+ }x]
30
+ end
30
31
 
31
- # Returns a lowercase string.
32
- #
33
- # @return [Token] a lowercase string
34
- #
35
- # @see http://wiki.apache.org/solr/AnalyzersTokenizersTokenFilters#solr.LowerCaseFilterFactory
36
- def lowercase_filter
37
- self.class.new(defined?(UnicodeUtils) ? UnicodeUtils.downcase(self) : tr(
38
- "ÀÁÂÃÄÅĀĂĄÇĆĈĊČÐĎĐÈÉÊËĒĔĖĘĚĜĞĠĢĤĦÌÍÎÏĨĪĬĮĴĶĹĻĽĿŁÑŃŅŇŊÒÓÔÕÖØŌŎŐŔŖŘŚŜŞŠŢŤŦÙÚÛÜŨŪŬŮŰŲŴÝŶŸŹŻŽ",
39
- "àáâãäåāăąçćĉċčðďđèéêëēĕėęěĝğġģĥħìíîïĩīĭįĵķĺļľŀłñńņňŋòóôõöøōŏőŕŗřśŝşšţťŧùúûüũūŭůűųŵýŷÿźżž"
40
- ).downcase)
41
- end
32
+ # Returns a lowercase string.
33
+ #
34
+ # @return [Token] a lowercase string
35
+ #
36
+ # @see http://wiki.apache.org/solr/AnalyzersTokenizersTokenFilters#solr.LowerCaseFilterFactory
37
+ def lowercase_filter
38
+ self.class.new(defined?(UnicodeUtils) ? UnicodeUtils.downcase(self) : tr(
39
+ "ÀÁÂÃÄÅĀĂĄÇĆĈĊČÐĎĐÈÉÊËĒĔĖĘĚĜĞĠĢĤĦÌÍÎÏĨĪĬĮĴĶĹĻĽĿŁÑŃŅŇŊÒÓÔÕÖØŌŎŐŔŖŘŚŜŞŠŢŤŦÙÚÛÜŨŪŬŮŰŲŴÝŶŸŹŻŽ",
40
+ "àáâãäåāăąçćĉċčðďđèéêëēĕėęěĝğġģĥħìíîïĩīĭįĵķĺļľŀłñńņňŋòóôõöøōŏőŕŗřśŝşšţťŧùúûüũūŭůűųŵýŷÿźżž"
41
+ ).downcase)
42
+ end
42
43
 
43
- # Returns a string with no English possessive or periods in acronyms.
44
- #
45
- # @return [Token] a string with no English possessive or periods in acronyms
46
- #
47
- # @see http://wiki.apache.org/solr/AnalyzersTokenizersTokenFilters#solr.ClassicFilterFactory
48
- def classic_filter
49
- self.class.new(self.gsub('.', '').chomp("'s"))
44
+ # Returns a string with no English possessive or periods in acronyms.
45
+ #
46
+ # @return [Token] a string with no English possessive or periods in acronyms
47
+ #
48
+ # @see http://wiki.apache.org/solr/AnalyzersTokenizersTokenFilters#solr.ClassicFilterFactory
49
+ def classic_filter
50
+ self.class.new(self.gsub('.', '').chomp("'s"))
51
+ end
50
52
  end
51
53
  end
@@ -1,3 +1,3 @@
1
1
  module TfIdfSimilarity
2
- VERSION = "0.1.3"
2
+ VERSION = "0.1.4"
3
3
  end
@@ -0,0 +1,200 @@
1
+ require 'spec_helper'
2
+
3
+ module TfIdfSimilarity
4
+ describe BM25Model do
5
+ let :text do
6
+ "FOO-foo BAR bar \r\n\t 123 !@#"
7
+ end
8
+
9
+ let :tokens do
10
+ ['FOO-foo', 'BAR', 'bar', "\r\n\t", '123', '!@#']
11
+ end
12
+
13
+ let :document_without_text do
14
+ Document.new('')
15
+ end
16
+
17
+ let :document do
18
+ Document.new(text)
19
+ end
20
+
21
+ let :document_with_tokens do
22
+ Document.new(text, :tokens => tokens)
23
+ end
24
+
25
+ let :document_with_term_counts do
26
+ Document.new(text, :term_counts => {'bar' => 5, 'baz' => 10})
27
+ end
28
+
29
+ let :non_corpus_document do
30
+ Document.new('foo foo foo')
31
+ end
32
+
33
+ def similarity_matrix_values(model)
34
+ matrix = model.similarity_matrix
35
+ if MATRIX_LIBRARY == :nmatrix
36
+ matrix.each.to_a
37
+ else
38
+ matrix.to_a.flatten
39
+ end
40
+ end
41
+
42
+ context 'without documents', :empty_matrix => true do
43
+ let :model do
44
+ BM25Model.new([], :library => MATRIX_LIBRARY)
45
+ end
46
+
47
+ describe '#documents' do
48
+ it 'should be empty' do
49
+ model.documents.should be_empty
50
+ end
51
+ end
52
+
53
+ describe '#document_index' do
54
+ it 'should return nil' do
55
+ model.document_index(document).should be_nil
56
+ end
57
+ end
58
+
59
+ describe '#text_index' do
60
+ it 'should return nil' do
61
+ model.text_index(text).should be_nil
62
+ end
63
+ end
64
+
65
+ describe '#terms' do
66
+ it 'should be empty' do
67
+ model.terms.should be_empty
68
+ end
69
+ end
70
+
71
+ describe '#inverse_document_frequency' do
72
+ it 'should return negative infinity' do
73
+ model.idf('foo').should == 0.0
74
+ end
75
+ end
76
+
77
+ describe '#term_frequency' do
78
+ it 'should return the term frequency' do
79
+ model.tf(document, 'foo').should be_nan
80
+ end
81
+ end
82
+
83
+ describe '#term_frequency_inverse_document_frequency' do
84
+ it 'should return negative infinity' do
85
+ model.tfidf(document, 'foo').should be_nan
86
+ end
87
+ end
88
+
89
+ describe '#similarity_matrix' do
90
+ it 'should be empty' do
91
+ similarity_matrix_values(model).should be_empty
92
+ end
93
+ end
94
+ end
95
+
96
+ context 'with documents' do
97
+ let :documents do
98
+ [
99
+ document,
100
+ document_with_tokens,
101
+ document_without_text,
102
+ document_with_term_counts,
103
+ ]
104
+ end
105
+
106
+ let :model do
107
+ BM25Model.new(documents, :library => MATRIX_LIBRARY)
108
+ end
109
+
110
+ describe '#documents' do
111
+ it 'should return the documents' do
112
+ model.documents.should == documents
113
+ end
114
+ end
115
+
116
+ describe '#document_index' do
117
+ it 'should return nil' do
118
+ model.document_index(document).should == 0
119
+ end
120
+ end
121
+
122
+ describe '#text_index' do
123
+ it 'should return the index' do
124
+ model.text_index(text).should == 0
125
+ end
126
+ end
127
+
128
+ describe '#terms' do
129
+ it 'should return the terms' do
130
+ model.terms.to_a.sort.should == ['bar', 'baz', 'foo', 'foo-foo']
131
+ end
132
+ end
133
+
134
+ describe '#inverse_document_frequency' do
135
+ it 'should return the inverse document frequency' do
136
+ model.idf('foo').should be_within(0.001).of(Math.log((4 - 1 + 0.5) / (1 + 0.5)))
137
+ end
138
+
139
+ it 'should return the inverse document frequency of a non-occurring term' do
140
+ model.idf('xxx').should be_within(0.001).of(Math.log((4 - 0 + 0.5) / (0 + 0.5)))
141
+ end
142
+ end
143
+
144
+ describe '#term_frequency' do
145
+ it 'should return the term frequency if no tokens given' do
146
+ model.tf(document, 'foo').should == (2 * 2.2) / (2 + 0.3 + 0.9 * 4 / 5.5)
147
+ end
148
+
149
+ it 'should return the term frequency if tokens given' do
150
+ model.tf(document_with_tokens, 'foo-foo').should == (1 * 2.2) / (1 + 0.3 + 0.9 * 4 / 5.5)
151
+ end
152
+
153
+ it 'should return no term frequency if no text given' do
154
+ model.tf(document_without_text, 'foo').should == 0
155
+ end
156
+
157
+ it 'should return the term frequency if term counts given' do
158
+ model.tf(document_with_term_counts, 'bar').should == (5 * 2.2) / (5 + 0.3 + 0.9 * 4 / 5.5)
159
+ end
160
+
161
+ it 'should return the term frequency of a non-occurring term' do
162
+ model.tf(document, 'xxx').should == 0
163
+ end
164
+
165
+ it 'should return the term frequency in a non-occurring document' do
166
+ model.tf(non_corpus_document, 'foo').should == (3 * 2.2) / (3 + 0.3 + 0.9 * 4 / 5.5)
167
+ end
168
+ end
169
+
170
+ describe '#term_frequency_inverse_document_frequency' do
171
+ it 'should return the tf*idf' do
172
+ model.tfidf(document, 'foo').should be_within(0.001).of(Math.log((4 - 1 + 0.5) / (1 + 0.5)) * (2 * 2.2) / (2 + 0.3 + 0.9 * 4 / 5.5))
173
+ end
174
+
175
+ it 'should return the tf*idf of a non-occurring term' do
176
+ model.tfidf(document, 'xxx').should == 0
177
+ end
178
+
179
+ it 'should return the tf*idf in a non-occurring term' do
180
+ model.tfidf(non_corpus_document, 'foo').should be_within(0.001).of(Math.log((4 - 1 + 0.5) / (1 + 0.5)) * (3 * 2.2) / (3 + 0.3 + 0.9 * 4 / 5.5))
181
+ end
182
+ end
183
+
184
+ describe '#similarity_matrix' do
185
+ it 'should return the similarity matrix' do
186
+ expected = [
187
+ 1.0, 0.564, 0.0, 0.479,
188
+ 0.564, 1.0, 0.0, 0.540,
189
+ 0.0, 0.0, 0.0, 0.0,
190
+ 0.479, 0.540, 0.0, 1.0,
191
+ ]
192
+
193
+ similarity_matrix_values(model).each_with_index do |value,i|
194
+ value.should be_within(0.001).of(expected[i])
195
+ end
196
+ end
197
+ end
198
+ end
199
+ end
200
+ end
@@ -1,136 +1,138 @@
1
1
  require 'spec_helper'
2
2
 
3
3
  # @see https://github.com/bbcrd/Similarity/blob/master/test/test_document.rb
4
- describe TfIdfSimilarity::Document do
5
- let :text do
6
- "FOO-foo BAR bar \r\n\t 123 !@#"
7
- end
8
-
9
- let :tokens do
10
- ['FOO-foo', 'BAR', 'bar', "\r\n\t", '123', '!@#']
11
- end
12
-
13
- let :document_without_text do
14
- TfIdfSimilarity::Document.new('')
15
- end
16
-
17
- let :document do
18
- TfIdfSimilarity::Document.new(text)
19
- end
20
-
21
- let :document_with_id do
22
- TfIdfSimilarity::Document.new(text, :id => 'baz')
23
- end
24
-
25
- let :document_with_tokens do
26
- TfIdfSimilarity::Document.new(text, :tokens => tokens)
27
- end
28
-
29
- let :document_with_term_counts do
30
- TfIdfSimilarity::Document.new(text, :term_counts => {'bar' => 5, 'baz' => 10})
31
- end
32
-
33
- let :document_with_term_counts_and_size do
34
- TfIdfSimilarity::Document.new(text, :term_counts => {'bar' => 5, 'baz' => 10}, :size => 20)
35
- end
36
-
37
- let :document_with_size do
38
- TfIdfSimilarity::Document.new(text, :size => 10)
39
- end
40
-
41
- describe '#id' do
42
- it 'should return the ID if no ID given' do
43
- document.id.should == document.object_id
4
+ module TfIdfSimilarity
5
+ describe Document do
6
+ let :text do
7
+ "FOO-foo BAR bar \r\n\t 123 !@#"
44
8
  end
45
9
 
46
- it 'should return the given ID' do
47
- document_with_id.id.should == 'baz'
10
+ let :tokens do
11
+ ['FOO-foo', 'BAR', 'bar', "\r\n\t", '123', '!@#']
48
12
  end
49
- end
50
13
 
51
- describe '#text' do
52
- it 'should return the text' do
53
- document.text.should == text
14
+ let :document_without_text do
15
+ Document.new('')
54
16
  end
55
- end
56
17
 
57
- describe '#size' do
58
- it 'should return the number of tokens if no tokens given' do
59
- document.size.should == 4
18
+ let :document do
19
+ Document.new(text)
60
20
  end
61
21
 
62
- it 'should return the number of tokens if tokens given' do
63
- document_with_tokens.size.should == 3
22
+ let :document_with_id do
23
+ Document.new(text, :id => 'baz')
64
24
  end
65
25
 
66
- it 'should return the number of tokens if no text given' do
67
- document_without_text.size.should == 0
26
+ let :document_with_tokens do
27
+ Document.new(text, :tokens => tokens)
68
28
  end
69
29
 
70
- it 'should return the number of tokens if term counts given' do
71
- document_with_term_counts.size.should == 15
30
+ let :document_with_term_counts do
31
+ Document.new(text, :term_counts => {'bar' => 5, 'baz' => 10})
72
32
  end
73
33
 
74
- it 'should return the given number of tokens if term counts and size given' do
75
- document_with_term_counts_and_size.size.should == 20
34
+ let :document_with_term_counts_and_size do
35
+ Document.new(text, :term_counts => {'bar' => 5, 'baz' => 10}, :size => 20)
76
36
  end
77
37
 
78
- it 'should not return the given number of tokens if term counts not given' do
79
- document_with_size.size.should_not == 10
38
+ let :document_with_size do
39
+ Document.new(text, :size => 10)
80
40
  end
81
- end
82
41
 
83
- describe '#term_counts' do
84
- it 'should return the term counts if no tokens given' do
85
- document.term_counts.should == {'foo' => 2, 'bar' => 2}
86
- end
42
+ describe '#id' do
43
+ it 'should return the ID if no ID given' do
44
+ document.id.should == document.object_id
45
+ end
87
46
 
88
- it 'should return the term counts if tokens given' do
89
- document_with_tokens.term_counts.should == {'foo-foo' => 1, 'bar' => 2}
47
+ it 'should return the given ID' do
48
+ document_with_id.id.should == 'baz'
49
+ end
90
50
  end
91
51
 
92
- it 'should return no term counts if no text given' do
93
- document_without_text.term_counts.should == {}
52
+ describe '#text' do
53
+ it 'should return the text' do
54
+ document.text.should == text
55
+ end
94
56
  end
95
57
 
96
- it 'should return the term counts if term counts given' do
97
- document_with_term_counts.term_counts.should == {'bar' => 5, 'baz' => 10}
98
- end
99
- end
58
+ describe '#size' do
59
+ it 'should return the number of tokens if no tokens given' do
60
+ document.size.should == 4
61
+ end
100
62
 
101
- describe '#terms' do
102
- it 'should return the terms if no tokens given' do
103
- document.terms.sort.should == ['bar', 'foo']
104
- end
63
+ it 'should return the number of tokens if tokens given' do
64
+ document_with_tokens.size.should == 3
65
+ end
105
66
 
106
- it 'should return the terms if tokens given' do
107
- document_with_tokens.terms.sort.should == ['bar', 'foo-foo']
108
- end
67
+ it 'should return the number of tokens if no text given' do
68
+ document_without_text.size.should == 0
69
+ end
109
70
 
110
- it 'should return no terms if no text given' do
111
- document_without_text.terms.should == []
112
- end
71
+ it 'should return the number of tokens if term counts given' do
72
+ document_with_term_counts.size.should == 15
73
+ end
113
74
 
114
- it 'should return the terms if term counts given' do
115
- document_with_term_counts.terms.sort.should == ['bar', 'baz']
116
- end
117
- end
75
+ it 'should return the given number of tokens if term counts and size given' do
76
+ document_with_term_counts_and_size.size.should == 20
77
+ end
118
78
 
119
- describe '#term_count' do
120
- it 'should return the term count if no tokens given' do
121
- document.term_count('foo').should == 2
79
+ it 'should not return the given number of tokens if term counts not given' do
80
+ document_with_size.size.should_not == 10
81
+ end
122
82
  end
123
83
 
124
- it 'should return the term count if tokens given' do
125
- document_with_tokens.term_count('foo-foo').should == 1
84
+ describe '#term_counts' do
85
+ it 'should return the term counts if no tokens given' do
86
+ document.term_counts.should == {'foo' => 2, 'bar' => 2}
87
+ end
88
+
89
+ it 'should return the term counts if tokens given' do
90
+ document_with_tokens.term_counts.should == {'foo-foo' => 1, 'bar' => 2}
91
+ end
92
+
93
+ it 'should return no term counts if no text given' do
94
+ document_without_text.term_counts.should == {}
95
+ end
96
+
97
+ it 'should return the term counts if term counts given' do
98
+ document_with_term_counts.term_counts.should == {'bar' => 5, 'baz' => 10}
99
+ end
126
100
  end
127
101
 
128
- it 'should return no term count if no text given' do
129
- document_without_text.term_count('foo').should == 0
102
+ describe '#terms' do
103
+ it 'should return the terms if no tokens given' do
104
+ document.terms.sort.should == ['bar', 'foo']
105
+ end
106
+
107
+ it 'should return the terms if tokens given' do
108
+ document_with_tokens.terms.sort.should == ['bar', 'foo-foo']
109
+ end
110
+
111
+ it 'should return no terms if no text given' do
112
+ document_without_text.terms.should == []
113
+ end
114
+
115
+ it 'should return the terms if term counts given' do
116
+ document_with_term_counts.terms.sort.should == ['bar', 'baz']
117
+ end
130
118
  end
131
119
 
132
- it 'should return the term count if term counts given' do
133
- document_with_term_counts.term_count('bar').should == 5
120
+ describe '#term_count' do
121
+ it 'should return the term count if no tokens given' do
122
+ document.term_count('foo').should == 2
123
+ end
124
+
125
+ it 'should return the term count if tokens given' do
126
+ document_with_tokens.term_count('foo-foo').should == 1
127
+ end
128
+
129
+ it 'should return no term count if no text given' do
130
+ document_without_text.term_count('foo').should == 0
131
+ end
132
+
133
+ it 'should return the term count if term counts given' do
134
+ document_with_term_counts.term_count('bar').should == 5
135
+ end
134
136
  end
135
137
  end
136
138
  end