tf-idf-similarity 0.0.9 → 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,164 @@
1
+ module TfIdfSimilarity::MatrixMethods
2
+ private
3
+
4
+ # @return [GSL::Matrix,NArray,NMatrix,Matrix] all document vectors as unit vectors
5
+ #
6
+ # @note Lucene normalizes document length differently.
7
+ def normalize
8
+ case @library
9
+ when :gsl
10
+ @matrix.clone.each_col do |column|
11
+ unless column.isnull?
12
+ column.normalize!
13
+ end
14
+ end
15
+ when :narray # @see https://github.com/masa16/narray/issues/21
16
+ norm = NMath.sqrt((@matrix ** 2).sum(1).reshape(@matrix.shape[0], 1))
17
+ norm[norm.where2[1]] = 1.0 # avoid division by zero
18
+ NMatrix.refer(@matrix / norm) # must be NMatrix for matrix multiplication
19
+ when :nmatrix # @see https://github.com/SciRuby/nmatrix/issues/38
20
+ normal = NMatrix.new(:dense, @matrix.shape, :float64)
21
+ (0...@matrix.shape[1]).each do |j|
22
+ column = @matrix.column(j)
23
+ norm = Math.sqrt(column.transpose.dot(column)[0, 0])
24
+ (0...@matrix.shape[0]).each do |i|
25
+ normal[i, j] = norm.zero? ? 0 : @matrix[i, j] / norm
26
+ end
27
+ end
28
+ normal
29
+ else
30
+ Matrix.columns(@matrix.column_vectors.map do |column|
31
+ if column.to_a.all?(&:zero?)
32
+ column
33
+ elsif column.respond_to?(:normalize)
34
+ column.normalize
35
+ else
36
+ column * (1 / Math.sqrt(column.inner_product(column))) # Ruby 1.8 does not define Vector division
37
+ end
38
+ end)
39
+ end
40
+ end
41
+
42
+ # @param [Integer] i the row index
43
+ # @param [Integer] j the column index
44
+ def get(i, j)
45
+ case @library
46
+ when :narray
47
+ @matrix[j, i]
48
+ else
49
+ @matrix[i, j]
50
+ end
51
+ end
52
+
53
+ # @param [Integer] index the row index
54
+ # @return [GSL::Vector::View,NArray,NMatrix,Vector] a row
55
+ def row(index)
56
+ case @library
57
+ when :narray
58
+ @matrix[true, index]
59
+ else
60
+ @matrix.row(index)
61
+ end
62
+ end
63
+
64
+ # @param [Integer] index the column index
65
+ # @return [GSL::Vector::View,NArray,NMatrix,Vector] a column
66
+ def column(index)
67
+ case @library
68
+ when :narray
69
+ @matrix[index, true]
70
+ else
71
+ @matrix.column(index)
72
+ end
73
+ end
74
+
75
+ # @return [Integer] the number of rows in the matrix
76
+ def row_size
77
+ case @library
78
+ when :gsl, :nmatrix
79
+ @matrix.shape[0]
80
+ when :narray
81
+ @matrix.shape[1]
82
+ else
83
+ @matrix.row_size
84
+ end
85
+ end
86
+
87
+ # @return [Integer] the number of columns in the matrix
88
+ def column_size
89
+ case @library
90
+ when :gsl, :nmatrix
91
+ @matrix.shape[1]
92
+ when :narray
93
+ @matrix.shape[0]
94
+ else
95
+ @matrix.column_size
96
+ end
97
+ end
98
+
99
+ # @return [Array<Float>] the matrix's values
100
+ def values
101
+ case @library
102
+ when :nmatrix
103
+ @matrix.each.to_a
104
+ else
105
+ @matrix.to_a.flatten
106
+ end
107
+ end
108
+
109
+ # @return [Float] the sum of all values in the matrix
110
+ def sum
111
+ case @library
112
+ when :narray
113
+ @matrix.sum
114
+ else
115
+ values.reduce(0, :+)
116
+ end
117
+ end
118
+
119
+ # @param [Array<Array>] array matrix rows
120
+ # @return [GSL::Matrix,NArray,NMatrix,Matrix] a matrix
121
+ def initialize_matrix(array)
122
+ case @library
123
+ when :gsl
124
+ GSL::Matrix[*array]
125
+ when :narray
126
+ NArray[*array]
127
+ when :nmatrix # @see https://github.com/SciRuby/nmatrix/issues/91
128
+ NMatrix.new(:dense, [array.size, array.empty? ? 0 : array[0].size], array.flatten)
129
+ else
130
+ Matrix[*array]
131
+ end
132
+ end
133
+
134
+ # @param [GSL::Matrix,NArray,NMatrix,Matrix] matrix a matrix
135
+ # @return [GSL::Matrix,NArray,NMatrix,Matrix] the product
136
+ def multiply_self(matrix)
137
+ case @library
138
+ when :nmatrix
139
+ matrix.transpose.dot(matrix)
140
+ else
141
+ matrix.transpose * matrix
142
+ end
143
+ end
144
+
145
+ def log(number)
146
+ case @library
147
+ when :gsl
148
+ GSL::Sf::log(number)
149
+ when :narray
150
+ NMath.log(number)
151
+ else
152
+ Math.log(number)
153
+ end
154
+ end
155
+
156
+ def sqrt(number)
157
+ case @library
158
+ when :narray
159
+ NMath.sqrt(number)
160
+ else
161
+ Math.sqrt(number)
162
+ end
163
+ end
164
+ end
@@ -0,0 +1,78 @@
1
+ # A simple document-term matrix.
2
+ #
3
+ # @see http://lucene.apache.org/core/4_0_0-BETA/core/org/apache/lucene/search/similarities/TFIDFSimilarity.html
4
+ # @see http://lucene.apache.org/core/4_0_0-BETA/core/org/apache/lucene/search/similarities/BM25Similarity.html
5
+ # @see http://en.wikipedia.org/wiki/Okapi_BM25
6
+ class TfIdfSimilarity::TermCountModel
7
+ include TfIdfSimilarity::MatrixMethods
8
+
9
+ # The documents in the corpus.
10
+ attr_reader :documents
11
+ # The set of terms in the corpus.
12
+ attr_reader :terms
13
+ # The average number of tokens in a document.
14
+ attr_reader :average_document_size
15
+
16
+ # @param [Array<TfIdfSimilarity::Document>] documents documents
17
+ # @param [Hash] opts optional arguments
18
+ # @option opts [Symbol] :library :gsl, :narray, :nmatrix or :matrix (default)
19
+ def initialize(documents, opts = {})
20
+ @documents = documents
21
+ @terms = Set.new(documents.map(&:terms).flatten).to_a
22
+ @library = (opts[:library] || :matrix).to_sym
23
+
24
+ array = Array.new(terms.size) do |i|
25
+ Array.new(documents.size) do |j|
26
+ documents[j].term_count(terms[i])
27
+ end
28
+ end
29
+
30
+ @matrix = initialize_matrix(array)
31
+
32
+ @average_document_size = documents.empty? ? 0 : sum / column_size.to_f
33
+ end
34
+
35
+ # @param [String] term a term
36
+ # @return [Integer] the number of documents the term appears in
37
+ def document_count(term)
38
+ index = terms.index(term)
39
+ if index
40
+ case @library
41
+ when :gsl, :narray
42
+ row(index).where.size
43
+ when :nmatrix
44
+ row(index).each.count(&:nonzero?)
45
+ else
46
+ vector = row(index)
47
+ unless vector.respond_to?(:count)
48
+ vector = vector.to_a
49
+ end
50
+ vector.count(&:nonzero?)
51
+ end
52
+ else
53
+ 0
54
+ end
55
+ end
56
+
57
+ # @param [String] term a term
58
+ # @return [Integer] the number of times the term appears in the corpus
59
+ def term_count(term)
60
+ index = terms.index(term)
61
+ if index
62
+ case @library
63
+ when :gsl, :narray
64
+ row(index).sum
65
+ when :nmatrix
66
+ row(index).each.reduce(0, :+)
67
+ else
68
+ vector = row(index)
69
+ unless vector.respond_to?(:reduce)
70
+ vector = vector.to_a
71
+ end
72
+ vector.reduce(0, :+)
73
+ end
74
+ else
75
+ 0
76
+ end
77
+ end
78
+ end
@@ -0,0 +1,81 @@
1
+ # A document-term matrix using either the tf*idf or BM25 functions.
2
+ #
3
+ # @see http://lucene.apache.org/core/4_0_0-BETA/core/org/apache/lucene/search/similarities/TFIDFSimilarity.html
4
+ # @see http://lucene.apache.org/core/4_0_0-BETA/core/org/apache/lucene/search/similarities/BM25Similarity.html
5
+ # @see http://en.wikipedia.org/wiki/Okapi_BM25
6
+ class TfIdfSimilarity::TfIdfModel
7
+ include TfIdfSimilarity::MatrixMethods
8
+
9
+ extend Forwardable
10
+ def_delegators :@model, :documents, :terms, :document_count
11
+
12
+ # @param [Array<TfIdfSimilarity::Document>] documents documents
13
+ # @param [Hash] opts optional arguments
14
+ # @option opts [Symbol] :library :gsl, :narray, :nmatrix or :matrix (default)
15
+ # @option opts [Symbol] :function :tfidf (default) or :bm25
16
+ def initialize(documents, opts = {})
17
+ @model = TfIdfSimilarity::TermCountModel.new(documents, opts)
18
+ @library = (opts[:library] || :matrix).to_sym
19
+ @function = (opts[:function] || :tfidf).to_sym
20
+
21
+ array = Array.new(terms.size) do |i|
22
+ idf = inverse_document_frequency(terms[i])
23
+ Array.new(documents.size) do |j|
24
+ term_frequency(documents[j], terms[i]) * idf
25
+ end
26
+ end
27
+
28
+ @matrix = initialize_matrix(array)
29
+ end
30
+
31
+ # Returns the term's inverse document frequency.
32
+ #
33
+ # @param [String] term a term
34
+ # @return [Float] the term's inverse document frequency
35
+ def inverse_document_frequency(term)
36
+ df = @model.document_count(term)
37
+ if @function == :bm25
38
+ log((documents.size - df + 0.5) / (df + 0.5))
39
+ else
40
+ 1 + log(documents.size / (df + 1.0))
41
+ end
42
+ end
43
+ alias_method :idf, :inverse_document_frequency
44
+
45
+ # Returns the term's frequency in the document.
46
+ #
47
+ # @param [Document] document a document
48
+ # @param [String] term a term
49
+ # @return [Float] the term's frequency in the document
50
+ #
51
+ # @note Like Lucene, we use a b value of 0.75 and a k1 value of 1.2.
52
+ def term_frequency(document, term)
53
+ tf = document.term_count(term)
54
+ if @function == :bm25
55
+ (tf * 2.2) / (tf + 0.3 + 0.9 * documents.size / @model.average_document_size)
56
+ else
57
+ sqrt(tf)
58
+ end
59
+ end
60
+ alias_method :tf, :term_frequency
61
+
62
+ # Returns the term frequency–inverse document frequency.
63
+ #
64
+ # @param [Document] document a document
65
+ # @param [String] term a term
66
+ # @return [Float] the term frequency–inverse document frequency
67
+ def term_frequency_inverse_document_frequency(document, term)
68
+ inverse_document_frequency(term) * term_frequency(document, term)
69
+ end
70
+ alias_method :tfidf, :term_frequency_inverse_document_frequency
71
+
72
+ # Returns a similarity matrix for the documents in the corpus.
73
+ #
74
+ # @return [GSL::Matrix,NMatrix,Matrix] a similarity matrix
75
+ # @note Columns are normalized to unit vectors, so we can calculate the cosine
76
+ # similarity of all document vectors. BM25 doesn't normalize columns, but
77
+ # BM25 wasn't written with this use case in mind.
78
+ def similarity_matrix
79
+ multiply_self(normalize)
80
+ end
81
+ end
@@ -1,5 +1,7 @@
1
1
  # coding: utf-8
2
2
 
3
+ # A token.
4
+ #
3
5
  # @note We can add more filters from Solr and stem using Porter's Snowball.
4
6
  #
5
7
  # @see https://github.com/aurelian/ruby-stemmer
@@ -14,29 +16,49 @@ class TfIdfSimilarity::Token < String
14
16
  #
15
17
  # @return [Boolean] whether the string is a token
16
18
  def valid?
17
- !self[%r{
18
- \A
19
- (
20
- \d | # number
21
- \p{Cntrl} | # control character
22
- \p{Punct} | # punctuation
23
- [[:space:]] # whitespace
24
- )+
25
- \z
26
- }x]
19
+ if RUBY_VERSION < '1.9'
20
+ !self[%r{
21
+ \A
22
+ (
23
+ \d | # number
24
+ [[:cntrl:]] | # control character
25
+ [[:punct:]] | # punctuation
26
+ [[:space:]] # whitespace
27
+ )+
28
+ \z
29
+ }x]
30
+ else
31
+ !self[%r{
32
+ \A
33
+ (
34
+ \d | # number
35
+ \p{Cntrl} | # control character
36
+ \p{Punct} | # punctuation
37
+ \p{Space} # whitespace
38
+ )+
39
+ \z
40
+ }x] # The Ruby 1.8 parser will complain about this regular expression.
41
+ end
27
42
  end
28
43
 
44
+ # Returns a lowercase string.
45
+ #
29
46
  # @return [Token] a lowercase string
30
47
  #
31
48
  # @see http://wiki.apache.org/solr/AnalyzersTokenizersTokenFilters#solr.LowerCaseFilterFactory
32
49
  def lowercase_filter
33
- self.class.new UnicodeUtils.downcase(self, :fr)
50
+ self.class.new(defined?(UnicodeUtils) ? UnicodeUtils.downcase(self) : tr(
51
+ "ÀÁÂÃÄÅĀĂĄÇĆĈĊČÐĎĐÈÉÊËĒĔĖĘĚĜĞĠĢĤĦÌÍÎÏĨĪĬĮĴĶĹĻĽĿŁÑŃŅŇŊÒÓÔÕÖØŌŎŐŔŖŘŚŜŞŠŢŤŦÙÚÛÜŨŪŬŮŰŲŴÝŶŸŹŻŽ",
52
+ "àáâãäåāăąçćĉċčðďđèéêëēĕėęěĝğġģĥħìíîïĩīĭįĵķĺļľŀłñńņňŋòóôõöøōŏőŕŗřśŝşšţťŧùúûüũūŭůűųŵýŷÿźżž"
53
+ ).downcase)
34
54
  end
35
55
 
56
+ # Returns a string with no English possessive or periods in acronyms.
57
+ #
36
58
  # @return [Token] a string with no English possessive or periods in acronyms
37
59
  #
38
60
  # @see http://wiki.apache.org/solr/AnalyzersTokenizersTokenFilters#solr.ClassicFilterFactory
39
61
  def classic_filter
40
- self.class.new self.gsub('.', '').chomp("'s")
62
+ self.class.new(self.gsub('.', '').chomp("'s"))
41
63
  end
42
64
  end
@@ -1,3 +1,3 @@
1
1
  module TfIdfSimilarity
2
- VERSION = "0.0.9"
2
+ VERSION = "0.1.0"
3
3
  end
@@ -0,0 +1,136 @@
1
+ require 'spec_helper'
2
+
3
+ # @see https://github.com/bbcrd/Similarity/blob/master/test/test_document.rb
4
+ describe TfIdfSimilarity::Document do
5
+ let :text do
6
+ "FOO-foo BAR bar \r\n\t 123 !@#"
7
+ end
8
+
9
+ let :tokens do
10
+ ['FOO-foo', 'BAR', 'bar', "\r\n\t", '123', '!@#']
11
+ end
12
+
13
+ let :document_without_text do
14
+ TfIdfSimilarity::Document.new('')
15
+ end
16
+
17
+ let :document do
18
+ TfIdfSimilarity::Document.new(text)
19
+ end
20
+
21
+ let :document_with_id do
22
+ TfIdfSimilarity::Document.new(text, :id => 'baz')
23
+ end
24
+
25
+ let :document_with_tokens do
26
+ TfIdfSimilarity::Document.new(text, :tokens => tokens)
27
+ end
28
+
29
+ let :document_with_term_counts do
30
+ TfIdfSimilarity::Document.new(text, :term_counts => {'bar' => 5, 'baz' => 10})
31
+ end
32
+
33
+ let :document_with_term_counts_and_size do
34
+ TfIdfSimilarity::Document.new(text, :term_counts => {'bar' => 5, 'baz' => 10}, :size => 20)
35
+ end
36
+
37
+ let :document_with_size do
38
+ TfIdfSimilarity::Document.new(text, :size => 10)
39
+ end
40
+
41
+ describe '#id' do
42
+ it 'should return the ID if no ID given' do
43
+ document.id.should == document.object_id
44
+ end
45
+
46
+ it 'should return the given ID' do
47
+ document_with_id.id.should == 'baz'
48
+ end
49
+ end
50
+
51
+ describe '#text' do
52
+ it 'should return the text' do
53
+ document.text.should == text
54
+ end
55
+ end
56
+
57
+ describe '#size' do
58
+ it 'should return the number of tokens if no tokens given' do
59
+ document.size.should == 4
60
+ end
61
+
62
+ it 'should return the number of tokens if tokens given' do
63
+ document_with_tokens.size.should == 3
64
+ end
65
+
66
+ it 'should return the number of tokens if no text given' do
67
+ document_without_text.size.should == 0
68
+ end
69
+
70
+ it 'should return the number of tokens if term counts given' do
71
+ document_with_term_counts.size.should == 15
72
+ end
73
+
74
+ it 'should return the given number of tokens if term counts and size given' do
75
+ document_with_term_counts_and_size.size.should == 20
76
+ end
77
+
78
+ it 'should not return the given number of tokens if term counts not given' do
79
+ document_with_size.size.should_not == 10
80
+ end
81
+ end
82
+
83
+ describe '#term_counts' do
84
+ it 'should return the term counts if no tokens given' do
85
+ document.term_counts.should == {'foo' => 2, 'bar' => 2}
86
+ end
87
+
88
+ it 'should return the term counts if tokens given' do
89
+ document_with_tokens.term_counts.should == {'foo-foo' => 1, 'bar' => 2}
90
+ end
91
+
92
+ it 'should return no term counts if no text given' do
93
+ document_without_text.term_counts.should == {}
94
+ end
95
+
96
+ it 'should return the term counts if term counts given' do
97
+ document_with_term_counts.term_counts.should == {'bar' => 5, 'baz' => 10}
98
+ end
99
+ end
100
+
101
+ describe '#terms' do
102
+ it 'should return the terms if no tokens given' do
103
+ document.terms.sort.should == ['bar', 'foo']
104
+ end
105
+
106
+ it 'should return the terms if tokens given' do
107
+ document_with_tokens.terms.sort.should == ['bar', 'foo-foo']
108
+ end
109
+
110
+ it 'should return no terms if no text given' do
111
+ document_without_text.terms.should == []
112
+ end
113
+
114
+ it 'should return the terms if term counts given' do
115
+ document_with_term_counts.terms.sort.should == ['bar', 'baz']
116
+ end
117
+ end
118
+
119
+ describe '#term_count' do
120
+ it 'should return the term count if no tokens given' do
121
+ document.term_count('foo').should == 2
122
+ end
123
+
124
+ it 'should return the term count if tokens given' do
125
+ document_with_tokens.term_count('foo-foo').should == 1
126
+ end
127
+
128
+ it 'should return no term count if no text given' do
129
+ document_without_text.term_count('foo').should == 0
130
+ end
131
+
132
+ it 'should return the term count if term counts given' do
133
+ document_with_term_counts.term_count('bar').should == 5
134
+ end
135
+ end
136
+ end