tf-idf-similarity 0.0.9 → 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,164 @@
1
+ module TfIdfSimilarity::MatrixMethods
2
+ private
3
+
4
# Scales every column (document vector) of the matrix to unit length.
# Columns that are entirely zero are left as they are.
#
# @return [GSL::Matrix,NArray,NMatrix,Matrix] all document vectors as unit vectors
#
# @note Lucene normalizes document length differently.
def normalize
  case @library
  when :gsl
    # Operate on a copy so that @matrix itself is not modified.
    @matrix.clone.each_col do |document_vector|
      document_vector.normalize! unless document_vector.isnull?
    end
  when :narray # @see https://github.com/masa16/narray/issues/21
    lengths = NMath.sqrt((@matrix ** 2).sum(1).reshape(@matrix.shape[0], 1))
    lengths[lengths.where2[1]] = 1.0 # avoid division by zero
    NMatrix.refer(@matrix / lengths) # must be NMatrix for matrix multiplication
  when :nmatrix # @see https://github.com/SciRuby/nmatrix/issues/38
    unit = NMatrix.new(:dense, @matrix.shape, :float64)
    (0...@matrix.shape[1]).each do |j|
      document_vector = @matrix.column(j)
      length = Math.sqrt(document_vector.transpose.dot(document_vector)[0, 0])
      (0...@matrix.shape[0]).each do |i|
        unit[i, j] = length.zero? ? 0 : @matrix[i, j] / length
      end
    end
    unit
  else
    unit_columns = @matrix.column_vectors.map do |document_vector|
      if document_vector.to_a.all?(&:zero?)
        document_vector
      elsif document_vector.respond_to?(:normalize)
        document_vector.normalize
      else
        # Fallback for Vector implementations without #normalize: multiply by
        # the reciprocal of the length instead of dividing the vector.
        document_vector * (1 / Math.sqrt(document_vector.inner_product(document_vector)))
      end
    end
    Matrix.columns(unit_columns)
  end
end
41
+
42
# Looks up a single cell of the matrix.
#
# @param [Integer] i the row index
# @param [Integer] j the column index
# @return the value stored for row +i+ and column +j+
def get(i, j)
  # The :narray backend is addressed with the two indices swapped.
  return @matrix[j, i] if @library == :narray
  @matrix[i, j]
end
52
+
53
# Fetches one row of the matrix.
#
# @param [Integer] index the row index
# @return [GSL::Vector::View,NArray,NMatrix,Vector] a row
def row(index)
  # The :narray backend selects the row by passing +true+ for the first index.
  @library == :narray ? @matrix[true, index] : @matrix.row(index)
end
63
+
64
# Fetches one column of the matrix.
#
# @param [Integer] index the column index
# @return [GSL::Vector::View,NArray,NMatrix,Vector] a column
def column(index)
  # The :narray backend selects the column by passing +true+ for the second index.
  @library == :narray ? @matrix[index, true] : @matrix.column(index)
end
74
+
75
# Reports how many rows the matrix has.
#
# @return [Integer] the number of rows in the matrix
def row_size
  # Position of the row count within #shape for the backends that expose it;
  # any other backend answers #row_size directly.
  axis = { :gsl => 0, :nmatrix => 0, :narray => 1 }[@library]
  axis ? @matrix.shape[axis] : @matrix.row_size
end
86
+
87
# Reports how many columns the matrix has.
#
# @return [Integer] the number of columns in the matrix
def column_size
  # Position of the column count within #shape for the backends that expose
  # it; any other backend answers #column_size directly.
  axis = { :gsl => 1, :nmatrix => 1, :narray => 0 }[@library]
  axis ? @matrix.shape[axis] : @matrix.column_size
end
98
+
99
# Lists every value held in the matrix as one flat array.
#
# @return [Array<Float>] the matrix's values
def values
  # The :nmatrix backend is enumerated directly; every other backend is
  # converted to nested arrays first and then flattened.
  return @matrix.each.to_a if @library == :nmatrix
  @matrix.to_a.flatten
end
108
+
109
# Adds up every value in the matrix.
#
# @return [Numeric] the sum of all values in the matrix
def sum
  # The :narray backend sums itself; other backends are summed from #values.
  return @matrix.sum if @library == :narray
  values.inject(0) { |total, value| total + value }
end
118
+
119
# Builds a matrix for the configured backend from nested Ruby arrays.
#
# @param [Array<Array>] array matrix rows
# @return [GSL::Matrix,NArray,NMatrix,Matrix] a matrix
def initialize_matrix(array)
  case @library
  when :gsl then GSL::Matrix[*array]
  when :narray then NArray[*array]
  when :nmatrix # @see https://github.com/SciRuby/nmatrix/issues/91
    # NMatrix is given an explicit shape plus the flattened values.
    column_total = array.empty? ? 0 : array[0].size
    NMatrix.new(:dense, [array.size, column_total], array.flatten)
  else Matrix[*array]
  end
end
133
+
134
# Multiplies the transpose of a matrix by the matrix itself.
#
# @param [GSL::Matrix,NArray,NMatrix,Matrix] matrix a matrix
# @return [GSL::Matrix,NArray,NMatrix,Matrix] the product
def multiply_self(matrix)
  transposed = matrix.transpose
  # The :nmatrix backend multiplies with #dot; the others use the * operator.
  @library == :nmatrix ? transposed.dot(matrix) : transposed * matrix
end
144
+
145
# Computes a logarithm with the math module that matches the backend.
#
# @param number the value to take the logarithm of
# @return the logarithm as returned by GSL::Sf (:gsl), NMath (:narray) or
#   Math (any other backend)
def log(number)
  return GSL::Sf::log(number) if @library == :gsl
  return NMath.log(number) if @library == :narray
  Math.log(number)
end
155
+
156
# Computes a square root with the math module that matches the backend.
#
# @param number the value to take the square root of
# @return the square root as returned by NMath (:narray) or Math (any other
#   backend)
def sqrt(number)
  return NMath.sqrt(number) if @library == :narray
  Math.sqrt(number)
end
164
+ end
@@ -0,0 +1,78 @@
1
+ # A simple document-term matrix.
2
+ #
3
+ # @see http://lucene.apache.org/core/4_0_0-BETA/core/org/apache/lucene/search/similarities/TFIDFSimilarity.html
4
+ # @see http://lucene.apache.org/core/4_0_0-BETA/core/org/apache/lucene/search/similarities/BM25Similarity.html
5
+ # @see http://en.wikipedia.org/wiki/Okapi_BM25
6
+ class TfIdfSimilarity::TermCountModel
7
+ include TfIdfSimilarity::MatrixMethods
8
+
9
+ # The documents in the corpus.
10
+ attr_reader :documents
11
+ # The set of terms in the corpus.
12
+ attr_reader :terms
13
+ # The average number of tokens in a document.
14
+ attr_reader :average_document_size
15
+
16
# Builds the term-count matrix, with one row per term and one column per
# document, and records the average document size.
#
# @param [Array<TfIdfSimilarity::Document>] documents documents
# @param [Hash] opts optional arguments
# @option opts [Symbol] :library :gsl, :narray, :nmatrix or :matrix (default)
def initialize(documents, opts = {})
  @documents = documents
  @terms = Set.new(documents.map(&:terms).flatten).to_a
  @library = (opts[:library] || :matrix).to_sym

  counts = @terms.map do |term|
    documents.map { |document| document.term_count(term) }
  end
  @matrix = initialize_matrix(counts)

  # An empty corpus has no columns to average over.
  @average_document_size =
    if documents.empty?
      0
    else
      sum / column_size.to_f
    end
end
34
+
35
# Counts the documents in which a term occurs, i.e. the non-zero entries
# in the term's row. Unknown terms count as zero.
#
# @param [String] term a term
# @return [Integer] the number of documents the term appears in
def document_count(term)
  index = terms.index(term)
  return 0 unless index

  counts = row(index)
  case @library
  when :gsl, :narray
    counts.where.size
  when :nmatrix
    counts.each.count(&:nonzero?)
  else
    # Fall back to an Array when the row object cannot count by itself.
    counts = counts.to_a unless counts.respond_to?(:count)
    counts.count(&:nonzero?)
  end
end
56
+
57
# Totals the occurrences of a term across the corpus, i.e. the sum of the
# term's row. Unknown terms count as zero.
#
# @param [String] term a term
# @return [Integer] the number of times the term appears in the corpus
def term_count(term)
  index = terms.index(term)
  return 0 unless index

  counts = row(index)
  case @library
  when :gsl, :narray
    counts.sum
  when :nmatrix
    counts.each.reduce(0, :+)
  else
    # Fall back to an Array when the row object cannot reduce by itself.
    counts = counts.to_a unless counts.respond_to?(:reduce)
    counts.reduce(0, :+)
  end
end
78
+ end
@@ -0,0 +1,81 @@
1
+ # A document-term matrix using either the tf*idf or BM25 functions.
2
+ #
3
+ # @see http://lucene.apache.org/core/4_0_0-BETA/core/org/apache/lucene/search/similarities/TFIDFSimilarity.html
4
+ # @see http://lucene.apache.org/core/4_0_0-BETA/core/org/apache/lucene/search/similarities/BM25Similarity.html
5
+ # @see http://en.wikipedia.org/wiki/Okapi_BM25
6
+ class TfIdfSimilarity::TfIdfModel
7
+ include TfIdfSimilarity::MatrixMethods
8
+
9
+ extend Forwardable
10
+ def_delegators :@model, :documents, :terms, :document_count
11
+
12
# Builds the weighted term-document matrix, with one row per term and one
# column per document. Each cell is the term frequency multiplied by the
# term's inverse document frequency.
#
# @param [Array<TfIdfSimilarity::Document>] documents documents
# @param [Hash] opts optional arguments
# @option opts [Symbol] :library :gsl, :narray, :nmatrix or :matrix (default)
# @option opts [Symbol] :function :tfidf (default) or :bm25
def initialize(documents, opts = {})
  @model = TfIdfSimilarity::TermCountModel.new(documents, opts)
  @library = (opts[:library] || :matrix).to_sym
  @function = (opts[:function] || :tfidf).to_sym

  weights = terms.map do |term|
    # The inverse document frequency is computed once per term (row).
    idf = inverse_document_frequency(term)
    documents.map { |document| term_frequency(document, term) * idf }
  end

  @matrix = initialize_matrix(weights)
end
30
+
31
# Returns the term's inverse document frequency, using either the BM25
# formula (:bm25) or the tf*idf formula (any other function).
#
# @param [String] term a term
# @return [Float] the term's inverse document frequency
def inverse_document_frequency(term)
  document_frequency = @model.document_count(term)
  corpus_size = documents.size
  if @function == :bm25
    return log((corpus_size - document_frequency + 0.5) / (document_frequency + 0.5))
  end
  1 + log(corpus_size / (document_frequency + 1.0))
end
43
+ alias_method :idf, :inverse_document_frequency
44
+
45
# Returns the term's frequency in the document.
#
# For :bm25 this is the saturating BM25 term-frequency component
#   tf * (k1 + 1) / (tf + k1 * (1 - b + b * document size / average document size))
# with k1 = 1.2 and b = 0.75, which gives the constants 2.2, 0.3 and 0.9
# below. For any other function it is the square root of the raw count.
#
# @param [Document] document a document
# @param [String] term a term
# @return [Float] the term's frequency in the document
#
# @note Like Lucene, we use a b value of 0.75 and a k1 value of 1.2.
def term_frequency(document, term)
  tf = document.term_count(term)
  if @function == :bm25
    # Length normalization compares the size of this document with the
    # average document size; it must not use the number of documents in
    # the corpus.
    (tf * 2.2) / (tf + 0.3 + 0.9 * document.size / @model.average_document_size)
  else
    sqrt(tf)
  end
end
60
+ alias_method :tf, :term_frequency
61
+
62
# Returns the term frequency–inverse document frequency: the product of
# the term's inverse document frequency and its frequency in the document.
#
# @param [Document] document a document
# @param [String] term a term
# @return [Float] the term frequency–inverse document frequency
def term_frequency_inverse_document_frequency(document, term)
  idf = inverse_document_frequency(term)
  idf * term_frequency(document, term)
end
70
+ alias_method :tfidf, :term_frequency_inverse_document_frequency
71
+
72
# Returns a similarity matrix for the documents in the corpus, computed by
# multiplying the normalized matrix's transpose by the normalized matrix.
#
# @return [GSL::Matrix,NMatrix,Matrix] a similarity matrix
# @note Columns are normalized to unit vectors, so we can calculate the cosine
#   similarity of all document vectors. BM25 doesn't normalize columns, but
#   BM25 wasn't written with this use case in mind.
def similarity_matrix
  unit_vectors = normalize
  multiply_self(unit_vectors)
end
81
+ end
@@ -1,5 +1,7 @@
1
1
  # coding: utf-8
2
2
 
3
+ # A token.
4
+ #
3
5
  # @note We can add more filters from Solr and stem using Porter's Snowball.
4
6
  #
5
7
  # @see https://github.com/aurelian/ruby-stemmer
@@ -14,29 +16,49 @@ class TfIdfSimilarity::Token < String
14
16
  #
15
17
  # @return [Boolean] whether the string is a token
16
18
  def valid?
17
- !self[%r{
18
- \A
19
- (
20
- \d | # number
21
- \p{Cntrl} | # control character
22
- \p{Punct} | # punctuation
23
- [[:space:]] # whitespace
24
- )+
25
- \z
26
- }x]
19
+ if RUBY_VERSION < '1.9'
20
+ !self[%r{
21
+ \A
22
+ (
23
+ \d | # number
24
+ [[:cntrl:]] | # control character
25
+ [[:punct:]] | # punctuation
26
+ [[:space:]] # whitespace
27
+ )+
28
+ \z
29
+ }x]
30
+ else
31
+ !self[%r{
32
+ \A
33
+ (
34
+ \d | # number
35
+ \p{Cntrl} | # control character
36
+ \p{Punct} | # punctuation
37
+ \p{Space} # whitespace
38
+ )+
39
+ \z
40
+ }x] # The Ruby 1.8 parser will complain about this regular expression.
41
+ end
27
42
  end
28
43
 
44
+ # Returns a lowercase string.
45
+ #
29
46
  # @return [Token] a lowercase string
30
47
  #
31
48
  # @see http://wiki.apache.org/solr/AnalyzersTokenizersTokenFilters#solr.LowerCaseFilterFactory
32
49
  def lowercase_filter
33
- self.class.new UnicodeUtils.downcase(self, :fr)
50
+ self.class.new(defined?(UnicodeUtils) ? UnicodeUtils.downcase(self) : tr(
51
+ "ÀÁÂÃÄÅĀĂĄÇĆĈĊČÐĎĐÈÉÊËĒĔĖĘĚĜĞĠĢĤĦÌÍÎÏĨĪĬĮĴĶĹĻĽĿŁÑŃŅŇŊÒÓÔÕÖØŌŎŐŔŖŘŚŜŞŠŢŤŦÙÚÛÜŨŪŬŮŰŲŴÝŶŸŹŻŽ",
52
+ "àáâãäåāăąçćĉċčðďđèéêëēĕėęěĝğġģĥħìíîïĩīĭįĵķĺļľŀłñńņňŋòóôõöøōŏőŕŗřśŝşšţťŧùúûüũūŭůűųŵýŷÿźżž"
53
+ ).downcase)
34
54
  end
35
55
 
56
+ # Returns a string with no English possessive or periods in acronyms.
57
+ #
36
58
  # @return [Token] a string with no English possessive or periods in acronyms
37
59
  #
38
60
  # @see http://wiki.apache.org/solr/AnalyzersTokenizersTokenFilters#solr.ClassicFilterFactory
39
61
  def classic_filter
40
- self.class.new self.gsub('.', '').chomp("'s")
62
+ self.class.new(self.gsub('.', '').chomp("'s"))
41
63
  end
42
64
  end
@@ -1,3 +1,3 @@
1
1
  module TfIdfSimilarity
2
- VERSION = "0.0.9"
2
+ VERSION = "0.1.0"
3
3
  end
@@ -0,0 +1,136 @@
1
+ require 'spec_helper'
2
+
3
# Examples for TfIdfSimilarity::Document covering #id, #text, #size,
# #term_counts, #terms and #term_count for documents built from plain text,
# explicit tokens, explicit term counts and an explicit size.
#
# @see https://github.com/bbcrd/Similarity/blob/master/test/test_document.rb
describe TfIdfSimilarity::Document do
  # Fixture text mixing upper/lower case, a hyphenated word, whitespace
  # control characters, digits and punctuation.
  let(:text) { "FOO-foo BAR bar \r\n\t 123 !@#" }
  let(:tokens) { ['FOO-foo', 'BAR', 'bar', "\r\n\t", '123', '!@#'] }

  let(:document_without_text) { TfIdfSimilarity::Document.new('') }
  let(:document) { TfIdfSimilarity::Document.new(text) }
  let(:document_with_id) { TfIdfSimilarity::Document.new(text, :id => 'baz') }
  let(:document_with_tokens) { TfIdfSimilarity::Document.new(text, :tokens => tokens) }
  let(:document_with_term_counts) do
    TfIdfSimilarity::Document.new(text, :term_counts => {'bar' => 5, 'baz' => 10})
  end
  let(:document_with_term_counts_and_size) do
    TfIdfSimilarity::Document.new(text, :term_counts => {'bar' => 5, 'baz' => 10}, :size => 20)
  end
  let(:document_with_size) { TfIdfSimilarity::Document.new(text, :size => 10) }

  describe '#id' do
    it('should return the ID if no ID given') { document.id.should == document.object_id }
    it('should return the given ID') { document_with_id.id.should == 'baz' }
  end

  describe '#text' do
    it('should return the text') { document.text.should == text }
  end

  describe '#size' do
    it('should return the number of tokens if no tokens given') { document.size.should == 4 }
    it('should return the number of tokens if tokens given') { document_with_tokens.size.should == 3 }
    it('should return the number of tokens if no text given') { document_without_text.size.should == 0 }
    it('should return the number of tokens if term counts given') { document_with_term_counts.size.should == 15 }

    it 'should return the given number of tokens if term counts and size given' do
      document_with_term_counts_and_size.size.should == 20
    end

    it 'should not return the given number of tokens if term counts not given' do
      document_with_size.size.should_not == 10
    end
  end

  describe '#term_counts' do
    it('should return the term counts if no tokens given') { document.term_counts.should == {'foo' => 2, 'bar' => 2} }

    it 'should return the term counts if tokens given' do
      document_with_tokens.term_counts.should == {'foo-foo' => 1, 'bar' => 2}
    end

    it('should return no term counts if no text given') { document_without_text.term_counts.should == {} }

    it 'should return the term counts if term counts given' do
      document_with_term_counts.term_counts.should == {'bar' => 5, 'baz' => 10}
    end
  end

  describe '#terms' do
    it('should return the terms if no tokens given') { document.terms.sort.should == ['bar', 'foo'] }
    it('should return the terms if tokens given') { document_with_tokens.terms.sort.should == ['bar', 'foo-foo'] }
    it('should return no terms if no text given') { document_without_text.terms.should == [] }
    it('should return the terms if term counts given') { document_with_term_counts.terms.sort.should == ['bar', 'baz'] }
  end

  describe '#term_count' do
    it('should return the term count if no tokens given') { document.term_count('foo').should == 2 }
    it('should return the term count if tokens given') { document_with_tokens.term_count('foo-foo').should == 1 }
    it('should return no term count if no text given') { document_without_text.term_count('foo').should == 0 }
    it('should return the term count if term counts given') { document_with_term_counts.term_count('bar').should == 5 }
  end
end