tf-idf-similarity 0.1.3 → 0.1.4

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,164 +1,166 @@
1
- module TfIdfSimilarity::MatrixMethods
2
- private
1
+ module TfIdfSimilarity
2
+ module MatrixMethods
3
+ private
3
4
 
4
- # @return [GSL::Matrix,NArray,NMatrix,Matrix] all document vectors as unit vectors
5
- #
6
- # @note Lucene normalizes document length differently.
7
- def normalize
8
- case @library
9
- when :gsl
10
- @matrix.clone.each_col do |column|
11
- unless column.isnull?
12
- column.normalize!
5
+ # @return [GSL::Matrix,NArray,NMatrix,Matrix] all document vectors as unit vectors
6
+ #
7
+ # @note Lucene normalizes document length differently.
8
+ def normalize
9
+ case @library
10
+ when :gsl
11
+ @matrix.clone.each_col do |column|
12
+ unless column.isnull?
13
+ column.normalize!
14
+ end
13
15
  end
14
- end
15
- when :narray # @see https://github.com/masa16/narray/issues/21
16
- norm = NMath.sqrt((@matrix ** 2).sum(1).reshape(@matrix.shape[0], 1))
17
- norm[norm.where2[1]] = 1.0 # avoid division by zero
18
- NMatrix.refer(@matrix / norm) # must be NMatrix for matrix multiplication
19
- when :nmatrix # @see https://github.com/SciRuby/nmatrix/issues/38
20
- normal = NMatrix.new(:dense, @matrix.shape, 0, :float64)
21
- (0...@matrix.shape[1]).each do |j|
22
- column = @matrix.column(j)
23
- norm = Math.sqrt(column.transpose.dot(column)[0, 0])
24
- (0...@matrix.shape[0]).each do |i|
25
- normal[i, j] = norm.zero? ? 0 : @matrix[i, j] / norm
16
+ when :narray # @see https://github.com/masa16/narray/issues/21
17
+ norm = NMath.sqrt((@matrix ** 2).sum(1).reshape(@matrix.shape[0], 1))
18
+ norm[norm.where2[1]] = 1.0 # avoid division by zero
19
+ NMatrix.refer(@matrix / norm) # must be NMatrix for matrix multiplication
20
+ when :nmatrix # @see https://github.com/SciRuby/nmatrix/issues/38
21
+ normal = NMatrix.new(:dense, @matrix.shape, 0, :float64)
22
+ (0...@matrix.shape[1]).each do |j|
23
+ column = @matrix.column(j)
24
+ norm = Math.sqrt(column.transpose.dot(column)[0, 0])
25
+ (0...@matrix.shape[0]).each do |i|
26
+ normal[i, j] = norm.zero? ? 0 : @matrix[i, j] / norm
27
+ end
26
28
  end
29
+ normal
30
+ else
31
+ Matrix.columns(@matrix.column_vectors.map do |column|
32
+ if column.to_a.all?(&:zero?)
33
+ column
34
+ elsif column.respond_to?(:normalize)
35
+ column.normalize
36
+ else
37
+ column * (1 / Math.sqrt(column.inner_product(column))) # Ruby 1.8's Vector does not define division
38
+ end
39
+ end)
27
40
  end
28
- normal
29
- else
30
- Matrix.columns(@matrix.column_vectors.map do |column|
31
- if column.to_a.all?(&:zero?)
32
- column
33
- elsif column.respond_to?(:normalize)
34
- column.normalize
35
- else
36
- column * (1 / Math.sqrt(column.inner_product(column))) # 1.8 does define division
37
- end
38
- end)
39
41
  end
40
- end
41
42
 
42
- # @param [Integer] row index
43
- # @param [Integer] column index
44
- def get(i, j)
45
- case @library
46
- when :narray
47
- @matrix[j, i]
48
- else
49
- @matrix[i, j]
43
+ # @param [Integer] i the row index
44
+ # @param [Integer] j the column index
45
+ def get(i, j)
46
+ case @library
47
+ when :narray
48
+ @matrix[j, i]
49
+ else
50
+ @matrix[i, j]
51
+ end
50
52
  end
51
- end
52
53
 
53
- # @param [Integer] index the row index
54
- # @return [GSL::Vector::View,NArray,NMatrix,Vector] a row
55
- def row(index)
56
- case @library
57
- when :narray
58
- @matrix[true, index]
59
- else
60
- @matrix.row(index)
54
+ # @param [Integer] index the row index
55
+ # @return [GSL::Vector::View,NArray,NMatrix,Vector] a row
56
+ def row(index)
57
+ case @library
58
+ when :narray
59
+ @matrix[true, index]
60
+ else
61
+ @matrix.row(index)
62
+ end
61
63
  end
62
- end
63
64
 
64
- # @param [Integer] index the column index
65
- # @return [GSL::Vector::View,NArray,NMatrix,Vector] a column
66
- def column(index)
67
- case @library
68
- when :narray
69
- @matrix[index, true]
70
- else
71
- @matrix.column(index)
65
+ # @param [Integer] index the column index
66
+ # @return [GSL::Vector::View,NArray,NMatrix,Vector] a column
67
+ def column(index)
68
+ case @library
69
+ when :narray
70
+ @matrix[index, true]
71
+ else
72
+ @matrix.column(index)
73
+ end
72
74
  end
73
- end
74
75
 
75
- # @return [Float] the number of rows in the matrix
76
- def row_size
77
- case @library
78
- when :gsl, :nmatrix
79
- @matrix.shape[0]
80
- when :narray
81
- @matrix.shape[1]
82
- else
83
- @matrix.row_size
76
+ # @return [Integer] the number of rows in the matrix
77
+ def row_size
78
+ case @library
79
+ when :gsl, :nmatrix
80
+ @matrix.shape[0]
81
+ when :narray
82
+ @matrix.shape[1]
83
+ else
84
+ @matrix.row_size
85
+ end
84
86
  end
85
- end
86
87
 
87
- # @return [Float] the number of columns in the matrix
88
- def column_size
89
- case @library
90
- when :gsl, :nmatrix
91
- @matrix.shape[1]
92
- when :narray
93
- @matrix.shape[0]
94
- else
95
- @matrix.column_size
88
+ # @return [Integer] the number of columns in the matrix
89
+ def column_size
90
+ case @library
91
+ when :gsl, :nmatrix
92
+ @matrix.shape[1]
93
+ when :narray
94
+ @matrix.shape[0]
95
+ else
96
+ @matrix.column_size
97
+ end
96
98
  end
97
- end
98
99
 
99
- # @return [Array<Float>] the matrix's values
100
- def values
101
- case @library
102
- when :nmatrix
103
- @matrix.each.to_a # faster than NMatrix's `to_a` and `to_flat_a`
104
- else
105
- @matrix.to_a.flatten
100
+ # @return [Array<Float>] the matrix's values
101
+ def values
102
+ case @library
103
+ when :nmatrix
104
+ @matrix.each.to_a # faster than NMatrix's `to_a` and `to_flat_a`
105
+ else
106
+ @matrix.to_a.flatten
107
+ end
106
108
  end
107
- end
108
109
 
109
- # @return [Float] the sum of all values in the matrix
110
- def sum
111
- case @library
112
- when :narray
113
- @matrix.sum
114
- else
115
- values.reduce(0, :+)
110
+ # @return [Float] the sum of all values in the matrix
111
+ def sum
112
+ case @library
113
+ when :narray
114
+ @matrix.sum
115
+ else
116
+ values.reduce(0, :+)
117
+ end
116
118
  end
117
- end
118
119
 
119
- # @param [Array<Array>] array matrix rows
120
- # @return [GSL::Matrix,NArray,NMatrix,Matrix] a matrix
121
- def initialize_matrix(array)
122
- case @library
123
- when :gsl
124
- GSL::Matrix[*array]
125
- when :narray
126
- NArray[*array]
127
- when :nmatrix # @see https://github.com/SciRuby/nmatrix/issues/91#issuecomment-18870619
128
- NMatrix.new(:dense, [array.size, array.empty? ? 0 : array[0].size], array.flatten, :float64)
129
- else
130
- Matrix[*array]
120
+ # @param [Array<Array>] array matrix rows
121
+ # @return [GSL::Matrix,NArray,NMatrix,Matrix] a matrix
122
+ def initialize_matrix(array)
123
+ case @library
124
+ when :gsl
125
+ GSL::Matrix[*array]
126
+ when :narray
127
+ NArray[*array]
128
+ when :nmatrix # @see https://github.com/SciRuby/nmatrix/issues/91#issuecomment-18870619
129
+ NMatrix.new(:dense, [array.size, array.empty? ? 0 : array[0].size], array.flatten, :float64)
130
+ else
131
+ Matrix[*array]
132
+ end
131
133
  end
132
- end
133
134
 
134
- # @param [GSL::Matrix,NArray,NMatrix,Matrix] matrix a matrix
135
- # @return [GSL::Matrix,NArray,NMatrix,Matrix] the product
136
- def multiply_self(matrix)
137
- case @library
138
- when :nmatrix
139
- matrix.transpose.dot(matrix)
140
- else
141
- matrix.transpose * matrix
135
+ # @param [GSL::Matrix,NArray,NMatrix,Matrix] matrix a matrix
136
+ # @return [GSL::Matrix,NArray,NMatrix,Matrix] the product
137
+ def multiply_self(matrix)
138
+ case @library
139
+ when :nmatrix
140
+ matrix.transpose.dot(matrix)
141
+ else
142
+ matrix.transpose * matrix
143
+ end
142
144
  end
143
- end
144
145
 
145
- def log(number)
146
- case @library
147
- when :gsl
148
- GSL::Sf::log(number)
149
- when :narray
150
- NMath.log(number)
151
- else
152
- Math.log(number)
146
+ def log(number)
147
+ case @library
148
+ when :gsl
149
+ GSL::Sf::log(number)
150
+ when :narray
151
+ NMath.log(number)
152
+ else
153
+ Math.log(number)
154
+ end
153
155
  end
154
- end
155
156
 
156
- def sqrt(number)
157
- case @library
158
- when :narray
159
- NMath.sqrt(number)
160
- else
161
- Math.sqrt(number)
157
+ def sqrt(number)
158
+ case @library
159
+ when :narray
160
+ NMath.sqrt(number)
161
+ else
162
+ Math.sqrt(number)
163
+ end
162
164
  end
163
165
  end
164
166
  end
@@ -0,0 +1,66 @@
1
+ module TfIdfSimilarity
2
+ class Model
3
+ include MatrixMethods
4
+
5
+ extend Forwardable
6
+ def_delegators :@model, :documents, :terms, :document_count
7
+
8
+ # @param [Array<Document>] documents documents
9
+ # @param [Hash] opts optional arguments
10
+ # @option opts [Symbol] :library :gsl, :narray, :nmatrix or :matrix (default)
11
+ def initialize(documents, opts = {})
12
+ @model = TermCountModel.new(documents, opts)
13
+ @library = (opts[:library] || :matrix).to_sym
14
+
15
+ array = Array.new(terms.size) do |i|
16
+ idf = inverse_document_frequency(terms[i])
17
+ Array.new(documents.size) do |j|
18
+ term_frequency(documents[j], terms[i]) * idf
19
+ end
20
+ end
21
+
22
+ @matrix = initialize_matrix(array)
23
+ end
24
+
25
+ # Return the term frequency–inverse document frequency.
26
+ #
27
+ # @param [Document] document a document
28
+ # @param [String] term a term
29
+ # @return [Float] the term frequency–inverse document frequency
30
+ def term_frequency_inverse_document_frequency(document, term)
31
+ inverse_document_frequency(term) * term_frequency(document, term)
32
+ end
33
+ alias_method :tfidf, :term_frequency_inverse_document_frequency
34
+
35
+ # Returns a similarity matrix for the documents in the corpus.
36
+ #
37
+ # @return [GSL::Matrix,NMatrix,Matrix,Array] a similarity matrix, or an empty array if the corpus has no documents
38
+ # @note Columns are normalized to unit vectors, so we can calculate the cosine
39
+ # similarity of all document vectors.
40
+ def similarity_matrix
41
+ if documents.empty?
42
+ []
43
+ else
44
+ multiply_self(normalize)
45
+ end
46
+ end
47
+
48
+ # Return the index of the document in the corpus.
49
+ #
50
+ # @param [Document] document a document
51
+ # @return [Integer,nil] the index of the document
52
+ def document_index(document)
53
+ @model.documents.index(document)
54
+ end
55
+
56
+ # Return the index of the document with matching text.
57
+ #
58
+ # @param [String] text a text
59
+ # @return [Integer,nil] the index of the document
60
+ def text_index(text)
61
+ @model.documents.index do |document|
62
+ document.text == text
63
+ end
64
+ end
65
+ end
66
+ end
@@ -1,74 +1,76 @@
1
1
  # A simple document-term matrix.
2
- class TfIdfSimilarity::TermCountModel
3
- include TfIdfSimilarity::MatrixMethods
2
+ module TfIdfSimilarity
3
+ class TermCountModel
4
+ include MatrixMethods
4
5
 
5
- # The documents in the corpus.
6
- attr_reader :documents
7
- # The set of terms in the corpus.
8
- attr_reader :terms
9
- # The average number of tokens in a document.
10
- attr_reader :average_document_size
6
+ # The documents in the corpus.
7
+ attr_reader :documents
8
+ # The set of terms in the corpus.
9
+ attr_reader :terms
10
+ # The average number of tokens in a document.
11
+ attr_reader :average_document_size
11
12
 
12
- # @param [Array<TfIdfSimilarity::Document>] documents documents
13
- # @param [Hash] opts optional arguments
14
- # @option opts [Symbol] :library :gsl, :narray, :nmatrix or :matrix (default)
15
- def initialize(documents, opts = {})
16
- @documents = documents
17
- @terms = Set.new(documents.map(&:terms).flatten).to_a
18
- @library = (opts[:library] || :matrix).to_sym
13
+ # @param [Array<Document>] documents documents
14
+ # @param [Hash] opts optional arguments
15
+ # @option opts [Symbol] :library :gsl, :narray, :nmatrix or :matrix (default)
16
+ def initialize(documents, opts = {})
17
+ @documents = documents
18
+ @terms = Set.new(documents.map(&:terms).flatten).to_a
19
+ @library = (opts[:library] || :matrix).to_sym
19
20
 
20
- array = Array.new(terms.size) do |i|
21
- Array.new(documents.size) do |j|
22
- documents[j].term_count(terms[i])
21
+ array = Array.new(terms.size) do |i|
22
+ Array.new(documents.size) do |j|
23
+ documents[j].term_count(terms[i])
24
+ end
23
25
  end
24
- end
25
26
 
26
- @matrix = initialize_matrix(array)
27
+ @matrix = initialize_matrix(array)
27
28
 
28
- @average_document_size = documents.empty? ? 0 : sum / column_size.to_f
29
- end
29
+ @average_document_size = documents.empty? ? 0 : sum / column_size.to_f
30
+ end
30
31
 
31
- # @param [String] term a term
32
- # @return [Integer] the number of documents the term appears in
33
- def document_count(term)
34
- index = terms.index(term)
35
- if index
36
- case @library
37
- when :gsl, :narray
38
- row(index).where.size
39
- when :nmatrix
40
- row(index).each.count(&:nonzero?)
41
- else
42
- vector = row(index)
43
- unless vector.respond_to?(:count)
44
- vector = vector.to_a
32
+ # @param [String] term a term
33
+ # @return [Integer] the number of documents the term appears in
34
+ def document_count(term)
35
+ index = terms.index(term)
36
+ if index
37
+ case @library
38
+ when :gsl, :narray
39
+ row(index).where.size
40
+ when :nmatrix
41
+ row(index).each.count(&:nonzero?)
42
+ else
43
+ vector = row(index)
44
+ unless vector.respond_to?(:count)
45
+ vector = vector.to_a
46
+ end
47
+ vector.count(&:nonzero?)
45
48
  end
46
- vector.count(&:nonzero?)
49
+ else
50
+ 0
47
51
  end
48
- else
49
- 0
50
52
  end
51
- end
52
53
 
53
- # @param [String] term a term
54
- # @return [Integer] the number of times the term appears in the corpus
55
- def term_count(term)
56
- index = terms.index(term)
57
- if index
58
- case @library
59
- when :gsl, :narray
60
- row(index).sum
61
- when :nmatrix
62
- row(index).each.reduce(0, :+) # NMatrix's `sum` method is slower
63
- else
64
- vector = row(index)
65
- unless vector.respond_to?(:reduce)
66
- vector = vector.to_a
54
+ # @param [String] term a term
55
+ # @return [Integer] the number of times the term appears in the corpus
56
+ def term_count(term)
57
+ index = terms.index(term)
58
+ if index
59
+ case @library
60
+ when :gsl, :narray
61
+ row(index).sum
62
+ when :nmatrix
63
+ row(index).each.reduce(0, :+) # NMatrix's `sum` method is slower
64
+ else
65
+ vector = row(index)
66
+ unless vector.respond_to?(:reduce)
67
+ vector = vector.to_a
68
+ end
69
+ vector.reduce(0, :+)
67
70
  end
68
- vector.reduce(0, :+)
71
+ else
72
+ 0
69
73
  end
70
- else
71
- 0
72
74
  end
73
75
  end
74
76
  end