tf-idf-similarity 0.1.3 → 0.1.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,164 +1,166 @@
1
- module TfIdfSimilarity::MatrixMethods
2
- private
1
+ module TfIdfSimilarity
2
+ module MatrixMethods
3
+ private
3
4
 
4
- # @return [GSL::Matrix,NArray,NMatrix,Matrix] all document vectors as unit vectors
5
- #
6
- # @note Lucene normalizes document length differently.
7
- def normalize
8
- case @library
9
- when :gsl
10
- @matrix.clone.each_col do |column|
11
- unless column.isnull?
12
- column.normalize!
5
+ # @return [GSL::Matrix,NArray,NMatrix,Matrix] all document vectors as unit vectors
6
+ #
7
+ # @note Lucene normalizes document length differently.
8
+ def normalize
9
+ case @library
10
+ when :gsl
11
+ @matrix.clone.each_col do |column|
12
+ unless column.isnull?
13
+ column.normalize!
14
+ end
13
15
  end
14
- end
15
- when :narray # @see https://github.com/masa16/narray/issues/21
16
- norm = NMath.sqrt((@matrix ** 2).sum(1).reshape(@matrix.shape[0], 1))
17
- norm[norm.where2[1]] = 1.0 # avoid division by zero
18
- NMatrix.refer(@matrix / norm) # must be NMatrix for matrix multiplication
19
- when :nmatrix # @see https://github.com/SciRuby/nmatrix/issues/38
20
- normal = NMatrix.new(:dense, @matrix.shape, 0, :float64)
21
- (0...@matrix.shape[1]).each do |j|
22
- column = @matrix.column(j)
23
- norm = Math.sqrt(column.transpose.dot(column)[0, 0])
24
- (0...@matrix.shape[0]).each do |i|
25
- normal[i, j] = norm.zero? ? 0 : @matrix[i, j] / norm
16
+ when :narray # @see https://github.com/masa16/narray/issues/21
17
+ norm = NMath.sqrt((@matrix ** 2).sum(1).reshape(@matrix.shape[0], 1))
18
+ norm[norm.where2[1]] = 1.0 # avoid division by zero
19
+ NMatrix.refer(@matrix / norm) # must be NMatrix for matrix multiplication
20
+ when :nmatrix # @see https://github.com/SciRuby/nmatrix/issues/38
21
+ normal = NMatrix.new(:dense, @matrix.shape, 0, :float64)
22
+ (0...@matrix.shape[1]).each do |j|
23
+ column = @matrix.column(j)
24
+ norm = Math.sqrt(column.transpose.dot(column)[0, 0])
25
+ (0...@matrix.shape[0]).each do |i|
26
+ normal[i, j] = norm.zero? ? 0 : @matrix[i, j] / norm
27
+ end
26
28
  end
29
+ normal
30
+ else
31
+ Matrix.columns(@matrix.column_vectors.map do |column|
32
+ if column.to_a.all?(&:zero?)
33
+ column
34
+ elsif column.respond_to?(:normalize)
35
+ column.normalize
36
+ else
37
+ column * (1 / Math.sqrt(column.inner_product(column))) # multiply by the reciprocal: Ruby 1.8's Vector does not define division
38
+ end
39
+ end)
27
40
  end
28
- normal
29
- else
30
- Matrix.columns(@matrix.column_vectors.map do |column|
31
- if column.to_a.all?(&:zero?)
32
- column
33
- elsif column.respond_to?(:normalize)
34
- column.normalize
35
- else
36
- column * (1 / Math.sqrt(column.inner_product(column))) # 1.8 does define division
37
- end
38
- end)
39
41
  end
40
- end
41
42
 
42
- # @param [Integer] row index
43
- # @param [Integer] column index
44
- def get(i, j)
45
- case @library
46
- when :narray
47
- @matrix[j, i]
48
- else
49
- @matrix[i, j]
43
+ # @param [Integer] i the row index
44
+ # @param [Integer] j the column index
45
+ def get(i, j)
46
+ case @library
47
+ when :narray
48
+ @matrix[j, i]
49
+ else
50
+ @matrix[i, j]
51
+ end
50
52
  end
51
- end
52
53
 
53
- # @param [Integer] index the row index
54
- # @return [GSL::Vector::View,NArray,NMatrix,Vector] a row
55
- def row(index)
56
- case @library
57
- when :narray
58
- @matrix[true, index]
59
- else
60
- @matrix.row(index)
54
+ # @param [Integer] index the row index
55
+ # @return [GSL::Vector::View,NArray,NMatrix,Vector] a row
56
+ def row(index)
57
+ case @library
58
+ when :narray
59
+ @matrix[true, index]
60
+ else
61
+ @matrix.row(index)
62
+ end
61
63
  end
62
- end
63
64
 
64
- # @param [Integer] index the column index
65
- # @return [GSL::Vector::View,NArray,NMatrix,Vector] a column
66
- def column(index)
67
- case @library
68
- when :narray
69
- @matrix[index, true]
70
- else
71
- @matrix.column(index)
65
+ # @param [Integer] index the column index
66
+ # @return [GSL::Vector::View,NArray,NMatrix,Vector] a column
67
+ def column(index)
68
+ case @library
69
+ when :narray
70
+ @matrix[index, true]
71
+ else
72
+ @matrix.column(index)
73
+ end
72
74
  end
73
- end
74
75
 
75
- # @return [Float] the number of rows in the matrix
76
- def row_size
77
- case @library
78
- when :gsl, :nmatrix
79
- @matrix.shape[0]
80
- when :narray
81
- @matrix.shape[1]
82
- else
83
- @matrix.row_size
76
+ # @return [Integer] the number of rows in the matrix
77
+ def row_size
78
+ case @library
79
+ when :gsl, :nmatrix
80
+ @matrix.shape[0]
81
+ when :narray
82
+ @matrix.shape[1]
83
+ else
84
+ @matrix.row_size
85
+ end
84
86
  end
85
- end
86
87
 
87
- # @return [Float] the number of columns in the matrix
88
- def column_size
89
- case @library
90
- when :gsl, :nmatrix
91
- @matrix.shape[1]
92
- when :narray
93
- @matrix.shape[0]
94
- else
95
- @matrix.column_size
88
+ # @return [Integer] the number of columns in the matrix
89
+ def column_size
90
+ case @library
91
+ when :gsl, :nmatrix
92
+ @matrix.shape[1]
93
+ when :narray
94
+ @matrix.shape[0]
95
+ else
96
+ @matrix.column_size
97
+ end
96
98
  end
97
- end
98
99
 
99
- # @return [Array<Float>] the matrix's values
100
- def values
101
- case @library
102
- when :nmatrix
103
- @matrix.each.to_a # faster than NMatrix's `to_a` and `to_flat_a`
104
- else
105
- @matrix.to_a.flatten
100
+ # @return [Array<Float>] the matrix's values
101
+ def values
102
+ case @library
103
+ when :nmatrix
104
+ @matrix.each.to_a # faster than NMatrix's `to_a` and `to_flat_a`
105
+ else
106
+ @matrix.to_a.flatten
107
+ end
106
108
  end
107
- end
108
109
 
109
- # @return [Float] the sum of all values in the matrix
110
- def sum
111
- case @library
112
- when :narray
113
- @matrix.sum
114
- else
115
- values.reduce(0, :+)
110
+ # @return [Float] the sum of all values in the matrix
111
+ def sum
112
+ case @library
113
+ when :narray
114
+ @matrix.sum
115
+ else
116
+ values.reduce(0, :+)
117
+ end
116
118
  end
117
- end
118
119
 
119
- # @param [Array<Array>] array matrix rows
120
- # @return [GSL::Matrix,NArray,NMatrix,Matrix] a matrix
121
- def initialize_matrix(array)
122
- case @library
123
- when :gsl
124
- GSL::Matrix[*array]
125
- when :narray
126
- NArray[*array]
127
- when :nmatrix # @see https://github.com/SciRuby/nmatrix/issues/91#issuecomment-18870619
128
- NMatrix.new(:dense, [array.size, array.empty? ? 0 : array[0].size], array.flatten, :float64)
129
- else
130
- Matrix[*array]
120
+ # @param [Array<Array>] array matrix rows
121
+ # @return [GSL::Matrix,NArray,NMatrix,Matrix] a matrix
122
+ def initialize_matrix(array)
123
+ case @library
124
+ when :gsl
125
+ GSL::Matrix[*array]
126
+ when :narray
127
+ NArray[*array]
128
+ when :nmatrix # @see https://github.com/SciRuby/nmatrix/issues/91#issuecomment-18870619
129
+ NMatrix.new(:dense, [array.size, array.empty? ? 0 : array[0].size], array.flatten, :float64)
130
+ else
131
+ Matrix[*array]
132
+ end
131
133
  end
132
- end
133
134
 
134
- # @param [GSL::Matrix,NArray,NMatrix,Matrix] matrix a matrix
135
- # @return [GSL::Matrix,NArray,NMatrix,Matrix] the product
136
- def multiply_self(matrix)
137
- case @library
138
- when :nmatrix
139
- matrix.transpose.dot(matrix)
140
- else
141
- matrix.transpose * matrix
135
+ # @param [GSL::Matrix,NArray,NMatrix,Matrix] matrix a matrix
136
+ # @return [GSL::Matrix,NArray,NMatrix,Matrix] the product
137
+ def multiply_self(matrix)
138
+ case @library
139
+ when :nmatrix
140
+ matrix.transpose.dot(matrix)
141
+ else
142
+ matrix.transpose * matrix
143
+ end
142
144
  end
143
- end
144
145
 
145
- def log(number)
146
- case @library
147
- when :gsl
148
- GSL::Sf::log(number)
149
- when :narray
150
- NMath.log(number)
151
- else
152
- Math.log(number)
146
+ def log(number)
147
+ case @library
148
+ when :gsl
149
+ GSL::Sf::log(number)
150
+ when :narray
151
+ NMath.log(number)
152
+ else
153
+ Math.log(number)
154
+ end
153
155
  end
154
- end
155
156
 
156
- def sqrt(number)
157
- case @library
158
- when :narray
159
- NMath.sqrt(number)
160
- else
161
- Math.sqrt(number)
157
+ def sqrt(number)
158
+ case @library
159
+ when :narray
160
+ NMath.sqrt(number)
161
+ else
162
+ Math.sqrt(number)
163
+ end
162
164
  end
163
165
  end
164
166
  end
@@ -0,0 +1,66 @@
1
+ module TfIdfSimilarity
2
+ class Model
3
+ include MatrixMethods
4
+
5
+ extend Forwardable
6
+ def_delegators :@model, :documents, :terms, :document_count
7
+
8
+ # @param [Array<Document>] documents documents
9
+ # @param [Hash] opts optional arguments
10
+ # @option opts [Symbol] :library :gsl, :narray, :nmatrix or :matrix (default)
11
+ def initialize(documents, opts = {})
12
+ @model = TermCountModel.new(documents, opts)
13
+ @library = (opts[:library] || :matrix).to_sym
14
+
15
+ array = Array.new(terms.size) do |i|
16
+ idf = inverse_document_frequency(terms[i])
17
+ Array.new(documents.size) do |j|
18
+ term_frequency(documents[j], terms[i]) * idf
19
+ end
20
+ end
21
+
22
+ @matrix = initialize_matrix(array)
23
+ end
24
+
25
+ # Return the term frequency–inverse document frequency.
26
+ #
27
+ # @param [Document] document a document
28
+ # @param [String] term a term
29
+ # @return [Float] the term frequency–inverse document frequency
30
+ def term_frequency_inverse_document_frequency(document, term)
31
+ inverse_document_frequency(term) * term_frequency(document, term)
32
+ end
33
+ alias_method :tfidf, :term_frequency_inverse_document_frequency
34
+
35
+ # Returns a similarity matrix for the documents in the corpus.
36
+ #
37
+ # @return [GSL::Matrix,NMatrix,Matrix,Array] a similarity matrix, or an empty array if there are no documents
38
+ # @note Columns are normalized to unit vectors, so we can calculate the cosine
39
+ # similarity of all document vectors.
40
+ def similarity_matrix
41
+ if documents.empty?
42
+ []
43
+ else
44
+ multiply_self(normalize)
45
+ end
46
+ end
47
+
48
+ # Return the index of the document in the corpus.
49
+ #
50
+ # @param [Document] document a document
51
+ # @return [Integer,nil] the index of the document
52
+ def document_index(document)
53
+ @model.documents.index(document)
54
+ end
55
+
56
+ # Return the index of the document with matching text.
57
+ #
58
+ # @param [String] text a text
59
+ # @return [Integer,nil] the index of the document
60
+ def text_index(text)
61
+ @model.documents.index do |document|
62
+ document.text == text
63
+ end
64
+ end
65
+ end
66
+ end
@@ -1,74 +1,76 @@
1
1
  # A simple document-term matrix.
2
- class TfIdfSimilarity::TermCountModel
3
- include TfIdfSimilarity::MatrixMethods
2
+ module TfIdfSimilarity
3
+ class TermCountModel
4
+ include MatrixMethods
4
5
 
5
- # The documents in the corpus.
6
- attr_reader :documents
7
- # The set of terms in the corpus.
8
- attr_reader :terms
9
- # The average number of tokens in a document.
10
- attr_reader :average_document_size
6
+ # The documents in the corpus.
7
+ attr_reader :documents
8
+ # The set of terms in the corpus.
9
+ attr_reader :terms
10
+ # The average number of tokens in a document.
11
+ attr_reader :average_document_size
11
12
 
12
- # @param [Array<TfIdfSimilarity::Document>] documents documents
13
- # @param [Hash] opts optional arguments
14
- # @option opts [Symbol] :library :gsl, :narray, :nmatrix or :matrix (default)
15
- def initialize(documents, opts = {})
16
- @documents = documents
17
- @terms = Set.new(documents.map(&:terms).flatten).to_a
18
- @library = (opts[:library] || :matrix).to_sym
13
+ # @param [Array<Document>] documents documents
14
+ # @param [Hash] opts optional arguments
15
+ # @option opts [Symbol] :library :gsl, :narray, :nmatrix or :matrix (default)
16
+ def initialize(documents, opts = {})
17
+ @documents = documents
18
+ @terms = Set.new(documents.map(&:terms).flatten).to_a
19
+ @library = (opts[:library] || :matrix).to_sym
19
20
 
20
- array = Array.new(terms.size) do |i|
21
- Array.new(documents.size) do |j|
22
- documents[j].term_count(terms[i])
21
+ array = Array.new(terms.size) do |i|
22
+ Array.new(documents.size) do |j|
23
+ documents[j].term_count(terms[i])
24
+ end
23
25
  end
24
- end
25
26
 
26
- @matrix = initialize_matrix(array)
27
+ @matrix = initialize_matrix(array)
27
28
 
28
- @average_document_size = documents.empty? ? 0 : sum / column_size.to_f
29
- end
29
+ @average_document_size = documents.empty? ? 0 : sum / column_size.to_f
30
+ end
30
31
 
31
- # @param [String] term a term
32
- # @return [Integer] the number of documents the term appears in
33
- def document_count(term)
34
- index = terms.index(term)
35
- if index
36
- case @library
37
- when :gsl, :narray
38
- row(index).where.size
39
- when :nmatrix
40
- row(index).each.count(&:nonzero?)
41
- else
42
- vector = row(index)
43
- unless vector.respond_to?(:count)
44
- vector = vector.to_a
32
+ # @param [String] term a term
33
+ # @return [Integer] the number of documents the term appears in
34
+ def document_count(term)
35
+ index = terms.index(term)
36
+ if index
37
+ case @library
38
+ when :gsl, :narray
39
+ row(index).where.size
40
+ when :nmatrix
41
+ row(index).each.count(&:nonzero?)
42
+ else
43
+ vector = row(index)
44
+ unless vector.respond_to?(:count)
45
+ vector = vector.to_a
46
+ end
47
+ vector.count(&:nonzero?)
45
48
  end
46
- vector.count(&:nonzero?)
49
+ else
50
+ 0
47
51
  end
48
- else
49
- 0
50
52
  end
51
- end
52
53
 
53
- # @param [String] term a term
54
- # @return [Integer] the number of times the term appears in the corpus
55
- def term_count(term)
56
- index = terms.index(term)
57
- if index
58
- case @library
59
- when :gsl, :narray
60
- row(index).sum
61
- when :nmatrix
62
- row(index).each.reduce(0, :+) # NMatrix's `sum` method is slower
63
- else
64
- vector = row(index)
65
- unless vector.respond_to?(:reduce)
66
- vector = vector.to_a
54
+ # @param [String] term a term
55
+ # @return [Integer] the number of times the term appears in the corpus
56
+ def term_count(term)
57
+ index = terms.index(term)
58
+ if index
59
+ case @library
60
+ when :gsl, :narray
61
+ row(index).sum
62
+ when :nmatrix
63
+ row(index).each.reduce(0, :+) # NMatrix's `sum` method is slower
64
+ else
65
+ vector = row(index)
66
+ unless vector.respond_to?(:reduce)
67
+ vector = vector.to_a
68
+ end
69
+ vector.reduce(0, :+)
67
70
  end
68
- vector.reduce(0, :+)
71
+ else
72
+ 0
69
73
  end
70
- else
71
- 0
72
74
  end
73
75
  end
74
76
  end