tf-idf-similarity 0.1.3 → 0.1.4
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.travis.yml +1 -8
- data/Gemfile +2 -2
- data/README.md +40 -9
- data/lib/tf-idf-similarity.rb +1 -0
- data/lib/tf-idf-similarity/bm25_model.rb +23 -62
- data/lib/tf-idf-similarity/document.rb +69 -67
- data/lib/tf-idf-similarity/extras/document.rb +10 -8
- data/lib/tf-idf-similarity/extras/tf_idf_model.rb +157 -155
- data/lib/tf-idf-similarity/matrix_methods.rb +137 -135
- data/lib/tf-idf-similarity/model.rb +66 -0
- data/lib/tf-idf-similarity/term_count_model.rb +59 -57
- data/lib/tf-idf-similarity/tf_idf_model.rb +21 -60
- data/lib/tf-idf-similarity/token.rb +39 -37
- data/lib/tf-idf-similarity/version.rb +1 -1
- data/spec/bm25_model_spec.rb +200 -0
- data/spec/document_spec.rb +98 -96
- data/spec/extras/tf_idf_model_spec.rb +224 -222
- data/spec/spec_helper.rb +6 -0
- data/spec/term_count_model_spec.rb +76 -74
- data/spec/tf_idf_model_spec.rb +143 -117
- data/spec/token_spec.rb +23 -21
- metadata +6 -2
@@ -1,164 +1,166 @@
|
|
1
|
-
module TfIdfSimilarity
|
2
|
-
|
1
|
+
module TfIdfSimilarity
|
2
|
+
module MatrixMethods
|
3
|
+
private
|
3
4
|
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
5
|
+
# @return [GSL::Matrix,NArray,NMatrix,Matrix] all document vectors as unit vectors
|
6
|
+
#
|
7
|
+
# @note Lucene normalizes document length differently.
|
8
|
+
def normalize
|
9
|
+
case @library
|
10
|
+
when :gsl
|
11
|
+
@matrix.clone.each_col do |column|
|
12
|
+
unless column.isnull?
|
13
|
+
column.normalize!
|
14
|
+
end
|
13
15
|
end
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
16
|
+
when :narray # @see https://github.com/masa16/narray/issues/21
|
17
|
+
norm = NMath.sqrt((@matrix ** 2).sum(1).reshape(@matrix.shape[0], 1))
|
18
|
+
norm[norm.where2[1]] = 1.0 # avoid division by zero
|
19
|
+
NMatrix.refer(@matrix / norm) # must be NMatrix for matrix multiplication
|
20
|
+
when :nmatrix # @see https://github.com/SciRuby/nmatrix/issues/38
|
21
|
+
normal = NMatrix.new(:dense, @matrix.shape, 0, :float64)
|
22
|
+
(0...@matrix.shape[1]).each do |j|
|
23
|
+
column = @matrix.column(j)
|
24
|
+
norm = Math.sqrt(column.transpose.dot(column)[0, 0])
|
25
|
+
(0...@matrix.shape[0]).each do |i|
|
26
|
+
normal[i, j] = norm.zero? ? 0 : @matrix[i, j] / norm
|
27
|
+
end
|
26
28
|
end
|
29
|
+
normal
|
30
|
+
else
|
31
|
+
Matrix.columns(@matrix.column_vectors.map do |column|
|
32
|
+
if column.to_a.all?(&:zero?)
|
33
|
+
column
|
34
|
+
elsif column.respond_to?(:normalize)
|
35
|
+
column.normalize
|
36
|
+
else
|
37
|
+
column * (1 / Math.sqrt(column.inner_product(column))) # 1.8 does define division
|
38
|
+
end
|
39
|
+
end)
|
27
40
|
end
|
28
|
-
normal
|
29
|
-
else
|
30
|
-
Matrix.columns(@matrix.column_vectors.map do |column|
|
31
|
-
if column.to_a.all?(&:zero?)
|
32
|
-
column
|
33
|
-
elsif column.respond_to?(:normalize)
|
34
|
-
column.normalize
|
35
|
-
else
|
36
|
-
column * (1 / Math.sqrt(column.inner_product(column))) # 1.8 does define division
|
37
|
-
end
|
38
|
-
end)
|
39
41
|
end
|
40
|
-
end
|
41
42
|
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
43
|
+
# @param [Integer] i the row index
|
44
|
+
# @param [Integer] j the column index
|
45
|
+
def get(i, j)
|
46
|
+
case @library
|
47
|
+
when :narray
|
48
|
+
@matrix[j, i]
|
49
|
+
else
|
50
|
+
@matrix[i, j]
|
51
|
+
end
|
50
52
|
end
|
51
|
-
end
|
52
53
|
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
54
|
+
# @param [Integer] index the row index
|
55
|
+
# @return [GSL::Vector::View,NArray,NMatrix,Vector] a row
|
56
|
+
def row(index)
|
57
|
+
case @library
|
58
|
+
when :narray
|
59
|
+
@matrix[true, index]
|
60
|
+
else
|
61
|
+
@matrix.row(index)
|
62
|
+
end
|
61
63
|
end
|
62
|
-
end
|
63
64
|
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
65
|
+
# @param [Integer] index the column index
|
66
|
+
# @return [GSL::Vector::View,NArray,NMatrix,Vector] a column
|
67
|
+
def column(index)
|
68
|
+
case @library
|
69
|
+
when :narray
|
70
|
+
@matrix[index, true]
|
71
|
+
else
|
72
|
+
@matrix.column(index)
|
73
|
+
end
|
72
74
|
end
|
73
|
-
end
|
74
75
|
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
76
|
+
# @return [Integer] the number of rows in the matrix
|
77
|
+
def row_size
|
78
|
+
case @library
|
79
|
+
when :gsl, :nmatrix
|
80
|
+
@matrix.shape[0]
|
81
|
+
when :narray
|
82
|
+
@matrix.shape[1]
|
83
|
+
else
|
84
|
+
@matrix.row_size
|
85
|
+
end
|
84
86
|
end
|
85
|
-
end
|
86
87
|
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
88
|
+
# @return [Integer] the number of columns in the matrix
|
89
|
+
def column_size
|
90
|
+
case @library
|
91
|
+
when :gsl, :nmatrix
|
92
|
+
@matrix.shape[1]
|
93
|
+
when :narray
|
94
|
+
@matrix.shape[0]
|
95
|
+
else
|
96
|
+
@matrix.column_size
|
97
|
+
end
|
96
98
|
end
|
97
|
-
end
|
98
99
|
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
100
|
+
# @return [Array<Float>] the matrix's values
|
101
|
+
def values
|
102
|
+
case @library
|
103
|
+
when :nmatrix
|
104
|
+
@matrix.each.to_a # faster than NMatrix's `to_a` and `to_flat_a`
|
105
|
+
else
|
106
|
+
@matrix.to_a.flatten
|
107
|
+
end
|
106
108
|
end
|
107
|
-
end
|
108
109
|
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
110
|
+
# @return [Float] the sum of all values in the matrix
|
111
|
+
def sum
|
112
|
+
case @library
|
113
|
+
when :narray
|
114
|
+
@matrix.sum
|
115
|
+
else
|
116
|
+
values.reduce(0, :+)
|
117
|
+
end
|
116
118
|
end
|
117
|
-
end
|
118
119
|
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
120
|
+
# @param [Array<Array>] array matrix rows
|
121
|
+
# @return [GSL::Matrix,NArray,NMatrix,Matrix] a matrix
|
122
|
+
def initialize_matrix(array)
|
123
|
+
case @library
|
124
|
+
when :gsl
|
125
|
+
GSL::Matrix[*array]
|
126
|
+
when :narray
|
127
|
+
NArray[*array]
|
128
|
+
when :nmatrix # @see https://github.com/SciRuby/nmatrix/issues/91#issuecomment-18870619
|
129
|
+
NMatrix.new(:dense, [array.size, array.empty? ? 0 : array[0].size], array.flatten, :float64)
|
130
|
+
else
|
131
|
+
Matrix[*array]
|
132
|
+
end
|
131
133
|
end
|
132
|
-
end
|
133
134
|
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
135
|
+
# @param [GSL::Matrix,NArray,NMatrix,Matrix] matrix a matrix
|
136
|
+
# @return [GSL::Matrix,NArray,NMatrix,Matrix] the product
|
137
|
+
def multiply_self(matrix)
|
138
|
+
case @library
|
139
|
+
when :nmatrix
|
140
|
+
matrix.transpose.dot(matrix)
|
141
|
+
else
|
142
|
+
matrix.transpose * matrix
|
143
|
+
end
|
142
144
|
end
|
143
|
-
end
|
144
145
|
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
|
146
|
+
def log(number)
|
147
|
+
case @library
|
148
|
+
when :gsl
|
149
|
+
GSL::Sf::log(number)
|
150
|
+
when :narray
|
151
|
+
NMath.log(number)
|
152
|
+
else
|
153
|
+
Math.log(number)
|
154
|
+
end
|
153
155
|
end
|
154
|
-
end
|
155
156
|
|
156
|
-
|
157
|
-
|
158
|
-
|
159
|
-
|
160
|
-
|
161
|
-
|
157
|
+
def sqrt(number)
|
158
|
+
case @library
|
159
|
+
when :narray
|
160
|
+
NMath.sqrt(number)
|
161
|
+
else
|
162
|
+
Math.sqrt(number)
|
163
|
+
end
|
162
164
|
end
|
163
165
|
end
|
164
166
|
end
|
@@ -0,0 +1,66 @@
|
|
1
|
+
module TfIdfSimilarity
|
2
|
+
class Model
|
3
|
+
include MatrixMethods
|
4
|
+
|
5
|
+
extend Forwardable
|
6
|
+
def_delegators :@model, :documents, :terms, :document_count
|
7
|
+
|
8
|
+
# @param [Array<Document>] documents documents
|
9
|
+
# @param [Hash] opts optional arguments
|
10
|
+
# @option opts [Symbol] :library :gsl, :narray, :nmatrix or :matrix (default)
|
11
|
+
def initialize(documents, opts = {})
|
12
|
+
@model = TermCountModel.new(documents, opts)
|
13
|
+
@library = (opts[:library] || :matrix).to_sym
|
14
|
+
|
15
|
+
array = Array.new(terms.size) do |i|
|
16
|
+
idf = inverse_document_frequency(terms[i])
|
17
|
+
Array.new(documents.size) do |j|
|
18
|
+
term_frequency(documents[j], terms[i]) * idf
|
19
|
+
end
|
20
|
+
end
|
21
|
+
|
22
|
+
@matrix = initialize_matrix(array)
|
23
|
+
end
|
24
|
+
|
25
|
+
# Return the term frequency–inverse document frequency.
|
26
|
+
#
|
27
|
+
# @param [Document] document a document
|
28
|
+
# @param [String] term a term
|
29
|
+
# @return [Float] the term frequency–inverse document frequency
|
30
|
+
def term_frequency_inverse_document_frequency(document, term)
|
31
|
+
inverse_document_frequency(term) * term_frequency(document, term)
|
32
|
+
end
|
33
|
+
alias_method :tfidf, :term_frequency_inverse_document_frequency
|
34
|
+
|
35
|
+
# Returns a similarity matrix for the documents in the corpus.
|
36
|
+
#
|
37
|
+
# @return [GSL::Matrix,NMatrix,Matrix,Array] a similarity matrix, or an empty array if there are no documents
|
38
|
+
# @note Columns are normalized to unit vectors, so we can calculate the cosine
|
39
|
+
# similarity of all document vectors.
|
40
|
+
def similarity_matrix
|
41
|
+
if documents.empty?
|
42
|
+
[]
|
43
|
+
else
|
44
|
+
multiply_self(normalize)
|
45
|
+
end
|
46
|
+
end
|
47
|
+
|
48
|
+
# Return the index of the document in the corpus.
|
49
|
+
#
|
50
|
+
# @param [Document] document a document
|
51
|
+
# @return [Integer,nil] the index of the document
|
52
|
+
def document_index(document)
|
53
|
+
@model.documents.index(document)
|
54
|
+
end
|
55
|
+
|
56
|
+
# Return the index of the document with matching text.
|
57
|
+
#
|
58
|
+
# @param [String] text a text
|
59
|
+
# @return [Integer,nil] the index of the document
|
60
|
+
def text_index(text)
|
61
|
+
@model.documents.index do |document|
|
62
|
+
document.text == text
|
63
|
+
end
|
64
|
+
end
|
65
|
+
end
|
66
|
+
end
|
@@ -1,74 +1,76 @@
|
|
1
1
|
# A simple term-document matrix (one row per term, one column per document).
|
2
|
-
|
3
|
-
|
2
|
+
module TfIdfSimilarity
|
3
|
+
class TermCountModel
|
4
|
+
include MatrixMethods
|
4
5
|
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
6
|
+
# The documents in the corpus.
|
7
|
+
attr_reader :documents
|
8
|
+
# The set of terms in the corpus.
|
9
|
+
attr_reader :terms
|
10
|
+
# The average number of tokens in a document.
|
11
|
+
attr_reader :average_document_size
|
11
12
|
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
13
|
+
# @param [Array<Document>] documents documents
|
14
|
+
# @param [Hash] opts optional arguments
|
15
|
+
# @option opts [Symbol] :library :gsl, :narray, :nmatrix or :matrix (default)
|
16
|
+
def initialize(documents, opts = {})
|
17
|
+
@documents = documents
|
18
|
+
@terms = Set.new(documents.map(&:terms).flatten).to_a
|
19
|
+
@library = (opts[:library] || :matrix).to_sym
|
19
20
|
|
20
|
-
|
21
|
-
|
22
|
-
|
21
|
+
array = Array.new(terms.size) do |i|
|
22
|
+
Array.new(documents.size) do |j|
|
23
|
+
documents[j].term_count(terms[i])
|
24
|
+
end
|
23
25
|
end
|
24
|
-
end
|
25
26
|
|
26
|
-
|
27
|
+
@matrix = initialize_matrix(array)
|
27
28
|
|
28
|
-
|
29
|
-
|
29
|
+
@average_document_size = documents.empty? ? 0 : sum / column_size.to_f
|
30
|
+
end
|
30
31
|
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
32
|
+
# @param [String] term a term
|
33
|
+
# @return [Integer] the number of documents the term appears in
|
34
|
+
def document_count(term)
|
35
|
+
index = terms.index(term)
|
36
|
+
if index
|
37
|
+
case @library
|
38
|
+
when :gsl, :narray
|
39
|
+
row(index).where.size
|
40
|
+
when :nmatrix
|
41
|
+
row(index).each.count(&:nonzero?)
|
42
|
+
else
|
43
|
+
vector = row(index)
|
44
|
+
unless vector.respond_to?(:count)
|
45
|
+
vector = vector.to_a
|
46
|
+
end
|
47
|
+
vector.count(&:nonzero?)
|
45
48
|
end
|
46
|
-
|
49
|
+
else
|
50
|
+
0
|
47
51
|
end
|
48
|
-
else
|
49
|
-
0
|
50
52
|
end
|
51
|
-
end
|
52
53
|
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
54
|
+
# @param [String] term a term
|
55
|
+
# @return [Integer] the number of times the term appears in the corpus
|
56
|
+
def term_count(term)
|
57
|
+
index = terms.index(term)
|
58
|
+
if index
|
59
|
+
case @library
|
60
|
+
when :gsl, :narray
|
61
|
+
row(index).sum
|
62
|
+
when :nmatrix
|
63
|
+
row(index).each.reduce(0, :+) # NMatrix's `sum` method is slower
|
64
|
+
else
|
65
|
+
vector = row(index)
|
66
|
+
unless vector.respond_to?(:reduce)
|
67
|
+
vector = vector.to_a
|
68
|
+
end
|
69
|
+
vector.reduce(0, :+)
|
67
70
|
end
|
68
|
-
|
71
|
+
else
|
72
|
+
0
|
69
73
|
end
|
70
|
-
else
|
71
|
-
0
|
72
74
|
end
|
73
75
|
end
|
74
76
|
end
|