tf-idf-similarity 0.1.3 → 0.1.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.travis.yml +1 -8
- data/Gemfile +2 -2
- data/README.md +40 -9
- data/lib/tf-idf-similarity.rb +1 -0
- data/lib/tf-idf-similarity/bm25_model.rb +23 -62
- data/lib/tf-idf-similarity/document.rb +69 -67
- data/lib/tf-idf-similarity/extras/document.rb +10 -8
- data/lib/tf-idf-similarity/extras/tf_idf_model.rb +157 -155
- data/lib/tf-idf-similarity/matrix_methods.rb +137 -135
- data/lib/tf-idf-similarity/model.rb +66 -0
- data/lib/tf-idf-similarity/term_count_model.rb +59 -57
- data/lib/tf-idf-similarity/tf_idf_model.rb +21 -60
- data/lib/tf-idf-similarity/token.rb +39 -37
- data/lib/tf-idf-similarity/version.rb +1 -1
- data/spec/bm25_model_spec.rb +200 -0
- data/spec/document_spec.rb +98 -96
- data/spec/extras/tf_idf_model_spec.rb +224 -222
- data/spec/spec_helper.rb +6 -0
- data/spec/term_count_model_spec.rb +76 -74
- data/spec/tf_idf_model_spec.rb +143 -117
- data/spec/token_spec.rb +23 -21
- metadata +6 -2
@@ -1,164 +1,166 @@
|
|
1
|
-
module TfIdfSimilarity
|
2
|
-
|
1
|
+
module TfIdfSimilarity
|
2
|
+
module MatrixMethods
|
3
|
+
private
|
3
4
|
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
5
|
+
# @return [GSL::Matrix,NArray,NMatrix,Matrix] all document vectors as unit vectors
|
6
|
+
#
|
7
|
+
# @note Lucene normalizes document length differently.
|
8
|
+
def normalize
|
9
|
+
case @library
|
10
|
+
when :gsl
|
11
|
+
@matrix.clone.each_col do |column|
|
12
|
+
unless column.isnull?
|
13
|
+
column.normalize!
|
14
|
+
end
|
13
15
|
end
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
16
|
+
when :narray # @see https://github.com/masa16/narray/issues/21
|
17
|
+
norm = NMath.sqrt((@matrix ** 2).sum(1).reshape(@matrix.shape[0], 1))
|
18
|
+
norm[norm.where2[1]] = 1.0 # avoid division by zero
|
19
|
+
NMatrix.refer(@matrix / norm) # must be NMatrix for matrix multiplication
|
20
|
+
when :nmatrix # @see https://github.com/SciRuby/nmatrix/issues/38
|
21
|
+
normal = NMatrix.new(:dense, @matrix.shape, 0, :float64)
|
22
|
+
(0...@matrix.shape[1]).each do |j|
|
23
|
+
column = @matrix.column(j)
|
24
|
+
norm = Math.sqrt(column.transpose.dot(column)[0, 0])
|
25
|
+
(0...@matrix.shape[0]).each do |i|
|
26
|
+
normal[i, j] = norm.zero? ? 0 : @matrix[i, j] / norm
|
27
|
+
end
|
26
28
|
end
|
29
|
+
normal
|
30
|
+
else
|
31
|
+
Matrix.columns(@matrix.column_vectors.map do |column|
|
32
|
+
if column.to_a.all?(&:zero?)
|
33
|
+
column
|
34
|
+
elsif column.respond_to?(:normalize)
|
35
|
+
column.normalize
|
36
|
+
else
|
37
|
+
column * (1 / Math.sqrt(column.inner_product(column))) # 1.8 does not define division
|
38
|
+
end
|
39
|
+
end)
|
27
40
|
end
|
28
|
-
normal
|
29
|
-
else
|
30
|
-
Matrix.columns(@matrix.column_vectors.map do |column|
|
31
|
-
if column.to_a.all?(&:zero?)
|
32
|
-
column
|
33
|
-
elsif column.respond_to?(:normalize)
|
34
|
-
column.normalize
|
35
|
-
else
|
36
|
-
column * (1 / Math.sqrt(column.inner_product(column))) # 1.8 does define division
|
37
|
-
end
|
38
|
-
end)
|
39
41
|
end
|
40
|
-
end
|
41
42
|
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
43
|
+
# @param [Integer] i the row index
|
44
|
+
# @param [Integer] j the column index
|
45
|
+
def get(i, j)
|
46
|
+
case @library
|
47
|
+
when :narray
|
48
|
+
@matrix[j, i]
|
49
|
+
else
|
50
|
+
@matrix[i, j]
|
51
|
+
end
|
50
52
|
end
|
51
|
-
end
|
52
53
|
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
54
|
+
# @param [Integer] index the row index
|
55
|
+
# @return [GSL::Vector::View,NArray,NMatrix,Vector] a row
|
56
|
+
def row(index)
|
57
|
+
case @library
|
58
|
+
when :narray
|
59
|
+
@matrix[true, index]
|
60
|
+
else
|
61
|
+
@matrix.row(index)
|
62
|
+
end
|
61
63
|
end
|
62
|
-
end
|
63
64
|
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
65
|
+
# @param [Integer] index the column index
|
66
|
+
# @return [GSL::Vector::View,NArray,NMatrix,Vector] a column
|
67
|
+
def column(index)
|
68
|
+
case @library
|
69
|
+
when :narray
|
70
|
+
@matrix[index, true]
|
71
|
+
else
|
72
|
+
@matrix.column(index)
|
73
|
+
end
|
72
74
|
end
|
73
|
-
end
|
74
75
|
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
76
|
+
# @return [Integer] the number of rows in the matrix
|
77
|
+
def row_size
|
78
|
+
case @library
|
79
|
+
when :gsl, :nmatrix
|
80
|
+
@matrix.shape[0]
|
81
|
+
when :narray
|
82
|
+
@matrix.shape[1]
|
83
|
+
else
|
84
|
+
@matrix.row_size
|
85
|
+
end
|
84
86
|
end
|
85
|
-
end
|
86
87
|
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
88
|
+
# @return [Integer] the number of columns in the matrix
|
89
|
+
def column_size
|
90
|
+
case @library
|
91
|
+
when :gsl, :nmatrix
|
92
|
+
@matrix.shape[1]
|
93
|
+
when :narray
|
94
|
+
@matrix.shape[0]
|
95
|
+
else
|
96
|
+
@matrix.column_size
|
97
|
+
end
|
96
98
|
end
|
97
|
-
end
|
98
99
|
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
100
|
+
# @return [Array<Float>] the matrix's values
|
101
|
+
def values
|
102
|
+
case @library
|
103
|
+
when :nmatrix
|
104
|
+
@matrix.each.to_a # faster than NMatrix's `to_a` and `to_flat_a`
|
105
|
+
else
|
106
|
+
@matrix.to_a.flatten
|
107
|
+
end
|
106
108
|
end
|
107
|
-
end
|
108
109
|
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
110
|
+
# @return [Float] the sum of all values in the matrix
|
111
|
+
def sum
|
112
|
+
case @library
|
113
|
+
when :narray
|
114
|
+
@matrix.sum
|
115
|
+
else
|
116
|
+
values.reduce(0, :+)
|
117
|
+
end
|
116
118
|
end
|
117
|
-
end
|
118
119
|
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
120
|
+
# @param [Array<Array>] array matrix rows
|
121
|
+
# @return [GSL::Matrix,NArray,NMatrix,Matrix] a matrix
|
122
|
+
def initialize_matrix(array)
|
123
|
+
case @library
|
124
|
+
when :gsl
|
125
|
+
GSL::Matrix[*array]
|
126
|
+
when :narray
|
127
|
+
NArray[*array]
|
128
|
+
when :nmatrix # @see https://github.com/SciRuby/nmatrix/issues/91#issuecomment-18870619
|
129
|
+
NMatrix.new(:dense, [array.size, array.empty? ? 0 : array[0].size], array.flatten, :float64)
|
130
|
+
else
|
131
|
+
Matrix[*array]
|
132
|
+
end
|
131
133
|
end
|
132
|
-
end
|
133
134
|
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
135
|
+
# @param [GSL::Matrix,NArray,NMatrix,Matrix] matrix a matrix
|
136
|
+
# @return [GSL::Matrix,NArray,NMatrix,Matrix] the product
|
137
|
+
def multiply_self(matrix)
|
138
|
+
case @library
|
139
|
+
when :nmatrix
|
140
|
+
matrix.transpose.dot(matrix)
|
141
|
+
else
|
142
|
+
matrix.transpose * matrix
|
143
|
+
end
|
142
144
|
end
|
143
|
-
end
|
144
145
|
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
|
146
|
+
def log(number)
|
147
|
+
case @library
|
148
|
+
when :gsl
|
149
|
+
GSL::Sf::log(number)
|
150
|
+
when :narray
|
151
|
+
NMath.log(number)
|
152
|
+
else
|
153
|
+
Math.log(number)
|
154
|
+
end
|
153
155
|
end
|
154
|
-
end
|
155
156
|
|
156
|
-
|
157
|
-
|
158
|
-
|
159
|
-
|
160
|
-
|
161
|
-
|
157
|
+
def sqrt(number)
|
158
|
+
case @library
|
159
|
+
when :narray
|
160
|
+
NMath.sqrt(number)
|
161
|
+
else
|
162
|
+
Math.sqrt(number)
|
163
|
+
end
|
162
164
|
end
|
163
165
|
end
|
164
166
|
end
|
@@ -0,0 +1,66 @@
|
|
1
|
+
module TfIdfSimilarity
|
2
|
+
class Model
|
3
|
+
include MatrixMethods
|
4
|
+
|
5
|
+
extend Forwardable
|
6
|
+
def_delegators :@model, :documents, :terms, :document_count
|
7
|
+
|
8
|
+
# @param [Array<Document>] documents documents
|
9
|
+
# @param [Hash] opts optional arguments
|
10
|
+
# @option opts [Symbol] :library :gsl, :narray, :nmatrix or :matrix (default)
|
11
|
+
def initialize(documents, opts = {})
|
12
|
+
@model = TermCountModel.new(documents, opts)
|
13
|
+
@library = (opts[:library] || :matrix).to_sym
|
14
|
+
|
15
|
+
array = Array.new(terms.size) do |i|
|
16
|
+
idf = inverse_document_frequency(terms[i])
|
17
|
+
Array.new(documents.size) do |j|
|
18
|
+
term_frequency(documents[j], terms[i]) * idf
|
19
|
+
end
|
20
|
+
end
|
21
|
+
|
22
|
+
@matrix = initialize_matrix(array)
|
23
|
+
end
|
24
|
+
|
25
|
+
# Return the term frequency–inverse document frequency.
|
26
|
+
#
|
27
|
+
# @param [Document] document a document
|
28
|
+
# @param [String] term a term
|
29
|
+
# @return [Float] the term frequency–inverse document frequency
|
30
|
+
def term_frequency_inverse_document_frequency(document, term)
|
31
|
+
inverse_document_frequency(term) * term_frequency(document, term)
|
32
|
+
end
|
33
|
+
alias_method :tfidf, :term_frequency_inverse_document_frequency
|
34
|
+
|
35
|
+
# Returns a similarity matrix for the documents in the corpus.
|
36
|
+
#
|
37
|
+
# @return [GSL::Matrix,NMatrix,Matrix,Array] a similarity matrix, or an empty array if there are no documents
|
38
|
+
# @note Columns are normalized to unit vectors, so we can calculate the cosine
|
39
|
+
# similarity of all document vectors.
|
40
|
+
def similarity_matrix
|
41
|
+
if documents.empty?
|
42
|
+
[]
|
43
|
+
else
|
44
|
+
multiply_self(normalize)
|
45
|
+
end
|
46
|
+
end
|
47
|
+
|
48
|
+
# Return the index of the document in the corpus.
|
49
|
+
#
|
50
|
+
# @param [Document] document a document
|
51
|
+
# @return [Integer,nil] the index of the document
|
52
|
+
def document_index(document)
|
53
|
+
@model.documents.index(document)
|
54
|
+
end
|
55
|
+
|
56
|
+
# Return the index of the document with matching text.
|
57
|
+
#
|
58
|
+
# @param [String] text a text
|
59
|
+
# @return [Integer,nil] the index of the document
|
60
|
+
def text_index(text)
|
61
|
+
@model.documents.index do |document|
|
62
|
+
document.text == text
|
63
|
+
end
|
64
|
+
end
|
65
|
+
end
|
66
|
+
end
|
@@ -1,74 +1,76 @@
|
|
1
1
|
# A simple term-document matrix (one row per term, one column per document).
|
2
|
-
|
3
|
-
|
2
|
+
module TfIdfSimilarity
|
3
|
+
class TermCountModel
|
4
|
+
include MatrixMethods
|
4
5
|
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
6
|
+
# The documents in the corpus.
|
7
|
+
attr_reader :documents
|
8
|
+
# The set of terms in the corpus.
|
9
|
+
attr_reader :terms
|
10
|
+
# The average number of tokens in a document.
|
11
|
+
attr_reader :average_document_size
|
11
12
|
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
13
|
+
# @param [Array<Document>] documents documents
|
14
|
+
# @param [Hash] opts optional arguments
|
15
|
+
# @option opts [Symbol] :library :gsl, :narray, :nmatrix or :matrix (default)
|
16
|
+
def initialize(documents, opts = {})
|
17
|
+
@documents = documents
|
18
|
+
@terms = Set.new(documents.map(&:terms).flatten).to_a
|
19
|
+
@library = (opts[:library] || :matrix).to_sym
|
19
20
|
|
20
|
-
|
21
|
-
|
22
|
-
|
21
|
+
array = Array.new(terms.size) do |i|
|
22
|
+
Array.new(documents.size) do |j|
|
23
|
+
documents[j].term_count(terms[i])
|
24
|
+
end
|
23
25
|
end
|
24
|
-
end
|
25
26
|
|
26
|
-
|
27
|
+
@matrix = initialize_matrix(array)
|
27
28
|
|
28
|
-
|
29
|
-
|
29
|
+
@average_document_size = documents.empty? ? 0 : sum / column_size.to_f
|
30
|
+
end
|
30
31
|
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
32
|
+
# @param [String] term a term
|
33
|
+
# @return [Integer] the number of documents the term appears in
|
34
|
+
def document_count(term)
|
35
|
+
index = terms.index(term)
|
36
|
+
if index
|
37
|
+
case @library
|
38
|
+
when :gsl, :narray
|
39
|
+
row(index).where.size
|
40
|
+
when :nmatrix
|
41
|
+
row(index).each.count(&:nonzero?)
|
42
|
+
else
|
43
|
+
vector = row(index)
|
44
|
+
unless vector.respond_to?(:count)
|
45
|
+
vector = vector.to_a
|
46
|
+
end
|
47
|
+
vector.count(&:nonzero?)
|
45
48
|
end
|
46
|
-
|
49
|
+
else
|
50
|
+
0
|
47
51
|
end
|
48
|
-
else
|
49
|
-
0
|
50
52
|
end
|
51
|
-
end
|
52
53
|
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
54
|
+
# @param [String] term a term
|
55
|
+
# @return [Integer] the number of times the term appears in the corpus
|
56
|
+
def term_count(term)
|
57
|
+
index = terms.index(term)
|
58
|
+
if index
|
59
|
+
case @library
|
60
|
+
when :gsl, :narray
|
61
|
+
row(index).sum
|
62
|
+
when :nmatrix
|
63
|
+
row(index).each.reduce(0, :+) # NMatrix's `sum` method is slower
|
64
|
+
else
|
65
|
+
vector = row(index)
|
66
|
+
unless vector.respond_to?(:reduce)
|
67
|
+
vector = vector.to_a
|
68
|
+
end
|
69
|
+
vector.reduce(0, :+)
|
67
70
|
end
|
68
|
-
|
71
|
+
else
|
72
|
+
0
|
69
73
|
end
|
70
|
-
else
|
71
|
-
0
|
72
74
|
end
|
73
75
|
end
|
74
76
|
end
|