tf-idf-similarity 0.2.0 → 0.3.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.travis.yml +31 -3
- data/Gemfile +3 -1
- data/lib/tf-idf-similarity/bm25_model.rb +6 -2
- data/lib/tf-idf-similarity/matrix_methods.rb +18 -6
- data/lib/tf-idf-similarity/model.rb +1 -1
- data/lib/tf-idf-similarity/term_count_model.rb +3 -1
- data/lib/tf-idf-similarity/version.rb +1 -1
- data/spec/bm25_model_spec.rb +13 -8
- data/spec/spec_helper.rb +2 -0
- data/td-idf-similarity.gemspec +3 -2
- metadata +10 -11
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 6264c3854dae2c9b405e6880fc35161c7e7eb05521b88fa206d7d5f7cad53d73
|
4
|
+
data.tar.gz: d7d6c7f38723e35c256e14edecdfd7d4d993f58163a9c533e12f591051ed80dd
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 4f97120bc10adc73e504503969c4ee2c486ca5909def7fc69a6ed35c4ceb3ba03294fb60a583f334a8fd83b06fe37d5b13258fa85fd699c47a812d8bdc2ce23f
|
7
|
+
data.tar.gz: a84fd8950933635fadd85caeb6a2684dfa989664553c8971f8f1f6d1ec7f7a299388d8aec186facedf6ab9174d125182fdcdfee16facd7317b27af5b462399b9
|
data/.travis.yml
CHANGED
@@ -2,14 +2,42 @@ sudo: false
|
|
2
2
|
language: ruby
|
3
3
|
cache: bundler
|
4
4
|
rvm:
|
5
|
-
- 2.
|
6
|
-
- 2.
|
7
|
-
- 2.
|
5
|
+
- 2.4
|
6
|
+
- 2.5
|
7
|
+
- 2.6
|
8
|
+
- 2.7
|
9
|
+
- 3.0
|
10
|
+
- 3.1
|
11
|
+
- 3.2
|
12
|
+
- ruby-head
|
13
|
+
matrix:
|
14
|
+
exclude:
|
15
|
+
# No gem releases since 2017 and failing on new versions.
|
16
|
+
# https://rubygems.org/gems/gsl
|
17
|
+
# https://rubygems.org/gems/nmatrix
|
18
|
+
- rvm: 3.0
|
19
|
+
env: MATRIX_LIBRARY=gsl
|
20
|
+
- rvm: 3.1
|
21
|
+
env: MATRIX_LIBRARY=gsl
|
22
|
+
- rvm: 3.2
|
23
|
+
env: MATRIX_LIBRARY=gsl
|
24
|
+
- rvm: ruby-head
|
25
|
+
env: MATRIX_LIBRARY=gsl
|
26
|
+
- rvm: 3.2
|
27
|
+
env: MATRIX_LIBRARY=nmatrix
|
28
|
+
- rvm: ruby-head
|
29
|
+
env: MATRIX_LIBRARY=nmatrix
|
30
|
+
allow_failures:
|
31
|
+
- rvm: ruby-head
|
32
|
+
env: MATRIX_LIBRARY=matrix
|
33
|
+
- rvm: ruby-head
|
34
|
+
env: MATRIX_LIBRARY=narray
|
8
35
|
env:
|
9
36
|
- MATRIX_LIBRARY=gsl
|
10
37
|
- MATRIX_LIBRARY=narray
|
11
38
|
- MATRIX_LIBRARY=nmatrix
|
12
39
|
- MATRIX_LIBRARY=matrix
|
40
|
+
- MATRIX_LIBRARY=numo
|
13
41
|
addons:
|
14
42
|
apt:
|
15
43
|
packages:
|
data/Gemfile
CHANGED
@@ -1,8 +1,10 @@
|
|
1
1
|
source 'https://rubygems.org'
|
2
2
|
|
3
3
|
gem 'rb-gsl', '~> 1.16.0.2' if ENV['MATRIX_LIBRARY'] == 'gsl'
|
4
|
+
gem 'matrix' if ENV['MATRIX_LIBRARY'] == 'matrix'
|
4
5
|
gem 'narray', '~> 0.6.0.0' if ENV['MATRIX_LIBRARY'] == 'narray'
|
5
|
-
gem 'nmatrix', '~> 0.
|
6
|
+
gem 'nmatrix', '~> 0.2' if ENV['MATRIX_LIBRARY'] == 'nmatrix'
|
7
|
+
gem 'numo-narray', '~> 0.9.2.1' if ENV['MATRIX_LIBRARY'] == 'numo'
|
6
8
|
|
7
9
|
# Specify your gem's dependencies in the gemspec
|
8
10
|
gemspec
|
@@ -22,8 +22,12 @@ module TfIdfSimilarity
|
|
22
22
|
#
|
23
23
|
# @note Like Lucene, we use a b value of 0.75 and a k1 value of 1.2.
|
24
24
|
def term_frequency(document, term)
|
25
|
-
|
26
|
-
|
25
|
+
if @model.average_document_size.zero?
|
26
|
+
Float::NAN
|
27
|
+
else
|
28
|
+
tf = document.term_count(term)
|
29
|
+
(tf * 2.2) / (tf + 0.3 + 0.9 * document.size / @model.average_document_size)
|
30
|
+
end
|
27
31
|
end
|
28
32
|
alias_method :tf, :term_frequency
|
29
33
|
end
|
@@ -17,6 +17,10 @@ module TfIdfSimilarity
|
|
17
17
|
norm = NMath.sqrt((@matrix ** 2).sum(1).reshape(@matrix.shape[0], 1))
|
18
18
|
norm[norm.where2[1]] = 1.0 # avoid division by zero
|
19
19
|
NMatrix.refer(@matrix / norm) # must be NMatrix for matrix multiplication
|
20
|
+
when :numo
|
21
|
+
norm = Numo::NMath.sqrt((@matrix ** 2).sum(0).reshape(1, @matrix.shape[1]))
|
22
|
+
norm[(norm.eq 0).where] = 1.0 # avoid division by zero
|
23
|
+
(@matrix / norm)
|
20
24
|
when :nmatrix # @see https://github.com/SciRuby/nmatrix/issues/38
|
21
25
|
normal = NMatrix.new(:dense, @matrix.shape, 0, :float64)
|
22
26
|
(0...@matrix.shape[1]).each do |j|
|
@@ -44,7 +48,7 @@ module TfIdfSimilarity
|
|
44
48
|
# @param [Integer] column index
|
45
49
|
def get(i, j)
|
46
50
|
case @library
|
47
|
-
when :narray
|
51
|
+
when :narray, :numo
|
48
52
|
@matrix[j, i]
|
49
53
|
else
|
50
54
|
@matrix[i, j]
|
@@ -57,6 +61,8 @@ module TfIdfSimilarity
|
|
57
61
|
case @library
|
58
62
|
when :narray
|
59
63
|
@matrix[true, index]
|
64
|
+
when :numo
|
65
|
+
@matrix[index, true]
|
60
66
|
else
|
61
67
|
@matrix.row(index)
|
62
68
|
end
|
@@ -66,7 +72,7 @@ module TfIdfSimilarity
|
|
66
72
|
# @return [GSL::Vector::View,NArray,NMatrix,Vector] a column
|
67
73
|
def column(index)
|
68
74
|
case @library
|
69
|
-
when :narray
|
75
|
+
when :narray, :numo
|
70
76
|
@matrix[index, true]
|
71
77
|
else
|
72
78
|
@matrix.column(index)
|
@@ -78,7 +84,7 @@ module TfIdfSimilarity
|
|
78
84
|
case @library
|
79
85
|
when :gsl, :nmatrix
|
80
86
|
@matrix.shape[0]
|
81
|
-
when :narray
|
87
|
+
when :narray, :numo
|
82
88
|
@matrix.shape[1]
|
83
89
|
else
|
84
90
|
@matrix.row_size
|
@@ -90,7 +96,7 @@ module TfIdfSimilarity
|
|
90
96
|
case @library
|
91
97
|
when :gsl, :nmatrix
|
92
98
|
@matrix.shape[1]
|
93
|
-
when :narray
|
99
|
+
when :narray, :numo
|
94
100
|
@matrix.shape[0]
|
95
101
|
else
|
96
102
|
@matrix.column_size
|
@@ -110,7 +116,7 @@ module TfIdfSimilarity
|
|
110
116
|
# @return [Float] the sum of all values in the matrix
|
111
117
|
def sum
|
112
118
|
case @library
|
113
|
-
when :narray
|
119
|
+
when :narray, :numo
|
114
120
|
@matrix.sum
|
115
121
|
else
|
116
122
|
values.reduce(0, :+)
|
@@ -125,6 +131,8 @@ module TfIdfSimilarity
|
|
125
131
|
GSL::Matrix[*array]
|
126
132
|
when :narray
|
127
133
|
NArray[*array]
|
134
|
+
when :numo
|
135
|
+
Numo::DFloat[*array]
|
128
136
|
when :nmatrix # @see https://github.com/SciRuby/nmatrix/issues/91#issuecomment-18870619
|
129
137
|
NMatrix.new(:dense, [array.size, array.empty? ? 0 : array[0].size], array.flatten, :float64)
|
130
138
|
else
|
@@ -136,7 +144,7 @@ module TfIdfSimilarity
|
|
136
144
|
# @return [GSL::Matrix,NArray,NMatrix,Matrix] the product
|
137
145
|
def multiply_self(matrix)
|
138
146
|
case @library
|
139
|
-
when :nmatrix
|
147
|
+
when :nmatrix, :numo
|
140
148
|
matrix.transpose.dot(matrix)
|
141
149
|
else
|
142
150
|
matrix.transpose * matrix
|
@@ -149,6 +157,8 @@ module TfIdfSimilarity
|
|
149
157
|
GSL::Sf::log(number)
|
150
158
|
when :narray
|
151
159
|
NMath.log(number)
|
160
|
+
when :numo
|
161
|
+
Numo::NMath.log(number)
|
152
162
|
else
|
153
163
|
Math.log(number)
|
154
164
|
end
|
@@ -158,6 +168,8 @@ module TfIdfSimilarity
|
|
158
168
|
case @library
|
159
169
|
when :narray
|
160
170
|
NMath.sqrt(number)
|
171
|
+
when :numo
|
172
|
+
Numo::NMath.sqrt(number)
|
161
173
|
else
|
162
174
|
Math.sqrt(number)
|
163
175
|
end
|
@@ -15,7 +15,7 @@ module TfIdfSimilarity
|
|
15
15
|
array = Array.new(terms.size) do |i|
|
16
16
|
idf = inverse_document_frequency(terms[i])
|
17
17
|
Array.new(documents.size) do |j|
|
18
|
-
term_frequency(documents[j], terms[i]) * idf
|
18
|
+
(term_frequency(documents[j], terms[i]) * idf).to_f
|
19
19
|
end
|
20
20
|
end
|
21
21
|
|
@@ -37,6 +37,8 @@ module TfIdfSimilarity
|
|
37
37
|
case @library
|
38
38
|
when :gsl, :narray
|
39
39
|
row(index).where.size
|
40
|
+
when :numo
|
41
|
+
(row(index).ne 0).where.size
|
40
42
|
when :nmatrix
|
41
43
|
row(index).each.count(&:nonzero?)
|
42
44
|
else
|
@@ -57,7 +59,7 @@ module TfIdfSimilarity
|
|
57
59
|
index = terms.index(term)
|
58
60
|
if index
|
59
61
|
case @library
|
60
|
-
when :gsl, :narray
|
62
|
+
when :gsl, :narray, :numo
|
61
63
|
row(index).sum
|
62
64
|
when :nmatrix
|
63
65
|
row(index).each.reduce(0, :+) # NMatrix's `sum` method is slower
|
data/spec/bm25_model_spec.rb
CHANGED
@@ -82,7 +82,12 @@ module TfIdfSimilarity
|
|
82
82
|
|
83
83
|
describe '#term_frequency_inverse_document_frequency' do
|
84
84
|
it 'should return negative infinity' do
|
85
|
-
|
85
|
+
case MATRIX_LIBRARY
|
86
|
+
when :numo
|
87
|
+
model.tfidf(document, 'foo').isnan.should eq 1
|
88
|
+
else
|
89
|
+
model.tfidf(document, 'foo').should be_nan
|
90
|
+
end
|
86
91
|
end
|
87
92
|
end
|
88
93
|
|
@@ -147,7 +152,7 @@ module TfIdfSimilarity
|
|
147
152
|
end
|
148
153
|
|
149
154
|
it 'should return the term frequency if tokens given' do
|
150
|
-
model.tf(document_with_tokens, 'foo-foo').should == (1 * 2.2) / (1 + 0.3 + 0.9 *
|
155
|
+
model.tf(document_with_tokens, 'foo-foo').should == (1 * 2.2) / (1 + 0.3 + 0.9 * 3 / 5.5)
|
151
156
|
end
|
152
157
|
|
153
158
|
it 'should return no term frequency if no text given' do
|
@@ -155,7 +160,7 @@ module TfIdfSimilarity
|
|
155
160
|
end
|
156
161
|
|
157
162
|
it 'should return the term frequency if term counts given' do
|
158
|
-
model.tf(document_with_term_counts, 'bar').should == (5 * 2.2) / (5 + 0.3 + 0.9 *
|
163
|
+
model.tf(document_with_term_counts, 'bar').should == (5 * 2.2) / (5 + 0.3 + 0.9 * 15 / 5.5)
|
159
164
|
end
|
160
165
|
|
161
166
|
it 'should return the term frequency of a non-occurring term' do
|
@@ -163,7 +168,7 @@ module TfIdfSimilarity
|
|
163
168
|
end
|
164
169
|
|
165
170
|
it 'should return the term frequency in a non-occurring document' do
|
166
|
-
model.tf(non_corpus_document, 'foo').should == (3 * 2.2) / (3 + 0.3 + 0.9 *
|
171
|
+
model.tf(non_corpus_document, 'foo').should == (3 * 2.2) / (3 + 0.3 + 0.9 * 3 / 5.5)
|
167
172
|
end
|
168
173
|
end
|
169
174
|
|
@@ -177,17 +182,17 @@ module TfIdfSimilarity
|
|
177
182
|
end
|
178
183
|
|
179
184
|
it 'should return the tf*idf in a non-occurring term' do
|
180
|
-
model.tfidf(non_corpus_document, 'foo').should be_within(0.001).of(Math.log((4 - 1 + 0.5) / (1 + 0.5)) * (3 * 2.2) / (3 + 0.3 + 0.9 *
|
185
|
+
model.tfidf(non_corpus_document, 'foo').should be_within(0.001).of(Math.log((4 - 1 + 0.5) / (1 + 0.5)) * (3 * 2.2) / (3 + 0.3 + 0.9 * 3 / 5.5))
|
181
186
|
end
|
182
187
|
end
|
183
188
|
|
184
189
|
describe '#similarity_matrix' do
|
185
190
|
it 'should return the similarity matrix' do
|
186
191
|
expected = [
|
187
|
-
1.0, 0.
|
188
|
-
0.
|
192
|
+
1.0, 0.558, 0.0, 0.449,
|
193
|
+
0.558, 1.0, 0.0, 0.501,
|
189
194
|
0.0, 0.0, 0.0, 0.0,
|
190
|
-
0.
|
195
|
+
0.449, 0.501, 0.0, 1.0,
|
191
196
|
]
|
192
197
|
|
193
198
|
similarity_matrix_values(model).each_with_index do |value,i|
|
data/spec/spec_helper.rb
CHANGED
data/td-idf-similarity.gemspec
CHANGED
@@ -9,6 +9,7 @@ Gem::Specification.new do |s|
|
|
9
9
|
s.homepage = "https://github.com/jpmckinney/tf-idf-similarity"
|
10
10
|
s.summary = %q{Calculates the similarity between texts using tf*idf}
|
11
11
|
s.license = 'MIT'
|
12
|
+
s.required_ruby_version = '>= 2.4.0'
|
12
13
|
|
13
14
|
s.files = `git ls-files`.split("\n")
|
14
15
|
s.test_files = `git ls-files -- {test,spec,features}/*`.split("\n")
|
@@ -19,6 +20,6 @@ Gem::Specification.new do |s|
|
|
19
20
|
|
20
21
|
s.add_development_dependency('coveralls')
|
21
22
|
s.add_development_dependency('json', '< 2')
|
22
|
-
s.add_development_dependency('rake'
|
23
|
-
s.add_development_dependency('rspec', '~>
|
23
|
+
s.add_development_dependency('rake')
|
24
|
+
s.add_development_dependency('rspec', '~> 3.0')
|
24
25
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: tf-idf-similarity
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.3.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- James McKinney
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2024-02-26 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: unicode_utils
|
@@ -56,30 +56,30 @@ dependencies:
|
|
56
56
|
name: rake
|
57
57
|
requirement: !ruby/object:Gem::Requirement
|
58
58
|
requirements:
|
59
|
-
- - "
|
59
|
+
- - ">="
|
60
60
|
- !ruby/object:Gem::Version
|
61
|
-
version: '
|
61
|
+
version: '0'
|
62
62
|
type: :development
|
63
63
|
prerelease: false
|
64
64
|
version_requirements: !ruby/object:Gem::Requirement
|
65
65
|
requirements:
|
66
|
-
- - "
|
66
|
+
- - ">="
|
67
67
|
- !ruby/object:Gem::Version
|
68
|
-
version: '
|
68
|
+
version: '0'
|
69
69
|
- !ruby/object:Gem::Dependency
|
70
70
|
name: rspec
|
71
71
|
requirement: !ruby/object:Gem::Requirement
|
72
72
|
requirements:
|
73
73
|
- - "~>"
|
74
74
|
- !ruby/object:Gem::Version
|
75
|
-
version: '
|
75
|
+
version: '3.0'
|
76
76
|
type: :development
|
77
77
|
prerelease: false
|
78
78
|
version_requirements: !ruby/object:Gem::Requirement
|
79
79
|
requirements:
|
80
80
|
- - "~>"
|
81
81
|
- !ruby/object:Gem::Version
|
82
|
-
version: '
|
82
|
+
version: '3.0'
|
83
83
|
description:
|
84
84
|
email:
|
85
85
|
executables: []
|
@@ -126,15 +126,14 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
126
126
|
requirements:
|
127
127
|
- - ">="
|
128
128
|
- !ruby/object:Gem::Version
|
129
|
-
version:
|
129
|
+
version: 2.4.0
|
130
130
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
131
131
|
requirements:
|
132
132
|
- - ">="
|
133
133
|
- !ruby/object:Gem::Version
|
134
134
|
version: '0'
|
135
135
|
requirements: []
|
136
|
-
|
137
|
-
rubygems_version: 2.7.6
|
136
|
+
rubygems_version: 3.0.3.1
|
138
137
|
signing_key:
|
139
138
|
specification_version: 4
|
140
139
|
summary: Calculates the similarity between texts using tf*idf
|