tf-idf-similarity 0.2.0 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.travis.yml +31 -3
- data/Gemfile +3 -1
- data/lib/tf-idf-similarity/bm25_model.rb +6 -2
- data/lib/tf-idf-similarity/matrix_methods.rb +18 -6
- data/lib/tf-idf-similarity/model.rb +1 -1
- data/lib/tf-idf-similarity/term_count_model.rb +3 -1
- data/lib/tf-idf-similarity/version.rb +1 -1
- data/spec/bm25_model_spec.rb +13 -8
- data/spec/spec_helper.rb +2 -0
- data/td-idf-similarity.gemspec +3 -2
- metadata +10 -11
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 6264c3854dae2c9b405e6880fc35161c7e7eb05521b88fa206d7d5f7cad53d73
|
4
|
+
data.tar.gz: d7d6c7f38723e35c256e14edecdfd7d4d993f58163a9c533e12f591051ed80dd
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 4f97120bc10adc73e504503969c4ee2c486ca5909def7fc69a6ed35c4ceb3ba03294fb60a583f334a8fd83b06fe37d5b13258fa85fd699c47a812d8bdc2ce23f
|
7
|
+
data.tar.gz: a84fd8950933635fadd85caeb6a2684dfa989664553c8971f8f1f6d1ec7f7a299388d8aec186facedf6ab9174d125182fdcdfee16facd7317b27af5b462399b9
|
data/.travis.yml
CHANGED
@@ -2,14 +2,42 @@ sudo: false
|
|
2
2
|
language: ruby
|
3
3
|
cache: bundler
|
4
4
|
rvm:
|
5
|
-
- 2.
|
6
|
-
- 2.
|
7
|
-
- 2.
|
5
|
+
- 2.4
|
6
|
+
- 2.5
|
7
|
+
- 2.6
|
8
|
+
- 2.7
|
9
|
+
- 3.0
|
10
|
+
- 3.1
|
11
|
+
- 3.2
|
12
|
+
- ruby-head
|
13
|
+
matrix:
|
14
|
+
exclude:
|
15
|
+
# No gem releases since 2017 and failing on new versions.
|
16
|
+
# https://rubygems.org/gems/gsl
|
17
|
+
# https://rubygems.org/gems/nmatrix
|
18
|
+
- rvm: 3.0
|
19
|
+
env: MATRIX_LIBRARY=gsl
|
20
|
+
- rvm: 3.1
|
21
|
+
env: MATRIX_LIBRARY=gsl
|
22
|
+
- rvm: 3.2
|
23
|
+
env: MATRIX_LIBRARY=gsl
|
24
|
+
- rvm: ruby-head
|
25
|
+
env: MATRIX_LIBRARY=gsl
|
26
|
+
- rvm: 3.2
|
27
|
+
env: MATRIX_LIBRARY=nmatrix
|
28
|
+
- rvm: ruby-head
|
29
|
+
env: MATRIX_LIBRARY=nmatrix
|
30
|
+
allow_failures:
|
31
|
+
- rvm: ruby-head
|
32
|
+
env: MATRIX_LIBRARY=matrix
|
33
|
+
- rvm: ruby-head
|
34
|
+
env: MATRIX_LIBRARY=narray
|
8
35
|
env:
|
9
36
|
- MATRIX_LIBRARY=gsl
|
10
37
|
- MATRIX_LIBRARY=narray
|
11
38
|
- MATRIX_LIBRARY=nmatrix
|
12
39
|
- MATRIX_LIBRARY=matrix
|
40
|
+
- MATRIX_LIBRARY=numo
|
13
41
|
addons:
|
14
42
|
apt:
|
15
43
|
packages:
|
data/Gemfile
CHANGED
@@ -1,8 +1,10 @@
|
|
1
1
|
source 'https://rubygems.org'
|
2
2
|
|
3
3
|
gem 'rb-gsl', '~> 1.16.0.2' if ENV['MATRIX_LIBRARY'] == 'gsl'
|
4
|
+
gem 'matrix' if ENV['MATRIX_LIBRARY'] == 'matrix'
|
4
5
|
gem 'narray', '~> 0.6.0.0' if ENV['MATRIX_LIBRARY'] == 'narray'
|
5
|
-
gem 'nmatrix', '~> 0.
|
6
|
+
gem 'nmatrix', '~> 0.2' if ENV['MATRIX_LIBRARY'] == 'nmatrix'
|
7
|
+
gem 'numo-narray', '~> 0.9.2.1' if ENV['MATRIX_LIBRARY'] == 'numo'
|
6
8
|
|
7
9
|
# Specify your gem's dependencies in the gemspec
|
8
10
|
gemspec
|
data/lib/tf-idf-similarity/bm25_model.rb
CHANGED
@@ -22,8 +22,12 @@ module TfIdfSimilarity
|
|
22
22
|
#
|
23
23
|
# @note Like Lucene, we use a b value of 0.75 and a k1 value of 1.2.
|
24
24
|
def term_frequency(document, term)
|
25
|
-
|
26
|
-
|
25
|
+
if @model.average_document_size.zero?
|
26
|
+
Float::NAN
|
27
|
+
else
|
28
|
+
tf = document.term_count(term)
|
29
|
+
(tf * 2.2) / (tf + 0.3 + 0.9 * document.size / @model.average_document_size)
|
30
|
+
end
|
27
31
|
end
|
28
32
|
alias_method :tf, :term_frequency
|
29
33
|
end
|
data/lib/tf-idf-similarity/matrix_methods.rb
CHANGED
@@ -17,6 +17,10 @@ module TfIdfSimilarity
|
|
17
17
|
norm = NMath.sqrt((@matrix ** 2).sum(1).reshape(@matrix.shape[0], 1))
|
18
18
|
norm[norm.where2[1]] = 1.0 # avoid division by zero
|
19
19
|
NMatrix.refer(@matrix / norm) # must be NMatrix for matrix multiplication
|
20
|
+
when :numo
|
21
|
+
norm = Numo::NMath.sqrt((@matrix ** 2).sum(0).reshape(1, @matrix.shape[1]))
|
22
|
+
norm[(norm.eq 0).where] = 1.0 # avoid division by zero
|
23
|
+
(@matrix / norm)
|
20
24
|
when :nmatrix # @see https://github.com/SciRuby/nmatrix/issues/38
|
21
25
|
normal = NMatrix.new(:dense, @matrix.shape, 0, :float64)
|
22
26
|
(0...@matrix.shape[1]).each do |j|
|
@@ -44,7 +48,7 @@ module TfIdfSimilarity
|
|
44
48
|
# @param [Integer] column index
|
45
49
|
def get(i, j)
|
46
50
|
case @library
|
47
|
-
when :narray
|
51
|
+
when :narray, :numo
|
48
52
|
@matrix[j, i]
|
49
53
|
else
|
50
54
|
@matrix[i, j]
|
@@ -57,6 +61,8 @@ module TfIdfSimilarity
|
|
57
61
|
case @library
|
58
62
|
when :narray
|
59
63
|
@matrix[true, index]
|
64
|
+
when :numo
|
65
|
+
@matrix[index, true]
|
60
66
|
else
|
61
67
|
@matrix.row(index)
|
62
68
|
end
|
@@ -66,7 +72,7 @@ module TfIdfSimilarity
|
|
66
72
|
# @return [GSL::Vector::View,NArray,NMatrix,Vector] a column
|
67
73
|
def column(index)
|
68
74
|
case @library
|
69
|
-
when :narray
|
75
|
+
when :narray, :numo
|
70
76
|
@matrix[index, true]
|
71
77
|
else
|
72
78
|
@matrix.column(index)
|
@@ -78,7 +84,7 @@ module TfIdfSimilarity
|
|
78
84
|
case @library
|
79
85
|
when :gsl, :nmatrix
|
80
86
|
@matrix.shape[0]
|
81
|
-
when :narray
|
87
|
+
when :narray, :numo
|
82
88
|
@matrix.shape[1]
|
83
89
|
else
|
84
90
|
@matrix.row_size
|
@@ -90,7 +96,7 @@ module TfIdfSimilarity
|
|
90
96
|
case @library
|
91
97
|
when :gsl, :nmatrix
|
92
98
|
@matrix.shape[1]
|
93
|
-
when :narray
|
99
|
+
when :narray, :numo
|
94
100
|
@matrix.shape[0]
|
95
101
|
else
|
96
102
|
@matrix.column_size
|
@@ -110,7 +116,7 @@ module TfIdfSimilarity
|
|
110
116
|
# @return [Float] the sum of all values in the matrix
|
111
117
|
def sum
|
112
118
|
case @library
|
113
|
-
when :narray
|
119
|
+
when :narray, :numo
|
114
120
|
@matrix.sum
|
115
121
|
else
|
116
122
|
values.reduce(0, :+)
|
@@ -125,6 +131,8 @@ module TfIdfSimilarity
|
|
125
131
|
GSL::Matrix[*array]
|
126
132
|
when :narray
|
127
133
|
NArray[*array]
|
134
|
+
when :numo
|
135
|
+
Numo::DFloat[*array]
|
128
136
|
when :nmatrix # @see https://github.com/SciRuby/nmatrix/issues/91#issuecomment-18870619
|
129
137
|
NMatrix.new(:dense, [array.size, array.empty? ? 0 : array[0].size], array.flatten, :float64)
|
130
138
|
else
|
@@ -136,7 +144,7 @@ module TfIdfSimilarity
|
|
136
144
|
# @return [GSL::Matrix,NArray,NMatrix,Matrix] the product
|
137
145
|
def multiply_self(matrix)
|
138
146
|
case @library
|
139
|
-
when :nmatrix
|
147
|
+
when :nmatrix, :numo
|
140
148
|
matrix.transpose.dot(matrix)
|
141
149
|
else
|
142
150
|
matrix.transpose * matrix
|
@@ -149,6 +157,8 @@ module TfIdfSimilarity
|
|
149
157
|
GSL::Sf::log(number)
|
150
158
|
when :narray
|
151
159
|
NMath.log(number)
|
160
|
+
when :numo
|
161
|
+
Numo::NMath.log(number)
|
152
162
|
else
|
153
163
|
Math.log(number)
|
154
164
|
end
|
@@ -158,6 +168,8 @@ module TfIdfSimilarity
|
|
158
168
|
case @library
|
159
169
|
when :narray
|
160
170
|
NMath.sqrt(number)
|
171
|
+
when :numo
|
172
|
+
Numo::NMath.sqrt(number)
|
161
173
|
else
|
162
174
|
Math.sqrt(number)
|
163
175
|
end
|
data/lib/tf-idf-similarity/model.rb
CHANGED
@@ -15,7 +15,7 @@ module TfIdfSimilarity
|
|
15
15
|
array = Array.new(terms.size) do |i|
|
16
16
|
idf = inverse_document_frequency(terms[i])
|
17
17
|
Array.new(documents.size) do |j|
|
18
|
-
term_frequency(documents[j], terms[i]) * idf
|
18
|
+
(term_frequency(documents[j], terms[i]) * idf).to_f
|
19
19
|
end
|
20
20
|
end
|
21
21
|
|
data/lib/tf-idf-similarity/term_count_model.rb
CHANGED
@@ -37,6 +37,8 @@ module TfIdfSimilarity
|
|
37
37
|
case @library
|
38
38
|
when :gsl, :narray
|
39
39
|
row(index).where.size
|
40
|
+
when :numo
|
41
|
+
(row(index).ne 0).where.size
|
40
42
|
when :nmatrix
|
41
43
|
row(index).each.count(&:nonzero?)
|
42
44
|
else
|
@@ -57,7 +59,7 @@ module TfIdfSimilarity
|
|
57
59
|
index = terms.index(term)
|
58
60
|
if index
|
59
61
|
case @library
|
60
|
-
when :gsl, :narray
|
62
|
+
when :gsl, :narray, :numo
|
61
63
|
row(index).sum
|
62
64
|
when :nmatrix
|
63
65
|
row(index).each.reduce(0, :+) # NMatrix's `sum` method is slower
|
data/spec/bm25_model_spec.rb
CHANGED
@@ -82,7 +82,12 @@ module TfIdfSimilarity
|
|
82
82
|
|
83
83
|
describe '#term_frequency_inverse_document_frequency' do
|
84
84
|
it 'should return negative infinity' do
|
85
|
-
|
85
|
+
case MATRIX_LIBRARY
|
86
|
+
when :numo
|
87
|
+
model.tfidf(document, 'foo').isnan.should eq 1
|
88
|
+
else
|
89
|
+
model.tfidf(document, 'foo').should be_nan
|
90
|
+
end
|
86
91
|
end
|
87
92
|
end
|
88
93
|
|
@@ -147,7 +152,7 @@ module TfIdfSimilarity
|
|
147
152
|
end
|
148
153
|
|
149
154
|
it 'should return the term frequency if tokens given' do
|
150
|
-
model.tf(document_with_tokens, 'foo-foo').should == (1 * 2.2) / (1 + 0.3 + 0.9 *
|
155
|
+
model.tf(document_with_tokens, 'foo-foo').should == (1 * 2.2) / (1 + 0.3 + 0.9 * 3 / 5.5)
|
151
156
|
end
|
152
157
|
|
153
158
|
it 'should return no term frequency if no text given' do
|
@@ -155,7 +160,7 @@ module TfIdfSimilarity
|
|
155
160
|
end
|
156
161
|
|
157
162
|
it 'should return the term frequency if term counts given' do
|
158
|
-
model.tf(document_with_term_counts, 'bar').should == (5 * 2.2) / (5 + 0.3 + 0.9 *
|
163
|
+
model.tf(document_with_term_counts, 'bar').should == (5 * 2.2) / (5 + 0.3 + 0.9 * 15 / 5.5)
|
159
164
|
end
|
160
165
|
|
161
166
|
it 'should return the term frequency of a non-occurring term' do
|
@@ -163,7 +168,7 @@ module TfIdfSimilarity
|
|
163
168
|
end
|
164
169
|
|
165
170
|
it 'should return the term frequency in a non-occurring document' do
|
166
|
-
model.tf(non_corpus_document, 'foo').should == (3 * 2.2) / (3 + 0.3 + 0.9 *
|
171
|
+
model.tf(non_corpus_document, 'foo').should == (3 * 2.2) / (3 + 0.3 + 0.9 * 3 / 5.5)
|
167
172
|
end
|
168
173
|
end
|
169
174
|
|
@@ -177,17 +182,17 @@ module TfIdfSimilarity
|
|
177
182
|
end
|
178
183
|
|
179
184
|
it 'should return the tf*idf in a non-occurring term' do
|
180
|
-
model.tfidf(non_corpus_document, 'foo').should be_within(0.001).of(Math.log((4 - 1 + 0.5) / (1 + 0.5)) * (3 * 2.2) / (3 + 0.3 + 0.9 *
|
185
|
+
model.tfidf(non_corpus_document, 'foo').should be_within(0.001).of(Math.log((4 - 1 + 0.5) / (1 + 0.5)) * (3 * 2.2) / (3 + 0.3 + 0.9 * 3 / 5.5))
|
181
186
|
end
|
182
187
|
end
|
183
188
|
|
184
189
|
describe '#similarity_matrix' do
|
185
190
|
it 'should return the similarity matrix' do
|
186
191
|
expected = [
|
187
|
-
1.0, 0.
|
188
|
-
0.
|
192
|
+
1.0, 0.558, 0.0, 0.449,
|
193
|
+
0.558, 1.0, 0.0, 0.501,
|
189
194
|
0.0, 0.0, 0.0, 0.0,
|
190
|
-
0.
|
195
|
+
0.449, 0.501, 0.0, 1.0,
|
191
196
|
]
|
192
197
|
|
193
198
|
similarity_matrix_values(model).each_with_index do |value,i|
|
data/spec/spec_helper.rb
CHANGED
data/td-idf-similarity.gemspec
CHANGED
@@ -9,6 +9,7 @@ Gem::Specification.new do |s|
|
|
9
9
|
s.homepage = "https://github.com/jpmckinney/tf-idf-similarity"
|
10
10
|
s.summary = %q{Calculates the similarity between texts using tf*idf}
|
11
11
|
s.license = 'MIT'
|
12
|
+
s.required_ruby_version = '>= 2.4.0'
|
12
13
|
|
13
14
|
s.files = `git ls-files`.split("\n")
|
14
15
|
s.test_files = `git ls-files -- {test,spec,features}/*`.split("\n")
|
@@ -19,6 +20,6 @@ Gem::Specification.new do |s|
|
|
19
20
|
|
20
21
|
s.add_development_dependency('coveralls')
|
21
22
|
s.add_development_dependency('json', '< 2')
|
22
|
-
s.add_development_dependency('rake'
|
23
|
-
s.add_development_dependency('rspec', '~>
|
23
|
+
s.add_development_dependency('rake')
|
24
|
+
s.add_development_dependency('rspec', '~> 3.0')
|
24
25
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: tf-idf-similarity
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.3.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- James McKinney
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2024-02-26 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: unicode_utils
|
@@ -56,30 +56,30 @@ dependencies:
|
|
56
56
|
name: rake
|
57
57
|
requirement: !ruby/object:Gem::Requirement
|
58
58
|
requirements:
|
59
|
-
- - "
|
59
|
+
- - ">="
|
60
60
|
- !ruby/object:Gem::Version
|
61
|
-
version: '
|
61
|
+
version: '0'
|
62
62
|
type: :development
|
63
63
|
prerelease: false
|
64
64
|
version_requirements: !ruby/object:Gem::Requirement
|
65
65
|
requirements:
|
66
|
-
- - "
|
66
|
+
- - ">="
|
67
67
|
- !ruby/object:Gem::Version
|
68
|
-
version: '
|
68
|
+
version: '0'
|
69
69
|
- !ruby/object:Gem::Dependency
|
70
70
|
name: rspec
|
71
71
|
requirement: !ruby/object:Gem::Requirement
|
72
72
|
requirements:
|
73
73
|
- - "~>"
|
74
74
|
- !ruby/object:Gem::Version
|
75
|
-
version: '
|
75
|
+
version: '3.0'
|
76
76
|
type: :development
|
77
77
|
prerelease: false
|
78
78
|
version_requirements: !ruby/object:Gem::Requirement
|
79
79
|
requirements:
|
80
80
|
- - "~>"
|
81
81
|
- !ruby/object:Gem::Version
|
82
|
-
version: '
|
82
|
+
version: '3.0'
|
83
83
|
description:
|
84
84
|
email:
|
85
85
|
executables: []
|
@@ -126,15 +126,14 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
126
126
|
requirements:
|
127
127
|
- - ">="
|
128
128
|
- !ruby/object:Gem::Version
|
129
|
-
version:
|
129
|
+
version: 2.4.0
|
130
130
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
131
131
|
requirements:
|
132
132
|
- - ">="
|
133
133
|
- !ruby/object:Gem::Version
|
134
134
|
version: '0'
|
135
135
|
requirements: []
|
136
|
-
|
137
|
-
rubygems_version: 2.7.6
|
136
|
+
rubygems_version: 3.0.3.1
|
138
137
|
signing_key:
|
139
138
|
specification_version: 4
|
140
139
|
summary: Calculates the similarity between texts using tf*idf
|