tf-idf-similarity 0.2.0 → 0.3.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 605ac457508eaf64a7e583e8a4a71af231d3d9d2f9c30ee82b25fb9f647d1312
4
- data.tar.gz: f24b89dccdcbef3c4fcaa59d15050f064455859c134c550fd6a432346883eb31
3
+ metadata.gz: 6264c3854dae2c9b405e6880fc35161c7e7eb05521b88fa206d7d5f7cad53d73
4
+ data.tar.gz: d7d6c7f38723e35c256e14edecdfd7d4d993f58163a9c533e12f591051ed80dd
5
5
  SHA512:
6
- metadata.gz: a41195c6543dea206baa8ce3e2095437d1df94fabedcc76a8151fa5af5991524d96530710a7216c1fef48a7008f88a43773ce2a2323afa563fa29f5abed9909c
7
- data.tar.gz: aadbb85d6bd74625088d0aa7cb58b4127337d5c1dcc2af13c22664f1562013c59d79d8b3bcc3564a2861dfd968d39770205d3b401114e8bdf870b2ac412fda26
6
+ metadata.gz: 4f97120bc10adc73e504503969c4ee2c486ca5909def7fc69a6ed35c4ceb3ba03294fb60a583f334a8fd83b06fe37d5b13258fa85fd699c47a812d8bdc2ce23f
7
+ data.tar.gz: a84fd8950933635fadd85caeb6a2684dfa989664553c8971f8f1f6d1ec7f7a299388d8aec186facedf6ab9174d125182fdcdfee16facd7317b27af5b462399b9
data/.travis.yml CHANGED
@@ -2,14 +2,42 @@ sudo: false
2
2
  language: ruby
3
3
  cache: bundler
4
4
  rvm:
5
- - 2.0.0
6
- - 2.1.0
7
- - 2.2.0
5
+ - 2.4
6
+ - 2.5
7
+ - 2.6
8
+ - 2.7
9
+ - 3.0
10
+ - 3.1
11
+ - 3.2
12
+ - ruby-head
13
+ matrix:
14
+ exclude:
15
+ # No gem releases since 2017 and failing on new versions.
16
+ # https://rubygems.org/gems/gsl
17
+ # https://rubygems.org/gems/nmatrix
18
+ - rvm: 3.0
19
+ env: MATRIX_LIBRARY=gsl
20
+ - rvm: 3.1
21
+ env: MATRIX_LIBRARY=gsl
22
+ - rvm: 3.2
23
+ env: MATRIX_LIBRARY=gsl
24
+ - rvm: ruby-head
25
+ env: MATRIX_LIBRARY=gsl
26
+ - rvm: 3.2
27
+ env: MATRIX_LIBRARY=nmatrix
28
+ - rvm: ruby-head
29
+ env: MATRIX_LIBRARY=nmatrix
30
+ allow_failures:
31
+ - rvm: ruby-head
32
+ env: MATRIX_LIBRARY=matrix
33
+ - rvm: ruby-head
34
+ env: MATRIX_LIBRARY=narray
8
35
  env:
9
36
  - MATRIX_LIBRARY=gsl
10
37
  - MATRIX_LIBRARY=narray
11
38
  - MATRIX_LIBRARY=nmatrix
12
39
  - MATRIX_LIBRARY=matrix
40
+ - MATRIX_LIBRARY=numo
13
41
  addons:
14
42
  apt:
15
43
  packages:
data/Gemfile CHANGED
@@ -1,8 +1,10 @@
1
1
  source 'https://rubygems.org'
2
2
 
3
3
  gem 'rb-gsl', '~> 1.16.0.2' if ENV['MATRIX_LIBRARY'] == 'gsl'
4
+ gem 'matrix' if ENV['MATRIX_LIBRARY'] == 'matrix'
4
5
  gem 'narray', '~> 0.6.0.0' if ENV['MATRIX_LIBRARY'] == 'narray'
5
- gem 'nmatrix', '~> 0.1.0.rc5' if ENV['MATRIX_LIBRARY'] == 'nmatrix' && RUBY_VERSION >= '1.9'
6
+ gem 'nmatrix', '~> 0.2' if ENV['MATRIX_LIBRARY'] == 'nmatrix'
7
+ gem 'numo-narray', '~> 0.9.2.1' if ENV['MATRIX_LIBRARY'] == 'numo'
6
8
 
7
9
  # Specify your gem's dependencies in the gemspec
8
10
  gemspec
@@ -22,8 +22,12 @@ module TfIdfSimilarity
22
22
  #
23
23
  # @note Like Lucene, we use a b value of 0.75 and a k1 value of 1.2.
24
24
  def term_frequency(document, term)
25
- tf = document.term_count(term)
26
- (tf * 2.2) / (tf + 0.3 + 0.9 * documents.size / @model.average_document_size)
25
+ if @model.average_document_size.zero?
26
+ Float::NAN
27
+ else
28
+ tf = document.term_count(term)
29
+ (tf * 2.2) / (tf + 0.3 + 0.9 * document.size / @model.average_document_size)
30
+ end
27
31
  end
28
32
  alias_method :tf, :term_frequency
29
33
  end
@@ -17,6 +17,10 @@ module TfIdfSimilarity
17
17
  norm = NMath.sqrt((@matrix ** 2).sum(1).reshape(@matrix.shape[0], 1))
18
18
  norm[norm.where2[1]] = 1.0 # avoid division by zero
19
19
  NMatrix.refer(@matrix / norm) # must be NMatrix for matrix multiplication
20
+ when :numo
21
+ norm = Numo::NMath.sqrt((@matrix ** 2).sum(0).reshape(1, @matrix.shape[1]))
22
+ norm[(norm.eq 0).where] = 1.0 # avoid division by zero
23
+ (@matrix / norm)
20
24
  when :nmatrix # @see https://github.com/SciRuby/nmatrix/issues/38
21
25
  normal = NMatrix.new(:dense, @matrix.shape, 0, :float64)
22
26
  (0...@matrix.shape[1]).each do |j|
@@ -44,7 +48,7 @@ module TfIdfSimilarity
44
48
  # @param [Integer] column index
45
49
  def get(i, j)
46
50
  case @library
47
- when :narray
51
+ when :narray, :numo
48
52
  @matrix[j, i]
49
53
  else
50
54
  @matrix[i, j]
@@ -57,6 +61,8 @@ module TfIdfSimilarity
57
61
  case @library
58
62
  when :narray
59
63
  @matrix[true, index]
64
+ when :numo
65
+ @matrix[index, true]
60
66
  else
61
67
  @matrix.row(index)
62
68
  end
@@ -66,7 +72,7 @@ module TfIdfSimilarity
66
72
  # @return [GSL::Vector::View,NArray,NMatrix,Vector] a column
67
73
  def column(index)
68
74
  case @library
69
- when :narray
75
+ when :narray, :numo
70
76
  @matrix[index, true]
71
77
  else
72
78
  @matrix.column(index)
@@ -78,7 +84,7 @@ module TfIdfSimilarity
78
84
  case @library
79
85
  when :gsl, :nmatrix
80
86
  @matrix.shape[0]
81
- when :narray
87
+ when :narray, :numo
82
88
  @matrix.shape[1]
83
89
  else
84
90
  @matrix.row_size
@@ -90,7 +96,7 @@ module TfIdfSimilarity
90
96
  case @library
91
97
  when :gsl, :nmatrix
92
98
  @matrix.shape[1]
93
- when :narray
99
+ when :narray, :numo
94
100
  @matrix.shape[0]
95
101
  else
96
102
  @matrix.column_size
@@ -110,7 +116,7 @@ module TfIdfSimilarity
110
116
  # @return [Float] the sum of all values in the matrix
111
117
  def sum
112
118
  case @library
113
- when :narray
119
+ when :narray, :numo
114
120
  @matrix.sum
115
121
  else
116
122
  values.reduce(0, :+)
@@ -125,6 +131,8 @@ module TfIdfSimilarity
125
131
  GSL::Matrix[*array]
126
132
  when :narray
127
133
  NArray[*array]
134
+ when :numo
135
+ Numo::DFloat[*array]
128
136
  when :nmatrix # @see https://github.com/SciRuby/nmatrix/issues/91#issuecomment-18870619
129
137
  NMatrix.new(:dense, [array.size, array.empty? ? 0 : array[0].size], array.flatten, :float64)
130
138
  else
@@ -136,7 +144,7 @@ module TfIdfSimilarity
136
144
  # @return [GSL::Matrix,NArray,NMatrix,Matrix] the product
137
145
  def multiply_self(matrix)
138
146
  case @library
139
- when :nmatrix
147
+ when :nmatrix, :numo
140
148
  matrix.transpose.dot(matrix)
141
149
  else
142
150
  matrix.transpose * matrix
@@ -149,6 +157,8 @@ module TfIdfSimilarity
149
157
  GSL::Sf::log(number)
150
158
  when :narray
151
159
  NMath.log(number)
160
+ when :numo
161
+ Numo::NMath.log(number)
152
162
  else
153
163
  Math.log(number)
154
164
  end
@@ -158,6 +168,8 @@ module TfIdfSimilarity
158
168
  case @library
159
169
  when :narray
160
170
  NMath.sqrt(number)
171
+ when :numo
172
+ Numo::NMath.sqrt(number)
161
173
  else
162
174
  Math.sqrt(number)
163
175
  end
@@ -15,7 +15,7 @@ module TfIdfSimilarity
15
15
  array = Array.new(terms.size) do |i|
16
16
  idf = inverse_document_frequency(terms[i])
17
17
  Array.new(documents.size) do |j|
18
- term_frequency(documents[j], terms[i]) * idf
18
+ (term_frequency(documents[j], terms[i]) * idf).to_f
19
19
  end
20
20
  end
21
21
 
@@ -37,6 +37,8 @@ module TfIdfSimilarity
37
37
  case @library
38
38
  when :gsl, :narray
39
39
  row(index).where.size
40
+ when :numo
41
+ (row(index).ne 0).where.size
40
42
  when :nmatrix
41
43
  row(index).each.count(&:nonzero?)
42
44
  else
@@ -57,7 +59,7 @@ module TfIdfSimilarity
57
59
  index = terms.index(term)
58
60
  if index
59
61
  case @library
60
- when :gsl, :narray
62
+ when :gsl, :narray, :numo
61
63
  row(index).sum
62
64
  when :nmatrix
63
65
  row(index).each.reduce(0, :+) # NMatrix's `sum` method is slower
@@ -1,3 +1,3 @@
1
1
  module TfIdfSimilarity
2
- VERSION = "0.2.0"
2
+ VERSION = "0.3.0"
3
3
  end
@@ -82,7 +82,12 @@ module TfIdfSimilarity
82
82
 
83
83
  describe '#term_frequency_inverse_document_frequency' do
84
84
  it 'should return negative infinity' do
85
- model.tfidf(document, 'foo').should be_nan
85
+ case MATRIX_LIBRARY
86
+ when :numo
87
+ model.tfidf(document, 'foo').isnan.should eq 1
88
+ else
89
+ model.tfidf(document, 'foo').should be_nan
90
+ end
86
91
  end
87
92
  end
88
93
 
@@ -147,7 +152,7 @@ module TfIdfSimilarity
147
152
  end
148
153
 
149
154
  it 'should return the term frequency if tokens given' do
150
- model.tf(document_with_tokens, 'foo-foo').should == (1 * 2.2) / (1 + 0.3 + 0.9 * 4 / 5.5)
155
+ model.tf(document_with_tokens, 'foo-foo').should == (1 * 2.2) / (1 + 0.3 + 0.9 * 3 / 5.5)
151
156
  end
152
157
 
153
158
  it 'should return no term frequency if no text given' do
@@ -155,7 +160,7 @@ module TfIdfSimilarity
155
160
  end
156
161
 
157
162
  it 'should return the term frequency if term counts given' do
158
- model.tf(document_with_term_counts, 'bar').should == (5 * 2.2) / (5 + 0.3 + 0.9 * 4 / 5.5)
163
+ model.tf(document_with_term_counts, 'bar').should == (5 * 2.2) / (5 + 0.3 + 0.9 * 15 / 5.5)
159
164
  end
160
165
 
161
166
  it 'should return the term frequency of a non-occurring term' do
@@ -163,7 +168,7 @@ module TfIdfSimilarity
163
168
  end
164
169
 
165
170
  it 'should return the term frequency in a non-occurring document' do
166
- model.tf(non_corpus_document, 'foo').should == (3 * 2.2) / (3 + 0.3 + 0.9 * 4 / 5.5)
171
+ model.tf(non_corpus_document, 'foo').should == (3 * 2.2) / (3 + 0.3 + 0.9 * 3 / 5.5)
167
172
  end
168
173
  end
169
174
 
@@ -177,17 +182,17 @@ module TfIdfSimilarity
177
182
  end
178
183
 
179
184
  it 'should return the tf*idf in a non-occurring term' do
180
- model.tfidf(non_corpus_document, 'foo').should be_within(0.001).of(Math.log((4 - 1 + 0.5) / (1 + 0.5)) * (3 * 2.2) / (3 + 0.3 + 0.9 * 4 / 5.5))
185
+ model.tfidf(non_corpus_document, 'foo').should be_within(0.001).of(Math.log((4 - 1 + 0.5) / (1 + 0.5)) * (3 * 2.2) / (3 + 0.3 + 0.9 * 3 / 5.5))
181
186
  end
182
187
  end
183
188
 
184
189
  describe '#similarity_matrix' do
185
190
  it 'should return the similarity matrix' do
186
191
  expected = [
187
- 1.0, 0.564, 0.0, 0.479,
188
- 0.564, 1.0, 0.0, 0.540,
192
+ 1.0, 0.558, 0.0, 0.449,
193
+ 0.558, 1.0, 0.0, 0.501,
189
194
  0.0, 0.0, 0.0, 0.0,
190
- 0.479, 0.540, 0.0, 1.0,
195
+ 0.449, 0.501, 0.0, 1.0,
191
196
  ]
192
197
 
193
198
  similarity_matrix_values(model).each_with_index do |value,i|
data/spec/spec_helper.rb CHANGED
@@ -18,6 +18,8 @@ when :gsl
18
18
  require 'gsl'
19
19
  when :narray
20
20
  require 'narray'
21
+ when :numo
22
+ require 'numo/narray'
21
23
  when :nmatrix
22
24
  require 'nmatrix'
23
25
  else
@@ -9,6 +9,7 @@ Gem::Specification.new do |s|
9
9
  s.homepage = "https://github.com/jpmckinney/tf-idf-similarity"
10
10
  s.summary = %q{Calculates the similarity between texts using tf*idf}
11
11
  s.license = 'MIT'
12
+ s.required_ruby_version = '>= 2.4.0'
12
13
 
13
14
  s.files = `git ls-files`.split("\n")
14
15
  s.test_files = `git ls-files -- {test,spec,features}/*`.split("\n")
@@ -19,6 +20,6 @@ Gem::Specification.new do |s|
19
20
 
20
21
  s.add_development_dependency('coveralls')
21
22
  s.add_development_dependency('json', '< 2')
22
- s.add_development_dependency('rake', '< 12')
23
- s.add_development_dependency('rspec', '~> 2.10')
23
+ s.add_development_dependency('rake')
24
+ s.add_development_dependency('rspec', '~> 3.0')
24
25
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: tf-idf-similarity
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.0
4
+ version: 0.3.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - James McKinney
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2019-12-19 00:00:00.000000000 Z
11
+ date: 2024-02-26 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: unicode_utils
@@ -56,30 +56,30 @@ dependencies:
56
56
  name: rake
57
57
  requirement: !ruby/object:Gem::Requirement
58
58
  requirements:
59
- - - "<"
59
+ - - ">="
60
60
  - !ruby/object:Gem::Version
61
- version: '12'
61
+ version: '0'
62
62
  type: :development
63
63
  prerelease: false
64
64
  version_requirements: !ruby/object:Gem::Requirement
65
65
  requirements:
66
- - - "<"
66
+ - - ">="
67
67
  - !ruby/object:Gem::Version
68
- version: '12'
68
+ version: '0'
69
69
  - !ruby/object:Gem::Dependency
70
70
  name: rspec
71
71
  requirement: !ruby/object:Gem::Requirement
72
72
  requirements:
73
73
  - - "~>"
74
74
  - !ruby/object:Gem::Version
75
- version: '2.10'
75
+ version: '3.0'
76
76
  type: :development
77
77
  prerelease: false
78
78
  version_requirements: !ruby/object:Gem::Requirement
79
79
  requirements:
80
80
  - - "~>"
81
81
  - !ruby/object:Gem::Version
82
- version: '2.10'
82
+ version: '3.0'
83
83
  description:
84
84
  email:
85
85
  executables: []
@@ -126,15 +126,14 @@ required_ruby_version: !ruby/object:Gem::Requirement
126
126
  requirements:
127
127
  - - ">="
128
128
  - !ruby/object:Gem::Version
129
- version: '0'
129
+ version: 2.4.0
130
130
  required_rubygems_version: !ruby/object:Gem::Requirement
131
131
  requirements:
132
132
  - - ">="
133
133
  - !ruby/object:Gem::Version
134
134
  version: '0'
135
135
  requirements: []
136
- rubyforge_project:
137
- rubygems_version: 2.7.6
136
+ rubygems_version: 3.0.3.1
138
137
  signing_key:
139
138
  specification_version: 4
140
139
  summary: Calculates the similarity between texts using tf*idf