tf-idf-similarity 0.2.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 605ac457508eaf64a7e583e8a4a71af231d3d9d2f9c30ee82b25fb9f647d1312
4
- data.tar.gz: f24b89dccdcbef3c4fcaa59d15050f064455859c134c550fd6a432346883eb31
3
+ metadata.gz: 6264c3854dae2c9b405e6880fc35161c7e7eb05521b88fa206d7d5f7cad53d73
4
+ data.tar.gz: d7d6c7f38723e35c256e14edecdfd7d4d993f58163a9c533e12f591051ed80dd
5
5
  SHA512:
6
- metadata.gz: a41195c6543dea206baa8ce3e2095437d1df94fabedcc76a8151fa5af5991524d96530710a7216c1fef48a7008f88a43773ce2a2323afa563fa29f5abed9909c
7
- data.tar.gz: aadbb85d6bd74625088d0aa7cb58b4127337d5c1dcc2af13c22664f1562013c59d79d8b3bcc3564a2861dfd968d39770205d3b401114e8bdf870b2ac412fda26
6
+ metadata.gz: 4f97120bc10adc73e504503969c4ee2c486ca5909def7fc69a6ed35c4ceb3ba03294fb60a583f334a8fd83b06fe37d5b13258fa85fd699c47a812d8bdc2ce23f
7
+ data.tar.gz: a84fd8950933635fadd85caeb6a2684dfa989664553c8971f8f1f6d1ec7f7a299388d8aec186facedf6ab9174d125182fdcdfee16facd7317b27af5b462399b9
data/.travis.yml CHANGED
@@ -2,14 +2,42 @@ sudo: false
2
2
  language: ruby
3
3
  cache: bundler
4
4
  rvm:
5
- - 2.0.0
6
- - 2.1.0
7
- - 2.2.0
5
+ - 2.4
6
+ - 2.5
7
+ - 2.6
8
+ - 2.7
9
+ - 3.0
10
+ - 3.1
11
+ - 3.2
12
+ - ruby-head
13
+ matrix:
14
+ exclude:
15
+ # No gem releases since 2017 and failing on new versions.
16
+ # https://rubygems.org/gems/gsl
17
+ # https://rubygems.org/gems/nmatrix
18
+ - rvm: 3.0
19
+ env: MATRIX_LIBRARY=gsl
20
+ - rvm: 3.1
21
+ env: MATRIX_LIBRARY=gsl
22
+ - rvm: 3.2
23
+ env: MATRIX_LIBRARY=gsl
24
+ - rvm: ruby-head
25
+ env: MATRIX_LIBRARY=gsl
26
+ - rvm: 3.2
27
+ env: MATRIX_LIBRARY=nmatrix
28
+ - rvm: ruby-head
29
+ env: MATRIX_LIBRARY=nmatrix
30
+ allow_failures:
31
+ - rvm: ruby-head
32
+ env: MATRIX_LIBRARY=matrix
33
+ - rvm: ruby-head
34
+ env: MATRIX_LIBRARY=narray
8
35
  env:
9
36
  - MATRIX_LIBRARY=gsl
10
37
  - MATRIX_LIBRARY=narray
11
38
  - MATRIX_LIBRARY=nmatrix
12
39
  - MATRIX_LIBRARY=matrix
40
+ - MATRIX_LIBRARY=numo
13
41
  addons:
14
42
  apt:
15
43
  packages:
data/Gemfile CHANGED
@@ -1,8 +1,10 @@
1
1
  source 'https://rubygems.org'
2
2
 
3
3
  gem 'rb-gsl', '~> 1.16.0.2' if ENV['MATRIX_LIBRARY'] == 'gsl'
4
+ gem 'matrix' if ENV['MATRIX_LIBRARY'] == 'matrix'
4
5
  gem 'narray', '~> 0.6.0.0' if ENV['MATRIX_LIBRARY'] == 'narray'
5
- gem 'nmatrix', '~> 0.1.0.rc5' if ENV['MATRIX_LIBRARY'] == 'nmatrix' && RUBY_VERSION >= '1.9'
6
+ gem 'nmatrix', '~> 0.2' if ENV['MATRIX_LIBRARY'] == 'nmatrix'
7
+ gem 'numo-narray', '~> 0.9.2.1' if ENV['MATRIX_LIBRARY'] == 'numo'
6
8
 
7
9
  # Specify your gem's dependencies in the gemspec
8
10
  gemspec
@@ -22,8 +22,12 @@ module TfIdfSimilarity
22
22
  #
23
23
  # @note Like Lucene, we use a b value of 0.75 and a k1 value of 1.2.
24
24
  def term_frequency(document, term)
25
- tf = document.term_count(term)
26
- (tf * 2.2) / (tf + 0.3 + 0.9 * documents.size / @model.average_document_size)
25
+ if @model.average_document_size.zero?
26
+ Float::NAN
27
+ else
28
+ tf = document.term_count(term)
29
+ (tf * 2.2) / (tf + 0.3 + 0.9 * document.size / @model.average_document_size)
30
+ end
27
31
  end
28
32
  alias_method :tf, :term_frequency
29
33
  end
@@ -17,6 +17,10 @@ module TfIdfSimilarity
17
17
  norm = NMath.sqrt((@matrix ** 2).sum(1).reshape(@matrix.shape[0], 1))
18
18
  norm[norm.where2[1]] = 1.0 # avoid division by zero
19
19
  NMatrix.refer(@matrix / norm) # must be NMatrix for matrix multiplication
20
+ when :numo
21
+ norm = Numo::NMath.sqrt((@matrix ** 2).sum(0).reshape(1, @matrix.shape[1]))
22
+ norm[(norm.eq 0).where] = 1.0 # avoid division by zero
23
+ (@matrix / norm)
20
24
  when :nmatrix # @see https://github.com/SciRuby/nmatrix/issues/38
21
25
  normal = NMatrix.new(:dense, @matrix.shape, 0, :float64)
22
26
  (0...@matrix.shape[1]).each do |j|
@@ -44,7 +48,7 @@ module TfIdfSimilarity
44
48
  # @param [Integer] column index
45
49
  def get(i, j)
46
50
  case @library
47
- when :narray
51
+ when :narray, :numo
48
52
  @matrix[j, i]
49
53
  else
50
54
  @matrix[i, j]
@@ -57,6 +61,8 @@ module TfIdfSimilarity
57
61
  case @library
58
62
  when :narray
59
63
  @matrix[true, index]
64
+ when :numo
65
+ @matrix[index, true]
60
66
  else
61
67
  @matrix.row(index)
62
68
  end
@@ -66,7 +72,7 @@ module TfIdfSimilarity
66
72
  # @return [GSL::Vector::View,NArray,NMatrix,Vector] a column
67
73
  def column(index)
68
74
  case @library
69
- when :narray
75
+ when :narray, :numo
70
76
  @matrix[index, true]
71
77
  else
72
78
  @matrix.column(index)
@@ -78,7 +84,7 @@ module TfIdfSimilarity
78
84
  case @library
79
85
  when :gsl, :nmatrix
80
86
  @matrix.shape[0]
81
- when :narray
87
+ when :narray, :numo
82
88
  @matrix.shape[1]
83
89
  else
84
90
  @matrix.row_size
@@ -90,7 +96,7 @@ module TfIdfSimilarity
90
96
  case @library
91
97
  when :gsl, :nmatrix
92
98
  @matrix.shape[1]
93
- when :narray
99
+ when :narray, :numo
94
100
  @matrix.shape[0]
95
101
  else
96
102
  @matrix.column_size
@@ -110,7 +116,7 @@ module TfIdfSimilarity
110
116
  # @return [Float] the sum of all values in the matrix
111
117
  def sum
112
118
  case @library
113
- when :narray
119
+ when :narray, :numo
114
120
  @matrix.sum
115
121
  else
116
122
  values.reduce(0, :+)
@@ -125,6 +131,8 @@ module TfIdfSimilarity
125
131
  GSL::Matrix[*array]
126
132
  when :narray
127
133
  NArray[*array]
134
+ when :numo
135
+ Numo::DFloat[*array]
128
136
  when :nmatrix # @see https://github.com/SciRuby/nmatrix/issues/91#issuecomment-18870619
129
137
  NMatrix.new(:dense, [array.size, array.empty? ? 0 : array[0].size], array.flatten, :float64)
130
138
  else
@@ -136,7 +144,7 @@ module TfIdfSimilarity
136
144
  # @return [GSL::Matrix,NArray,NMatrix,Matrix] the product
137
145
  def multiply_self(matrix)
138
146
  case @library
139
- when :nmatrix
147
+ when :nmatrix, :numo
140
148
  matrix.transpose.dot(matrix)
141
149
  else
142
150
  matrix.transpose * matrix
@@ -149,6 +157,8 @@ module TfIdfSimilarity
149
157
  GSL::Sf::log(number)
150
158
  when :narray
151
159
  NMath.log(number)
160
+ when :numo
161
+ Numo::NMath.log(number)
152
162
  else
153
163
  Math.log(number)
154
164
  end
@@ -158,6 +168,8 @@ module TfIdfSimilarity
158
168
  case @library
159
169
  when :narray
160
170
  NMath.sqrt(number)
171
+ when :numo
172
+ Numo::NMath.sqrt(number)
161
173
  else
162
174
  Math.sqrt(number)
163
175
  end
@@ -15,7 +15,7 @@ module TfIdfSimilarity
15
15
  array = Array.new(terms.size) do |i|
16
16
  idf = inverse_document_frequency(terms[i])
17
17
  Array.new(documents.size) do |j|
18
- term_frequency(documents[j], terms[i]) * idf
18
+ (term_frequency(documents[j], terms[i]) * idf).to_f
19
19
  end
20
20
  end
21
21
 
@@ -37,6 +37,8 @@ module TfIdfSimilarity
37
37
  case @library
38
38
  when :gsl, :narray
39
39
  row(index).where.size
40
+ when :numo
41
+ (row(index).ne 0).where.size
40
42
  when :nmatrix
41
43
  row(index).each.count(&:nonzero?)
42
44
  else
@@ -57,7 +59,7 @@ module TfIdfSimilarity
57
59
  index = terms.index(term)
58
60
  if index
59
61
  case @library
60
- when :gsl, :narray
62
+ when :gsl, :narray, :numo
61
63
  row(index).sum
62
64
  when :nmatrix
63
65
  row(index).each.reduce(0, :+) # NMatrix's `sum` method is slower
@@ -1,3 +1,3 @@
1
1
  module TfIdfSimilarity
2
- VERSION = "0.2.0"
2
+ VERSION = "0.3.0"
3
3
  end
@@ -82,7 +82,12 @@ module TfIdfSimilarity
82
82
 
83
83
  describe '#term_frequency_inverse_document_frequency' do
84
84
  it 'should return negative infinity' do
85
- model.tfidf(document, 'foo').should be_nan
85
+ case MATRIX_LIBRARY
86
+ when :numo
87
+ model.tfidf(document, 'foo').isnan.should eq 1
88
+ else
89
+ model.tfidf(document, 'foo').should be_nan
90
+ end
86
91
  end
87
92
  end
88
93
 
@@ -147,7 +152,7 @@ module TfIdfSimilarity
147
152
  end
148
153
 
149
154
  it 'should return the term frequency if tokens given' do
150
- model.tf(document_with_tokens, 'foo-foo').should == (1 * 2.2) / (1 + 0.3 + 0.9 * 4 / 5.5)
155
+ model.tf(document_with_tokens, 'foo-foo').should == (1 * 2.2) / (1 + 0.3 + 0.9 * 3 / 5.5)
151
156
  end
152
157
 
153
158
  it 'should return no term frequency if no text given' do
@@ -155,7 +160,7 @@ module TfIdfSimilarity
155
160
  end
156
161
 
157
162
  it 'should return the term frequency if term counts given' do
158
- model.tf(document_with_term_counts, 'bar').should == (5 * 2.2) / (5 + 0.3 + 0.9 * 4 / 5.5)
163
+ model.tf(document_with_term_counts, 'bar').should == (5 * 2.2) / (5 + 0.3 + 0.9 * 15 / 5.5)
159
164
  end
160
165
 
161
166
  it 'should return the term frequency of a non-occurring term' do
@@ -163,7 +168,7 @@ module TfIdfSimilarity
163
168
  end
164
169
 
165
170
  it 'should return the term frequency in a non-occurring document' do
166
- model.tf(non_corpus_document, 'foo').should == (3 * 2.2) / (3 + 0.3 + 0.9 * 4 / 5.5)
171
+ model.tf(non_corpus_document, 'foo').should == (3 * 2.2) / (3 + 0.3 + 0.9 * 3 / 5.5)
167
172
  end
168
173
  end
169
174
 
@@ -177,17 +182,17 @@ module TfIdfSimilarity
177
182
  end
178
183
 
179
184
  it 'should return the tf*idf in a non-occurring term' do
180
- model.tfidf(non_corpus_document, 'foo').should be_within(0.001).of(Math.log((4 - 1 + 0.5) / (1 + 0.5)) * (3 * 2.2) / (3 + 0.3 + 0.9 * 4 / 5.5))
185
+ model.tfidf(non_corpus_document, 'foo').should be_within(0.001).of(Math.log((4 - 1 + 0.5) / (1 + 0.5)) * (3 * 2.2) / (3 + 0.3 + 0.9 * 3 / 5.5))
181
186
  end
182
187
  end
183
188
 
184
189
  describe '#similarity_matrix' do
185
190
  it 'should return the similarity matrix' do
186
191
  expected = [
187
- 1.0, 0.564, 0.0, 0.479,
188
- 0.564, 1.0, 0.0, 0.540,
192
+ 1.0, 0.558, 0.0, 0.449,
193
+ 0.558, 1.0, 0.0, 0.501,
189
194
  0.0, 0.0, 0.0, 0.0,
190
- 0.479, 0.540, 0.0, 1.0,
195
+ 0.449, 0.501, 0.0, 1.0,
191
196
  ]
192
197
 
193
198
  similarity_matrix_values(model).each_with_index do |value,i|
data/spec/spec_helper.rb CHANGED
@@ -18,6 +18,8 @@ when :gsl
18
18
  require 'gsl'
19
19
  when :narray
20
20
  require 'narray'
21
+ when :numo
22
+ require 'numo/narray'
21
23
  when :nmatrix
22
24
  require 'nmatrix'
23
25
  else
@@ -9,6 +9,7 @@ Gem::Specification.new do |s|
9
9
  s.homepage = "https://github.com/jpmckinney/tf-idf-similarity"
10
10
  s.summary = %q{Calculates the similarity between texts using tf*idf}
11
11
  s.license = 'MIT'
12
+ s.required_ruby_version = '>= 2.4.0'
12
13
 
13
14
  s.files = `git ls-files`.split("\n")
14
15
  s.test_files = `git ls-files -- {test,spec,features}/*`.split("\n")
@@ -19,6 +20,6 @@ Gem::Specification.new do |s|
19
20
 
20
21
  s.add_development_dependency('coveralls')
21
22
  s.add_development_dependency('json', '< 2')
22
- s.add_development_dependency('rake', '< 12')
23
- s.add_development_dependency('rspec', '~> 2.10')
23
+ s.add_development_dependency('rake')
24
+ s.add_development_dependency('rspec', '~> 3.0')
24
25
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: tf-idf-similarity
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.0
4
+ version: 0.3.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - James McKinney
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2019-12-19 00:00:00.000000000 Z
11
+ date: 2024-02-26 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: unicode_utils
@@ -56,30 +56,30 @@ dependencies:
56
56
  name: rake
57
57
  requirement: !ruby/object:Gem::Requirement
58
58
  requirements:
59
- - - "<"
59
+ - - ">="
60
60
  - !ruby/object:Gem::Version
61
- version: '12'
61
+ version: '0'
62
62
  type: :development
63
63
  prerelease: false
64
64
  version_requirements: !ruby/object:Gem::Requirement
65
65
  requirements:
66
- - - "<"
66
+ - - ">="
67
67
  - !ruby/object:Gem::Version
68
- version: '12'
68
+ version: '0'
69
69
  - !ruby/object:Gem::Dependency
70
70
  name: rspec
71
71
  requirement: !ruby/object:Gem::Requirement
72
72
  requirements:
73
73
  - - "~>"
74
74
  - !ruby/object:Gem::Version
75
- version: '2.10'
75
+ version: '3.0'
76
76
  type: :development
77
77
  prerelease: false
78
78
  version_requirements: !ruby/object:Gem::Requirement
79
79
  requirements:
80
80
  - - "~>"
81
81
  - !ruby/object:Gem::Version
82
- version: '2.10'
82
+ version: '3.0'
83
83
  description:
84
84
  email:
85
85
  executables: []
@@ -126,15 +126,14 @@ required_ruby_version: !ruby/object:Gem::Requirement
126
126
  requirements:
127
127
  - - ">="
128
128
  - !ruby/object:Gem::Version
129
- version: '0'
129
+ version: 2.4.0
130
130
  required_rubygems_version: !ruby/object:Gem::Requirement
131
131
  requirements:
132
132
  - - ">="
133
133
  - !ruby/object:Gem::Version
134
134
  version: '0'
135
135
  requirements: []
136
- rubyforge_project:
137
- rubygems_version: 2.7.6
136
+ rubygems_version: 3.0.3.1
138
137
  signing_key:
139
138
  specification_version: 4
140
139
  summary: Calculates the similarity between texts using tf*idf