tf-idf-similarity 0.1.6 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
- SHA1:
3
- metadata.gz: 03431fb16064caa54fe9cbfc17a151acb1a25fa5
4
- data.tar.gz: be2e97b63e14244925937ee71fc8dc60c88dfce4
2
+ SHA256:
3
+ metadata.gz: 6264c3854dae2c9b405e6880fc35161c7e7eb05521b88fa206d7d5f7cad53d73
4
+ data.tar.gz: d7d6c7f38723e35c256e14edecdfd7d4d993f58163a9c533e12f591051ed80dd
5
5
  SHA512:
6
- metadata.gz: f615fae6cfad994fa25c85b1f3d6882742944e7bb5894ae3fcf6b4c9d7b34647b0da1b3914f127eb26e46a299c0f8a4e9d64bc05a7cb1c429663beaf657704eb
7
- data.tar.gz: 317ea7c5a1a72e53419f2eadb5b4789bccbe29f0f7bf742f89e9ed9ffb210b43a78180ebef818baf497a48911e0f25897e6906251c45cd787d61c5da43cbbb92
6
+ metadata.gz: 4f97120bc10adc73e504503969c4ee2c486ca5909def7fc69a6ed35c4ceb3ba03294fb60a583f334a8fd83b06fe37d5b13258fa85fd699c47a812d8bdc2ce23f
7
+ data.tar.gz: a84fd8950933635fadd85caeb6a2684dfa989664553c8971f8f1f6d1ec7f7a299388d8aec186facedf6ab9174d125182fdcdfee16facd7317b27af5b462399b9
data/.gitignore CHANGED
@@ -4,3 +4,4 @@
4
4
  Gemfile.lock
5
5
  doc/*
6
6
  pkg/*
7
+ coverage/*
data/.travis.yml CHANGED
@@ -2,14 +2,42 @@ sudo: false
2
2
  language: ruby
3
3
  cache: bundler
4
4
  rvm:
5
- - 2.0.0
6
- - 2.1.0
7
- - 2.2.0
5
+ - 2.4
6
+ - 2.5
7
+ - 2.6
8
+ - 2.7
9
+ - 3.0
10
+ - 3.1
11
+ - 3.2
12
+ - ruby-head
13
+ matrix:
14
+ exclude:
15
+ # No gem releases since 2017 and failing on new versions.
16
+ # https://rubygems.org/gems/gsl
17
+ # https://rubygems.org/gems/nmatrix
18
+ - rvm: 3.0
19
+ env: MATRIX_LIBRARY=gsl
20
+ - rvm: 3.1
21
+ env: MATRIX_LIBRARY=gsl
22
+ - rvm: 3.2
23
+ env: MATRIX_LIBRARY=gsl
24
+ - rvm: ruby-head
25
+ env: MATRIX_LIBRARY=gsl
26
+ - rvm: 3.2
27
+ env: MATRIX_LIBRARY=nmatrix
28
+ - rvm: ruby-head
29
+ env: MATRIX_LIBRARY=nmatrix
30
+ allow_failures:
31
+ - rvm: ruby-head
32
+ env: MATRIX_LIBRARY=matrix
33
+ - rvm: ruby-head
34
+ env: MATRIX_LIBRARY=narray
8
35
  env:
9
36
  - MATRIX_LIBRARY=gsl
10
37
  - MATRIX_LIBRARY=narray
11
38
  - MATRIX_LIBRARY=nmatrix
12
39
  - MATRIX_LIBRARY=matrix
40
+ - MATRIX_LIBRARY=numo
13
41
  addons:
14
42
  apt:
15
43
  packages:
@@ -18,7 +46,7 @@ addons:
18
46
  # Installing ATLAS will install BLAS.
19
47
  - libatlas-dev
20
48
  - libatlas-base-dev
21
- - libatlas3gf-base
49
+ - libatlas3-base
22
50
  before_install:
23
51
  - bundle config build.nmatrix --with-lapacklib
24
52
  - export CPLUS_INCLUDE_PATH=$CPLUS_INCLUDE_PATH:/usr/include/atlas
data/Gemfile CHANGED
@@ -1,8 +1,10 @@
1
1
  source 'https://rubygems.org'
2
2
 
3
3
  gem 'rb-gsl', '~> 1.16.0.2' if ENV['MATRIX_LIBRARY'] == 'gsl'
4
+ gem 'matrix' if ENV['MATRIX_LIBRARY'] == 'matrix'
4
5
  gem 'narray', '~> 0.6.0.0' if ENV['MATRIX_LIBRARY'] == 'narray'
5
- gem 'nmatrix', '~> 0.1.0.rc5' if ENV['MATRIX_LIBRARY'] == 'nmatrix' && RUBY_VERSION >= '1.9'
6
+ gem 'nmatrix', '~> 0.2' if ENV['MATRIX_LIBRARY'] == 'nmatrix'
7
+ gem 'numo-narray', '~> 0.9.2.1' if ENV['MATRIX_LIBRARY'] == 'numo'
6
8
 
7
9
  # Specify your gem's dependencies in the gemspec
8
10
  gemspec
data/README.md CHANGED
@@ -1,12 +1,11 @@
1
- # Ruby Vector Space Model (VSM) with tf*idf weights
1
+ # Ruby Vector Space Model (VSM) with tf\*idf weights
2
2
 
3
3
  [![Gem Version](https://badge.fury.io/rb/tf-idf-similarity.svg)](https://badge.fury.io/rb/tf-idf-similarity)
4
4
  [![Build Status](https://secure.travis-ci.org/jpmckinney/tf-idf-similarity.png)](https://travis-ci.org/jpmckinney/tf-idf-similarity)
5
- [![Dependency Status](https://gemnasium.com/jpmckinney/tf-idf-similarity.png)](https://gemnasium.com/jpmckinney/tf-idf-similarity)
6
5
  [![Coverage Status](https://coveralls.io/repos/jpmckinney/tf-idf-similarity/badge.png)](https://coveralls.io/r/jpmckinney/tf-idf-similarity)
7
6
  [![Code Climate](https://codeclimate.com/github/jpmckinney/tf-idf-similarity.png)](https://codeclimate.com/github/jpmckinney/tf-idf-similarity)
8
7
 
9
- Calculates the similarity between texts using a [bag-of-words](https://en.wikipedia.org/wiki/Bag_of_words_model) [Vector Space Model](https://en.wikipedia.org/wiki/Vector_space_model) with [Term Frequency-Inverse Document Frequency (tf*idf)](https://en.wikipedia.org/wiki/Tf–idf) weights. If your use case demands performance, use [Lucene](http://lucene.apache.org/core/) (see below).
8
+ Calculates the similarity between texts using a [bag-of-words](https://en.wikipedia.org/wiki/Bag_of_words_model) [Vector Space Model](https://en.wikipedia.org/wiki/Vector_space_model) with [Term Frequency-Inverse Document Frequency (tf\*idf)](https://en.wikipedia.org/wiki/Tf–idf) weights. If your use case demands performance, use [Lucene](http://lucene.apache.org/core/) (see below).
10
9
 
11
10
  ## Usage
12
11
 
@@ -48,7 +47,7 @@ Find the similarity of two documents in the matrix:
48
47
  matrix[model.document_index(document1), model.document_index(document2)]
49
48
  ```
50
49
 
51
- Print the tf*idf values for terms in a document:
50
+ Print the tf\*idf values for terms in a document:
52
51
 
53
52
  ```ruby
54
53
  tfidf_by_term = {}
@@ -86,6 +85,8 @@ end
86
85
  document1 = TfIdfSimilarity::Document.new(text, :term_counts => term_counts, :size => size)
87
86
  ```
88
87
 
88
+ Or, use your own classes for the tokenizer and tokens, like in [this example](https://gist.github.com/satoryu/0183a4eba365cc67e28988a09f3035b3).
89
+
89
90
  [Read the documentation at RubyDoc.info.](http://rubydoc.info/gems/tf-idf-similarity)
90
91
 
91
92
  ## Troubleshooting
@@ -114,11 +115,11 @@ You can access more term frequency, document frequency, and normalization formul
114
115
  require 'tf-idf-similarity/extras/document'
115
116
  require 'tf-idf-similarity/extras/tf_idf_model'
116
117
 
117
- The default tf*idf formula follows the [Lucene Conceptual Scoring Formula](http://lucene.apache.org/core/4_0_0/core/org/apache/lucene/search/similarities/TFIDFSimilarity.html).
118
+ The default tf\*idf formula follows the [Lucene Conceptual Scoring Formula](http://lucene.apache.org/core/4_0_0/core/org/apache/lucene/search/similarities/TFIDFSimilarity.html).
118
119
 
119
120
  ## Why?
120
121
 
121
- At the time of writing, no other Ruby gem implemented the tf*idf formula used by Lucene, Sphinx and Ferret.
122
+ At the time of writing, no other Ruby gem implemented the tf\*idf formula used by Lucene, Sphinx and Ferret.
122
123
 
123
124
  * [rsemantic](https://github.com/josephwilk/rsemantic) now uses the same [term frequency](https://github.com/josephwilk/rsemantic/blob/master/lib/semantic/transform/tf_idf_transform.rb#L14) and [document frequency](https://github.com/josephwilk/rsemantic/blob/master/lib/semantic/transform/tf_idf_transform.rb#L13) formulas as Lucene.
124
125
  * [treat](https://github.com/louismullie/treat) offers many term frequency formulas, [one of which](https://github.com/louismullie/treat/blob/master/lib/treat/workers/extractors/tf_idf/native.rb#L13) is the same as Lucene.
@@ -22,8 +22,12 @@ module TfIdfSimilarity
22
22
  #
23
23
  # @note Like Lucene, we use a b value of 0.75 and a k1 value of 1.2.
24
24
  def term_frequency(document, term)
25
- tf = document.term_count(term)
26
- (tf * 2.2) / (tf + 0.3 + 0.9 * documents.size / @model.average_document_size)
25
+ if @model.average_document_size.zero?
26
+ Float::NAN
27
+ else
28
+ tf = document.term_count(term)
29
+ (tf * 2.2) / (tf + 0.3 + 0.9 * document.size / @model.average_document_size)
30
+ end
27
31
  end
28
32
  alias_method :tf, :term_frequency
29
33
  end
@@ -1,3 +1,5 @@
1
+ require 'tf-idf-similarity/tokenizer'
2
+
1
3
  # A document.
2
4
  module TfIdfSimilarity
3
5
  class Document
@@ -19,7 +21,8 @@ module TfIdfSimilarity
19
21
  def initialize(text, opts = {})
20
22
  @text = text
21
23
  @id = opts[:id] || object_id
22
- @tokens = opts[:tokens]
24
+ @tokens = Array(opts[:tokens]).map { |t| Token.new(t) } if opts[:tokens]
25
+ @tokenizer = opts[:tokenizer] || Tokenizer.new
23
26
 
24
27
  if opts[:term_counts]
25
28
  @term_counts = opts[:term_counts]
@@ -51,10 +54,9 @@ module TfIdfSimilarity
51
54
 
52
55
  # Tokenizes the text and counts terms and total tokens.
53
56
  def set_term_counts_and_size
54
- tokenize(text).each do |word|
55
- token = Token.new(word)
57
+ tokenize(text).each do |token|
56
58
  if token.valid?
57
- term = token.lowercase_filter.classic_filter.to_s
59
+ term = token.to_s
58
60
  @term_counts[term] += 1
59
61
  @size += 1
60
62
  end
@@ -76,7 +78,7 @@ module TfIdfSimilarity
76
78
  # @see http://unicode.org/reports/tr29/#Default_Word_Boundaries
77
79
  # @see http://wiki.apache.org/solr/AnalyzersTokenizersTokenFilters#solr.StandardTokenizerFactory
78
80
  def tokenize(text)
79
- @tokens || UnicodeUtils.each_word(text)
81
+ @tokens || @tokenizer.tokenize(text)
80
82
  end
81
83
  end
82
84
  end
@@ -17,6 +17,10 @@ module TfIdfSimilarity
17
17
  norm = NMath.sqrt((@matrix ** 2).sum(1).reshape(@matrix.shape[0], 1))
18
18
  norm[norm.where2[1]] = 1.0 # avoid division by zero
19
19
  NMatrix.refer(@matrix / norm) # must be NMatrix for matrix multiplication
20
+ when :numo
21
+ norm = Numo::NMath.sqrt((@matrix ** 2).sum(0).reshape(1, @matrix.shape[1]))
22
+ norm[(norm.eq 0).where] = 1.0 # avoid division by zero
23
+ (@matrix / norm)
20
24
  when :nmatrix # @see https://github.com/SciRuby/nmatrix/issues/38
21
25
  normal = NMatrix.new(:dense, @matrix.shape, 0, :float64)
22
26
  (0...@matrix.shape[1]).each do |j|
@@ -44,7 +48,7 @@ module TfIdfSimilarity
44
48
  # @param [Integer] column index
45
49
  def get(i, j)
46
50
  case @library
47
- when :narray
51
+ when :narray, :numo
48
52
  @matrix[j, i]
49
53
  else
50
54
  @matrix[i, j]
@@ -57,6 +61,8 @@ module TfIdfSimilarity
57
61
  case @library
58
62
  when :narray
59
63
  @matrix[true, index]
64
+ when :numo
65
+ @matrix[index, true]
60
66
  else
61
67
  @matrix.row(index)
62
68
  end
@@ -66,7 +72,7 @@ module TfIdfSimilarity
66
72
  # @return [GSL::Vector::View,NArray,NMatrix,Vector] a column
67
73
  def column(index)
68
74
  case @library
69
- when :narray
75
+ when :narray, :numo
70
76
  @matrix[index, true]
71
77
  else
72
78
  @matrix.column(index)
@@ -78,7 +84,7 @@ module TfIdfSimilarity
78
84
  case @library
79
85
  when :gsl, :nmatrix
80
86
  @matrix.shape[0]
81
- when :narray
87
+ when :narray, :numo
82
88
  @matrix.shape[1]
83
89
  else
84
90
  @matrix.row_size
@@ -90,7 +96,7 @@ module TfIdfSimilarity
90
96
  case @library
91
97
  when :gsl, :nmatrix
92
98
  @matrix.shape[1]
93
- when :narray
99
+ when :narray, :numo
94
100
  @matrix.shape[0]
95
101
  else
96
102
  @matrix.column_size
@@ -110,7 +116,7 @@ module TfIdfSimilarity
110
116
  # @return [Float] the sum of all values in the matrix
111
117
  def sum
112
118
  case @library
113
- when :narray
119
+ when :narray, :numo
114
120
  @matrix.sum
115
121
  else
116
122
  values.reduce(0, :+)
@@ -125,6 +131,8 @@ module TfIdfSimilarity
125
131
  GSL::Matrix[*array]
126
132
  when :narray
127
133
  NArray[*array]
134
+ when :numo
135
+ Numo::DFloat[*array]
128
136
  when :nmatrix # @see https://github.com/SciRuby/nmatrix/issues/91#issuecomment-18870619
129
137
  NMatrix.new(:dense, [array.size, array.empty? ? 0 : array[0].size], array.flatten, :float64)
130
138
  else
@@ -136,7 +144,7 @@ module TfIdfSimilarity
136
144
  # @return [GSL::Matrix,NArray,NMatrix,Matrix] the product
137
145
  def multiply_self(matrix)
138
146
  case @library
139
- when :nmatrix
147
+ when :nmatrix, :numo
140
148
  matrix.transpose.dot(matrix)
141
149
  else
142
150
  matrix.transpose * matrix
@@ -149,6 +157,8 @@ module TfIdfSimilarity
149
157
  GSL::Sf::log(number)
150
158
  when :narray
151
159
  NMath.log(number)
160
+ when :numo
161
+ Numo::NMath.log(number)
152
162
  else
153
163
  Math.log(number)
154
164
  end
@@ -158,6 +168,8 @@ module TfIdfSimilarity
158
168
  case @library
159
169
  when :narray
160
170
  NMath.sqrt(number)
171
+ when :numo
172
+ Numo::NMath.sqrt(number)
161
173
  else
162
174
  Math.sqrt(number)
163
175
  end
@@ -15,7 +15,7 @@ module TfIdfSimilarity
15
15
  array = Array.new(terms.size) do |i|
16
16
  idf = inverse_document_frequency(terms[i])
17
17
  Array.new(documents.size) do |j|
18
- term_frequency(documents[j], terms[i]) * idf
18
+ (term_frequency(documents[j], terms[i]) * idf).to_f
19
19
  end
20
20
  end
21
21
 
@@ -37,6 +37,8 @@ module TfIdfSimilarity
37
37
  case @library
38
38
  when :gsl, :narray
39
39
  row(index).where.size
40
+ when :numo
41
+ (row(index).ne 0).where.size
40
42
  when :nmatrix
41
43
  row(index).each.count(&:nonzero?)
42
44
  else
@@ -57,7 +59,7 @@ module TfIdfSimilarity
57
59
  index = terms.index(term)
58
60
  if index
59
61
  case @library
60
- when :gsl, :narray
62
+ when :gsl, :narray, :numo
61
63
  row(index).sum
62
64
  when :nmatrix
63
65
  row(index).each.reduce(0, :+) # NMatrix's `sum` method is slower
@@ -1,5 +1,7 @@
1
1
  # coding: utf-8
2
2
  require 'delegate'
3
+ require 'unicode_utils/downcase'
4
+ require 'unicode_utils/each_word'
3
5
 
4
6
  # A token.
5
7
  #
@@ -47,5 +49,10 @@ module TfIdfSimilarity
47
49
  def classic_filter
48
50
  self.class.new(self.gsub('.', '').sub(/['`’]s\z/, ''))
49
51
  end
52
+
53
+ def to_s
54
+ # Don't call #lowercase_filter and #classic_filter to avoid creating unnecessary objects.
55
+ UnicodeUtils.downcase(self).gsub('.', '').sub(/['`’]s\z/, '')
56
+ end
50
57
  end
51
58
  end
@@ -0,0 +1,19 @@
1
+ require 'unicode_utils/each_word'
2
+ require 'tf-idf-similarity/token'
3
+
4
+ # A tokenizer using UnicodeUtils to tokenize a text.
5
+ #
6
+ # @see https://github.com/lang/unicode_utils
7
+ module TfIdfSimilarity
8
+ class Tokenizer
9
+ # Tokenizes a text.
10
+ #
11
+ # @param [String] text
12
+ # @return [Enumerator] an enumerator of Token objects
13
+ def tokenize(text)
14
+ UnicodeUtils.each_word(text).map do |word|
15
+ Token.new(word)
16
+ end
17
+ end
18
+ end
19
+ end
@@ -1,3 +1,3 @@
1
1
  module TfIdfSimilarity
2
- VERSION = "0.1.6"
2
+ VERSION = "0.3.0"
3
3
  end
@@ -1,9 +1,6 @@
1
1
  require 'forwardable'
2
2
  require 'set'
3
3
 
4
- require 'unicode_utils/downcase'
5
- require 'unicode_utils/each_word'
6
-
7
4
  module TfIdfSimilarity
8
5
  end
9
6
 
@@ -82,7 +82,12 @@ module TfIdfSimilarity
82
82
 
83
83
  describe '#term_frequency_inverse_document_frequency' do
84
84
  it 'should return negative infinity' do
85
- model.tfidf(document, 'foo').should be_nan
85
+ case MATRIX_LIBRARY
86
+ when :numo
87
+ model.tfidf(document, 'foo').isnan.should eq 1
88
+ else
89
+ model.tfidf(document, 'foo').should be_nan
90
+ end
86
91
  end
87
92
  end
88
93
 
@@ -147,7 +152,7 @@ module TfIdfSimilarity
147
152
  end
148
153
 
149
154
  it 'should return the term frequency if tokens given' do
150
- model.tf(document_with_tokens, 'foo-foo').should == (1 * 2.2) / (1 + 0.3 + 0.9 * 4 / 5.5)
155
+ model.tf(document_with_tokens, 'foo-foo').should == (1 * 2.2) / (1 + 0.3 + 0.9 * 3 / 5.5)
151
156
  end
152
157
 
153
158
  it 'should return no term frequency if no text given' do
@@ -155,7 +160,7 @@ module TfIdfSimilarity
155
160
  end
156
161
 
157
162
  it 'should return the term frequency if term counts given' do
158
- model.tf(document_with_term_counts, 'bar').should == (5 * 2.2) / (5 + 0.3 + 0.9 * 4 / 5.5)
163
+ model.tf(document_with_term_counts, 'bar').should == (5 * 2.2) / (5 + 0.3 + 0.9 * 15 / 5.5)
159
164
  end
160
165
 
161
166
  it 'should return the term frequency of a non-occurring term' do
@@ -163,7 +168,7 @@ module TfIdfSimilarity
163
168
  end
164
169
 
165
170
  it 'should return the term frequency in a non-occurring document' do
166
- model.tf(non_corpus_document, 'foo').should == (3 * 2.2) / (3 + 0.3 + 0.9 * 4 / 5.5)
171
+ model.tf(non_corpus_document, 'foo').should == (3 * 2.2) / (3 + 0.3 + 0.9 * 3 / 5.5)
167
172
  end
168
173
  end
169
174
 
@@ -177,17 +182,17 @@ module TfIdfSimilarity
177
182
  end
178
183
 
179
184
  it 'should return the tf*idf in a non-occurring term' do
180
- model.tfidf(non_corpus_document, 'foo').should be_within(0.001).of(Math.log((4 - 1 + 0.5) / (1 + 0.5)) * (3 * 2.2) / (3 + 0.3 + 0.9 * 4 / 5.5))
185
+ model.tfidf(non_corpus_document, 'foo').should be_within(0.001).of(Math.log((4 - 1 + 0.5) / (1 + 0.5)) * (3 * 2.2) / (3 + 0.3 + 0.9 * 3 / 5.5))
181
186
  end
182
187
  end
183
188
 
184
189
  describe '#similarity_matrix' do
185
190
  it 'should return the similarity matrix' do
186
191
  expected = [
187
- 1.0, 0.564, 0.0, 0.479,
188
- 0.564, 1.0, 0.0, 0.540,
192
+ 1.0, 0.558, 0.0, 0.449,
193
+ 0.558, 1.0, 0.0, 0.501,
189
194
  0.0, 0.0, 0.0, 0.0,
190
- 0.479, 0.540, 0.0, 1.0,
195
+ 0.449, 0.501, 0.0, 1.0,
191
196
  ]
192
197
 
193
198
  similarity_matrix_values(model).each_with_index do |value,i|
data/spec/spec_helper.rb CHANGED
@@ -18,6 +18,8 @@ when :gsl
18
18
  require 'gsl'
19
19
  when :narray
20
20
  require 'narray'
21
+ when :numo
22
+ require 'numo/narray'
21
23
  when :nmatrix
22
24
  require 'nmatrix'
23
25
  else
@@ -9,6 +9,7 @@ Gem::Specification.new do |s|
9
9
  s.homepage = "https://github.com/jpmckinney/tf-idf-similarity"
10
10
  s.summary = %q{Calculates the similarity between texts using tf*idf}
11
11
  s.license = 'MIT'
12
+ s.required_ruby_version = '>= 2.4.0'
12
13
 
13
14
  s.files = `git ls-files`.split("\n")
14
15
  s.test_files = `git ls-files -- {test,spec,features}/*`.split("\n")
@@ -19,6 +20,6 @@ Gem::Specification.new do |s|
19
20
 
20
21
  s.add_development_dependency('coveralls')
21
22
  s.add_development_dependency('json', '< 2')
22
- s.add_development_dependency('rake', '< 12')
23
- s.add_development_dependency('rspec', '~> 2.10')
23
+ s.add_development_dependency('rake')
24
+ s.add_development_dependency('rspec', '~> 3.0')
24
25
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: tf-idf-similarity
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.6
4
+ version: 0.3.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - James McKinney
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2017-03-07 00:00:00.000000000 Z
11
+ date: 2024-02-26 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: unicode_utils
@@ -56,30 +56,30 @@ dependencies:
56
56
  name: rake
57
57
  requirement: !ruby/object:Gem::Requirement
58
58
  requirements:
59
- - - "<"
59
+ - - ">="
60
60
  - !ruby/object:Gem::Version
61
- version: '12'
61
+ version: '0'
62
62
  type: :development
63
63
  prerelease: false
64
64
  version_requirements: !ruby/object:Gem::Requirement
65
65
  requirements:
66
- - - "<"
66
+ - - ">="
67
67
  - !ruby/object:Gem::Version
68
- version: '12'
68
+ version: '0'
69
69
  - !ruby/object:Gem::Dependency
70
70
  name: rspec
71
71
  requirement: !ruby/object:Gem::Requirement
72
72
  requirements:
73
73
  - - "~>"
74
74
  - !ruby/object:Gem::Version
75
- version: '2.10'
75
+ version: '3.0'
76
76
  type: :development
77
77
  prerelease: false
78
78
  version_requirements: !ruby/object:Gem::Requirement
79
79
  requirements:
80
80
  - - "~>"
81
81
  - !ruby/object:Gem::Version
82
- version: '2.10'
82
+ version: '3.0'
83
83
  description:
84
84
  email:
85
85
  executables: []
@@ -104,6 +104,7 @@ files:
104
104
  - lib/tf-idf-similarity/term_count_model.rb
105
105
  - lib/tf-idf-similarity/tf_idf_model.rb
106
106
  - lib/tf-idf-similarity/token.rb
107
+ - lib/tf-idf-similarity/tokenizer.rb
107
108
  - lib/tf-idf-similarity/version.rb
108
109
  - spec/bm25_model_spec.rb
109
110
  - spec/document_spec.rb
@@ -125,15 +126,14 @@ required_ruby_version: !ruby/object:Gem::Requirement
125
126
  requirements:
126
127
  - - ">="
127
128
  - !ruby/object:Gem::Version
128
- version: '0'
129
+ version: 2.4.0
129
130
  required_rubygems_version: !ruby/object:Gem::Requirement
130
131
  requirements:
131
132
  - - ">="
132
133
  - !ruby/object:Gem::Version
133
134
  version: '0'
134
135
  requirements: []
135
- rubyforge_project:
136
- rubygems_version: 2.4.5
136
+ rubygems_version: 3.0.3.1
137
137
  signing_key:
138
138
  specification_version: 4
139
139
  summary: Calculates the similarity between texts using tf*idf