tf-idf-similarity 0.1.6 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/.gitignore +1 -0
- data/.travis.yml +32 -4
- data/Gemfile +3 -1
- data/README.md +7 -6
- data/lib/tf-idf-similarity/bm25_model.rb +6 -2
- data/lib/tf-idf-similarity/document.rb +7 -5
- data/lib/tf-idf-similarity/matrix_methods.rb +18 -6
- data/lib/tf-idf-similarity/model.rb +1 -1
- data/lib/tf-idf-similarity/term_count_model.rb +3 -1
- data/lib/tf-idf-similarity/token.rb +7 -0
- data/lib/tf-idf-similarity/tokenizer.rb +19 -0
- data/lib/tf-idf-similarity/version.rb +1 -1
- data/lib/tf-idf-similarity.rb +0 -3
- data/spec/bm25_model_spec.rb +13 -8
- data/spec/spec_helper.rb +2 -0
- data/td-idf-similarity.gemspec +3 -2
- metadata +11 -11
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
|
-
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: 6264c3854dae2c9b405e6880fc35161c7e7eb05521b88fa206d7d5f7cad53d73
|
4
|
+
data.tar.gz: d7d6c7f38723e35c256e14edecdfd7d4d993f58163a9c533e12f591051ed80dd
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 4f97120bc10adc73e504503969c4ee2c486ca5909def7fc69a6ed35c4ceb3ba03294fb60a583f334a8fd83b06fe37d5b13258fa85fd699c47a812d8bdc2ce23f
|
7
|
+
data.tar.gz: a84fd8950933635fadd85caeb6a2684dfa989664553c8971f8f1f6d1ec7f7a299388d8aec186facedf6ab9174d125182fdcdfee16facd7317b27af5b462399b9
|
data/.gitignore
CHANGED
data/.travis.yml
CHANGED
@@ -2,14 +2,42 @@ sudo: false
|
|
2
2
|
language: ruby
|
3
3
|
cache: bundler
|
4
4
|
rvm:
|
5
|
-
- 2.
|
6
|
-
- 2.
|
7
|
-
- 2.
|
5
|
+
- 2.4
|
6
|
+
- 2.5
|
7
|
+
- 2.6
|
8
|
+
- 2.7
|
9
|
+
- 3.0
|
10
|
+
- 3.1
|
11
|
+
- 3.2
|
12
|
+
- ruby-head
|
13
|
+
matrix:
|
14
|
+
exclude:
|
15
|
+
# No gem releases since 2017 and failing on new versions.
|
16
|
+
# https://rubygems.org/gems/gsl
|
17
|
+
# https://rubygems.org/gems/nmatrix
|
18
|
+
- rvm: 3.0
|
19
|
+
env: MATRIX_LIBRARY=gsl
|
20
|
+
- rvm: 3.1
|
21
|
+
env: MATRIX_LIBRARY=gsl
|
22
|
+
- rvm: 3.2
|
23
|
+
env: MATRIX_LIBRARY=gsl
|
24
|
+
- rvm: ruby-head
|
25
|
+
env: MATRIX_LIBRARY=gsl
|
26
|
+
- rvm: 3.2
|
27
|
+
env: MATRIX_LIBRARY=nmatrix
|
28
|
+
- rvm: ruby-head
|
29
|
+
env: MATRIX_LIBRARY=nmatrix
|
30
|
+
allow_failures:
|
31
|
+
- rvm: ruby-head
|
32
|
+
env: MATRIX_LIBRARY=matrix
|
33
|
+
- rvm: ruby-head
|
34
|
+
env: MATRIX_LIBRARY=narray
|
8
35
|
env:
|
9
36
|
- MATRIX_LIBRARY=gsl
|
10
37
|
- MATRIX_LIBRARY=narray
|
11
38
|
- MATRIX_LIBRARY=nmatrix
|
12
39
|
- MATRIX_LIBRARY=matrix
|
40
|
+
- MATRIX_LIBRARY=numo
|
13
41
|
addons:
|
14
42
|
apt:
|
15
43
|
packages:
|
@@ -18,7 +46,7 @@ addons:
|
|
18
46
|
# Installing ATLAS will install BLAS.
|
19
47
|
- libatlas-dev
|
20
48
|
- libatlas-base-dev
|
21
|
-
-
|
49
|
+
- libatlas3-base
|
22
50
|
before_install:
|
23
51
|
- bundle config build.nmatrix --with-lapacklib
|
24
52
|
- export CPLUS_INCLUDE_PATH=$CPLUS_INCLUDE_PATH:/usr/include/atlas
|
data/Gemfile
CHANGED
@@ -1,8 +1,10 @@
|
|
1
1
|
source 'https://rubygems.org'
|
2
2
|
|
3
3
|
gem 'rb-gsl', '~> 1.16.0.2' if ENV['MATRIX_LIBRARY'] == 'gsl'
|
4
|
+
gem 'matrix' if ENV['MATRIX_LIBRARY'] == 'matrix'
|
4
5
|
gem 'narray', '~> 0.6.0.0' if ENV['MATRIX_LIBRARY'] == 'narray'
|
5
|
-
gem 'nmatrix', '~> 0.
|
6
|
+
gem 'nmatrix', '~> 0.2' if ENV['MATRIX_LIBRARY'] == 'nmatrix'
|
7
|
+
gem 'numo-narray', '~> 0.9.2.1' if ENV['MATRIX_LIBRARY'] == 'numo'
|
6
8
|
|
7
9
|
# Specify your gem's dependencies in the gemspec
|
8
10
|
gemspec
|
data/README.md
CHANGED
@@ -1,12 +1,11 @@
|
|
1
|
-
# Ruby Vector Space Model (VSM) with tf
|
1
|
+
# Ruby Vector Space Model (VSM) with tf\*idf weights
|
2
2
|
|
3
3
|
[](https://badge.fury.io/rb/tf-idf-similarity)
|
4
4
|
[](https://travis-ci.org/jpmckinney/tf-idf-similarity)
|
5
|
-
[](https://gemnasium.com/jpmckinney/tf-idf-similarity)
|
6
5
|
[](https://coveralls.io/r/jpmckinney/tf-idf-similarity)
|
7
6
|
[](https://codeclimate.com/github/jpmckinney/tf-idf-similarity)
|
8
7
|
|
9
|
-
Calculates the similarity between texts using a [bag-of-words](https://en.wikipedia.org/wiki/Bag_of_words_model) [Vector Space Model](https://en.wikipedia.org/wiki/Vector_space_model) with [Term Frequency-Inverse Document Frequency (tf
|
8
|
+
Calculates the similarity between texts using a [bag-of-words](https://en.wikipedia.org/wiki/Bag_of_words_model) [Vector Space Model](https://en.wikipedia.org/wiki/Vector_space_model) with [Term Frequency-Inverse Document Frequency (tf\*idf)](https://en.wikipedia.org/wiki/Tf–idf) weights. If your use case demands performance, use [Lucene](http://lucene.apache.org/core/) (see below).
|
10
9
|
|
11
10
|
## Usage
|
12
11
|
|
@@ -48,7 +47,7 @@ Find the similarity of two documents in the matrix:
|
|
48
47
|
matrix[model.document_index(document1), model.document_index(document2)]
|
49
48
|
```
|
50
49
|
|
51
|
-
Print the tf
|
50
|
+
Print the tf\*idf values for terms in a document:
|
52
51
|
|
53
52
|
```ruby
|
54
53
|
tfidf_by_term = {}
|
@@ -86,6 +85,8 @@ end
|
|
86
85
|
document1 = TfIdfSimilarity::Document.new(text, :term_counts => term_counts, :size => size)
|
87
86
|
```
|
88
87
|
|
88
|
+
Or, use your own classes for the tokenizer and tokens, like in [this example](https://gist.github.com/satoryu/0183a4eba365cc67e28988a09f3035b3).
|
89
|
+
|
89
90
|
[Read the documentation at RubyDoc.info.](http://rubydoc.info/gems/tf-idf-similarity)
|
90
91
|
|
91
92
|
## Troubleshooting
|
@@ -114,11 +115,11 @@ You can access more term frequency, document frequency, and normalization formul
|
|
114
115
|
require 'tf-idf-similarity/extras/document'
|
115
116
|
require 'tf-idf-similarity/extras/tf_idf_model'
|
116
117
|
|
117
|
-
The default tf
|
118
|
+
The default tf\*idf formula follows the [Lucene Conceptual Scoring Formula](http://lucene.apache.org/core/4_0_0/core/org/apache/lucene/search/similarities/TFIDFSimilarity.html).
|
118
119
|
|
119
120
|
## Why?
|
120
121
|
|
121
|
-
At the time of writing, no other Ruby gem implemented the tf
|
122
|
+
At the time of writing, no other Ruby gem implemented the tf\*idf formula used by Lucene, Sphinx and Ferret.
|
122
123
|
|
123
124
|
* [rsemantic](https://github.com/josephwilk/rsemantic) now uses the same [term frequency](https://github.com/josephwilk/rsemantic/blob/master/lib/semantic/transform/tf_idf_transform.rb#L14) and [document frequency](https://github.com/josephwilk/rsemantic/blob/master/lib/semantic/transform/tf_idf_transform.rb#L13) formulas as Lucene.
|
124
125
|
* [treat](https://github.com/louismullie/treat) offers many term frequency formulas, [one of which](https://github.com/louismullie/treat/blob/master/lib/treat/workers/extractors/tf_idf/native.rb#L13) is the same as Lucene.
|
@@ -22,8 +22,12 @@ module TfIdfSimilarity
|
|
22
22
|
#
|
23
23
|
# @note Like Lucene, we use a b value of 0.75 and a k1 value of 1.2.
|
24
24
|
def term_frequency(document, term)
|
25
|
-
|
26
|
-
|
25
|
+
if @model.average_document_size.zero?
|
26
|
+
Float::NAN
|
27
|
+
else
|
28
|
+
tf = document.term_count(term)
|
29
|
+
(tf * 2.2) / (tf + 0.3 + 0.9 * document.size / @model.average_document_size)
|
30
|
+
end
|
27
31
|
end
|
28
32
|
alias_method :tf, :term_frequency
|
29
33
|
end
|
@@ -1,3 +1,5 @@
|
|
1
|
+
require 'tf-idf-similarity/tokenizer'
|
2
|
+
|
1
3
|
# A document.
|
2
4
|
module TfIdfSimilarity
|
3
5
|
class Document
|
@@ -19,7 +21,8 @@ module TfIdfSimilarity
|
|
19
21
|
def initialize(text, opts = {})
|
20
22
|
@text = text
|
21
23
|
@id = opts[:id] || object_id
|
22
|
-
@tokens = opts[:tokens]
|
24
|
+
@tokens = Array(opts[:tokens]).map { |t| Token.new(t) } if opts[:tokens]
|
25
|
+
@tokenizer = opts[:tokenizer] || Tokenizer.new
|
23
26
|
|
24
27
|
if opts[:term_counts]
|
25
28
|
@term_counts = opts[:term_counts]
|
@@ -51,10 +54,9 @@ module TfIdfSimilarity
|
|
51
54
|
|
52
55
|
# Tokenizes the text and counts terms and total tokens.
|
53
56
|
def set_term_counts_and_size
|
54
|
-
tokenize(text).each do |
|
55
|
-
token = Token.new(word)
|
57
|
+
tokenize(text).each do |token|
|
56
58
|
if token.valid?
|
57
|
-
term = token.
|
59
|
+
term = token.to_s
|
58
60
|
@term_counts[term] += 1
|
59
61
|
@size += 1
|
60
62
|
end
|
@@ -76,7 +78,7 @@ module TfIdfSimilarity
|
|
76
78
|
# @see http://unicode.org/reports/tr29/#Default_Word_Boundaries
|
77
79
|
# @see http://wiki.apache.org/solr/AnalyzersTokenizersTokenFilters#solr.StandardTokenizerFactory
|
78
80
|
def tokenize(text)
|
79
|
-
@tokens ||
|
81
|
+
@tokens || @tokenizer.tokenize(text)
|
80
82
|
end
|
81
83
|
end
|
82
84
|
end
|
@@ -17,6 +17,10 @@ module TfIdfSimilarity
|
|
17
17
|
norm = NMath.sqrt((@matrix ** 2).sum(1).reshape(@matrix.shape[0], 1))
|
18
18
|
norm[norm.where2[1]] = 1.0 # avoid division by zero
|
19
19
|
NMatrix.refer(@matrix / norm) # must be NMatrix for matrix multiplication
|
20
|
+
when :numo
|
21
|
+
norm = Numo::NMath.sqrt((@matrix ** 2).sum(0).reshape(1, @matrix.shape[1]))
|
22
|
+
norm[(norm.eq 0).where] = 1.0 # avoid division by zero
|
23
|
+
(@matrix / norm)
|
20
24
|
when :nmatrix # @see https://github.com/SciRuby/nmatrix/issues/38
|
21
25
|
normal = NMatrix.new(:dense, @matrix.shape, 0, :float64)
|
22
26
|
(0...@matrix.shape[1]).each do |j|
|
@@ -44,7 +48,7 @@ module TfIdfSimilarity
|
|
44
48
|
# @param [Integer] column index
|
45
49
|
def get(i, j)
|
46
50
|
case @library
|
47
|
-
when :narray
|
51
|
+
when :narray, :numo
|
48
52
|
@matrix[j, i]
|
49
53
|
else
|
50
54
|
@matrix[i, j]
|
@@ -57,6 +61,8 @@ module TfIdfSimilarity
|
|
57
61
|
case @library
|
58
62
|
when :narray
|
59
63
|
@matrix[true, index]
|
64
|
+
when :numo
|
65
|
+
@matrix[index, true]
|
60
66
|
else
|
61
67
|
@matrix.row(index)
|
62
68
|
end
|
@@ -66,7 +72,7 @@ module TfIdfSimilarity
|
|
66
72
|
# @return [GSL::Vector::View,NArray,NMatrix,Vector] a column
|
67
73
|
def column(index)
|
68
74
|
case @library
|
69
|
-
when :narray
|
75
|
+
when :narray, :numo
|
70
76
|
@matrix[index, true]
|
71
77
|
else
|
72
78
|
@matrix.column(index)
|
@@ -78,7 +84,7 @@ module TfIdfSimilarity
|
|
78
84
|
case @library
|
79
85
|
when :gsl, :nmatrix
|
80
86
|
@matrix.shape[0]
|
81
|
-
when :narray
|
87
|
+
when :narray, :numo
|
82
88
|
@matrix.shape[1]
|
83
89
|
else
|
84
90
|
@matrix.row_size
|
@@ -90,7 +96,7 @@ module TfIdfSimilarity
|
|
90
96
|
case @library
|
91
97
|
when :gsl, :nmatrix
|
92
98
|
@matrix.shape[1]
|
93
|
-
when :narray
|
99
|
+
when :narray, :numo
|
94
100
|
@matrix.shape[0]
|
95
101
|
else
|
96
102
|
@matrix.column_size
|
@@ -110,7 +116,7 @@ module TfIdfSimilarity
|
|
110
116
|
# @return [Float] the sum of all values in the matrix
|
111
117
|
def sum
|
112
118
|
case @library
|
113
|
-
when :narray
|
119
|
+
when :narray, :numo
|
114
120
|
@matrix.sum
|
115
121
|
else
|
116
122
|
values.reduce(0, :+)
|
@@ -125,6 +131,8 @@ module TfIdfSimilarity
|
|
125
131
|
GSL::Matrix[*array]
|
126
132
|
when :narray
|
127
133
|
NArray[*array]
|
134
|
+
when :numo
|
135
|
+
Numo::DFloat[*array]
|
128
136
|
when :nmatrix # @see https://github.com/SciRuby/nmatrix/issues/91#issuecomment-18870619
|
129
137
|
NMatrix.new(:dense, [array.size, array.empty? ? 0 : array[0].size], array.flatten, :float64)
|
130
138
|
else
|
@@ -136,7 +144,7 @@ module TfIdfSimilarity
|
|
136
144
|
# @return [GSL::Matrix,NArray,NMatrix,Matrix] the product
|
137
145
|
def multiply_self(matrix)
|
138
146
|
case @library
|
139
|
-
when :nmatrix
|
147
|
+
when :nmatrix, :numo
|
140
148
|
matrix.transpose.dot(matrix)
|
141
149
|
else
|
142
150
|
matrix.transpose * matrix
|
@@ -149,6 +157,8 @@ module TfIdfSimilarity
|
|
149
157
|
GSL::Sf::log(number)
|
150
158
|
when :narray
|
151
159
|
NMath.log(number)
|
160
|
+
when :numo
|
161
|
+
Numo::NMath.log(number)
|
152
162
|
else
|
153
163
|
Math.log(number)
|
154
164
|
end
|
@@ -158,6 +168,8 @@ module TfIdfSimilarity
|
|
158
168
|
case @library
|
159
169
|
when :narray
|
160
170
|
NMath.sqrt(number)
|
171
|
+
when :numo
|
172
|
+
Numo::NMath.sqrt(number)
|
161
173
|
else
|
162
174
|
Math.sqrt(number)
|
163
175
|
end
|
@@ -15,7 +15,7 @@ module TfIdfSimilarity
|
|
15
15
|
array = Array.new(terms.size) do |i|
|
16
16
|
idf = inverse_document_frequency(terms[i])
|
17
17
|
Array.new(documents.size) do |j|
|
18
|
-
term_frequency(documents[j], terms[i]) * idf
|
18
|
+
(term_frequency(documents[j], terms[i]) * idf).to_f
|
19
19
|
end
|
20
20
|
end
|
21
21
|
|
@@ -37,6 +37,8 @@ module TfIdfSimilarity
|
|
37
37
|
case @library
|
38
38
|
when :gsl, :narray
|
39
39
|
row(index).where.size
|
40
|
+
when :numo
|
41
|
+
(row(index).ne 0).where.size
|
40
42
|
when :nmatrix
|
41
43
|
row(index).each.count(&:nonzero?)
|
42
44
|
else
|
@@ -57,7 +59,7 @@ module TfIdfSimilarity
|
|
57
59
|
index = terms.index(term)
|
58
60
|
if index
|
59
61
|
case @library
|
60
|
-
when :gsl, :narray
|
62
|
+
when :gsl, :narray, :numo
|
61
63
|
row(index).sum
|
62
64
|
when :nmatrix
|
63
65
|
row(index).each.reduce(0, :+) # NMatrix's `sum` method is slower
|
@@ -1,5 +1,7 @@
|
|
1
1
|
# coding: utf-8
|
2
2
|
require 'delegate'
|
3
|
+
require 'unicode_utils/downcase'
|
4
|
+
require 'unicode_utils/each_word'
|
3
5
|
|
4
6
|
# A token.
|
5
7
|
#
|
@@ -47,5 +49,10 @@ module TfIdfSimilarity
|
|
47
49
|
def classic_filter
|
48
50
|
self.class.new(self.gsub('.', '').sub(/['`’]s\z/, ''))
|
49
51
|
end
|
52
|
+
|
53
|
+
def to_s
|
54
|
+
# Don't call #lowercase_filter and #classic_filter to avoid creating unnecessary objects.
|
55
|
+
UnicodeUtils.downcase(self).gsub('.', '').sub(/['`’]s\z/, '')
|
56
|
+
end
|
50
57
|
end
|
51
58
|
end
|
@@ -0,0 +1,19 @@
|
|
1
|
+
require 'unicode_utils/each_word'
|
2
|
+
require 'tf-idf-similarity/token'
|
3
|
+
|
4
|
+
# A tokenizer using UnicodeUtils to tokenize a text.
|
5
|
+
#
|
6
|
+
# @see https://github.com/lang/unicode_utils
|
7
|
+
module TfIdfSimilarity
|
8
|
+
class Tokenizer
|
9
|
+
# Tokenizes a text.
|
10
|
+
#
|
11
|
+
# @param [String] text
|
12
|
+
# @return [Enumerator] an enumerator of Token objects
|
13
|
+
def tokenize(text)
|
14
|
+
UnicodeUtils.each_word(text).map do |word|
|
15
|
+
Token.new(word)
|
16
|
+
end
|
17
|
+
end
|
18
|
+
end
|
19
|
+
end
|
data/lib/tf-idf-similarity.rb
CHANGED
data/spec/bm25_model_spec.rb
CHANGED
@@ -82,7 +82,12 @@ module TfIdfSimilarity
|
|
82
82
|
|
83
83
|
describe '#term_frequency_inverse_document_frequency' do
|
84
84
|
it 'should return negative infinity' do
|
85
|
-
|
85
|
+
case MATRIX_LIBRARY
|
86
|
+
when :numo
|
87
|
+
model.tfidf(document, 'foo').isnan.should eq 1
|
88
|
+
else
|
89
|
+
model.tfidf(document, 'foo').should be_nan
|
90
|
+
end
|
86
91
|
end
|
87
92
|
end
|
88
93
|
|
@@ -147,7 +152,7 @@ module TfIdfSimilarity
|
|
147
152
|
end
|
148
153
|
|
149
154
|
it 'should return the term frequency if tokens given' do
|
150
|
-
model.tf(document_with_tokens, 'foo-foo').should == (1 * 2.2) / (1 + 0.3 + 0.9 *
|
155
|
+
model.tf(document_with_tokens, 'foo-foo').should == (1 * 2.2) / (1 + 0.3 + 0.9 * 3 / 5.5)
|
151
156
|
end
|
152
157
|
|
153
158
|
it 'should return no term frequency if no text given' do
|
@@ -155,7 +160,7 @@ module TfIdfSimilarity
|
|
155
160
|
end
|
156
161
|
|
157
162
|
it 'should return the term frequency if term counts given' do
|
158
|
-
model.tf(document_with_term_counts, 'bar').should == (5 * 2.2) / (5 + 0.3 + 0.9 *
|
163
|
+
model.tf(document_with_term_counts, 'bar').should == (5 * 2.2) / (5 + 0.3 + 0.9 * 15 / 5.5)
|
159
164
|
end
|
160
165
|
|
161
166
|
it 'should return the term frequency of a non-occurring term' do
|
@@ -163,7 +168,7 @@ module TfIdfSimilarity
|
|
163
168
|
end
|
164
169
|
|
165
170
|
it 'should return the term frequency in a non-occurring document' do
|
166
|
-
model.tf(non_corpus_document, 'foo').should == (3 * 2.2) / (3 + 0.3 + 0.9 *
|
171
|
+
model.tf(non_corpus_document, 'foo').should == (3 * 2.2) / (3 + 0.3 + 0.9 * 3 / 5.5)
|
167
172
|
end
|
168
173
|
end
|
169
174
|
|
@@ -177,17 +182,17 @@ module TfIdfSimilarity
|
|
177
182
|
end
|
178
183
|
|
179
184
|
it 'should return the tf*idf in a non-occurring term' do
|
180
|
-
model.tfidf(non_corpus_document, 'foo').should be_within(0.001).of(Math.log((4 - 1 + 0.5) / (1 + 0.5)) * (3 * 2.2) / (3 + 0.3 + 0.9 *
|
185
|
+
model.tfidf(non_corpus_document, 'foo').should be_within(0.001).of(Math.log((4 - 1 + 0.5) / (1 + 0.5)) * (3 * 2.2) / (3 + 0.3 + 0.9 * 3 / 5.5))
|
181
186
|
end
|
182
187
|
end
|
183
188
|
|
184
189
|
describe '#similarity_matrix' do
|
185
190
|
it 'should return the similarity matrix' do
|
186
191
|
expected = [
|
187
|
-
1.0, 0.
|
188
|
-
0.
|
192
|
+
1.0, 0.558, 0.0, 0.449,
|
193
|
+
0.558, 1.0, 0.0, 0.501,
|
189
194
|
0.0, 0.0, 0.0, 0.0,
|
190
|
-
0.
|
195
|
+
0.449, 0.501, 0.0, 1.0,
|
191
196
|
]
|
192
197
|
|
193
198
|
similarity_matrix_values(model).each_with_index do |value,i|
|
data/spec/spec_helper.rb
CHANGED
data/td-idf-similarity.gemspec
CHANGED
@@ -9,6 +9,7 @@ Gem::Specification.new do |s|
|
|
9
9
|
s.homepage = "https://github.com/jpmckinney/tf-idf-similarity"
|
10
10
|
s.summary = %q{Calculates the similarity between texts using tf*idf}
|
11
11
|
s.license = 'MIT'
|
12
|
+
s.required_ruby_version = '>= 2.4.0'
|
12
13
|
|
13
14
|
s.files = `git ls-files`.split("\n")
|
14
15
|
s.test_files = `git ls-files -- {test,spec,features}/*`.split("\n")
|
@@ -19,6 +20,6 @@ Gem::Specification.new do |s|
|
|
19
20
|
|
20
21
|
s.add_development_dependency('coveralls')
|
21
22
|
s.add_development_dependency('json', '< 2')
|
22
|
-
s.add_development_dependency('rake'
|
23
|
-
s.add_development_dependency('rspec', '~>
|
23
|
+
s.add_development_dependency('rake')
|
24
|
+
s.add_development_dependency('rspec', '~> 3.0')
|
24
25
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: tf-idf-similarity
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.3.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- James McKinney
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2024-02-26 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: unicode_utils
|
@@ -56,30 +56,30 @@ dependencies:
|
|
56
56
|
name: rake
|
57
57
|
requirement: !ruby/object:Gem::Requirement
|
58
58
|
requirements:
|
59
|
-
- - "
|
59
|
+
- - ">="
|
60
60
|
- !ruby/object:Gem::Version
|
61
|
-
version: '
|
61
|
+
version: '0'
|
62
62
|
type: :development
|
63
63
|
prerelease: false
|
64
64
|
version_requirements: !ruby/object:Gem::Requirement
|
65
65
|
requirements:
|
66
|
-
- - "
|
66
|
+
- - ">="
|
67
67
|
- !ruby/object:Gem::Version
|
68
|
-
version: '
|
68
|
+
version: '0'
|
69
69
|
- !ruby/object:Gem::Dependency
|
70
70
|
name: rspec
|
71
71
|
requirement: !ruby/object:Gem::Requirement
|
72
72
|
requirements:
|
73
73
|
- - "~>"
|
74
74
|
- !ruby/object:Gem::Version
|
75
|
-
version: '
|
75
|
+
version: '3.0'
|
76
76
|
type: :development
|
77
77
|
prerelease: false
|
78
78
|
version_requirements: !ruby/object:Gem::Requirement
|
79
79
|
requirements:
|
80
80
|
- - "~>"
|
81
81
|
- !ruby/object:Gem::Version
|
82
|
-
version: '
|
82
|
+
version: '3.0'
|
83
83
|
description:
|
84
84
|
email:
|
85
85
|
executables: []
|
@@ -104,6 +104,7 @@ files:
|
|
104
104
|
- lib/tf-idf-similarity/term_count_model.rb
|
105
105
|
- lib/tf-idf-similarity/tf_idf_model.rb
|
106
106
|
- lib/tf-idf-similarity/token.rb
|
107
|
+
- lib/tf-idf-similarity/tokenizer.rb
|
107
108
|
- lib/tf-idf-similarity/version.rb
|
108
109
|
- spec/bm25_model_spec.rb
|
109
110
|
- spec/document_spec.rb
|
@@ -125,15 +126,14 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
125
126
|
requirements:
|
126
127
|
- - ">="
|
127
128
|
- !ruby/object:Gem::Version
|
128
|
-
version:
|
129
|
+
version: 2.4.0
|
129
130
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
130
131
|
requirements:
|
131
132
|
- - ">="
|
132
133
|
- !ruby/object:Gem::Version
|
133
134
|
version: '0'
|
134
135
|
requirements: []
|
135
|
-
|
136
|
-
rubygems_version: 2.4.5
|
136
|
+
rubygems_version: 3.0.3.1
|
137
137
|
signing_key:
|
138
138
|
specification_version: 4
|
139
139
|
summary: Calculates the similarity between texts using tf*idf
|