tf-idf-similarity 0.1.6 → 0.3.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +5 -5
- data/.gitignore +1 -0
- data/.travis.yml +32 -4
- data/Gemfile +3 -1
- data/README.md +7 -6
- data/lib/tf-idf-similarity/bm25_model.rb +6 -2
- data/lib/tf-idf-similarity/document.rb +7 -5
- data/lib/tf-idf-similarity/matrix_methods.rb +18 -6
- data/lib/tf-idf-similarity/model.rb +1 -1
- data/lib/tf-idf-similarity/term_count_model.rb +3 -1
- data/lib/tf-idf-similarity/token.rb +7 -0
- data/lib/tf-idf-similarity/tokenizer.rb +19 -0
- data/lib/tf-idf-similarity/version.rb +1 -1
- data/lib/tf-idf-similarity.rb +0 -3
- data/spec/bm25_model_spec.rb +13 -8
- data/spec/spec_helper.rb +2 -0
- data/td-idf-similarity.gemspec +3 -2
- metadata +11 -11
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
|
-
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: 6264c3854dae2c9b405e6880fc35161c7e7eb05521b88fa206d7d5f7cad53d73
|
4
|
+
data.tar.gz: d7d6c7f38723e35c256e14edecdfd7d4d993f58163a9c533e12f591051ed80dd
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 4f97120bc10adc73e504503969c4ee2c486ca5909def7fc69a6ed35c4ceb3ba03294fb60a583f334a8fd83b06fe37d5b13258fa85fd699c47a812d8bdc2ce23f
|
7
|
+
data.tar.gz: a84fd8950933635fadd85caeb6a2684dfa989664553c8971f8f1f6d1ec7f7a299388d8aec186facedf6ab9174d125182fdcdfee16facd7317b27af5b462399b9
|
data/.gitignore
CHANGED
data/.travis.yml
CHANGED
@@ -2,14 +2,42 @@ sudo: false
|
|
2
2
|
language: ruby
|
3
3
|
cache: bundler
|
4
4
|
rvm:
|
5
|
-
- 2.
|
6
|
-
- 2.
|
7
|
-
- 2.
|
5
|
+
- 2.4
|
6
|
+
- 2.5
|
7
|
+
- 2.6
|
8
|
+
- 2.7
|
9
|
+
- 3.0
|
10
|
+
- 3.1
|
11
|
+
- 3.2
|
12
|
+
- ruby-head
|
13
|
+
matrix:
|
14
|
+
exclude:
|
15
|
+
# No gem releases since 2017 and failing on new versions.
|
16
|
+
# https://rubygems.org/gems/gsl
|
17
|
+
# https://rubygems.org/gems/nmatrix
|
18
|
+
- rvm: 3.0
|
19
|
+
env: MATRIX_LIBRARY=gsl
|
20
|
+
- rvm: 3.1
|
21
|
+
env: MATRIX_LIBRARY=gsl
|
22
|
+
- rvm: 3.2
|
23
|
+
env: MATRIX_LIBRARY=gsl
|
24
|
+
- rvm: ruby-head
|
25
|
+
env: MATRIX_LIBRARY=gsl
|
26
|
+
- rvm: 3.2
|
27
|
+
env: MATRIX_LIBRARY=nmatrix
|
28
|
+
- rvm: ruby-head
|
29
|
+
env: MATRIX_LIBRARY=nmatrix
|
30
|
+
allow_failures:
|
31
|
+
- rvm: ruby-head
|
32
|
+
env: MATRIX_LIBRARY=matrix
|
33
|
+
- rvm: ruby-head
|
34
|
+
env: MATRIX_LIBRARY=narray
|
8
35
|
env:
|
9
36
|
- MATRIX_LIBRARY=gsl
|
10
37
|
- MATRIX_LIBRARY=narray
|
11
38
|
- MATRIX_LIBRARY=nmatrix
|
12
39
|
- MATRIX_LIBRARY=matrix
|
40
|
+
- MATRIX_LIBRARY=numo
|
13
41
|
addons:
|
14
42
|
apt:
|
15
43
|
packages:
|
@@ -18,7 +46,7 @@ addons:
|
|
18
46
|
# Installing ATLAS will install BLAS.
|
19
47
|
- libatlas-dev
|
20
48
|
- libatlas-base-dev
|
21
|
-
-
|
49
|
+
- libatlas3-base
|
22
50
|
before_install:
|
23
51
|
- bundle config build.nmatrix --with-lapacklib
|
24
52
|
- export CPLUS_INCLUDE_PATH=$CPLUS_INCLUDE_PATH:/usr/include/atlas
|
data/Gemfile
CHANGED
@@ -1,8 +1,10 @@
|
|
1
1
|
source 'https://rubygems.org'
|
2
2
|
|
3
3
|
gem 'rb-gsl', '~> 1.16.0.2' if ENV['MATRIX_LIBRARY'] == 'gsl'
|
4
|
+
gem 'matrix' if ENV['MATRIX_LIBRARY'] == 'matrix'
|
4
5
|
gem 'narray', '~> 0.6.0.0' if ENV['MATRIX_LIBRARY'] == 'narray'
|
5
|
-
gem 'nmatrix', '~> 0.
|
6
|
+
gem 'nmatrix', '~> 0.2' if ENV['MATRIX_LIBRARY'] == 'nmatrix'
|
7
|
+
gem 'numo-narray', '~> 0.9.2.1' if ENV['MATRIX_LIBRARY'] == 'numo'
|
6
8
|
|
7
9
|
# Specify your gem's dependencies in the gemspec
|
8
10
|
gemspec
|
data/README.md
CHANGED
@@ -1,12 +1,11 @@
|
|
1
|
-
# Ruby Vector Space Model (VSM) with tf
|
1
|
+
# Ruby Vector Space Model (VSM) with tf\*idf weights
|
2
2
|
|
3
3
|
[![Gem Version](https://badge.fury.io/rb/tf-idf-similarity.svg)](https://badge.fury.io/rb/tf-idf-similarity)
|
4
4
|
[![Build Status](https://secure.travis-ci.org/jpmckinney/tf-idf-similarity.png)](https://travis-ci.org/jpmckinney/tf-idf-similarity)
|
5
|
-
[![Dependency Status](https://gemnasium.com/jpmckinney/tf-idf-similarity.png)](https://gemnasium.com/jpmckinney/tf-idf-similarity)
|
6
5
|
[![Coverage Status](https://coveralls.io/repos/jpmckinney/tf-idf-similarity/badge.png)](https://coveralls.io/r/jpmckinney/tf-idf-similarity)
|
7
6
|
[![Code Climate](https://codeclimate.com/github/jpmckinney/tf-idf-similarity.png)](https://codeclimate.com/github/jpmckinney/tf-idf-similarity)
|
8
7
|
|
9
|
-
Calculates the similarity between texts using a [bag-of-words](https://en.wikipedia.org/wiki/Bag_of_words_model) [Vector Space Model](https://en.wikipedia.org/wiki/Vector_space_model) with [Term Frequency-Inverse Document Frequency (tf
|
8
|
+
Calculates the similarity between texts using a [bag-of-words](https://en.wikipedia.org/wiki/Bag_of_words_model) [Vector Space Model](https://en.wikipedia.org/wiki/Vector_space_model) with [Term Frequency-Inverse Document Frequency (tf\*idf)](https://en.wikipedia.org/wiki/Tf–idf) weights. If your use case demands performance, use [Lucene](http://lucene.apache.org/core/) (see below).
|
10
9
|
|
11
10
|
## Usage
|
12
11
|
|
@@ -48,7 +47,7 @@ Find the similarity of two documents in the matrix:
|
|
48
47
|
matrix[model.document_index(document1), model.document_index(document2)]
|
49
48
|
```
|
50
49
|
|
51
|
-
Print the tf
|
50
|
+
Print the tf\*idf values for terms in a document:
|
52
51
|
|
53
52
|
```ruby
|
54
53
|
tfidf_by_term = {}
|
@@ -86,6 +85,8 @@ end
|
|
86
85
|
document1 = TfIdfSimilarity::Document.new(text, :term_counts => term_counts, :size => size)
|
87
86
|
```
|
88
87
|
|
88
|
+
Or, use your own classes for the tokenizer and tokens, like in [this example](https://gist.github.com/satoryu/0183a4eba365cc67e28988a09f3035b3).
|
89
|
+
|
89
90
|
[Read the documentation at RubyDoc.info.](http://rubydoc.info/gems/tf-idf-similarity)
|
90
91
|
|
91
92
|
## Troubleshooting
|
@@ -114,11 +115,11 @@ You can access more term frequency, document frequency, and normalization formul
|
|
114
115
|
require 'tf-idf-similarity/extras/document'
|
115
116
|
require 'tf-idf-similarity/extras/tf_idf_model'
|
116
117
|
|
117
|
-
The default tf
|
118
|
+
The default tf\*idf formula follows the [Lucene Conceptual Scoring Formula](http://lucene.apache.org/core/4_0_0/core/org/apache/lucene/search/similarities/TFIDFSimilarity.html).
|
118
119
|
|
119
120
|
## Why?
|
120
121
|
|
121
|
-
At the time of writing, no other Ruby gem implemented the tf
|
122
|
+
At the time of writing, no other Ruby gem implemented the tf\*idf formula used by Lucene, Sphinx and Ferret.
|
122
123
|
|
123
124
|
* [rsemantic](https://github.com/josephwilk/rsemantic) now uses the same [term frequency](https://github.com/josephwilk/rsemantic/blob/master/lib/semantic/transform/tf_idf_transform.rb#L14) and [document frequency](https://github.com/josephwilk/rsemantic/blob/master/lib/semantic/transform/tf_idf_transform.rb#L13) formulas as Lucene.
|
124
125
|
* [treat](https://github.com/louismullie/treat) offers many term frequency formulas, [one of which](https://github.com/louismullie/treat/blob/master/lib/treat/workers/extractors/tf_idf/native.rb#L13) is the same as Lucene.
|
@@ -22,8 +22,12 @@ module TfIdfSimilarity
|
|
22
22
|
#
|
23
23
|
# @note Like Lucene, we use a b value of 0.75 and a k1 value of 1.2.
|
24
24
|
def term_frequency(document, term)
|
25
|
-
|
26
|
-
|
25
|
+
if @model.average_document_size.zero?
|
26
|
+
Float::NAN
|
27
|
+
else
|
28
|
+
tf = document.term_count(term)
|
29
|
+
(tf * 2.2) / (tf + 0.3 + 0.9 * document.size / @model.average_document_size)
|
30
|
+
end
|
27
31
|
end
|
28
32
|
alias_method :tf, :term_frequency
|
29
33
|
end
|
@@ -1,3 +1,5 @@
|
|
1
|
+
require 'tf-idf-similarity/tokenizer'
|
2
|
+
|
1
3
|
# A document.
|
2
4
|
module TfIdfSimilarity
|
3
5
|
class Document
|
@@ -19,7 +21,8 @@ module TfIdfSimilarity
|
|
19
21
|
def initialize(text, opts = {})
|
20
22
|
@text = text
|
21
23
|
@id = opts[:id] || object_id
|
22
|
-
@tokens = opts[:tokens]
|
24
|
+
@tokens = Array(opts[:tokens]).map { |t| Token.new(t) } if opts[:tokens]
|
25
|
+
@tokenizer = opts[:tokenizer] || Tokenizer.new
|
23
26
|
|
24
27
|
if opts[:term_counts]
|
25
28
|
@term_counts = opts[:term_counts]
|
@@ -51,10 +54,9 @@ module TfIdfSimilarity
|
|
51
54
|
|
52
55
|
# Tokenizes the text and counts terms and total tokens.
|
53
56
|
def set_term_counts_and_size
|
54
|
-
tokenize(text).each do |
|
55
|
-
token = Token.new(word)
|
57
|
+
tokenize(text).each do |token|
|
56
58
|
if token.valid?
|
57
|
-
term = token.
|
59
|
+
term = token.to_s
|
58
60
|
@term_counts[term] += 1
|
59
61
|
@size += 1
|
60
62
|
end
|
@@ -76,7 +78,7 @@ module TfIdfSimilarity
|
|
76
78
|
# @see http://unicode.org/reports/tr29/#Default_Word_Boundaries
|
77
79
|
# @see http://wiki.apache.org/solr/AnalyzersTokenizersTokenFilters#solr.StandardTokenizerFactory
|
78
80
|
def tokenize(text)
|
79
|
-
@tokens ||
|
81
|
+
@tokens || @tokenizer.tokenize(text)
|
80
82
|
end
|
81
83
|
end
|
82
84
|
end
|
@@ -17,6 +17,10 @@ module TfIdfSimilarity
|
|
17
17
|
norm = NMath.sqrt((@matrix ** 2).sum(1).reshape(@matrix.shape[0], 1))
|
18
18
|
norm[norm.where2[1]] = 1.0 # avoid division by zero
|
19
19
|
NMatrix.refer(@matrix / norm) # must be NMatrix for matrix multiplication
|
20
|
+
when :numo
|
21
|
+
norm = Numo::NMath.sqrt((@matrix ** 2).sum(0).reshape(1, @matrix.shape[1]))
|
22
|
+
norm[(norm.eq 0).where] = 1.0 # avoid division by zero
|
23
|
+
(@matrix / norm)
|
20
24
|
when :nmatrix # @see https://github.com/SciRuby/nmatrix/issues/38
|
21
25
|
normal = NMatrix.new(:dense, @matrix.shape, 0, :float64)
|
22
26
|
(0...@matrix.shape[1]).each do |j|
|
@@ -44,7 +48,7 @@ module TfIdfSimilarity
|
|
44
48
|
# @param [Integer] column index
|
45
49
|
def get(i, j)
|
46
50
|
case @library
|
47
|
-
when :narray
|
51
|
+
when :narray, :numo
|
48
52
|
@matrix[j, i]
|
49
53
|
else
|
50
54
|
@matrix[i, j]
|
@@ -57,6 +61,8 @@ module TfIdfSimilarity
|
|
57
61
|
case @library
|
58
62
|
when :narray
|
59
63
|
@matrix[true, index]
|
64
|
+
when :numo
|
65
|
+
@matrix[index, true]
|
60
66
|
else
|
61
67
|
@matrix.row(index)
|
62
68
|
end
|
@@ -66,7 +72,7 @@ module TfIdfSimilarity
|
|
66
72
|
# @return [GSL::Vector::View,NArray,NMatrix,Vector] a column
|
67
73
|
def column(index)
|
68
74
|
case @library
|
69
|
-
when :narray
|
75
|
+
when :narray, :numo
|
70
76
|
@matrix[index, true]
|
71
77
|
else
|
72
78
|
@matrix.column(index)
|
@@ -78,7 +84,7 @@ module TfIdfSimilarity
|
|
78
84
|
case @library
|
79
85
|
when :gsl, :nmatrix
|
80
86
|
@matrix.shape[0]
|
81
|
-
when :narray
|
87
|
+
when :narray, :numo
|
82
88
|
@matrix.shape[1]
|
83
89
|
else
|
84
90
|
@matrix.row_size
|
@@ -90,7 +96,7 @@ module TfIdfSimilarity
|
|
90
96
|
case @library
|
91
97
|
when :gsl, :nmatrix
|
92
98
|
@matrix.shape[1]
|
93
|
-
when :narray
|
99
|
+
when :narray, :numo
|
94
100
|
@matrix.shape[0]
|
95
101
|
else
|
96
102
|
@matrix.column_size
|
@@ -110,7 +116,7 @@ module TfIdfSimilarity
|
|
110
116
|
# @return [Float] the sum of all values in the matrix
|
111
117
|
def sum
|
112
118
|
case @library
|
113
|
-
when :narray
|
119
|
+
when :narray, :numo
|
114
120
|
@matrix.sum
|
115
121
|
else
|
116
122
|
values.reduce(0, :+)
|
@@ -125,6 +131,8 @@ module TfIdfSimilarity
|
|
125
131
|
GSL::Matrix[*array]
|
126
132
|
when :narray
|
127
133
|
NArray[*array]
|
134
|
+
when :numo
|
135
|
+
Numo::DFloat[*array]
|
128
136
|
when :nmatrix # @see https://github.com/SciRuby/nmatrix/issues/91#issuecomment-18870619
|
129
137
|
NMatrix.new(:dense, [array.size, array.empty? ? 0 : array[0].size], array.flatten, :float64)
|
130
138
|
else
|
@@ -136,7 +144,7 @@ module TfIdfSimilarity
|
|
136
144
|
# @return [GSL::Matrix,NArray,NMatrix,Matrix] the product
|
137
145
|
def multiply_self(matrix)
|
138
146
|
case @library
|
139
|
-
when :nmatrix
|
147
|
+
when :nmatrix, :numo
|
140
148
|
matrix.transpose.dot(matrix)
|
141
149
|
else
|
142
150
|
matrix.transpose * matrix
|
@@ -149,6 +157,8 @@ module TfIdfSimilarity
|
|
149
157
|
GSL::Sf::log(number)
|
150
158
|
when :narray
|
151
159
|
NMath.log(number)
|
160
|
+
when :numo
|
161
|
+
Numo::NMath.log(number)
|
152
162
|
else
|
153
163
|
Math.log(number)
|
154
164
|
end
|
@@ -158,6 +168,8 @@ module TfIdfSimilarity
|
|
158
168
|
case @library
|
159
169
|
when :narray
|
160
170
|
NMath.sqrt(number)
|
171
|
+
when :numo
|
172
|
+
Numo::NMath.sqrt(number)
|
161
173
|
else
|
162
174
|
Math.sqrt(number)
|
163
175
|
end
|
@@ -15,7 +15,7 @@ module TfIdfSimilarity
|
|
15
15
|
array = Array.new(terms.size) do |i|
|
16
16
|
idf = inverse_document_frequency(terms[i])
|
17
17
|
Array.new(documents.size) do |j|
|
18
|
-
term_frequency(documents[j], terms[i]) * idf
|
18
|
+
(term_frequency(documents[j], terms[i]) * idf).to_f
|
19
19
|
end
|
20
20
|
end
|
21
21
|
|
@@ -37,6 +37,8 @@ module TfIdfSimilarity
|
|
37
37
|
case @library
|
38
38
|
when :gsl, :narray
|
39
39
|
row(index).where.size
|
40
|
+
when :numo
|
41
|
+
(row(index).ne 0).where.size
|
40
42
|
when :nmatrix
|
41
43
|
row(index).each.count(&:nonzero?)
|
42
44
|
else
|
@@ -57,7 +59,7 @@ module TfIdfSimilarity
|
|
57
59
|
index = terms.index(term)
|
58
60
|
if index
|
59
61
|
case @library
|
60
|
-
when :gsl, :narray
|
62
|
+
when :gsl, :narray, :numo
|
61
63
|
row(index).sum
|
62
64
|
when :nmatrix
|
63
65
|
row(index).each.reduce(0, :+) # NMatrix's `sum` method is slower
|
@@ -1,5 +1,7 @@
|
|
1
1
|
# coding: utf-8
|
2
2
|
require 'delegate'
|
3
|
+
require 'unicode_utils/downcase'
|
4
|
+
require 'unicode_utils/each_word'
|
3
5
|
|
4
6
|
# A token.
|
5
7
|
#
|
@@ -47,5 +49,10 @@ module TfIdfSimilarity
|
|
47
49
|
def classic_filter
|
48
50
|
self.class.new(self.gsub('.', '').sub(/['`’]s\z/, ''))
|
49
51
|
end
|
52
|
+
|
53
|
+
def to_s
|
54
|
+
# Don't call #lowercase_filter and #classic_filter to avoid creating unnecessary objects.
|
55
|
+
UnicodeUtils.downcase(self).gsub('.', '').sub(/['`’]s\z/, '')
|
56
|
+
end
|
50
57
|
end
|
51
58
|
end
|
@@ -0,0 +1,19 @@
|
|
1
|
+
require 'unicode_utils/each_word'
|
2
|
+
require 'tf-idf-similarity/token'
|
3
|
+
|
4
|
+
# A tokenizer using UnicodeUtils to tokenize a text.
|
5
|
+
#
|
6
|
+
# @see https://github.com/lang/unicode_utils
|
7
|
+
module TfIdfSimilarity
|
8
|
+
class Tokenizer
|
9
|
+
# Tokenizes a text.
|
10
|
+
#
|
11
|
+
# @param [String] text
|
12
|
+
# @return [Enumerator] an enumerator of Token objects
|
13
|
+
def tokenize(text)
|
14
|
+
UnicodeUtils.each_word(text).map do |word|
|
15
|
+
Token.new(word)
|
16
|
+
end
|
17
|
+
end
|
18
|
+
end
|
19
|
+
end
|
data/lib/tf-idf-similarity.rb
CHANGED
data/spec/bm25_model_spec.rb
CHANGED
@@ -82,7 +82,12 @@ module TfIdfSimilarity
|
|
82
82
|
|
83
83
|
describe '#term_frequency_inverse_document_frequency' do
|
84
84
|
it 'should return negative infinity' do
|
85
|
-
|
85
|
+
case MATRIX_LIBRARY
|
86
|
+
when :numo
|
87
|
+
model.tfidf(document, 'foo').isnan.should eq 1
|
88
|
+
else
|
89
|
+
model.tfidf(document, 'foo').should be_nan
|
90
|
+
end
|
86
91
|
end
|
87
92
|
end
|
88
93
|
|
@@ -147,7 +152,7 @@ module TfIdfSimilarity
|
|
147
152
|
end
|
148
153
|
|
149
154
|
it 'should return the term frequency if tokens given' do
|
150
|
-
model.tf(document_with_tokens, 'foo-foo').should == (1 * 2.2) / (1 + 0.3 + 0.9 *
|
155
|
+
model.tf(document_with_tokens, 'foo-foo').should == (1 * 2.2) / (1 + 0.3 + 0.9 * 3 / 5.5)
|
151
156
|
end
|
152
157
|
|
153
158
|
it 'should return no term frequency if no text given' do
|
@@ -155,7 +160,7 @@ module TfIdfSimilarity
|
|
155
160
|
end
|
156
161
|
|
157
162
|
it 'should return the term frequency if term counts given' do
|
158
|
-
model.tf(document_with_term_counts, 'bar').should == (5 * 2.2) / (5 + 0.3 + 0.9 *
|
163
|
+
model.tf(document_with_term_counts, 'bar').should == (5 * 2.2) / (5 + 0.3 + 0.9 * 15 / 5.5)
|
159
164
|
end
|
160
165
|
|
161
166
|
it 'should return the term frequency of a non-occurring term' do
|
@@ -163,7 +168,7 @@ module TfIdfSimilarity
|
|
163
168
|
end
|
164
169
|
|
165
170
|
it 'should return the term frequency in a non-occurring document' do
|
166
|
-
model.tf(non_corpus_document, 'foo').should == (3 * 2.2) / (3 + 0.3 + 0.9 *
|
171
|
+
model.tf(non_corpus_document, 'foo').should == (3 * 2.2) / (3 + 0.3 + 0.9 * 3 / 5.5)
|
167
172
|
end
|
168
173
|
end
|
169
174
|
|
@@ -177,17 +182,17 @@ module TfIdfSimilarity
|
|
177
182
|
end
|
178
183
|
|
179
184
|
it 'should return the tf*idf in a non-occurring term' do
|
180
|
-
model.tfidf(non_corpus_document, 'foo').should be_within(0.001).of(Math.log((4 - 1 + 0.5) / (1 + 0.5)) * (3 * 2.2) / (3 + 0.3 + 0.9 *
|
185
|
+
model.tfidf(non_corpus_document, 'foo').should be_within(0.001).of(Math.log((4 - 1 + 0.5) / (1 + 0.5)) * (3 * 2.2) / (3 + 0.3 + 0.9 * 3 / 5.5))
|
181
186
|
end
|
182
187
|
end
|
183
188
|
|
184
189
|
describe '#similarity_matrix' do
|
185
190
|
it 'should return the similarity matrix' do
|
186
191
|
expected = [
|
187
|
-
1.0, 0.
|
188
|
-
0.
|
192
|
+
1.0, 0.558, 0.0, 0.449,
|
193
|
+
0.558, 1.0, 0.0, 0.501,
|
189
194
|
0.0, 0.0, 0.0, 0.0,
|
190
|
-
0.
|
195
|
+
0.449, 0.501, 0.0, 1.0,
|
191
196
|
]
|
192
197
|
|
193
198
|
similarity_matrix_values(model).each_with_index do |value,i|
|
data/spec/spec_helper.rb
CHANGED
data/td-idf-similarity.gemspec
CHANGED
@@ -9,6 +9,7 @@ Gem::Specification.new do |s|
|
|
9
9
|
s.homepage = "https://github.com/jpmckinney/tf-idf-similarity"
|
10
10
|
s.summary = %q{Calculates the similarity between texts using tf*idf}
|
11
11
|
s.license = 'MIT'
|
12
|
+
s.required_ruby_version = '>= 2.4.0'
|
12
13
|
|
13
14
|
s.files = `git ls-files`.split("\n")
|
14
15
|
s.test_files = `git ls-files -- {test,spec,features}/*`.split("\n")
|
@@ -19,6 +20,6 @@ Gem::Specification.new do |s|
|
|
19
20
|
|
20
21
|
s.add_development_dependency('coveralls')
|
21
22
|
s.add_development_dependency('json', '< 2')
|
22
|
-
s.add_development_dependency('rake'
|
23
|
-
s.add_development_dependency('rspec', '~>
|
23
|
+
s.add_development_dependency('rake')
|
24
|
+
s.add_development_dependency('rspec', '~> 3.0')
|
24
25
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: tf-idf-similarity
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.3.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- James McKinney
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2024-02-26 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: unicode_utils
|
@@ -56,30 +56,30 @@ dependencies:
|
|
56
56
|
name: rake
|
57
57
|
requirement: !ruby/object:Gem::Requirement
|
58
58
|
requirements:
|
59
|
-
- - "
|
59
|
+
- - ">="
|
60
60
|
- !ruby/object:Gem::Version
|
61
|
-
version: '
|
61
|
+
version: '0'
|
62
62
|
type: :development
|
63
63
|
prerelease: false
|
64
64
|
version_requirements: !ruby/object:Gem::Requirement
|
65
65
|
requirements:
|
66
|
-
- - "
|
66
|
+
- - ">="
|
67
67
|
- !ruby/object:Gem::Version
|
68
|
-
version: '
|
68
|
+
version: '0'
|
69
69
|
- !ruby/object:Gem::Dependency
|
70
70
|
name: rspec
|
71
71
|
requirement: !ruby/object:Gem::Requirement
|
72
72
|
requirements:
|
73
73
|
- - "~>"
|
74
74
|
- !ruby/object:Gem::Version
|
75
|
-
version: '
|
75
|
+
version: '3.0'
|
76
76
|
type: :development
|
77
77
|
prerelease: false
|
78
78
|
version_requirements: !ruby/object:Gem::Requirement
|
79
79
|
requirements:
|
80
80
|
- - "~>"
|
81
81
|
- !ruby/object:Gem::Version
|
82
|
-
version: '
|
82
|
+
version: '3.0'
|
83
83
|
description:
|
84
84
|
email:
|
85
85
|
executables: []
|
@@ -104,6 +104,7 @@ files:
|
|
104
104
|
- lib/tf-idf-similarity/term_count_model.rb
|
105
105
|
- lib/tf-idf-similarity/tf_idf_model.rb
|
106
106
|
- lib/tf-idf-similarity/token.rb
|
107
|
+
- lib/tf-idf-similarity/tokenizer.rb
|
107
108
|
- lib/tf-idf-similarity/version.rb
|
108
109
|
- spec/bm25_model_spec.rb
|
109
110
|
- spec/document_spec.rb
|
@@ -125,15 +126,14 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
125
126
|
requirements:
|
126
127
|
- - ">="
|
127
128
|
- !ruby/object:Gem::Version
|
128
|
-
version:
|
129
|
+
version: 2.4.0
|
129
130
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
130
131
|
requirements:
|
131
132
|
- - ">="
|
132
133
|
- !ruby/object:Gem::Version
|
133
134
|
version: '0'
|
134
135
|
requirements: []
|
135
|
-
|
136
|
-
rubygems_version: 2.4.5
|
136
|
+
rubygems_version: 3.0.3.1
|
137
137
|
signing_key:
|
138
138
|
specification_version: 4
|
139
139
|
summary: Calculates the similarity between texts using tf*idf
|