tf-idf-similarity 0.1.4 → 0.1.5
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.rspec +2 -0
- data/.travis.yml +13 -9
- data/.yardopts +0 -1
- data/Gemfile +3 -3
- data/LICENSE +1 -1
- data/README.md +49 -23
- data/lib/tf-idf-similarity.rb +2 -6
- data/lib/tf-idf-similarity/bm25_model.rb +1 -1
- data/lib/tf-idf-similarity/document.rb +1 -1
- data/lib/tf-idf-similarity/extras/tf_idf_model.rb +1 -1
- data/lib/tf-idf-similarity/matrix_methods.rb +1 -1
- data/lib/tf-idf-similarity/token.rb +3 -6
- data/lib/tf-idf-similarity/version.rb +1 -1
- data/spec/extras/tf_idf_model_spec.rb +3 -3
- data/spec/spec_helper.rb +5 -1
- data/spec/token_spec.rb +8 -0
- data/td-idf-similarity.gemspec +6 -8
- metadata +17 -20
- data/USAGE +0 -1
- data/ext/mkrf_conf.rb +0 -15
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 736ca4c4b93d14ea046cbc4bdae930c8b88082be
|
4
|
+
data.tar.gz: 6b43e8356c59e0f48ac08f300186d4e12497368d
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 635ea3047ba54a951020f95ab7e9412adf07a39d6042b85e605fcd0517345d506690bac11ab05f7f20e16f80106e95e8002fd1ae2ab4e466a27cc4f143ac15d6
|
7
|
+
data.tar.gz: 693ac6c70f9daf3f0a1ed06ba693d170654e7c871702641d598e59a6fc69cbd5316e76441da062aca05d0b8f67c0ba0c958a4115e2c9da17316a2ebef2190738
|
data/.rspec
ADDED
data/.travis.yml
CHANGED
@@ -1,21 +1,25 @@
|
|
1
|
+
sudo: false
|
1
2
|
language: ruby
|
3
|
+
cache: bundler
|
2
4
|
rvm:
|
3
|
-
- 1.9.2
|
4
5
|
- 1.9.3
|
5
6
|
- 2.0.0
|
6
7
|
- 2.1.0
|
8
|
+
- 2.2.0
|
7
9
|
env:
|
8
10
|
- MATRIX_LIBRARY=gsl
|
9
11
|
- MATRIX_LIBRARY=narray
|
10
12
|
- MATRIX_LIBRARY=nmatrix
|
11
13
|
- MATRIX_LIBRARY=matrix
|
14
|
+
addons:
|
15
|
+
apt:
|
16
|
+
packages:
|
17
|
+
- gsl-bin
|
18
|
+
- libgsl0-dev
|
19
|
+
# Installing ATLAS will install BLAS.
|
20
|
+
- libatlas-dev
|
21
|
+
- libatlas-base-dev
|
22
|
+
- libatlas3gf-base
|
12
23
|
before_install:
|
13
24
|
- bundle config build.nmatrix --with-lapacklib
|
14
|
-
-
|
15
|
-
- if [ $MATRIX_LIBRARY = 'gsl' ]; then sudo apt-get install gsl-bin libgsl0-dev; fi
|
16
|
-
# Installing ATLAS will install BLAS.
|
17
|
-
- if [ $MATRIX_LIBRARY = 'nmatrix' ]; then sudo apt-get install -qq libatlas-dev libatlas-base-dev libatlas3gf-base; fi
|
18
|
-
- if [ $MATRIX_LIBRARY = 'nmatrix' ]; then export CPLUS_INCLUDE_PATH=$CPLUS_INCLUDE_PATH:/usr/include/atlas; fi
|
19
|
-
# Travis sometimes runs without Bundler.
|
20
|
-
install: bundle
|
21
|
-
script: bundle exec rake --trace
|
25
|
+
- export CPLUS_INCLUDE_PATH=$CPLUS_INCLUDE_PATH:/usr/include/atlas
|
data/.yardopts
CHANGED
data/Gemfile
CHANGED
@@ -1,8 +1,8 @@
|
|
1
|
-
source
|
1
|
+
source 'https://rubygems.org'
|
2
2
|
|
3
|
-
gem 'rb-gsl', '~> 1.16.0.2'
|
3
|
+
gem 'rb-gsl', '~> 1.16.0.2' if ENV['MATRIX_LIBRARY'] == 'gsl'
|
4
4
|
gem 'narray', '~> 0.6.0.0' if ENV['MATRIX_LIBRARY'] == 'narray'
|
5
|
-
gem 'nmatrix', '~> 0.1.0.rc5'
|
5
|
+
gem 'nmatrix', '~> 0.1.0.rc5' if ENV['MATRIX_LIBRARY'] == 'nmatrix' && RUBY_VERSION >= '1.9'
|
6
6
|
|
7
7
|
# Specify your gem's dependencies in the gemspec
|
8
8
|
gemspec
|
data/LICENSE
CHANGED
data/README.md
CHANGED
@@ -1,12 +1,12 @@
|
|
1
1
|
# Ruby Vector Space Model (VSM) with tf*idf weights
|
2
2
|
|
3
|
-
[![Gem Version](https://badge.fury.io/rb/tf-idf-similarity.svg)](
|
4
|
-
[![Build Status](https://secure.travis-ci.org/
|
5
|
-
[![Dependency Status](https://gemnasium.com/
|
6
|
-
[![Coverage Status](https://coveralls.io/repos/
|
7
|
-
[![Code Climate](https://codeclimate.com/github/
|
3
|
+
[![Gem Version](https://badge.fury.io/rb/tf-idf-similarity.svg)](https://badge.fury.io/rb/tf-idf-similarity)
|
4
|
+
[![Build Status](https://secure.travis-ci.org/jpmckinney/tf-idf-similarity.png)](https://travis-ci.org/jpmckinney/tf-idf-similarity)
|
5
|
+
[![Dependency Status](https://gemnasium.com/jpmckinney/tf-idf-similarity.png)](https://gemnasium.com/jpmckinney/tf-idf-similarity)
|
6
|
+
[![Coverage Status](https://coveralls.io/repos/jpmckinney/tf-idf-similarity/badge.png)](https://coveralls.io/r/jpmckinney/tf-idf-similarity)
|
7
|
+
[![Code Climate](https://codeclimate.com/github/jpmckinney/tf-idf-similarity.png)](https://codeclimate.com/github/jpmckinney/tf-idf-similarity)
|
8
8
|
|
9
|
-
Calculates the similarity between texts using a [bag-of-words](
|
9
|
+
Calculates the similarity between texts using a [bag-of-words](https://en.wikipedia.org/wiki/Bag_of_words_model) [Vector Space Model](https://en.wikipedia.org/wiki/Vector_space_model) with [Term Frequency-Inverse Document Frequency (tf*idf)](https://en.wikipedia.org/wiki/Tf–idf) weights. If your use case demands performance, use [Lucene](http://lucene.apache.org/core/) (see below).
|
10
10
|
|
11
11
|
## Usage
|
12
12
|
|
@@ -24,13 +24,13 @@ document3 = TfIdfSimilarity::Document.new("Nam scelerisque dui sed leo...")
|
|
24
24
|
corpus = [document1, document2, document3]
|
25
25
|
```
|
26
26
|
|
27
|
-
Create a document-term matrix using [Term Frequency-Inverse Document Frequency function](
|
27
|
+
Create a document-term matrix using [Term Frequency-Inverse Document Frequency function](https://en.wikipedia.org/wiki/Tf–idf):
|
28
28
|
|
29
29
|
```ruby
|
30
30
|
model = TfIdfSimilarity::TfIdfModel.new(corpus)
|
31
31
|
```
|
32
32
|
|
33
|
-
Or, create a document-term matrix using the [Okapi BM25 ranking function](
|
33
|
+
Or, create a document-term matrix using the [Okapi BM25 ranking function](https://en.wikipedia.org/wiki/Okapi_BM25):
|
34
34
|
|
35
35
|
```ruby
|
36
36
|
model = TfIdfSimilarity::BM25Model.new(corpus)
|
@@ -58,16 +58,46 @@ end
|
|
58
58
|
puts tfidf_by_term.sort_by{|_,tfidf| -tfidf}
|
59
59
|
```
|
60
60
|
|
61
|
+
Tokenize a document yourself, for example by excluding stop words:
|
62
|
+
|
63
|
+
```ruby
|
64
|
+
require 'unicode_utils'
|
65
|
+
text = "Lorem ipsum dolor sit amet..."
|
66
|
+
tokens = UnicodeUtils.each_word(text).to_a - ['and', 'the', 'to']
|
67
|
+
document1 = TfIdfSimilarity::Document.new(text, :tokens => tokens)
|
68
|
+
```
|
69
|
+
|
70
|
+
Provide, by yourself, the number of times each term appears and the number of tokens in the document:
|
71
|
+
|
72
|
+
```ruby
|
73
|
+
require 'unicode_utils'
|
74
|
+
text = "Lorem ipsum dolor sit amet..."
|
75
|
+
tokens = UnicodeUtils.each_word(text).to_a - ['and', 'the', 'to']
|
76
|
+
term_counts = Hash.new(0)
|
77
|
+
size = 0
|
78
|
+
tokens.each do |token|
|
79
|
+
# Unless the token is numeric.
|
80
|
+
unless token[/\A\d+\z/]
|
81
|
+
# Remove all punctuation from tokens.
|
82
|
+
term_counts[token.gsub(/\p{Punct}/, '')] += 1
|
83
|
+
size += 1
|
84
|
+
end
|
85
|
+
end
|
86
|
+
document1 = TfIdfSimilarity::Document.new(text, :term_counts => term_counts, :size => size)
|
87
|
+
```
|
88
|
+
|
61
89
|
[Read the documentation at RubyDoc.info.](http://rubydoc.info/gems/tf-idf-similarity)
|
62
90
|
|
63
91
|
## Speed
|
64
92
|
|
65
93
|
Instead of using the Ruby Standard Library's [Matrix](http://www.ruby-doc.org/stdlib-2.0/libdoc/matrix/rdoc/Matrix.html) class, you can use one of the [GNU Scientific Library (GSL)](http://www.gnu.org/software/gsl/), [NArray](http://narray.rubyforge.org/) or [NMatrix](https://github.com/SciRuby/nmatrix) (0.0.9 or greater) gems for faster matrix operations. For example:
|
66
94
|
|
67
|
-
require '
|
68
|
-
model = TfIdfSimilarity::TfIdfModel.new(corpus, :library => :
|
95
|
+
require 'narray'
|
96
|
+
model = TfIdfSimilarity::TfIdfModel.new(corpus, :library => :narray)
|
69
97
|
|
70
|
-
|
98
|
+
NArray seems to have the best performance of the three libraries.
|
99
|
+
|
100
|
+
The NMatrix gem gives access to [Automatically Tuned Linear Algebra Software (ATLAS)](http://math-atlas.sourceforge.net/), which you may know of through [Linear Algebra PACKage (LAPACK)](http://www.netlib.org/lapack/) or [Basic Linear Algebra Subprograms (BLAS)](http://www.netlib.org/blas/). Follow [these instructions](https://github.com/SciRuby/nmatrix#installation) to install the NMatrix gem.
|
71
101
|
|
72
102
|
## Extras
|
73
103
|
|
@@ -76,7 +106,7 @@ You can access more term frequency, document frequency, and normalization formul
|
|
76
106
|
require 'tf-idf-similarity/extras/document'
|
77
107
|
require 'tf-idf-similarity/extras/tf_idf_model'
|
78
108
|
|
79
|
-
The default tf*idf formula follows the [Lucene Conceptual Scoring Formula](http://lucene.apache.org/core/4_0_0
|
109
|
+
The default tf*idf formula follows the [Lucene Conceptual Scoring Formula](http://lucene.apache.org/core/4_0_0/core/org/apache/lucene/search/similarities/TFIDFSimilarity.html).
|
80
110
|
|
81
111
|
## Why?
|
82
112
|
|
@@ -115,17 +145,13 @@ Adapters for the following projects were also considered:
|
|
115
145
|
|
116
146
|
## Further Reading
|
117
147
|
|
118
|
-
Lucene implements many more [similarity functions](http://lucene.apache.org/core/4_0_0
|
119
|
-
|
120
|
-
* a [divergence from randomness (DFR) framework](http://lucene.apache.org/core/4_0_0-BETA/core/org/apache/lucene/search/similarities/DFRSimilarity.html)
|
121
|
-
* a [framework for the family of information-based models](http://lucene.apache.org/core/4_0_0-BETA/core/org/apache/lucene/search/similarities/IBSimilarity.html)
|
122
|
-
* a [language model with Bayesian smoothing using Dirichlet priors](http://lucene.apache.org/core/4_0_0-BETA/core/org/apache/lucene/search/similarities/LMDirichletSimilarity.html)
|
123
|
-
* a [language model with Jelinek-Mercer smoothing](http://lucene.apache.org/core/4_0_0-BETA/core/org/apache/lucene/search/similarities/LMJelinekMercerSimilarity.html)
|
124
|
-
|
125
|
-
Lucene can even [combine similarity measures](http://lucene.apache.org/core/4_0_0-BETA/core/org/apache/lucene/search/similarities/MultiSimilarity.html).
|
148
|
+
Lucene implements many more [similarity functions](http://lucene.apache.org/core/4_0_0/core/org/apache/lucene/search/similarities/Similarity.html), such as:
|
126
149
|
|
127
|
-
|
150
|
+
* a [divergence from randomness (DFR) framework](http://lucene.apache.org/core/4_0_0/core/org/apache/lucene/search/similarities/DFRSimilarity.html)
|
151
|
+
* a [framework for the family of information-based models](http://lucene.apache.org/core/4_0_0/core/org/apache/lucene/search/similarities/IBSimilarity.html)
|
152
|
+
* a [language model with Bayesian smoothing using Dirichlet priors](http://lucene.apache.org/core/4_0_0/core/org/apache/lucene/search/similarities/LMDirichletSimilarity.html)
|
153
|
+
* a [language model with Jelinek-Mercer smoothing](http://lucene.apache.org/core/4_0_0/core/org/apache/lucene/search/similarities/LMJelinekMercerSimilarity.html)
|
128
154
|
|
129
|
-
|
155
|
+
Lucene can even [combine similarity measures](http://lucene.apache.org/core/4_0_0/core/org/apache/lucene/search/similarities/MultiSimilarity.html).
|
130
156
|
|
131
|
-
Copyright (c) 2012
|
157
|
+
Copyright (c) 2012 James McKinney, released under the MIT license
|
data/lib/tf-idf-similarity.rb
CHANGED
@@ -1,12 +1,8 @@
|
|
1
1
|
require 'forwardable'
|
2
2
|
require 'set'
|
3
3
|
|
4
|
-
|
5
|
-
|
6
|
-
require 'unicode_utils/each_word'
|
7
|
-
rescue LoadError
|
8
|
-
# Ruby 1.8
|
9
|
-
end
|
4
|
+
require 'unicode_utils/downcase'
|
5
|
+
require 'unicode_utils/each_word'
|
10
6
|
|
11
7
|
module TfIdfSimilarity
|
12
8
|
end
|
@@ -1,7 +1,7 @@
|
|
1
1
|
# A document-term matrix using the BM25 function.
|
2
2
|
#
|
3
3
|
# @see http://lucene.apache.org/core/4_0_0-BETA/core/org/apache/lucene/search/similarities/BM25Similarity.html
|
4
|
-
# @see
|
4
|
+
# @see https://en.wikipedia.org/wiki/Okapi_BM25
|
5
5
|
module TfIdfSimilarity
|
6
6
|
class BM25Model < Model
|
7
7
|
# Return the term's inverse document frequency.
|
@@ -76,7 +76,7 @@ module TfIdfSimilarity
|
|
76
76
|
# @see http://unicode.org/reports/tr29/#Default_Word_Boundaries
|
77
77
|
# @see http://wiki.apache.org/solr/AnalyzersTokenizersTokenFilters#solr.StandardTokenizerFactory
|
78
78
|
def tokenize(text)
|
79
|
-
@tokens ||
|
79
|
+
@tokens || UnicodeUtils.each_word(text)
|
80
80
|
end
|
81
81
|
end
|
82
82
|
end
|
@@ -110,7 +110,7 @@ module TfIdfSimilarity
|
|
110
110
|
end
|
111
111
|
alias_method :binary_tf, :binary_term_frequency
|
112
112
|
|
113
|
-
# @see
|
113
|
+
# @see https://en.wikipedia.org/wiki/Tf*idf
|
114
114
|
# @see http://nlp.stanford.edu/IR-book/html/htmledition/maximum-tf-normalization-1.html
|
115
115
|
def normalized_term_frequency(document, term, a = 0)
|
116
116
|
a + (1 - a) * document.term_count(term) / document.maximum_term_count
|
@@ -9,7 +9,7 @@
|
|
9
9
|
# @see http://wiki.apache.org/solr/AnalyzersTokenizersTokenFilters#solr.WordDelimiterFilterFactory
|
10
10
|
# @see http://wiki.apache.org/solr/AnalyzersTokenizersTokenFilters#solr.SynonymFilterFactory
|
11
11
|
module TfIdfSimilarity
|
12
|
-
class Token <
|
12
|
+
class Token < SimpleDelegator
|
13
13
|
# Returns a falsy value if all its characters are numbers, punctuation,
|
14
14
|
# whitespace or control characters.
|
15
15
|
#
|
@@ -35,10 +35,7 @@ module TfIdfSimilarity
|
|
35
35
|
#
|
36
36
|
# @see http://wiki.apache.org/solr/AnalyzersTokenizersTokenFilters#solr.LowerCaseFilterFactory
|
37
37
|
def lowercase_filter
|
38
|
-
self.class.new(
|
39
|
-
"ÀÁÂÃÄÅĀĂĄÇĆĈĊČÐĎĐÈÉÊËĒĔĖĘĚĜĞĠĢĤĦÌÍÎÏĨĪĬĮĴĶĹĻĽĿŁÑŃŅŇŊÒÓÔÕÖØŌŎŐŔŖŘŚŜŞŠŢŤŦÙÚÛÜŨŪŬŮŰŲŴÝŶŸŹŻŽ",
|
40
|
-
"àáâãäåāăąçćĉċčðďđèéêëēĕėęěĝğġģĥħìíîïĩīĭįĵķĺļľŀłñńņňŋòóôõöøōŏőŕŗřśŝşšţťŧùúûüũūŭůűųŵýŷÿźżž"
|
41
|
-
).downcase)
|
38
|
+
self.class.new(UnicodeUtils.downcase(self))
|
42
39
|
end
|
43
40
|
|
44
41
|
# Returns a string with no English possessive or periods in acronyms.
|
@@ -47,7 +44,7 @@ module TfIdfSimilarity
|
|
47
44
|
#
|
48
45
|
# @see http://wiki.apache.org/solr/AnalyzersTokenizersTokenFilters#solr.ClassicFilterFactory
|
49
46
|
def classic_filter
|
50
|
-
self.class.new(self.gsub('.', '').
|
47
|
+
self.class.new(self.gsub('.', '').sub(/['`’]s\z/, ''))
|
51
48
|
end
|
52
49
|
end
|
53
50
|
end
|
@@ -33,7 +33,7 @@ module TfIdfSimilarity
|
|
33
33
|
build_model(documents)
|
34
34
|
end
|
35
35
|
|
36
|
-
|
36
|
+
skip "Add #search"
|
37
37
|
end
|
38
38
|
|
39
39
|
# @see https://github.com/bbcrd/Similarity/blob/master/test/test_corpus.rb
|
@@ -98,7 +98,7 @@ module TfIdfSimilarity
|
|
98
98
|
end
|
99
99
|
|
100
100
|
it 'should return the similarity matrix' do
|
101
|
-
|
101
|
+
skip "Calculate the tf*idf matrix like the similarity gem does"
|
102
102
|
end
|
103
103
|
|
104
104
|
it 'should return the number of documents in which a term appears' do
|
@@ -113,7 +113,7 @@ module TfIdfSimilarity
|
|
113
113
|
end
|
114
114
|
|
115
115
|
it 'should return the document vector' do
|
116
|
-
|
116
|
+
skip "Calculate the tf*idf matrix like the similarity gem does"
|
117
117
|
end
|
118
118
|
end
|
119
119
|
|
data/spec/spec_helper.rb
CHANGED
@@ -1,7 +1,11 @@
|
|
1
1
|
require 'rubygems'
|
2
2
|
|
3
|
+
require 'simplecov'
|
3
4
|
require 'coveralls'
|
4
|
-
Coveralls
|
5
|
+
SimpleCov.formatter = Coveralls::SimpleCov::Formatter
|
6
|
+
SimpleCov.start do
|
7
|
+
add_filter 'spec'
|
8
|
+
end
|
5
9
|
|
6
10
|
require 'rspec'
|
7
11
|
require File.dirname(__FILE__) + '/../lib/tf-idf-similarity'
|
data/spec/token_spec.rb
CHANGED
@@ -28,6 +28,14 @@ module TfIdfSimilarity
|
|
28
28
|
Token.new("foo's").classic_filter.should == 'foo'
|
29
29
|
end
|
30
30
|
|
31
|
+
it 'should remove ending possessives with nonstandard apostrophe 1' do
|
32
|
+
Token.new("foo`s").classic_filter.should == 'foo'
|
33
|
+
end
|
34
|
+
|
35
|
+
it 'should remove ending possessives with nonstandard apostrophe 2' do
|
36
|
+
Token.new("foo’s").classic_filter.should == 'foo'
|
37
|
+
end
|
38
|
+
|
31
39
|
it 'should not remove infix possessives' do
|
32
40
|
Token.new("foo's bar").classic_filter.should == "foo's bar"
|
33
41
|
end
|
data/td-idf-similarity.gemspec
CHANGED
@@ -5,9 +5,8 @@ Gem::Specification.new do |s|
|
|
5
5
|
s.name = "tf-idf-similarity"
|
6
6
|
s.version = TfIdfSimilarity::VERSION
|
7
7
|
s.platform = Gem::Platform::RUBY
|
8
|
-
s.authors = ["
|
9
|
-
s.
|
10
|
-
s.homepage = "http://github.com/opennorth/tf-idf-similarity"
|
8
|
+
s.authors = ["James McKinney"]
|
9
|
+
s.homepage = "https://github.com/jpmckinney/tf-idf-similarity"
|
11
10
|
s.summary = %q{Calculates the similarity between texts using tf*idf}
|
12
11
|
s.license = 'MIT'
|
13
12
|
|
@@ -16,10 +15,9 @@ Gem::Specification.new do |s|
|
|
16
15
|
s.executables = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
|
17
16
|
s.require_paths = ["lib"]
|
18
17
|
|
19
|
-
s.
|
20
|
-
s.add_development_dependency('rake')
|
21
|
-
s.add_development_dependency('coveralls')
|
22
|
-
s.add_development_dependency('mime-types', '~> 1.25') # 2.0 requires Ruby 1.9.2
|
18
|
+
s.add_runtime_dependency('unicode_utils', '~> 1.4')
|
23
19
|
|
24
|
-
s.
|
20
|
+
s.add_development_dependency('coveralls')
|
21
|
+
s.add_development_dependency('rake')
|
22
|
+
s.add_development_dependency('rspec', '~> 2.10')
|
25
23
|
end
|
metadata
CHANGED
@@ -1,31 +1,31 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: tf-idf-similarity
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.
|
4
|
+
version: 0.1.5
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
|
-
-
|
7
|
+
- James McKinney
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2016-01-18 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
|
-
name:
|
14
|
+
name: unicode_utils
|
15
15
|
requirement: !ruby/object:Gem::Requirement
|
16
16
|
requirements:
|
17
17
|
- - "~>"
|
18
18
|
- !ruby/object:Gem::Version
|
19
|
-
version: '
|
20
|
-
type: :
|
19
|
+
version: '1.4'
|
20
|
+
type: :runtime
|
21
21
|
prerelease: false
|
22
22
|
version_requirements: !ruby/object:Gem::Requirement
|
23
23
|
requirements:
|
24
24
|
- - "~>"
|
25
25
|
- !ruby/object:Gem::Version
|
26
|
-
version: '
|
26
|
+
version: '1.4'
|
27
27
|
- !ruby/object:Gem::Dependency
|
28
|
-
name:
|
28
|
+
name: coveralls
|
29
29
|
requirement: !ruby/object:Gem::Requirement
|
30
30
|
requirements:
|
31
31
|
- - ">="
|
@@ -39,7 +39,7 @@ dependencies:
|
|
39
39
|
- !ruby/object:Gem::Version
|
40
40
|
version: '0'
|
41
41
|
- !ruby/object:Gem::Dependency
|
42
|
-
name:
|
42
|
+
name: rake
|
43
43
|
requirement: !ruby/object:Gem::Requirement
|
44
44
|
requirements:
|
45
45
|
- - ">="
|
@@ -53,36 +53,33 @@ dependencies:
|
|
53
53
|
- !ruby/object:Gem::Version
|
54
54
|
version: '0'
|
55
55
|
- !ruby/object:Gem::Dependency
|
56
|
-
name:
|
56
|
+
name: rspec
|
57
57
|
requirement: !ruby/object:Gem::Requirement
|
58
58
|
requirements:
|
59
59
|
- - "~>"
|
60
60
|
- !ruby/object:Gem::Version
|
61
|
-
version: '
|
61
|
+
version: '2.10'
|
62
62
|
type: :development
|
63
63
|
prerelease: false
|
64
64
|
version_requirements: !ruby/object:Gem::Requirement
|
65
65
|
requirements:
|
66
66
|
- - "~>"
|
67
67
|
- !ruby/object:Gem::Version
|
68
|
-
version: '
|
68
|
+
version: '2.10'
|
69
69
|
description:
|
70
|
-
email:
|
71
|
-
- info@opennorth.ca
|
70
|
+
email:
|
72
71
|
executables: []
|
73
|
-
extensions:
|
74
|
-
- ext/mkrf_conf.rb
|
72
|
+
extensions: []
|
75
73
|
extra_rdoc_files: []
|
76
74
|
files:
|
77
75
|
- ".gitignore"
|
76
|
+
- ".rspec"
|
78
77
|
- ".travis.yml"
|
79
78
|
- ".yardopts"
|
80
79
|
- Gemfile
|
81
80
|
- LICENSE
|
82
81
|
- README.md
|
83
82
|
- Rakefile
|
84
|
-
- USAGE
|
85
|
-
- ext/mkrf_conf.rb
|
86
83
|
- lib/tf-idf-similarity.rb
|
87
84
|
- lib/tf-idf-similarity/bm25_model.rb
|
88
85
|
- lib/tf-idf-similarity/document.rb
|
@@ -102,7 +99,7 @@ files:
|
|
102
99
|
- spec/tf_idf_model_spec.rb
|
103
100
|
- spec/token_spec.rb
|
104
101
|
- td-idf-similarity.gemspec
|
105
|
-
homepage:
|
102
|
+
homepage: https://github.com/jpmckinney/tf-idf-similarity
|
106
103
|
licenses:
|
107
104
|
- MIT
|
108
105
|
metadata: {}
|
@@ -122,7 +119,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
122
119
|
version: '0'
|
123
120
|
requirements: []
|
124
121
|
rubyforge_project:
|
125
|
-
rubygems_version: 2.
|
122
|
+
rubygems_version: 2.4.5
|
126
123
|
signing_key:
|
127
124
|
specification_version: 4
|
128
125
|
summary: Calculates the similarity between texts using tf*idf
|
data/USAGE
DELETED
@@ -1 +0,0 @@
|
|
1
|
-
See README.md for full usage details.
|
data/ext/mkrf_conf.rb
DELETED
@@ -1,15 +0,0 @@
|
|
1
|
-
# @see http://www.programmersparadox.com/2012/05/21/gemspec-loading-dependent-gems-based-on-the-users-system/
|
2
|
-
require 'rubygems/dependency_installer.rb'
|
3
|
-
|
4
|
-
installer = Gem::DependencyInstaller.new
|
5
|
-
begin
|
6
|
-
unless RUBY_VERSION < '1.9'
|
7
|
-
installer.install('unicode_utils', '>=0')
|
8
|
-
end
|
9
|
-
rescue
|
10
|
-
exit(1)
|
11
|
-
end
|
12
|
-
|
13
|
-
f = File.open(File.join(File.dirname(__FILE__), "Rakefile"), "w")
|
14
|
-
f.write("task :default\n")
|
15
|
-
f.close
|