tf-idf-similarity 0.1.4 → 0.1.5
This diff shows the changes between two publicly available versions of a package that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rspec +2 -0
- data/.travis.yml +13 -9
- data/.yardopts +0 -1
- data/Gemfile +3 -3
- data/LICENSE +1 -1
- data/README.md +49 -23
- data/lib/tf-idf-similarity.rb +2 -6
- data/lib/tf-idf-similarity/bm25_model.rb +1 -1
- data/lib/tf-idf-similarity/document.rb +1 -1
- data/lib/tf-idf-similarity/extras/tf_idf_model.rb +1 -1
- data/lib/tf-idf-similarity/matrix_methods.rb +1 -1
- data/lib/tf-idf-similarity/token.rb +3 -6
- data/lib/tf-idf-similarity/version.rb +1 -1
- data/spec/extras/tf_idf_model_spec.rb +3 -3
- data/spec/spec_helper.rb +5 -1
- data/spec/token_spec.rb +8 -0
- data/td-idf-similarity.gemspec +6 -8
- metadata +17 -20
- data/USAGE +0 -1
- data/ext/mkrf_conf.rb +0 -15
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 736ca4c4b93d14ea046cbc4bdae930c8b88082be
|
4
|
+
data.tar.gz: 6b43e8356c59e0f48ac08f300186d4e12497368d
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 635ea3047ba54a951020f95ab7e9412adf07a39d6042b85e605fcd0517345d506690bac11ab05f7f20e16f80106e95e8002fd1ae2ab4e466a27cc4f143ac15d6
|
7
|
+
data.tar.gz: 693ac6c70f9daf3f0a1ed06ba693d170654e7c871702641d598e59a6fc69cbd5316e76441da062aca05d0b8f67c0ba0c958a4115e2c9da17316a2ebef2190738
|
data/.rspec
ADDED
data/.travis.yml
CHANGED
@@ -1,21 +1,25 @@
|
|
1
|
+
sudo: false
|
1
2
|
language: ruby
|
3
|
+
cache: bundler
|
2
4
|
rvm:
|
3
|
-
- 1.9.2
|
4
5
|
- 1.9.3
|
5
6
|
- 2.0.0
|
6
7
|
- 2.1.0
|
8
|
+
- 2.2.0
|
7
9
|
env:
|
8
10
|
- MATRIX_LIBRARY=gsl
|
9
11
|
- MATRIX_LIBRARY=narray
|
10
12
|
- MATRIX_LIBRARY=nmatrix
|
11
13
|
- MATRIX_LIBRARY=matrix
|
14
|
+
addons:
|
15
|
+
apt:
|
16
|
+
packages:
|
17
|
+
- gsl-bin
|
18
|
+
- libgsl0-dev
|
19
|
+
# Installing ATLAS will install BLAS.
|
20
|
+
- libatlas-dev
|
21
|
+
- libatlas-base-dev
|
22
|
+
- libatlas3gf-base
|
12
23
|
before_install:
|
13
24
|
- bundle config build.nmatrix --with-lapacklib
|
14
|
-
-
|
15
|
-
- if [ $MATRIX_LIBRARY = 'gsl' ]; then sudo apt-get install gsl-bin libgsl0-dev; fi
|
16
|
-
# Installing ATLAS will install BLAS.
|
17
|
-
- if [ $MATRIX_LIBRARY = 'nmatrix' ]; then sudo apt-get install -qq libatlas-dev libatlas-base-dev libatlas3gf-base; fi
|
18
|
-
- if [ $MATRIX_LIBRARY = 'nmatrix' ]; then export CPLUS_INCLUDE_PATH=$CPLUS_INCLUDE_PATH:/usr/include/atlas; fi
|
19
|
-
# Travis sometimes runs without Bundler.
|
20
|
-
install: bundle
|
21
|
-
script: bundle exec rake --trace
|
25
|
+
- export CPLUS_INCLUDE_PATH=$CPLUS_INCLUDE_PATH:/usr/include/atlas
|
data/.yardopts
CHANGED
data/Gemfile
CHANGED
@@ -1,8 +1,8 @@
|
|
1
|
-
source
|
1
|
+
source 'https://rubygems.org'
|
2
2
|
|
3
|
-
gem 'rb-gsl', '~> 1.16.0.2'
|
3
|
+
gem 'rb-gsl', '~> 1.16.0.2' if ENV['MATRIX_LIBRARY'] == 'gsl'
|
4
4
|
gem 'narray', '~> 0.6.0.0' if ENV['MATRIX_LIBRARY'] == 'narray'
|
5
|
-
gem 'nmatrix', '~> 0.1.0.rc5'
|
5
|
+
gem 'nmatrix', '~> 0.1.0.rc5' if ENV['MATRIX_LIBRARY'] == 'nmatrix' && RUBY_VERSION >= '1.9'
|
6
6
|
|
7
7
|
# Specify your gem's dependencies in the gemspec
|
8
8
|
gemspec
|
data/LICENSE
CHANGED
data/README.md
CHANGED
@@ -1,12 +1,12 @@
|
|
1
1
|
# Ruby Vector Space Model (VSM) with tf*idf weights
|
2
2
|
|
3
|
-
[](
|
4
|
-
[](https://badge.fury.io/rb/tf-idf-similarity)
|
4
|
+
[](https://travis-ci.org/jpmckinney/tf-idf-similarity)
|
5
|
+
[](https://gemnasium.com/jpmckinney/tf-idf-similarity)
|
6
|
+
[](https://coveralls.io/r/jpmckinney/tf-idf-similarity)
|
7
|
+
[](https://codeclimate.com/github/jpmckinney/tf-idf-similarity)
|
8
8
|
|
9
|
-
Calculates the similarity between texts using a [bag-of-words](
|
9
|
+
Calculates the similarity between texts using a [bag-of-words](https://en.wikipedia.org/wiki/Bag_of_words_model) [Vector Space Model](https://en.wikipedia.org/wiki/Vector_space_model) with [Term Frequency-Inverse Document Frequency (tf*idf)](https://en.wikipedia.org/wiki/Tf–idf) weights. If your use case demands performance, use [Lucene](http://lucene.apache.org/core/) (see below).
|
10
10
|
|
11
11
|
## Usage
|
12
12
|
|
@@ -24,13 +24,13 @@ document3 = TfIdfSimilarity::Document.new("Nam scelerisque dui sed leo...")
|
|
24
24
|
corpus = [document1, document2, document3]
|
25
25
|
```
|
26
26
|
|
27
|
-
Create a document-term matrix using [Term Frequency-Inverse Document Frequency function](
|
27
|
+
Create a document-term matrix using [Term Frequency-Inverse Document Frequency function](https://en.wikipedia.org/wiki/Tf–idf):
|
28
28
|
|
29
29
|
```ruby
|
30
30
|
model = TfIdfSimilarity::TfIdfModel.new(corpus)
|
31
31
|
```
|
32
32
|
|
33
|
-
Or, create a document-term matrix using the [Okapi BM25 ranking function](
|
33
|
+
Or, create a document-term matrix using the [Okapi BM25 ranking function](https://en.wikipedia.org/wiki/Okapi_BM25):
|
34
34
|
|
35
35
|
```ruby
|
36
36
|
model = TfIdfSimilarity::BM25Model.new(corpus)
|
@@ -58,16 +58,46 @@ end
|
|
58
58
|
puts tfidf_by_term.sort_by{|_,tfidf| -tfidf}
|
59
59
|
```
|
60
60
|
|
61
|
+
Tokenize a document yourself, for example by excluding stop words:
|
62
|
+
|
63
|
+
```ruby
|
64
|
+
require 'unicode_utils'
|
65
|
+
text = "Lorem ipsum dolor sit amet..."
|
66
|
+
tokens = UnicodeUtils.each_word(text).to_a - ['and', 'the', 'to']
|
67
|
+
document1 = TfIdfSimilarity::Document.new(text, :tokens => tokens)
|
68
|
+
```
|
69
|
+
|
70
|
+
Or supply the term counts yourself, providing the number of times each term appears and the number of tokens in the document:
|
71
|
+
|
72
|
+
```ruby
|
73
|
+
require 'unicode_utils'
|
74
|
+
text = "Lorem ipsum dolor sit amet..."
|
75
|
+
tokens = UnicodeUtils.each_word(text).to_a - ['and', 'the', 'to']
|
76
|
+
term_counts = Hash.new(0)
|
77
|
+
size = 0
|
78
|
+
tokens.each do |token|
|
79
|
+
# Unless the token is numeric.
|
80
|
+
unless token[/\A\d+\z/]
|
81
|
+
# Remove all punctuation from tokens.
|
82
|
+
term_counts[token.gsub(/\p{Punct}/, '')] += 1
|
83
|
+
size += 1
|
84
|
+
end
|
85
|
+
end
|
86
|
+
document1 = TfIdfSimilarity::Document.new(text, :term_counts => term_counts, :size => size)
|
87
|
+
```
|
88
|
+
|
61
89
|
[Read the documentation at RubyDoc.info.](http://rubydoc.info/gems/tf-idf-similarity)
|
62
90
|
|
63
91
|
## Speed
|
64
92
|
|
65
93
|
Instead of using the Ruby Standard Library's [Matrix](http://www.ruby-doc.org/stdlib-2.0/libdoc/matrix/rdoc/Matrix.html) class, you can use one of the [GNU Scientific Library (GSL)](http://www.gnu.org/software/gsl/), [NArray](http://narray.rubyforge.org/) or [NMatrix](https://github.com/SciRuby/nmatrix) (0.0.9 or greater) gems for faster matrix operations. For example:
|
66
94
|
|
67
|
-
require '
|
68
|
-
model = TfIdfSimilarity::TfIdfModel.new(corpus, :library => :
|
95
|
+
require 'narray'
|
96
|
+
model = TfIdfSimilarity::TfIdfModel.new(corpus, :library => :narray)
|
69
97
|
|
70
|
-
|
98
|
+
NArray seems to have the best performance of the three libraries.
|
99
|
+
|
100
|
+
The NMatrix gem gives access to [Automatically Tuned Linear Algebra Software (ATLAS)](http://math-atlas.sourceforge.net/), which you may know of through [Linear Algebra PACKage (LAPACK)](http://www.netlib.org/lapack/) or [Basic Linear Algebra Subprograms (BLAS)](http://www.netlib.org/blas/). Follow [these instructions](https://github.com/SciRuby/nmatrix#installation) to install the NMatrix gem.
|
71
101
|
|
72
102
|
## Extras
|
73
103
|
|
@@ -76,7 +106,7 @@ You can access more term frequency, document frequency, and normalization formul
|
|
76
106
|
require 'tf-idf-similarity/extras/document'
|
77
107
|
require 'tf-idf-similarity/extras/tf_idf_model'
|
78
108
|
|
79
|
-
The default tf*idf formula follows the [Lucene Conceptual Scoring Formula](http://lucene.apache.org/core/4_0_0
|
109
|
+
The default tf*idf formula follows the [Lucene Conceptual Scoring Formula](http://lucene.apache.org/core/4_0_0/core/org/apache/lucene/search/similarities/TFIDFSimilarity.html).
|
80
110
|
|
81
111
|
## Why?
|
82
112
|
|
@@ -115,17 +145,13 @@ Adapters for the following projects were also considered:
|
|
115
145
|
|
116
146
|
## Further Reading
|
117
147
|
|
118
|
-
Lucene implements many more [similarity functions](http://lucene.apache.org/core/4_0_0
|
119
|
-
|
120
|
-
* a [divergence from randomness (DFR) framework](http://lucene.apache.org/core/4_0_0-BETA/core/org/apache/lucene/search/similarities/DFRSimilarity.html)
|
121
|
-
* a [framework for the family of information-based models](http://lucene.apache.org/core/4_0_0-BETA/core/org/apache/lucene/search/similarities/IBSimilarity.html)
|
122
|
-
* a [language model with Bayesian smoothing using Dirichlet priors](http://lucene.apache.org/core/4_0_0-BETA/core/org/apache/lucene/search/similarities/LMDirichletSimilarity.html)
|
123
|
-
* a [language model with Jelinek-Mercer smoothing](http://lucene.apache.org/core/4_0_0-BETA/core/org/apache/lucene/search/similarities/LMJelinekMercerSimilarity.html)
|
124
|
-
|
125
|
-
Lucene can even [combine similarity measures](http://lucene.apache.org/core/4_0_0-BETA/core/org/apache/lucene/search/similarities/MultiSimilarity.html).
|
148
|
+
Lucene implements many more [similarity functions](http://lucene.apache.org/core/4_0_0/core/org/apache/lucene/search/similarities/Similarity.html), such as:
|
126
149
|
|
127
|
-
|
150
|
+
* a [divergence from randomness (DFR) framework](http://lucene.apache.org/core/4_0_0/core/org/apache/lucene/search/similarities/DFRSimilarity.html)
|
151
|
+
* a [framework for the family of information-based models](http://lucene.apache.org/core/4_0_0/core/org/apache/lucene/search/similarities/IBSimilarity.html)
|
152
|
+
* a [language model with Bayesian smoothing using Dirichlet priors](http://lucene.apache.org/core/4_0_0/core/org/apache/lucene/search/similarities/LMDirichletSimilarity.html)
|
153
|
+
* a [language model with Jelinek-Mercer smoothing](http://lucene.apache.org/core/4_0_0/core/org/apache/lucene/search/similarities/LMJelinekMercerSimilarity.html)
|
128
154
|
|
129
|
-
|
155
|
+
Lucene can even [combine similarity measures](http://lucene.apache.org/core/4_0_0/core/org/apache/lucene/search/similarities/MultiSimilarity.html).
|
130
156
|
|
131
|
-
Copyright (c) 2012
|
157
|
+
Copyright (c) 2012 James McKinney, released under the MIT license
|
data/lib/tf-idf-similarity.rb
CHANGED
@@ -1,12 +1,8 @@
|
|
1
1
|
require 'forwardable'
|
2
2
|
require 'set'
|
3
3
|
|
4
|
-
|
5
|
-
|
6
|
-
require 'unicode_utils/each_word'
|
7
|
-
rescue LoadError
|
8
|
-
# Ruby 1.8
|
9
|
-
end
|
4
|
+
require 'unicode_utils/downcase'
|
5
|
+
require 'unicode_utils/each_word'
|
10
6
|
|
11
7
|
module TfIdfSimilarity
|
12
8
|
end
|
@@ -1,7 +1,7 @@
|
|
1
1
|
# A document-term matrix using the BM25 function.
|
2
2
|
#
|
3
3
|
# @see http://lucene.apache.org/core/4_0_0-BETA/core/org/apache/lucene/search/similarities/BM25Similarity.html
|
4
|
-
# @see
|
4
|
+
# @see https://en.wikipedia.org/wiki/Okapi_BM25
|
5
5
|
module TfIdfSimilarity
|
6
6
|
class BM25Model < Model
|
7
7
|
# Return the term's inverse document frequency.
|
@@ -76,7 +76,7 @@ module TfIdfSimilarity
|
|
76
76
|
# @see http://unicode.org/reports/tr29/#Default_Word_Boundaries
|
77
77
|
# @see http://wiki.apache.org/solr/AnalyzersTokenizersTokenFilters#solr.StandardTokenizerFactory
|
78
78
|
def tokenize(text)
|
79
|
-
@tokens ||
|
79
|
+
@tokens || UnicodeUtils.each_word(text)
|
80
80
|
end
|
81
81
|
end
|
82
82
|
end
|
@@ -110,7 +110,7 @@ module TfIdfSimilarity
|
|
110
110
|
end
|
111
111
|
alias_method :binary_tf, :binary_term_frequency
|
112
112
|
|
113
|
-
# @see
|
113
|
+
# @see https://en.wikipedia.org/wiki/Tf*idf
|
114
114
|
# @see http://nlp.stanford.edu/IR-book/html/htmledition/maximum-tf-normalization-1.html
|
115
115
|
def normalized_term_frequency(document, term, a = 0)
|
116
116
|
a + (1 - a) * document.term_count(term) / document.maximum_term_count
|
@@ -9,7 +9,7 @@
|
|
9
9
|
# @see http://wiki.apache.org/solr/AnalyzersTokenizersTokenFilters#solr.WordDelimiterFilterFactory
|
10
10
|
# @see http://wiki.apache.org/solr/AnalyzersTokenizersTokenFilters#solr.SynonymFilterFactory
|
11
11
|
module TfIdfSimilarity
|
12
|
-
class Token <
|
12
|
+
class Token < SimpleDelegator
|
13
13
|
# Returns a falsy value if all its characters are numbers, punctuation,
|
14
14
|
# whitespace or control characters.
|
15
15
|
#
|
@@ -35,10 +35,7 @@ module TfIdfSimilarity
|
|
35
35
|
#
|
36
36
|
# @see http://wiki.apache.org/solr/AnalyzersTokenizersTokenFilters#solr.LowerCaseFilterFactory
|
37
37
|
def lowercase_filter
|
38
|
-
self.class.new(
|
39
|
-
"ÀÁÂÃÄÅĀĂĄÇĆĈĊČÐĎĐÈÉÊËĒĔĖĘĚĜĞĠĢĤĦÌÍÎÏĨĪĬĮĴĶĹĻĽĿŁÑŃŅŇŊÒÓÔÕÖØŌŎŐŔŖŘŚŜŞŠŢŤŦÙÚÛÜŨŪŬŮŰŲŴÝŶŸŹŻŽ",
|
40
|
-
"àáâãäåāăąçćĉċčðďđèéêëēĕėęěĝğġģĥħìíîïĩīĭįĵķĺļľŀłñńņňŋòóôõöøōŏőŕŗřśŝşšţťŧùúûüũūŭůűųŵýŷÿźżž"
|
41
|
-
).downcase)
|
38
|
+
self.class.new(UnicodeUtils.downcase(self))
|
42
39
|
end
|
43
40
|
|
44
41
|
# Returns a string with no English possessive or periods in acronyms.
|
@@ -47,7 +44,7 @@ module TfIdfSimilarity
|
|
47
44
|
#
|
48
45
|
# @see http://wiki.apache.org/solr/AnalyzersTokenizersTokenFilters#solr.ClassicFilterFactory
|
49
46
|
def classic_filter
|
50
|
-
self.class.new(self.gsub('.', '').
|
47
|
+
self.class.new(self.gsub('.', '').sub(/['`’]s\z/, ''))
|
51
48
|
end
|
52
49
|
end
|
53
50
|
end
|
@@ -33,7 +33,7 @@ module TfIdfSimilarity
|
|
33
33
|
build_model(documents)
|
34
34
|
end
|
35
35
|
|
36
|
-
|
36
|
+
skip "Add #search"
|
37
37
|
end
|
38
38
|
|
39
39
|
# @see https://github.com/bbcrd/Similarity/blob/master/test/test_corpus.rb
|
@@ -98,7 +98,7 @@ module TfIdfSimilarity
|
|
98
98
|
end
|
99
99
|
|
100
100
|
it 'should return the similarity matrix' do
|
101
|
-
|
101
|
+
skip "Calculate the tf*idf matrix like the similarity gem does"
|
102
102
|
end
|
103
103
|
|
104
104
|
it 'should return the number of documents in which a term appears' do
|
@@ -113,7 +113,7 @@ module TfIdfSimilarity
|
|
113
113
|
end
|
114
114
|
|
115
115
|
it 'should return the document vector' do
|
116
|
-
|
116
|
+
skip "Calculate the tf*idf matrix like the similarity gem does"
|
117
117
|
end
|
118
118
|
end
|
119
119
|
|
data/spec/spec_helper.rb
CHANGED
@@ -1,7 +1,11 @@
|
|
1
1
|
require 'rubygems'
|
2
2
|
|
3
|
+
require 'simplecov'
|
3
4
|
require 'coveralls'
|
4
|
-
Coveralls
|
5
|
+
SimpleCov.formatter = Coveralls::SimpleCov::Formatter
|
6
|
+
SimpleCov.start do
|
7
|
+
add_filter 'spec'
|
8
|
+
end
|
5
9
|
|
6
10
|
require 'rspec'
|
7
11
|
require File.dirname(__FILE__) + '/../lib/tf-idf-similarity'
|
data/spec/token_spec.rb
CHANGED
@@ -28,6 +28,14 @@ module TfIdfSimilarity
|
|
28
28
|
Token.new("foo's").classic_filter.should == 'foo'
|
29
29
|
end
|
30
30
|
|
31
|
+
it 'should remove ending possessives with nonstandard apostrophe 1' do
|
32
|
+
Token.new("foo`s").classic_filter.should == 'foo'
|
33
|
+
end
|
34
|
+
|
35
|
+
it 'should remove ending possessives with nonstandard apostrophe 2' do
|
36
|
+
Token.new("foo’s").classic_filter.should == 'foo'
|
37
|
+
end
|
38
|
+
|
31
39
|
it 'should not remove infix possessives' do
|
32
40
|
Token.new("foo's bar").classic_filter.should == "foo's bar"
|
33
41
|
end
|
data/td-idf-similarity.gemspec
CHANGED
@@ -5,9 +5,8 @@ Gem::Specification.new do |s|
|
|
5
5
|
s.name = "tf-idf-similarity"
|
6
6
|
s.version = TfIdfSimilarity::VERSION
|
7
7
|
s.platform = Gem::Platform::RUBY
|
8
|
-
s.authors = ["
|
9
|
-
s.
|
10
|
-
s.homepage = "http://github.com/opennorth/tf-idf-similarity"
|
8
|
+
s.authors = ["James McKinney"]
|
9
|
+
s.homepage = "https://github.com/jpmckinney/tf-idf-similarity"
|
11
10
|
s.summary = %q{Calculates the similarity between texts using tf*idf}
|
12
11
|
s.license = 'MIT'
|
13
12
|
|
@@ -16,10 +15,9 @@ Gem::Specification.new do |s|
|
|
16
15
|
s.executables = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
|
17
16
|
s.require_paths = ["lib"]
|
18
17
|
|
19
|
-
s.
|
20
|
-
s.add_development_dependency('rake')
|
21
|
-
s.add_development_dependency('coveralls')
|
22
|
-
s.add_development_dependency('mime-types', '~> 1.25') # 2.0 requires Ruby 1.9.2
|
18
|
+
s.add_runtime_dependency('unicode_utils', '~> 1.4')
|
23
19
|
|
24
|
-
s.
|
20
|
+
s.add_development_dependency('coveralls')
|
21
|
+
s.add_development_dependency('rake')
|
22
|
+
s.add_development_dependency('rspec', '~> 2.10')
|
25
23
|
end
|
metadata
CHANGED
@@ -1,31 +1,31 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: tf-idf-similarity
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.
|
4
|
+
version: 0.1.5
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
|
-
-
|
7
|
+
- James McKinney
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2016-01-18 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
|
-
name:
|
14
|
+
name: unicode_utils
|
15
15
|
requirement: !ruby/object:Gem::Requirement
|
16
16
|
requirements:
|
17
17
|
- - "~>"
|
18
18
|
- !ruby/object:Gem::Version
|
19
|
-
version: '
|
20
|
-
type: :
|
19
|
+
version: '1.4'
|
20
|
+
type: :runtime
|
21
21
|
prerelease: false
|
22
22
|
version_requirements: !ruby/object:Gem::Requirement
|
23
23
|
requirements:
|
24
24
|
- - "~>"
|
25
25
|
- !ruby/object:Gem::Version
|
26
|
-
version: '
|
26
|
+
version: '1.4'
|
27
27
|
- !ruby/object:Gem::Dependency
|
28
|
-
name:
|
28
|
+
name: coveralls
|
29
29
|
requirement: !ruby/object:Gem::Requirement
|
30
30
|
requirements:
|
31
31
|
- - ">="
|
@@ -39,7 +39,7 @@ dependencies:
|
|
39
39
|
- !ruby/object:Gem::Version
|
40
40
|
version: '0'
|
41
41
|
- !ruby/object:Gem::Dependency
|
42
|
-
name:
|
42
|
+
name: rake
|
43
43
|
requirement: !ruby/object:Gem::Requirement
|
44
44
|
requirements:
|
45
45
|
- - ">="
|
@@ -53,36 +53,33 @@ dependencies:
|
|
53
53
|
- !ruby/object:Gem::Version
|
54
54
|
version: '0'
|
55
55
|
- !ruby/object:Gem::Dependency
|
56
|
-
name:
|
56
|
+
name: rspec
|
57
57
|
requirement: !ruby/object:Gem::Requirement
|
58
58
|
requirements:
|
59
59
|
- - "~>"
|
60
60
|
- !ruby/object:Gem::Version
|
61
|
-
version: '
|
61
|
+
version: '2.10'
|
62
62
|
type: :development
|
63
63
|
prerelease: false
|
64
64
|
version_requirements: !ruby/object:Gem::Requirement
|
65
65
|
requirements:
|
66
66
|
- - "~>"
|
67
67
|
- !ruby/object:Gem::Version
|
68
|
-
version: '
|
68
|
+
version: '2.10'
|
69
69
|
description:
|
70
|
-
email:
|
71
|
-
- info@opennorth.ca
|
70
|
+
email:
|
72
71
|
executables: []
|
73
|
-
extensions:
|
74
|
-
- ext/mkrf_conf.rb
|
72
|
+
extensions: []
|
75
73
|
extra_rdoc_files: []
|
76
74
|
files:
|
77
75
|
- ".gitignore"
|
76
|
+
- ".rspec"
|
78
77
|
- ".travis.yml"
|
79
78
|
- ".yardopts"
|
80
79
|
- Gemfile
|
81
80
|
- LICENSE
|
82
81
|
- README.md
|
83
82
|
- Rakefile
|
84
|
-
- USAGE
|
85
|
-
- ext/mkrf_conf.rb
|
86
83
|
- lib/tf-idf-similarity.rb
|
87
84
|
- lib/tf-idf-similarity/bm25_model.rb
|
88
85
|
- lib/tf-idf-similarity/document.rb
|
@@ -102,7 +99,7 @@ files:
|
|
102
99
|
- spec/tf_idf_model_spec.rb
|
103
100
|
- spec/token_spec.rb
|
104
101
|
- td-idf-similarity.gemspec
|
105
|
-
homepage:
|
102
|
+
homepage: https://github.com/jpmckinney/tf-idf-similarity
|
106
103
|
licenses:
|
107
104
|
- MIT
|
108
105
|
metadata: {}
|
@@ -122,7 +119,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
122
119
|
version: '0'
|
123
120
|
requirements: []
|
124
121
|
rubyforge_project:
|
125
|
-
rubygems_version: 2.
|
122
|
+
rubygems_version: 2.4.5
|
126
123
|
signing_key:
|
127
124
|
specification_version: 4
|
128
125
|
summary: Calculates the similarity between texts using tf*idf
|
data/USAGE
DELETED
@@ -1 +0,0 @@
|
|
1
|
-
See README.md for full usage details.
|
data/ext/mkrf_conf.rb
DELETED
@@ -1,15 +0,0 @@
|
|
1
|
-
# @see http://www.programmersparadox.com/2012/05/21/gemspec-loading-dependent-gems-based-on-the-users-system/
|
2
|
-
require 'rubygems/dependency_installer.rb'
|
3
|
-
|
4
|
-
installer = Gem::DependencyInstaller.new
|
5
|
-
begin
|
6
|
-
unless RUBY_VERSION < '1.9'
|
7
|
-
installer.install('unicode_utils', '>=0')
|
8
|
-
end
|
9
|
-
rescue
|
10
|
-
exit(1)
|
11
|
-
end
|
12
|
-
|
13
|
-
f = File.open(File.join(File.dirname(__FILE__), "Rakefile"), "w")
|
14
|
-
f.write("task :default\n")
|
15
|
-
f.close
|