tf-idf-similarity 0.1.0 → 0.1.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.travis.yml +0 -4
- data/Gemfile +1 -1
- data/README.md +14 -22
- data/lib/tf-idf-similarity.rb +1 -0
- data/lib/tf-idf-similarity/bm25_model.rb +69 -0
- data/lib/tf-idf-similarity/matrix_methods.rb +4 -4
- data/lib/tf-idf-similarity/term_count_model.rb +1 -5
- data/lib/tf-idf-similarity/tf_idf_model.rb +4 -19
- data/lib/tf-idf-similarity/token.rb +10 -23
- data/lib/tf-idf-similarity/version.rb +1 -1
- data/td-idf-similarity.gemspec +2 -0
- metadata +88 -81
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: bcd5811852a4ec0e65c55ac09854024c1256483b
|
4
|
+
data.tar.gz: 9978891aa4e76e8badec85da898b575ed91098cc
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 3219126f9ea91f3d2bfb8db0954d6637f090d4c75bf5f7c57c8208b0a812c1d8e2fd488d438cedd5e188a6a169ff7b9bf1470e82146085bc2bdf7298de1572fb
|
7
|
+
data.tar.gz: d523d3e77ab1cfd31ddc4e6b7f429726b4d6b3520377e3504d8b0c2bc7a51b11dcaa55991ed19ff497ae75e993ce49aef34dcef619f066e7a7c6ac9e7ee1aa35
|
data/.travis.yml
CHANGED
@@ -23,10 +23,6 @@ before_install:
|
|
23
23
|
# Installing ATLAS will install BLAS.
|
24
24
|
- if [ $MATRIX_LIBRARY = 'nmatrix' ]; then sudo apt-get install -qq libatlas-dev libatlas-base-dev libatlas3gf-base; fi
|
25
25
|
- if [ $MATRIX_LIBRARY = 'nmatrix' ]; then export CPLUS_INCLUDE_PATH=$CPLUS_INCLUDE_PATH:/usr/include/atlas; fi
|
26
|
-
- if [ $MATRIX_LIBRARY = 'nmatrix' ]; then git clone git://github.com/SciRuby/nmatrix.git; fi
|
27
|
-
- if [ $MATRIX_LIBRARY = 'nmatrix' ]; then cd nmatrix && ORIGINAL_BUNDLE_GEMFILE=$BUNDLE_GEMFILE; fi
|
28
|
-
- if [ $MATRIX_LIBRARY = 'nmatrix' ]; then BUNDLE_GEMFILE=`pwd`/Gemfile && bundle && bundle exec rake install; fi
|
29
|
-
- if [ $MATRIX_LIBRARY = 'nmatrix' ]; then cd .. && BUNDLE_GEMFILE=$ORIGINAL_BUNDLE_GEMFILE; fi
|
30
26
|
# Travis sometimes runs without Bundler.
|
31
27
|
install: bundle
|
32
28
|
script: bundle exec rake --trace
|
data/Gemfile
CHANGED
@@ -2,7 +2,7 @@ source "http://rubygems.org"
|
|
2
2
|
|
3
3
|
gem 'gsl', '~> 1.15.3' if ENV['MATRIX_LIBRARY'] == 'gsl'
|
4
4
|
gem 'narray', '~> 0.6.0.0' if ENV['MATRIX_LIBRARY'] == 'narray'
|
5
|
-
gem 'nmatrix',
|
5
|
+
gem 'nmatrix', '~> 0.0.9' if ENV['MATRIX_LIBRARY'] == 'nmatrix' && RUBY_VERSION >= '1.9'
|
6
6
|
|
7
7
|
# Specify your gem's dependencies in the gemspec
|
8
8
|
gemspec
|
data/README.md
CHANGED
@@ -5,8 +5,7 @@
|
|
5
5
|
[![Coverage Status](https://coveralls.io/repos/opennorth/tf-idf-similarity/badge.png?branch=master)](https://coveralls.io/r/opennorth/tf-idf-similarity)
|
6
6
|
[![Code Climate](https://codeclimate.com/github/opennorth/tf-idf-similarity.png)](https://codeclimate.com/github/opennorth/tf-idf-similarity)
|
7
7
|
|
8
|
-
Calculates the similarity between texts using a [bag-of-words](http://en.wikipedia.org/wiki/Bag_of_words_model) [Vector Space Model](http://en.wikipedia.org/wiki/Vector_space_model) with [Term Frequency-Inverse Document Frequency (tf*idf)](http://en.wikipedia.org/wiki/
|
9
|
-
) weights. If your use case demands performance, use [Lucene](http://lucene.apache.org/core/) or similar (see below).
|
8
|
+
Calculates the similarity between texts using a [bag-of-words](http://en.wikipedia.org/wiki/Bag_of_words_model) [Vector Space Model](http://en.wikipedia.org/wiki/Vector_space_model) with [Term Frequency-Inverse Document Frequency (tf*idf)](http://en.wikipedia.org/wiki/Tf*idf) weights. If your use case demands performance, use [Lucene](http://lucene.apache.org/core/) (see below).
|
10
9
|
|
11
10
|
## Usage
|
12
11
|
|
@@ -20,34 +19,24 @@ Create a set of documents:
|
|
20
19
|
corpus << TfIdfSimilarity::Document.new("Pellentesque sed ipsum dui...")
|
21
20
|
corpus << TfIdfSimilarity::Document.new("Nam scelerisque dui sed leo...")
|
22
21
|
|
23
|
-
Create a document-term matrix using [Term Frequency-Inverse Document Frequency function](http://en.wikipedia.org/wiki/)
|
22
|
+
Create a document-term matrix using [Term Frequency-Inverse Document Frequency function](http://en.wikipedia.org/wiki/Tf*idf):
|
24
23
|
|
25
|
-
model = TfIdfSimilarity::TfIdfModel(corpus
|
24
|
+
model = TfIdfSimilarity::TfIdfModel.new(corpus)
|
26
25
|
|
27
26
|
Create a document-term matrix using the [Okapi BM25 ranking function](http://en.wikipedia.org/wiki/Okapi_BM25):
|
28
27
|
|
29
|
-
model = TfIdfSimilarity::
|
28
|
+
model = TfIdfSimilarity::BM25Model.new(corpus)
|
30
29
|
|
31
30
|
[Read the documentation at RubyDoc.info.](http://rubydoc.info/gems/tf-idf-similarity)
|
32
31
|
|
33
32
|
## Speed
|
34
33
|
|
35
|
-
Instead of using the Ruby Standard Library's [Matrix](http://www.ruby-doc.org/stdlib-2.0/libdoc/matrix/rdoc/Matrix.html) class, you can use one of the
|
34
|
+
Instead of using the Ruby Standard Library's [Matrix](http://www.ruby-doc.org/stdlib-2.0/libdoc/matrix/rdoc/Matrix.html) class, you can use one of the [GNU Scientific Library (GSL)](http://www.gnu.org/software/gsl/), [NArray](http://narray.rubyforge.org/) or [NMatrix](https://github.com/SciRuby/nmatrix) (0.0.9 or greater) gems for faster matrix operations. For example:
|
36
35
|
|
37
36
|
require 'gsl'
|
38
|
-
model = TfIdfSimilarity::TfIdfModel(corpus, :library => :gsl)
|
37
|
+
model = TfIdfSimilarity::TfIdfModel.new(corpus, :library => :gsl)
|
39
38
|
|
40
|
-
|
41
|
-
|
42
|
-
gem install gsl
|
43
|
-
|
44
|
-
### [NArray](http://narray.rubyforge.org/)
|
45
|
-
|
46
|
-
gem install narray
|
47
|
-
|
48
|
-
### [NMatrix](https://github.com/SciRuby/nmatrix)
|
49
|
-
|
50
|
-
The nmatrix gem gives access to [Automatically Tuned Linear Algebra Software (ATLAS)](http://math-atlas.sourceforge.net/), which you may know of through [Linear Algebra PACKage (LAPACK)](http://www.netlib.org/lapack/) or [Basic Linear Algebra Subprograms (BLAS)](http://www.netlib.org/blas/). Follow [these instructions](https://github.com/SciRuby/nmatrix#synopsis) to install the nmatrix gem. You may need [additional instructions for Mac OS X Lion](https://github.com/SciRuby/nmatrix/wiki/Installation).
|
39
|
+
The NMatrix gem gives access to [Automatically Tuned Linear Algebra Software (ATLAS)](http://math-atlas.sourceforge.net/), which you may know of through [Linear Algebra PACKage (LAPACK)](http://www.netlib.org/lapack/) or [Basic Linear Algebra Subprograms (BLAS)](http://www.netlib.org/blas/). Follow [these instructions](https://github.com/SciRuby/nmatrix#synopsis) to install the NMatrix gem. You may need [additional instructions for Mac OS X Lion](https://github.com/SciRuby/nmatrix/wiki/Installation).
|
51
40
|
|
52
41
|
## Extras
|
53
42
|
|
@@ -68,22 +57,25 @@ At the time of writing, no other Ruby gem implemented the tf*idf formula used by
|
|
68
57
|
|
69
58
|
### Term frequencies
|
70
59
|
|
71
|
-
The [vss](https://github.com/mkdynamic/vss) gem does not normalize the frequency of a term in a document; this occurs frequently in the academic literature, but only to demonstrate why normalization is important.
|
60
|
+
* The [vss](https://github.com/mkdynamic/vss) gem does not normalize the frequency of a term in a document; this occurs frequently in the academic literature, but only to demonstrate why normalization is important.
|
61
|
+
* The [tf_idf](https://github.com/reddavis/TF-IDF) and similarity gems normalize the frequency of a term in a document to the number of terms in that document, which never occurs in the literature.
|
62
|
+
* The [tf-idf](https://github.com/mchung/tf-idf) gem normalizes the frequency of a term in a document to the number of *unique* terms in that document, which never occurs in the literature.
|
72
63
|
|
73
64
|
### Document frequencies
|
74
65
|
|
75
|
-
The vss gem does not normalize the inverse document frequency.
|
66
|
+
* The vss gem does not normalize the inverse document frequency.
|
67
|
+
* The treat, tf_idf, tf-idf and similarity gems use variants of the typical inverse document frequency formula.
|
76
68
|
|
77
69
|
### Normalization
|
78
70
|
|
79
|
-
The treat, tf_idf, tf-idf, rsemantic and vss gems have no normalization component.
|
71
|
+
* The treat, tf_idf, tf-idf, rsemantic and vss gems have no normalization component.
|
80
72
|
|
81
73
|
## Additional adapters
|
82
74
|
|
83
75
|
Adapters for the following projects were also considered:
|
84
76
|
|
85
77
|
* [Ruby-LAPACK](http://ruby.gfd-dennou.org/products/ruby-lapack/) is a very thin wrapper around LAPACK, which has an opaque Fortran-style naming scheme.
|
86
|
-
* [Linalg](https://github.com/quix/linalg) and [RNum](http://rnum.rubyforge.org/) give access to LAPACK from Ruby
|
78
|
+
* [Linalg](https://github.com/quix/linalg) and [RNum](http://rnum.rubyforge.org/) give access to LAPACK from Ruby but are old and unavailable as gems.
|
87
79
|
|
88
80
|
## Reference
|
89
81
|
|
data/lib/tf-idf-similarity.rb
CHANGED
data/lib/tf-idf-similarity/bm25_model.rb
ADDED
@@ -0,0 +1,69 @@
|
|
1
|
+
# A document-term matrix using the BM25 function.
|
2
|
+
#
|
3
|
+
# @see http://lucene.apache.org/core/4_0_0-BETA/core/org/apache/lucene/search/similarities/BM25Similarity.html
|
4
|
+
# @see http://en.wikipedia.org/wiki/Okapi_BM25
|
5
|
+
class TfIdfSimilarity::BM25Model
|
6
|
+
include TfIdfSimilarity::MatrixMethods
|
7
|
+
|
8
|
+
extend Forwardable
|
9
|
+
def_delegators :@model, :documents, :terms, :document_count
|
10
|
+
|
11
|
+
# @param [Array<TfIdfSimilarity::Document>] documents documents
|
12
|
+
# @param [Hash] opts optional arguments
|
13
|
+
# @option opts [Symbol] :library :gsl, :narray, :nmatrix or :matrix (default)
|
14
|
+
def initialize(documents, opts = {})
|
15
|
+
@model = TfIdfSimilarity::TermCountModel.new(documents, opts)
|
16
|
+
@library = (opts[:library] || :matrix).to_sym
|
17
|
+
|
18
|
+
array = Array.new(terms.size) do |i|
|
19
|
+
idf = inverse_document_frequency(terms[i])
|
20
|
+
Array.new(documents.size) do |j|
|
21
|
+
term_frequency(documents[j], terms[i]) * idf
|
22
|
+
end
|
23
|
+
end
|
24
|
+
|
25
|
+
@matrix = initialize_matrix(array)
|
26
|
+
end
|
27
|
+
|
28
|
+
# Return the term's inverse document frequency.
|
29
|
+
#
|
30
|
+
# @param [String] term a term
|
31
|
+
# @return [Float] the term's inverse document frequency
|
32
|
+
def inverse_document_frequency(term)
|
33
|
+
df = @model.document_count(term)
|
34
|
+
log((documents.size - df + 0.5) / (df + 0.5))
|
35
|
+
end
|
36
|
+
alias_method :idf, :inverse_document_frequency
|
37
|
+
|
38
|
+
# Returns the term's frequency in the document.
|
39
|
+
#
|
40
|
+
# @param [Document] document a document
|
41
|
+
# @param [String] term a term
|
42
|
+
# @return [Float] the term's frequency in the document
|
43
|
+
#
|
44
|
+
# @note Like Lucene, we use a b value of 0.75 and a k1 value of 1.2.
|
45
|
+
def term_frequency(document, term)
|
46
|
+
tf = document.term_count(term)
|
47
|
+
(tf * 2.2) / (tf + 0.3 + 0.9 * documents.size / @model.average_document_size)
|
48
|
+
end
|
49
|
+
alias_method :tf, :term_frequency
|
50
|
+
|
51
|
+
# Return the term frequency–inverse document frequency.
|
52
|
+
#
|
53
|
+
# @param [Document] document a document
|
54
|
+
# @param [String] term a term
|
55
|
+
# @return [Float] the term frequency–inverse document frequency
|
56
|
+
def term_frequency_inverse_document_frequency(document, term)
|
57
|
+
inverse_document_frequency(term) * term_frequency(document, term)
|
58
|
+
end
|
59
|
+
alias_method :tfidf, :term_frequency_inverse_document_frequency
|
60
|
+
|
61
|
+
# Returns a similarity matrix for the documents in the corpus.
|
62
|
+
#
|
63
|
+
# @return [GSL::Matrix,NMatrix,Matrix] a similarity matrix
|
64
|
+
# @note Columns are normalized to unit vectors, so we can calculate the cosine
|
65
|
+
# similarity of all document vectors.
|
66
|
+
def similarity_matrix
|
67
|
+
multiply_self(normalize)
|
68
|
+
end
|
69
|
+
end
|
data/lib/tf-idf-similarity/matrix_methods.rb
CHANGED
@@ -17,7 +17,7 @@ private
|
|
17
17
|
norm[norm.where2[1]] = 1.0 # avoid division by zero
|
18
18
|
NMatrix.refer(@matrix / norm) # must be NMatrix for matrix multiplication
|
19
19
|
when :nmatrix # @see https://github.com/SciRuby/nmatrix/issues/38
|
20
|
-
normal = NMatrix.new(:dense, @matrix.shape, :float64)
|
20
|
+
normal = NMatrix.new(:dense, @matrix.shape, 0, :float64)
|
21
21
|
(0...@matrix.shape[1]).each do |j|
|
22
22
|
column = @matrix.column(j)
|
23
23
|
norm = Math.sqrt(column.transpose.dot(column)[0, 0])
|
@@ -100,7 +100,7 @@ private
|
|
100
100
|
def values
|
101
101
|
case @library
|
102
102
|
when :nmatrix
|
103
|
-
@matrix.each.to_a
|
103
|
+
@matrix.each.to_a # faster than NMatrix's `to_a` and `to_flat_a`
|
104
104
|
else
|
105
105
|
@matrix.to_a.flatten
|
106
106
|
end
|
@@ -124,8 +124,8 @@ private
|
|
124
124
|
GSL::Matrix[*array]
|
125
125
|
when :narray
|
126
126
|
NArray[*array]
|
127
|
-
when :nmatrix # @see https://github.com/SciRuby/nmatrix/issues/91
|
128
|
-
NMatrix.new(:dense, [array.size, array.empty? ? 0 : array[0].size], array.flatten)
|
127
|
+
when :nmatrix # @see https://github.com/SciRuby/nmatrix/issues/91#issuecomment-18870619
|
128
|
+
NMatrix.new(:dense, [array.size, array.empty? ? 0 : array[0].size], array.flatten, :float64)
|
129
129
|
else
|
130
130
|
Matrix[*array]
|
131
131
|
end
|
data/lib/tf-idf-similarity/term_count_model.rb
CHANGED
@@ -1,8 +1,4 @@
|
|
1
1
|
# A simple document-term matrix.
|
2
|
-
#
|
3
|
-
# @see http://lucene.apache.org/core/4_0_0-BETA/core/org/apache/lucene/search/similarities/TFIDFSimilarity.html
|
4
|
-
# @see http://lucene.apache.org/core/4_0_0-BETA/core/org/apache/lucene/search/similarities/BM25Similarity.html
|
5
|
-
# @see http://en.wikipedia.org/wiki/Okapi_BM25
|
6
2
|
class TfIdfSimilarity::TermCountModel
|
7
3
|
include TfIdfSimilarity::MatrixMethods
|
8
4
|
|
@@ -63,7 +59,7 @@ class TfIdfSimilarity::TermCountModel
|
|
63
59
|
when :gsl, :narray
|
64
60
|
row(index).sum
|
65
61
|
when :nmatrix
|
66
|
-
row(index).each.reduce(0, :+)
|
62
|
+
row(index).each.reduce(0, :+) # NMatrix's `sum` method is slower
|
67
63
|
else
|
68
64
|
vector = row(index)
|
69
65
|
unless vector.respond_to?(:reduce)
|
data/lib/tf-idf-similarity/tf_idf_model.rb
CHANGED
@@ -1,8 +1,6 @@
|
|
1
|
-
# A document-term matrix using
|
1
|
+
# A document-term matrix using the tf*idf function.
|
2
2
|
#
|
3
3
|
# @see http://lucene.apache.org/core/4_0_0-BETA/core/org/apache/lucene/search/similarities/TFIDFSimilarity.html
|
4
|
-
# @see http://lucene.apache.org/core/4_0_0-BETA/core/org/apache/lucene/search/similarities/BM25Similarity.html
|
5
|
-
# @see http://en.wikipedia.org/wiki/Okapi_BM25
|
6
4
|
class TfIdfSimilarity::TfIdfModel
|
7
5
|
include TfIdfSimilarity::MatrixMethods
|
8
6
|
|
@@ -12,11 +10,9 @@ class TfIdfSimilarity::TfIdfModel
|
|
12
10
|
# @param [Array<TfIdfSimilarity::Document>] documents documents
|
13
11
|
# @param [Hash] opts optional arguments
|
14
12
|
# @option opts [Symbol] :library :gsl, :narray, :nmatrix or :matrix (default)
|
15
|
-
# @option opts [Symbol] :function :tfidf (default) or :bm25
|
16
13
|
def initialize(documents, opts = {})
|
17
14
|
@model = TfIdfSimilarity::TermCountModel.new(documents, opts)
|
18
15
|
@library = (opts[:library] || :matrix).to_sym
|
19
|
-
@function = (opts[:function] || :tfidf).to_sym
|
20
16
|
|
21
17
|
array = Array.new(terms.size) do |i|
|
22
18
|
idf = inverse_document_frequency(terms[i])
|
@@ -34,11 +30,7 @@ class TfIdfSimilarity::TfIdfModel
|
|
34
30
|
# @return [Float] the term's inverse document frequency
|
35
31
|
def inverse_document_frequency(term)
|
36
32
|
df = @model.document_count(term)
|
37
|
-
|
38
|
-
log((documents.size - df + 0.5) / (df + 0.5))
|
39
|
-
else
|
40
|
-
1 + log(documents.size / (df + 1.0))
|
41
|
-
end
|
33
|
+
1 + log(documents.size / (df + 1.0))
|
42
34
|
end
|
43
35
|
alias_method :idf, :inverse_document_frequency
|
44
36
|
|
@@ -47,15 +39,9 @@ class TfIdfSimilarity::TfIdfModel
|
|
47
39
|
# @param [Document] document a document
|
48
40
|
# @param [String] term a term
|
49
41
|
# @return [Float] the term's frequency in the document
|
50
|
-
#
|
51
|
-
# @note Like Lucene, we use a b value of 0.75 and a k1 value of 1.2.
|
52
42
|
def term_frequency(document, term)
|
53
43
|
tf = document.term_count(term)
|
54
|
-
|
55
|
-
(tf * 2.2) / (tf + 0.3 + 0.9 * documents.size / @model.average_document_size)
|
56
|
-
else
|
57
|
-
sqrt(tf)
|
58
|
-
end
|
44
|
+
sqrt(tf)
|
59
45
|
end
|
60
46
|
alias_method :tf, :term_frequency
|
61
47
|
|
@@ -73,8 +59,7 @@ class TfIdfSimilarity::TfIdfModel
|
|
73
59
|
#
|
74
60
|
# @return [GSL::Matrix,NMatrix,Matrix] a similarity matrix
|
75
61
|
# @note Columns are normalized to unit vectors, so we can calculate the cosine
|
76
|
-
# similarity of all document vectors.
|
77
|
-
# BM25 wasn't written with this use case in mind.
|
62
|
+
# similarity of all document vectors.
|
78
63
|
def similarity_matrix
|
79
64
|
multiply_self(normalize)
|
80
65
|
end
|
data/lib/tf-idf-similarity/token.rb
CHANGED
@@ -16,29 +16,16 @@ class TfIdfSimilarity::Token < String
|
|
16
16
|
#
|
17
17
|
# @return [Boolean] whether the string is a token
|
18
18
|
def valid?
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
}x]
|
30
|
-
else
|
31
|
-
!self[%r{
|
32
|
-
\A
|
33
|
-
(
|
34
|
-
\d | # number
|
35
|
-
\p{Cntrl} | # control character
|
36
|
-
\p{Punct} | # punctuation
|
37
|
-
\p{Space} # whitespace
|
38
|
-
)+
|
39
|
-
\z
|
40
|
-
}x] # The Ruby 1.8 parser will complain about this regular expression.
|
41
|
-
end
|
19
|
+
!self[%r{
|
20
|
+
\A
|
21
|
+
(
|
22
|
+
\d | # number
|
23
|
+
[[:cntrl:]] | # control character
|
24
|
+
[[:punct:]] | # punctuation
|
25
|
+
[[:space:]] # whitespace
|
26
|
+
)+
|
27
|
+
\z
|
28
|
+
}x]
|
42
29
|
end
|
43
30
|
|
44
31
|
# Returns a lowercase string.
|
data/td-idf-similarity.gemspec
CHANGED
@@ -9,6 +9,7 @@ Gem::Specification.new do |s|
|
|
9
9
|
s.email = ["info@opennorth.ca"]
|
10
10
|
s.homepage = "http://github.com/opennorth/tf-idf-similarity"
|
11
11
|
s.summary = %q{Calculates the similarity between texts using tf*idf}
|
12
|
+
s.license = 'MIT'
|
12
13
|
|
13
14
|
s.files = `git ls-files`.split("\n")
|
14
15
|
s.test_files = `git ls-files -- {test,spec,features}/*`.split("\n")
|
@@ -19,4 +20,5 @@ Gem::Specification.new do |s|
|
|
19
20
|
s.add_development_dependency('rspec', '~> 2.10')
|
20
21
|
s.add_development_dependency('rake')
|
21
22
|
s.add_development_dependency('coveralls')
|
23
|
+
s.add_development_dependency('mime-types', '~> 1.25') # 2.0 requires Ruby 1.9.2
|
22
24
|
end
|
metadata
CHANGED
@@ -1,85 +1,102 @@
|
|
1
|
-
--- !ruby/object:Gem::Specification
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
2
|
name: tf-idf-similarity
|
3
|
-
version: !ruby/object:Gem::Version
|
4
|
-
|
5
|
-
prerelease:
|
6
|
-
segments:
|
7
|
-
- 0
|
8
|
-
- 1
|
9
|
-
- 0
|
10
|
-
version: 0.1.0
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.1
|
11
5
|
platform: ruby
|
12
|
-
authors:
|
6
|
+
authors:
|
13
7
|
- Open North
|
14
8
|
autorequire:
|
15
9
|
bindir: bin
|
16
10
|
cert_chain: []
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
11
|
+
date: 2014-03-28 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: unicode_utils
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - ">="
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '0'
|
20
|
+
type: :runtime
|
23
21
|
prerelease: false
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - ">="
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '0'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: rspec
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - "~>"
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '2.10'
|
34
34
|
type: :development
|
35
|
-
version_requirements: *id001
|
36
|
-
- !ruby/object:Gem::Dependency
|
37
|
-
name: rake
|
38
35
|
prerelease: false
|
39
|
-
|
40
|
-
|
41
|
-
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - "~>"
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '2.10'
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: rake
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
42
45
|
- - ">="
|
43
|
-
- !ruby/object:Gem::Version
|
44
|
-
|
45
|
-
segments:
|
46
|
-
- 0
|
47
|
-
version: "0"
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: '0'
|
48
48
|
type: :development
|
49
|
-
|
50
|
-
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - ">="
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '0'
|
55
|
+
- !ruby/object:Gem::Dependency
|
51
56
|
name: coveralls
|
57
|
+
requirement: !ruby/object:Gem::Requirement
|
58
|
+
requirements:
|
59
|
+
- - ">="
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '0'
|
62
|
+
type: :development
|
52
63
|
prerelease: false
|
53
|
-
|
54
|
-
|
55
|
-
requirements:
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
56
66
|
- - ">="
|
57
|
-
- !ruby/object:Gem::Version
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: '0'
|
69
|
+
- !ruby/object:Gem::Dependency
|
70
|
+
name: mime-types
|
71
|
+
requirement: !ruby/object:Gem::Requirement
|
72
|
+
requirements:
|
73
|
+
- - "~>"
|
74
|
+
- !ruby/object:Gem::Version
|
75
|
+
version: '1.25'
|
62
76
|
type: :development
|
63
|
-
|
77
|
+
prerelease: false
|
78
|
+
version_requirements: !ruby/object:Gem::Requirement
|
79
|
+
requirements:
|
80
|
+
- - "~>"
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
version: '1.25'
|
64
83
|
description:
|
65
|
-
email:
|
84
|
+
email:
|
66
85
|
- info@opennorth.ca
|
67
86
|
executables: []
|
68
|
-
|
69
87
|
extensions: []
|
70
|
-
|
71
88
|
extra_rdoc_files: []
|
72
|
-
|
73
|
-
|
74
|
-
- .
|
75
|
-
- .
|
76
|
-
- .yardopts
|
89
|
+
files:
|
90
|
+
- ".gitignore"
|
91
|
+
- ".travis.yml"
|
92
|
+
- ".yardopts"
|
77
93
|
- Gemfile
|
78
94
|
- LICENSE
|
79
95
|
- README.md
|
80
96
|
- Rakefile
|
81
97
|
- USAGE
|
82
98
|
- lib/tf-idf-similarity.rb
|
99
|
+
- lib/tf-idf-similarity/bm25_model.rb
|
83
100
|
- lib/tf-idf-similarity/document.rb
|
84
101
|
- lib/tf-idf-similarity/extras/document.rb
|
85
102
|
- lib/tf-idf-similarity/extras/tf_idf_model.rb
|
@@ -95,41 +112,31 @@ files:
|
|
95
112
|
- spec/tf_idf_model_spec.rb
|
96
113
|
- spec/token_spec.rb
|
97
114
|
- td-idf-similarity.gemspec
|
98
|
-
has_rdoc: true
|
99
115
|
homepage: http://github.com/opennorth/tf-idf-similarity
|
100
|
-
licenses:
|
101
|
-
|
116
|
+
licenses:
|
117
|
+
- MIT
|
118
|
+
metadata: {}
|
102
119
|
post_install_message:
|
103
120
|
rdoc_options: []
|
104
|
-
|
105
|
-
require_paths:
|
121
|
+
require_paths:
|
106
122
|
- lib
|
107
|
-
required_ruby_version: !ruby/object:Gem::Requirement
|
108
|
-
|
109
|
-
requirements:
|
123
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
124
|
+
requirements:
|
110
125
|
- - ">="
|
111
|
-
- !ruby/object:Gem::Version
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
version: "0"
|
116
|
-
required_rubygems_version: !ruby/object:Gem::Requirement
|
117
|
-
none: false
|
118
|
-
requirements:
|
126
|
+
- !ruby/object:Gem::Version
|
127
|
+
version: '0'
|
128
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
129
|
+
requirements:
|
119
130
|
- - ">="
|
120
|
-
- !ruby/object:Gem::Version
|
121
|
-
|
122
|
-
segments:
|
123
|
-
- 0
|
124
|
-
version: "0"
|
131
|
+
- !ruby/object:Gem::Version
|
132
|
+
version: '0'
|
125
133
|
requirements: []
|
126
|
-
|
127
134
|
rubyforge_project:
|
128
|
-
rubygems_version:
|
135
|
+
rubygems_version: 2.2.2
|
129
136
|
signing_key:
|
130
|
-
specification_version:
|
137
|
+
specification_version: 4
|
131
138
|
summary: Calculates the similarity between texts using tf*idf
|
132
|
-
test_files:
|
139
|
+
test_files:
|
133
140
|
- spec/document_spec.rb
|
134
141
|
- spec/extras/tf_idf_model_spec.rb
|
135
142
|
- spec/spec_helper.rb
|