tf-idf-similarity 0.1.0 → 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.travis.yml +0 -4
- data/Gemfile +1 -1
- data/README.md +14 -22
- data/lib/tf-idf-similarity.rb +1 -0
- data/lib/tf-idf-similarity/bm25_model.rb +69 -0
- data/lib/tf-idf-similarity/matrix_methods.rb +4 -4
- data/lib/tf-idf-similarity/term_count_model.rb +1 -5
- data/lib/tf-idf-similarity/tf_idf_model.rb +4 -19
- data/lib/tf-idf-similarity/token.rb +10 -23
- data/lib/tf-idf-similarity/version.rb +1 -1
- data/td-idf-similarity.gemspec +2 -0
- metadata +88 -81
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: bcd5811852a4ec0e65c55ac09854024c1256483b
|
4
|
+
data.tar.gz: 9978891aa4e76e8badec85da898b575ed91098cc
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 3219126f9ea91f3d2bfb8db0954d6637f090d4c75bf5f7c57c8208b0a812c1d8e2fd488d438cedd5e188a6a169ff7b9bf1470e82146085bc2bdf7298de1572fb
|
7
|
+
data.tar.gz: d523d3e77ab1cfd31ddc4e6b7f429726b4d6b3520377e3504d8b0c2bc7a51b11dcaa55991ed19ff497ae75e993ce49aef34dcef619f066e7a7c6ac9e7ee1aa35
|
data/.travis.yml
CHANGED
@@ -23,10 +23,6 @@ before_install:
|
|
23
23
|
# Installing ATLAS will install BLAS.
|
24
24
|
- if [ $MATRIX_LIBRARY = 'nmatrix' ]; then sudo apt-get install -qq libatlas-dev libatlas-base-dev libatlas3gf-base; fi
|
25
25
|
- if [ $MATRIX_LIBRARY = 'nmatrix' ]; then export CPLUS_INCLUDE_PATH=$CPLUS_INCLUDE_PATH:/usr/include/atlas; fi
|
26
|
-
- if [ $MATRIX_LIBRARY = 'nmatrix' ]; then git clone git://github.com/SciRuby/nmatrix.git; fi
|
27
|
-
- if [ $MATRIX_LIBRARY = 'nmatrix' ]; then cd nmatrix && ORIGINAL_BUNDLE_GEMFILE=$BUNDLE_GEMFILE; fi
|
28
|
-
- if [ $MATRIX_LIBRARY = 'nmatrix' ]; then BUNDLE_GEMFILE=`pwd`/Gemfile && bundle && bundle exec rake install; fi
|
29
|
-
- if [ $MATRIX_LIBRARY = 'nmatrix' ]; then cd .. && BUNDLE_GEMFILE=$ORIGINAL_BUNDLE_GEMFILE; fi
|
30
26
|
# Travis sometimes runs without Bundler.
|
31
27
|
install: bundle
|
32
28
|
script: bundle exec rake --trace
|
data/Gemfile
CHANGED
@@ -2,7 +2,7 @@ source "http://rubygems.org"
|
|
2
2
|
|
3
3
|
gem 'gsl', '~> 1.15.3' if ENV['MATRIX_LIBRARY'] == 'gsl'
|
4
4
|
gem 'narray', '~> 0.6.0.0' if ENV['MATRIX_LIBRARY'] == 'narray'
|
5
|
-
gem 'nmatrix',
|
5
|
+
gem 'nmatrix', '~> 0.0.9' if ENV['MATRIX_LIBRARY'] == 'nmatrix' && RUBY_VERSION >= '1.9'
|
6
6
|
|
7
7
|
# Specify your gem's dependencies in the gemspec
|
8
8
|
gemspec
|
data/README.md
CHANGED
@@ -5,8 +5,7 @@
|
|
5
5
|
[](https://coveralls.io/r/opennorth/tf-idf-similarity)
|
6
6
|
[](https://codeclimate.com/github/opennorth/tf-idf-similarity)
|
7
7
|
|
8
|
-
Calculates the similarity between texts using a [bag-of-words](http://en.wikipedia.org/wiki/Bag_of_words_model) [Vector Space Model](http://en.wikipedia.org/wiki/Vector_space_model) with [Term Frequency-Inverse Document Frequency (tf*idf)](http://en.wikipedia.org/wiki/
|
9
|
-
) weights. If your use case demands performance, use [Lucene](http://lucene.apache.org/core/) or similar (see below).
|
8
|
+
Calculates the similarity between texts using a [bag-of-words](http://en.wikipedia.org/wiki/Bag_of_words_model) [Vector Space Model](http://en.wikipedia.org/wiki/Vector_space_model) with [Term Frequency-Inverse Document Frequency (tf*idf)](http://en.wikipedia.org/wiki/Tf*idf) weights. If your use case demands performance, use [Lucene](http://lucene.apache.org/core/) (see below).
|
10
9
|
|
11
10
|
## Usage
|
12
11
|
|
@@ -20,34 +19,24 @@ Create a set of documents:
|
|
20
19
|
corpus << TfIdfSimilarity::Document.new("Pellentesque sed ipsum dui...")
|
21
20
|
corpus << TfIdfSimilarity::Document.new("Nam scelerisque dui sed leo...")
|
22
21
|
|
23
|
-
Create a document-term matrix using [Term Frequency-Inverse Document Frequency function](http://en.wikipedia.org/wiki/)
|
22
|
+
Create a document-term matrix using [Term Frequency-Inverse Document Frequency function](http://en.wikipedia.org/wiki/):
|
24
23
|
|
25
|
-
model = TfIdfSimilarity::TfIdfModel(corpus
|
24
|
+
model = TfIdfSimilarity::TfIdfModel.new(corpus)
|
26
25
|
|
27
26
|
Create a document-term matrix using the [Okapi BM25 ranking function](http://en.wikipedia.org/wiki/Okapi_BM25):
|
28
27
|
|
29
|
-
model = TfIdfSimilarity::
|
28
|
+
model = TfIdfSimilarity::BM25Model.new(corpus)
|
30
29
|
|
31
30
|
[Read the documentation at RubyDoc.info.](http://rubydoc.info/gems/tf-idf-similarity)
|
32
31
|
|
33
32
|
## Speed
|
34
33
|
|
35
|
-
Instead of using the Ruby Standard Library's [Matrix](http://www.ruby-doc.org/stdlib-2.0/libdoc/matrix/rdoc/Matrix.html) class, you can use one of the
|
34
|
+
Instead of using the Ruby Standard Library's [Matrix](http://www.ruby-doc.org/stdlib-2.0/libdoc/matrix/rdoc/Matrix.html) class, you can use one of the [GNU Scientific Library (GSL)](http://www.gnu.org/software/gsl/), [NArray](http://narray.rubyforge.org/) or [NMatrix](https://github.com/SciRuby/nmatrix) (0.0.9 or greater) gems for faster matrix operations. For example:
|
36
35
|
|
37
36
|
require 'gsl'
|
38
|
-
model = TfIdfSimilarity::TfIdfModel(corpus, :library => :gsl)
|
37
|
+
model = TfIdfSimilarity::TfIdfModel.new(corpus, :library => :gsl)
|
39
38
|
|
40
|
-
|
41
|
-
|
42
|
-
gem install gsl
|
43
|
-
|
44
|
-
### [NArray](http://narray.rubyforge.org/)
|
45
|
-
|
46
|
-
gem install narray
|
47
|
-
|
48
|
-
### [NMatrix](https://github.com/SciRuby/nmatrix)
|
49
|
-
|
50
|
-
The nmatrix gem gives access to [Automatically Tuned Linear Algebra Software (ATLAS)](http://math-atlas.sourceforge.net/), which you may know of through [Linear Algebra PACKage (LAPACK)](http://www.netlib.org/lapack/) or [Basic Linear Algebra Subprograms (BLAS)](http://www.netlib.org/blas/). Follow [these instructions](https://github.com/SciRuby/nmatrix#synopsis) to install the nmatrix gem. You may need [additional instructions for Mac OS X Lion](https://github.com/SciRuby/nmatrix/wiki/Installation).
|
39
|
+
The NMatrix gem gives access to [Automatically Tuned Linear Algebra Software (ATLAS)](http://math-atlas.sourceforge.net/), which you may know of through [Linear Algebra PACKage (LAPACK)](http://www.netlib.org/lapack/) or [Basic Linear Algebra Subprograms (BLAS)](http://www.netlib.org/blas/). Follow [these instructions](https://github.com/SciRuby/nmatrix#synopsis) to install the NMatrix gem. You may need [additional instructions for Mac OS X Lion](https://github.com/SciRuby/nmatrix/wiki/Installation).
|
51
40
|
|
52
41
|
## Extras
|
53
42
|
|
@@ -68,22 +57,25 @@ At the time of writing, no other Ruby gem implemented the tf*idf formula used by
|
|
68
57
|
|
69
58
|
### Term frequencies
|
70
59
|
|
71
|
-
The [vss](https://github.com/mkdynamic/vss) gem does not normalize the frequency of a term in a document; this occurs frequently in the academic literature, but only to demonstrate why normalization is important.
|
60
|
+
* The [vss](https://github.com/mkdynamic/vss) gem does not normalize the frequency of a term in a document; this occurs frequently in the academic literature, but only to demonstrate why normalization is important.
|
61
|
+
* The [tf_idf](https://github.com/reddavis/TF-IDF) and similarity gems normalize the frequency of a term in a document to the number of terms in that document, which never occurs in the literature.
|
62
|
+
* The [tf-idf](https://github.com/mchung/tf-idf) gem normalizes the frequency of a term in a document to the number of *unique* terms in that document, which never occurs in the literature.
|
72
63
|
|
73
64
|
### Document frequencies
|
74
65
|
|
75
|
-
The vss gem does not normalize the inverse document frequency.
|
66
|
+
* The vss gem does not normalize the inverse document frequency.
|
67
|
+
* The treat, tf_idf, tf-idf and similarity gems use variants of the typical inverse document frequency formula.
|
76
68
|
|
77
69
|
### Normalization
|
78
70
|
|
79
|
-
The treat, tf_idf, tf-idf, rsemantic and vss gems have no normalization component.
|
71
|
+
* The treat, tf_idf, tf-idf, rsemantic and vss gems have no normalization component.
|
80
72
|
|
81
73
|
## Additional adapters
|
82
74
|
|
83
75
|
Adapters for the following projects were also considered:
|
84
76
|
|
85
77
|
* [Ruby-LAPACK](http://ruby.gfd-dennou.org/products/ruby-lapack/) is a very thin wrapper around LAPACK, which has an opaque Fortran-style naming scheme.
|
86
|
-
* [Linalg](https://github.com/quix/linalg) and [RNum](http://rnum.rubyforge.org/) give access to LAPACK from Ruby
|
78
|
+
* [Linalg](https://github.com/quix/linalg) and [RNum](http://rnum.rubyforge.org/) give access to LAPACK from Ruby but are old and unavailable as gems.
|
87
79
|
|
88
80
|
## Reference
|
89
81
|
|
data/lib/tf-idf-similarity.rb
CHANGED
data/lib/tf-idf-similarity/bm25_model.rb
ADDED
@@ -0,0 +1,69 @@
|
|
1
|
+
# A document-term matrix using the BM25 function.
|
2
|
+
#
|
3
|
+
# @see http://lucene.apache.org/core/4_0_0-BETA/core/org/apache/lucene/search/similarities/BM25Similarity.html
|
4
|
+
# @see http://en.wikipedia.org/wiki/Okapi_BM25
|
5
|
+
class TfIdfSimilarity::BM25Model
|
6
|
+
include TfIdfSimilarity::MatrixMethods
|
7
|
+
|
8
|
+
extend Forwardable
|
9
|
+
def_delegators :@model, :documents, :terms, :document_count
|
10
|
+
|
11
|
+
# @param [Array<TfIdfSimilarity::Document>] documents documents
|
12
|
+
# @param [Hash] opts optional arguments
|
13
|
+
# @option opts [Symbol] :library :gsl, :narray, :nmatrix or :matrix (default)
|
14
|
+
def initialize(documents, opts = {})
|
15
|
+
@model = TfIdfSimilarity::TermCountModel.new(documents, opts)
|
16
|
+
@library = (opts[:library] || :matrix).to_sym
|
17
|
+
|
18
|
+
array = Array.new(terms.size) do |i|
|
19
|
+
idf = inverse_document_frequency(terms[i])
|
20
|
+
Array.new(documents.size) do |j|
|
21
|
+
term_frequency(documents[j], terms[i]) * idf
|
22
|
+
end
|
23
|
+
end
|
24
|
+
|
25
|
+
@matrix = initialize_matrix(array)
|
26
|
+
end
|
27
|
+
|
28
|
+
# Return the term's inverse document frequency.
|
29
|
+
#
|
30
|
+
# @param [String] term a term
|
31
|
+
# @return [Float] the term's inverse document frequency
|
32
|
+
def inverse_document_frequency(term)
|
33
|
+
df = @model.document_count(term)
|
34
|
+
log((documents.size - df + 0.5) / (df + 0.5))
|
35
|
+
end
|
36
|
+
alias_method :idf, :inverse_document_frequency
|
37
|
+
|
38
|
+
# Returns the term's frequency in the document.
|
39
|
+
#
|
40
|
+
# @param [Document] document a document
|
41
|
+
# @param [String] term a term
|
42
|
+
# @return [Float] the term's frequency in the document
|
43
|
+
#
|
44
|
+
# @note Like Lucene, we use a b value of 0.75 and a k1 value of 1.2.
|
45
|
+
def term_frequency(document, term)
|
46
|
+
tf = document.term_count(term)
|
47
|
+
(tf * 2.2) / (tf + 0.3 + 0.9 * documents.size / @model.average_document_size)
|
48
|
+
end
|
49
|
+
alias_method :tf, :term_frequency
|
50
|
+
|
51
|
+
# Return the term frequency–inverse document frequency.
|
52
|
+
#
|
53
|
+
# @param [Document] document a document
|
54
|
+
# @param [String] term a term
|
55
|
+
# @return [Float] the term frequency–inverse document frequency
|
56
|
+
def term_frequency_inverse_document_frequency(document, term)
|
57
|
+
inverse_document_frequency(term) * term_frequency(document, term)
|
58
|
+
end
|
59
|
+
alias_method :tfidf, :term_frequency_inverse_document_frequency
|
60
|
+
|
61
|
+
# Returns a similarity matrix for the documents in the corpus.
|
62
|
+
#
|
63
|
+
# @return [GSL::Matrix,NMatrix,Matrix] a similarity matrix
|
64
|
+
# @note Columns are normalized to unit vectors, so we can calculate the cosine
|
65
|
+
# similarity of all document vectors.
|
66
|
+
def similarity_matrix
|
67
|
+
multiply_self(normalize)
|
68
|
+
end
|
69
|
+
end
|
data/lib/tf-idf-similarity/matrix_methods.rb
CHANGED
@@ -17,7 +17,7 @@ private
|
|
17
17
|
norm[norm.where2[1]] = 1.0 # avoid division by zero
|
18
18
|
NMatrix.refer(@matrix / norm) # must be NMatrix for matrix multiplication
|
19
19
|
when :nmatrix # @see https://github.com/SciRuby/nmatrix/issues/38
|
20
|
-
normal = NMatrix.new(:dense, @matrix.shape, :float64)
|
20
|
+
normal = NMatrix.new(:dense, @matrix.shape, 0, :float64)
|
21
21
|
(0...@matrix.shape[1]).each do |j|
|
22
22
|
column = @matrix.column(j)
|
23
23
|
norm = Math.sqrt(column.transpose.dot(column)[0, 0])
|
@@ -100,7 +100,7 @@ private
|
|
100
100
|
def values
|
101
101
|
case @library
|
102
102
|
when :nmatrix
|
103
|
-
@matrix.each.to_a
|
103
|
+
@matrix.each.to_a # faster than NMatrix's `to_a` and `to_flat_a`
|
104
104
|
else
|
105
105
|
@matrix.to_a.flatten
|
106
106
|
end
|
@@ -124,8 +124,8 @@ private
|
|
124
124
|
GSL::Matrix[*array]
|
125
125
|
when :narray
|
126
126
|
NArray[*array]
|
127
|
-
when :nmatrix # @see https://github.com/SciRuby/nmatrix/issues/91
|
128
|
-
NMatrix.new(:dense, [array.size, array.empty? ? 0 : array[0].size], array.flatten)
|
127
|
+
when :nmatrix # @see https://github.com/SciRuby/nmatrix/issues/91#issuecomment-18870619
|
128
|
+
NMatrix.new(:dense, [array.size, array.empty? ? 0 : array[0].size], array.flatten, :float64)
|
129
129
|
else
|
130
130
|
Matrix[*array]
|
131
131
|
end
|
data/lib/tf-idf-similarity/term_count_model.rb
CHANGED
@@ -1,8 +1,4 @@
|
|
1
1
|
# A simple document-term matrix.
|
2
|
-
#
|
3
|
-
# @see http://lucene.apache.org/core/4_0_0-BETA/core/org/apache/lucene/search/similarities/TFIDFSimilarity.html
|
4
|
-
# @see http://lucene.apache.org/core/4_0_0-BETA/core/org/apache/lucene/search/similarities/BM25Similarity.html
|
5
|
-
# @see http://en.wikipedia.org/wiki/Okapi_BM25
|
6
2
|
class TfIdfSimilarity::TermCountModel
|
7
3
|
include TfIdfSimilarity::MatrixMethods
|
8
4
|
|
@@ -63,7 +59,7 @@ class TfIdfSimilarity::TermCountModel
|
|
63
59
|
when :gsl, :narray
|
64
60
|
row(index).sum
|
65
61
|
when :nmatrix
|
66
|
-
row(index).each.reduce(0, :+)
|
62
|
+
row(index).each.reduce(0, :+) # NMatrix's `sum` method is slower
|
67
63
|
else
|
68
64
|
vector = row(index)
|
69
65
|
unless vector.respond_to?(:reduce)
|
data/lib/tf-idf-similarity/tf_idf_model.rb
CHANGED
@@ -1,8 +1,6 @@
|
|
1
|
-
# A document-term matrix using
|
1
|
+
# A document-term matrix using the tf*idf function.
|
2
2
|
#
|
3
3
|
# @see http://lucene.apache.org/core/4_0_0-BETA/core/org/apache/lucene/search/similarities/TFIDFSimilarity.html
|
4
|
-
# @see http://lucene.apache.org/core/4_0_0-BETA/core/org/apache/lucene/search/similarities/BM25Similarity.html
|
5
|
-
# @see http://en.wikipedia.org/wiki/Okapi_BM25
|
6
4
|
class TfIdfSimilarity::TfIdfModel
|
7
5
|
include TfIdfSimilarity::MatrixMethods
|
8
6
|
|
@@ -12,11 +10,9 @@ class TfIdfSimilarity::TfIdfModel
|
|
12
10
|
# @param [Array<TfIdfSimilarity::Document>] documents documents
|
13
11
|
# @param [Hash] opts optional arguments
|
14
12
|
# @option opts [Symbol] :library :gsl, :narray, :nmatrix or :matrix (default)
|
15
|
-
# @option opts [Symbol] :function :tfidf (default) or :bm25
|
16
13
|
def initialize(documents, opts = {})
|
17
14
|
@model = TfIdfSimilarity::TermCountModel.new(documents, opts)
|
18
15
|
@library = (opts[:library] || :matrix).to_sym
|
19
|
-
@function = (opts[:function] || :tfidf).to_sym
|
20
16
|
|
21
17
|
array = Array.new(terms.size) do |i|
|
22
18
|
idf = inverse_document_frequency(terms[i])
|
@@ -34,11 +30,7 @@ class TfIdfSimilarity::TfIdfModel
|
|
34
30
|
# @return [Float] the term's inverse document frequency
|
35
31
|
def inverse_document_frequency(term)
|
36
32
|
df = @model.document_count(term)
|
37
|
-
|
38
|
-
log((documents.size - df + 0.5) / (df + 0.5))
|
39
|
-
else
|
40
|
-
1 + log(documents.size / (df + 1.0))
|
41
|
-
end
|
33
|
+
1 + log(documents.size / (df + 1.0))
|
42
34
|
end
|
43
35
|
alias_method :idf, :inverse_document_frequency
|
44
36
|
|
@@ -47,15 +39,9 @@ class TfIdfSimilarity::TfIdfModel
|
|
47
39
|
# @param [Document] document a document
|
48
40
|
# @param [String] term a term
|
49
41
|
# @return [Float] the term's frequency in the document
|
50
|
-
#
|
51
|
-
# @note Like Lucene, we use a b value of 0.75 and a k1 value of 1.2.
|
52
42
|
def term_frequency(document, term)
|
53
43
|
tf = document.term_count(term)
|
54
|
-
|
55
|
-
(tf * 2.2) / (tf + 0.3 + 0.9 * documents.size / @model.average_document_size)
|
56
|
-
else
|
57
|
-
sqrt(tf)
|
58
|
-
end
|
44
|
+
sqrt(tf)
|
59
45
|
end
|
60
46
|
alias_method :tf, :term_frequency
|
61
47
|
|
@@ -73,8 +59,7 @@ class TfIdfSimilarity::TfIdfModel
|
|
73
59
|
#
|
74
60
|
# @return [GSL::Matrix,NMatrix,Matrix] a similarity matrix
|
75
61
|
# @note Columns are normalized to unit vectors, so we can calculate the cosine
|
76
|
-
# similarity of all document vectors.
|
77
|
-
# BM25 wasn't written with this use case in mind.
|
62
|
+
# similarity of all document vectors.
|
78
63
|
def similarity_matrix
|
79
64
|
multiply_self(normalize)
|
80
65
|
end
|
data/lib/tf-idf-similarity/token.rb
CHANGED
@@ -16,29 +16,16 @@ class TfIdfSimilarity::Token < String
|
|
16
16
|
#
|
17
17
|
# @return [Boolean] whether the string is a token
|
18
18
|
def valid?
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
}x]
|
30
|
-
else
|
31
|
-
!self[%r{
|
32
|
-
\A
|
33
|
-
(
|
34
|
-
\d | # number
|
35
|
-
\p{Cntrl} | # control character
|
36
|
-
\p{Punct} | # punctuation
|
37
|
-
\p{Space} # whitespace
|
38
|
-
)+
|
39
|
-
\z
|
40
|
-
}x] # The Ruby 1.8 parser will complain about this regular expression.
|
41
|
-
end
|
19
|
+
!self[%r{
|
20
|
+
\A
|
21
|
+
(
|
22
|
+
\d | # number
|
23
|
+
[[:cntrl:]] | # control character
|
24
|
+
[[:punct:]] | # punctuation
|
25
|
+
[[:space:]] # whitespace
|
26
|
+
)+
|
27
|
+
\z
|
28
|
+
}x]
|
42
29
|
end
|
43
30
|
|
44
31
|
# Returns a lowercase string.
|
data/td-idf-similarity.gemspec
CHANGED
@@ -9,6 +9,7 @@ Gem::Specification.new do |s|
|
|
9
9
|
s.email = ["info@opennorth.ca"]
|
10
10
|
s.homepage = "http://github.com/opennorth/tf-idf-similarity"
|
11
11
|
s.summary = %q{Calculates the similarity between texts using tf*idf}
|
12
|
+
s.license = 'MIT'
|
12
13
|
|
13
14
|
s.files = `git ls-files`.split("\n")
|
14
15
|
s.test_files = `git ls-files -- {test,spec,features}/*`.split("\n")
|
@@ -19,4 +20,5 @@ Gem::Specification.new do |s|
|
|
19
20
|
s.add_development_dependency('rspec', '~> 2.10')
|
20
21
|
s.add_development_dependency('rake')
|
21
22
|
s.add_development_dependency('coveralls')
|
23
|
+
s.add_development_dependency('mime-types', '~> 1.25') # 2.0 requires Ruby 1.9.2
|
22
24
|
end
|
metadata
CHANGED
@@ -1,85 +1,102 @@
|
|
1
|
-
--- !ruby/object:Gem::Specification
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
2
|
name: tf-idf-similarity
|
3
|
-
version: !ruby/object:Gem::Version
|
4
|
-
|
5
|
-
prerelease:
|
6
|
-
segments:
|
7
|
-
- 0
|
8
|
-
- 1
|
9
|
-
- 0
|
10
|
-
version: 0.1.0
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.1
|
11
5
|
platform: ruby
|
12
|
-
authors:
|
6
|
+
authors:
|
13
7
|
- Open North
|
14
8
|
autorequire:
|
15
9
|
bindir: bin
|
16
10
|
cert_chain: []
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
11
|
+
date: 2014-03-28 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: unicode_utils
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - ">="
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '0'
|
20
|
+
type: :runtime
|
23
21
|
prerelease: false
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - ">="
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '0'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: rspec
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - "~>"
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '2.10'
|
34
34
|
type: :development
|
35
|
-
version_requirements: *id001
|
36
|
-
- !ruby/object:Gem::Dependency
|
37
|
-
name: rake
|
38
35
|
prerelease: false
|
39
|
-
|
40
|
-
|
41
|
-
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - "~>"
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '2.10'
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: rake
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
42
45
|
- - ">="
|
43
|
-
- !ruby/object:Gem::Version
|
44
|
-
|
45
|
-
segments:
|
46
|
-
- 0
|
47
|
-
version: "0"
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: '0'
|
48
48
|
type: :development
|
49
|
-
|
50
|
-
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - ">="
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '0'
|
55
|
+
- !ruby/object:Gem::Dependency
|
51
56
|
name: coveralls
|
57
|
+
requirement: !ruby/object:Gem::Requirement
|
58
|
+
requirements:
|
59
|
+
- - ">="
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '0'
|
62
|
+
type: :development
|
52
63
|
prerelease: false
|
53
|
-
|
54
|
-
|
55
|
-
requirements:
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
56
66
|
- - ">="
|
57
|
-
- !ruby/object:Gem::Version
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: '0'
|
69
|
+
- !ruby/object:Gem::Dependency
|
70
|
+
name: mime-types
|
71
|
+
requirement: !ruby/object:Gem::Requirement
|
72
|
+
requirements:
|
73
|
+
- - "~>"
|
74
|
+
- !ruby/object:Gem::Version
|
75
|
+
version: '1.25'
|
62
76
|
type: :development
|
63
|
-
|
77
|
+
prerelease: false
|
78
|
+
version_requirements: !ruby/object:Gem::Requirement
|
79
|
+
requirements:
|
80
|
+
- - "~>"
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
version: '1.25'
|
64
83
|
description:
|
65
|
-
email:
|
84
|
+
email:
|
66
85
|
- info@opennorth.ca
|
67
86
|
executables: []
|
68
|
-
|
69
87
|
extensions: []
|
70
|
-
|
71
88
|
extra_rdoc_files: []
|
72
|
-
|
73
|
-
|
74
|
-
- .
|
75
|
-
- .
|
76
|
-
- .yardopts
|
89
|
+
files:
|
90
|
+
- ".gitignore"
|
91
|
+
- ".travis.yml"
|
92
|
+
- ".yardopts"
|
77
93
|
- Gemfile
|
78
94
|
- LICENSE
|
79
95
|
- README.md
|
80
96
|
- Rakefile
|
81
97
|
- USAGE
|
82
98
|
- lib/tf-idf-similarity.rb
|
99
|
+
- lib/tf-idf-similarity/bm25_model.rb
|
83
100
|
- lib/tf-idf-similarity/document.rb
|
84
101
|
- lib/tf-idf-similarity/extras/document.rb
|
85
102
|
- lib/tf-idf-similarity/extras/tf_idf_model.rb
|
@@ -95,41 +112,31 @@ files:
|
|
95
112
|
- spec/tf_idf_model_spec.rb
|
96
113
|
- spec/token_spec.rb
|
97
114
|
- td-idf-similarity.gemspec
|
98
|
-
has_rdoc: true
|
99
115
|
homepage: http://github.com/opennorth/tf-idf-similarity
|
100
|
-
licenses:
|
101
|
-
|
116
|
+
licenses:
|
117
|
+
- MIT
|
118
|
+
metadata: {}
|
102
119
|
post_install_message:
|
103
120
|
rdoc_options: []
|
104
|
-
|
105
|
-
require_paths:
|
121
|
+
require_paths:
|
106
122
|
- lib
|
107
|
-
required_ruby_version: !ruby/object:Gem::Requirement
|
108
|
-
|
109
|
-
requirements:
|
123
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
124
|
+
requirements:
|
110
125
|
- - ">="
|
111
|
-
- !ruby/object:Gem::Version
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
version: "0"
|
116
|
-
required_rubygems_version: !ruby/object:Gem::Requirement
|
117
|
-
none: false
|
118
|
-
requirements:
|
126
|
+
- !ruby/object:Gem::Version
|
127
|
+
version: '0'
|
128
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
129
|
+
requirements:
|
119
130
|
- - ">="
|
120
|
-
- !ruby/object:Gem::Version
|
121
|
-
|
122
|
-
segments:
|
123
|
-
- 0
|
124
|
-
version: "0"
|
131
|
+
- !ruby/object:Gem::Version
|
132
|
+
version: '0'
|
125
133
|
requirements: []
|
126
|
-
|
127
134
|
rubyforge_project:
|
128
|
-
rubygems_version:
|
135
|
+
rubygems_version: 2.2.2
|
129
136
|
signing_key:
|
130
|
-
specification_version:
|
137
|
+
specification_version: 4
|
131
138
|
summary: Calculates the similarity between texts using tf*idf
|
132
|
-
test_files:
|
139
|
+
test_files:
|
133
140
|
- spec/document_spec.rb
|
134
141
|
- spec/extras/tf_idf_model_spec.rb
|
135
142
|
- spec/spec_helper.rb
|