tf-idf-similarity 0.0.9 → 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/.travis.yml +29 -0
- data/Gemfile +4 -0
- data/README.md +41 -29
- data/lib/tf-idf-similarity.rb +12 -1
- data/lib/tf-idf-similarity/document.rb +35 -28
- data/lib/tf-idf-similarity/extras/document.rb +2 -125
- data/lib/tf-idf-similarity/extras/tf_idf_model.rb +192 -0
- data/lib/tf-idf-similarity/matrix_methods.rb +164 -0
- data/lib/tf-idf-similarity/term_count_model.rb +78 -0
- data/lib/tf-idf-similarity/tf_idf_model.rb +81 -0
- data/lib/tf-idf-similarity/token.rb +34 -12
- data/lib/tf-idf-similarity/version.rb +1 -1
- data/spec/document_spec.rb +136 -0
- data/spec/extras/tf_idf_model_spec.rb +269 -0
- data/spec/spec_helper.rb +21 -0
- data/spec/term_count_model_spec.rb +108 -0
- data/spec/tf_idf_model_spec.rb +174 -0
- data/spec/token_spec.rb +34 -0
- data/td-idf-similarity.gemspec +3 -3
- metadata +91 -63
- data/lib/tf-idf-similarity/collection.rb +0 -205
- data/lib/tf-idf-similarity/extras/collection.rb +0 -110
@@ -0,0 +1,164 @@
|
|
1
|
+
module TfIdfSimilarity::MatrixMethods
|
2
|
+
private
|
3
|
+
|
4
|
+
# @return [GSL::Matrix,NArray,NMatrix,Matrix] all document vectors as unit vectors
|
5
|
+
#
|
6
|
+
# @note Lucene normalizes document length differently.
|
7
|
+
def normalize
|
8
|
+
case @library
|
9
|
+
when :gsl
|
10
|
+
@matrix.clone.each_col do |column|
|
11
|
+
unless column.isnull?
|
12
|
+
column.normalize!
|
13
|
+
end
|
14
|
+
end
|
15
|
+
when :narray # @see https://github.com/masa16/narray/issues/21
|
16
|
+
norm = NMath.sqrt((@matrix ** 2).sum(1).reshape(@matrix.shape[0], 1))
|
17
|
+
norm[norm.where2[1]] = 1.0 # avoid division by zero
|
18
|
+
NMatrix.refer(@matrix / norm) # must be NMatrix for matrix multiplication
|
19
|
+
when :nmatrix # @see https://github.com/SciRuby/nmatrix/issues/38
|
20
|
+
normal = NMatrix.new(:dense, @matrix.shape, :float64)
|
21
|
+
(0...@matrix.shape[1]).each do |j|
|
22
|
+
column = @matrix.column(j)
|
23
|
+
norm = Math.sqrt(column.transpose.dot(column)[0, 0])
|
24
|
+
(0...@matrix.shape[0]).each do |i|
|
25
|
+
normal[i, j] = norm.zero? ? 0 : @matrix[i, j] / norm
|
26
|
+
end
|
27
|
+
end
|
28
|
+
normal
|
29
|
+
else
|
30
|
+
Matrix.columns(@matrix.column_vectors.map do |column|
|
31
|
+
if column.to_a.all?(&:zero?)
|
32
|
+
column
|
33
|
+
elsif column.respond_to?(:normalize)
|
34
|
+
column.normalize
|
35
|
+
else
|
36
|
+
column * (1 / Math.sqrt(column.inner_product(column))) # Ruby 1.8's Vector does not define division, so multiply by the reciprocal
|
37
|
+
end
|
38
|
+
end)
|
39
|
+
end
|
40
|
+
end
|
41
|
+
|
42
|
+
# @param [Integer] i the row index
|
43
|
+
# @param [Integer] j the column index
|
44
|
+
def get(i, j)
|
45
|
+
case @library
|
46
|
+
when :narray
|
47
|
+
@matrix[j, i]
|
48
|
+
else
|
49
|
+
@matrix[i, j]
|
50
|
+
end
|
51
|
+
end
|
52
|
+
|
53
|
+
# @param [Integer] index the row index
|
54
|
+
# @return [GSL::Vector::View,NArray,NMatrix,Vector] a row
|
55
|
+
def row(index)
|
56
|
+
case @library
|
57
|
+
when :narray
|
58
|
+
@matrix[true, index]
|
59
|
+
else
|
60
|
+
@matrix.row(index)
|
61
|
+
end
|
62
|
+
end
|
63
|
+
|
64
|
+
# @param [Integer] index the column index
|
65
|
+
# @return [GSL::Vector::View,NArray,NMatrix,Vector] a column
|
66
|
+
def column(index)
|
67
|
+
case @library
|
68
|
+
when :narray
|
69
|
+
@matrix[index, true]
|
70
|
+
else
|
71
|
+
@matrix.column(index)
|
72
|
+
end
|
73
|
+
end
|
74
|
+
|
75
|
+
# @return [Integer] the number of rows in the matrix
|
76
|
+
def row_size
|
77
|
+
case @library
|
78
|
+
when :gsl, :nmatrix
|
79
|
+
@matrix.shape[0]
|
80
|
+
when :narray
|
81
|
+
@matrix.shape[1]
|
82
|
+
else
|
83
|
+
@matrix.row_size
|
84
|
+
end
|
85
|
+
end
|
86
|
+
|
87
|
+
# @return [Integer] the number of columns in the matrix
|
88
|
+
def column_size
|
89
|
+
case @library
|
90
|
+
when :gsl, :nmatrix
|
91
|
+
@matrix.shape[1]
|
92
|
+
when :narray
|
93
|
+
@matrix.shape[0]
|
94
|
+
else
|
95
|
+
@matrix.column_size
|
96
|
+
end
|
97
|
+
end
|
98
|
+
|
99
|
+
# @return [Array<Float>] the matrix's values
|
100
|
+
def values
|
101
|
+
case @library
|
102
|
+
when :nmatrix
|
103
|
+
@matrix.each.to_a
|
104
|
+
else
|
105
|
+
@matrix.to_a.flatten
|
106
|
+
end
|
107
|
+
end
|
108
|
+
|
109
|
+
# @return [Float] the sum of all values in the matrix
|
110
|
+
def sum
|
111
|
+
case @library
|
112
|
+
when :narray
|
113
|
+
@matrix.sum
|
114
|
+
else
|
115
|
+
values.reduce(0, :+)
|
116
|
+
end
|
117
|
+
end
|
118
|
+
|
119
|
+
# @param [Array<Array>] array matrix rows
|
120
|
+
# @return [GSL::Matrix,NArray,NMatrix,Matrix] a matrix
|
121
|
+
def initialize_matrix(array)
|
122
|
+
case @library
|
123
|
+
when :gsl
|
124
|
+
GSL::Matrix[*array]
|
125
|
+
when :narray
|
126
|
+
NArray[*array]
|
127
|
+
when :nmatrix # @see https://github.com/SciRuby/nmatrix/issues/91
|
128
|
+
NMatrix.new(:dense, [array.size, array.empty? ? 0 : array[0].size], array.flatten)
|
129
|
+
else
|
130
|
+
Matrix[*array]
|
131
|
+
end
|
132
|
+
end
|
133
|
+
|
134
|
+
# @param [GSL::Matrix,NArray,NMatrix,Matrix] matrix a matrix
|
135
|
+
# @return [GSL::Matrix,NArray,NMatrix,Matrix] the product
|
136
|
+
def multiply_self(matrix)
|
137
|
+
case @library
|
138
|
+
when :nmatrix
|
139
|
+
matrix.transpose.dot(matrix)
|
140
|
+
else
|
141
|
+
matrix.transpose * matrix
|
142
|
+
end
|
143
|
+
end
|
144
|
+
|
145
|
+
def log(number)
|
146
|
+
case @library
|
147
|
+
when :gsl
|
148
|
+
GSL::Sf::log(number)
|
149
|
+
when :narray
|
150
|
+
NMath.log(number)
|
151
|
+
else
|
152
|
+
Math.log(number)
|
153
|
+
end
|
154
|
+
end
|
155
|
+
|
156
|
+
def sqrt(number)
|
157
|
+
case @library
|
158
|
+
when :narray
|
159
|
+
NMath.sqrt(number)
|
160
|
+
else
|
161
|
+
Math.sqrt(number)
|
162
|
+
end
|
163
|
+
end
|
164
|
+
end
|
@@ -0,0 +1,78 @@
|
|
1
|
+
# A simple document-term matrix.
|
2
|
+
#
|
3
|
+
# @see http://lucene.apache.org/core/4_0_0-BETA/core/org/apache/lucene/search/similarities/TFIDFSimilarity.html
|
4
|
+
# @see http://lucene.apache.org/core/4_0_0-BETA/core/org/apache/lucene/search/similarities/BM25Similarity.html
|
5
|
+
# @see http://en.wikipedia.org/wiki/Okapi_BM25
|
6
|
+
class TfIdfSimilarity::TermCountModel
|
7
|
+
include TfIdfSimilarity::MatrixMethods
|
8
|
+
|
9
|
+
# The documents in the corpus.
|
10
|
+
attr_reader :documents
|
11
|
+
# The set of terms in the corpus.
|
12
|
+
attr_reader :terms
|
13
|
+
# The average number of tokens in a document.
|
14
|
+
attr_reader :average_document_size
|
15
|
+
|
16
|
+
# Builds a term-by-document matrix of raw term counts: one row per term in
# the corpus and one column per document.
#
# @param [Array<TfIdfSimilarity::Document>] documents documents
# @param [Hash] opts optional arguments
# @option opts [Symbol] :library :gsl, :narray, :nmatrix or :matrix (default)
def initialize(documents, opts = {})
  @documents = documents
  @terms = Set.new(documents.map(&:terms).flatten).to_a
  @library = (opts[:library] || :matrix).to_sym

  array = Array.new(terms.size) do |i|
    Array.new(documents.size) do |j|
      documents[j].term_count(terms[i])
    end
  end

  @matrix = initialize_matrix(array)

  # Divide by the number of documents rather than the matrix's column count:
  # the two agree whenever the matrix has at least one row, but a corpus whose
  # documents contain no terms yields a 0x0 matrix, and 0 / 0.0 would be NaN.
  @average_document_size = documents.empty? ? 0 : sum / documents.size.to_f
end
|
34
|
+
|
35
|
+
# @param [String] term a term
|
36
|
+
# @return [Integer] the number of documents the term appears in
|
37
|
+
def document_count(term)
|
38
|
+
index = terms.index(term)
|
39
|
+
if index
|
40
|
+
case @library
|
41
|
+
when :gsl, :narray
|
42
|
+
row(index).where.size
|
43
|
+
when :nmatrix
|
44
|
+
row(index).each.count(&:nonzero?)
|
45
|
+
else
|
46
|
+
vector = row(index)
|
47
|
+
unless vector.respond_to?(:count)
|
48
|
+
vector = vector.to_a
|
49
|
+
end
|
50
|
+
vector.count(&:nonzero?)
|
51
|
+
end
|
52
|
+
else
|
53
|
+
0
|
54
|
+
end
|
55
|
+
end
|
56
|
+
|
57
|
+
# @param [String] term a term
|
58
|
+
# @return [Integer] the number of times the term appears in the corpus
|
59
|
+
def term_count(term)
|
60
|
+
index = terms.index(term)
|
61
|
+
if index
|
62
|
+
case @library
|
63
|
+
when :gsl, :narray
|
64
|
+
row(index).sum
|
65
|
+
when :nmatrix
|
66
|
+
row(index).each.reduce(0, :+)
|
67
|
+
else
|
68
|
+
vector = row(index)
|
69
|
+
unless vector.respond_to?(:reduce)
|
70
|
+
vector = vector.to_a
|
71
|
+
end
|
72
|
+
vector.reduce(0, :+)
|
73
|
+
end
|
74
|
+
else
|
75
|
+
0
|
76
|
+
end
|
77
|
+
end
|
78
|
+
end
|
@@ -0,0 +1,81 @@
|
|
1
|
+
# A document-term matrix using either the tf*idf or BM25 functions.
|
2
|
+
#
|
3
|
+
# @see http://lucene.apache.org/core/4_0_0-BETA/core/org/apache/lucene/search/similarities/TFIDFSimilarity.html
|
4
|
+
# @see http://lucene.apache.org/core/4_0_0-BETA/core/org/apache/lucene/search/similarities/BM25Similarity.html
|
5
|
+
# @see http://en.wikipedia.org/wiki/Okapi_BM25
|
6
|
+
class TfIdfSimilarity::TfIdfModel
|
7
|
+
include TfIdfSimilarity::MatrixMethods
|
8
|
+
|
9
|
+
extend Forwardable
|
10
|
+
def_delegators :@model, :documents, :terms, :document_count
|
11
|
+
|
12
|
+
# @param [Array<TfIdfSimilarity::Document>] documents documents
|
13
|
+
# @param [Hash] opts optional arguments
|
14
|
+
# @option opts [Symbol] :library :gsl, :narray, :nmatrix or :matrix (default)
|
15
|
+
# @option opts [Symbol] :function :tfidf (default) or :bm25
|
16
|
+
def initialize(documents, opts = {})
|
17
|
+
@model = TfIdfSimilarity::TermCountModel.new(documents, opts)
|
18
|
+
@library = (opts[:library] || :matrix).to_sym
|
19
|
+
@function = (opts[:function] || :tfidf).to_sym
|
20
|
+
|
21
|
+
array = Array.new(terms.size) do |i|
|
22
|
+
idf = inverse_document_frequency(terms[i])
|
23
|
+
Array.new(documents.size) do |j|
|
24
|
+
term_frequency(documents[j], terms[i]) * idf
|
25
|
+
end
|
26
|
+
end
|
27
|
+
|
28
|
+
@matrix = initialize_matrix(array)
|
29
|
+
end
|
30
|
+
|
31
|
+
# Return the term's inverse document frequency.
|
32
|
+
#
|
33
|
+
# @param [String] term a term
|
34
|
+
# @return [Float] the term's inverse document frequency
|
35
|
+
def inverse_document_frequency(term)
|
36
|
+
df = @model.document_count(term)
|
37
|
+
if @function == :bm25
|
38
|
+
log((documents.size - df + 0.5) / (df + 0.5))
|
39
|
+
else
|
40
|
+
1 + log(documents.size / (df + 1.0))
|
41
|
+
end
|
42
|
+
end
|
43
|
+
alias_method :idf, :inverse_document_frequency
|
44
|
+
|
45
|
+
# Returns the term's frequency in the document.
#
# With the :bm25 function, the raw count is saturated and normalized by the
# document's length relative to the average document length. Otherwise, the
# square root of the raw count is returned.
#
# @param [Document] document a document
# @param [String] term a term
# @return [Float] the term's frequency in the document
#
# @note Like Lucene, we use a b value of 0.75 and a k1 value of 1.2.
def term_frequency(document, term)
  tf = document.term_count(term)
  if @function == :bm25
    # With k1 = 1.2 and b = 0.75: k1 + 1 = 2.2, k1 * (1 - b) = 0.3 and
    # k1 * b = 0.9. The length ratio must use this document's size, not the
    # number of documents in the corpus.
    (tf * 2.2) / (tf + 0.3 + 0.9 * document.size / @model.average_document_size)
  else
    sqrt(tf)
  end
end
|
60
|
+
alias_method :tf, :term_frequency
|
61
|
+
|
62
|
+
# Return the term frequency–inverse document frequency.
|
63
|
+
#
|
64
|
+
# @param [Document] document a document
|
65
|
+
# @param [String] term a term
|
66
|
+
# @return [Float] the term frequency–inverse document frequency
|
67
|
+
def term_frequency_inverse_document_frequency(document, term)
|
68
|
+
inverse_document_frequency(term) * term_frequency(document, term)
|
69
|
+
end
|
70
|
+
alias_method :tfidf, :term_frequency_inverse_document_frequency
|
71
|
+
|
72
|
+
# Returns a similarity matrix for the documents in the corpus.
|
73
|
+
#
|
74
|
+
# @return [GSL::Matrix,NMatrix,Matrix] a similarity matrix
|
75
|
+
# @note Columns are normalized to unit vectors, so we can calculate the cosine
|
76
|
+
# similarity of all document vectors. BM25 doesn't normalize columns, but
|
77
|
+
# BM25 wasn't written with this use case in mind.
|
78
|
+
def similarity_matrix
|
79
|
+
multiply_self(normalize)
|
80
|
+
end
|
81
|
+
end
|
@@ -1,5 +1,7 @@
|
|
1
1
|
# coding: utf-8
|
2
2
|
|
3
|
+
# A token.
|
4
|
+
#
|
3
5
|
# @note We can add more filters from Solr and stem using Porter's Snowball.
|
4
6
|
#
|
5
7
|
# @see https://github.com/aurelian/ruby-stemmer
|
@@ -14,29 +16,49 @@ class TfIdfSimilarity::Token < String
|
|
14
16
|
#
|
15
17
|
# @return [Boolean] whether the string is a token
|
16
18
|
def valid?
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
19
|
+
if RUBY_VERSION < '1.9'
|
20
|
+
!self[%r{
|
21
|
+
\A
|
22
|
+
(
|
23
|
+
\d | # number
|
24
|
+
[[:cntrl:]] | # control character
|
25
|
+
[[:punct:]] | # punctuation
|
26
|
+
[[:space:]] # whitespace
|
27
|
+
)+
|
28
|
+
\z
|
29
|
+
}x]
|
30
|
+
else
|
31
|
+
!self[%r{
|
32
|
+
\A
|
33
|
+
(
|
34
|
+
\d | # number
|
35
|
+
\p{Cntrl} | # control character
|
36
|
+
\p{Punct} | # punctuation
|
37
|
+
\p{Space} # whitespace
|
38
|
+
)+
|
39
|
+
\z
|
40
|
+
}x] # The Ruby 1.8 parser will complain about this regular expression.
|
41
|
+
end
|
27
42
|
end
|
28
43
|
|
44
|
+
# Returns a lowercase string.
|
45
|
+
#
|
29
46
|
# @return [Token] a lowercase string
|
30
47
|
#
|
31
48
|
# @see http://wiki.apache.org/solr/AnalyzersTokenizersTokenFilters#solr.LowerCaseFilterFactory
|
32
49
|
def lowercase_filter
|
33
|
-
self.class.new UnicodeUtils.downcase(self
|
50
|
+
self.class.new(defined?(UnicodeUtils) ? UnicodeUtils.downcase(self) : tr(
|
51
|
+
"ÀÁÂÃÄÅĀĂĄÇĆĈĊČÐĎĐÈÉÊËĒĔĖĘĚĜĞĠĢĤĦÌÍÎÏĨĪĬĮĴĶĹĻĽĿŁÑŃŅŇŊÒÓÔÕÖØŌŎŐŔŖŘŚŜŞŠŢŤŦÙÚÛÜŨŪŬŮŰŲŴÝŶŸŹŻŽ",
|
52
|
+
"àáâãäåāăąçćĉċčðďđèéêëēĕėęěĝğġģĥħìíîïĩīĭįĵķĺļľŀłñńņňŋòóôõöøōŏőŕŗřśŝşšţťŧùúûüũūŭůűųŵýŷÿźżž"
|
53
|
+
).downcase)
|
34
54
|
end
|
35
55
|
|
56
|
+
# Returns a string with no English possessive or periods in acronyms.
|
57
|
+
#
|
36
58
|
# @return [Token] a string with no English possessive or periods in acronyms
|
37
59
|
#
|
38
60
|
# @see http://wiki.apache.org/solr/AnalyzersTokenizersTokenFilters#solr.ClassicFilterFactory
|
39
61
|
def classic_filter
|
40
|
-
self.class.new
|
62
|
+
self.class.new(self.gsub('.', '').chomp("'s"))
|
41
63
|
end
|
42
64
|
end
|
@@ -0,0 +1,136 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
# @see https://github.com/bbcrd/Similarity/blob/master/test/test_document.rb
|
4
|
+
describe TfIdfSimilarity::Document do
|
5
|
+
let :text do
|
6
|
+
"FOO-foo BAR bar \r\n\t 123 !@#"
|
7
|
+
end
|
8
|
+
|
9
|
+
let :tokens do
|
10
|
+
['FOO-foo', 'BAR', 'bar', "\r\n\t", '123', '!@#']
|
11
|
+
end
|
12
|
+
|
13
|
+
let :document_without_text do
|
14
|
+
TfIdfSimilarity::Document.new('')
|
15
|
+
end
|
16
|
+
|
17
|
+
let :document do
|
18
|
+
TfIdfSimilarity::Document.new(text)
|
19
|
+
end
|
20
|
+
|
21
|
+
let :document_with_id do
|
22
|
+
TfIdfSimilarity::Document.new(text, :id => 'baz')
|
23
|
+
end
|
24
|
+
|
25
|
+
let :document_with_tokens do
|
26
|
+
TfIdfSimilarity::Document.new(text, :tokens => tokens)
|
27
|
+
end
|
28
|
+
|
29
|
+
let :document_with_term_counts do
|
30
|
+
TfIdfSimilarity::Document.new(text, :term_counts => {'bar' => 5, 'baz' => 10})
|
31
|
+
end
|
32
|
+
|
33
|
+
let :document_with_term_counts_and_size do
|
34
|
+
TfIdfSimilarity::Document.new(text, :term_counts => {'bar' => 5, 'baz' => 10}, :size => 20)
|
35
|
+
end
|
36
|
+
|
37
|
+
let :document_with_size do
|
38
|
+
TfIdfSimilarity::Document.new(text, :size => 10)
|
39
|
+
end
|
40
|
+
|
41
|
+
describe '#id' do
|
42
|
+
it 'should return the ID if no ID given' do
|
43
|
+
document.id.should == document.object_id
|
44
|
+
end
|
45
|
+
|
46
|
+
it 'should return the given ID' do
|
47
|
+
document_with_id.id.should == 'baz'
|
48
|
+
end
|
49
|
+
end
|
50
|
+
|
51
|
+
describe '#text' do
|
52
|
+
it 'should return the text' do
|
53
|
+
document.text.should == text
|
54
|
+
end
|
55
|
+
end
|
56
|
+
|
57
|
+
describe '#size' do
|
58
|
+
it 'should return the number of tokens if no tokens given' do
|
59
|
+
document.size.should == 4
|
60
|
+
end
|
61
|
+
|
62
|
+
it 'should return the number of tokens if tokens given' do
|
63
|
+
document_with_tokens.size.should == 3
|
64
|
+
end
|
65
|
+
|
66
|
+
it 'should return the number of tokens if no text given' do
|
67
|
+
document_without_text.size.should == 0
|
68
|
+
end
|
69
|
+
|
70
|
+
it 'should return the number of tokens if term counts given' do
|
71
|
+
document_with_term_counts.size.should == 15
|
72
|
+
end
|
73
|
+
|
74
|
+
it 'should return the given number of tokens if term counts and size given' do
|
75
|
+
document_with_term_counts_and_size.size.should == 20
|
76
|
+
end
|
77
|
+
|
78
|
+
it 'should not return the given number of tokens if term counts not given' do
|
79
|
+
document_with_size.size.should_not == 10
|
80
|
+
end
|
81
|
+
end
|
82
|
+
|
83
|
+
describe '#term_counts' do
|
84
|
+
it 'should return the term counts if no tokens given' do
|
85
|
+
document.term_counts.should == {'foo' => 2, 'bar' => 2}
|
86
|
+
end
|
87
|
+
|
88
|
+
it 'should return the term counts if tokens given' do
|
89
|
+
document_with_tokens.term_counts.should == {'foo-foo' => 1, 'bar' => 2}
|
90
|
+
end
|
91
|
+
|
92
|
+
it 'should return no term counts if no text given' do
|
93
|
+
document_without_text.term_counts.should == {}
|
94
|
+
end
|
95
|
+
|
96
|
+
it 'should return the term counts if term counts given' do
|
97
|
+
document_with_term_counts.term_counts.should == {'bar' => 5, 'baz' => 10}
|
98
|
+
end
|
99
|
+
end
|
100
|
+
|
101
|
+
describe '#terms' do
|
102
|
+
it 'should return the terms if no tokens given' do
|
103
|
+
document.terms.sort.should == ['bar', 'foo']
|
104
|
+
end
|
105
|
+
|
106
|
+
it 'should return the terms if tokens given' do
|
107
|
+
document_with_tokens.terms.sort.should == ['bar', 'foo-foo']
|
108
|
+
end
|
109
|
+
|
110
|
+
it 'should return no terms if no text given' do
|
111
|
+
document_without_text.terms.should == []
|
112
|
+
end
|
113
|
+
|
114
|
+
it 'should return the terms if term counts given' do
|
115
|
+
document_with_term_counts.terms.sort.should == ['bar', 'baz']
|
116
|
+
end
|
117
|
+
end
|
118
|
+
|
119
|
+
describe '#term_count' do
|
120
|
+
it 'should return the term count if no tokens given' do
|
121
|
+
document.term_count('foo').should == 2
|
122
|
+
end
|
123
|
+
|
124
|
+
it 'should return the term count if tokens given' do
|
125
|
+
document_with_tokens.term_count('foo-foo').should == 1
|
126
|
+
end
|
127
|
+
|
128
|
+
it 'should return no term count if no text given' do
|
129
|
+
document_without_text.term_count('foo').should == 0
|
130
|
+
end
|
131
|
+
|
132
|
+
it 'should return the term count if term counts given' do
|
133
|
+
document_with_term_counts.term_count('bar').should == 5
|
134
|
+
end
|
135
|
+
end
|
136
|
+
end
|