tf-idf-similarity 0.1.3 → 0.1.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.travis.yml +1 -8
- data/Gemfile +2 -2
- data/README.md +40 -9
- data/lib/tf-idf-similarity.rb +1 -0
- data/lib/tf-idf-similarity/bm25_model.rb +23 -62
- data/lib/tf-idf-similarity/document.rb +69 -67
- data/lib/tf-idf-similarity/extras/document.rb +10 -8
- data/lib/tf-idf-similarity/extras/tf_idf_model.rb +157 -155
- data/lib/tf-idf-similarity/matrix_methods.rb +137 -135
- data/lib/tf-idf-similarity/model.rb +66 -0
- data/lib/tf-idf-similarity/term_count_model.rb +59 -57
- data/lib/tf-idf-similarity/tf_idf_model.rb +21 -60
- data/lib/tf-idf-similarity/token.rb +39 -37
- data/lib/tf-idf-similarity/version.rb +1 -1
- data/spec/bm25_model_spec.rb +200 -0
- data/spec/document_spec.rb +98 -96
- data/spec/extras/tf_idf_model_spec.rb +224 -222
- data/spec/spec_helper.rb +6 -0
- data/spec/term_count_model_spec.rb +76 -74
- data/spec/tf_idf_model_spec.rb +143 -117
- data/spec/token_spec.rb +23 -21
- metadata +6 -2
@@ -1,66 +1,27 @@
|
|
1
1
|
# A document-term matrix using the tf*idf function.
|
2
2
|
#
|
3
3
|
# @see http://lucene.apache.org/core/4_0_0-BETA/core/org/apache/lucene/search/similarities/TFIDFSimilarity.html
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
def initialize(documents, opts = {})
|
14
|
-
@model = TfIdfSimilarity::TermCountModel.new(documents, opts)
|
15
|
-
@library = (opts[:library] || :matrix).to_sym
|
16
|
-
|
17
|
-
array = Array.new(terms.size) do |i|
|
18
|
-
idf = inverse_document_frequency(terms[i])
|
19
|
-
Array.new(documents.size) do |j|
|
20
|
-
term_frequency(documents[j], terms[i]) * idf
|
21
|
-
end
|
4
|
+
module TfIdfSimilarity
|
5
|
+
class TfIdfModel < Model
|
6
|
+
# Return the term's inverse document frequency.
|
7
|
+
#
|
8
|
+
# @param [String] term a term
|
9
|
+
# @return [Float] the term's inverse document frequency
|
10
|
+
def inverse_document_frequency(term)
|
11
|
+
df = @model.document_count(term)
|
12
|
+
1 + log(documents.size / (df + 1.0))
|
22
13
|
end
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
alias_method :idf, :inverse_document_frequency
|
36
|
-
|
37
|
-
# Returns the term's frequency in the document.
|
38
|
-
#
|
39
|
-
# @param [Document] document a document
|
40
|
-
# @param [String] term a term
|
41
|
-
# @return [Float] the term's frequency in the document
|
42
|
-
def term_frequency(document, term)
|
43
|
-
tf = document.term_count(term)
|
44
|
-
sqrt(tf)
|
45
|
-
end
|
46
|
-
alias_method :tf, :term_frequency
|
47
|
-
|
48
|
-
# Return the term frequency–inverse document frequency.
|
49
|
-
#
|
50
|
-
# @param [Document] document a document
|
51
|
-
# @param [String] term a term
|
52
|
-
# @return [Float] the term frequency–inverse document frequency
|
53
|
-
def term_frequency_inverse_document_frequency(document, term)
|
54
|
-
inverse_document_frequency(term) * term_frequency(document, term)
|
55
|
-
end
|
56
|
-
alias_method :tfidf, :term_frequency_inverse_document_frequency
|
57
|
-
|
58
|
-
# Returns a similarity matrix for the documents in the corpus.
|
59
|
-
#
|
60
|
-
# @return [GSL::Matrix,NMatrix,Matrix] a similarity matrix
|
61
|
-
# @note Columns are normalized to unit vectors, so we can calculate the cosine
|
62
|
-
# similarity of all document vectors.
|
63
|
-
def similarity_matrix
|
64
|
-
multiply_self(normalize)
|
14
|
+
alias_method :idf, :inverse_document_frequency
|
15
|
+
|
16
|
+
# Returns the term's frequency in the document.
|
17
|
+
#
|
18
|
+
# @param [Document] document a document
|
19
|
+
# @param [String] term a term
|
20
|
+
# @return [Float] the term's frequency in the document
|
21
|
+
def term_frequency(document, term)
|
22
|
+
tf = document.term_count(term)
|
23
|
+
sqrt(tf)
|
24
|
+
end
|
25
|
+
alias_method :tf, :term_frequency
|
65
26
|
end
|
66
27
|
end
|
@@ -8,44 +8,46 @@
|
|
8
8
|
# @see http://wiki.apache.org/solr/AnalyzersTokenizersTokenFilters#solr.StopFilterFactory
|
9
9
|
# @see http://wiki.apache.org/solr/AnalyzersTokenizersTokenFilters#solr.WordDelimiterFilterFactory
|
10
10
|
# @see http://wiki.apache.org/solr/AnalyzersTokenizersTokenFilters#solr.SynonymFilterFactory
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
11
|
+
module TfIdfSimilarity
|
12
|
+
class Token < String
|
13
|
+
# Returns a falsy value if all its characters are numbers, punctuation,
|
14
|
+
# whitespace or control characters.
|
15
|
+
#
|
16
|
+
# @note Some implementations ignore one and two-letter words.
|
17
|
+
#
|
18
|
+
# @return [Boolean] whether the string is a token
|
19
|
+
def valid?
|
20
|
+
!self[%r{
|
21
|
+
\A
|
22
|
+
(
|
23
|
+
\d | # number
|
24
|
+
[[:cntrl:]] | # control character
|
25
|
+
[[:punct:]] | # punctuation
|
26
|
+
[[:space:]] # whitespace
|
27
|
+
)+
|
28
|
+
\z
|
29
|
+
}x]
|
30
|
+
end
|
30
31
|
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
32
|
+
# Returns a lowercase string.
|
33
|
+
#
|
34
|
+
# @return [Token] a lowercase string
|
35
|
+
#
|
36
|
+
# @see http://wiki.apache.org/solr/AnalyzersTokenizersTokenFilters#solr.LowerCaseFilterFactory
|
37
|
+
def lowercase_filter
|
38
|
+
self.class.new(defined?(UnicodeUtils) ? UnicodeUtils.downcase(self) : tr(
|
39
|
+
"ÀÁÂÃÄÅĀĂĄÇĆĈĊČÐĎĐÈÉÊËĒĔĖĘĚĜĞĠĢĤĦÌÍÎÏĨĪĬĮĴĶĹĻĽĿŁÑŃŅŇŊÒÓÔÕÖØŌŎŐŔŖŘŚŜŞŠŢŤŦÙÚÛÜŨŪŬŮŰŲŴÝŶŸŹŻŽ",
|
40
|
+
"àáâãäåāăąçćĉċčðďđèéêëēĕėęěĝğġģĥħìíîïĩīĭįĵķĺļľŀłñńņňŋòóôõöøōŏőŕŗřśŝşšţťŧùúûüũūŭůűųŵýŷÿźżž"
|
41
|
+
).downcase)
|
42
|
+
end
|
42
43
|
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
44
|
+
# Returns a string with no English possessive or periods in acronyms.
|
45
|
+
#
|
46
|
+
# @return [Token] a string with no English possessive or periods in acronyms
|
47
|
+
#
|
48
|
+
# @see http://wiki.apache.org/solr/AnalyzersTokenizersTokenFilters#solr.ClassicFilterFactory
|
49
|
+
def classic_filter
|
50
|
+
self.class.new(self.gsub('.', '').chomp("'s"))
|
51
|
+
end
|
50
52
|
end
|
51
53
|
end
|
@@ -0,0 +1,200 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
module TfIdfSimilarity
|
4
|
+
describe BM25Model do
|
5
|
+
let :text do
|
6
|
+
"FOO-foo BAR bar \r\n\t 123 !@#"
|
7
|
+
end
|
8
|
+
|
9
|
+
let :tokens do
|
10
|
+
['FOO-foo', 'BAR', 'bar', "\r\n\t", '123', '!@#']
|
11
|
+
end
|
12
|
+
|
13
|
+
let :document_without_text do
|
14
|
+
Document.new('')
|
15
|
+
end
|
16
|
+
|
17
|
+
let :document do
|
18
|
+
Document.new(text)
|
19
|
+
end
|
20
|
+
|
21
|
+
let :document_with_tokens do
|
22
|
+
Document.new(text, :tokens => tokens)
|
23
|
+
end
|
24
|
+
|
25
|
+
let :document_with_term_counts do
|
26
|
+
Document.new(text, :term_counts => {'bar' => 5, 'baz' => 10})
|
27
|
+
end
|
28
|
+
|
29
|
+
let :non_corpus_document do
|
30
|
+
Document.new('foo foo foo')
|
31
|
+
end
|
32
|
+
|
33
|
+
def similarity_matrix_values(model)
|
34
|
+
matrix = model.similarity_matrix
|
35
|
+
if MATRIX_LIBRARY == :nmatrix
|
36
|
+
matrix.each.to_a
|
37
|
+
else
|
38
|
+
matrix.to_a.flatten
|
39
|
+
end
|
40
|
+
end
|
41
|
+
|
42
|
+
context 'without documents', :empty_matrix => true do
|
43
|
+
let :model do
|
44
|
+
BM25Model.new([], :library => MATRIX_LIBRARY)
|
45
|
+
end
|
46
|
+
|
47
|
+
describe '#documents' do
|
48
|
+
it 'should be empty' do
|
49
|
+
model.documents.should be_empty
|
50
|
+
end
|
51
|
+
end
|
52
|
+
|
53
|
+
describe '#document_index' do
|
54
|
+
it 'should return nil' do
|
55
|
+
model.document_index(document).should be_nil
|
56
|
+
end
|
57
|
+
end
|
58
|
+
|
59
|
+
describe '#text_index' do
|
60
|
+
it 'should return nil' do
|
61
|
+
model.text_index(text).should be_nil
|
62
|
+
end
|
63
|
+
end
|
64
|
+
|
65
|
+
describe '#terms' do
|
66
|
+
it 'should be empty' do
|
67
|
+
model.terms.should be_empty
|
68
|
+
end
|
69
|
+
end
|
70
|
+
|
71
|
+
describe '#inverse_document_frequency' do
|
72
|
+
it 'should return negative infinity' do
|
73
|
+
model.idf('foo').should == 0.0
|
74
|
+
end
|
75
|
+
end
|
76
|
+
|
77
|
+
describe '#term_frequency' do
|
78
|
+
it 'should return the term frequency' do
|
79
|
+
model.tf(document, 'foo').should be_nan
|
80
|
+
end
|
81
|
+
end
|
82
|
+
|
83
|
+
describe '#term_frequency_inverse_document_frequency' do
|
84
|
+
it 'should return negative infinity' do
|
85
|
+
model.tfidf(document, 'foo').should be_nan
|
86
|
+
end
|
87
|
+
end
|
88
|
+
|
89
|
+
describe '#similarity_matrix' do
|
90
|
+
it 'should be empty' do
|
91
|
+
similarity_matrix_values(model).should be_empty
|
92
|
+
end
|
93
|
+
end
|
94
|
+
end
|
95
|
+
|
96
|
+
context 'with documents' do
|
97
|
+
let :documents do
|
98
|
+
[
|
99
|
+
document,
|
100
|
+
document_with_tokens,
|
101
|
+
document_without_text,
|
102
|
+
document_with_term_counts,
|
103
|
+
]
|
104
|
+
end
|
105
|
+
|
106
|
+
let :model do
|
107
|
+
BM25Model.new(documents, :library => MATRIX_LIBRARY)
|
108
|
+
end
|
109
|
+
|
110
|
+
describe '#documents' do
|
111
|
+
it 'should return the documents' do
|
112
|
+
model.documents.should == documents
|
113
|
+
end
|
114
|
+
end
|
115
|
+
|
116
|
+
describe '#document_index' do
|
117
|
+
it 'should return nil' do
|
118
|
+
model.document_index(document).should == 0
|
119
|
+
end
|
120
|
+
end
|
121
|
+
|
122
|
+
describe '#text_index' do
|
123
|
+
it 'should return the index' do
|
124
|
+
model.text_index(text).should == 0
|
125
|
+
end
|
126
|
+
end
|
127
|
+
|
128
|
+
describe '#terms' do
|
129
|
+
it 'should return the terms' do
|
130
|
+
model.terms.to_a.sort.should == ['bar', 'baz', 'foo', 'foo-foo']
|
131
|
+
end
|
132
|
+
end
|
133
|
+
|
134
|
+
describe '#inverse_document_frequency' do
|
135
|
+
it 'should return the inverse document frequency' do
|
136
|
+
model.idf('foo').should be_within(0.001).of(Math.log((4 - 1 + 0.5) / (1 + 0.5)))
|
137
|
+
end
|
138
|
+
|
139
|
+
it 'should return the inverse document frequency of a non-occurring term' do
|
140
|
+
model.idf('xxx').should be_within(0.001).of(Math.log((4 - 0 + 0.5) / (0 + 0.5)))
|
141
|
+
end
|
142
|
+
end
|
143
|
+
|
144
|
+
describe '#term_frequency' do
|
145
|
+
it 'should return the term frequency if no tokens given' do
|
146
|
+
model.tf(document, 'foo').should == (2 * 2.2) / (2 + 0.3 + 0.9 * 4 / 5.5)
|
147
|
+
end
|
148
|
+
|
149
|
+
it 'should return the term frequency if tokens given' do
|
150
|
+
model.tf(document_with_tokens, 'foo-foo').should == (1 * 2.2) / (1 + 0.3 + 0.9 * 4 / 5.5)
|
151
|
+
end
|
152
|
+
|
153
|
+
it 'should return no term frequency if no text given' do
|
154
|
+
model.tf(document_without_text, 'foo').should == 0
|
155
|
+
end
|
156
|
+
|
157
|
+
it 'should return the term frequency if term counts given' do
|
158
|
+
model.tf(document_with_term_counts, 'bar').should == (5 * 2.2) / (5 + 0.3 + 0.9 * 4 / 5.5)
|
159
|
+
end
|
160
|
+
|
161
|
+
it 'should return the term frequency of a non-occurring term' do
|
162
|
+
model.tf(document, 'xxx').should == 0
|
163
|
+
end
|
164
|
+
|
165
|
+
it 'should return the term frequency in a non-occurring document' do
|
166
|
+
model.tf(non_corpus_document, 'foo').should == (3 * 2.2) / (3 + 0.3 + 0.9 * 4 / 5.5)
|
167
|
+
end
|
168
|
+
end
|
169
|
+
|
170
|
+
describe '#term_frequency_inverse_document_frequency' do
|
171
|
+
it 'should return the tf*idf' do
|
172
|
+
model.tfidf(document, 'foo').should be_within(0.001).of(Math.log((4 - 1 + 0.5) / (1 + 0.5)) * (2 * 2.2) / (2 + 0.3 + 0.9 * 4 / 5.5))
|
173
|
+
end
|
174
|
+
|
175
|
+
it 'should return the tf*idf of a non-occurring term' do
|
176
|
+
model.tfidf(document, 'xxx').should == 0
|
177
|
+
end
|
178
|
+
|
179
|
+
it 'should return the tf*idf in a non-occurring term' do
|
180
|
+
model.tfidf(non_corpus_document, 'foo').should be_within(0.001).of(Math.log((4 - 1 + 0.5) / (1 + 0.5)) * (3 * 2.2) / (3 + 0.3 + 0.9 * 4 / 5.5))
|
181
|
+
end
|
182
|
+
end
|
183
|
+
|
184
|
+
describe '#similarity_matrix' do
|
185
|
+
it 'should return the similarity matrix' do
|
186
|
+
expected = [
|
187
|
+
1.0, 0.564, 0.0, 0.479,
|
188
|
+
0.564, 1.0, 0.0, 0.540,
|
189
|
+
0.0, 0.0, 0.0, 0.0,
|
190
|
+
0.479, 0.540, 0.0, 1.0,
|
191
|
+
]
|
192
|
+
|
193
|
+
similarity_matrix_values(model).each_with_index do |value,i|
|
194
|
+
value.should be_within(0.001).of(expected[i])
|
195
|
+
end
|
196
|
+
end
|
197
|
+
end
|
198
|
+
end
|
199
|
+
end
|
200
|
+
end
|
data/spec/document_spec.rb
CHANGED
@@ -1,136 +1,138 @@
|
|
1
1
|
require 'spec_helper'
|
2
2
|
|
3
3
|
# @see https://github.com/bbcrd/Similarity/blob/master/test/test_document.rb
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
let :tokens do
|
10
|
-
['FOO-foo', 'BAR', 'bar', "\r\n\t", '123', '!@#']
|
11
|
-
end
|
12
|
-
|
13
|
-
let :document_without_text do
|
14
|
-
TfIdfSimilarity::Document.new('')
|
15
|
-
end
|
16
|
-
|
17
|
-
let :document do
|
18
|
-
TfIdfSimilarity::Document.new(text)
|
19
|
-
end
|
20
|
-
|
21
|
-
let :document_with_id do
|
22
|
-
TfIdfSimilarity::Document.new(text, :id => 'baz')
|
23
|
-
end
|
24
|
-
|
25
|
-
let :document_with_tokens do
|
26
|
-
TfIdfSimilarity::Document.new(text, :tokens => tokens)
|
27
|
-
end
|
28
|
-
|
29
|
-
let :document_with_term_counts do
|
30
|
-
TfIdfSimilarity::Document.new(text, :term_counts => {'bar' => 5, 'baz' => 10})
|
31
|
-
end
|
32
|
-
|
33
|
-
let :document_with_term_counts_and_size do
|
34
|
-
TfIdfSimilarity::Document.new(text, :term_counts => {'bar' => 5, 'baz' => 10}, :size => 20)
|
35
|
-
end
|
36
|
-
|
37
|
-
let :document_with_size do
|
38
|
-
TfIdfSimilarity::Document.new(text, :size => 10)
|
39
|
-
end
|
40
|
-
|
41
|
-
describe '#id' do
|
42
|
-
it 'should return the ID if no ID given' do
|
43
|
-
document.id.should == document.object_id
|
4
|
+
module TfIdfSimilarity
|
5
|
+
describe Document do
|
6
|
+
let :text do
|
7
|
+
"FOO-foo BAR bar \r\n\t 123 !@#"
|
44
8
|
end
|
45
9
|
|
46
|
-
|
47
|
-
|
10
|
+
let :tokens do
|
11
|
+
['FOO-foo', 'BAR', 'bar', "\r\n\t", '123', '!@#']
|
48
12
|
end
|
49
|
-
end
|
50
13
|
|
51
|
-
|
52
|
-
|
53
|
-
document.text.should == text
|
14
|
+
let :document_without_text do
|
15
|
+
Document.new('')
|
54
16
|
end
|
55
|
-
end
|
56
17
|
|
57
|
-
|
58
|
-
|
59
|
-
document.size.should == 4
|
18
|
+
let :document do
|
19
|
+
Document.new(text)
|
60
20
|
end
|
61
21
|
|
62
|
-
|
63
|
-
|
22
|
+
let :document_with_id do
|
23
|
+
Document.new(text, :id => 'baz')
|
64
24
|
end
|
65
25
|
|
66
|
-
|
67
|
-
|
26
|
+
let :document_with_tokens do
|
27
|
+
Document.new(text, :tokens => tokens)
|
68
28
|
end
|
69
29
|
|
70
|
-
|
71
|
-
|
30
|
+
let :document_with_term_counts do
|
31
|
+
Document.new(text, :term_counts => {'bar' => 5, 'baz' => 10})
|
72
32
|
end
|
73
33
|
|
74
|
-
|
75
|
-
|
34
|
+
let :document_with_term_counts_and_size do
|
35
|
+
Document.new(text, :term_counts => {'bar' => 5, 'baz' => 10}, :size => 20)
|
76
36
|
end
|
77
37
|
|
78
|
-
|
79
|
-
|
38
|
+
let :document_with_size do
|
39
|
+
Document.new(text, :size => 10)
|
80
40
|
end
|
81
|
-
end
|
82
41
|
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
42
|
+
describe '#id' do
|
43
|
+
it 'should return the ID if no ID given' do
|
44
|
+
document.id.should == document.object_id
|
45
|
+
end
|
87
46
|
|
88
|
-
|
89
|
-
|
47
|
+
it 'should return the given ID' do
|
48
|
+
document_with_id.id.should == 'baz'
|
49
|
+
end
|
90
50
|
end
|
91
51
|
|
92
|
-
|
93
|
-
|
52
|
+
describe '#text' do
|
53
|
+
it 'should return the text' do
|
54
|
+
document.text.should == text
|
55
|
+
end
|
94
56
|
end
|
95
57
|
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
58
|
+
describe '#size' do
|
59
|
+
it 'should return the number of tokens if no tokens given' do
|
60
|
+
document.size.should == 4
|
61
|
+
end
|
100
62
|
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
end
|
63
|
+
it 'should return the number of tokens if tokens given' do
|
64
|
+
document_with_tokens.size.should == 3
|
65
|
+
end
|
105
66
|
|
106
|
-
|
107
|
-
|
108
|
-
|
67
|
+
it 'should return the number of tokens if no text given' do
|
68
|
+
document_without_text.size.should == 0
|
69
|
+
end
|
109
70
|
|
110
|
-
|
111
|
-
|
112
|
-
|
71
|
+
it 'should return the number of tokens if term counts given' do
|
72
|
+
document_with_term_counts.size.should == 15
|
73
|
+
end
|
113
74
|
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
end
|
75
|
+
it 'should return the given number of tokens if term counts and size given' do
|
76
|
+
document_with_term_counts_and_size.size.should == 20
|
77
|
+
end
|
118
78
|
|
119
|
-
|
120
|
-
|
121
|
-
|
79
|
+
it 'should not return the given number of tokens if term counts not given' do
|
80
|
+
document_with_size.size.should_not == 10
|
81
|
+
end
|
122
82
|
end
|
123
83
|
|
124
|
-
|
125
|
-
|
84
|
+
describe '#term_counts' do
|
85
|
+
it 'should return the term counts if no tokens given' do
|
86
|
+
document.term_counts.should == {'foo' => 2, 'bar' => 2}
|
87
|
+
end
|
88
|
+
|
89
|
+
it 'should return the term counts if tokens given' do
|
90
|
+
document_with_tokens.term_counts.should == {'foo-foo' => 1, 'bar' => 2}
|
91
|
+
end
|
92
|
+
|
93
|
+
it 'should return no term counts if no text given' do
|
94
|
+
document_without_text.term_counts.should == {}
|
95
|
+
end
|
96
|
+
|
97
|
+
it 'should return the term counts if term counts given' do
|
98
|
+
document_with_term_counts.term_counts.should == {'bar' => 5, 'baz' => 10}
|
99
|
+
end
|
126
100
|
end
|
127
101
|
|
128
|
-
|
129
|
-
|
102
|
+
describe '#terms' do
|
103
|
+
it 'should return the terms if no tokens given' do
|
104
|
+
document.terms.sort.should == ['bar', 'foo']
|
105
|
+
end
|
106
|
+
|
107
|
+
it 'should return the terms if tokens given' do
|
108
|
+
document_with_tokens.terms.sort.should == ['bar', 'foo-foo']
|
109
|
+
end
|
110
|
+
|
111
|
+
it 'should return no terms if no text given' do
|
112
|
+
document_without_text.terms.should == []
|
113
|
+
end
|
114
|
+
|
115
|
+
it 'should return the terms if term counts given' do
|
116
|
+
document_with_term_counts.terms.sort.should == ['bar', 'baz']
|
117
|
+
end
|
130
118
|
end
|
131
119
|
|
132
|
-
|
133
|
-
|
120
|
+
describe '#term_count' do
|
121
|
+
it 'should return the term count if no tokens given' do
|
122
|
+
document.term_count('foo').should == 2
|
123
|
+
end
|
124
|
+
|
125
|
+
it 'should return the term count if tokens given' do
|
126
|
+
document_with_tokens.term_count('foo-foo').should == 1
|
127
|
+
end
|
128
|
+
|
129
|
+
it 'should return no term count if no text given' do
|
130
|
+
document_without_text.term_count('foo').should == 0
|
131
|
+
end
|
132
|
+
|
133
|
+
it 'should return the term count if term counts given' do
|
134
|
+
document_with_term_counts.term_count('bar').should == 5
|
135
|
+
end
|
134
136
|
end
|
135
137
|
end
|
136
138
|
end
|