noctivityinc-classifier191 1.3.5

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,337 @@
1
+ # Author:: David Fayram (mailto:dfayram@lensmen.net)
2
+ # Copyright:: Copyright (c) 2005 David Fayram II
3
+ # License:: LGPL
4
+
5
+ begin
6
+ raise LoadError if ENV['NATIVE_VECTOR'] == "true" # to test the native vector class, try `rake test NATIVE_VECTOR=true`
7
+
8
+ require 'gsl' # requires http://rb-gsl.rubyforge.org/
9
+ require 'classifier/extensions/vector_serialize'
10
+ $GSL = true
11
+
12
+ rescue LoadError
13
+ warn "Notice: for 10x faster LSI support, please install http://rb-gsl.rubyforge.org/"
14
+ require 'classifier/extensions/vector'
15
+ end
16
+
17
+ require 'classifier/lsi/word_list'
18
+ require 'classifier/lsi/content_node'
19
+ require 'classifier/lsi/summary'
20
+
21
+ module Classifier
22
+
23
+ # This class implements a Latent Semantic Indexer, which can search, classify and cluster
24
+ # data based on underlying semantic relations. For more information on the algorithms used,
25
+ # please consult Wikipedia[http://en.wikipedia.org/wiki/Latent_Semantic_Indexing].
26
+ class LSI < Classifier::Base
27
+
28
+ attr_reader :word_list
29
+ attr_accessor :auto_rebuild
30
+
31
# Builds an empty index.
#
# By default every call to #add_item triggers a full reindex. To batch
# additions and index manually, pass:
#   Classifier::LSI.new :auto_rebuild => false
#
def initialize(options = {})
  @auto_rebuild = true unless options[:auto_rebuild] == false
  @word_list = WordList.new
  @items = {}
  @version = 0
  @built_at_version = -1 # forces the first build_index to run
  super
end
41
+
42
# True when content has changed since the last #build_index. The index
# must be rebuilt after adding information, but before searching,
# classifying or clustering. An index with fewer than two items never
# needs building.
def needs_rebuild?
  return false unless @items.keys.size > 1
  @version != @built_at_version
end
48
+
49
# Indexes +item+. The item is stringified via #to_s unless a block is
# supplied; the block receives the item and must return the text to
# index, so the item itself may be just a reference (URL, file name,
# ActiveRecord object, ...).
#
# For example:
#   lsi = Classifier::LSI.new
#   lsi.add_item "This is just plain text"
#   lsi.add_item("/home/me/filename.txt") { |x| File.read x }
#   ar = ActiveRecordObject.find( :all )
#   lsi.add_item(ar, *ar.categories) { |x| ar.content }
#
def add_item( item, *categories, &block )
  source_text = block ? block.call(item) : item.to_s
  @items[item] = ContentNode.new(clean_word_hash(source_text), *categories)
  @version += 1
  build_index if @auto_rebuild
end
68
+
69
# Shorthand for #add_item with no categories; +item+ is duck-typed
# through #to_s by add_item.
def <<( item )
  add_item(item)
end
76
+
77
# Returns the category list for an indexed item (an empty array when the
# item is unknown). The returned array may be mutated freely; changing an
# item's categories does not invalidate the index.
def categories_for(item)
  node = @items[item]
  node ? node.categories : []
end
83
+
84
# Removes +item+ from the index, if it is indexed, bumping the version so
# the index rebuilds on next use. Unknown items are silently ignored.
#
# Fix: the original called the nonexistent methods Array#contain? and
# Hash#remove, so any call raised NoMethodError. Hash#include? (a key
# check) and Hash#delete are the correct methods.
def remove_item( item )
  if @items.include? item
    @items.delete item
    @version += 1
  end
end
92
+
93
# All items currently held in the index, in insertion order.
def items
  @items.each_key.to_a
end
97
+
98
# NOTE: a second, byte-for-byte identical definition of #categories_for
# appeared here. In Ruby a re-definition silently overwrites the earlier
# method, so the duplicate added nothing and has been removed; the
# definition earlier in this class remains authoritative.
104
+
105
# This function rebuilds the index if needs_rebuild? returns true.
# For very large document spaces, this indexing operation may take some
# time to complete, so it may be wise to place the operation in another
# thread.
#
# As a rule, indexing will be fairly swift on modern machines until
# you have well over 500 documents indexed, or have an incredibly diverse
# vocabulary for your documents.
#
# The optional parameter "cutoff" is a tuning parameter. When the index is
# built, a certain number of s-values are discarded from the system. The
# cutoff parameter tells the indexer how many of these values to keep.
# A value of 1 for cutoff means that no semantic analysis will take place,
# turning the LSI class into a simple vector search engine.
def build_index( cutoff=0.75 )
  return unless needs_rebuild?
  make_word_list

  doc_list = @items.values
  # One raw term-frequency vector per document, over the shared word list.
  tda = doc_list.collect { |node| node.raw_vector_with( @word_list ) }

  if $GSL
    # GSL path: transpose so terms are rows and documents are columns,
    # then rank-reduce via SVD (see build_reduced_matrix).
    tdm = GSL::Matrix.alloc(*tda).trans
    ntdm = build_reduced_matrix(tdm, cutoff)

    # Each column of the reduced matrix is a document's LSI vector.
    ntdm.size[1].times do |col|
      vec = GSL::Vector.alloc( ntdm.column(col) ).row
      doc_list[col].lsi_vector = vec
      doc_list[col].lsi_norm = vec.normalize
    end
  else
    # Pure-Ruby Matrix fallback (10x slower; see the warning at load time).
    tdm = Matrix.rows(tda).trans
    ntdm = build_reduced_matrix(tdm, cutoff)

    ntdm.row_size.times do |col|
      doc_list[col].lsi_vector = ntdm.column(col) if doc_list[col]
      doc_list[col].lsi_norm = ntdm.column(col).normalize if doc_list[col]
    end
  end

  # Mark the index as current for this content version.
  @built_at_version = @version
end
147
+
148
# This method returns max_chunks entries, ordered by their average semantic
# rating. Essentially, the average distance of each entry from all other
# entries is calculated, and the highest are returned.
#
# This can be used to build a summary service, or to provide more
# information about your dataset's general content. For example, if you
# were to use categorize on the results of this data, you could gather
# information on what your dataset is generally about.
#
# Fix: the original ended the chain with a bare `.map` (no block), which
# returns an Enumerator rather than the array of items; it has been
# removed. The inject block also shadowed the outer loop variable `x`.
def highest_relative_content( max_chunks=10 )
  return [] if needs_rebuild?

  avg_density = Hash.new
  @items.each_key do |item|
    avg_density[item] =
      proximity_array_for_content(item).inject(0.0) { |sum, pair| sum + pair[1] }
  end

  avg_density.keys.sort_by { |item| avg_density[item] }.reverse[0..max_chunks-1]
end
164
+
165
# This function is the primitive that find_related and classify build
# upon. It returns an array of 2-element arrays: the first element is a
# document, the second its "score" — how "close" it is to the other
# indexed items. Results are sorted from highest to lowest score.
#
# These values are somewhat arbitrary, having to do with the vector space
# created by your content, so the magnitude is interpretable but not
# always meaningful between indexes.
#
# The parameter doc is the content to compare. If that content is not
# indexed, you can pass an optional block to define how to create the
# text data. See add_item for examples of how this works.
def proximity_array_for_content( doc, &block )
  return [] if needs_rebuild?

  content_node = node_for_content( doc, &block )
  scored = @items.keys.collect do |item|
    item_vector = @items[item].search_vector
    next if item_vector.blank? # not enough data for this item
    score =
      if $GSL
        content_node.search_vector * item_vector.col
      else
        (Matrix[content_node.search_vector] * item_vector)[0]
      end
    [item, score]
  end
  scored.compact.sort_by { |pair| pair[1] }.reverse
end
193
+
194
# Similar to proximity_array_for_content, this function takes similar
# arguments and returns a similar array. However, it uses the normalized
# calculated vectors instead of their full versions. This is useful when
# you're trying to perform operations on content that is much smaller than
# the text you're working with. search uses this primitive.
def proximity_norms_for_content( doc, &block )
  return [] if needs_rebuild?

  content_node = node_for_content( doc, &block )
  scored = @items.keys.collect do |item|
    item_norm = @items[item].search_norm
    next if item_norm.blank? # not enough data for this item
    score =
      if $GSL
        content_node.search_norm * item_norm.col
      else
        (Matrix[content_node.search_norm] * item_norm)[0]
      end
    [item, score]
  end
  scored.compact.sort_by { |pair| pair[1] }.reverse
end
215
+
216
# This function allows for text-based search of your index. Unlike other
# functions like find_related and classify, search only takes short
# strings. It will also ignore factors like repeated words. It is best for
# short, google-like search terms. A search will first prioritize lexical
# relationships, then semantic ones.
#
# While this may seem backwards compared to the other functions that LSI
# supports, it is actually the same algorithm, just applied on a smaller
# document.
def search( string, max_nearest=3 )
  return [] if needs_rebuild?
  matches = proximity_norms_for_content( string ).map { |pair| pair[0] }
  matches[0..max_nearest-1]
end
229
+
230
# This function takes content and finds other documents that are
# semantically "close", returning an array of documents sorted from most
# to least relevant. max_nearest specifies the number of documents to
# return; a value of 0 means all indexed documents are returned, sorted by
# relevance.
#
# This is particularly useful for identifying clusters in your document
# space. For example you may want to identify several "What's Related"
# items for weblog articles, or find paragraphs that relate to each other
# in an essay.
def find_related( doc, max_nearest=3, &block )
  neighbours = proximity_array_for_content( doc, &block )
  neighbours = neighbours.reject { |item, _score| item == doc }
  neighbours.map { |item, _score| item }[0..max_nearest-1]
end
245
+
246
# This function uses a voting system to categorize documents, based on
# the categories of other documents. It uses the same logic as the
# find_related function to find related documents, then returns the
# most obvious category from this list.
#
# cutoff signifies the fraction of documents to consider when classifying
# text. A cutoff of 1 means that every document in the index votes on
# what category the document is in. This may not always make sense.
#
def classify( doc, cutoff=0.30, &block )
  icutoff = (@items.size * cutoff).round
  nearest = proximity_array_for_content( doc, &block )[0..icutoff-1]

  # Each related document votes for its categories, weighted by its score.
  votes = Hash.new(0.0)
  nearest.each do |item, score|
    @items[item].categories.each { |category| votes[category] += score }
  end

  ranking = votes.keys.sort_by { |category| votes[category] }
  ranking.last # nil when nothing voted
end
271
+
272
# Same as #classify but returns every category whose vote total reaches
# the threshold (sorted by descending score); also uses a more permissive
# default cut-off.
def classify_multiple( doc, cutoff=0.50, &block )
  icutoff = (@items.size * cutoff).round
  nearest = proximity_array_for_content( doc, &block )[0..icutoff-1]

  votes = Hash.new(0.0)
  nearest.each do |item, score|
    @items[item].categories.each { |category| votes[category] += score }
  end

  # Drop weakly-supported categories, strongest first in the result.
  strong = votes.reject { |_category, score| score < 1 }
  strong.keys.sort_by { |category| -votes[category] }
end
287
+
288
# Prototype: reports the +count+ most heavily weighted stems of an
# already-indexed document. Raises for content that has not been indexed.
def highest_ranked_stems( doc, count=3 )
  raise "Requested stem ranking on non-indexed content!" unless @items[doc]
  weights = node_for_content(doc).lsi_vector.to_a
  top = weights.sort.reverse[0..count-1]
  # NOTE(review): Array#index returns the first matching position, so
  # duplicate weights map to the same stem — presumably acceptable for a
  # prototype.
  top.collect { |weight| @word_list.word_for_index(weights.index(weight)) }
end
297
+
298
private
# Performs the LSI rank reduction on a term-document matrix. The matrix
# is factored with SVD; singular values below the cutoff-th percentile
# are zeroed (discarding weak "semantic noise" dimensions), and the
# matrix is reconstructed at the reduced rank. Works for both the GSL
# and the pure-Ruby Matrix representation ($GSL selects which diag is
# used for reconstruction).
def build_reduced_matrix( matrix, cutoff=0.75 )
  # TODO: Check that M>=N on these dimensions! Transpose helps assure this
  u, v, s = matrix.SV_decomp

  # TODO: Better than 75% term, please. :\
  # Keep the (s.size * cutoff) largest singular values; zero the rest.
  s_cutoff = s.sort.reverse[(s.size * cutoff).round - 1]
  s.size.times do |ord|
    s[ord] = 0.0 if s[ord] < s_cutoff
  end
  # Reconstruct the term document matrix, only with reduced rank
  u * ($GSL ? GSL::Matrix : Matrix).diag( s ) * v.trans
end
311
+
312
# Looks up the ContentNode for +item+, or builds a transient (unindexed)
# node from its text when the item is not in the index. The optional
# block extracts the text, exactly as in #add_item. The transient node
# gets its raw/norm vectors only when the index is current.
def node_for_content(item, &block)
  return @items[item] if @items[item]

  source_text = block ? block.call(item) : item.to_s
  cn = ContentNode.new(clean_word_hash(source_text), &block)

  # make the lsi raw and norm vectors, but only against a current index
  cn.raw_vector_with( @word_list ) unless needs_rebuild?

  return cn
end
327
+
328
# Rebuilds @word_list from the vocabulary of every indexed node.
def make_word_list
  fresh_list = WordList.new
  @items.values.each do |node|
    node.word_hash.keys.each { |word| fresh_list.add_word word }
  end
  @word_list = fresh_list
end
334
+
335
+ end
336
+ end
337
+
data/lib/classifier.rb ADDED
@@ -0,0 +1,32 @@
1
+ #--
2
+ # Copyright (c) 2005 Lucas Carlson
3
+ #
4
+ # Permission is hereby granted, free of charge, to any person obtaining
5
+ # a copy of this software and associated documentation files (the
6
+ # "Software"), to deal in the Software without restriction, including
7
+ # without limitation the rights to use, copy, modify, merge, publish,
8
+ # distribute, sublicense, and/or sell copies of the Software, and to
9
+ # permit persons to whom the Software is furnished to do so, subject to
10
+ # the following conditions:
11
+ #
12
+ # The above copyright notice and this permission notice shall be
13
+ # included in all copies or substantial portions of the Software.
14
+ #
15
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16
+ # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17
+ # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
18
+ # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
19
+ # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
20
+ # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
21
+ # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
22
+ #++
23
+ # Author:: Lucas Carlson (mailto:lucas@rufy.com)
24
+ # Copyright:: Copyright (c) 2005 Lucas Carlson
25
+ # License:: LGPL
26
+
27
+ require 'rubygems'
28
+ require 'activesupport'
29
+ require 'lingua/stemmer'
30
+ require 'classifier/base'
31
+ require 'classifier/bayes'
32
+ require 'classifier/lsi'
data/lib/init.rb ADDED
@@ -0,0 +1 @@
1
+ require 'classifier'
data/test/base_test.rb ADDED
@@ -0,0 +1,17 @@
1
require File.dirname(__FILE__) + '/test_helper'

# Exercises the word-hashing helpers exposed by Classifier::Base.
class HelpersTest < Test::Unit::TestCase

  # word_hash keeps punctuation tokens and common words alongside stems.
  def test_word_hash
    c = Classifier::Base.new
    hash = {:good=>1, :"!"=>1, :hope=>1, :"'"=>1, :"."=>1, :love=>1, :word=>1, :them=>1, :test=>1}
    assert_equal hash, c.word_hash("here are some good words of test's. I hope you love them!")
  end


  # clean_word_hash strips punctuation and stop words, keeping stems only.
  def test_clean_word_hash
    c = Classifier::Base.new
    hash = {:good=>1, :word=>1, :hope=>1, :love=>1, :them=>1, :test=>1}
    assert_equal hash, c.clean_word_hash("here are some good words of test's. I hope you love them!")
  end

end
@@ -0,0 +1,52 @@
1
# coding:utf-8
# $KCODE = 'utf8'

require File.dirname(__FILE__) + '/../test_helper'

# Exercises the Bayes classifier: training, category management,
# classification, and non-ASCII (Russian) language support.
class BayesianTest < Test::Unit::TestCase
  def setup
    @classifier = Classifier::Bayes.new :categories => ['Interesting', 'Uninteresting']
  end

  # train_<category> ghost methods work for declared categories.
  def test_good_training
    assert_nothing_raised { @classifier.train_interesting "love" }
  end

  # Training an undeclared category raises StandardError.
  def test_bad_training
    assert_raise(StandardError) { @classifier.train_no_category "words" }
  end

  # Ghost methods outside the train_/untrain_ pattern still raise
  # NoMethodError.
  def test_bad_method
    assert_raise(NoMethodError) { @classifier.forget_everything_you_know "" }
  end

  def test_categories
    assert_equal ['Interesting', 'Uninteresting'].sort, @classifier.categories.sort
  end

  def test_add_category
    @classifier.add_category 'Test'
    assert_equal ['Test', 'Interesting', 'Uninteresting'].sort, @classifier.categories.sort
  end

  def test_classification
    @classifier.train_interesting "here are some good words. I hope you love them"
    @classifier.train_uninteresting "here are some bad words, I hate you"
    assert_equal 'Uninteresting', @classifier.classify("I hate bad words and you")
  end

  # Russian stemming/classification via the :language option.
  def test_ru_classification
    c = Classifier::Bayes.new :categories => ['Interesting', 'Uninteresting'], :language => "ru"
    c.train_interesting "вот несколько хороших слов. Я надеюсь вам они понравились"
    c.train_uninteresting "вот несколько плохих слов. Я тебя ненавижу"
    assert_equal 'Uninteresting', c.classify("Я ненавижу плохие слова и тебя")
  end

  # Classification scores must be case-insensitive, including Cyrillic.
  def test_case_insensitive
    c = Classifier::Bayes.new :categories => [:good, :bad], :language => "ru"
    c.train_good "Хорошо"
    c.train_bad "Плохо"

    assert_equal c.classifications("ХОРОШО"), c.classifications("хорошо")
    assert_equal c.classifications("плОХО"), c.classifications("плохо")
  end
end
@@ -0,0 +1,167 @@
1
require File.dirname(__FILE__) + '/../test_helper'

# Exercises the LSI engine: indexing, classification (single and
# multiple), search, serialization and summarization.
class LSITest < Test::Unit::TestCase
  def setup
    # we repeat principle words to help weight them.
    # This test is rather delicate, since this system is mostly noise.
    @str1 = "This text deals with dogs. Dogs."
    @str2 = "This text involves dogs too. Dogs! "
    @str3 = "This text revolves around cats. Cats."
    @str4 = "This text also involves cats. Cats!"
    @str5 = "This text involves birds. Birds."
    @str6 = "Is it about dogs or birds?"
    @str7 = "Is it about birds or cats?"
    @str8 = "I would prefer a bird over thousand cats or dogs because birds are smaller."
  end

  # Auto-rebuild keeps the index current as items are appended with <<.
  def test_basic_indexing
    lsi = Classifier::LSI.new
    [@str1, @str2, @str3, @str4, @str5].each { |x| lsi << x }
    assert ! lsi.needs_rebuild?

    # note that the closest match to str1 is str2, even though it is not
    # the closest text match.
    assert_equal [@str2, @str5, @str3], lsi.find_related(@str1, 3)
  end

  # With :auto_rebuild => false, the index is stale until build_index.
  def test_not_auto_rebuild
    lsi = Classifier::LSI.new :auto_rebuild => false
    lsi.add_item @str1, "Dog"
    lsi.add_item @str2, "Dog"
    assert lsi.needs_rebuild?
    lsi.build_index
    assert ! lsi.needs_rebuild?
  end

  # A single-item index cannot classify; it returns nil / [].
  def test_basic_categorizing_with_too_small_dataset
    lsi = Classifier::LSI.new
    lsi.add_item @str2, "Dog"

    assert_equal nil, lsi.classify( @str1 )
    assert_equal [], lsi.classify_multiple( @str3 )
  end

  def test_basic_categorizing
    lsi = Classifier::LSI.new
    lsi.add_item @str2, "Dog"
    lsi.add_item @str3, "Cat"
    lsi.add_item @str4, "Cat"
    lsi.add_item @str5, "Bird"

    assert_equal "Dog", lsi.classify( @str1 )
    assert_equal "Cat", lsi.classify( @str3 )
    assert_equal "Bird", lsi.classify( @str5 )
    assert_equal "Dog", lsi.classify( @str6 )
    assert_equal "Bird", lsi.classify( @str7 )
    assert_equal "Bird", lsi.classify( @str8 )
  end

  def test_multiple_categorizing
    lsi = Classifier::LSI.new
    lsi.add_item @str1, "Dog"
    lsi.add_item @str2, "Dog"
    lsi.add_item @str3, "Cat"
    lsi.add_item @str4, "Cat"
    lsi.add_item @str5, "Bird"

    assert_equal ["Dog", "Bird"], lsi.classify_multiple( @str6 )
    assert_equal ["Cat", "Bird"], lsi.classify_multiple( @str7 )
    assert_equal ["Bird"], lsi.classify_multiple( @str8 )
  end

  # Items may carry several categories; classification still works.
  def test_multiple_categorizing_reverse
    lsi = Classifier::LSI.new
    lsi.add_item @str1, "Dog"
    lsi.add_item @str3, "Cat"
    lsi.add_item @str4, "Cat"
    lsi.add_item @str6, "Dog", "Bird", "Flying"
    lsi.add_item @str7, "Cat", "Bird"
    lsi.add_item @str8, "Bird", "Dog", "Cat"

    assert_equal ["Dog"], lsi.classify_multiple( @str2 )
    assert_equal ["Cat", "Bird"], lsi.classify_multiple( @str5 )

    # test with a word unknown alone
    assert_equal "Bird", lsi.classify( "Bird!" )
    assert_equal ["Bird", "Dog", "Cat"], lsi.classify_multiple( "Bird!" )
  end

  # LSI captures semantics that a plain Bayes classifier misses.
  def test_external_classifying
    lsi = Classifier::LSI.new
    bayes = Classifier::Bayes.new :categories => ['Dog', 'Cat', 'Bird']
    lsi.add_item @str1, "Dog" ; bayes.train_dog @str1
    lsi.add_item @str2, "Dog" ; bayes.train_dog @str2
    lsi.add_item @str3, "Cat" ; bayes.train_cat @str3
    lsi.add_item @str4, "Cat" ; bayes.train_cat @str4
    lsi.add_item @str5, "Bird" ; bayes.train_bird @str5

    # We're talking about dogs. Even though the text matches the corpus on
    # cats better. Dogs have more semantic weight than cats. So bayes
    # will fail here, but the LSI recognizes content.
    tricky_case = "This text revolves around dogs."
    assert_equal "Dog", lsi.classify( tricky_case )
    assert_not_equal "Dog", bayes.classify( tricky_case )
  end

  # categories_for returns a live array; mutating it re-categorizes
  # without invalidating the index.
  def test_recategorize_interface
    lsi = Classifier::LSI.new
    lsi.add_item @str1, "Dog"
    lsi.add_item @str2, "Dog"
    lsi.add_item @str3, "Cat"
    lsi.add_item @str4, "Cat"
    lsi.add_item @str5, "Bird"

    tricky_case = "This text revolves around dogs."
    assert_equal "Dog", lsi.classify( tricky_case )

    # Recategorize as needed.
    lsi.categories_for(@str1).clear.push "Cow"
    lsi.categories_for(@str2).clear.push "Cow"

    assert !lsi.needs_rebuild?
    assert_equal "Cow", lsi.classify( tricky_case )
  end

  def test_search
    lsi = Classifier::LSI.new
    [@str1, @str2, @str3, @str4, @str5].each { |x| lsi << x }

    # Searching by content and text, note that @str2 comes up first, because
    # both "dog" and "involve" are present. But, the next match is @str1 instead
    # of @str4, because "dog" carries more weight than involves.
    assert_equal( [@str2, @str1, @str4, @str5, @str3],
                  lsi.search("dog involves", 100) )

    # Keyword search shows how the space is mapped out in relation to
    # dog when magnitude is remove. Note the relations. We move from dog
    # through involve and then finally to other words.
    assert_equal( [@str1, @str2, @str4, @str5, @str3],
                  lsi.search("dog", 5) )
  end

  # A Marshal round-trip must preserve search and relatedness results.
  def test_serialize_safe
    lsi = Classifier::LSI.new
    [@str1, @str2, @str3, @str4, @str5].each { |x| lsi << x }

    lsi_md = Marshal.dump lsi
    lsi_m = Marshal.load lsi_md

    assert_equal lsi_m.search("cat", 3), lsi.search("cat", 3)
    assert_equal lsi_m.find_related(@str1, 3), lsi.find_related(@str1, 3)
  end

  def test_keyword_search
    lsi = Classifier::LSI.new
    lsi.add_item @str1, "Dog"
    lsi.add_item @str2, "Dog"
    lsi.add_item @str3, "Cat"
    lsi.add_item @str4, "Cat"
    lsi.add_item @str5, "Bird"

    assert_equal [:dog, :text, :deal], lsi.highest_ranked_stems(@str1)
  end

  # String#summary comes from classifier/lsi/summary.
  def test_summary
    assert_equal "This text involves dogs too [...] This text also involves cats", [@str1, @str2, @str3, @str4, @str5].join.summary(2)
  end

end
@@ -0,0 +1,4 @@
1
+ $:.unshift(File.dirname(__FILE__) + '/../lib')
2
+
3
+ require 'test/unit'
4
+ require 'classifier'