RubyGems - noctivityinc-classifier191 - Versions diffs - 1.3.5 - Mend

noctivityinc-classifier191 1.3.5

Files changed (20) hide show

data/LICENSE +429 -0
data/Manifest +19 -0
data/README +86 -0
data/Rakefile +15 -0
data/classifier.gemspec +38 -0
data/lib/classifier/base.rb +306 -0
data/lib/classifier/bayes.rb +134 -0
data/lib/classifier/extensions/vector.rb +100 -0
data/lib/classifier/extensions/vector_serialize.rb +20 -0
data/lib/classifier/lsi/content_node.rb +73 -0
data/lib/classifier/lsi/summary.rb +31 -0
data/lib/classifier/lsi/word_list.rb +36 -0
data/lib/classifier/lsi.rb +337 -0
data/lib/classifier.rb +32 -0
data/lib/init.rb +1 -0
data/test/base_test.rb +17 -0
data/test/bayes/bayesian_test.rb +52 -0
data/test/lsi/lsi_test.rb +167 -0
data/test/test_helper.rb +4 -0
metadata +111 -0

data/lib/classifier/lsi.rb ADDED Viewed

@@ -0,0 +1,337 @@
+# Author::    David Fayram  (mailto:dfayram@lensmen.net)
+# Copyright:: Copyright (c) 2005 David Fayram II
+# License::   LGPL
+begin
+   raise LoadError if ENV['NATIVE_VECTOR'] == "true" # to test the native vector class, try `rake test NATIVE_VECTOR=true`
+   require 'gsl' # requires http://rb-gsl.rubyforge.org/
+   require 'classifier/extensions/vector_serialize'
+   $GSL = true
+rescue LoadError
+	warn "Notice: for 10x faster LSI support, please install http://rb-gsl.rubyforge.org/"
+	require 'classifier/extensions/vector'
+end
+require 'classifier/lsi/word_list'
+require 'classifier/lsi/content_node'
+require 'classifier/lsi/summary'
+module Classifier
+  # This class implements a Latent Semantic Indexer, which can search, classify and cluster
+  # data based on underlying semantic relations. For more information on the algorithms used,
+  # please consult Wikipedia[http://en.wikipedia.org/wiki/Latent_Semantic_Indexing].
+  class LSI < Classifier::Base
+    attr_reader :word_list
+    attr_accessor :auto_rebuild
+    # Create a fresh index.
+    # If you want to call #build_index manually, use
+    #      Classifier::LSI.new :auto_rebuild => false
+    #
+    def initialize(options = {})
+      @auto_rebuild = true unless options[:auto_rebuild] == false
+      @word_list, @items = WordList.new, {}
+      @version, @built_at_version = 0, -1
+      super
+    end
+    # Returns true if the index needs to be rebuilt.  The index needs
+    # to be built after all information is added, but before you start
+    # using it for search, classification and cluster detection.
+    def needs_rebuild?
+      (@items.keys.size > 1) && (@version != @built_at_version)
+    end
+    # Adds an item to the index. item is assumed to be a string, but
+    # any item may be indexed so long as it responds to #to_s or if
+    # you provide an optional block explaining how the indexer can
+    # fetch fresh string data. This optional block is passed the item,
+    # so the item may only be a reference to a URL or file name.
+    #
+    # For example:
+    #   lsi = Classifier::LSI.new
+    #   lsi.add_item "This is just plain text"
+    #   lsi.add_item "/home/me/filename.txt" { |x| File.read x }
+    #   ar = ActiveRecordObject.find( :all )
+    #   lsi.add_item ar, *ar.categories { |x| ar.content }
+    #
+    def add_item( item, *categories, &block )
+      clean_word_hash = block ? clean_word_hash(block.call(item)) : clean_word_hash(item.to_s)
+      @items[item] = ContentNode.new(clean_word_hash, *categories)
+      @version += 1
+      build_index if @auto_rebuild
+    end
+    # A less flexible shorthand for add_item that assumes
+    # you are passing in a string with no categories. item
+    # will be duck typed via to_s .
+    #
+    def <<( item )
+      add_item item
+    end
+    # Returns the categories for a given indexed item. You are free to add and remove
+    # items from this as you see fit. Changing an item's categories does not invalidate the index.
+    def categories_for(item)
+      return [] unless @items[item]
+      return @items[item].categories
+    end
+    # Removes an item from the database, if it is indexed.
+    #
+    def remove_item( item )
+      if @items.keys.contain? item
+        @items.remove item
+        @version += 1
+      end
+    end
+    # Returns an array of items that are indexed.
+    def items
+      # The keys of @items are the objects that were passed to add_item.
+      @items.keys
+    end
+    # Returns the categories for a given indexed item. You are free to add and remove
+    # items from this as you see fit. Changing an item's categories does not invalidate the index.
+    def categories_for(item)
+      # NOTE(review): this repeats an identical categories_for defined earlier in
+      # this class; being later in the file, this definition replaces that one.
+      # Items that are not indexed yield an empty array.
+      return [] unless @items[item]
+      return @items[item].categories
+    end
+    # This function rebuilds the index if needs_rebuild? returns true.
+    # For very large document spaces, this indexing operation may take some
+    # time to complete, so it may be wise to place the operation in another
+    # thread.
+    #
+    # As a rule, indexing will be fairly swift on modern machines until
+    # you have well over 500 documents indexed, or have an incredibly diverse
+    # vocabulary for your documents.
+    #
+    # The optional parameter "cutoff" is a tuning parameter. When the index is
+    # built, a certain number of s-values are discarded from the system. The
+    # cutoff parameter tells the indexer how many of these values to keep.
+    # A value of 1 for cutoff means that no semantic analysis will take place,
+    # turning the LSI class into a simple vector search engine.
+    def build_index( cutoff=0.75 )
+      # Skip the work when needs_rebuild? reports the index as current.
+      return unless needs_rebuild?
+      make_word_list
+      doc_list = @items.values
+      # One raw vector per document, built against the fresh word list.
+      tda = doc_list.collect { |node| node.raw_vector_with( @word_list ) }
+      if $GSL
+         # Rows of tda are documents; after the transpose each column is a document.
+         tdm = GSL::Matrix.alloc(*tda).trans
+         ntdm = build_reduced_matrix(tdm, cutoff)
+         ntdm.size[1].times do |col|
+           vec = GSL::Vector.alloc( ntdm.column(col) ).row
+           doc_list[col].lsi_vector = vec
+           doc_list[col].lsi_norm = vec.normalize
+         end
+      else
+         tdm = Matrix.rows(tda).trans
+         ntdm = build_reduced_matrix(tdm, cutoff)
+         # NOTE(review): this loop runs row_size times but indexes columns and
+         # documents with the counter; the `if doc_list[col]` guards skip indices
+         # beyond the document list. Presumably column_size was intended -- confirm
+         # against the shape returned by build_reduced_matrix before changing.
+         ntdm.row_size.times do |col|
+           doc_list[col].lsi_vector = ntdm.column(col) if doc_list[col]
+           doc_list[col].lsi_norm = ntdm.column(col).normalize  if doc_list[col]
+         end
+      end
+      # Record that the index now reflects the current version of the item set.
+      @built_at_version = @version
+    end
+    # This method returns max_chunks entries, ordered by their average semantic rating.
+    # Essentially, the average distance of each entry from all other entries is calculated,
+    # the highest are returned.
+    #
+    # This can be used to build a summary service, or to provide more information about
+    # your dataset's general content. For example, if you were to use categorize on the
+    # results of this data, you could gather information on what your dataset is generally
+    # about.
+    def highest_relative_content( max_chunks=10 )
+       return [] if needs_rebuild?
+       avg_density = Hash.new
+       @items.each_key { |x| avg_density[x] = proximity_array_for_content(x).inject(0.0) { |x,y| x + y[1]} }
+       avg_density.keys.sort_by { |x| avg_density[x] }.reverse[0..max_chunks-1].map
+    end
+    # This function is the primitive that find_related and classify
+    # build upon. It returns an array of 2-element arrays. The first element
+    # of this array is a document, and the second is its "score", defining
+    # how "close" it is to other indexed items.
+    #
+    # These values are somewhat arbitrary, having to do with the vector space
+    # created by your content, so the magnitude is interpretable but not always
+    # meaningful between indexes.
+    #
+    # The parameter doc is the content to compare. If that content is not
+    # indexed, you can pass an optional block to define how to create the
+    # text data. See add_item for examples of how this works.
+    def proximity_array_for_content( doc, &block )
+      return [] if needs_rebuild?
+      content_node = node_for_content( doc, &block )
+      result =
+        @items.keys.collect do |item|
+          next if @items[item].search_vector.blank? # not enough data
+          if $GSL
+             val = content_node.search_vector * @items[item].search_vector.col
+          else
+             val = (Matrix[content_node.search_vector] * @items[item].search_vector)[0]
+          end
+          [item, val]
+        end
+      result.compact.sort_by { |x| x[1] }.reverse
+    end
+    # Similar to proximity_array_for_content, this function takes similar
+    # arguments and returns a similar array. However, it uses the normalized
+    # calculated vectors instead of their full versions. This is useful when
+    # you're trying to perform operations on content that is much smaller than
+    # the text you're working with. search uses this primitive.
+    def proximity_norms_for_content( doc, &block )
+      return [] if needs_rebuild?
+      content_node = node_for_content( doc, &block )
+      result =
+        @items.keys.collect do |item|
+          next if @items[item].search_norm.blank? # not enough data
+          if $GSL
+            val = content_node.search_norm * @items[item].search_norm.col
+          else
+            val = (Matrix[content_node.search_norm] * @items[item].search_norm)[0]
+          end
+          [item, val]
+        end
+      result.compact.sort_by { |x| x[1] }.reverse
+    end
+    # This function allows for text-based search of your index. Unlike other functions
+    # like find_related and classify, search only takes short strings. It will also ignore
+    # factors like repeated words. It is best for short, google-like search terms.
+    # A search will first prioritize lexical relationships, then semantic ones.
+    #
+    # While this may seem backwards compared to the other functions that LSI supports,
+    # it is actually the same algorithm, just applied on a smaller document.
+    def search( string, max_nearest=3 )
+      return [] if needs_rebuild?
+      carry = proximity_norms_for_content( string )
+      result = carry.collect { |x| x[0] }
+      return result[0..max_nearest-1]
+    end
+    # This function takes content and finds other documents
+    # that are semantically "close", returning an array of documents sorted
+    # from most to least relevant.
+    # max_nearest specifies the number of documents to return. A value of
+    # 0 means that it returns all the indexed documents, sorted by relevance.
+    #
+    # This is particularly useful for identifying clusters in your document space.
+    # For example you may want to identify several "What's Related" items for weblog
+    # articles, or find paragraphs that relate to each other in an essay.
+    def find_related( doc, max_nearest=3, &block )
+      carry =
+        proximity_array_for_content( doc, &block ).reject { |pair| pair[0] == doc }
+      result = carry.collect { |x| x[0] }
+      return result[0..max_nearest-1]
+    end
+    # This function uses a voting system to categorize documents, based on
+    # the categories of other documents. It uses the same logic as the
+    # find_related function to find related documents, then returns the
+    # most obvious category from this list.
+    #
+    # cutoff signifies the number of documents to consider when classifying
+    # text. A cutoff of 1 means that every document in the index votes on
+    # what category the document is in. This may not always make sense.
+    #
+    def classify( doc, cutoff=0.30, &block )
+      icutoff = (@items.size * cutoff).round
+      carry = proximity_array_for_content( doc, &block )
+      carry = carry[0..icutoff-1]
+      votes = {}
+      carry.each do |pair|
+        categories = @items[pair[0]].categories
+        categories.each do |category|
+          votes[category] ||= 0.0
+          votes[category] += pair[1]
+        end
+      end
+      ranking = votes.keys.sort_by { |x| votes[x] }
+      return ranking[-1]
+    end
+    # Same as previous but returns all results, also more permissive in default cut-off
+    def classify_multiple( doc, cutoff=0.50, &block )
+      icutoff = (@items.size * cutoff).round
+      carry = proximity_array_for_content( doc, &block )
+      carry = carry[0..icutoff-1]
+      votes = {}
+      carry.each do |pair|
+        categories = @items[pair[0]].categories
+        categories.each do |category|
+          votes[category] ||= 0.0
+          votes[category] += pair[1]
+        end
+      end
+      votes.delete_if{|key, value| value<1 }.keys.sort_by { |x| -votes[x] }
+    end
+    # Prototype, only works on indexed documents.
+    # I have no clue if this is going to work, but in theory
+    # it's supposed to.
+    def highest_ranked_stems( doc, count=3 )
+      raise "Requested stem ranking on non-indexed content!" unless @items[doc]
+      arr = node_for_content(doc).lsi_vector.to_a
+      top_n = arr.sort.reverse[0..count-1]
+      return top_n.collect { |x| @word_list.word_for_index(arr.index(x))}
+    end
+    private
+    # Decomposes +matrix+ with SV_decomp, zeroes the smaller singular values
+    # and multiplies the pieces back together. +cutoff+ is the fraction of
+    # singular values (largest first) that are kept.
+    def build_reduced_matrix( matrix, cutoff=0.75 )
+      # TODO: Check that M>=N on these dimensions! Transpose helps assure this
+      u, v, s = matrix.SV_decomp
+      # TODO: Better than 75% term, please. :\
+      # The threshold is the value found at the cutoff position of the
+      # descending-sorted singular values.
+      s_cutoff = s.sort.reverse[(s.size * cutoff).round - 1]
+      # Zero every singular value below that threshold.
+      s.size.times do |ord|
+        s[ord] = 0.0 if s[ord] < s_cutoff
+      end
+      # Reconstruct the term document matrix, only with reduced rank
+      u * ($GSL ? GSL::Matrix : Matrix).diag( s ) * v.trans
+    end
+    def node_for_content(item, &block)
+      if @items[item]
+        return @items[item]
+      else
+        clean_word_hash = block ? clean_word_hash(block.call(item)) : clean_word_hash(item.to_s)
+        cn = ContentNode.new(clean_word_hash, &block) # make the node and extract the data
+        unless needs_rebuild?
+          cn.raw_vector_with( @word_list ) # make the lsi raw and norm vectors
+        end
+      end
+      return cn
+    end
+    def make_word_list
+      @word_list = WordList.new
+      @items.each_value do |node|
+        node.word_hash.each_key { |key| @word_list.add_word key }
+      end
+    end
+  end
+end

data/lib/classifier.rb ADDED Viewed

@@ -0,0 +1,32 @@
+#--
+# Copyright (c) 2005 Lucas Carlson
+#
+# Permission is hereby granted, free of charge, to any person obtaining
+# a copy of this software and associated documentation files (the
+# "Software"), to deal in the Software without restriction, including
+# without limitation the rights to use, copy, modify, merge, publish,
+# distribute, sublicense, and/or sell copies of the Software, and to
+# permit persons to whom the Software is furnished to do so, subject to
+# the following conditions:
+#
+# The above copyright notice and this permission notice shall be
+# included in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+#++
+# Author::    Lucas Carlson  (mailto:lucas@rufy.com)
+# Copyright:: Copyright (c) 2005 Lucas Carlson
+# License::   LGPL
+require 'rubygems'
+require 'activesupport'
+require 'lingua/stemmer'
+require 'classifier/base'
+require 'classifier/bayes'
+require 'classifier/lsi'

data/lib/init.rb ADDED Viewed

	@@ -0,0 +1 @@
1	+ require 'classifier'

data/test/base_test.rb ADDED Viewed

@@ -0,0 +1,17 @@
+require File.dirname(__FILE__) + '/test_helper'
+class HelpersTest < Test::Unit::TestCase
+	def test_word_hash
+	  c = Classifier::Base.new
+		hash = {:good=>1, :"!"=>1, :hope=>1, :"'"=>1, :"."=>1, :love=>1, :word=>1, :them=>1, :test=>1}
+		assert_equal hash, c.word_hash("here are some good words of test's. I hope you love them!")
+	end
+	def test_clean_word_hash
+	  c = Classifier::Base.new
+	  hash = {:good=>1, :word=>1, :hope=>1, :love=>1, :them=>1, :test=>1}
+	  assert_equal hash, c.clean_word_hash("here are some good words of test's. I hope you love them!")
+	end
+end

data/test/bayes/bayesian_test.rb ADDED Viewed

@@ -0,0 +1,52 @@
+# coding:utf-8
+# $KCODE = 'utf8'
+require File.dirname(__FILE__) + '/../test_helper'
+class BayesianTest < Test::Unit::TestCase
+	def setup
+		@classifier = Classifier::Bayes.new :categories => ['Interesting', 'Uninteresting']
+	end
+	def test_good_training
+		assert_nothing_raised { @classifier.train_interesting "love" }
+	end
+	def test_bad_training
+		assert_raise(StandardError) { @classifier.train_no_category "words" }
+	end
+	def test_bad_method
+		assert_raise(NoMethodError) { @classifier.forget_everything_you_know "" }
+	end
+	def test_categories
+		assert_equal ['Interesting', 'Uninteresting'].sort, @classifier.categories.sort
+	end
+	def test_add_category
+		@classifier.add_category 'Test'
+		assert_equal ['Test', 'Interesting', 'Uninteresting'].sort, @classifier.categories.sort
+	end
+	def test_classification
+		@classifier.train_interesting "here are some good words. I hope you love them"
+		@classifier.train_uninteresting "here are some bad words, I hate you"
+		assert_equal 'Uninteresting', @classifier.classify("I hate bad words and you")
+	end
+	def test_ru_classification
+	  c = Classifier::Bayes.new :categories => ['Interesting', 'Uninteresting'], :language => "ru"
+	  c.train_interesting "вот несколько хороших слов. Я надеюсь вам они понравились"
+	  c.train_uninteresting "вот несколько плохих слов. Я тебя ненавижу"
+	  assert_equal 'Uninteresting', c.classify("Я ненавижу плохие слова и тебя")
+  end
+  def test_case_insensitive
+	  c = Classifier::Bayes.new :categories => [:good, :bad], :language => "ru"
+	  c.train_good "Хорошо"
+	  c.train_bad "Плохо"
+	  assert_equal c.classifications("ХОРОШО"), c.classifications("хорошо")
+	  assert_equal c.classifications("плОХО"), c.classifications("плохо")
+  end
+end

data/test/lsi/lsi_test.rb ADDED Viewed

@@ -0,0 +1,167 @@
+require File.dirname(__FILE__) + '/../test_helper'
+class LSITest < Test::Unit::TestCase
+	def setup
+	  # we repeat principle words to help weight them.
+	  # This test is rather delicate, since this system is mostly noise.
+    @str1 = "This text deals with dogs. Dogs."
+	  @str2 = "This text involves dogs too. Dogs! "
+	  @str3 = "This text revolves around cats. Cats."
+	  @str4 = "This text also involves cats. Cats!"
+	  @str5 = "This text involves birds. Birds."
+	  @str6 = "Is it about dogs or birds?"
+	  @str7 = "Is it about birds or cats?"
+	  @str8 = "I would prefer a bird over thousand cats or dogs because birds are smaller."
+	end
+	def test_basic_indexing
+	 lsi = Classifier::LSI.new
+	  [@str1, @str2, @str3, @str4, @str5].each { |x| lsi << x }
+	  assert ! lsi.needs_rebuild?
+	 # note that the closest match to str1 is str2, even though it is not
+	 # the closest text match.
+	 assert_equal [@str2, @str5, @str3], lsi.find_related(@str1, 3)
+	end
+	def test_not_auto_rebuild
+	 lsi = Classifier::LSI.new :auto_rebuild => false
+	 lsi.add_item @str1, "Dog"
+	 lsi.add_item @str2, "Dog"
+	 assert lsi.needs_rebuild?
+	 lsi.build_index
+	 assert ! lsi.needs_rebuild?
+	end
+	def test_basic_categorizing_with_too_small_dataset
+	  lsi = Classifier::LSI.new
+	  lsi.add_item @str2, "Dog"
+	  assert_equal nil, lsi.classify( @str1 )
+	  assert_equal [], lsi.classify_multiple( @str3 )
+	end
+	def test_basic_categorizing
+	  lsi = Classifier::LSI.new
+	  lsi.add_item @str2, "Dog"
+	  lsi.add_item @str3, "Cat"
+	  lsi.add_item @str4, "Cat"
+	  lsi.add_item @str5, "Bird"
+	  assert_equal "Dog", lsi.classify( @str1 )
+	  assert_equal "Cat", lsi.classify( @str3 )
+    assert_equal "Bird", lsi.classify( @str5 )
+    assert_equal "Dog", lsi.classify( @str6 )
+    assert_equal "Bird", lsi.classify( @str7 )
+    assert_equal "Bird", lsi.classify( @str8 )
+	end
+  def test_multiple_categorizing
+    lsi = Classifier::LSI.new
+    lsi.add_item @str1, "Dog"
+    lsi.add_item @str2, "Dog"
+    lsi.add_item @str3, "Cat"
+    lsi.add_item @str4, "Cat"
+    lsi.add_item @str5, "Bird"
+    assert_equal ["Dog", "Bird"], lsi.classify_multiple( @str6 )
+    assert_equal ["Cat", "Bird"], lsi.classify_multiple( @str7 )
+    assert_equal ["Bird"], lsi.classify_multiple( @str8 )
+  end
+  def test_multiple_categorizing_reverse
+    lsi = Classifier::LSI.new
+    lsi.add_item @str1, "Dog"
+    lsi.add_item @str3, "Cat"
+    lsi.add_item @str4, "Cat"
+    lsi.add_item @str6, "Dog", "Bird", "Flying"
+    lsi.add_item @str7, "Cat", "Bird"
+    lsi.add_item @str8, "Bird", "Dog", "Cat"
+    assert_equal ["Dog"], lsi.classify_multiple( @str2 )
+    assert_equal ["Cat", "Bird"], lsi.classify_multiple( @str5 )
+    # test with a word unknown alone
+    assert_equal "Bird", lsi.classify( "Bird!" )
+    assert_equal ["Bird", "Dog", "Cat"], lsi.classify_multiple( "Bird!" )
+  end
+	def test_external_classifying
+	  lsi = Classifier::LSI.new
+	  bayes = Classifier::Bayes.new :categories => ['Dog', 'Cat', 'Bird']
+	  lsi.add_item @str1, "Dog" ; bayes.train_dog @str1
+	  lsi.add_item @str2, "Dog" ; bayes.train_dog @str2
+	  lsi.add_item @str3, "Cat" ; bayes.train_cat @str3
+	  lsi.add_item @str4, "Cat" ; bayes.train_cat @str4
+	  lsi.add_item @str5, "Bird" ; bayes.train_bird @str5
+	  # We're talking about dogs. Even though the text matches the corpus on
+	  # cats better.  Dogs have more semantic weight than cats. So bayes
+	  # will fail here, but the LSI recognizes content.
+	  tricky_case = "This text revolves around dogs."
+	  assert_equal "Dog", lsi.classify( tricky_case )
+	  assert_not_equal "Dog", bayes.classify( tricky_case )
+	end
+	def test_recategorize_interface
+	  lsi = Classifier::LSI.new
+	  lsi.add_item @str1, "Dog"
+	  lsi.add_item @str2, "Dog"
+	  lsi.add_item @str3, "Cat"
+	  lsi.add_item @str4, "Cat"
+	  lsi.add_item @str5, "Bird"
+	  tricky_case = "This text revolves around dogs."
+	  assert_equal "Dog", lsi.classify( tricky_case )
+	  # Recategorize as needed.
+	  lsi.categories_for(@str1).clear.push "Cow"
+	  lsi.categories_for(@str2).clear.push "Cow"
+	  assert !lsi.needs_rebuild?
+	  assert_equal "Cow", lsi.classify( tricky_case )
+	end
+	def test_search
+	  lsi = Classifier::LSI.new
+	  [@str1, @str2, @str3, @str4, @str5].each { |x| lsi << x }
+	  # Searching by content and text, note that @str2 comes up first, because
+	  # both "dog" and "involve" are present. But, the next match is @str1 instead
+	  # of @str4, because "dog" carries more weight than involves.
+	  assert_equal( [@str2, @str1, @str4, @str5, @str3],
+	                lsi.search("dog involves", 100) )
+	  # Keyword search shows how the space is mapped out in relation to
+	  # dog when magnitude is remove. Note the relations. We move from dog
+	  # through involve and then finally to other words.
+	  assert_equal( [@str1, @str2, @str4, @str5, @str3],
+	                lsi.search("dog", 5) )
+	end
+	def test_serialize_safe
+    lsi = Classifier::LSI.new
+	  [@str1, @str2, @str3, @str4, @str5].each { |x| lsi << x }
+	  lsi_md = Marshal.dump lsi
+	  lsi_m = Marshal.load lsi_md
+	  assert_equal lsi_m.search("cat", 3), lsi.search("cat", 3)
+	  assert_equal lsi_m.find_related(@str1, 3), lsi.find_related(@str1, 3)
+	end
+	def test_keyword_search
+	  lsi = Classifier::LSI.new
+	  lsi.add_item @str1, "Dog"
+	  lsi.add_item @str2, "Dog"
+	  lsi.add_item @str3, "Cat"
+	  lsi.add_item @str4, "Cat"
+	  lsi.add_item @str5, "Bird"
+	  assert_equal [:dog, :text, :deal], lsi.highest_ranked_stems(@str1)
+	end
+	def test_summary
+	   assert_equal "This text involves dogs too [...] This text also involves cats", [@str1, @str2, @str3, @str4, @str5].join.summary(2)
+	end
+end

data/test/test_helper.rb ADDED Viewed

@@ -0,0 +1,4 @@
+$:.unshift(File.dirname(__FILE__) + '/../lib')
+require 'test/unit'
+require 'classifier'