RubyGems - reclassifier - Versions diffs - 0.0.2 → 0.0.3 - Mend

reclassifier 0.0.2 → 0.0.3

Files changed (18) hide show

data/Rakefile +2 -5
data/lib/reclassifier/bayes.rb +88 -80
data/lib/reclassifier/unknown_classification_error.rb +2 -0
data/lib/reclassifier/version.rb +1 -1
data/lib/reclassifier.rb +5 -5
data/reclassifier.gemspec +1 -1
data/spec/bayes_spec.rb +97 -0
data/spec/core_ext/array_spec.rb +13 -0
data/spec/core_ext/string_spec.rb +19 -0
data/spec/lsi_spec.rb +123 -0
data/spec/spec_helper.rb +5 -0
metadata +14 -14
data/lib/reclassifier/core_ext/object.rb +0 -3
data/test/bayes_test.rb +0 -34
data/test/core_ext/array_test.rb +0 -15
data/test/core_ext/string_test.rb +0 -13
data/test/lsi_test.rb +0 -123
data/test/test_helper.rb +0 -4

data/Rakefile CHANGED Viewed

@@ -1,7 +1,4 @@
 require "bundler/gem_tasks"
-require 'rake/testtask'
+require 'rspec/core/rake_task'
-Rake::TestTask.new do |t|
-  t.libs << 'test'
-  t.test_files = FileList['test/**/*_test.rb']
-end
+RSpec::Core::RakeTask.new(:spec)

data/lib/reclassifier/bayes.rb CHANGED Viewed

@@ -1,129 +1,137 @@
+#
+# Bayesian classifier for arbitrary text.
+#
+# Implementation is translated from
+# Introduction to Information Retrieval by Christopher D. Manning, Prabhakar Raghavan and Hinrich Schütze,
+# Cambridge University Press. 2008, ISBN 0521865719.
+#
 module Reclassifier
   class Bayes
-    # The class can be created with one or more categories, each of which will be
-    # initialized and given a training method. E.g.,
-    #      b = Classifier::Bayes.new 'Interesting', 'Uninteresting', 'Spam'
-    def initialize(*categories)
-      @categories = Hash.new
-      categories.each { |category| @categories[category.prepare_category_name] = Hash.new }
-      @total_words = 0
-      @category_counts = Hash.new(0)
+    # Can be created with zero or more classifications, each of which will be
+    # initialized and given a training method.  The classifications are specified as
+    # symbols.  E.g.,
+    #      b = Reclassifier::Bayes.new :interesting, :uninteresting, :spam
+    def initialize(*classifications)
+      @classifications = {}
+      classifications.each {|classification| @classifications[classification] = {}}
+      @docs_in_classification_count = {}
     end
     #
-    # Provides a general training method for all categories specified in Bayes#new
+    # Provides a general training method for all classifications specified in Bayes#new
     # For example:
-    #     b = Classifier::Bayes.new 'This', 'That', 'the_other'
+    #     b = Reclassifier::Bayes.new :this, :that
     #     b.train :this, "This text"
-    #     b.train "that", "That text"
-    #     b.train "The other", "The other text"
-    def train(category, text)
-      category = category.prepare_category_name
-      @category_counts[category] += 1
+    #     b.train :that, "That text"
+    def train(classification, text)
+      ensure_classification_exists(classification)
+      @docs_in_classification_count[classification] ||= 0
+      @docs_in_classification_count[classification] += 1
       text.word_hash.each do |word, count|
-        @categories[category][word]     ||=     0
-        @categories[category][word]      +=     count
-        @total_words += count
+        @classifications[classification][word] ||= 0
+        @classifications[classification][word] += count
       end
     end
     #
-    # Provides a untraining method for all categories specified in Bayes#new
+    # Untrain a (classification, text) pair.
     # Be very careful with this method.
     #
     # For example:
-    #     b = Classifier::Bayes.new 'This', 'That', 'the_other'
+    #     b = Reclassifier::Bayes.new :this, :that, :the_other
     #     b.train :this, "This text"
     #     b.untrain :this, "This text"
-    def untrain(category, text)
-      category = category.prepare_category_name
-      @category_counts[category] -= 1
+    def untrain(classification, text)
+      ensure_classification_exists(classification)
+      @docs_in_classification_count[classification] -= 1
       text.word_hash.each do |word, count|
-        if @total_words >= 0
-          orig = @categories[category][word]
-          @categories[category][word]     ||=     0
-          @categories[category][word]      -=     count
-          if @categories[category][word] <= 0
-            @categories[category].delete(word)
-            count = orig
-          end
-          @total_words -= count
-        end
+        @classifications[classification][word] -= count if @classifications[classification].include?(word)
       end
     end
     #
-    # Returns the scores in each category the provided +text+. E.g.,
+    # Returns the scores of the specified text for each classification. E.g.,
     #    b.classifications "I hate bad words and you"
     #    =>  {"Uninteresting"=>-12.6997928013932, "Interesting"=>-18.4206807439524}
     # The largest of these scores (the one closest to 0) is the one picked out by #classify
-    def classifications(text)
-      score = Hash.new
-      training_count = @category_counts.values.inject { |x,y| x+y }.to_f
-      @categories.each do |category, category_words|
-        score[category.to_s] = 0
-        total = category_words.values.inject(0) {|sum, element| sum+element}
+    def calculate_scores(text)
+      scores = {}
+      @classifications.each do |classification, classification_word_counts|
+        # prior
+        scores[classification] = Math.log(@docs_in_classification_count[classification])
+        scores[classification] -= Math.log(@docs_in_classification_count.values.reduce(:+))
+        # likelihood
         text.word_hash.each do |word, count|
-          s = category_words.has_key?(word) ? category_words[word] : 0.1
-          score[category.to_s] += Math.log(s/total.to_f)
+          if @classifications.values.reduce(Set.new) {|set, word_counts| set.merge(word_counts.keys)}.include?(word)
+            scores[classification] += count * Math.log((classification_word_counts[word] || 0) + 1)
+            scores[classification] -= count * Math.log(classification_word_counts.values.reduce(:+) + @classifications.values.reduce(Set.new) {|set, word_counts| set.merge(word_counts.keys)}.count)
+          end
         end
-        # now add prior probability for the category
-        s = @category_counts.has_key?(category) ? @category_counts[category] : 0.1
-        score[category.to_s] += Math.log(s / training_count)
       end
-      return score
+      scores
     end
     #
-    # Returns the classification of the provided +text+, which is one of the
-    # categories given in the initializer. E.g.,
+    # Returns the classification of the specified text, which is one of the
+    # classifications given in the initializer. E.g.,
     #    b.classify "I hate bad words and you"
-    #    =>  'Uninteresting'
+    #    =>  :uninteresting
     def classify(text)
-      (classifications(text).sort_by { |a| -a[1] })[0][0]
+      calculate_scores(text).max_by {|classification| classification[1]}[0]
     end
     #
-    # Provides training and untraining methods for the categories specified in Bayes#new
+    # Provides a list of classification names
     # For example:
-    #     b = Classifier::Bayes.new 'This', 'That', 'the_other'
-    #     b.train_this "This text"
-    #     b.train_that "That text"
-    #     b.untrain_that "That text"
-    #     b.train_the_other "The other text"
-    def method_missing(name, *args)
-      category = name.to_s.gsub(/(un)?train_([\w]+)/, '\2').prepare_category_name
-      if @categories.has_key? category
-        args.each { |text| eval("#{$1}train(category, text)") }
-      elsif name.to_s =~ /(un)?train_([\w]+)/
-        raise StandardError, "No such category: #{category}"
-      else
-        super  #raise StandardError, "No such method: #{name}"
-      end
+    #     b.classifications
+    #     =>   [:this, :that, :the_other]
+    def classifications
+      @classifications.keys
     end
     #
-    # Provides a list of category names
+    # Adds the classification to the classifier.
+    # Has no effect if the classification already existed.
+    # Returns the classification.
     # For example:
-    #     b.categories
-    #     =>   ['This', 'That', 'the_other']
-    def categories # :nodoc:
-      @categories.keys.collect {|c| c.to_s}
+    #     b.add_classification(:not_spam)
+    def add_classification(classification)
+      @classifications[classification] ||= {}
+      classification
     end
     #
-    # Allows you to add categories to the classifier.
+    # Removes the classification from the classifier.
+    # Returns the classification if it existed, else nil.
     # For example:
-    #     b.add_category "Not spam"
-    #
-    # WARNING: Adding categories to a trained classifier will
-    # result in an undertrained category that will tend to match
-    # more criteria than the trained selective categories. In short,
-    # try to initialize your categories at initialization.
-    def add_category(category)
-      @categories[category.prepare_category_name] = Hash.new
+    #     b.remove_classification(:not_spam)
+    def remove_classification(classification)
+      return_value = if @classifications.include?(classification)
+                       classification
+                     else
+                       nil
+                     end
+      @classifications.delete(classification)
+      return_value
     end
-    alias append_category add_category
+    private
+      def ensure_classification_exists(classification)
+        raise Reclassifier::UnknownClassificationError unless @classifications.include?(classification)
+      end
   end
 end

data/lib/reclassifier/unknown_classification_error.rb ADDED Viewed

@@ -0,0 +1,2 @@
+class Reclassifier::UnknownClassificationError < StandardError
+end

data/lib/reclassifier/version.rb CHANGED Viewed

@@ -1,3 +1,3 @@
 module Reclassifier
-  VERSION = "0.0.2"
+  VERSION = "0.0.3"
 end

data/lib/reclassifier.rb CHANGED Viewed

@@ -7,13 +7,13 @@ require 'gsl'
 require 'reclassifier/version'
 require 'reclassifier/core_ext/array'
 require 'reclassifier/core_ext/matrix'
-require 'reclassifier/core_ext/object'
 require 'reclassifier/core_ext/string'
 require 'gsl/vector'
 module Reclassifier
-  autoload :Bayes,       'reclassifier/bayes'
-  autoload :LSI,         'reclassifier/lsi'
-  autoload :ContentNode, 'reclassifier/content_node'
-  autoload :WordList,    'reclassifier/word_list'
+  autoload :Bayes,                      'reclassifier/bayes'
+  autoload :LSI,                        'reclassifier/lsi'
+  autoload :ContentNode,                'reclassifier/content_node'
+  autoload :WordList,                   'reclassifier/word_list'
+  autoload :UnknownClassificationError, 'reclassifier/unknown_classification_error'
 end

data/reclassifier.gemspec CHANGED Viewed

@@ -20,7 +20,7 @@ Gem::Specification.new do |spec|
   spec.add_development_dependency 'bundler', '~> 1.3'
   spec.add_development_dependency 'rake'
-  spec.add_development_dependency 'test-unit'
+  spec.add_development_dependency 'rspec'
   spec.add_dependency 'fast-stemmer'
   spec.add_dependency 'gsl'

data/spec/bayes_spec.rb ADDED Viewed

@@ -0,0 +1,97 @@
+require 'spec_helper'
+describe Reclassifier::Bayes do
+	describe "classifications" do
+    it "should return the classifications" do
+      subject = described_class.new(:interesting, :uninteresting)
+      subject.classifications.sort.should eq([:interesting, :uninteresting])
+    end
+	end
+  describe "train" do
+    it "should raise an UnknownClassificationError if the specified classification hasn't been added" do
+      expect {subject.train(:blargle, '')}.to raise_error(Reclassifier::UnknownClassificationError)
+    end
+    it "should train the classifier to the (classification, document) pair" do
+      subject = described_class.new(:in_china, :not_in_china)
+      subject.train(:in_china, 'Chinese Beijing Chinese')
+      subject.train(:in_china, 'Chinese Chinese Shanghai')
+      subject.train(:in_china, 'Chinese Macao')
+      subject.train(:not_in_china, 'Tokyo Japan Chinese')
+      subject.classify('Chinese Chinese Chinese Tokyo Japan').should eq(:in_china)
+    end
+  end
+  describe "untrain" do
+    it "should raise an UnknownClassificationError if the specified classification hasn't been added" do
+      expect {subject.untrain(:blargle, '')}.to raise_error(Reclassifier::UnknownClassificationError)
+    end
+    it "should untrain the classifier against the (classification, document) pair" do
+      subject = described_class.new(:in_china, :not_in_china)
+      subject.train(:in_china, 'Chinese Chinese')
+      subject.train(:not_in_china, 'Chinese Macao')
+      subject.classify('Chinese').should eq(:in_china)
+      subject.untrain(:in_china, 'Chinese Chinese')
+      subject.classify('Chinese').should eq(:not_in_china)
+    end
+  end
+  describe "calculate_scores" do
+    it "should return a score hash with the correct scores" do
+      subject = described_class.new(:in_china, :not_in_china)
+      subject.train(:in_china, 'Chinese Beijing Chinese')
+      subject.train(:in_china, 'Chinese Chinese Shanghai')
+      subject.train(:in_china, 'Chinese Macao')
+      subject.train(:not_in_china, 'Tokyo Japan Chinese')
+      scores = subject.calculate_scores('Chinese Chinese Chinese Tokyo Japan')
+      scores[:in_china].should eq(-8.107690312843907)
+      scores[:not_in_china].should eq(-8.906681345001262)
+    end
+  end
+  describe "add_classification" do
+    it "should add the classification to the set of classifications" do
+      subject.classifications.should be_empty
+      subject.add_classification(:niner)
+      subject.classifications.should eq([:niner])
+    end
+    it "should return the classification" do
+      subject.add_classification(:niner).should eq(:niner)
+    end
+  end
+  describe "remove_classification" do
+    it "should remove the classification from the set of classifications" do
+      subject.add_classification(:niner)
+      subject.remove_classification(:niner)
+      subject.classifications.should be_empty
+    end
+    it "should return the classification" do
+      subject.add_classification(:niner)
+      subject.remove_classification(:niner).should eq(:niner)
+    end
+    it "should return nil if the classification didn't exist" do
+      subject.remove_classification(:niner).should be(nil)
+    end
+  end
+end

data/spec/core_ext/array_spec.rb ADDED Viewed

@@ -0,0 +1,13 @@
+require 'spec_helper'
+describe Array do
+  describe "sum_with_identity" do
+    it "should sum the array" do
+      [1,2,3].sum_with_identity.should eq(6)
+    end
+    it "should return 0 when it encounters an empty array" do
+      [].sum_with_identity.should eq(0)
+    end
+  end
+end

data/spec/core_ext/string_spec.rb ADDED Viewed

@@ -0,0 +1,19 @@
+require 'spec_helper'
+describe String do
+  describe "word_hash" do
+    it "should hash text" do
+      hash  = {:good => 1, :"!" => 1, :hope => 1, :"'" => 1, :"." => 1, :love => 1, :word => 1, :them => 1, :test => 1}
+      "here are some good words of test's. I hope you love them!".word_hash.should eq(hash)
+    end
+	end
+  describe "clean_word_hash" do
+    it "should clean and hash text" do
+	    hash  = {:good => 1, :word => 1, :hope => 1, :love => 1, :them => 1, :test => 1}
+  	  "here are some good words of test's. I hope you love them!".clean_word_hash.should eq(hash)
+    end
+  end
+end

data/spec/lsi_spec.rb ADDED Viewed

@@ -0,0 +1,123 @@
+require 'spec_helper'
+describe Reclassifier::LSI do
+  before do
+	  # we repeat principal words to help weight them.
+	  # This test is rather delicate, since this system is mostly noise.
+    @str1 = "This text deals with dogs. Dogs."
+	  @str2 = "This text involves dogs too. Dogs! "
+	  @str3 = "This text revolves around cats. Cats."
+	  @str4 = "This text also involves cats. Cats!"
+	  @str5 = "This text involves birds. Birds."
+  end
+  it "should do basic indexing" do
+    [@str1, @str2, @str3, @str4, @str5].each { |x| subject << x }
+	  subject.needs_rebuild?.should be(false)
+  	# note that the closest match to str1 is str2, even though it is not
+    # the closest text match.
+    subject.find_related(@str1, 3).should eq([@str2, @str5, @str3])
+  end
+  it "should not auto rebuild when it's specified as false" do
+	 subject = described_class.new(:auto_rebuild => false)
+	 subject.add_item @str1, "Dog"
+	 subject.add_item @str2, "Dog"
+	 subject.needs_rebuild?.should be(true)
+	 subject.build_index
+	 subject.needs_rebuild?.should be(false)
+  end
+  it "should do basic classifying" do
+	  subject.add_item(@str2, "Dog")
+	  subject.add_item(@str3, "Cat")
+	  subject.add_item(@str4, "Cat")
+	  subject.add_item(@str5, "Bird")
+	  subject.classify(@str1).should eq("Dog")
+	  subject.classify(@str3).should eq("Cat")
+    subject.classify(@str5).should eq("Bird")
+  end
+  it "should perform better than Bayes" do
+	  bayes = Reclassifier::Bayes.new :dog, :cat, :bird
+    [[@str1, "Dog"],
+ 		 [@str2, "Dog"],
+		 [@str3, "Cat"],
+		 [@str4, "Cat"],
+		 [@str5, "Bird"]].each do |str, classification|
+      subject.add_item(str, classification)
+      bayes.train(classification.downcase.to_sym, str)
+    end
+	  # We're talking about dogs. Even though the text matches the corpus on
+	  # cats better.  Dogs have more semantic weight than cats. So bayes
+	  # will fail here, but the LSI recognizes content.
+	  tricky_case = "This text revolves around dogs."
+	  subject.classify(tricky_case).should eq("Dog")
+	  bayes.classify(tricky_case).should eq(:dog)
+  end
+  it "should recategorize as needed" do
+	  subject.add_item(@str1, "Dog")
+	  subject.add_item(@str2, "Dog")
+	  subject.add_item(@str3, "Cat")
+	  subject.add_item(@str4, "Cat")
+	  subject.add_item(@str5, "Bird")
+	  tricky_case = "This text revolves around dogs."
+	  subject.classify(tricky_case).should eq("Dog")
+	  # Recategorize as needed.
+	  subject.categories_for(@str1).clear.push("Cow")
+	  subject.categories_for(@str2).clear.push("Cow")
+	  subject.needs_rebuild?.should be(false)
+	  subject.classify(tricky_case).should eq("Cow")
+  end
+  it "should search correctly" do
+	  [@str1, @str2, @str3, @str4, @str5].each { |x| subject << x }
+	  # Searching by content and text, note that @str2 comes up first, because
+	  # both "dog" and "involve" are present. But, the next match is @str1 instead
+	  # of @str4, because "dog" carries more weight than involves.
+    subject.search("dog involves", 100).should eq([@str2, @str1, @str4, @str5, @str3])
+	  # Keyword search shows how the space is mapped out in relation to
+	  # dog when magnitude is removed. Note the relations. We move from dog
+	  # through involve and then finally to other words.
+    subject.search("dog", 5).should eq([@str1, @str2, @str4, @str5, @str3])
+  end
+  it "should serialize correctly" do
+	  [@str1, @str2, @str3, @str4, @str5].each { |x| subject << x }
+	  subject_md = Marshal.dump(subject)
+	  subject_m = Marshal.load(subject_md)
+	  subject_m.search("cat", 3).should eq(subject.search("cat", 3))
+	  subject_m.find_related(@str1, 3).should eq(subject.find_related(@str1, 3))
+  end
+  it "should keyword search correctly" do
+	  subject.add_item(@str1, "Dog")
+	  subject.add_item(@str2, "Dog")
+	  subject.add_item(@str3, "Cat")
+	  subject.add_item(@str4, "Cat")
+	  subject.add_item(@str5, "Bird")
+    subject.highest_ranked_stems(@str1).should eq([:dog, :text, :deal])
+  end
+  it "should summarize correctly" do
+    [@str1, @str2, @str3, @str4, @str5].join.summary(2).should eq("This text involves dogs too [...] This text also involves cats")
+  end
+end

data/spec/spec_helper.rb ADDED Viewed

@@ -0,0 +1,5 @@
+require File.join(Dir.pwd, 'lib', 'reclassifier.rb')
+RSpec.configure do |config|
+  config.color = true
+end

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: reclassifier
 version: !ruby/object:Gem::Version
-  version: 0.0.2
+  version: 0.0.3
   prerelease:
 platform: ruby
 authors:
@@ -9,7 +9,7 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2013-04-18 00:00:00.000000000 Z
+date: 2013-04-22 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bundler
@@ -44,7 +44,7 @@ dependencies:
       - !ruby/object:Gem::Version
         version: '0'
 - !ruby/object:Gem::Dependency
-  name: test-unit
+  name: rspec
   requirement: !ruby/object:Gem::Requirement
     none: false
     requirements:
@@ -109,18 +109,18 @@ files:
 - lib/reclassifier/content_node.rb
 - lib/reclassifier/core_ext/array.rb
 - lib/reclassifier/core_ext/matrix.rb
-- lib/reclassifier/core_ext/object.rb
 - lib/reclassifier/core_ext/string.rb
 - lib/reclassifier/core_ext/vector.rb
 - lib/reclassifier/lsi.rb
+- lib/reclassifier/unknown_classification_error.rb
 - lib/reclassifier/version.rb
 - lib/reclassifier/word_list.rb
 - reclassifier.gemspec
-- test/bayes_test.rb
-- test/core_ext/array_test.rb
-- test/core_ext/string_test.rb
-- test/lsi_test.rb
-- test/test_helper.rb
+- spec/bayes_spec.rb
+- spec/core_ext/array_spec.rb
+- spec/core_ext/string_spec.rb
+- spec/lsi_spec.rb
+- spec/spec_helper.rb
 homepage: https://github.com/saveup/reclassifier
 licenses:
 - LGPL
@@ -147,8 +147,8 @@ signing_key:
 specification_version: 3
 summary: Bayesian and Latent Semantic Indexing classification of text.
 test_files:
-- test/bayes_test.rb
-- test/core_ext/array_test.rb
-- test/core_ext/string_test.rb
-- test/lsi_test.rb
-- test/test_helper.rb
+- spec/bayes_spec.rb
+- spec/core_ext/array_spec.rb
+- spec/core_ext/string_spec.rb
+- spec/lsi_spec.rb
+- spec/spec_helper.rb

data/lib/reclassifier/core_ext/object.rb DELETED Viewed

@@ -1,3 +0,0 @@
-class Object
-	def prepare_category_name; to_s.gsub("_"," ").capitalize.intern end
-end

data/test/bayes_test.rb DELETED Viewed

@@ -1,34 +0,0 @@
-require File.join(File.dirname(__FILE__), 'test_helper')
-class BayesTest < Test::Unit::TestCase
-	def setup
-		@classifier = Reclassifier::Bayes.new 'Interesting', 'Uninteresting'
-	end
-	def test_good_training
-		assert_nothing_raised { @classifier.train_interesting "love" }
-	end
-	def test_bad_training
-		assert_raise(StandardError) { @classifier.train_no_category "words" }
-	end
-	def test_bad_method
-		assert_raise(NoMethodError) { @classifier.forget_everything_you_know "" }
-	end
-	def test_categories
-		assert_equal ['Interesting', 'Uninteresting'].sort, @classifier.categories.sort
-	end
-	def test_add_category
-		@classifier.add_category 'Test'
-		assert_equal ['Test', 'Interesting', 'Uninteresting'].sort, @classifier.categories.sort
-	end
-	def test_classification
-		@classifier.train_interesting "here are some good words. I hope you love them"
-		@classifier.train_uninteresting "here are some bad words, I hate you"
-		assert_equal 'Uninteresting', @classifier.classify("I hate bad words and you")
-	end
-end

data/test/core_ext/array_test.rb DELETED Viewed

@@ -1,15 +0,0 @@
-require File.join(File.dirname(__FILE__), '..', 'test_helper')
-class ArrayTest < Test::Unit::TestCase
-  def test_monkey_path_array_sum
-    assert_equal [1,2,3].sum_with_identity, 6
-  end
-  def test_summing_an_empty_array
-    assert_equal [nil].sum_with_identity, 0
-  end
-  def test_summing_an_empty_array
-    assert_equal Array[].sum_with_identity, 0
-  end
-end

data/test/core_ext/string_test.rb DELETED Viewed

@@ -1,13 +0,0 @@
-require File.join(File.dirname(__FILE__), '..', 'test_helper')
-class StringTest < Test::Unit::TestCase
-	def test_word_hash
-		hash = {:good=>1, :"!"=>1, :hope=>1, :"'"=>1, :"."=>1, :love=>1, :word=>1, :them=>1, :test=>1}
-		assert_equal hash, "here are some good words of test's. I hope you love them!".word_hash
-	end
-	def test_clean_word_hash
-	   hash = {:good=>1, :word=>1, :hope=>1, :love=>1, :them=>1, :test=>1}
-	   assert_equal hash, "here are some good words of test's. I hope you love them!".clean_word_hash
-	end
-end

data/test/lsi_test.rb DELETED Viewed

@@ -1,123 +0,0 @@
-require File.join(File.dirname(__FILE__), 'test_helper')
-class LSITest < Test::Unit::TestCase
-	def setup
-	  # we repeat principle words to help weight them.
-	  # This test is rather delicate, since this system is mostly noise.
-     @str1 = "This text deals with dogs. Dogs."
-	  @str2 = "This text involves dogs too. Dogs! "
-	  @str3 = "This text revolves around cats. Cats."
-	  @str4 = "This text also involves cats. Cats!"
-	  @str5 = "This text involves birds. Birds."
-	end
-	def test_basic_indexing
-	 lsi = Reclassifier::LSI.new
-	  [@str1, @str2, @str3, @str4, @str5].each { |x| lsi << x }
-	  assert ! lsi.needs_rebuild?
-	 # note that the closest match to str1 is str2, even though it is not
-	 # the closest text match.
-	 assert_equal [@str2, @str5, @str3], lsi.find_related(@str1, 3)
-	end
-	def test_not_auto_rebuild
-	 lsi = Reclassifier::LSI.new :auto_rebuild => false
-	 lsi.add_item @str1, "Dog"
-	 lsi.add_item @str2, "Dog"
-	 assert lsi.needs_rebuild?
-	 lsi.build_index
-	 assert ! lsi.needs_rebuild?
-	end
-	def test_basic_categorizing
-	  lsi = Reclassifier::LSI.new
-	  lsi.add_item @str2, "Dog"
-	  lsi.add_item @str3, "Cat"
-	  lsi.add_item @str4, "Cat"
-	  lsi.add_item @str5, "Bird"
-	  assert_equal "Dog", lsi.classify( @str1 )
-	  assert_equal "Cat", lsi.classify( @str3 )
-     assert_equal "Bird", lsi.classify( @str5 )
-	end
-	def test_external_classifying
-	  lsi = Reclassifier::LSI.new
-	  bayes = Reclassifier::Bayes.new 'Dog', 'Cat', 'Bird'
-	  lsi.add_item @str1, "Dog" ; bayes.train_dog @str1
-	  lsi.add_item @str2, "Dog" ; bayes.train_dog @str2
-	  lsi.add_item @str3, "Cat" ; bayes.train_cat @str3
-	  lsi.add_item @str4, "Cat" ; bayes.train_cat @str4
-	  lsi.add_item @str5, "Bird" ; bayes.train_bird @str5
-	  # We're talking about dogs. Even though the text matches the corpus on
-	  # cats better.  Dogs have more semantic weight than cats. So bayes
-	  # will fail here, but the LSI recognizes content.
-	  tricky_case = "This text revolves around dogs."
-	  assert_equal "Dog", lsi.classify( tricky_case )
-	  assert_not_equal "Dog", bayes.classify( tricky_case )
-	end
-	def test_recategorize_interface
-	  lsi = Reclassifier::LSI.new
-	  lsi.add_item @str1, "Dog"
-	  lsi.add_item @str2, "Dog"
-	  lsi.add_item @str3, "Cat"
-	  lsi.add_item @str4, "Cat"
-	  lsi.add_item @str5, "Bird"
-	  tricky_case = "This text revolves around dogs."
-	  assert_equal "Dog", lsi.classify( tricky_case )
-	  # Recategorize as needed.
-	  lsi.categories_for(@str1).clear.push "Cow"
-	  lsi.categories_for(@str2).clear.push "Cow"
-	  assert !lsi.needs_rebuild?
-	  assert_equal "Cow", lsi.classify( tricky_case )
-	end
-	def test_search
-	  lsi = Reclassifier::LSI.new
-	  [@str1, @str2, @str3, @str4, @str5].each { |x| lsi << x }
-	  # Searching by content and text, note that @str2 comes up first, because
-	  # both "dog" and "involve" are present. But, the next match is @str1 instead
-	  # of @str4, because "dog" carries more weight than involves.
-	  assert_equal( [@str2, @str1, @str4, @str5, @str3],
-	                lsi.search("dog involves", 100) )
-	  # Keyword search shows how the space is mapped out in relation to
-	  # dog when magnitude is remove. Note the relations. We move from dog
-	  # through involve and then finally to other words.
-	  assert_equal( [@str1, @str2, @str4, @str5, @str3],
-	                lsi.search("dog", 5) )
-	end
-	def test_serialize_safe
-    lsi = Reclassifier::LSI.new
-	  [@str1, @str2, @str3, @str4, @str5].each { |x| lsi << x }
-	  lsi_md = Marshal.dump lsi
-	  lsi_m = Marshal.load lsi_md
-	  assert_equal lsi_m.search("cat", 3), lsi.search("cat", 3)
-	  assert_equal lsi_m.find_related(@str1, 3), lsi.find_related(@str1, 3)
-	end
-	def test_keyword_search
-	  lsi = Reclassifier::LSI.new
-	  lsi.add_item @str1, "Dog"
-	  lsi.add_item @str2, "Dog"
-	  lsi.add_item @str3, "Cat"
-	  lsi.add_item @str4, "Cat"
-	  lsi.add_item @str5, "Bird"
-	  assert_equal [:dog, :text, :deal], lsi.highest_ranked_stems(@str1)
-	end
-	def test_summary
-	   assert_equal "This text involves dogs too [...] This text also involves cats", [@str1, @str2, @str3, @str4, @str5].join.summary(2)
-	end
-end

data/test/test_helper.rb DELETED Viewed

@@ -1,4 +0,0 @@
-$:.unshift(File.dirname(__FILE__) + '/../lib')
-require 'test/unit'
-require 'reclassifier'