RubyGems - reclassifier - Versions diffs - 0.0.2 → 0.0.3 - Mend

reclassifier 0.0.2 → 0.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (18) hide show

data/Rakefile +2 -5
data/lib/reclassifier/bayes.rb +88 -80
data/lib/reclassifier/unknown_classification_error.rb +2 -0
data/lib/reclassifier/version.rb +1 -1
data/lib/reclassifier.rb +5 -5
data/reclassifier.gemspec +1 -1
data/spec/bayes_spec.rb +97 -0
data/spec/core_ext/array_spec.rb +13 -0
data/spec/core_ext/string_spec.rb +19 -0
data/spec/lsi_spec.rb +123 -0
data/spec/spec_helper.rb +5 -0
metadata +14 -14
data/lib/reclassifier/core_ext/object.rb +0 -3
data/test/bayes_test.rb +0 -34
data/test/core_ext/array_test.rb +0 -15
data/test/core_ext/string_test.rb +0 -13
data/test/lsi_test.rb +0 -123
data/test/test_helper.rb +0 -4

data/Rakefile CHANGED Viewed

@@ -1,7 +1,4 @@
 require "bundler/gem_tasks"
-require 'rake/testtask'
+require 'rspec/core/rake_task'
-Rake::TestTask.new do |t|
-  t.libs << 'test'
-  t.test_files = FileList['test/**/*_test.rb']
-end
+RSpec::Core::RakeTask.new(:spec)

data/lib/reclassifier/bayes.rb CHANGED Viewed

@@ -1,129 +1,137 @@
+#
+# Bayesian classifier for arbitrary text.
+#
+# Implementation is translated from
+# Introduction to Information Retrieval by Christopher D. Manning, Prabhakar Raghavan and Hinrich Schütze,
+# Cambridge University Press. 2008, ISBN 0521865719.
+#
 module Reclassifier
   class Bayes
-    # The class can be created with one or more categories, each of which will be
-    # initialized and given a training method. E.g.,
-    #      b = Classifier::Bayes.new 'Interesting', 'Uninteresting', 'Spam'
-    def initialize(*categories)
-      @categories = Hash.new
-      categories.each { |category| @categories[category.prepare_category_name] = Hash.new }
-      @total_words = 0
-      @category_counts = Hash.new(0)
+    # Can be created with zero or more classifications, each of which will be
+    # initialized and given a training method.  The classifications are specified as
+    # symbols.  E.g.,
+    #      b = Reclassifier::Bayes.new :interesting, :uninteresting, :spam
+    def initialize(*classifications)
+      @classifications = {}
+      classifications.each {|classification| @classifications[classification] = {}}
+      @docs_in_classification_count = {}
     end
     #
-    # Provides a general training method for all categories specified in Bayes#new
+    # Provides a general training method for all classifications specified in Bayes#new
     # For example:
-    #     b = Classifier::Bayes.new 'This', 'That', 'the_other'
+    #     b = Reclassifier::Bayes.new :this, :that
     #     b.train :this, "This text"
-    #     b.train "that", "That text"
-    #     b.train "The other", "The other text"
-    def train(category, text)
-      category = category.prepare_category_name
-      @category_counts[category] += 1
+    #     b.train :that, "That text"
+    def train(classification, text)
+      ensure_classification_exists(classification)
+      @docs_in_classification_count[classification] ||= 0
+      @docs_in_classification_count[classification] += 1
       text.word_hash.each do |word, count|
-        @categories[category][word]     ||=     0
-        @categories[category][word]      +=     count
-        @total_words += count
+        @classifications[classification][word] ||= 0
+        @classifications[classification][word] += count
       end
     end
     #
-    # Provides a untraining method for all categories specified in Bayes#new
+    # Untrain a (classification, text) pair.
     # Be very careful with this method.
     #
     # For example:
-    #     b = Classifier::Bayes.new 'This', 'That', 'the_other'
+    #     b = Reclassifier::Bayes.new :this, :that, :the_other
     #     b.train :this, "This text"
     #     b.untrain :this, "This text"
-    def untrain(category, text)
-      category = category.prepare_category_name
-      @category_counts[category] -= 1
+    def untrain(classification, text)
+      ensure_classification_exists(classification)
+      @docs_in_classification_count[classification] -= 1
       text.word_hash.each do |word, count|
-        if @total_words >= 0
-          orig = @categories[category][word]
-          @categories[category][word]     ||=     0
-          @categories[category][word]      -=     count
-          if @categories[category][word] <= 0
-            @categories[category].delete(word)
-            count = orig
-          end
-          @total_words -= count
-        end
+        @classifications[classification][word] -= count if @classifications[classification].include?(word)
       end
     end
     #
-    # Returns the scores in each category the provided +text+. E.g.,
+    # Returns the scores of the specified text for each classification. E.g.,
     #    b.calculate_scores "I hate bad words and you"
     #    =>  {:uninteresting=>-12.6997928013932, :interesting=>-18.4206807439524}
     # The largest of these scores (the one closest to 0) is the one picked out by #classify
-    def classifications(text)
-      score = Hash.new
-      training_count = @category_counts.values.inject { |x,y| x+y }.to_f
-      @categories.each do |category, category_words|
-        score[category.to_s] = 0
-        total = category_words.values.inject(0) {|sum, element| sum+element}
+    def calculate_scores(text)
+      scores = {}
+      @classifications.each do |classification, classification_word_counts|
+        # prior
+        scores[classification] = Math.log(@docs_in_classification_count[classification])
+        scores[classification] -= Math.log(@docs_in_classification_count.values.reduce(:+))
+        # likelihood
         text.word_hash.each do |word, count|
-          s = category_words.has_key?(word) ? category_words[word] : 0.1
-          score[category.to_s] += Math.log(s/total.to_f)
+          if @classifications.values.reduce(Set.new) {|set, word_counts| set.merge(word_counts.keys)}.include?(word)
+            scores[classification] += count * Math.log((classification_word_counts[word] || 0) + 1)
+            scores[classification] -= count * Math.log(classification_word_counts.values.reduce(:+) + @classifications.values.reduce(Set.new) {|set, word_counts| set.merge(word_counts.keys)}.count)
+          end
         end
-        # now add prior probability for the category
-        s = @category_counts.has_key?(category) ? @category_counts[category] : 0.1
-        score[category.to_s] += Math.log(s / training_count)
       end
-      return score
+      scores
     end
     #
-    # Returns the classification of the provided +text+, which is one of the
-    # categories given in the initializer. E.g.,
+    # Returns the classification of the specified text, which is one of the
+    # classifications given in the initializer. E.g.,
     #    b.classify "I hate bad words and you"
-    #    =>  'Uninteresting'
+    #    =>  :uninteresting
     def classify(text)
-      (classifications(text).sort_by { |a| -a[1] })[0][0]
+      calculate_scores(text).max_by {|classification| classification[1]}[0]
     end
     #
-    # Provides training and untraining methods for the categories specified in Bayes#new
+    # Provides a list of classification names
     # For example:
-    #     b = Classifier::Bayes.new 'This', 'That', 'the_other'
-    #     b.train_this "This text"
-    #     b.train_that "That text"
-    #     b.untrain_that "That text"
-    #     b.train_the_other "The other text"
-    def method_missing(name, *args)
-      category = name.to_s.gsub(/(un)?train_([\w]+)/, '\2').prepare_category_name
-      if @categories.has_key? category
-        args.each { |text| eval("#{$1}train(category, text)") }
-      elsif name.to_s =~ /(un)?train_([\w]+)/
-        raise StandardError, "No such category: #{category}"
-      else
-        super  #raise StandardError, "No such method: #{name}"
-      end
+    #     b.classifications
+    #     =>   [:this, :that, :the_other]
+    def classifications
+      @classifications.keys
     end
     #
-    # Provides a list of category names
+    # Adds the classification to the classifier.
+    # Has no effect if the classification already existed.
+    # Returns the classification.
     # For example:
-    #     b.categories
-    #     =>   ['This', 'That', 'the_other']
-    def categories # :nodoc:
-      @categories.keys.collect {|c| c.to_s}
+    #     b.add_classification(:not_spam)
+    def add_classification(classification)
+      @classifications[classification] ||= {}
+      classification
     end
     #
-    # Allows you to add categories to the classifier.
+    # Removes the classification from the classifier.
+    # Returns the classification if it existed, else nil.
     # For example:
-    #     b.add_category "Not spam"
-    #
-    # WARNING: Adding categories to a trained classifier will
-    # result in an undertrained category that will tend to match
-    # more criteria than the trained selective categories. In short,
-    # try to initialize your categories at initialization.
-    def add_category(category)
-      @categories[category.prepare_category_name] = Hash.new
+    #     b.remove_classification(:not_spam)
+    def remove_classification(classification)
+      return_value = if @classifications.include?(classification)
+                       classification
+                     else
+                       nil
+                     end
+      @classifications.delete(classification)
+      return_value
     end
-    alias append_category add_category
+    private
+      def ensure_classification_exists(classification)
+        raise Reclassifier::UnknownClassificationError unless @classifications.include?(classification)
+      end
   end
 end

data/lib/reclassifier/unknown_classification_error.rb ADDED Viewed

	@@ -0,0 +1,2 @@
1	+ class Reclassifier::UnknownClassificationError < StandardError
2	+ end

data/lib/reclassifier/version.rb CHANGED Viewed

@@ -1,3 +1,3 @@
 module Reclassifier
-  VERSION = "0.0.2"
+  VERSION = "0.0.3"
 end

data/lib/reclassifier.rb CHANGED Viewed

@@ -7,13 +7,13 @@ require 'gsl'
 require 'reclassifier/version'
 require 'reclassifier/core_ext/array'
 require 'reclassifier/core_ext/matrix'
-require 'reclassifier/core_ext/object'
 require 'reclassifier/core_ext/string'
 require 'gsl/vector'
 module Reclassifier
-  autoload :Bayes,       'reclassifier/bayes'
-  autoload :LSI,         'reclassifier/lsi'
-  autoload :ContentNode, 'reclassifier/content_node'
-  autoload :WordList,    'reclassifier/word_list'
+  autoload :Bayes,                      'reclassifier/bayes'
+  autoload :LSI,                        'reclassifier/lsi'
+  autoload :ContentNode,                'reclassifier/content_node'
+  autoload :WordList,                   'reclassifier/word_list'
+  autoload :UnknownClassificationError, 'reclassifier/unknown_classification_error'
 end

data/reclassifier.gemspec CHANGED Viewed

@@ -20,7 +20,7 @@ Gem::Specification.new do |spec|
   spec.add_development_dependency 'bundler', '~> 1.3'
   spec.add_development_dependency 'rake'
-  spec.add_development_dependency 'test-unit'
+  spec.add_development_dependency 'rspec'
   spec.add_dependency 'fast-stemmer'
   spec.add_dependency 'gsl'

data/spec/bayes_spec.rb ADDED Viewed

@@ -0,0 +1,97 @@
+require 'spec_helper'
+describe Reclassifier::Bayes do
+	describe "classifications" do
+    it "should return the classifications" do
+      subject = described_class.new(:interesting, :uninteresting)
+      subject.classifications.sort.should eq([:interesting, :uninteresting])
+    end
+	end
+  describe "train" do
+    it "should raise an UnknownClassificationError if the specified classification hasn't been added" do
+      expect {subject.train(:blargle, '')}.to raise_error(Reclassifier::UnknownClassificationError)
+    end
+    it "should train the classifier to the (classification, document) pair" do
+      subject = described_class.new(:in_china, :not_in_china)
+      subject.train(:in_china, 'Chinese Beijing Chinese')
+      subject.train(:in_china, 'Chinese Chinese Shanghai')
+      subject.train(:in_china, 'Chinese Macao')
+      subject.train(:not_in_china, 'Tokyo Japan Chinese')
+      subject.classify('Chinese Chinese Chinese Tokyo Japan').should eq(:in_china)
+    end
+  end
+  describe "untrain" do
+    it "should raise an UnknownClassificationError if the specified classification hasn't been added" do
+      expect {subject.untrain(:blargle, '')}.to raise_error(Reclassifier::UnknownClassificationError)
+    end
+    it "should untrain the classifier against the (classification, document) pair" do
+      subject = described_class.new(:in_china, :not_in_china)
+      subject.train(:in_china, 'Chinese Chinese')
+      subject.train(:not_in_china, 'Chinese Macao')
+      subject.classify('Chinese').should eq(:in_china)
+      subject.untrain(:in_china, 'Chinese Chinese')
+      subject.classify('Chinese').should eq(:not_in_china)
+    end
+  end
+  describe "calculate_scores" do
+    it "should return a score hash with the correct scores" do
+      subject = described_class.new(:in_china, :not_in_china)
+      subject.train(:in_china, 'Chinese Beijing Chinese')
+      subject.train(:in_china, 'Chinese Chinese Shanghai')
+      subject.train(:in_china, 'Chinese Macao')
+      subject.train(:not_in_china, 'Tokyo Japan Chinese')
+      scores = subject.calculate_scores('Chinese Chinese Chinese Tokyo Japan')
+      scores[:in_china].should eq(-8.107690312843907)
+      scores[:not_in_china].should eq(-8.906681345001262)
+    end
+  end
+  describe "add_classification" do
+    it "should add the classification to the set of classifications" do
+      subject.classifications.should be_empty
+      subject.add_classification(:niner)
+      subject.classifications.should eq([:niner])
+    end
+    it "should return the classification" do
+      subject.add_classification(:niner).should eq(:niner)
+    end
+  end
+  describe "remove_classification" do
+    it "should remove the classification from the set of classifications" do
+      subject.add_classification(:niner)
+      subject.remove_classification(:niner)
+      subject.classifications.should be_empty
+    end
+    it "should return the classification" do
+      subject.add_classification(:niner)
+      subject.remove_classification(:niner).should eq(:niner)
+    end
+    it "should return nil if the classification didn't exist" do
+      subject.remove_classification(:niner).should be(nil)
+    end
+  end
+end

data/spec/core_ext/array_spec.rb ADDED Viewed

@@ -0,0 +1,13 @@
+require 'spec_helper'
+describe Array do
+  describe "sum_with_identity" do
+    it "should sum the array" do
+      [1,2,3].sum_with_identity.should eq(6)
+    end
+    it "should return 0 when it encounters an empty array" do
+      [].sum_with_identity.should eq(0)
+    end
+  end
+end

data/spec/core_ext/string_spec.rb ADDED Viewed

@@ -0,0 +1,19 @@
+require 'spec_helper'
+describe String do
+  describe "word_hash" do
+    it "should hash text" do
+      hash  = {:good => 1, :"!" => 1, :hope => 1, :"'" => 1, :"." => 1, :love => 1, :word => 1, :them => 1, :test => 1}
+      "here are some good words of test's. I hope you love them!".word_hash.should eq(hash)
+    end
+	end
+  describe "clean_word_hash" do
+    it "should clean and hash text" do
+	    hash  = {:good => 1, :word => 1, :hope => 1, :love => 1, :them => 1, :test => 1}
+  	  "here are some good words of test's. I hope you love them!".clean_word_hash.should eq(hash)
+    end
+  end
+end

data/spec/lsi_spec.rb ADDED Viewed

@@ -0,0 +1,123 @@
+require 'spec_helper'
+describe Reclassifier::LSI do
+  before do
+	  # we repeat principal words to help weight them.
+	  # This test is rather delicate, since this system is mostly noise.
+    @str1 = "This text deals with dogs. Dogs."
+	  @str2 = "This text involves dogs too. Dogs! "
+	  @str3 = "This text revolves around cats. Cats."
+	  @str4 = "This text also involves cats. Cats!"
+	  @str5 = "This text involves birds. Birds."
+  end
+  it "should do basic indexing" do
+    [@str1, @str2, @str3, @str4, @str5].each { |x| subject << x }
+	  subject.needs_rebuild?.should be(false)
+  	# note that the closest match to str1 is str2, even though it is not
+    # the closest text match.
+    subject.find_related(@str1, 3).should eq([@str2, @str5, @str3])
+  end
+  it "should not auto rebuild when it's specified as false" do
+	 subject = described_class.new(:auto_rebuild => false)
+	 subject.add_item @str1, "Dog"
+	 subject.add_item @str2, "Dog"
+	 subject.needs_rebuild?.should be(true)
+	 subject.build_index
+	 subject.needs_rebuild?.should be(false)
+  end
+  it "should do basic classifying" do
+	  subject.add_item(@str2, "Dog")
+	  subject.add_item(@str3, "Cat")
+	  subject.add_item(@str4, "Cat")
+	  subject.add_item(@str5, "Bird")
+	  subject.classify(@str1).should eq("Dog")
+	  subject.classify(@str3).should eq("Cat")
+    subject.classify(@str5).should eq("Bird")
+  end
+  it "should perform better than Bayes" do
+	  bayes = Reclassifier::Bayes.new :dog, :cat, :bird
+    [[@str1, "Dog"],
+ 		 [@str2, "Dog"],
+		 [@str3, "Cat"],
+		 [@str4, "Cat"],
+		 [@str5, "Bird"]].each do |str, classification|
+      subject.add_item(str, classification)
+      bayes.train(classification.downcase.to_sym, str)
+    end
+	  # We're talking about dogs, even though the text matches the corpus on
+	  # cats better.  Dogs have more semantic weight than cats, so both the
+	  # LSI and the Bayes classifier are expected to pick dog here.
+	  tricky_case = "This text revolves around dogs."
+	  subject.classify(tricky_case).should eq("Dog")
+	  bayes.classify(tricky_case).should eq(:dog)
+  end
+  it "should recategorize as needed" do
+	  subject.add_item(@str1, "Dog")
+	  subject.add_item(@str2, "Dog")
+	  subject.add_item(@str3, "Cat")
+	  subject.add_item(@str4, "Cat")
+	  subject.add_item(@str5, "Bird")
+	  tricky_case = "This text revolves around dogs."
+	  subject.classify(tricky_case).should eq("Dog")
+	  # Recategorize as needed.
+	  subject.categories_for(@str1).clear.push("Cow")
+	  subject.categories_for(@str2).clear.push("Cow")
+	  subject.needs_rebuild?.should be(false)
+	  subject.classify(tricky_case).should eq("Cow")
+  end
+  it "should search correctly" do
+	  [@str1, @str2, @str3, @str4, @str5].each { |x| subject << x }
+	  # Searching by content and text, note that @str2 comes up first, because
+	  # both "dog" and "involve" are present. But, the next match is @str1 instead
+	  # of @str4, because "dog" carries more weight than involves.
+    subject.search("dog involves", 100).should eq([@str2, @str1, @str4, @str5, @str3])
+	  # Keyword search shows how the space is mapped out in relation to
+	  # dog when magnitude is removed. Note the relations. We move from dog
+	  # through involve and then finally to other words.
+    subject.search("dog", 5).should eq([@str1, @str2, @str4, @str5, @str3])
+  end
+  it "should serialize correctly" do
+	  [@str1, @str2, @str3, @str4, @str5].each { |x| subject << x }
+	  subject_md = Marshal.dump(subject)
+	  subject_m = Marshal.load(subject_md)
+	  subject_m.search("cat", 3).should eq(subject.search("cat", 3))
+	  subject_m.find_related(@str1, 3).should eq(subject.find_related(@str1, 3))
+  end
+  it "should keyword search correctly" do
+	  subject.add_item(@str1, "Dog")
+	  subject.add_item(@str2, "Dog")
+	  subject.add_item(@str3, "Cat")
+	  subject.add_item(@str4, "Cat")
+	  subject.add_item(@str5, "Bird")
+    subject.highest_ranked_stems(@str1).should eq([:dog, :text, :deal])
+  end
+  it "should summarize correctly" do
+    [@str1, @str2, @str3, @str4, @str5].join.summary(2).should eq("This text involves dogs too [...] This text also involves cats")
+  end
+end

data/spec/spec_helper.rb ADDED Viewed

@@ -0,0 +1,5 @@
+require File.join(Dir.pwd, 'lib', 'reclassifier.rb')
+RSpec.configure do |config|
+  config.color = true
+end

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: reclassifier
 version: !ruby/object:Gem::Version
-  version: 0.0.2
+  version: 0.0.3
   prerelease:
 platform: ruby
 authors:
@@ -9,7 +9,7 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2013-04-18 00:00:00.000000000 Z
+date: 2013-04-22 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bundler
@@ -44,7 +44,7 @@ dependencies:
       - !ruby/object:Gem::Version
         version: '0'
 - !ruby/object:Gem::Dependency
-  name: test-unit
+  name: rspec
   requirement: !ruby/object:Gem::Requirement
     none: false
     requirements:
@@ -109,18 +109,18 @@ files:
 - lib/reclassifier/content_node.rb
 - lib/reclassifier/core_ext/array.rb
 - lib/reclassifier/core_ext/matrix.rb
-- lib/reclassifier/core_ext/object.rb
 - lib/reclassifier/core_ext/string.rb
 - lib/reclassifier/core_ext/vector.rb
 - lib/reclassifier/lsi.rb
+- lib/reclassifier/unknown_classification_error.rb
 - lib/reclassifier/version.rb
 - lib/reclassifier/word_list.rb
 - reclassifier.gemspec
-- test/bayes_test.rb
-- test/core_ext/array_test.rb
-- test/core_ext/string_test.rb
-- test/lsi_test.rb
-- test/test_helper.rb
+- spec/bayes_spec.rb
+- spec/core_ext/array_spec.rb
+- spec/core_ext/string_spec.rb
+- spec/lsi_spec.rb
+- spec/spec_helper.rb
 homepage: https://github.com/saveup/reclassifier
 licenses:
 - LGPL
@@ -147,8 +147,8 @@ signing_key:
 specification_version: 3
 summary: Bayesian and Latent Semantic Indexing classification of text.
 test_files:
-- test/bayes_test.rb
-- test/core_ext/array_test.rb
-- test/core_ext/string_test.rb
-- test/lsi_test.rb
-- test/test_helper.rb
+- spec/bayes_spec.rb
+- spec/core_ext/array_spec.rb
+- spec/core_ext/string_spec.rb
+- spec/lsi_spec.rb
+- spec/spec_helper.rb

data/lib/reclassifier/core_ext/object.rb DELETED Viewed

@@ -1,3 +0,0 @@
-class Object
-	def prepare_category_name; to_s.gsub("_"," ").capitalize.intern end
-end

data/test/bayes_test.rb DELETED Viewed

@@ -1,34 +0,0 @@
-require File.join(File.dirname(__FILE__), 'test_helper')
-class BayesTest < Test::Unit::TestCase
-	def setup
-		@classifier = Reclassifier::Bayes.new 'Interesting', 'Uninteresting'
-	end
-	def test_good_training
-		assert_nothing_raised { @classifier.train_interesting "love" }
-	end
-	def test_bad_training
-		assert_raise(StandardError) { @classifier.train_no_category "words" }
-	end
-	def test_bad_method
-		assert_raise(NoMethodError) { @classifier.forget_everything_you_know "" }
-	end
-	def test_categories
-		assert_equal ['Interesting', 'Uninteresting'].sort, @classifier.categories.sort
-	end
-	def test_add_category
-		@classifier.add_category 'Test'
-		assert_equal ['Test', 'Interesting', 'Uninteresting'].sort, @classifier.categories.sort
-	end
-	def test_classification
-		@classifier.train_interesting "here are some good words. I hope you love them"
-		@classifier.train_uninteresting "here are some bad words, I hate you"
-		assert_equal 'Uninteresting', @classifier.classify("I hate bad words and you")
-	end
-end

data/test/core_ext/array_test.rb DELETED Viewed

@@ -1,15 +0,0 @@
-require File.join(File.dirname(__FILE__), '..', 'test_helper')
-class ArrayTest < Test::Unit::TestCase
-  def test_monkey_path_array_sum
-    assert_equal [1,2,3].sum_with_identity, 6
-  end
-  def test_summing_an_empty_array
-    assert_equal [nil].sum_with_identity, 0
-  end
-  def test_summing_an_empty_array
-    assert_equal Array[].sum_with_identity, 0
-  end
-end

data/test/core_ext/string_test.rb DELETED Viewed

@@ -1,13 +0,0 @@
-require File.join(File.dirname(__FILE__), '..', 'test_helper')
-class StringTest < Test::Unit::TestCase
-	def test_word_hash
-		hash = {:good=>1, :"!"=>1, :hope=>1, :"'"=>1, :"."=>1, :love=>1, :word=>1, :them=>1, :test=>1}
-		assert_equal hash, "here are some good words of test's. I hope you love them!".word_hash
-	end
-	def test_clean_word_hash
-	   hash = {:good=>1, :word=>1, :hope=>1, :love=>1, :them=>1, :test=>1}
-	   assert_equal hash, "here are some good words of test's. I hope you love them!".clean_word_hash
-	end
-end

data/test/lsi_test.rb DELETED Viewed

@@ -1,123 +0,0 @@
-require File.join(File.dirname(__FILE__), 'test_helper')
-class LSITest < Test::Unit::TestCase
-	def setup
-	  # we repeat principle words to help weight them.
-	  # This test is rather delicate, since this system is mostly noise.
-     @str1 = "This text deals with dogs. Dogs."
-	  @str2 = "This text involves dogs too. Dogs! "
-	  @str3 = "This text revolves around cats. Cats."
-	  @str4 = "This text also involves cats. Cats!"
-	  @str5 = "This text involves birds. Birds."
-	end
-	def test_basic_indexing
-	 lsi = Reclassifier::LSI.new
-	  [@str1, @str2, @str3, @str4, @str5].each { |x| lsi << x }
-	  assert ! lsi.needs_rebuild?
-	 # note that the closest match to str1 is str2, even though it is not
-	 # the closest text match.
-	 assert_equal [@str2, @str5, @str3], lsi.find_related(@str1, 3)
-	end
-	def test_not_auto_rebuild
-	 lsi = Reclassifier::LSI.new :auto_rebuild => false
-	 lsi.add_item @str1, "Dog"
-	 lsi.add_item @str2, "Dog"
-	 assert lsi.needs_rebuild?
-	 lsi.build_index
-	 assert ! lsi.needs_rebuild?
-	end
-	def test_basic_categorizing
-	  lsi = Reclassifier::LSI.new
-	  lsi.add_item @str2, "Dog"
-	  lsi.add_item @str3, "Cat"
-	  lsi.add_item @str4, "Cat"
-	  lsi.add_item @str5, "Bird"
-	  assert_equal "Dog", lsi.classify( @str1 )
-	  assert_equal "Cat", lsi.classify( @str3 )
-     assert_equal "Bird", lsi.classify( @str5 )
-	end
-	def test_external_classifying
-	  lsi = Reclassifier::LSI.new
-	  bayes = Reclassifier::Bayes.new 'Dog', 'Cat', 'Bird'
-	  lsi.add_item @str1, "Dog" ; bayes.train_dog @str1
-	  lsi.add_item @str2, "Dog" ; bayes.train_dog @str2
-	  lsi.add_item @str3, "Cat" ; bayes.train_cat @str3
-	  lsi.add_item @str4, "Cat" ; bayes.train_cat @str4
-	  lsi.add_item @str5, "Bird" ; bayes.train_bird @str5
-	  # We're talking about dogs. Even though the text matches the corpus on
-	  # cats better.  Dogs have more semantic weight than cats. So bayes
-	  # will fail here, but the LSI recognizes content.
-	  tricky_case = "This text revolves around dogs."
-	  assert_equal "Dog", lsi.classify( tricky_case )
-	  assert_not_equal "Dog", bayes.classify( tricky_case )
-	end
-	def test_recategorize_interface
-	  lsi = Reclassifier::LSI.new
-	  lsi.add_item @str1, "Dog"
-	  lsi.add_item @str2, "Dog"
-	  lsi.add_item @str3, "Cat"
-	  lsi.add_item @str4, "Cat"
-	  lsi.add_item @str5, "Bird"
-	  tricky_case = "This text revolves around dogs."
-	  assert_equal "Dog", lsi.classify( tricky_case )
-	  # Recategorize as needed.
-	  lsi.categories_for(@str1).clear.push "Cow"
-	  lsi.categories_for(@str2).clear.push "Cow"
-	  assert !lsi.needs_rebuild?
-	  assert_equal "Cow", lsi.classify( tricky_case )
-	end
-	def test_search
-	  lsi = Reclassifier::LSI.new
-	  [@str1, @str2, @str3, @str4, @str5].each { |x| lsi << x }
-	  # Searching by content and text, note that @str2 comes up first, because
-	  # both "dog" and "involve" are present. But, the next match is @str1 instead
-	  # of @str4, because "dog" carries more weight than involves.
-	  assert_equal( [@str2, @str1, @str4, @str5, @str3],
-	                lsi.search("dog involves", 100) )
-	  # Keyword search shows how the space is mapped out in relation to
-	  # dog when magnitude is remove. Note the relations. We move from dog
-	  # through involve and then finally to other words.
-	  assert_equal( [@str1, @str2, @str4, @str5, @str3],
-	                lsi.search("dog", 5) )
-	end
-	def test_serialize_safe
-    lsi = Reclassifier::LSI.new
-	  [@str1, @str2, @str3, @str4, @str5].each { |x| lsi << x }
-	  lsi_md = Marshal.dump lsi
-	  lsi_m = Marshal.load lsi_md
-	  assert_equal lsi_m.search("cat", 3), lsi.search("cat", 3)
-	  assert_equal lsi_m.find_related(@str1, 3), lsi.find_related(@str1, 3)
-	end
-	def test_keyword_search
-	  lsi = Reclassifier::LSI.new
-	  lsi.add_item @str1, "Dog"
-	  lsi.add_item @str2, "Dog"
-	  lsi.add_item @str3, "Cat"
-	  lsi.add_item @str4, "Cat"
-	  lsi.add_item @str5, "Bird"
-	  assert_equal [:dog, :text, :deal], lsi.highest_ranked_stems(@str1)
-	end
-	def test_summary
-	   assert_equal "This text involves dogs too [...] This text also involves cats", [@str1, @str2, @str3, @str4, @str5].join.summary(2)
-	end
-end

data/test/test_helper.rb DELETED Viewed

@@ -1,4 +0,0 @@
-$:.unshift(File.dirname(__FILE__) + '/../lib')
-require 'test/unit'
-require 'reclassifier'