RubyGems - otherinbox-classifier - Versions diffs - 1.3.1.1 - Mend

otherinbox-classifier 1.3.1.1

Files changed (20) hide show

data/LICENSE +429 -0
data/README +88 -0
data/Rakefile +96 -0
data/bin/bayes.rb +36 -0
data/bin/summarize.rb +16 -0
data/lib/classifier.rb +30 -0
data/lib/classifier/bayes.rb +172 -0
data/lib/classifier/extensions/string.rb +16 -0
data/lib/classifier/extensions/vector.rb +106 -0
data/lib/classifier/extensions/vector_serialize.rb +20 -0
data/lib/classifier/extensions/word_hash.rb +154 -0
data/lib/classifier/lsi.rb +318 -0
data/lib/classifier/lsi/content_node.rb +72 -0
data/lib/classifier/lsi/summary.rb +31 -0
data/lib/classifier/lsi/word_list.rb +36 -0
data/test/bayes/bayesian_test.rb +33 -0
data/test/extensions/word_hash_test.rb +14 -0
data/test/lsi/lsi_test.rb +123 -0
data/test/test_helper.rb +4 -0
metadata +85 -0

data/lib/classifier/lsi/content_node.rb ADDED

@@ -0,0 +1,72 @@
+# Author::    David Fayram  (mailto:dfayram@lensmen.net)
+# Copyright:: Copyright (c) 2005 David Fayram II
+# License::   LGPL
+module Classifier
+# Internal data structure for an LSI node: it holds a word hash, its
+# categories, and the raw and LSI vectors together with their norms.
+# You should never have to use it directly.
+  class ContentNode
+    attr_accessor :raw_vector, :raw_norm,
+                  :lsi_vector, :lsi_norm,
+                  :categories
+    attr_reader :word_hash
+    # Stores word_hash as given; any further arguments are collected into
+    # the categories array (empty when none are passed).
+    def initialize( word_hash, *categories )
+      @categories = categories || [] # a splat is always an Array, so the fallback never applies
+      @word_hash = word_hash
+    end
+    # Returns the LSI vector once one has been assigned, otherwise the raw vector.
+    def search_vector
+      @lsi_vector || @raw_vector
+    end
+    # Returns the LSI norm once one has been assigned, otherwise the raw norm.
+    def search_norm
+      @lsi_norm || @raw_norm
+    end
+    # Builds @raw_norm and @raw_vector from word_hash and returns the latter.
+    # word_list maps a word to its vector index; unmapped words are skipped.
+    def raw_vector_with( word_list )
+      if $GSL # GSL vector when the $GSL global is set, plain Array otherwise
+         vec = GSL::Vector.alloc(word_list.size)
+      else
+         vec = Array.new(word_list.size, 0)
+      end
+      @word_hash.each_key do |word|
+        vec[word_list[word]] = @word_hash[word] if word_list[word]
+      end
+      # Total of all entries. NOTE(review): the divisions below assume a Float here -- confirm.
+      total_words = vec.sum
+      # When the total exceeds one, rescale each entry to log(val + 1) / -sum(p * log(p))
+      # with p = term / total. NOTE(review): one repeated word makes that sum 0 -- confirm.
+      if total_words > 1.0
+        weighted_total = 0.0
+        vec.each do |term|
+          if ( term > 0 ) # skip empty slots so Math.log is only applied to positive terms
+            weighted_total += (( term / total_words ) * Math.log( term / total_words ))
+          end
+        end
+        vec = vec.collect { |val| Math.log( val + 1 ) / -weighted_total }
+      end
+      if $GSL
+         @raw_norm   = vec.normalize
+         @raw_vector = vec
+      else
+         @raw_norm   = Vector[*vec].normalize
+         @raw_vector = Vector[*vec]
+      end
+    end
+  end
+end

data/lib/classifier/lsi/summary.rb ADDED

@@ -0,0 +1,31 @@
+# Author::    Lucas Carlson  (mailto:lucas@rufy.com)
+# Copyright:: Copyright (c) 2005 Lucas Carlson
+# License::   LGPL
+class String
+   def summary( count=10, separator=" [...] " ) # summarizes the sentence chunks via perform_lsi
+      perform_lsi split_sentences, count, separator
+   end
+   def paragraph_summary( count=1, separator=" [...] " ) # summarizes the paragraph chunks via perform_lsi
+      perform_lsi split_paragraphs, count, separator
+   end
+   def split_sentences
+      split /(\.|\!|\?)/ # TODO: make this less primitive (the capture group keeps the delimiters as chunks)
+   end
+   def split_paragraphs
+      split /(\n\n|\r\r|\r\n\r\n)/ # TODO: make this less primitive (the capture group keeps the delimiters as chunks)
+   end
+   private
+   def perform_lsi(chunks, count, separator) # indexes the chunks, then joins what highest_relative_content returns
+      lsi = Classifier::LSI.new :auto_rebuild => false
+      chunks.each { |chunk| lsi << chunk unless chunk.strip.empty? || chunk.strip.split.size == 1 } # blank and one-word chunks are not indexed
+      lsi.build_index
+      summaries = lsi.highest_relative_content count
+      return summaries.reject { |chunk| !summaries.include? chunk }.map { |x| x.strip }.join(separator) # NOTE(review): filtering summaries by itself removes nothing -- confirm intent
+   end
+end

data/lib/classifier/lsi/word_list.rb ADDED

@@ -0,0 +1,36 @@
+# Author::    David Fayram  (mailto:dfayram@lensmen.net)
+# Copyright:: Copyright (c) 2005 David Fayram II
+# License::   LGPL
+module Classifier
+  # Keeps a word => index mapping in a Hash. It is used to map stemmed words
+  # to dimensions of a vector.
+  class WordList
+    def initialize
+      @location_table = Hash.new
+    end
+    # Registers word under the next free index (the current table size) unless it already has one.
+    def add_word(word)
+      term = word
+      @location_table[term] = @location_table.size unless @location_table[term]
+    end
+    # Returns the index stored for lookup, or nil when the word was never added.
+    def [](lookup)
+      term = lookup
+      @location_table[term]
+    end
+    def word_for_index(ind) # reverse lookup (index -> word); inverts the whole table on every call
+      @location_table.invert[ind]
+    end
+    # Returns the number of distinct words registered so far.
+    def size
+      @location_table.size
+    end
+  end
+end

data/test/bayes/bayesian_test.rb ADDED

@@ -0,0 +1,33 @@
+require File.dirname(__FILE__) + '/../test_helper'
+class BayesianTest < Test::Unit::TestCase
+	def setup
+		@classifier = Classifier::Bayes.new 'Interesting', 'Uninteresting' # fresh two-category classifier for every test
+	end
+	def test_good_training
+		assert_nothing_raised { @classifier.train_interesting "love" }
+	end
+	def test_bad_training
+		assert_raise(StandardError) { @classifier.train_no_category "words" } # presumably fails because no matching category was added -- confirm against Bayes
+	end
+	def test_bad_method
+		assert_raise(NoMethodError) { @classifier.forget_everything_you_know "" } # unknown method names are expected to raise NoMethodError
+	end
+	def test_categories
+		assert_equal ['Interesting', 'Uninteresting'].sort, @classifier.categories.sort # sorted on both sides, so order is not asserted
+	end
+	def test_add_category
+		@classifier.add_category 'Test'
+		assert_equal ['Test', 'Interesting', 'Uninteresting'].sort, @classifier.categories.sort
+	end
+	def test_classification
+		@classifier.train_interesting "here are some good words. I hope you love them"
+		@classifier.train_uninteresting "here are some bad words, I hate you"
+		assert_equal 'Uninteresting', @classifier.classify("I hate bad words and you")
+	end
+end

data/test/extensions/word_hash_test.rb ADDED

@@ -0,0 +1,14 @@
+require File.dirname(__FILE__) + '/../test_helper'
+class StringExtensionsTest < Test::Unit::TestCase
+	def test_word_hash # the expected hash keeps punctuation symbols as keys
+		hash = {:good=>1, :"!"=>1, :hope=>1, :"'"=>1, :"."=>1, :love=>1, :word=>1, :them=>1, :test=>1}
+		assert_equal hash, "here are some good words of test's. I hope you love them!".word_hash
+	end
+	def test_clean_word_hash # same sentence; the expected hash has no punctuation keys
+	   hash = {:good=>1, :word=>1, :hope=>1, :love=>1, :them=>1, :test=>1}
+	   assert_equal hash, "here are some good words of test's. I hope you love them!".clean_word_hash
+	end
+end

data/test/lsi/lsi_test.rb ADDED

@@ -0,0 +1,123 @@
+require File.dirname(__FILE__) + '/../test_helper'
+class LSITest < Test::Unit::TestCase
+	def setup
+	  # We repeat the principal words to help weight them.
+	  # This test is rather delicate, since this system is mostly noise.
+     @str1 = "This text deals with dogs. Dogs."
+	  @str2 = "This text involves dogs too. Dogs! "
+	  @str3 = "This text revolves around cats. Cats."
+	  @str4 = "This text also involves cats. Cats!"
+	  @str5 = "This text involves birds. Birds."
+	end
+	def test_basic_indexing
+	 lsi = Classifier::LSI.new
+	  [@str1, @str2, @str3, @str4, @str5].each { |x| lsi << x }
+	  assert ! lsi.needs_rebuild?
+	 # Note that the closest match to @str1 is @str2, even though it is not
+	 # the closest text match.
+	 assert_equal [@str2, @str5, @str3], lsi.find_related(@str1, 3)
+	end
+	def test_not_auto_rebuild
+	 lsi = Classifier::LSI.new :auto_rebuild => false
+	 lsi.add_item @str1, "Dog"
+	 lsi.add_item @str2, "Dog"
+	 assert lsi.needs_rebuild?
+	 lsi.build_index
+	 assert ! lsi.needs_rebuild?
+	end
+	def test_basic_categorizing
+	  lsi = Classifier::LSI.new
+	  lsi.add_item @str2, "Dog"
+	  lsi.add_item @str3, "Cat"
+	  lsi.add_item @str4, "Cat"
+	  lsi.add_item @str5, "Bird"
+	  assert_equal "Dog", lsi.classify( @str1 )
+	  assert_equal "Cat", lsi.classify( @str3 )
+     assert_equal "Bird", lsi.classify( @str5 )
+	end
+	def test_external_classifying
+	  lsi = Classifier::LSI.new
+	  bayes = Classifier::Bayes.new 'Dog', 'Cat', 'Bird'
+	  lsi.add_item @str1, "Dog" ; bayes.train_dog @str1
+	  lsi.add_item @str2, "Dog" ; bayes.train_dog @str2
+	  lsi.add_item @str3, "Cat" ; bayes.train_cat @str3
+	  lsi.add_item @str4, "Cat" ; bayes.train_cat @str4
+	  lsi.add_item @str5, "Bird" ; bayes.train_bird @str5
+	  # We're talking about dogs, even though the text matches the corpus on
+	  # cats better. Dogs have more semantic weight than cats, so bayes
+	  # will fail here, but the LSI recognizes content.
+	  tricky_case = "This text revolves around dogs."
+	  assert_equal "Dog", lsi.classify( tricky_case )
+	  assert_not_equal "Dog", bayes.classify( tricky_case )
+	end
+	def test_recategorize_interface
+	  lsi = Classifier::LSI.new
+	  lsi.add_item @str1, "Dog"
+	  lsi.add_item @str2, "Dog"
+	  lsi.add_item @str3, "Cat"
+	  lsi.add_item @str4, "Cat"
+	  lsi.add_item @str5, "Bird"
+	  tricky_case = "This text revolves around dogs."
+	  assert_equal "Dog", lsi.classify( tricky_case )
+	  # Recategorize as needed.
+	  lsi.categories_for(@str1).clear.push "Cow"
+	  lsi.categories_for(@str2).clear.push "Cow"
+	  assert !lsi.needs_rebuild?
+	  assert_equal "Cow", lsi.classify( tricky_case )
+	end
+	def test_search
+	  lsi = Classifier::LSI.new
+	  [@str1, @str2, @str3, @str4, @str5].each { |x| lsi << x }
+	  # Searching by content and text: note that @str2 comes up first, because
+	  # both "dog" and "involve" are present. But the next match is @str1 instead
+	  # of @str4, because "dog" carries more weight than "involve".
+	  assert_equal( [@str2, @str1, @str4, @str5, @str3],
+	                lsi.search("dog involves", 100) )
+	  # Keyword search shows how the space is mapped out in relation to
+	  # dog when magnitude is removed. Note the relations. We move from dog
+	  # through involve and then finally to other words.
+	  assert_equal( [@str1, @str2, @str4, @str5, @str3],
+	                lsi.search("dog", 5) )
+	end
+	def test_serialize_safe
+    lsi = Classifier::LSI.new
+	  [@str1, @str2, @str3, @str4, @str5].each { |x| lsi << x }
+	  lsi_md = Marshal.dump lsi
+	  lsi_m = Marshal.load lsi_md
+	  assert_equal lsi_m.search("cat", 3), lsi.search("cat", 3)
+	  assert_equal lsi_m.find_related(@str1, 3), lsi.find_related(@str1, 3)
+	end
+	def test_keyword_search
+	  lsi = Classifier::LSI.new
+	  lsi.add_item @str1, "Dog"
+	  lsi.add_item @str2, "Dog"
+	  lsi.add_item @str3, "Cat"
+	  lsi.add_item @str4, "Cat"
+	  lsi.add_item @str5, "Bird"
+	  assert_equal [:dog, :text, :deal], lsi.highest_ranked_stems(@str1)
+	end
+	def test_summary
+	   assert_equal "This text involves dogs too [...] This text also involves cats", [@str1, @str2, @str3, @str4, @str5].join.summary(2)
+	end
+end

data/test/test_helper.rb ADDED

@@ -0,0 +1,4 @@
+$:.unshift(File.dirname(__FILE__) + '/../lib')
+require 'test/unit'
+require 'classifier'

metadata ADDED

@@ -0,0 +1,85 @@
+--- !ruby/object:Gem::Specification
+name: otherinbox-classifier
+version: !ruby/object:Gem::Version
+  version: 1.3.1.1
+platform: ruby
+authors:
+- Lucas Carlson
+autorequire: classifier
+bindir: bin
+cert_chain:
+date: 2008-01-19 00:00:00 -08:00
+default_executable:
+dependencies:
+- !ruby/object:Gem::Dependency
+  name: stemmer
+  type: :runtime
+  version_requirement:
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: 1.0.0
+    version:
+description: A general classifier module to allow Bayesian and other types of classifications.
+email: lucas@rufy.com
+executables: []
+extensions: []
+extra_rdoc_files: []
+files:
+- lib/classifier.rb
+- lib/classifier
+- lib/classifier/bayes.rb
+- lib/classifier/lsi.rb
+- lib/classifier/extensions
+- lib/classifier/extensions/string.rb
+- lib/classifier/extensions/vector.rb
+- lib/classifier/extensions/vector_serialize.rb
+- lib/classifier/extensions/word_hash.rb
+- lib/classifier/lsi
+- lib/classifier/lsi/content_node.rb
+- lib/classifier/lsi/summary.rb
+- lib/classifier/lsi/word_list.rb
+- bin/bayes.rb
+- bin/summarize.rb
+- test/bayes
+- test/bayes/bayesian_test.rb
+- test/test_helper.rb
+- test/extensions
+- test/extensions/word_hash_test.rb
+- test/lsi
+- test/lsi/lsi_test.rb
+- README
+- Rakefile
+- LICENSE
+has_rdoc: true
+homepage: http://classifier.rufy.com/
+post_install_message:
+rdoc_options: []
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">"
+    - !ruby/object:Gem::Version
+      version: 0.0.0
+  version:
+required_rubygems_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: "0"
+  version:
+requirements:
+- A porter-stemmer module to split word stems.
+rubyforge_project:
+rubygems_version: 1.2.0
+signing_key:
+specification_version: 1
+summary: A general classifier module to allow Bayesian and other types of classifications.
+test_files: []