RubyGems - reclassifier - Versions diffs - 0.0.4 → 0.1.4 - Mend

reclassifier 0.0.4 → 0.1.4

Files changed (10) hide show

data/lib/reclassifier/bayes.rb +26 -10
data/lib/reclassifier/core_ext/string.rb +0 -116
data/lib/reclassifier/lsi.rb +3 -2
data/lib/reclassifier/version.rb +1 -1
data/lib/reclassifier/word_hash.rb +111 -0
data/lib/reclassifier.rb +3 -2
data/spec/bayes_spec.rb +41 -4
data/spec/lsi_spec.rb +1 -1
data/spec/{core_ext/string_spec.rb → word_hash_spec.rb} +4 -4
metadata +4 -3

data/lib/reclassifier/bayes.rb CHANGED Viewed

@@ -6,15 +6,22 @@
 # Cambridge University Press. 2008, ISBN 0521865719.
 #
 class Reclassifier::Bayes
+  include Reclassifier::WordHash
   # Can be created with zero or more classifications, each of which will be
   # initialized and given a training method.  The classifications are specified as
-  # symbols.  E.g.,
-  #      b = Reclassifier::Bayes.new :interesting, :uninteresting, :spam
-  def initialize(*classifications)
+  # an array of symbols.  Options are specified in a hash.
+  #
+  # Options:
+  # * :clean - If false, punctuation will be included in the classifier.  Otherwise, punctuation will be omitted.  Default is true.
+  #
+  #      b = Reclassifier::Bayes.new([:interesting, :uninteresting, :spam], :clean => true)
+  def initialize(classifications = [], options = {})
     @classifications = {}
-    classifications.each {|classification| @classifications[classification] = {}}
     @docs_in_classification_count = {}
+    @options = options
+    classifications.each {|classification| add_classification(classification)}
   end
   #
@@ -26,10 +33,9 @@ class Reclassifier::Bayes
   def train(classification, text)
     ensure_classification_exists(classification)
-    @docs_in_classification_count[classification] ||= 0
     @docs_in_classification_count[classification] += 1
-    text.word_hash.each do |word, count|
+    smart_word_hash(text).each do |word, count|
       @classifications[classification][word] ||= 0
       @classifications[classification][word] += count
@@ -49,7 +55,7 @@ class Reclassifier::Bayes
     @docs_in_classification_count[classification] -= 1
-    text.word_hash.each do |word, count|
+    smart_word_hash(text).each do |word, count|
       @classifications[classification][word] -= count if @classifications[classification].include?(word)
     end
   end
@@ -68,11 +74,11 @@ class Reclassifier::Bayes
       scores[classification] -= Math.log(@docs_in_classification_count.values.reduce(:+))
       # likelihood
-      text.word_hash.each do |word, count|
+      smart_word_hash(text).each do |word, count|
         if @classifications.values.reduce(Set.new) {|set, word_counts| set.merge(word_counts.keys)}.include?(word)
           scores[classification] += count * Math.log((classification_word_counts[word] || 0) + 1)
-          scores[classification] -= count * Math.log(classification_word_counts.values.reduce(:+) + @classifications.values.reduce(Set.new) {|set, word_counts| set.merge(word_counts.keys)}.count)
+          scores[classification] -= count * Math.log(classification_word_counts.values.reduce(:+).to_i + @classifications.values.reduce(Set.new) {|set, word_counts| set.merge(word_counts.keys)}.count)
         end
       end
     end
@@ -107,6 +113,8 @@ class Reclassifier::Bayes
   def add_classification(classification)
     @classifications[classification] ||= {}
+    @docs_in_classification_count[classification] ||= 0
     classification
   end
@@ -132,4 +140,12 @@ class Reclassifier::Bayes
     def ensure_classification_exists(classification)
       raise Reclassifier::UnknownClassificationError unless @classifications.include?(classification)
     end
+    def smart_word_hash(string)
+      if @options[:clean] == false
+        word_hash(string)
+      else
+        clean_word_hash(string)
+      end
+    end
 end

data/lib/reclassifier/core_ext/string.rb CHANGED Viewed

@@ -1,120 +1,4 @@
 class String
-  # Removes common punctuation symbols, returning a new string.
-  # E.g.,
-  #   "Hello (greeting's), with {braces} < >...?".without_punctuation
-  #   => "Hello  greetings   with  braces         "
-  def without_punctuation
-    tr( ',?.!;:"@#$%^&*()_=+[]{}\|<>/`~', " " ) .tr( "'\-", "")
-  end
-  # Return a Hash of strings => ints. Each word in the string is stemmed,
-  # symbolized, and indexed to its frequency in the document.
-	def word_hash
-		word_hash_for_words(gsub(/[^\w\s]/,"").split + gsub(/[\w]/," ").split)
-	end
-	# Return a word hash without extra punctuation or short symbols, just stemmed words
-	def clean_word_hash
-		word_hash_for_words gsub(/[^\w\s]/,"").split
-	end
-	def word_hash_for_words(words)
-		d = Hash.new
-		words.each do |word|
-			word.downcase! if word =~ /[\w]+/
-			key = word.stem.to_sym
-			if word =~ /[^\w]/ || ! CORPUS_SKIP_WORDS.include?(word) && word.length > 2
-				d[key] ||= 0
-				d[key] += 1
-			end
-		end
-		return d
-	end
-	CORPUS_SKIP_WORDS = [
-      "a",
-      "again",
-      "all",
-      "along",
-      "are",
-      "also",
-      "an",
-      "and",
-      "as",
-      "at",
-      "but",
-      "by",
-      "came",
-      "can",
-      "cant",
-      "couldnt",
-      "did",
-      "didn",
-      "didnt",
-      "do",
-      "doesnt",
-      "dont",
-      "ever",
-      "first",
-      "from",
-      "have",
-      "her",
-      "here",
-      "him",
-      "how",
-      "i",
-      "if",
-      "in",
-      "into",
-      "is",
-      "isnt",
-      "it",
-      "itll",
-      "just",
-      "last",
-      "least",
-      "like",
-      "most",
-      "my",
-      "new",
-      "no",
-      "not",
-      "now",
-      "of",
-      "on",
-      "or",
-      "should",
-      "sinc",
-      "so",
-      "some",
-      "th",
-      "than",
-      "this",
-      "that",
-      "the",
-      "their",
-      "then",
-      "those",
-      "to",
-      "told",
-      "too",
-      "true",
-      "try",
-      "until",
-      "url",
-      "us",
-      "were",
-      "when",
-      "whether",
-      "while",
-      "with",
-      "within",
-      "yes",
-      "you",
-      "youll",
-      ]
    def summary( count=10, separator=" [...] " )
       perform_lsi split_sentences, count, separator
    end

data/lib/reclassifier/lsi.rb CHANGED Viewed

@@ -6,6 +6,7 @@ module Reclassifier
   # data based on underlying semantic relations. For more information on the algorithms used,
   # please consult Wikipedia[http://en.wikipedia.org/wiki/Latent_Semantic_Indexing].
   class LSI
+    include Reclassifier::WordHash
     attr_reader :word_list
     attr_accessor :auto_rebuild
@@ -41,7 +42,7 @@ module Reclassifier
     #   lsi.add_item ar, *ar.categories { |x| ar.content }
     #
     def add_item( item, *categories, &block )
-      clean_word_hash = block ? block.call(item).clean_word_hash : item.to_s.clean_word_hash
+      clean_word_hash = block ? clean_word_hash(block.call(item)) : clean_word_hash(item.to_s)
       @items[item] = ContentNode.new(clean_word_hash, *categories)
       @version += 1
       build_index if @auto_rebuild
@@ -276,7 +277,7 @@ module Reclassifier
       if @items[item]
         return @items[item]
       else
-        clean_word_hash = block ? block.call(item).clean_word_hash : item.to_s.clean_word_hash
+        clean_word_hash = block ? clean_word_hash(block.call(item)) : clean_word_hash(item.to_s)
         cn = ContentNode.new(clean_word_hash, &block) # make the node and extract the data

data/lib/reclassifier/version.rb CHANGED Viewed

@@ -1,3 +1,3 @@
 module Reclassifier
-  VERSION = "0.0.4"
+  VERSION = "0.1.4"
 end

data/lib/reclassifier/word_hash.rb ADDED Viewed

@@ -0,0 +1,111 @@
+module Reclassifier::WordHash
+	CORPUS_SKIP_WORDS = ["a",
+                       "again",
+                       "all",
+                       "along",
+                       "are",
+                       "also",
+                       "an",
+                       "and",
+                       "as",
+                       "at",
+                       "but",
+                       "by",
+                       "came",
+                       "can",
+                       "cant",
+                       "couldnt",
+                       "did",
+                       "didn",
+                       "didnt",
+                       "do",
+                       "doesnt",
+                       "dont",
+                       "ever",
+                       "first",
+                       "from",
+                       "have",
+                       "her",
+                       "here",
+                       "him",
+                       "how",
+                       "i",
+                       "if",
+                       "in",
+                       "into",
+                       "is",
+                       "isnt",
+                       "it",
+                       "itll",
+                       "just",
+                       "last",
+                       "least",
+                       "like",
+                       "most",
+                       "my",
+                       "new",
+                       "no",
+                       "not",
+                       "now",
+                       "of",
+                       "on",
+                       "or",
+                       "should",
+                       "sinc",
+                       "so",
+                       "some",
+                       "th",
+                       "than",
+                       "this",
+                       "that",
+                       "the",
+                       "their",
+                       "then",
+                       "those",
+                       "to",
+                       "told",
+                       "too",
+                       "true",
+                       "try",
+                       "until",
+                       "url",
+                       "us",
+                       "were",
+                       "when",
+                       "whether",
+                       "while",
+                       "with",
+                       "within",
+                       "yes",
+                       "you",
+                       "youll"]
+  # Return a Hash of symbols => ints. Each word in the string is downcased,
+  # stemmed, symbolized, and mapped to its frequency in the document.
+	def word_hash(string)
+		word_hash_for_words(string.gsub(/[^\w\s]/,"").split + string.gsub(/[\w]/," ").split)
+	end
+	# Return a word hash of stemmed words only: punctuation is stripped first and, unlike word_hash, no punctuation tokens are counted
+	def clean_word_hash(string)
+		word_hash_for_words(string.gsub(/[^\w\s]/,"").split)
+	end
+	def word_hash_for_words(words)
+		d = {}
+		words.each do |word|
+			word.downcase!
+			key = word.stem.to_sym
+			if word =~ /[^\w]/ || !CORPUS_SKIP_WORDS.include?(word) && word.length > 2
+				d[key] ||= 0
+				d[key] += 1
+			end
+		end
+    d
+	end
+end

data/lib/reclassifier.rb CHANGED Viewed

@@ -12,8 +12,9 @@ require 'gsl/vector'
 module Reclassifier
   autoload :Bayes,                      'reclassifier/bayes'
-  autoload :LSI,                        'reclassifier/lsi'
   autoload :ContentNode,                'reclassifier/content_node'
-  autoload :WordList,                   'reclassifier/word_list'
+  autoload :LSI,                        'reclassifier/lsi'
   autoload :UnknownClassificationError, 'reclassifier/unknown_classification_error'
+  autoload :WordHash,                   'reclassifier/word_hash'
+  autoload :WordList,                   'reclassifier/word_list'
 end

data/spec/bayes_spec.rb CHANGED Viewed

@@ -3,7 +3,7 @@ require 'spec_helper'
 describe Reclassifier::Bayes do
 	describe "classifications" do
     it "should return the classifications" do
-      subject = described_class.new(:interesting, :uninteresting)
+      subject = described_class.new([:interesting, :uninteresting])
       subject.classifications.sort.should eq([:interesting, :uninteresting])
     end
@@ -15,7 +15,7 @@ describe Reclassifier::Bayes do
     end
     it "should train the classifier to the (classification, document) pair" do
-      subject = described_class.new(:in_china, :not_in_china)
+      subject = described_class.new([:in_china, :not_in_china])
       subject.train(:in_china, 'Chinese Beijing Chinese')
       subject.train(:in_china, 'Chinese Chinese Shanghai')
@@ -32,7 +32,7 @@ describe Reclassifier::Bayes do
     end
     it "should untrain the classifier against the (classification, document) pair" do
-      subject = described_class.new(:in_china, :not_in_china)
+      subject = described_class.new([:in_china, :not_in_china])
       subject.train(:in_china, 'Chinese Chinese')
       subject.train(:not_in_china, 'Chinese Macao')
@@ -47,7 +47,7 @@ describe Reclassifier::Bayes do
   describe "calculate_scores" do
     it "should return a score hash with the correct scores" do
-      subject = described_class.new(:in_china, :not_in_china)
+      subject = described_class.new([:in_china, :not_in_china])
       subject.train(:in_china, 'Chinese Beijing Chinese')
       subject.train(:in_china, 'Chinese Chinese Shanghai')
@@ -59,6 +59,14 @@ describe Reclassifier::Bayes do
       scores[:in_china].should eq(-8.107690312843907)
       scores[:not_in_china].should eq(-8.906681345001262)
     end
+    it "should handle the case when no documents are classified for a particular classification" do
+      subject = described_class.new([:in_china, :not_in_china])
+      subject.train(:in_china, 'Chinese Beijing Chinese')
+      subject.calculate_scores('Chinese Beijing')
+    end
   end
   describe "add_classification" do
@@ -94,4 +102,33 @@ describe Reclassifier::Bayes do
       subject.remove_classification(:niner).should be(nil)
     end
   end
+  context ':clean option' do
+    it 'should cause punctuation to be omitted if it is set to true' do
+      subject = described_class.new([:one, :other], {:clean => true})
+      subject.train(:one, '! ! ! ! bbb')
+      subject.train(:other, 'aaa')
+      subject.classify('! aaa !').should eq(:other)
+    end
+    it 'should default to true' do
+      subject = described_class.new([:one, :other])
+      subject.train(:one, '! ! ! ! bbb')
+      subject.train(:other, 'aaa')
+      subject.classify('! aaa !').should eq(:other)
+    end
+    it 'should cause punctuation not to be omitted if it is set to false' do
+      subject = described_class.new([:one, :other], {:clean => false})
+      subject.train(:one, '! ! ! ! bbb')
+      subject.train(:other, 'aaa')
+      subject.classify('! aaa !').should eq(:one)
+    end
+  end
 end

data/spec/lsi_spec.rb CHANGED Viewed

@@ -45,7 +45,7 @@ describe Reclassifier::LSI do
   end
   it "should perform better than Bayes" do
-	  bayes = Reclassifier::Bayes.new :dog, :cat, :bird
+	  bayes = Reclassifier::Bayes.new([:dog, :cat, :bird])
     [[@str1, "Dog"],
  		 [@str2, "Dog"],

data/spec/{core_ext/string_spec.rb → word_hash_spec.rb} RENAMED Viewed

@@ -1,11 +1,11 @@
-require 'spec_helper'
+require "spec_helper"
-describe String do
+describe Reclassifier::Bayes do
   describe "word_hash" do
     it "should hash text" do
       hash  = {:good => 1, :"!" => 1, :hope => 1, :"'" => 1, :"." => 1, :love => 1, :word => 1, :them => 1, :test => 1}
-      "here are some good words of test's. I hope you love them!".word_hash.should eq(hash)
+      subject.word_hash("here are some good words of test's. I hope you love them!").should eq(hash)
     end
 	end
@@ -13,7 +13,7 @@ describe String do
     it "should clean and hash text" do
 	    hash  = {:good => 1, :word => 1, :hope => 1, :love => 1, :them => 1, :test => 1}
-  	  "here are some good words of test's. I hope you love them!".clean_word_hash.should eq(hash)
+  	  subject.clean_word_hash("here are some good words of test's. I hope you love them!").should eq(hash)
     end
   end
 end

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: reclassifier
 version: !ruby/object:Gem::Version
-  version: 0.0.4
+  version: 0.1.4
   prerelease:
 platform: ruby
 authors:
@@ -114,13 +114,14 @@ files:
 - lib/reclassifier/lsi.rb
 - lib/reclassifier/unknown_classification_error.rb
 - lib/reclassifier/version.rb
+- lib/reclassifier/word_hash.rb
 - lib/reclassifier/word_list.rb
 - reclassifier.gemspec
 - spec/bayes_spec.rb
 - spec/core_ext/array_spec.rb
-- spec/core_ext/string_spec.rb
 - spec/lsi_spec.rb
 - spec/spec_helper.rb
+- spec/word_hash_spec.rb
 homepage: https://github.com/saveup/reclassifier
 licenses:
 - LGPL
@@ -149,6 +150,6 @@ summary: Bayesian and Latent Semantic Indexing classification of text.
 test_files:
 - spec/bayes_spec.rb
 - spec/core_ext/array_spec.rb
-- spec/core_ext/string_spec.rb
 - spec/lsi_spec.rb
 - spec/spec_helper.rb
+- spec/word_hash_spec.rb