RubyGems - reclassifier - Versions diffs - 0.0.4 → 0.1.4 - Mend

reclassifier 0.0.4 → 0.1.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (10)

data/lib/reclassifier/bayes.rb +26 -10
data/lib/reclassifier/core_ext/string.rb +0 -116
data/lib/reclassifier/lsi.rb +3 -2
data/lib/reclassifier/version.rb +1 -1
data/lib/reclassifier/word_hash.rb +111 -0
data/lib/reclassifier.rb +3 -2
data/spec/bayes_spec.rb +41 -4
data/spec/lsi_spec.rb +1 -1
data/spec/{core_ext/string_spec.rb → word_hash_spec.rb} +4 -4
metadata +4 -3

data/lib/reclassifier/bayes.rb CHANGED Viewed

@@ -6,15 +6,22 @@
 # Cambridge University Press. 2008, ISBN 0521865719.
 #
 class Reclassifier::Bayes
+  include Reclassifier::WordHash
   # Can be created with zero or more classifications, each of which will be
   # initialized and given a training method.  The classifications are specified as
-  # symbols.  E.g.,
-  #      b = Reclassifier::Bayes.new :interesting, :uninteresting, :spam
-  def initialize(*classifications)
+  # an array of symbols.  Options are specified in a hash.
+  #
+  # Options:
+  # * :clean - If false, punctuation will be included in the classifier.  Otherwise, punctuation will be omitted.  Default is true.
+  #
+  #      b = Reclassifier::Bayes.new([:interesting, :uninteresting, :spam], :clean => true)
+  def initialize(classifications = [], options = {})
     @classifications = {}
-    classifications.each {|classification| @classifications[classification] = {}}
     @docs_in_classification_count = {}
+    @options = options
+    classifications.each {|classification| add_classification(classification)}
   end
   #
@@ -26,10 +33,9 @@ class Reclassifier::Bayes
   def train(classification, text)
     ensure_classification_exists(classification)
-    @docs_in_classification_count[classification] ||= 0
     @docs_in_classification_count[classification] += 1
-    text.word_hash.each do |word, count|
+    smart_word_hash(text).each do |word, count|
       @classifications[classification][word] ||= 0
       @classifications[classification][word] += count
@@ -49,7 +55,7 @@ class Reclassifier::Bayes
     @docs_in_classification_count[classification] -= 1
-    text.word_hash.each do |word, count|
+    smart_word_hash(text).each do |word, count|
       @classifications[classification][word] -= count if @classifications[classification].include?(word)
     end
   end
@@ -68,11 +74,11 @@ class Reclassifier::Bayes
       scores[classification] -= Math.log(@docs_in_classification_count.values.reduce(:+))
       # likelihood
-      text.word_hash.each do |word, count|
+      smart_word_hash(text).each do |word, count|
         if @classifications.values.reduce(Set.new) {|set, word_counts| set.merge(word_counts.keys)}.include?(word)
           scores[classification] += count * Math.log((classification_word_counts[word] || 0) + 1)
-          scores[classification] -= count * Math.log(classification_word_counts.values.reduce(:+) + @classifications.values.reduce(Set.new) {|set, word_counts| set.merge(word_counts.keys)}.count)
+          scores[classification] -= count * Math.log(classification_word_counts.values.reduce(:+).to_i + @classifications.values.reduce(Set.new) {|set, word_counts| set.merge(word_counts.keys)}.count)
         end
       end
     end
@@ -107,6 +113,8 @@ class Reclassifier::Bayes
   def add_classification(classification)
     @classifications[classification] ||= {}
+    @docs_in_classification_count[classification] ||= 0
     classification
   end
@@ -132,4 +140,12 @@ class Reclassifier::Bayes
     def ensure_classification_exists(classification)
       raise Reclassifier::UnknownClassificationError unless @classifications.include?(classification)
     end
+    def smart_word_hash(string)
+      if @options[:clean] == false
+        word_hash(string)
+      else
+        clean_word_hash(string)
+      end
+    end
 end

data/lib/reclassifier/core_ext/string.rb CHANGED Viewed

@@ -1,120 +1,4 @@
 class String
-  # Removes common punctuation symbols, returning a new string.
-  # E.g.,
-  #   "Hello (greeting's), with {braces} < >...?".without_punctuation
-  #   => "Hello  greetings   with  braces         "
-  def without_punctuation
-    tr( ',?.!;:"@#$%^&*()_=+[]{}\|<>/`~', " " ) .tr( "'\-", "")
-  end
-  # Return a Hash of strings => ints. Each word in the string is stemmed,
-  # symbolized, and indexed to its frequency in the document.
-	def word_hash
-		word_hash_for_words(gsub(/[^\w\s]/,"").split + gsub(/[\w]/," ").split)
-	end
-	# Return a word hash without extra punctuation or short symbols, just stemmed words
-	def clean_word_hash
-		word_hash_for_words gsub(/[^\w\s]/,"").split
-	end
-	def word_hash_for_words(words)
-		d = Hash.new
-		words.each do |word|
-			word.downcase! if word =~ /[\w]+/
-			key = word.stem.to_sym
-			if word =~ /[^\w]/ || ! CORPUS_SKIP_WORDS.include?(word) && word.length > 2
-				d[key] ||= 0
-				d[key] += 1
-			end
-		end
-		return d
-	end
-	CORPUS_SKIP_WORDS = [
-      "a",
-      "again",
-      "all",
-      "along",
-      "are",
-      "also",
-      "an",
-      "and",
-      "as",
-      "at",
-      "but",
-      "by",
-      "came",
-      "can",
-      "cant",
-      "couldnt",
-      "did",
-      "didn",
-      "didnt",
-      "do",
-      "doesnt",
-      "dont",
-      "ever",
-      "first",
-      "from",
-      "have",
-      "her",
-      "here",
-      "him",
-      "how",
-      "i",
-      "if",
-      "in",
-      "into",
-      "is",
-      "isnt",
-      "it",
-      "itll",
-      "just",
-      "last",
-      "least",
-      "like",
-      "most",
-      "my",
-      "new",
-      "no",
-      "not",
-      "now",
-      "of",
-      "on",
-      "or",
-      "should",
-      "sinc",
-      "so",
-      "some",
-      "th",
-      "than",
-      "this",
-      "that",
-      "the",
-      "their",
-      "then",
-      "those",
-      "to",
-      "told",
-      "too",
-      "true",
-      "try",
-      "until",
-      "url",
-      "us",
-      "were",
-      "when",
-      "whether",
-      "while",
-      "with",
-      "within",
-      "yes",
-      "you",
-      "youll",
-      ]
    def summary( count=10, separator=" [...] " )
       perform_lsi split_sentences, count, separator
    end

data/lib/reclassifier/lsi.rb CHANGED Viewed

@@ -6,6 +6,7 @@ module Reclassifier
   # data based on underlying semantic relations. For more information on the algorithms used,
   # please consult Wikipedia[http://en.wikipedia.org/wiki/Latent_Semantic_Indexing].
   class LSI
+    include Reclassifier::WordHash
     attr_reader :word_list
     attr_accessor :auto_rebuild
@@ -41,7 +42,7 @@ module Reclassifier
     #   lsi.add_item ar, *ar.categories { |x| ar.content }
     #
     def add_item( item, *categories, &block )
-      clean_word_hash = block ? block.call(item).clean_word_hash : item.to_s.clean_word_hash
+      clean_word_hash = block ? clean_word_hash(block.call(item)) : clean_word_hash(item.to_s)
       @items[item] = ContentNode.new(clean_word_hash, *categories)
       @version += 1
       build_index if @auto_rebuild
@@ -276,7 +277,7 @@ module Reclassifier
       if @items[item]
         return @items[item]
       else
-        clean_word_hash = block ? block.call(item).clean_word_hash : item.to_s.clean_word_hash
+        clean_word_hash = block ? clean_word_hash(block.call(item)) : clean_word_hash(item.to_s)
         cn = ContentNode.new(clean_word_hash, &block) # make the node and extract the data

data/lib/reclassifier/version.rb CHANGED Viewed

@@ -1,3 +1,3 @@
 module Reclassifier
-  VERSION = "0.0.4"
+  VERSION = "0.1.4"
 end

data/lib/reclassifier/word_hash.rb ADDED Viewed

@@ -0,0 +1,111 @@
+module Reclassifier::WordHash
+	CORPUS_SKIP_WORDS = ["a",
+                       "again",
+                       "all",
+                       "along",
+                       "are",
+                       "also",
+                       "an",
+                       "and",
+                       "as",
+                       "at",
+                       "but",
+                       "by",
+                       "came",
+                       "can",
+                       "cant",
+                       "couldnt",
+                       "did",
+                       "didn",
+                       "didnt",
+                       "do",
+                       "doesnt",
+                       "dont",
+                       "ever",
+                       "first",
+                       "from",
+                       "have",
+                       "her",
+                       "here",
+                       "him",
+                       "how",
+                       "i",
+                       "if",
+                       "in",
+                       "into",
+                       "is",
+                       "isnt",
+                       "it",
+                       "itll",
+                       "just",
+                       "last",
+                       "least",
+                       "like",
+                       "most",
+                       "my",
+                       "new",
+                       "no",
+                       "not",
+                       "now",
+                       "of",
+                       "on",
+                       "or",
+                       "should",
+                       "sinc",
+                       "so",
+                       "some",
+                       "th",
+                       "than",
+                       "this",
+                       "that",
+                       "the",
+                       "their",
+                       "then",
+                       "those",
+                       "to",
+                       "told",
+                       "too",
+                       "true",
+                       "try",
+                       "until",
+                       "url",
+                       "us",
+                       "were",
+                       "when",
+                       "whether",
+                       "while",
+                       "with",
+                       "within",
+                       "yes",
+                       "you",
+                       "youll"]
+  # Return a Hash of symbols => ints. Each word in the string is stemmed,
+  # symbolized, and indexed to its frequency in the document.
+	def word_hash(string)
+		word_hash_for_words(string.gsub(/[^\w\s]/,"").split + string.gsub(/[\w]/," ").split)
+	end
+	# Return a word hash without extra punctuation or short symbols, just stemmed words
+	def clean_word_hash(string)
+		word_hash_for_words(string.gsub(/[^\w\s]/,"").split)
+	end
+	def word_hash_for_words(words)
+		d = {}
+		words.each do |word|
+			word.downcase!
+			key = word.stem.to_sym
+			if word =~ /[^\w]/ || !CORPUS_SKIP_WORDS.include?(word) && word.length > 2
+				d[key] ||= 0
+				d[key] += 1
+			end
+		end
+    d
+	end
+end

data/lib/reclassifier.rb CHANGED Viewed

@@ -12,8 +12,9 @@ require 'gsl/vector'
 module Reclassifier
   autoload :Bayes,                      'reclassifier/bayes'
-  autoload :LSI,                        'reclassifier/lsi'
   autoload :ContentNode,                'reclassifier/content_node'
-  autoload :WordList,                   'reclassifier/word_list'
+  autoload :LSI,                        'reclassifier/lsi'
   autoload :UnknownClassificationError, 'reclassifier/unknown_classification_error'
+  autoload :WordHash,                   'reclassifier/word_hash'
+  autoload :WordList,                   'reclassifier/word_list'
 end

data/spec/bayes_spec.rb CHANGED Viewed

@@ -3,7 +3,7 @@ require 'spec_helper'
 describe Reclassifier::Bayes do
 	describe "classifications" do
     it "should return the classifications" do
-      subject = described_class.new(:interesting, :uninteresting)
+      subject = described_class.new([:interesting, :uninteresting])
       subject.classifications.sort.should eq([:interesting, :uninteresting])
     end
@@ -15,7 +15,7 @@ describe Reclassifier::Bayes do
     end
     it "should train the classifier to the (classification, document) pair" do
-      subject = described_class.new(:in_china, :not_in_china)
+      subject = described_class.new([:in_china, :not_in_china])
       subject.train(:in_china, 'Chinese Beijing Chinese')
       subject.train(:in_china, 'Chinese Chinese Shanghai')
@@ -32,7 +32,7 @@ describe Reclassifier::Bayes do
     end
     it "should untrain the classifier against the (classification, document) pair" do
-      subject = described_class.new(:in_china, :not_in_china)
+      subject = described_class.new([:in_china, :not_in_china])
       subject.train(:in_china, 'Chinese Chinese')
       subject.train(:not_in_china, 'Chinese Macao')
@@ -47,7 +47,7 @@ describe Reclassifier::Bayes do
   describe "calculate_scores" do
     it "should return a score hash with the correct scores" do
-      subject = described_class.new(:in_china, :not_in_china)
+      subject = described_class.new([:in_china, :not_in_china])
       subject.train(:in_china, 'Chinese Beijing Chinese')
       subject.train(:in_china, 'Chinese Chinese Shanghai')
@@ -59,6 +59,14 @@ describe Reclassifier::Bayes do
       scores[:in_china].should eq(-8.107690312843907)
       scores[:not_in_china].should eq(-8.906681345001262)
     end
+    it "should handle the case when no documents are classified for a particular classification" do
+      subject = described_class.new([:in_china, :not_in_china])
+      subject.train(:in_china, 'Chinese Beijing Chinese')
+      subject.calculate_scores('Chinese Beijing')
+    end
   end
   describe "add_classification" do
@@ -94,4 +102,33 @@ describe Reclassifier::Bayes do
       subject.remove_classification(:niner).should be(nil)
     end
   end
+  context ':clean option' do
+    it 'should cause punctuation to be omitted if it is set to true' do
+      subject = described_class.new([:one, :other], {:clean => true})
+      subject.train(:one, '! ! ! ! bbb')
+      subject.train(:other, 'aaa')
+      subject.classify('! aaa !').should eq(:other)
+    end
+    it 'should default to true' do
+      subject = described_class.new([:one, :other])
+      subject.train(:one, '! ! ! ! bbb')
+      subject.train(:other, 'aaa')
+      subject.classify('! aaa !').should eq(:other)
+    end
+    it 'should cause punctuation not to be omitted if it is set to false' do
+      subject = described_class.new([:one, :other], {:clean => false})
+      subject.train(:one, '! ! ! ! bbb')
+      subject.train(:other, 'aaa')
+      subject.classify('! aaa !').should eq(:one)
+    end
+  end
 end

data/spec/lsi_spec.rb CHANGED Viewed

@@ -45,7 +45,7 @@ describe Reclassifier::LSI do
   end
   it "should perform better than Bayes" do
-	  bayes = Reclassifier::Bayes.new :dog, :cat, :bird
+	  bayes = Reclassifier::Bayes.new([:dog, :cat, :bird])
     [[@str1, "Dog"],
  		 [@str2, "Dog"],

data/spec/{core_ext/string_spec.rb → word_hash_spec.rb} RENAMED Viewed

@@ -1,11 +1,11 @@
-require 'spec_helper'
+require "spec_helper"
-describe String do
+describe Reclassifier::Bayes do
   describe "word_hash" do
     it "should hash text" do
       hash  = {:good => 1, :"!" => 1, :hope => 1, :"'" => 1, :"." => 1, :love => 1, :word => 1, :them => 1, :test => 1}
-      "here are some good words of test's. I hope you love them!".word_hash.should eq(hash)
+      subject.word_hash("here are some good words of test's. I hope you love them!").should eq(hash)
     end
 	end
@@ -13,7 +13,7 @@ describe String do
     it "should clean and hash text" do
 	    hash  = {:good => 1, :word => 1, :hope => 1, :love => 1, :them => 1, :test => 1}
-  	  "here are some good words of test's. I hope you love them!".clean_word_hash.should eq(hash)
+  	  subject.clean_word_hash("here are some good words of test's. I hope you love them!").should eq(hash)
     end
   end
 end

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: reclassifier
 version: !ruby/object:Gem::Version
-  version: 0.0.4
+  version: 0.1.4
   prerelease:
 platform: ruby
 authors:
@@ -114,13 +114,14 @@ files:
 - lib/reclassifier/lsi.rb
 - lib/reclassifier/unknown_classification_error.rb
 - lib/reclassifier/version.rb
+- lib/reclassifier/word_hash.rb
 - lib/reclassifier/word_list.rb
 - reclassifier.gemspec
 - spec/bayes_spec.rb
 - spec/core_ext/array_spec.rb
-- spec/core_ext/string_spec.rb
 - spec/lsi_spec.rb
 - spec/spec_helper.rb
+- spec/word_hash_spec.rb
 homepage: https://github.com/saveup/reclassifier
 licenses:
 - LGPL
@@ -149,6 +150,6 @@ summary: Bayesian and Latent Semantic Indexing classification of text.
 test_files:
 - spec/bayes_spec.rb
 - spec/core_ext/array_spec.rb
-- spec/core_ext/string_spec.rb
 - spec/lsi_spec.rb
 - spec/spec_helper.rb
+- spec/word_hash_spec.rb