reclassifier 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,87 @@
+ # Reclassifier
+
+ Reclassifier is a gem that provides [classification](http://en.wikipedia.org/wiki/Statistical_classification) of strings.
+
+ Classification can be done via [Naïve Bayes](https://en.wikipedia.org/wiki/Naive_Bayes_classifier) or [Latent Semantic Indexing](http://en.wikipedia.org/wiki/Latent_semantic_indexing).
+
+ It is a fork of the original [Classifier](https://github.com/cardmagic/classifier) gem, which appears to have been unmaintained for the past couple of years.
+
+ ## Installation
+
+ Add this line to your application's Gemfile:
+
+     gem 'reclassifier'
+
+ And then execute:
+
+     $ bundle
+
+ Or install it yourself as:
+
+     $ gem install reclassifier
+
+ ## Dependencies
+
+ Reclassifier currently requires the GNU GSL library: http://www.gnu.org/software/gsl
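+
+ For example, on most systems the C library can be installed with the package manager and the Ruby bindings with RubyGems (Reclassifier does `require 'gsl'` internally). Exact package names vary by platform, so treat the following as a rough sketch rather than canonical instructions:
+
+     # GNU GSL (C library)
+     $ brew install gsl                   # OS X / Homebrew
+     $ sudo apt-get install libgsl-dev    # Debian/Ubuntu (older releases: libgsl0-dev)
+
+     # Ruby bindings
+     $ gem install gsl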
+
+ ## Usage
+
+ ### Bayes
+
+ Bayesian classifiers are accurate, fast, and have modest memory requirements.
+
+ #### Usage
+
+     require 'reclassifier'
+     b = Reclassifier::Bayes.new 'Interesting', 'Uninteresting'
+     b.train_interesting "here are some good words. I hope you love them"
+     b.train_uninteresting "here are some bad words, I hate you"
+     b.classify "I hate bad words and you" # returns 'Uninteresting'
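+
+ If you want the raw per-category scores rather than just the winning label, `Bayes#classifications` returns a hash of log-scaled scores, and `#classify` simply picks the largest (the one closest to zero). The exact numbers below are illustrative:
+
+     b.classifications "I hate bad words and you"
+     # returns => {"Uninteresting" => -12.69..., "Interesting" => -18.42...}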
+
+     require 'madeleine'
+     m = SnapshotMadeleine.new("bayes_data") {
+       Reclassifier::Bayes.new 'Interesting', 'Uninteresting'
+     }
+     m.system.train_interesting "here are some good words. I hope you love them"
+     m.system.train_uninteresting "here are some bad words, I hate you"
+     m.take_snapshot
+     m.system.classify "I love you" # returns 'Interesting'
+
+ Using Madeleine, your application can persist the learned data over time.
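+
+ When the process is restarted, constructing a `SnapshotMadeleine` on the same directory restores the classifier from the most recent snapshot, so earlier training is not lost. A minimal sketch, assuming the snapshot above was taken:
+
+     m = SnapshotMadeleine.new("bayes_data") {
+       Reclassifier::Bayes.new 'Interesting', 'Uninteresting'
+     }
+     m.system.classify "I love you" # returns 'Interesting', using the persisted training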
+
+ ### LSI
+
+ Latent Semantic Indexing engines are not as fast or as small as Bayesian classifiers, but they are more flexible, providing
+ fast search and cluster detection as well as semantic analysis of the text that, in theory, simulates human learning.
+
+ #### Usage
+
+     require 'reclassifier'
+     lsi = Reclassifier::LSI.new
+     strings = [ ["This text deals with dogs. Dogs.", :dog],
+                 ["This text involves dogs too. Dogs! ", :dog],
+                 ["This text revolves around cats. Cats.", :cat],
+                 ["This text also involves cats. Cats!", :cat],
+                 ["This text involves birds. Birds.", :bird] ]
+     strings.each { |x| lsi.add_item x.first, x.last }
+
+     lsi.search("dog", 3)
+     # returns => ["This text deals with dogs. Dogs.", "This text involves dogs too. Dogs! ",
+     #             "This text also involves cats. Cats!"]
+
+     lsi.find_related(strings[2], 2)
+     # returns => ["This text revolves around cats. Cats.", "This text also involves cats. Cats!"]
+
+     lsi.classify "This text is also about dogs!"
+     # returns => :dog
+
+ Please see the Reclassifier::LSI documentation for more information. It is possible to index, search and classify
+ with more than just simple strings.
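+
+ For instance, the original Classifier gem's `add_item` accepts arbitrary objects plus a block that extracts their text, and search results are returned as the objects themselves. Assuming the fork keeps that signature (the `Post` struct below is purely illustrative), usage would look roughly like:
+
+     Post = Struct.new(:title, :body)
+     posts = [Post.new("Dogs", "This post deals with dogs. Dogs."),
+              Post.new("Cats", "This post revolves around cats. Cats.")]
+
+     lsi = Reclassifier::LSI.new
+     posts.each do |post|
+       lsi.add_item(post) { |p| "#{p.title} #{p.body}" }
+     end
+     lsi.search("dogs", 1) # returns the matching Post objects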
+
+ ## Contributing
+
+ 1. Fork it
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
+ 4. Push to the branch (`git push origin my-new-feature`)
+ 5. Create a new Pull Request
+
+ ## License
+
+ This library is released under the terms of the GNU LGPL. See LICENSE for more details.
@@ -0,0 +1,7 @@
+ require "bundler/gem_tasks"
+ require 'rake/testtask'
+
+ Rake::TestTask.new do |t|
+   t.libs << 'test'
+   t.test_files = FileList['test/**/*_test.rb']
+ end
@@ -0,0 +1,12 @@
+ module GSL
+   class Vector
+     def _dump(v)
+       Marshal.dump(self.to_a)
+     end
+
+     def self._load(arr)
+       arry = Marshal.load(arr)
+       return GSL::Vector.alloc(arry)
+     end
+   end
+ end
@@ -0,0 +1,19 @@
+ # gems
+ require 'matrix'
+ require 'fast-stemmer'
+ require 'gsl'
+
+ # files
+ require 'reclassifier/version'
+ require 'reclassifier/core_ext/array'
+ require 'reclassifier/core_ext/matrix'
+ require 'reclassifier/core_ext/object'
+ require 'reclassifier/core_ext/string'
+ require 'gsl/vector'
+
+ module Reclassifier
+   autoload :Bayes,       'reclassifier/bayes'
+   autoload :LSI,         'reclassifier/lsi'
+   autoload :ContentNode, 'reclassifier/content_node'
+   autoload :WordList,    'reclassifier/word_list'
+ end
@@ -0,0 +1,129 @@
+ module Reclassifier
+   class Bayes
+     # The class can be created with one or more categories, each of which will be
+     # initialized and given a training method. E.g.,
+     #   b = Reclassifier::Bayes.new 'Interesting', 'Uninteresting', 'Spam'
+     def initialize(*categories)
+       @categories = Hash.new
+       categories.each { |category| @categories[category.prepare_category_name] = Hash.new }
+       @total_words = 0
+       @category_counts = Hash.new(0)
+     end
+
+     #
+     # Provides a general training method for all categories specified in Bayes#new
+     # For example:
+     #   b = Reclassifier::Bayes.new 'This', 'That', 'the_other'
+     #   b.train :this, "This text"
+     #   b.train "that", "That text"
+     #   b.train "The other", "The other text"
+     def train(category, text)
+       category = category.prepare_category_name
+       @category_counts[category] += 1
+       text.word_hash.each do |word, count|
+         @categories[category][word] ||= 0
+         @categories[category][word] += count
+         @total_words += count
+       end
+     end
+
+     #
+     # Provides an untraining method for all categories specified in Bayes#new
+     # Be very careful with this method.
+     #
+     # For example:
+     #   b = Reclassifier::Bayes.new 'This', 'That', 'the_other'
+     #   b.train :this, "This text"
+     #   b.untrain :this, "This text"
+     def untrain(category, text)
+       category = category.prepare_category_name
+       @category_counts[category] -= 1
+       text.word_hash.each do |word, count|
+         if @total_words >= 0
+           orig = @categories[category][word]
+           @categories[category][word] ||= 0
+           @categories[category][word] -= count
+           if @categories[category][word] <= 0
+             @categories[category].delete(word)
+             count = orig
+           end
+           @total_words -= count
+         end
+       end
+     end
+
+     #
+     # Returns the scores in each category for the provided +text+. E.g.,
+     #   b.classifications "I hate bad words and you"
+     #   => {"Uninteresting"=>-12.6997928013932, "Interesting"=>-18.4206807439524}
+     # The largest of these scores (the one closest to 0) is the one picked out by #classify
+     def classifications(text)
+       score = Hash.new
+       training_count = @category_counts.values.inject { |x, y| x + y }.to_f
+       @categories.each do |category, category_words|
+         score[category.to_s] = 0
+         total = category_words.values.inject(0) { |sum, element| sum + element }
+         text.word_hash.each do |word, count|
+           s = category_words.has_key?(word) ? category_words[word] : 0.1
+           score[category.to_s] += Math.log(s / total.to_f)
+         end
+         # now add the prior probability for the category
+         s = @category_counts.has_key?(category) ? @category_counts[category] : 0.1
+         score[category.to_s] += Math.log(s / training_count)
+       end
+       return score
+     end
+
+     #
+     # Returns the classification of the provided +text+, which is one of the
+     # categories given in the initializer. E.g.,
+     #   b.classify "I hate bad words and you"
+     #   => 'Uninteresting'
+     def classify(text)
+       (classifications(text).sort_by { |a| -a[1] })[0][0]
+     end
+
+     #
+     # Provides training and untraining methods for the categories specified in Bayes#new
+     # For example:
+     #   b = Reclassifier::Bayes.new 'This', 'That', 'the_other'
+     #   b.train_this "This text"
+     #   b.train_that "That text"
+     #   b.untrain_that "That text"
+     #   b.train_the_other "The other text"
+     def method_missing(name, *args)
+       category = name.to_s.gsub(/(un)?train_([\w]+)/, '\2').prepare_category_name
+       if @categories.has_key? category
+         args.each { |text| eval("#{$1}train(category, text)") }
+       elsif name.to_s =~ /(un)?train_([\w]+)/
+         raise StandardError, "No such category: #{category}"
+       else
+         super # raise StandardError, "No such method: #{name}"
+       end
+     end
+
+     #
+     # Provides a list of category names
+     # For example:
+     #   b.categories
+     #   => ['This', 'That', 'the_other']
+     def categories # :nodoc:
+       @categories.keys.collect { |c| c.to_s }
+     end
+
+     #
+     # Allows you to add categories to the classifier.
+     # For example:
+     #   b.add_category "Not spam"
+     #
+     # WARNING: Adding categories to a trained classifier will
+     # result in an undertrained category that will tend to match
+     # more criteria than the trained selective categories. In short,
+     # try to define all of your categories when the classifier is created.
+     def add_category(category)
+       @categories[category.prepare_category_name] = Hash.new
+     end
+
+     alias append_category add_category
+   end
+ end
@@ -0,0 +1,66 @@
+ module Reclassifier
+
+   # This is an internal data structure class for the LSI node. Save for
+   # raw_vector_with, it should be fairly straightforward to understand.
+   # You should never have to use it directly.
+   class ContentNode
+     attr_accessor :raw_vector, :raw_norm,
+                   :lsi_vector, :lsi_norm,
+                   :categories
+
+     attr_reader :word_hash
+
+     # Creates a node from a word hash (word => frequency) and an
+     # optional list of categories.
+     def initialize(word_hash, *categories)
+       @categories = categories || []
+       @word_hash = word_hash
+     end
+
+     # Use this to fetch the appropriate search vector.
+     def search_vector
+       @lsi_vector || @raw_vector
+     end
+
+     # Use this to fetch the appropriate search vector in normalized form.
+     def search_norm
+       @lsi_norm || @raw_norm
+     end
+
+     # Creates the raw vector out of word_hash using word_list as the
+     # key for mapping the vector space.
+     def raw_vector_with(word_list)
+       if $GSL
+         vec = GSL::Vector.alloc(word_list.size)
+       else
+         vec = Array.new(word_list.size, 0)
+       end
+
+       @word_hash.each_key do |word|
+         vec[word_list[word]] = @word_hash[word] if word_list[word]
+       end
+
+       # Perform the scaling transform
+       total_words = $GSL ? vec.sum : vec.sum_with_identity
+
+       # Perform first-order association transform if this vector has more
+       # than one word in it.
+       if total_words > 1.0
+         weighted_total = 0.0
+         vec.each do |term|
+           if term > 0
+             weighted_total += ((term / total_words) * Math.log(term / total_words))
+           end
+         end
+         vec = vec.collect { |val| Math.log(val + 1) / -weighted_total }
+       end
+
+       if $GSL
+         @raw_norm   = vec.normalize
+         @raw_vector = vec
+       else
+         @raw_norm   = Vector[*vec].normalize
+         @raw_vector = Vector[*vec]
+       end
+     end
+   end
+ end
@@ -0,0 +1,11 @@
+ class Array
+   def sum_with_identity(identity = 0, &block)
+     return identity unless size > 0
+
+     if block_given?
+       map(&block).sum
+     else
+       reduce(:+)
+     end
+   end
+ end
@@ -0,0 +1,72 @@
+ class Matrix
+   def Matrix.diag(s)
+     Matrix.diagonal(*s)
+   end
+
+   alias :trans :transpose
+
+   def SV_decomp(maxSweeps = 20)
+     if self.row_size >= self.column_size
+       q = self.trans * self
+     else
+       q = self * self.trans
+     end
+
+     qrot = q.dup
+     v = Matrix.identity(q.row_size)
+     azrot = nil
+     mzrot = nil
+     cnt = 0
+     s_old = nil
+     mu = nil
+
+     while true do
+       cnt += 1
+       for row in (0...qrot.row_size-1) do
+         for col in (1..qrot.row_size-1) do
+           next if row == col
+           h = Math.atan((2 * qrot[row,col])/(qrot[row,row]-qrot[col,col]))/2.0
+           hcos = Math.cos(h)
+           hsin = Math.sin(h)
+           mzrot = Matrix.identity(qrot.row_size)
+           mzrot[row,row] = hcos
+           mzrot[row,col] = -hsin
+           mzrot[col,row] = hsin
+           mzrot[col,col] = hcos
+           qrot = mzrot.trans * qrot * mzrot
+           v = v * mzrot
+         end
+       end
+       s_old = qrot.dup if cnt == 1
+       sum_qrot = 0.0
+       if cnt > 1
+         qrot.row_size.times do |r|
+           sum_qrot += (qrot[r,r]-s_old[r,r]).abs if (qrot[r,r]-s_old[r,r]).abs > 0.001
+         end
+         s_old = qrot.dup
+       end
+       break if (sum_qrot <= 0.001 and cnt > 1) or cnt >= maxSweeps
+     end # of do while true
+
+     s = []
+     qrot.row_size.times do |r|
+       s << Math.sqrt(qrot[r,r])
+     end
+     #puts "cnt = #{cnt}"
+     if self.row_size >= self.column_size
+       mu = self * v * Matrix.diagonal(*s).inverse
+       return [mu, v, s]
+     else
+       puts v.row_size
+       puts v.column_size
+       puts self.row_size
+       puts self.column_size
+       puts s.size
+
+       mu = (self.trans * v * Matrix.diagonal(*s).inverse)
+       return [mu, v, s]
+     end
+   end
+
+   def []=(i,j,val)
+     @rows[i][j] = val
+   end
+ end
@@ -0,0 +1,3 @@
+ class Object
+   def prepare_category_name
+     to_s.gsub("_", " ").capitalize.intern
+   end
+ end
@@ -0,0 +1,143 @@
+ class String
+
+   # Removes common punctuation symbols, returning a new string.
+   # E.g.,
+   #   "Hello (greeting's), with {braces} < >...?".without_punctuation
+   #   => "Hello greetings with braces "
+   def without_punctuation
+     tr( ',?.!;:"@#$%^&*()_=+[]{}\|<>/`~', " " ).tr( "'\-", "")
+   end
+
+   # Return a Hash of symbols => ints. Each word in the string is stemmed,
+   # symbolized, and indexed to its frequency in the document.
+   def word_hash
+     word_hash_for_words(gsub(/[^\w\s]/,"").split + gsub(/[\w]/," ").split)
+   end
+
+   # Return a word hash without extra punctuation or short symbols, just stemmed words
+   def clean_word_hash
+     word_hash_for_words gsub(/[^\w\s]/,"").split
+   end
+
+   def word_hash_for_words(words)
+     d = Hash.new
+     words.each do |word|
+       word.downcase! if word =~ /[\w]+/
+       key = word.stem.to_sym
+       if word =~ /[^\w]/ || ! CORPUS_SKIP_WORDS.include?(word) && word.length > 2
+         d[key] ||= 0
+         d[key] += 1
+       end
+     end
+     return d
+   end
+
+   CORPUS_SKIP_WORDS = [
+     "a",
+     "again",
+     "all",
+     "along",
+     "are",
+     "also",
+     "an",
+     "and",
+     "as",
+     "at",
+     "but",
+     "by",
+     "came",
+     "can",
+     "cant",
+     "couldnt",
+     "did",
+     "didn",
+     "didnt",
+     "do",
+     "doesnt",
+     "dont",
+     "ever",
+     "first",
+     "from",
+     "have",
+     "her",
+     "here",
+     "him",
+     "how",
+     "i",
+     "if",
+     "in",
+     "into",
+     "is",
+     "isnt",
+     "it",
+     "itll",
+     "just",
+     "last",
+     "least",
+     "like",
+     "most",
+     "my",
+     "new",
+     "no",
+     "not",
+     "now",
+     "of",
+     "on",
+     "or",
+     "should",
+     "sinc",
+     "so",
+     "some",
+     "th",
+     "than",
+     "this",
+     "that",
+     "the",
+     "their",
+     "then",
+     "those",
+     "to",
+     "told",
+     "too",
+     "true",
+     "try",
+     "until",
+     "url",
+     "us",
+     "were",
+     "when",
+     "whether",
+     "while",
+     "with",
+     "within",
+     "yes",
+     "you",
+     "youll",
+   ]
+
+   def summary( count=10, separator=" [...] " )
+     perform_lsi split_sentences, count, separator
+   end
+
+   def paragraph_summary( count=1, separator=" [...] " )
+     perform_lsi split_paragraphs, count, separator
+   end
+
+   def split_sentences
+     split /(\.|\!|\?)/ # TODO: make this less primitive
+   end
+
+   def split_paragraphs
+     split /(\n\n|\r\r|\r\n\r\n)/ # TODO: make this less primitive
+   end
+
+   private
+
+   def perform_lsi(chunks, count, separator)
+     lsi = Reclassifier::LSI.new :auto_rebuild => false
+     chunks.each { |chunk| lsi << chunk unless chunk.strip.empty? || chunk.strip.split.size == 1 }
+     lsi.build_index
+     summaries = lsi.highest_relative_content count
+     return summaries.reject { |chunk| !summaries.include? chunk }.map { |x| x.strip }.join(separator)
+   end
+ end