yury-classifier 1.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,303 @@
1
module Classifier
  # Shared behaviour for the concrete classifiers (e.g. Bayes):
  # tokenisation, punctuation stripping, stemming and stop-word filtering.
  class Base

    # Options:
    #   :language - stemmer / stop-word language code (default 'en')
    #   :encoding - stemmer encoding (default 'UTF_8')
    #
    # NOTE(review): Hash#reverse_merge! comes from ActiveSupport (not stdlib)
    # and mutates the caller-supplied hash in place — subclasses (Bayes) rely
    # on the same hash reaching this method via `super`. Confirm ActiveSupport
    # is loaded before this file.
    def initialize(options = {})
      options.reverse_merge!(:language => 'en')
      options.reverse_merge!(:encoding => 'UTF_8')

      # Stored whole; later handed to Lingua::Stemmer.new in word_hash_for_words.
      @options = options
    end
10
+
11
# Normalises a category identifier into its canonical Symbol form:
# underscores become spaces, the first character is upcased, and the
# result is interned.
#   prepare_category_name(:the_other)  # => :"The other"
def prepare_category_name val
  val.to_s.tr("_", " ").capitalize.to_sym
end
14
+
15
# Removes common punctuation symbols, returning a new string.
# E.g.,
#   "Hello (greeting's), with {braces} < >...?".without_punctuation
#   => "Hello greetings with braces "
# Most punctuation is replaced by a space; apostrophes and hyphens are
# deleted outright so contractions collapse ("don't" -> "dont").
def without_punctuation str
  spaced = str.tr( ',?.!;:"@#$%^&*()_=+[]{}\|<>/`~', " " )
  spaced.tr( "'\-", "" )
end
22
+
23
# Return a Hash of strings => ints. Each word in the string is stemmed,
# interned, and indexes to its frequency in the document.
# Word-character tokens and pure-symbol runs are tokenised separately so
# both kinds contribute entries.
def word_hash str
  word_tokens   = str.gsub(/[^\w\s]/,"").split
  symbol_tokens = str.gsub(/[\w]/," ").split
  word_hash_for_words(word_tokens + symbol_tokens)
end
28
+
29
# Return a word hash without extra punctuation or short symbols, just
# stemmed words — punctuation/symbol runs are discarded before counting.
def clean_word_hash str
  tokens = str.gsub(/[^\w\s]/,"").split
  word_hash_for_words tokens
end
33
+
34
private

# Builds the frequency Hash (stemmed-word Symbol => count) for an array of
# tokens.
#
# NOTE(review): depends on two non-stdlib pieces — Lingua::Stemmer
# (ruby-stemmer gem, configured from @options) and String#mb_chars
# (ActiveSupport). The bang call assumes mb_chars' downcase! mutates the
# underlying string, as it did in the ActiveSupport versions this gem
# targets — verify against the bundled version.
def word_hash_for_words(words)
  stemmer = Lingua::Stemmer.new(@options)
  d = Hash.new
  # Languages without a stop-word list fall back to filtering nothing.
  skip_words = SKIP_WORDS[@options[:language]] || []
  words.each do |word|
    # Multibyte-aware, in-place downcase for tokens containing word chars.
    word.mb_chars.downcase! if word =~ /[\w]+/
    key = stemmer.stem(word).intern
    # Precedence is `a || (b && c)`: pure-symbol tokens always count;
    # word tokens count only when not a stop word AND longer than 2 chars.
    if word =~ /[^\w]/ || ! skip_words.include?(word) && word.length > 2
      d[key] ||= 0
      d[key] += 1
    end
  end
  return d
end
50
+
51
# English stop words skipped by word_hash_for_words. NOTE(review): entries
# such as "sinc", "th", "didn" and "itll" look like pre-stemmed or
# apostrophe-stripped forms rather than dictionary words — presumably they
# are meant to match tokens after without_punctuation/stemming; verify.
EN_CORPUS_SKIP_WORDS = [
  "a",
  "again",
  "all",
  "along",
  "are",
  "also",
  "an",
  "and",
  "as",
  "at",
  "but",
  "by",
  "came",
  "can",
  "cant",
  "couldnt",
  "did",
  "didn",
  "didnt",
  "do",
  "doesnt",
  "dont",
  "ever",
  "first",
  "from",
  "have",
  "her",
  "here",
  "him",
  "how",
  "i",
  "if",
  "in",
  "into",
  "is",
  "isnt",
  "it",
  "itll",
  "just",
  "last",
  "least",
  "like",
  "most",
  "my",
  "new",
  "no",
  "not",
  "now",
  "of",
  "on",
  "or",
  "should",
  "sinc",
  "so",
  "some",
  "th",
  "than",
  "this",
  "that",
  "the",
  "their",
  "then",
  "those",
  "to",
  "told",
  "too",
  "true",
  "try",
  "until",
  "url",
  "us",
  "were",
  "when",
  "whether",
  "while",
  "with",
  "within",
  "yes",
  "you",
  "youll",
]

# Russian stop words, taken from the Snowball project's stop list:
# http://snowball.tartarus.org/algorithms/russian/stop.txt
RU_CORPUS_SKIP_WORDS = [
  "и", # and
  "в", # in/into
  "во", # alternative form
  "не", # not
  "что", # what/that
  "он", # he
  "на", # on/onto
  "я", # i
  "с", # from
  "со", # alternative form
  "как", # how
  "а", # milder form of `no' (but)
  "то", # conjunction and form of `that'
  "все", # all
  "она", # she
  "так", # so, thus
  "его", # him
  "но", # but
  "да", # yes/and
  "ты", # thou
  "к", # towards, by
  "у", # around, chez
  "же", # intensifier particle
  "вы", # you
  "за", # beyond, behind
  "бы", # conditional/subj. particle
  "по", # up to, along
  "только", # only
  "ее", # her
  "мне", # to me
  "было", # it was
  "вот", # here is/are, particle
  "от", # away from
  "меня", # me
  "еще", # still, yet, more
  "нет", # no, there isnt/arent
  "о", # about
  "из", # out of
  "ему", # to him
  "теперь", # now
  "когда", # when
  "даже", # even
  "ну", # so, well
  "вдруг", # suddenly
  "ли", # interrogative particle
  "если", # if
  "уже", # already, but homonym of `narrower'
  "или", # or
  "ни", # neither
  "быть", # to be
  "был", # he was
  "него", # prepositional form of его
  "до", # up to
  "вас", # you accusative
  "нибудь", # indef. suffix preceded by hyphen
  "опять", # again
  "уж", # already, but homonym of `adder'
  "вам", # to you
  "сказал", # he said
  "ведь", # particle `after all'
  "там", # there
  "потом", # then
  "себя", # oneself
  "ничего", # nothing
  "ей", # to her
  "может", # usually with `быть' as `maybe'
  "они", # they
  "тут", # here
  "где", # where
  "есть", # there is/are
  "надо", # got to, must
  "ней", # prepositional form of ей
  "для", # for
  "мы", # we
  "тебя", # thee
  "их", # them, their
  "чем", # than
  "была", # she was
  "сам", # self
  "чтоб", # in order to
  "без", # without
  "будто", # as if
  "человек", # man, person, one
  "чего", # genitive form of `what'
  "раз", # once
  "тоже", # also
  "себе", # to oneself
  "под", # beneath
  "жизнь", # life
  "будет", # will be
  "ж", # short form of intensifer particle `же'
  "тогда", # then
  "кто", # who
  "этот", # this
  "говорил", # was saying
  "того", # genitive form of `that'
  "потому", # for that reason
  "этого", # genitive form of `this'
  "какой", # which
  "совсем", # altogether
  "ним", # prepositional form of `его', `они'
  "здесь", # here
  "этом", # prepositional form of `этот'
  "один", # one
  "почти", # almost
  "мой", # my
  "тем", # instrumental/dative plural of `тот', `то'
  "чтобы", # full form of `in order that'
  "нее", # her (acc.)
  "кажется", # it seems
  "сейчас", # now
  "были", # they were
  "куда", # where to
  "зачем", # why
  "сказать", # to say
  "всех", # all (acc., gen. preposn. plural)
  "никогда", # never
  "сегодня", # today
  "можно", # possible, one can
  "при", # by
  "наконец", # finally
  "два", # two
  "об", # alternative form of `о', about
  "другой", # another
  "хоть", # even
  "после", # after
  "над", # above
  "больше", # more
  "тот", # that one (masc.)
  "через", # across, in
  "эти", # these
  "нас", # us
  "про", # about
  "всего", # in all, only, of all
  "них", # prepositional form of `они' (they)
  "какая", # which, feminine
  "много", # lots
  "разве", # interrogative particle
  "сказала", # she said
  "три", # three
  "эту", # this, acc. fem. sing.
  "моя", # my, feminine
  "впрочем", # moreover, besides
  "хорошо", # good
  "свою", # ones own, acc. fem. sing.
  "этой", # oblique form of `эта', fem. `this'
  "перед", # in front of
  "иногда", # sometimes
  "лучше", # better
  "чуть", # a little
  "том", # preposn. form of `that one'
  "нельзя", # one must not
  "такой", # such a one
  "им", # to them
  "более", # more
  "всегда", # always
  "конечно", # of course
  "всю", # acc. fem. sing of `all'
  "между", # between
]

# Maps the :language option to its stop-word list; unknown languages fall
# back to an empty list inside word_hash_for_words.
SKIP_WORDS = {
  'en' => EN_CORPUS_SKIP_WORDS,
  'ru' => RU_CORPUS_SKIP_WORDS
}
@@ -0,0 +1,134 @@
1
# Author:: Lucas Carlson (mailto:lucas@rufy.com)
# Copyright:: Copyright (c) 2005 Lucas Carlson
# License:: LGPL

module Classifier

  # Naive-Bayes text classifier built on Classifier::Base's tokenisation
  # and stemming.
  class Bayes < Classifier::Base

    # The class can be created with one or more categories, each of which will be
    # initialized and given a training method. E.g.,
    #      b = Classifier::Bayes.new :categories => ['Interesting', 'Uninteresting', 'Spam']
    # you can specify language and encoding parameters for stemmer
    # (default values - :language => 'en', :encoding => 'UTF_8')
    #      b = Classifier::Bayes.new :categories => ['Interesting', 'Uninteresting', 'Spam'], :language => 'ru'
    def initialize(options = {})
      # category Symbol => { stemmed word Symbol => count }
      @categories = Hash.new
      # NOTE(review): reverse_merge! is ActiveSupport and mutates `options`;
      # the same (mutated) hash then reaches Base#initialize via bare `super`.
      options.reverse_merge!(:categories => [])
      # Category names are normalised, e.g. 'the_other' -> :"The other".
      options[:categories].each { |category| @categories[prepare_category_name(category)] = Hash.new }
      # Running total of every trained word occurrence across all categories.
      @total_words = 0
      super
    end
22
+
23
#
# Provides a general training method for all categories specified in Bayes#new
# For example:
#     b = Classifier::Bayes.new 'This', 'That', 'the_other'
#     b.train :this, "This text"
#     b.train "that", "That text"
#     b.train "The other", "The other text"
def train(category, text)
  category = prepare_category_name(category)
  tally = @categories[category]
  word_hash(text).each do |word, freq|
    tally[word] = (tally[word] || 0) + freq
    @total_words += freq
  end
end
38
+
39
#
# Provides a untraining method for all categories specified in Bayes#new
# Be very careful with this method.
#
# For example:
#     b = Classifier::Bayes.new 'This', 'That', 'the_other'
#     b.train :this, "This text"
#     b.untrain :this, "This text"
#
# FIX(review): the original captured the previous count into `orig` *before*
# defaulting the entry, so untraining a word that had never been trained
# left `count = nil` and crashed on `@total_words -= nil`. Unknown words now
# contribute nothing, and a word can never subtract more occurrences than
# were actually recorded.
def untrain(category, text)
  category = prepare_category_name(category)
  word_hash(text).each do |word, count|
    next unless @total_words >= 0
    orig = @categories[category][word] || 0
    remaining = orig - count
    if remaining > 0
      @categories[category][word] = remaining
    else
      # Entry exhausted: drop it and only subtract what was stored.
      @categories[category].delete(word)
      count = orig
    end
    @total_words -= count
  end
end
62
+
63
#
# Returns the scores in each category the provided +text+. E.g.,
#    b.classifications "I hate bad words and you"
#    =>  {"Uninteresting"=>-12.6997928013932, "Interesting"=>-18.4206807439524}
# The largest of these scores (the one closest to 0) is the one picked out by #classify
def classifications(text)
  scores = Hash.new
  words = word_hash(text)
  @categories.each do |category, category_words|
    label = category.to_s
    scores[label] = 0
    total = category_words.values.sum
    words.each do |word, _count|
      # Unseen words get a small pseudo-count of 0.1.
      seen = category_words.fetch(word, 0.1)
      scores[label] += Math.log(seen / total.to_f)
    end
  end
  scores
end
80
+
81
#
# Returns the classification of the provided +text+, which is one of the
# categories given in the initializer. E.g.,
#    b.classify "I hate bad words and you"
#    =>  'Uninteresting'
def classify(text)
  ranked = classifications(text).sort_by { |_name, score| -score }
  ranked.first.first
end
89
+
90
#
# Provides training and untraining methods for the categories specified in Bayes#new
# For example:
#     b = Classifier::Bayes.new 'This', 'That', 'the_other'
#     b.train_this "This text"
#     b.train_that "That text"
#     b.untrain_that "That text"
#     b.train_the_other "The other text"
#
# FIX(review): the original dispatched via `eval("#{$1}train(...)")`, where
# $1 was a side effect of a *previous* gsub, and its unanchored regex meant
# any method name that merely resembled a category could silently train it.
# Replaced with an anchored match and direct train/untrain calls; anything
# that is not train_*/untrain_* falls through to `super` (NoMethodError).
def method_missing(name, *args)
  match = /\A(un)?train_([\w]+)\z/.match(name.to_s)
  if match
    category = prepare_category_name(match[2])
    raise StandardError, "No such category: #{category}" unless @categories.has_key?(category)
    args.each { |text| match[1] ? untrain(category, text) : train(category, text) }
  else
    super
  end
end

# Keep respond_to? consistent with the dynamic train_*/untrain_* handlers.
def respond_to_missing?(name, include_private = false)
  !!(name.to_s =~ /\A(un)?train_([\w]+)\z/) || super
end
108
+
109
#
# Provides a list of category names
# For example:
#     b.categories
#     =>  ['This', 'That', 'the_other']
def categories # :nodoc:
  @categories.keys.map { |name| name.to_s }
end
117
+
118
#
# Allows you to add categories to the classifier.
# For example:
#     b.add_category "Not spam"
#
# WARNING: Adding categories to a trained classifier will
# result in an undertrained category that will tend to match
# more criteria than the trained selective categories. In short,
# try to initialize your categories at initialization.
def add_category(category)
  @categories[prepare_category_name(category)] = {}
end

alias append_category add_category
@@ -0,0 +1,100 @@
1
+ # Author:: Ernest Ellingson
2
+ # Copyright:: Copyright (c) 2005
3
+
4
+ # These are extensions to the std-lib 'matrix' to allow an all ruby SVD
5
+
6
+ require 'matrix'
7
+ require 'mathn'
8
+
9
# Extensions to the stdlib Vector used by the pure-Ruby SVD path.
class Vector
  # Euclidean (L2) length of the vector.
  def magnitude
    Math.sqrt((0...size).reduce(0.0) { |acc, i| acc + self[i] ** 2.0 })
  end

  # Returns a new Vector scaled to unit length.
  def normalize
    mag = magnitude
    Vector[*(0...size).map { |i| self[i] / mag }]
  end
end
28
+
29
# Extensions to the stdlib Matrix enabling an all-Ruby SVD (used when the
# GSL bindings are not available).
class Matrix
  # Builds a diagonal matrix from an array of values.
  def Matrix.diag(s)
    Matrix.diagonal(*s)
  end

  alias :trans :transpose

  # Singular value decomposition via cyclic Jacobi rotations applied to the
  # Gram matrix (A'A when rows >= cols, else AA'). Returns [u, v, s] where
  # s is an Array of singular values.
  #
  # NOTE(review): iteration stops when the diagonal changes by <= 0.001
  # between sweeps or after maxSweeps sweeps — no stronger convergence
  # guarantee.
  def SV_decomp(maxSweeps = 20)
    # Use the smaller Gram matrix so q is square and symmetric.
    if self.row_size >= self.column_size
      q = self.trans * self
    else
      q = self * self.trans
    end

    qrot = q.dup
    v = Matrix.identity(q.row_size)
    azrot = nil # NOTE(review): assigned but never used — dead variable
    mzrot = nil
    cnt = 0
    s_old = nil
    mu = nil

    while true do
      cnt += 1
      for row in (0...qrot.row_size-1) do
        for col in (1..qrot.row_size-1) do
          next if row == col
          # Rotation angle chosen to zero qrot[row,col].
          # NOTE(review): divides by (qrot[row,row]-qrot[col,col]); equal
          # diagonal entries would divide by zero — presumably the inputs
          # avoid that case, verify.
          h = Math.atan((2 * qrot[row,col])/(qrot[row,row]-qrot[col,col]))/2.0
          hcos = Math.cos(h)
          hsin = Math.sin(h)
          mzrot = Matrix.identity(qrot.row_size)
          mzrot[row,row] = hcos
          mzrot[row,col] = -hsin
          mzrot[col,row] = hsin
          mzrot[col,col] = hcos
          qrot = mzrot.trans * qrot * mzrot
          v = v * mzrot
        end
      end
      s_old = qrot.dup if cnt == 1
      sum_qrot = 0.0
      if cnt > 1
        # Accumulate only the diagonal deltas still above the tolerance.
        qrot.row_size.times do |r|
          sum_qrot += (qrot[r,r]-s_old[r,r]).abs if (qrot[r,r]-s_old[r,r]).abs > 0.001
        end
        s_old = qrot.dup
      end
      break if (sum_qrot <= 0.001 and cnt > 1) or cnt >= maxSweeps
    end # of do while true
    # Singular values = square roots of the (converged) diagonal.
    s = []
    qrot.row_size.times do |r|
      s << Math.sqrt(qrot[r,r])
    end
    #puts "cnt = #{cnt}"
    if self.row_size >= self.column_size
      mu = self * v * Matrix.diagonal(*s).inverse
      return [mu, v, s]
    else
      # NOTE(review): the puts below look like leftover debug output —
      # candidates for removal.
      puts v.row_size
      puts v.column_size
      puts self.row_size
      puts self.column_size
      puts s.size

      mu = (self.trans * v * Matrix.diagonal(*s).inverse)
      return [mu, v, s]
    end
  end
  # Element assignment. Stdlib Matrix is immutable by design; this pokes
  # @rows directly so the Jacobi rotations above can be built in place.
  def []=(i,j,val)
    @rows[i][j] = val
  end
end
@@ -0,0 +1,20 @@
1
# Marshal support for GSL (GNU Scientific Library bindings) vectors, plus a
# Matrix.diag alias mirroring the pure-Ruby Matrix extension.
module GSL

  class Vector
    # Marshal hook: serialise the vector as a plain Ruby Array.
    # (+v+ is the recursion-depth argument Marshal passes to every _dump.)
    def _dump(v)
      Marshal.dump( self.to_a )
    end

    # Marshal hook: rebuild the GSL vector from the dumped Array.
    # NOTE(review): Marshal.load must only ever be fed trusted data.
    def self._load(arr)
      arry = Marshal.load(arr)
      return GSL::Vector.alloc(arry)
    end

  end

  class Matrix
    # Let GSL::Matrix answer to .diag like the stdlib Matrix extension does.
    class <<self
      alias :diag :diagonal
    end
  end
end
@@ -0,0 +1,72 @@
1
# Author:: David Fayram (mailto:dfayram@lensmen.net)
# Copyright:: Copyright (c) 2005 David Fayram II
# License:: LGPL

module Classifier

  # This is an internal data structure class for the LSI node. Save for
  # raw_vector_with, it should be fairly straightforward to understand.
  # You should never have to use it directly.
  class ContentNode
    attr_accessor :raw_vector, :raw_norm,
       :lsi_vector, :lsi_norm,
       :categories

    attr_reader :word_hash
    # If text_proc is not specified, the source will be duck-typed
    # via source.to_s
    def initialize( word_hash, *categories )
      # NOTE(review): the splat guarantees an Array, so `|| []` never fires.
      @categories = categories || []
      @word_hash = word_hash
    end

    # Use this to fetch the appropriate search vector.
    def search_vector
      @lsi_vector || @raw_vector
    end

    # Use this to fetch the appropriate search vector in normalized form.
    def search_norm
      @lsi_norm || @raw_norm
    end

    # Creates the raw vector out of word_hash using word_list as the
    # key for mapping the vector space.
    def raw_vector_with( word_list )
      # $GSL is a global toggle set elsewhere when the GSL bindings loaded.
      if $GSL
        vec = GSL::Vector.alloc(word_list.size)
      else
        vec = Array.new(word_list.size, 0)
      end

      # Words without a dimension in word_list are silently dropped.
      @word_hash.each_key do |word|
        vec[word_list[word]] = @word_hash[word] if word_list[word]
      end

      # Perform the scaling transform
      total_words = vec.sum.to_f

      # Perform first-order association transform if this vector has more
      # than one word in it.
      # NOTE(review): this looks like a log-entropy term weighting (log of
      # counts scaled by the entropy of the term distribution) — confirm
      # against the LSI literature before relying on the exact scheme.
      if total_words > 1.0
        weighted_total = 0.0
        vec.each do |term|
          if ( term > 0 )
            weighted_total += (( term / total_words ) * Math.log( term / total_words ))
          end
        end
        vec = vec.collect { |val| Math.log( val + 1 ) / -weighted_total }
      end

      # Store both the raw vector and its normalized form, using GSL types
      # when available and stdlib Vector otherwise.
      if $GSL
        @raw_norm = vec.normalize
        @raw_vector = vec
      else
        @raw_norm = Vector[*vec].normalize
        @raw_vector = Vector[*vec]
      end
    end

  end

end
@@ -0,0 +1,31 @@
1
# Author:: Lucas Carlson (mailto:lucas@rufy.com)
# Copyright:: Copyright (c) 2005 Lucas Carlson
# License:: LGPL

class String
  # Returns an LSI-based extract of the +count+ most representative
  # sentences, joined by +separator+.
  def summary( count=10, separator=" [...] " )
    perform_lsi split_sentences, count, separator
  end

  # Same as #summary but works on whole paragraphs.
  def paragraph_summary( count=1, separator=" [...] " )
    perform_lsi split_paragraphs, count, separator
  end

  # Splits on sentence-ending punctuation; the capture group keeps the
  # delimiters in the result array.
  def split_sentences
    split /(\.|\!|\?)/ # TODO: make this less primitive
  end

  # Splits on blank lines (Unix, old-Mac and Windows line endings).
  def split_paragraphs
    split /(\n\n|\r\r|\r\n\r\n)/ # TODO: make this less primitive
  end

  private

  # Feeds the non-trivial chunks into an LSI index and joins the +count+
  # most content-bearing chunks back together.
  #
  # FIX(review): the original ended with
  #   summaries.reject { |chunk| !summaries.include? chunk }
  # which filters a list against itself — a guaranteed no-op. Dropped.
  def perform_lsi(chunks, count, separator)
    lsi = Classifier::LSI.new :auto_rebuild => false
    # Skip empty chunks and single-word chunks — too little signal for LSI.
    chunks.each { |chunk| lsi << chunk unless chunk.strip.empty? || chunk.strip.split.size == 1 }
    lsi.build_index
    summaries = lsi.highest_relative_content count
    summaries.map { |chunk| chunk.strip }.join(separator)
  end
end
@@ -0,0 +1,36 @@
1
# Author:: David Fayram (mailto:dfayram@lensmen.net)
# Copyright:: Copyright (c) 2005 David Fayram II
# License:: LGPL

module Classifier
  # Maintains a word => dimension-index mapping, used to place stemmed
  # words into a fixed vector space.
  class WordList
    def initialize
      @location_table = {}
    end

    # Registers +word+ if unseen, assigning it the next free dimension.
    def add_word(word)
      @location_table[word] = @location_table.size unless @location_table.key?(word)
    end

    # Returns the dimension of +lookup+, or nil if it was never added.
    def [](lookup)
      @location_table[lookup]
    end

    # Reverse lookup: the word occupying dimension +ind+ (indices are
    # unique, so Hash#key is unambiguous here).
    def word_for_index(ind)
      @location_table.key(ind)
    end

    # Number of words (= dimensions) mapped so far.
    def size
      @location_table.size
    end
  end
end