francois-classifier 1.3.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,306 @@
1
+ # coding:utf-8
2
+ $KCODE = 'utf8'
3
+
4
+ module Classifier
5
+ class Base
6
+
7
+ def initialize(options = {})
8
+ options.reverse_merge!(:language => 'en')
9
+ options.reverse_merge!(:encoding => 'UTF_8')
10
+
11
+ @options = options
12
+ end
13
+
14
+ def prepare_category_name val
15
+ val.to_s.gsub("_"," ").capitalize.intern
16
+ end
17
+
18
+ # Removes common punctuation symbols, returning a new string.
19
+ # E.g.,
20
+ # "Hello (greeting's), with {braces} < >...?".without_punctuation
21
+ # => "Hello greetings with braces "
22
+ def without_punctuation str
23
+ str.tr( ',?.!;:"@#$%^&*()_=+[]{}\|<>/`~', " " ) .tr( "'\-", "")
24
+ end
25
+
26
+ # Return a Hash of strings => ints. Each word in the string is stemmed,
27
+ # interned, and mapped to its frequency in the document.
28
+ def word_hash str
29
+ word_hash_for_words(str.gsub(/[^\w\s]/,"").split + str.gsub(/[\w]/," ").split)
30
+ end
31
+
32
+ # Return a word hash without extra punctuation or short symbols, just stemmed words
33
+ def clean_word_hash str
34
+ word_hash_for_words str.gsub(/[^\w\s]/,"").split
35
+ end
36
+
37
+ private
38
+
39
+ def word_hash_for_words(words)
40
+ stemmer = Lingua::Stemmer.new(@options)
41
+ d = Hash.new
42
+ skip_words = SKIP_WORDS[@options[:language]] || []
43
+ words.each do |word|
44
+ word = word.mb_chars.downcase.to_s if word =~ /[\w]+/
45
+ key = stemmer.stem(word).intern
46
+ if word =~ /[^\w]/ || ! skip_words.include?(word) && word.length > 2
47
+ d[key] ||= 0
48
+ d[key] += 1
49
+ end
50
+ end
51
+ return d
52
+ end
53
+
54
+ EN_CORPUS_SKIP_WORDS = [
55
+ "a",
56
+ "again",
57
+ "all",
58
+ "along",
59
+ "are",
60
+ "also",
61
+ "an",
62
+ "and",
63
+ "as",
64
+ "at",
65
+ "but",
66
+ "by",
67
+ "came",
68
+ "can",
69
+ "cant",
70
+ "couldnt",
71
+ "did",
72
+ "didn",
73
+ "didnt",
74
+ "do",
75
+ "doesnt",
76
+ "dont",
77
+ "ever",
78
+ "first",
79
+ "from",
80
+ "have",
81
+ "her",
82
+ "here",
83
+ "him",
84
+ "how",
85
+ "i",
86
+ "if",
87
+ "in",
88
+ "into",
89
+ "is",
90
+ "isnt",
91
+ "it",
92
+ "itll",
93
+ "just",
94
+ "last",
95
+ "least",
96
+ "like",
97
+ "most",
98
+ "my",
99
+ "new",
100
+ "no",
101
+ "not",
102
+ "now",
103
+ "of",
104
+ "on",
105
+ "or",
106
+ "should",
107
+ "sinc",
108
+ "so",
109
+ "some",
110
+ "th",
111
+ "than",
112
+ "this",
113
+ "that",
114
+ "the",
115
+ "their",
116
+ "then",
117
+ "those",
118
+ "to",
119
+ "told",
120
+ "too",
121
+ "true",
122
+ "try",
123
+ "until",
124
+ "url",
125
+ "us",
126
+ "were",
127
+ "when",
128
+ "whether",
129
+ "while",
130
+ "with",
131
+ "within",
132
+ "yes",
133
+ "you",
134
+ "youll",
135
+ ]
136
+
137
+ # http://snowball.tartarus.org/algorithms/russian/stop.txt
138
+ RU_CORPUS_SKIP_WORDS = [
139
+ "и", # and
140
+ "в", # in/into
141
+ "во", # alternative form
142
+ "не", # not
143
+ "что", # what/that
144
+ "он", # he
145
+ "на", # on/onto
146
+ "я", # i
147
+ "с", # from
148
+ "со", # alternative form
149
+ "как", # how
150
+ "а", # milder form of `no' (but)
151
+ "то", # conjunction and form of `that'
152
+ "все", # all
153
+ "она", # she
154
+ "так", # so, thus
155
+ "его", # him
156
+ "но", # but
157
+ "да", # yes/and
158
+ "ты", # thou
159
+ "к", # towards, by
160
+ "у", # around, chez
161
+ "же", # intensifier particle
162
+ "вы", # you
163
+ "за", # beyond, behind
164
+ "бы", # conditional/subj. particle
165
+ "по", # up to, along
166
+ "только", # only
167
+ "ее", # her
168
+ "мне", # to me
169
+ "было", # it was
170
+ "вот", # here is/are, particle
171
+ "от", # away from
172
+ "меня", # me
173
+ "еще", # still, yet, more
174
+ "нет", # no, there isnt/arent
175
+ "о", # about
176
+ "из", # out of
177
+ "ему", # to him
178
+ "теперь", # now
179
+ "когда", # when
180
+ "даже", # even
181
+ "ну", # so, well
182
+ "вдруг", # suddenly
183
+ "ли", # interrogative particle
184
+ "если", # if
185
+ "уже", # already, but homonym of `narrower'
186
+ "или", # or
187
+ "ни", # neither
188
+ "быть", # to be
189
+ "был", # he was
190
+ "него", # prepositional form of его
191
+ "до", # up to
192
+ "вас", # you accusative
193
+ "нибудь", # indef. suffix preceded by hyphen
194
+ "опять", # again
195
+ "уж", # already, but homonym of `adder'
196
+ "вам", # to you
197
+ "сказал", # he said
198
+ "ведь", # particle `after all'
199
+ "там", # there
200
+ "потом", # then
201
+ "себя", # oneself
202
+ "ничего", # nothing
203
+ "ей", # to her
204
+ "может", # usually with `быть' as `maybe'
205
+ "они", # they
206
+ "тут", # here
207
+ "где", # where
208
+ "есть", # there is/are
209
+ "надо", # got to, must
210
+ "ней", # prepositional form of ей
211
+ "для", # for
212
+ "мы", # we
213
+ "тебя", # thee
214
+ "их", # them, their
215
+ "чем", # than
216
+ "была", # she was
217
+ "сам", # self
218
+ "чтоб", # in order to
219
+ "без", # without
220
+ "будто", # as if
221
+ "человек", # man, person, one
222
+ "чего", # genitive form of `what'
223
+ "раз", # once
224
+ "тоже", # also
225
+ "себе", # to oneself
226
+ "под", # beneath
227
+ "жизнь", # life
228
+ "будет", # will be
229
+ "ж", # short form of intensifer particle `же'
230
+ "тогда", # then
231
+ "кто", # who
232
+ "этот", # this
233
+ "говорил", # was saying
234
+ "того", # genitive form of `that'
235
+ "потому", # for that reason
236
+ "этого", # genitive form of `this'
237
+ "какой", # which
238
+ "совсем", # altogether
239
+ "ним", # prepositional form of `его', `они'
240
+ "здесь", # here
241
+ "этом", # prepositional form of `этот'
242
+ "один", # one
243
+ "почти", # almost
244
+ "мой", # my
245
+ "тем", # instrumental/dative plural of `тот', `то'
246
+ "чтобы", # full form of `in order that'
247
+ "нее", # her (acc.)
248
+ "кажется", # it seems
249
+ "сейчас", # now
250
+ "были", # they were
251
+ "куда", # where to
252
+ "зачем", # why
253
+ "сказать", # to say
254
+ "всех", # all (acc., gen. preposn. plural)
255
+ "никогда", # never
256
+ "сегодня", # today
257
+ "можно", # possible, one can
258
+ "при", # by
259
+ "наконец", # finally
260
+ "два", # two
261
+ "об", # alternative form of `о', about
262
+ "другой", # another
263
+ "хоть", # even
264
+ "после", # after
265
+ "над", # above
266
+ "больше", # more
267
+ "тот", # that one (masc.)
268
+ "через", # across, in
269
+ "эти", # these
270
+ "нас", # us
271
+ "про", # about
272
+ "всего", # in all, only, of all
273
+ "них", # prepositional form of `они' (they)
274
+ "какая", # which, feminine
275
+ "много", # lots
276
+ "разве", # interrogative particle
277
+ "сказала", # she said
278
+ "три", # three
279
+ "эту", # this, acc. fem. sing.
280
+ "моя", # my, feminine
281
+ "впрочем", # moreover, besides
282
+ "хорошо", # good
283
+ "свою", # ones own, acc. fem. sing.
284
+ "этой", # oblique form of `эта', fem. `this'
285
+ "перед", # in front of
286
+ "иногда", # sometimes
287
+ "лучше", # better
288
+ "чуть", # a little
289
+ "том", # preposn. form of `that one'
290
+ "нельзя", # one must not
291
+ "такой", # such a one
292
+ "им", # to them
293
+ "более", # more
294
+ "всегда", # always
295
+ "конечно", # of course
296
+ "всю", # acc. fem. sing of `all'
297
+ "между", # between
298
+ ]
299
+
300
+ SKIP_WORDS = {
301
+ 'en' => EN_CORPUS_SKIP_WORDS,
302
+ 'ru' => RU_CORPUS_SKIP_WORDS
303
+ }
304
+
305
+ end
306
+ end
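A minimal usage sketch for the tokenizer above (illustrative only: it assumes the gem is loaded via require 'classifier' and that the activesupport and ruby-stemmer gems it depends on are installed, since the code uses reverse_merge!, mb_chars and Lingua::Stemmer; exact stems depend on the stemmer version):

    require 'rubygems'
    require 'classifier'   # entry-point name is an assumption for this sketch

    base = Classifier::Base.new(:language => 'en')

    # Punctuation is stripped, each word is downcased and stemmed, stop words
    # such as "are" and "were" are dropped, and the rest are counted.
    base.clean_word_hash("Dogs are running, dogs were running!")
    # => {:dog=>2, :run=>2}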
@@ -0,0 +1,134 @@
1
+ # Author:: Lucas Carlson (mailto:lucas@rufy.com)
2
+ # Copyright:: Copyright (c) 2005 Lucas Carlson
3
+ # License:: LGPL
4
+
5
+ module Classifier
6
+
7
+ class Bayes < Classifier::Base
8
+
9
+ # The class can be created with one or more categories, each of which will be
10
+ # initialized and given a training method. E.g.,
11
+ # b = Classifier::Bayes.new :categories => ['Interesting', 'Uninteresting', 'Spam']
12
+ # You can specify language and encoding parameters for the stemmer
13
+ # (default values - :language => 'en', :encoding => 'UTF_8')
14
+ # b = Classifier::Bayes.new :categories => ['Interesting', 'Uninteresting', 'Spam'], :language => 'ru'
15
+ def initialize(options = {})
16
+ @categories = Hash.new
17
+ options.reverse_merge!(:categories => [])
18
+ options[:categories].each { |category| @categories[prepare_category_name(category)] = Hash.new }
19
+ @total_words = 0
20
+ super
21
+ end
22
+
23
+ #
24
+ # Provides a general training method for all categories specified in Bayes#new
25
+ # For example:
26
+ # b = Classifier::Bayes.new :categories => ['This', 'That', 'the_other']
27
+ # b.train :this, "This text"
28
+ # b.train "that", "That text"
29
+ # b.train "The other", "The other text"
30
+ def train(category, text)
31
+ category = prepare_category_name(category)
32
+ word_hash(text).each do |word, count|
33
+ @categories[category][word] ||= 0
34
+ @categories[category][word] += count
35
+ @total_words += count
36
+ end
37
+ end
38
+
39
+ #
40
+ # Provides an untraining method for all categories specified in Bayes#new
41
+ # Be very careful with this method.
42
+ #
43
+ # For example:
44
+ # b = Classifier::Bayes.new :categories => ['This', 'That', 'the_other']
45
+ # b.train :this, "This text"
46
+ # b.untrain :this, "This text"
47
+ def untrain(category, text)
48
+ category = prepare_category_name(category)
49
+ word_hash(text).each do |word, count|
50
+ if @total_words >= 0
51
+ orig = @categories[category][word]
52
+ @categories[category][word] ||= 0
53
+ @categories[category][word] -= count
54
+ if @categories[category][word] <= 0
55
+ @categories[category].delete(word)
56
+ count = orig
57
+ end
58
+ @total_words -= count
59
+ end
60
+ end
61
+ end
62
+
63
+ #
64
+ # Returns the scores in each category for the provided +text+. E.g.,
65
+ # b.classifications "I hate bad words and you"
66
+ # => {"Uninteresting"=>-12.6997928013932, "Interesting"=>-18.4206807439524}
67
+ # The largest of these scores (the one closest to 0) is the one picked out by #classify
68
+ def classifications(text)
69
+ score = Hash.new
70
+ @categories.each do |category, category_words|
71
+ score[category.to_s] = 0
72
+ total = category_words.values.sum
73
+ word_hash(text).each do |word, count|
74
+ s = category_words.has_key?(word) ? category_words[word] : 0.1
75
+ score[category.to_s] += Math.log(s/total.to_f)
76
+ end
77
+ end
78
+ return score
79
+ end
80
+
81
+ #
82
+ # Returns the classification of the provided +text+, which is one of the
83
+ # categories given in the initializer. E.g.,
84
+ # b.classify "I hate bad words and you"
85
+ # => 'Uninteresting'
86
+ def classify(text)
87
+ (classifications(text).sort_by { |a| -a[1] })[0][0]
88
+ end
89
+
90
+ #
91
+ # Provides training and untraining methods for the categories specified in Bayes#new
92
+ # For example:
93
+ # b = Classifier::Bayes.new :categories => ['This', 'That', 'the_other']
94
+ # b.train_this "This text"
95
+ # b.train_that "That text"
96
+ # b.untrain_that "That text"
97
+ # b.train_the_other "The other text"
98
+ def method_missing(name, *args)
99
+ category = prepare_category_name(name.to_s.gsub(/(un)?train_([\w]+)/, '\2'))
100
+ if @categories.has_key? category
101
+ args.each { |text| eval("#{$1}train(category, text)") }
102
+ elsif name.to_s =~ /(un)?train_([\w]+)/
103
+ raise StandardError, "No such category: #{category}"
104
+ else
105
+ super #raise StandardError, "No such method: #{name}"
106
+ end
107
+ end
108
+
109
+ #
110
+ # Provides a list of category names
111
+ # For example:
112
+ # b.categories
113
+ # => ['This', 'That', 'the_other']
114
+ def categories # :nodoc:
115
+ @categories.keys.collect {|c| c.to_s}
116
+ end
117
+
118
+ #
119
+ # Allows you to add categories to the classifier.
120
+ # For example:
121
+ # b.add_category "Not spam"
122
+ #
123
+ # WARNING: Adding categories to a trained classifier will
124
+ # result in an undertrained category that will tend to match
125
+ # more criteria than the trained selective categories. In short,
126
+ # define all of your categories when you create the classifier.
127
+ def add_category(category)
128
+ @categories[prepare_category_name(category)] = Hash.new
129
+ end
130
+
131
+ alias append_category add_category
132
+ end
133
+
134
+ end
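A usage sketch following the doc comments above (hedged: the category names, training text and require line are placeholders, and the scores returned by classifications are raw log values, not probabilities):

    require 'rubygems'
    require 'classifier'

    b = Classifier::Bayes.new :categories => ['Interesting', 'Uninteresting']
    b.train :interesting,   "Here are some good words. I hope you love them."
    b.train :uninteresting, "Here are some bad words, I hate you."

    b.classify "I hate bad words and you"          # => "Uninteresting"
    b.classifications "I hate bad words and you"   # => {"Uninteresting"=>..., "Interesting"=>...}

    # Dynamic trainers provided by method_missing:
    b.train_interesting   "Another pleasant sentence"
    b.untrain_interesting "Another pleasant sentence"

    # Russian stemming and stop words:
    ru = Classifier::Bayes.new :categories => ['Spam', 'Ham'], :language => 'ru'
    ru.train :spam, "вы выиграли приз"

Note that classify simply returns the highest-scoring category, so every category should be given at least some training text before classifying (see the WARNING above about adding categories late).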
@@ -0,0 +1,100 @@
1
+ # Author:: Ernest Ellingson
2
+ # Copyright:: Copyright (c) 2005
3
+
4
+ # These are extensions to the std-lib 'matrix' to allow an all-Ruby SVD
5
+
6
+ require 'matrix'
7
+ require 'mathn'
8
+
9
+ class Vector
10
+ def magnitude
11
+ sumsqs = 0.0
12
+ self.size.times do |i|
13
+ sumsqs += self[i] ** 2.0
14
+ end
15
+ Math.sqrt(sumsqs)
16
+ end
17
+ def normalize
18
+ nv = []
19
+ mag = self.magnitude
20
+ self.size.times do |i|
21
+
22
+ nv << (self[i] / mag)
23
+
24
+ end
25
+ Vector[*nv]
26
+ end
27
+ end
28
+
29
+ class Matrix
30
+ def Matrix.diag(s)
31
+ Matrix.diagonal(*s)
32
+ end
33
+
34
+ alias :trans :transpose
35
+
36
+ def SV_decomp(maxSweeps = 20)
37
+ if self.row_size >= self.column_size
38
+ q = self.trans * self
39
+ else
40
+ q = self * self.trans
41
+ end
42
+
43
+ qrot = q.dup
44
+ v = Matrix.identity(q.row_size)
45
+ azrot = nil
46
+ mzrot = nil
47
+ cnt = 0
48
+ s_old = nil
49
+ mu = nil
50
+
51
+ while true do
52
+ cnt += 1
53
+ for row in (0...qrot.row_size-1) do
54
+ for col in (1..qrot.row_size-1) do
55
+ next if row == col
56
+ h = Math.atan((2 * qrot[row,col])/(qrot[row,row]-qrot[col,col]))/2.0
57
+ hcos = Math.cos(h)
58
+ hsin = Math.sin(h)
59
+ mzrot = Matrix.identity(qrot.row_size)
60
+ mzrot[row,row] = hcos
61
+ mzrot[row,col] = -hsin
62
+ mzrot[col,row] = hsin
63
+ mzrot[col,col] = hcos
64
+ qrot = mzrot.trans * qrot * mzrot
65
+ v = v * mzrot
66
+ end
67
+ end
68
+ s_old = qrot.dup if cnt == 1
69
+ sum_qrot = 0.0
70
+ if cnt > 1
71
+ qrot.row_size.times do |r|
72
+ sum_qrot += (qrot[r,r]-s_old[r,r]).abs if (qrot[r,r]-s_old[r,r]).abs > 0.001
73
+ end
74
+ s_old = qrot.dup
75
+ end
76
+ break if (sum_qrot <= 0.001 and cnt > 1) or cnt >= maxSweeps
77
+ end # of do while true
78
+ s = []
79
+ qrot.row_size.times do |r|
80
+ s << Math.sqrt(qrot[r,r])
81
+ end
82
+ #puts "cnt = #{cnt}"
83
+ if self.row_size >= self.column_size
84
+ mu = self * v * Matrix.diagonal(*s).inverse
85
+ return [mu, v, s]
86
+ else
87
+ puts v.row_size
88
+ puts v.column_size
89
+ puts self.row_size
90
+ puts self.column_size
91
+ puts s.size
92
+
93
+ mu = (self.trans * v * Matrix.diagonal(*s).inverse)
94
+ return [mu, v, s]
95
+ end
96
+ end
97
+ def []=(i,j,val)
98
+ @rows[i][j] = val
99
+ end
100
+ end
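For orientation, a tiny sketch of the pure-Ruby SVD above (assumes the Ruby 1.8-era matrix/mathn behaviour this file targets; results are approximate because the Jacobi sweep stops at a 0.001 tolerance):

    require 'matrix'
    require 'mathn'
    # with the extensions in this file loaded

    m = Matrix[[2.0, 0.0], [0.0, 1.0], [0.0, 0.0]]   # 3x2, so row_size >= column_size
    u, v, s = m.SV_decomp

    s                               # => [2.0, 1.0]  (singular values)
    u * Matrix.diag(s) * v.trans    # ~= m, up to sign and tolerance

Since mathn has been deprecated and later dropped from the standard library, this file is tied to old Ruby versions.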
@@ -0,0 +1,20 @@
1
+ module GSL
2
+
3
+ class Vector
4
+ def _dump(v)
5
+ Marshal.dump( self.to_a )
6
+ end
7
+
8
+ def self._load(arr)
9
+ arry = Marshal.load(arr)
10
+ return GSL::Vector.alloc(arry)
11
+ end
12
+
13
+ end
14
+
15
+ class Matrix
16
+ class <<self
17
+ alias :diag :diagonal
18
+ end
19
+ end
20
+ end
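A short sketch of what this GSL patch enables, assuming the rb-gsl bindings are installed (GSL::Vector.alloc and the diagonal method aliased above come from those bindings):

    require 'gsl'

    v = GSL::Vector.alloc(1.0, 2.0, 3.0)
    restored = Marshal.load(Marshal.dump(v))   # round-trips through _dump/_load above
    restored.to_a                              # => [1.0, 2.0, 3.0]

The marshalling support is presumably there so that GSL-backed vectors can be dumped with Marshal just like the pure-Ruby ones.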
@@ -0,0 +1,73 @@
1
+ # Author:: David Fayram (mailto:dfayram@lensmen.net)
2
+ # Copyright:: Copyright (c) 2005 David Fayram II
3
+ # License:: LGPL
4
+
5
+ module Classifier
6
+
7
+ # This is an internal data structure class for the LSI node. Save for
8
+ # raw_vector_with, it should be fairly straightforward to understand.
9
+ # You should never have to use it directly.
10
+ class ContentNode
11
+ attr_accessor :raw_vector, :raw_norm,
12
+ :lsi_vector, :lsi_norm,
13
+ :categories
14
+
15
+ attr_reader :word_hash
16
+ # A node is constructed from a word hash (word => frequency) and the
17
+ # categories it belongs to.
18
+ def initialize( word_hash, *categories )
19
+ @categories = categories || []
20
+ @word_hash = word_hash
21
+ end
22
+
23
+ # Use this to fetch the appropriate search vector.
24
+ def search_vector
25
+ @lsi_vector || @raw_vector
26
+ end
27
+
28
+ # Use this to fetch the appropriate search vector in normalized form.
29
+ def search_norm
30
+ @lsi_norm || @raw_norm
31
+ end
32
+
33
+ # Creates the raw vector out of word_hash using word_list as the
34
+ # key for mapping the vector space.
35
+ def raw_vector_with( word_list )
36
+ if $GSL
37
+ vec = GSL::Vector.alloc(word_list.size)
38
+ else
39
+ vec = Array.new(word_list.size, 0)
40
+ end
41
+
42
+ @word_hash.each_key do |word|
43
+ vec[word_list[word]] = @word_hash[word] if word_list[word]
44
+ end
45
+
46
+ # Perform the scaling transform
47
+ total_words = vec.sum.to_f
48
+
49
+ # Perform first-order association transform if this vector has more
50
+ # than one word in it.
51
+ if total_words > 1.0
52
+ weighted_total = 0.0
53
+ vec.each do |term|
54
+ if ( term > 0 )
55
+ weighted_total += (( term / total_words ) * Math.log( term / total_words ))
56
+ end
57
+ end
58
+ weighted_total = -1.0 if weighted_total.zero? # if no word in list is known
59
+ vec = vec.collect { |val| Math.log( val + 1 ) / -weighted_total }
60
+ end
61
+
62
+ if $GSL
63
+ @raw_norm = vec.normalize
64
+ @raw_vector = vec
65
+ else
66
+ @raw_norm = Vector[*vec].normalize
67
+ @raw_vector = Vector[*vec]
68
+ end
69
+ end
70
+
71
+ end
72
+
73
+ end
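A sketch of how a ContentNode is typically built and queried (hedged: this uses the pure-Ruby vector path, i.e. without GSL, and assumes the gem is loaded via require 'classifier' so that the Vector#normalize extension above is available):

    require 'rubygems'
    require 'classifier'

    word_hash = { :dog => 2, :cat => 1 }          # e.g. output of Base#clean_word_hash
    list = Classifier::WordList.new
    word_hash.each_key { |w| list.add_word(w) }

    node = Classifier::ContentNode.new(word_hash, :pets)
    node.raw_vector_with(list)    # fills @raw_vector and @raw_norm

    node.search_vector            # => log/entropy-scaled term vector
    node.search_norm              # => the same vector scaled to unit length
    node.categories               # => [:pets]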
@@ -0,0 +1,31 @@
1
+ # Author:: Lucas Carlson (mailto:lucas@rufy.com)
2
+ # Copyright:: Copyright (c) 2005 Lucas Carlson
3
+ # License:: LGPL
4
+
5
+ class String
6
+ def summary( count=10, separator=" [...] " )
7
+ perform_lsi split_sentences, count, separator
8
+ end
9
+
10
+ def paragraph_summary( count=1, separator=" [...] " )
11
+ perform_lsi split_paragraphs, count, separator
12
+ end
13
+
14
+ def split_sentences
15
+ split /(\.|\!|\?)/ # TODO: make this less primitive
16
+ end
17
+
18
+ def split_paragraphs
19
+ split /(\n\n|\r\r|\r\n\r\n)/ # TODO: make this less primitive
20
+ end
21
+
22
+ private
23
+
24
+ def perform_lsi(chunks, count, separator)
25
+ lsi = Classifier::LSI.new :auto_rebuild => false
26
+ chunks.each { |chunk| lsi << chunk unless chunk.strip.empty? || chunk.strip.split.size == 1 }
27
+ lsi.build_index
28
+ summaries = lsi.highest_relative_content count
29
+ return summaries.reject { |chunk| !summaries.include? chunk }.map { |x| x.strip }.join(separator)
30
+ end
31
+ end
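The summarizer above leans on Classifier::LSI, which lives elsewhere in this gem and is not part of this diff; a hedged usage sketch (the sentences chosen depend on the LSI index, and indexing is slow without GSL):

    require 'rubygems'
    require 'classifier'

    text = "Ruby is a dynamic language. It focuses on programmer happiness. " +
           "Camels are large mammals. They live in deserts and store fat in their humps."

    text.summary(2)             # => the two most representative sentences, joined by " [...] "
    text.paragraph_summary(1)   # => the single most representative paragraph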
@@ -0,0 +1,36 @@
1
+ # Author:: David Fayram (mailto:dfayram@lensmen.net)
2
+ # Copyright:: Copyright (c) 2005 David Fayram II
3
+ # License:: LGPL
4
+
5
+ module Classifier
6
+ # This class keeps a word => index mapping. It is used to map stemmed words
7
+ # to dimensions of a vector.
8
+
9
+ class WordList
10
+ def initialize
11
+ @location_table = Hash.new
12
+ end
13
+
14
+ # Adds a word (if it is new) and assigns it a unique dimension.
15
+ def add_word(word)
16
+ term = word
17
+ @location_table[term] = @location_table.size unless @location_table[term]
18
+ end
19
+
20
+ # Returns the dimension of the word or nil if the word is not in the space.
21
+ def [](lookup)
22
+ term = lookup
23
+ @location_table[term]
24
+ end
25
+
26
+ def word_for_index(ind)
27
+ @location_table.invert[ind]
28
+ end
29
+
30
+ # Returns the number of words mapped.
31
+ def size
32
+ @location_table.size
33
+ end
34
+
35
+ end
36
+ end
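Finally, a small sketch of the WordList mapping on its own:

    require 'rubygems'
    require 'classifier'

    list = Classifier::WordList.new
    list.add_word(:dog)       # assigned dimension 0
    list.add_word(:cat)       # assigned dimension 1
    list.add_word(:dog)       # already present, mapping unchanged

    list[:cat]                # => 1
    list[:mouse]              # => nil (word not in the space)
    list.word_for_index(0)    # => :dog
    list.size                 # => 2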