classifier_atsukamoto 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: e6eca98ba96b5157ddcfef0ba3f02e129652c5ce
4
+ data.tar.gz: bc219f04544083e8a017b548ca2fede7a942fa45
5
+ SHA512:
6
+ metadata.gz: 1c78965de0ffd493b57ebf013deb1baf92c8e554f21157bb681dca1ae980edcd1d4a7fdafe601983270ecb0bbd86214a6b19ec3a1e9ebebc12789ebdbc1f0131
7
+ data.tar.gz: a9eb9c3ebac570198f25800f4c02677aea29ad8e9f152b41585242bd8667cf8b6de3812cf28ce68386c1090f5764fbc0c610972a0041342cb58bfb2fddccaf15
data/Rakefile ADDED
@@ -0,0 +1,97 @@
1
require 'rubygems'
require 'rake'
require 'rake/testtask'
require 'rake/rdoctask'
require 'rake/gempackagetask'
require 'rake/contrib/rubyforgepublisher'

PKG_VERSION = "0.0.1"

PKG_FILES = FileList[
    "lib/**/*", "bin/*", "test/**/*", "[A-Z]*", "Rakefile", "html/**/*"
]

desc "Default Task"
task :default => [ :test ]

# Run the unit tests
desc "Run all unit tests"
Rake::TestTask.new("test") { |t|
  t.libs << "lib"
  t.pattern = 'test/*/*_test.rb'
  t.verbose = true
}

# Make a console, useful when working on tests
desc "Generate a test console"
task :console do
  verbose( false ) { sh "irb -I lib/ -r 'classifier'" }
end

# Generate the RDoc documentation
desc "Create documentation"
Rake::RDocTask.new("doc") { |rdoc|
  # FIX: the title previously contained a stray embedded newline.
  rdoc.title = "Ruby Classifier Fork by ATsukamoto - Bayesian and LSI classification library with Redis for persistence"
  rdoc.rdoc_dir = 'html'
  rdoc.rdoc_files.include('README')
  rdoc.rdoc_files.include('lib/**/*.rb')
}

# Generate the gem package.
# NOTE(review): rake/gempackagetask, s.autorequire and s.has_rdoc are all
# deprecated in modern Rake/RubyGems; kept as-is for the legacy toolchain
# this Rakefile targets.
spec = Gem::Specification.new do |s|

  #### Basic information.

  # NOTE(review): the gem ships as "classifier_atsukamoto" but the spec
  # name is 'classifier' — confirm which name is intended.
  s.name = 'classifier'
  s.version = PKG_VERSION
  s.summary = <<-EOF
    A general classifier module to allow Bayesian and other types of classifications.
  EOF
  s.description = <<-EOF
    A general classifier module to allow Bayesian and other types of classifications.
  EOF

  #### Which files are to be included in this gem? Everything! (Except CVS directories.)

  s.files = PKG_FILES

  #### Load-time details: library and application (you will need one or both).

  s.require_path = 'lib'
  s.autorequire = 'classifier'

  #### Documentation and testing.

  s.has_rdoc = true

  #### Dependencies and requirements.

  s.add_dependency('fast-stemmer', '>= 1.0.0')
  s.requirements << "A porter-stemmer module to split word stems."

  #### Author and project details.
  s.author = "Lucas Carlson"
  s.email = "lucas@rufy.com"
  s.homepage = "http://classifier.rufy.com/"
end

Rake::GemPackageTask.new(spec) do |pkg|
  pkg.need_zip = true
  pkg.need_tar = true
end

desc "Report code statistics (KLOCs, etc) from the application"
task :stats do
  require 'code_statistics'
  CodeStatistics.new(
    ["Library", "lib"],
    ["Units", "test"]
  ).to_s
end

desc "Publish new documentation"
task :publish do
  `ssh rufy update-classifier-doc`
  Rake::RubyForgePublisher.new('classifier', 'cardmagic').upload
end
@@ -0,0 +1,156 @@
1
# Author:: Lucas Carlson (mailto:lucas@rufy.com)
# Copyright:: Copyright (c) 2005 Lucas Carlson
# License:: LGPL

module Classifier

require 'lingua/stemmer'

  # Bayesian classifier whose word counts are persisted in Redis via
  # Classifier::RedisStore rather than in-memory hashes.
  class Bayes
    # The class is created with a stemmer language plus one or more
    # categories, e.g.
    #      b = Classifier::Bayes.new 'en', 'Interesting', 'Uninteresting', 'Spam'
    def initialize(lang, *categories)
      @categories = RedisStore.new lang, categories
      # NOTE(review): init_total resets the persisted grand total to 0 on
      # every startup — confirm this is intended for a persistent store.
      @categories.init_total
      @stemmer = Lingua::Stemmer.new(:language => lang.downcase)
    end

    #
    # Provides a general training method for all categories specified in Bayes#new
    # For example:
    #      b = Classifier::Bayes.new 'en', 'This', 'That', 'the_other'
    #      b.train :this, "This text"
    #      b.train "that", "That text"
    #      b.train "The other", "The other text"
    def train(category, text)
      category = category.prepare_category_name
      text.word_hash(@stemmer).each do |word, count|
        @categories.init(category, word)            # ensure the key exists
        @categories.incr(category, word, count)     # per-category word count
        @categories.incr_total(count)               # global word count
      end
    end

    #
    # Provides an untraining method for all categories specified in Bayes#new.
    # Be very careful with this method.
    #
    # For example:
    #      b = Classifier::Bayes.new 'en', 'This', 'That', 'the_other'
    #      b.train :this, "This text"
    #      b.untrain :this, "This text"
    def untrain(category, text)
      category = category.prepare_category_name
      text.word_hash(@stemmer).each do |word, count|
        if @categories.total_words >= 0
          orig = @categories.get(category, word)    # may be nil when word unseen
          @categories.init(category, word)
          @categories.decr(category, word, count)
          if @categories.get(category, word) <= 0
            @categories.remove(category, word)
            # Charge the total only for what was actually stored.
            # NOTE(review): orig can be nil here, making decr_total a no-op
            # (nil.to_i == 0) — preserved from the original implementation.
            count = orig
          end
          @categories.decr_total(count)
        end
      end
    end

    #
    # Returns the scores in each category for the provided +text+. E.g.,
    #      b.classifications "I hate bad words and you"
    #      =>  {"Uninteresting"=>-12.6997928013932, "Interesting"=>-18.4206807439524}
    # The largest of these scores (the one closest to 0) is the one picked out by #classify
    def classifications(text)
      score = Hash.new
      # FIX: the original iterated @categories.names with a two-argument
      # block, which left category_words nil and crashed on #inject.
      # RedisStore#each yields both the category and its stored counts.
      @categories.each do |category, category_words|
        score[category.to_s] = 0
        total = category_words.inject(0) { |sum, element| sum + element }
        text.word_hash(@stemmer).each do |word, count|
          # Unseen words get a small pseudo-count of 0.1.
          s = @categories.has_word?(category, word) ? @categories.get(category, word) : 0.1
          score[category.to_s] += Math.log(s / total.to_f)
        end
      end
      return score
    end

    #
    # Returns the classification of the provided +text+, which is one of the
    # categories given in the initializer. E.g.,
    #      b.classify "I hate bad words and you"
    #      =>  'Uninteresting'
    def classify(text)
      (classifications(text).sort_by { |a| -a[1] })[0][0]
    end

    #
    # Provides dynamic training and untraining methods for the categories
    # specified in Bayes#new. For example:
    #      b.train_this "This text"
    #      b.untrain_that "That text"
    def method_missing(name, *args)
      if name.to_s =~ /(un)?train_([\w]+)/
        action = $1 ? :untrain : :train
        category = name.to_s.gsub(/(un)?train_([\w]+)/, '\2').prepare_category_name
        if @categories.names.include? category
          # FIX: replaced string eval of "#{$1}train(category, text)" with
          # an explicit send.
          args.each { |text| send(action, category, text) }
        else
          raise StandardError, "No such category: #{category}"
        end
      else
        super
      end
    end

    # Pair for method_missing so respond_to? reports the dynamic helpers.
    def respond_to_missing?(name, include_private = false)
      name.to_s =~ /(un)?train_([\w]+)/ ? true : super
    end

    #
    # Provides the backing category store.
    # For example:
    #      b.categories.names
    #      =>  [:This, :That, :"The other"]
    def categories # :nodoc:
      @categories
    end

    #
    # Allows you to add categories to the classifier.
    # For example:
    #      b.add_category "Not spam"
    #
    # WARNING: Adding categories to a trained classifier will
    # result in an undertrained category that will tend to match
    # more criteria than the trained selective categories. In short,
    # try to initialize your categories at initialization.
    def add_category(category)
      # FIXME(review): RedisStore does not appear to define []=; this line
      # is left over from the Hash-backed implementation and likely raises
      # NoMethodError. Left unchanged pending a RedisStore API for it.
      @categories[category.prepare_category_name] = Hash.new
    end

    alias append_category add_category
  end

end
@@ -0,0 +1,10 @@
1
+ # Author:: Lucas Carlson (mailto:lucas@rufy.com)
2
+ # Copyright:: Copyright (c) 2005 Lucas Carlson
3
+ # License:: LGPL
4
+
5
+ # require 'fast_stemmer'
6
+ require 'classifier/extensions/word_hash'
7
+
8
# Canonical category naming for any object: underscores become spaces
# and the result is capitalized and interned, e.g.
#   "the_other".prepare_category_name #=> :"The other"
class Object
  def prepare_category_name
    to_s.tr("_", " ").capitalize.to_sym
  end
end
@@ -0,0 +1,113 @@
1
+ # Author:: Ernest Ellingson
2
+ # Copyright:: Copyright (c) 2005
3
+
4
+ # These are extensions to the std-lib 'matrix' to allow an all ruby SVD
5
+
6
+ require 'matrix'
7
+ require 'mathn'
8
+
9
class Array
  # Sums the array (helper for the all-Ruby SVD code). With a block, each
  # element is transformed first and the block results are summed as-is;
  # without a block the plain sum is returned as a Float. An empty array
  # yields +identity+.
  # TODO! Change name!
  def a_sum(identity = 0, &block)
    return identity if empty?

    if block
      map(&block).sum
    else
      reduce { |acc, el| acc + el }.to_f
    end
  end
end
21
+
22
class Vector
  # Euclidean (L2) length of the vector.
  def magnitude
    Math.sqrt((0...size).reduce(0.0) { |acc, i| acc + self[i] ** 2.0 })
  end

  # Returns a new Vector scaled to unit length.
  def normalize
    mag = magnitude
    Vector[*(0...size).map { |i| self[i] / mag }]
  end
end
41
+
42
class Matrix
  # Builds a diagonal matrix from an array of values.
  def Matrix.diag(s)
    Matrix.diagonal(*s)
  end

  alias :trans :transpose

  # All-Ruby singular value decomposition via cyclic Jacobi rotations on
  # the Gram matrix. Returns [u, v, s] where s is an Array of singular
  # values. Iterates until the diagonal stops drifting (tolerance 0.001)
  # or maxSweeps sweeps have run.
  def SV_decomp(maxSweeps = 20)
    q = row_size >= column_size ? trans * self : self * trans

    qrot = q.dup
    v = Matrix.identity(q.row_size)
    sweep = 0
    s_old = nil

    loop do
      sweep += 1
      (0...qrot.row_size - 1).each do |row|
        (1..qrot.row_size - 1).each do |col|
          next if row == col
          # Jacobi rotation angle annihilating qrot[row, col].
          angle = Math.atan((2 * qrot[row, col]) / (qrot[row, row] - qrot[col, col])) / 2.0
          c = Math.cos(angle)
          sn = Math.sin(angle)
          rot = Matrix.identity(qrot.row_size)
          rot[row, row] = c
          rot[row, col] = -sn
          rot[col, row] = sn
          rot[col, col] = c
          qrot = rot.trans * qrot * rot
          v *= rot
        end
      end

      s_old = qrot.dup if sweep == 1
      drift = 0.0
      if sweep > 1
        # Accumulate how much the diagonal moved since the last sweep.
        qrot.row_size.times do |r|
          delta = (qrot[r, r] - s_old[r, r]).abs
          drift += delta if delta > 0.001
        end
        s_old = qrot.dup
      end
      break if (drift <= 0.001 && sweep > 1) || sweep >= maxSweeps
    end

    # Singular values are the square roots of the converged diagonal.
    s = (0...qrot.row_size).map { |r| Math.sqrt(qrot[r, r]) }

    if row_size >= column_size
      [self * v * Matrix.diagonal(*s).inverse, v, s]
    else
      # Debug output preserved from the original implementation.
      puts v.row_size
      puts v.column_size
      puts self.row_size
      puts self.column_size
      puts s.size
      [trans * v * Matrix.diagonal(*s).inverse, v, s]
    end
  end

  # Element assignment (reaches into the stdlib Matrix internals).
  def []=(i, j, val)
    @rows[i][j] = val
  end
end
@@ -0,0 +1,20 @@
1
module GSL

  class Vector
    # Custom Marshal support: a GSL vector serializes as a plain Array.
    def _dump(depth)
      Marshal.dump(to_a)
    end

    def self._load(str)
      GSL::Vector.alloc(Marshal.load(str))
    end
  end

  class Matrix
    # Give GSL::Matrix the same .diag shorthand the stdlib Matrix
    # extension defines, so LSI code can use either backend.
    class << self
      alias :diag :diagonal
    end
  end
end
@@ -0,0 +1,129 @@
1
+ # Author:: Lucas Carlson (mailto:lucas@rufy.com)
2
+ # Copyright:: Copyright (c) 2005 Lucas Carlson
3
+ # License:: LGPL
4
+
5
+ # These are extensions to the String class to provide convenience
6
+ # methods for the Classifier package.
7
+ require 'lingua/stemmer'
8
+
9
class String

  # Removes common punctuation symbols, returning a new string.
  # Apostrophes and hyphens are deleted outright (not replaced with
  # spaces), so "it's" becomes "its". E.g.,
  #      "Hello (greeting's), with {braces} < >...?".without_punctuation
  def without_punctuation
    tr( ',?.!;:"@#$%^&*()_=+[]{}\|<>/`~', " " ) .tr( "'\-", "")
  end

  # Return a Hash of Symbol => Integer frequency counts for this document.
  # Each word is stemmed with +stemmer+ (any object responding to #stem);
  # punctuation-only tokens are counted too.
  def word_hash(stemmer)
    word_hash_for_words(gsub(/[^\w\s]/,"").split + gsub(/[\w]/," ").split, stemmer)
  end

  # Return a word hash without extra punctuation or short symbols.
  # FIX: the original called word_hash_for_words without its required
  # stemmer argument and raised ArgumentError on every call. A stemmer
  # may now be passed; without one, words are interned unstemmed.
  def clean_word_hash(stemmer = nil)
    word_hash_for_words gsub(/[^\w\s]/,"").split, stemmer
  end

  private

  # Tallies +words+ into a Hash of Symbol => Integer. A word is counted
  # when it contains non-word characters, or when it is longer than two
  # characters and not in CORPUS_SKIP_WORDS. With a nil +stemmer+ the
  # downcased word itself becomes the key.
  def word_hash_for_words(words, stemmer = nil)
    d = Hash.new
    words.each do |word|
      word.downcase! if word =~ /[\w]+/
      key = (stemmer ? stemmer.stem(word) : word).intern
      if word =~ /[^\w]/ || ! CORPUS_SKIP_WORDS.include?(word) && word.length > 2
        d[key] ||= 0
        d[key] += 1
      end
    end
    return d
  end

  # Common English stop words excluded from the hash.
  # TODO! Actualize for each language
  CORPUS_SKIP_WORDS = [
      "a",
      "again",
      "all",
      "along",
      "are",
      "also",
      "an",
      "and",
      "as",
      "at",
      "but",
      "by",
      "came",
      "can",
      "cant",
      "couldnt",
      "did",
      "didn",
      "didnt",
      "do",
      "doesnt",
      "dont",
      "ever",
      "first",
      "from",
      "have",
      "her",
      "here",
      "him",
      "how",
      "i",
      "if",
      "in",
      "into",
      "is",
      "isnt",
      "it",
      "itll",
      "just",
      "last",
      "least",
      "like",
      "most",
      "my",
      "new",
      "no",
      "not",
      "now",
      "of",
      "on",
      "or",
      "should",
      "sinc",
      "so",
      "some",
      "th",
      "than",
      "this",
      "that",
      "the",
      "their",
      "then",
      "those",
      "to",
      "told",
      "too",
      "true",
      "try",
      "until",
      "url",
      "us",
      "were",
      "when",
      "whether",
      "while",
      "with",
      "within",
      "yes",
      "you",
      "youll",
      ].freeze
end
@@ -0,0 +1,72 @@
1
+ # Author:: David Fayram (mailto:dfayram@lensmen.net)
2
+ # Copyright:: Copyright (c) 2005 David Fayram II
3
+ # License:: LGPL
4
+
5
module Classifier

  # Internal data structure for the LSI index: a document's word
  # frequencies plus the raw and LSI-projected vectors computed for it.
  # You should never have to use it directly.
  class ContentNode
    attr_accessor :raw_vector, :raw_norm,
                  :lsi_vector, :lsi_norm,
                  :categories

    attr_reader :word_hash

    # word_hash:: Hash of word => frequency for the source document.
    # categories:: zero or more category labels for the document.
    def initialize( word_hash, *categories )
      @categories = categories || []
      @word_hash = word_hash
    end

    # The vector used for searching: the LSI projection once the index
    # has been built, the raw frequency vector until then.
    def search_vector
      @lsi_vector || @raw_vector
    end

    # Normalized counterpart of #search_vector.
    def search_norm
      @lsi_norm || @raw_norm
    end

    # Builds @raw_vector and @raw_norm from @word_hash, using +word_list+
    # to map each word to its vector dimension.
    def raw_vector_with( word_list )
      vec =
        if $GSL
          GSL::Vector.alloc(word_list.size)
        else
          Array.new(word_list.size, 0)
        end

      @word_hash.each_key do |word|
        dim = word_list[word]
        vec[dim] = @word_hash[word] if dim
      end

      # Scaling transform: total mass of the vector.
      total_words = vec.a_sum

      # First-order association transform, applied only when the vector
      # holds more than one word.
      if total_words > 1.0
        weighted_total = 0.0
        vec.each do |term|
          if term > 0
            weighted_total += (( term / total_words ) * Math.log( term / total_words ))
          end
        end
        vec = vec.collect { |val| Math.log( val + 1 ) / -weighted_total }
      end

      if $GSL
        @raw_norm = vec.normalize
        @raw_vector = vec
      else
        @raw_norm = Vector[*vec].normalize
        @raw_vector = Vector[*vec]
      end
    end

  end

end
@@ -0,0 +1,31 @@
1
+ # Author:: Lucas Carlson (mailto:lucas@rufy.com)
2
+ # Copyright:: Copyright (c) 2005 Lucas Carlson
3
+ # License:: LGPL
4
+
5
class String
  # Returns a summary of the text: the +count+ most semantically dense
  # sentences, joined by +separator+.
  def summary( count=10, separator=" [...] " )
    perform_lsi split_sentences, count, separator
  end

  # Like #summary, but summarizes whole paragraphs.
  def paragraph_summary( count=1, separator=" [...] " )
    perform_lsi split_paragraphs, count, separator
  end

  # Splits on sentence-ending punctuation. The capture group keeps the
  # separators in the returned array.
  def split_sentences
    split /(\.|\!|\?)/ # TODO: make this less primitive
  end

  # Splits on blank lines (LF, CR, or CRLF conventions).
  def split_paragraphs
    split /(\n\n|\r\r|\r\n\r\n)/ # TODO: make this less primitive
  end

  private

  # Indexes the chunks with LSI and joins the highest-ranked ones.
  def perform_lsi(chunks, count, separator)
    lsi = Classifier::LSI.new :auto_rebuild => false
    chunks.each { |chunk| lsi << chunk unless chunk.strip.empty? || chunk.strip.split.size == 1 }
    lsi.build_index
    summaries = lsi.highest_relative_content count
    # FIX: the original rejected non-members of +summaries+ from
    # +summaries+ itself, which filters nothing; filtering +chunks+
    # keeps the selected text in document order.
    return chunks.reject { |chunk| !summaries.include? chunk }.map { |x| x.strip }.join(separator)
  end
end
@@ -0,0 +1,36 @@
1
+ # Author:: David Fayram (mailto:dfayram@lensmen.net)
2
+ # Copyright:: Copyright (c) 2005 David Fayram II
3
+ # License:: LGPL
4
+
5
module Classifier
  # Keeps a word => index mapping, used to assign stemmed words to
  # dimensions of a vector.

  class WordList
    def initialize
      @location_table = {}
    end

    # Registers +word+, assigning it the next free dimension if unseen.
    def add_word(word)
      @location_table[word] = @location_table.size unless @location_table.key?(word)
    end

    # Dimension of +lookup+, or nil when the word is not in the space.
    def [](lookup)
      @location_table[lookup]
    end

    # Reverse lookup: the word occupying dimension +ind+, or nil.
    def word_for_index(ind)
      @location_table.key(ind)
    end

    # Number of words mapped.
    def size
      @location_table.size
    end
  end
end
@@ -0,0 +1,318 @@
1
+ # Author:: David Fayram (mailto:dfayram@lensmen.net)
2
+ # Copyright:: Copyright (c) 2005 David Fayram II
3
+ # License:: LGPL
4
+
5
+ begin
6
+ raise LoadError if ENV['NATIVE_VECTOR'] == "true" # to test the native vector class, try `rake test NATIVE_VECTOR=true`
7
+
8
+ require 'gsl' # requires http://rb-gsl.rubyforge.org/
9
+ require 'classifier/extensions/vector_serialize'
10
+ $GSL = true
11
+
12
+ rescue LoadError
13
+ warn "Notice: for 10x faster LSI support, please install http://rb-gsl.rubyforge.org/"
14
+ require 'classifier/extensions/vector'
15
+ end
16
+
17
+ require 'classifier/lsi/word_list'
18
+ require 'classifier/lsi/content_node'
19
+ require 'classifier/lsi/summary'
20
+
21
module Classifier

  # This class implements a Latent Semantic Indexer, which can search, classify and cluster
  # data based on underlying semantic relations. For more information on the algorithms used,
  # please consult Wikipedia[http://en.wikipedia.org/wiki/Latent_Semantic_Indexing].
  class LSI

    attr_reader :word_list
    attr_accessor :auto_rebuild

    # Create a fresh index.
    # If you want to call #build_index manually, use
    #      Classifier::LSI.new :auto_rebuild => false
    #
    def initialize(options = {})
      @auto_rebuild = true unless options[:auto_rebuild] == false
      @word_list, @items = WordList.new, {}
      @version, @built_at_version = 0, -1
    end

    # Returns true if the index needs to be rebuilt. The index needs
    # to be built after all informaton is added, but before you start
    # using it for search, classification and cluster detection.
    def needs_rebuild?
      (@items.keys.size > 1) && (@version != @built_at_version)
    end

    # Adds an item to the index. item is assumed to be a string, but
    # any item may be indexed so long as it responds to #to_s or if
    # you provide an optional block explaining how the indexer can
    # fetch fresh string data. This optional block is passed the item,
    # so the item may only be a reference to a URL or file name.
    #
    # For example:
    #      lsi = Classifier::LSI.new
    #      lsi.add_item "This is just plain text"
    #      lsi.add_item "/home/me/filename.txt" { |x| File.read x }
    #      ar = ActiveRecordObject.find( :all )
    #      lsi.add_item ar, *ar.categories { |x| ar.content }
    #
    def add_item( item, *categories, &block )
      clean_word_hash = block ? block.call(item).clean_word_hash : item.to_s.clean_word_hash
      @items[item] = ContentNode.new(clean_word_hash, *categories)
      @version += 1
      build_index if @auto_rebuild
    end

    # A less flexible shorthand for add_item that assumes
    # you are passing in a string with no categories. item
    # will be duck typed via to_s .
    #
    def <<( item )
      add_item item
    end

    # Returns the categories for a given indexed item. You are free to add and remove
    # items from this as you see fit. It does not invalidate an index to change its categories.
    def categories_for(item)
      return [] unless @items[item]
      return @items[item].categories
    end

    # Removes an item from the database, if it is indexed.
    # FIX: the original called Array#contain? and Hash#remove, neither of
    # which exists; use Hash#key? and Hash#delete.
    def remove_item( item )
      if @items.key? item
        @items.delete item
        @version += 1
      end
    end

    # Returns an array of items that are indexed.
    def items
      @items.keys
    end

    # This function rebuilds the index if needs_rebuild? returns true.
    # For very large document spaces, this indexing operation may take some
    # time to complete, so it may be wise to place the operation in another
    # thread.
    #
    # As a rule, indexing will be fairly swift on modern machines until
    # you have well over 500 documents indexed, or have an incredibly diverse
    # vocabulary for your documents.
    #
    # The optional parameter "cutoff" is a tuning parameter. When the index is
    # built, a certain number of s-values are discarded from the system. The
    # cutoff parameter tells the indexer how many of these values to keep.
    # A value of 1 for cutoff means that no semantic analysis will take place,
    # turning the LSI class into a simple vector search engine.
    def build_index( cutoff=0.75 )
      return unless needs_rebuild?
      make_word_list

      doc_list = @items.values
      tda = doc_list.collect { |node| node.raw_vector_with( @word_list ) }

      if $GSL
        tdm = GSL::Matrix.alloc(*tda).trans
        ntdm = build_reduced_matrix(tdm, cutoff)

        ntdm.size[1].times do |col|
          vec = GSL::Vector.alloc( ntdm.column(col) ).row
          doc_list[col].lsi_vector = vec
          doc_list[col].lsi_norm = vec.normalize
        end
      else
        tdm = Matrix.rows(tda).trans
        ntdm = build_reduced_matrix(tdm, cutoff)

        ntdm.row_size.times do |col|
          doc_list[col].lsi_vector = ntdm.column(col) if doc_list[col]
          doc_list[col].lsi_norm = ntdm.column(col).normalize if doc_list[col]
        end
      end

      @built_at_version = @version
    end

    # This method returns max_chunks entries, ordered by their average semantic rating.
    # Essentially, the average distance of each entry from all other entries is calculated,
    # the highest are returned.
    #
    # This can be used to build a summary service, or to provide more information about
    # your dataset's general content. For example, if you were to use categorize on the
    # results of this data, you could gather information on what your dataset is generally
    # about.
    def highest_relative_content( max_chunks=10 )
      return [] if needs_rebuild?

      avg_density = Hash.new
      @items.each_key { |item| avg_density[item] = proximity_array_for_content(item).inject(0.0) { |sum, pair| sum + pair[1] } }

      # FIX: the original chain ended in a blockless .map, which returned
      # an Enumerator rather than the Array of items.
      avg_density.keys.sort_by { |item| avg_density[item] }.reverse[0..max_chunks-1]
    end

    # This function is the primitive that find_related and classify
    # build upon. It returns an array of 2-element arrays. The first element
    # of this array is a document, and the second is its "score", defining
    # how "close" it is to other indexed items.
    #
    # These values are somewhat arbitrary, having to do with the vector space
    # created by your content, so the magnitude is interpretable but not always
    # meaningful between indexes.
    #
    # The parameter doc is the content to compare. If that content is not
    # indexed, you can pass an optional block to define how to create the
    # text data. See add_item for examples of how this works.
    def proximity_array_for_content( doc, &block )
      return [] if needs_rebuild?

      content_node = node_for_content( doc, &block )
      result =
        @items.keys.collect do |item|
          if $GSL
            val = content_node.search_vector * @items[item].search_vector.col
          else
            val = (Matrix[content_node.search_vector] * @items[item].search_vector)[0]
          end
          [item, val]
        end
      result.sort_by { |pair| pair[1] }.reverse
    end

    # Similar to proximity_array_for_content, this function takes similar
    # arguments and returns a similar array. However, it uses the normalized
    # calculated vectors instead of their full versions. This is useful when
    # you're trying to perform operations on content that is much smaller than
    # the text you're working with. search uses this primitive.
    def proximity_norms_for_content( doc, &block )
      return [] if needs_rebuild?

      content_node = node_for_content( doc, &block )
      result =
        @items.keys.collect do |item|
          if $GSL
            val = content_node.search_norm * @items[item].search_norm.col
          else
            val = (Matrix[content_node.search_norm] * @items[item].search_norm)[0]
          end
          [item, val]
        end
      result.sort_by { |pair| pair[1] }.reverse
    end

    # This function allows for text-based search of your index. Unlike other functions
    # like find_related and classify, search only takes short strings. It will also ignore
    # factors like repeated words. It is best for short, google-like search terms.
    # A search will first priortize lexical relationships, then semantic ones.
    #
    # While this may seem backwards compared to the other functions that LSI supports,
    # it is actually the same algorithm, just applied on a smaller document.
    def search( string, max_nearest=3 )
      return [] if needs_rebuild?
      carry = proximity_norms_for_content( string )
      result = carry.collect { |pair| pair[0] }
      return result[0..max_nearest-1]
    end

    # This function takes content and finds other documents
    # that are semantically "close", returning an array of documents sorted
    # from most to least relavant.
    # max_nearest specifies the number of documents to return. A value of
    # 0 means that it returns all the indexed documents, sorted by relavence.
    #
    # This is particularly useful for identifing clusters in your document space.
    # For example you may want to identify several "What's Related" items for weblog
    # articles, or find paragraphs that relate to each other in an essay.
    def find_related( doc, max_nearest=3, &block )
      carry =
        proximity_array_for_content( doc, &block ).reject { |pair| pair[0] == doc }
      result = carry.collect { |pair| pair[0] }
      return result[0..max_nearest-1]
    end

    # This function uses a voting system to categorize documents, based on
    # the categories of other documents. It uses the same logic as the
    # find_related function to find related documents, then returns the
    # most obvious category from this list.
    #
    # cutoff signifies the number of documents to consider when clasifying
    # text. A cutoff of 1 means that every document in the index votes on
    # what category the document is in. This may not always make sense.
    #
    def classify( doc, cutoff=0.30, &block )
      icutoff = (@items.size * cutoff).round
      carry = proximity_array_for_content( doc, &block )
      carry = carry[0..icutoff-1]
      votes = {}
      carry.each do |pair|
        categories = @items[pair[0]].categories
        categories.each do |category|
          votes[category] ||= 0.0
          votes[category] += pair[1]
        end
      end

      ranking = votes.keys.sort_by { |x| votes[x] }
      return ranking[-1]
    end

    # Prototype, only works on indexed documents.
    # I have no clue if this is going to work, but in theory
    # it's supposed to.
    def highest_ranked_stems( doc, count=3 )
      raise "Requested stem ranking on non-indexed content!" unless @items[doc]
      arr = node_for_content(doc).lsi_vector.to_a
      top_n = arr.sort.reverse[0..count-1]
      return top_n.collect { |x| @word_list.word_for_index(arr.index(x)) }
    end

    private

    # Performs the rank-reduced SVD reconstruction used by build_index.
    def build_reduced_matrix( matrix, cutoff=0.75 )
      # TODO: Check that M>=N on these dimensions! Transpose helps assure this
      u, v, s = matrix.SV_decomp

      # TODO: Better than 75% term, please. :\
      s_cutoff = s.sort.reverse[(s.size * cutoff).round - 1]
      s.size.times do |ord|
        s[ord] = 0.0 if s[ord] < s_cutoff
      end
      # Reconstruct the term document matrix, only with reduced rank
      u * ($GSL ? GSL::Matrix : ::Matrix).diag( s ) * v.trans
    end

    # Returns the indexed node for +item+, or builds a transient node
    # (with raw vectors when the index is current) for unindexed content.
    def node_for_content(item, &block)
      return @items[item] if @items[item]

      clean_word_hash = block ? block.call(item).clean_word_hash : item.to_s.clean_word_hash
      cn = ContentNode.new(clean_word_hash, &block) # make the node and extract the data

      unless needs_rebuild?
        cn.raw_vector_with( @word_list ) # make the lsi raw and norm vectors
      end

      cn
    end

    # Rebuilds @word_list from the words of every indexed document.
    def make_word_list
      @word_list = WordList.new
      @items.each_value do |node|
        node.word_hash.each_key { |key| @word_list.add_word key }
      end
    end

  end
end
318
+
@@ -0,0 +1,125 @@
1
+ module Classifier
2
+ require 'redis'
3
+
4
# Define String#underscore only when no other library (e.g. ActiveSupport)
# already provides it, so a more complete implementation is not clobbered.
# (The original had this guard commented out.)
unless String.method_defined?(:underscore)
  class String
    # Convert a CamelCase / namespaced constant name to snake_case path
    # form, e.g. "Net::HTTPServer" => "net/http_server".
    def underscore
      self.gsub(/::/, '/').
        gsub(/([A-Z]+)([A-Z][a-z])/,'\1_\2').
        gsub(/([a-z\d])([A-Z])/,'\1_\2').
        tr("-", "_").
        downcase
    end
  end
end
15
+
16
# Redis-backed word/count store for the Bayes classifier.
# Counters are namespaced as "<lang>:<category>:<word>" (see #redis_key);
# a per-language total-word counter lives under #redis_total_key.
class RedisStore
  include Enumerable

  attr_accessor :names

  # lang       - language identifier used to namespace every key
  # categories - category names; each is normalized with
  #              String#prepare_category_name before being kept in #names
  def initialize(lang, categories)
    $redis = Redis.new # NOTE(review): global connection, default host/port
    @lang = lang
    @names = []
    categories.each do |category|
      @names << category.prepare_category_name
    end
  end

  # Create the (category, word) counter at zero unless it already exists.
  def init(category, word)
    insert(category, word, 0) unless key_for?(category, word)
  end

  # Reset the per-language total word counter.
  def init_total
    $redis.set redis_total_key, 0
  end

  def total_words
    $redis.get(redis_total_key).to_i
  end

  def key_for?(category, word)
    $redis.exists(redis_key(category, word))
  end

  alias :has_word? :key_for?

  def insert(category, word, val)
    $redis.set(redis_key(category, word), "#{val}")
  end

  # Returns the stored count as an Integer, or nil when absent.
  def get(category, word)
    val = $redis.get redis_key(category, word)
    val.nil? ? nil : val.to_i
  end

  def remove(category, word)
    $redis.del redis_key(category, word)
  end

  def incr(category, word, count)
    $redis.incrby redis_key(category, word), count.to_i
  end

  def incr_total(count)
    $redis.incrby redis_total_key, count.to_i
  end

  # FIX: the original `def decr` took no parameters yet referenced
  # category/word/count, so every call raised NameError. It now mirrors
  # #incr's signature.
  def decr(category, word, count)
    $redis.decrby redis_key(category, word), count.to_i
  end

  def decr_total(count)
    $redis.decrby redis_total_key, count.to_i
  end

  # Yields each category name; a 2-arity block additionally receives the
  # list of counts stored under that category.
  def each(&block)
    @names.each do |category|
      if block_given?
        block.call(category, get_by_wild_keys(category))
      else
        yield category
      end
    end
  end

  #protected

  def redis_key(category, word)
    "#{escape_lang}:#{escape_category(category)}:#{escape_word(word)}"
  end

  def redis_total_key
    "redis_bayes_store_#{@lang}"
  end

  def escape_category(category)
    category.to_s.gsub(" ", "_").downcase
  end

  def escape_word(word)
    word.to_s.force_encoding('UTF-8')
  end

  def escape_lang
    @lang.to_s.downcase
  end

  # All counts stored under `category`.
  # FIX: keys are written as "<lang>:<category>:<word>" (see #redis_key),
  # so the wildcard must carry the language prefix; the original pattern
  # ("<category>:*") could never match any stored key.
  def get_by_wild_keys(category)
    $redis.keys("#{escape_lang}:#{escape_category(category)}:*").collect do |key|
      get_by_key(key).to_i
    end
  end

  # FIX: replaced `eval(val)` — evaluating data read back from Redis is a
  # code-injection risk. Every value this store writes is an integer
  # counter (set/incrby/decrby), so numeric conversion suffices.
  def get_by_key(key)
    val = $redis.get(key)
    val.is_a?(String) ? val.to_i : val
  end
end
125
+ end
data/lib/classifier.rb ADDED
@@ -0,0 +1,31 @@
1
+ #--
2
+ # Copyright (c) 2005 Lucas Carlson
3
+ #
4
+ # Permission is hereby granted, free of charge, to any person obtaining
5
+ # a copy of this software and associated documentation files (the
6
+ # "Software"), to deal in the Software without restriction, including
7
+ # without limitation the rights to use, copy, modify, merge, publish,
8
+ # distribute, sublicense, and/or sell copies of the Software, and to
9
+ # permit persons to whom the Software is furnished to do so, subject to
10
+ # the following conditions:
11
+ #
12
+ # The above copyright notice and this permission notice shall be
13
+ # included in all copies or substantial portions of the Software.
14
+ #
15
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16
+ # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17
+ # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
18
+ # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
19
+ # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
20
+ # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
21
+ # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
22
+ #++
23
+ # Author:: Lucas Carlson (mailto:lucas@rufy.com)
24
+ # Copyright:: Copyright (c) 2005 Lucas Carlson
25
+ # License:: LGPL
26
+
27
+ require 'rubygems'
28
+ require 'classifier/extensions/string'
29
+ require 'classifier/bayes'
30
+ require 'classifier/lsi'
31
+ require 'classifier/redis_store'
@@ -0,0 +1,33 @@
1
+ require File.dirname(__FILE__) + '/../test_helper'
2
# Exercises the Bayes classifier's training, category management and
# classification behaviour.
class BayesianTest < Test::Unit::TestCase
  def setup
    @classifier = Classifier::Bayes.new('Interesting', 'Uninteresting')
  end

  # Training against a known category succeeds.
  def test_good_training
    assert_nothing_raised { @classifier.train_interesting 'love' }
  end

  # Training against an unknown category raises.
  def test_bad_training
    assert_raise(StandardError) { @classifier.train_no_category 'words' }
  end

  # Genuinely unknown methods still raise NoMethodError.
  def test_bad_method
    assert_raise(NoMethodError) { @classifier.forget_everything_you_know '' }
  end

  def test_categories
    expected = %w[Interesting Uninteresting]
    assert_equal expected.sort, @classifier.categories.sort
  end

  def test_add_category
    @classifier.add_category 'Test'
    expected = %w[Test Interesting Uninteresting]
    assert_equal expected.sort, @classifier.categories.sort
  end

  def test_classification
    @classifier.train_interesting 'here are some good words. I hope you love them'
    @classifier.train_uninteresting 'here are some bad words, I hate you'
    assert_equal 'Uninteresting', @classifier.classify('I hate bad words and you')
  end
end
metadata ADDED
@@ -0,0 +1,57 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: classifier_atsukamoto
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - Lucas Carlson
8
+ - Afonso Tsukamoto
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2013-12-13 00:00:00.000000000 Z
13
+ dependencies: []
14
+ description: Classifier with redis
15
+ email: atsukamoto@faber-ventures.com
16
+ executables: []
17
+ extensions: []
18
+ extra_rdoc_files: []
19
+ files:
20
+ - Rakefile
21
+ - lib/classifier/extensions/string.rb
22
+ - lib/classifier/extensions/vector.rb
23
+ - lib/classifier/extensions/vector_serialize.rb
24
+ - lib/classifier/extensions/word_hash.rb
25
+ - lib/classifier/lsi/content_node.rb
26
+ - lib/classifier/lsi/summary.rb
27
+ - lib/classifier/lsi/word_list.rb
28
+ - lib/classifier/bayes.rb
29
+ - lib/classifier/lsi.rb
30
+ - lib/classifier/redis_store.rb
31
+ - lib/classifier.rb
32
+ - test/bayes/bayesian_test.rb
33
+ homepage: http://rubygems.org/gems/classifier_atsukamoto
34
+ licenses:
35
+ - GNU
36
+ metadata: {}
37
+ post_install_message:
38
+ rdoc_options: []
39
+ require_paths:
40
+ - lib
41
+ required_ruby_version: !ruby/object:Gem::Requirement
42
+ requirements:
43
+ - - '>='
44
+ - !ruby/object:Gem::Version
45
+ version: '0'
46
+ required_rubygems_version: !ruby/object:Gem::Requirement
47
+ requirements:
48
+ - - '>='
49
+ - !ruby/object:Gem::Version
50
+ version: '0'
51
+ requirements: []
52
+ rubyforge_project:
53
+ rubygems_version: 2.1.11
54
+ signing_key:
55
+ specification_version: 4
56
+ summary: Classifier with Redis
57
+ test_files: []