classifier_atsukamoto 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: e6eca98ba96b5157ddcfef0ba3f02e129652c5ce
4
+ data.tar.gz: bc219f04544083e8a017b548ca2fede7a942fa45
5
+ SHA512:
6
+ metadata.gz: 1c78965de0ffd493b57ebf013deb1baf92c8e554f21157bb681dca1ae980edcd1d4a7fdafe601983270ecb0bbd86214a6b19ec3a1e9ebebc12789ebdbc1f0131
7
+ data.tar.gz: a9eb9c3ebac570198f25800f4c02677aea29ad8e9f152b41585242bd8667cf8b6de3812cf28ce68386c1090f5764fbc0c610972a0041342cb58bfb2fddccaf15
data/Rakefile ADDED
@@ -0,0 +1,97 @@
1
+ require 'rubygems'
2
+ require 'rake'
3
+ require 'rake/testtask'
4
+ require 'rake/rdoctask'
5
+ require 'rake/gempackagetask'
6
+ require 'rake/contrib/rubyforgepublisher'
7
+
8
+ PKG_VERSION = "0.0.1"
9
+
10
+ PKG_FILES = FileList[
11
+ "lib/**/*", "bin/*", "test/**/*", "[A-Z]*", "Rakefile", "html/**/*"
12
+ ]
13
+
14
+ desc "Default Task"
15
+ task :default => [ :test ]
16
+
17
+ # Run the unit tests
18
+ desc "Run all unit tests"
19
+ Rake::TestTask.new("test") { |t|
20
+ t.libs << "lib"
21
+ t.pattern = 'test/*/*_test.rb'
22
+ t.verbose = true
23
+ }
24
+
25
+ # Make a console, useful when working on tests
26
+ desc "Generate a test console"
27
+ task :console do
28
+ verbose( false ) { sh "irb -I lib/ -r 'classifier'" }
29
+ end
30
+
31
+ # Genereate the RDoc documentation
32
+ desc "Create documentation"
33
+ Rake::RDocTask.new("doc") { |rdoc|
34
+ rdoc.title = "Ruby Classifier Fork by ATsukamoto - Bayesian and LSI classification library with Redis for persistence
35
+ "
36
+ rdoc.rdoc_dir = 'html'
37
+ rdoc.rdoc_files.include('README')
38
+ rdoc.rdoc_files.include('lib/**/*.rb')
39
+ }
40
+
41
+ # Genereate the package
42
+ spec = Gem::Specification.new do |s|
43
+
44
+ #### Basic information.
45
+
46
+ s.name = 'classifier'
47
+ s.version = PKG_VERSION
48
+ s.summary = <<-EOF
49
+ A general classifier module to allow Bayesian and other types of classifications.
50
+ EOF
51
+ s.description = <<-EOF
52
+ A general classifier module to allow Bayesian and other types of classifications.
53
+ EOF
54
+
55
+ #### Which files are to be included in this gem? Everything! (Except CVS directories.)
56
+
57
+ s.files = PKG_FILES
58
+
59
+ #### Load-time details: library and application (you will need one or both).
60
+
61
+ s.require_path = 'lib'
62
+ s.autorequire = 'classifier'
63
+
64
+ #### Documentation and testing.
65
+
66
+ s.has_rdoc = true
67
+
68
+ #### Dependencies and requirements.
69
+
70
+ s.add_dependency('fast-stemmer', '>= 1.0.0')
71
+ s.requirements << "A porter-stemmer module to split word stems."
72
+
73
+ #### Author and project details.
74
+ s.author = "Lucas Carlson"
75
+ s.email = "lucas@rufy.com"
76
+ s.homepage = "http://classifier.rufy.com/"
77
+ end
78
+
79
+ Rake::GemPackageTask.new(spec) do |pkg|
80
+ pkg.need_zip = true
81
+ pkg.need_tar = true
82
+ end
83
+
84
+ desc "Report code statistics (KLOCs, etc) from the application"
85
+ task :stats do
86
+ require 'code_statistics'
87
+ CodeStatistics.new(
88
+ ["Library", "lib"],
89
+ ["Units", "test"]
90
+ ).to_s
91
+ end
92
+
93
+ desc "Publish new documentation"
94
+ task :publish do
95
+ `ssh rufy update-classifier-doc`
96
+ Rake::RubyForgePublisher.new('classifier', 'cardmagic').upload
97
+ end
@@ -0,0 +1,156 @@
1
+ # Author:: Lucas Carlson (mailto:lucas@rufy.com)
2
+ # Copyright:: Copyright (c) 2005 Lucas Carlson
3
+ # License:: LGPL
4
+
5
+ module Classifier
6
+
7
+ require 'lingua/stemmer'
8
+
9
+ class Bayes
10
+ # The class can be created with one or more categories, each of which will be
11
+ # initialized and given a training method. E.g.,
12
+ # b = Classifier::Bayes.new 'Interesting', 'Uninteresting', 'Spam'
13
+ def initialize(lang, *categories)
14
+ #@categories = Hash.new
15
+ #categories.each { |category| @categories[category.prepare_category_name] = Hash.new }
16
+ # RedisStore.total_words = 0
17
+ @categories = RedisStore.new lang, categories
18
+ @categories.init_total
19
+ @stemmer = Lingua::Stemmer.new(:language => lang.downcase)
20
+ end
21
+
22
+ #
23
+ # Provides a general training method for all categories specified in Bayes#new
24
+ # For example:
25
+ # b = Classifier::Bayes.new 'This', 'That', 'the_other'
26
+ # b.train :this, "This text"
27
+ # b.train "that", "That text"
28
+ # b.train "The other", "The other text"
29
+ def train(category, text)
30
+ category = category.prepare_category_name
31
+ text.word_hash(@stemmer).each do |word, count|
32
+ # @categories[category][word] ||= 0
33
+ @categories.init(category, word)
34
+
35
+ # @categories[category][word] += count
36
+ @categories.incr(category, word, count)
37
+
38
+ # @total_words += count
39
+ @categories.incr_total(count)
40
+ end
41
+ end
42
+
43
+ #
44
+ # Provides a untraining method for all categories specified in Bayes#new
45
+ # Be very careful with this method.
46
+ #
47
+ # For example:
48
+ # b = Classifier::Bayes.new 'This', 'That', 'the_other'
49
+ # b.train :this, "This text"
50
+ # b.untrain :this, "This text"
51
+ def untrain(category, text)
52
+ category = category.prepare_category_name
53
+ text.word_hash(@stemmer).each do |word, count|
54
+ # @total_words >= 0
55
+ if @categories.total_words >= 0
56
+ # orig = @categories[category][word]
57
+ orig = @categories.get(category,word)
58
+
59
+ # @categories[category][word] ||= 0
60
+ @categories.init(category, word)
61
+
62
+ # @categories[category][word] -= count
63
+ @categories.decr(category, word, count)
64
+
65
+
66
+ #if @categories[category][word] <= 0
67
+ if @categories.get(category,word) <= 0
68
+ # @categories[category].delete(word)
69
+ @categories.remove(category,word)
70
+ count = orig
71
+ end
72
+ #@total_words -= count
73
+ @categories.decr_total(count)
74
+ end
75
+ end
76
+ end
77
+
78
+ #
79
+ # Returns the scores in each category the provided +text+. E.g.,
80
+ # b.classifications "I hate bad words and you"
81
+ # => {"Uninteresting"=>-12.6997928013932, "Interesting"=>-18.4206807439524}
82
+ # The largest of these scores (the one closest to 0) is the one picked out by #classify
83
+ def classifications(text)
84
+ score = Hash.new
85
+ # actual categories saved in the beggining but each do |category|
86
+ @categories.names.each do |category, category_words|
87
+ score[category.to_s] = 0
88
+
89
+ # total = category_words.values.inject(0) {|sum, element| sum+element}
90
+ total = category_words.inject(0) { |sum, element| sum + element }
91
+
92
+ text.word_hash(@stemmer).each do |word, count|
93
+ #s = category_words.has_key?(word) ? category_words[word] : 0.1
94
+ s = @categories.has_word?(category, word) ? @categories.get(category, word) : 0.1
95
+
96
+ score[category.to_s] += Math.log(s/total.to_f)
97
+ end
98
+ end
99
+ return score
100
+ end
101
+
102
+ #
103
+ # Returns the classification of the provided +text+, which is one of the
104
+ # categories given in the initializer. E.g.,
105
+ # b.classify "I hate bad words and you"
106
+ # => 'Uninteresting'
107
+ def classify(text)
108
+ (classifications(text).sort_by { |a| -a[1] })[0][0]
109
+ end
110
+
111
+ #
112
+ # Provides training and untraining methods for the categories specified in Bayes#new
113
+ # For example:
114
+ # b = Classifier::Bayes.new 'This', 'That', 'the_other'
115
+ # b.train_this "This text"
116
+ # b.train_that "That text"
117
+ # b.untrain_that "That text"
118
+ # b.train_the_other "The other text"
119
+ def method_missing(name, *args)
120
+ category = name.to_s.gsub(/(un)?train_([\w]+)/, '\2').prepare_category_name
121
+ # categories.has_key?(key)
122
+ if @categories.names.include? category
123
+ args.each { |text| eval("#{$1}train(category, text)") }
124
+ elsif name.to_s =~ /(un)?train_([\w]+)/
125
+ raise StandardError, "No such category: #{category}"
126
+ else
127
+ super #raise StandardError, "No such method: #{name}"
128
+ end
129
+ end
130
+
131
+ #
132
+ # Provides a list of category names
133
+ # For example:
134
+ # b.categories
135
+ # => ['This', 'That', 'the_other']
136
+ def categories # :nodoc:
137
+ @categories
138
+ end
139
+
140
+ #
141
+ # Allows you to add categories to the classifier.
142
+ # For example:
143
+ # b.add_category "Not spam"
144
+ #
145
+ # WARNING: Adding categories to a trained classifier will
146
+ # result in an undertrained category that will tend to match
147
+ # more criteria than the trained selective categories. In short,
148
+ # try to initialize your categories at initialization.
149
+ def add_category(category)
150
+ @categories[category.prepare_category_name] = Hash.new
151
+ end
152
+
153
+ alias append_category add_category
154
+ end
155
+
156
+ end
@@ -0,0 +1,10 @@
1
+ # Author:: Lucas Carlson (mailto:lucas@rufy.com)
2
+ # Copyright:: Copyright (c) 2005 Lucas Carlson
3
+ # License:: LGPL
4
+
5
+ # require 'fast_stemmer'
6
+ require 'classifier/extensions/word_hash'
7
+
8
+ class Object
9
+ def prepare_category_name; to_s.gsub("_"," ").capitalize.intern end
10
+ end
@@ -0,0 +1,113 @@
1
+ # Author:: Ernest Ellingson
2
+ # Copyright:: Copyright (c) 2005
3
+
4
+ # These are extensions to the std-lib 'matrix' to allow an all ruby SVD
5
+
6
+ require 'matrix'
7
+ require 'mathn'
8
+
9
+ class Array
10
+ # TODO! Change name!
11
+ def a_sum(identity = 0, &block)
12
+ return identity unless size > 0
13
+
14
+ if block_given?
15
+ map(&block).sum
16
+ else
17
+ inject { |sum, element| sum + element }.to_f
18
+ end
19
+ end
20
+ end
21
+
22
+ class Vector
23
+ def magnitude
24
+ sumsqs = 0.0
25
+ self.size.times do |i|
26
+ sumsqs += self[i] ** 2.0
27
+ end
28
+ Math.sqrt(sumsqs)
29
+ end
30
+ def normalize
31
+ nv = []
32
+ mag = self.magnitude
33
+ self.size.times do |i|
34
+
35
+ nv << (self[i] / mag)
36
+
37
+ end
38
+ Vector[*nv]
39
+ end
40
+ end
41
+
42
+ class Matrix
43
+ def Matrix.diag(s)
44
+ Matrix.diagonal(*s)
45
+ end
46
+
47
+ alias :trans :transpose
48
+
49
+ def SV_decomp(maxSweeps = 20)
50
+ if self.row_size >= self.column_size
51
+ q = self.trans * self
52
+ else
53
+ q = self * self.trans
54
+ end
55
+
56
+ qrot = q.dup
57
+ v = Matrix.identity(q.row_size)
58
+ azrot = nil
59
+ mzrot = nil
60
+ cnt = 0
61
+ s_old = nil
62
+ mu = nil
63
+
64
+ while true do
65
+ cnt += 1
66
+ for row in (0...qrot.row_size-1) do
67
+ for col in (1..qrot.row_size-1) do
68
+ next if row == col
69
+ h = Math.atan((2 * qrot[row,col])/(qrot[row,row]-qrot[col,col]))/2.0
70
+ hcos = Math.cos(h)
71
+ hsin = Math.sin(h)
72
+ mzrot = Matrix.identity(qrot.row_size)
73
+ mzrot[row,row] = hcos
74
+ mzrot[row,col] = -hsin
75
+ mzrot[col,row] = hsin
76
+ mzrot[col,col] = hcos
77
+ qrot = mzrot.trans * qrot * mzrot
78
+ v = v * mzrot
79
+ end
80
+ end
81
+ s_old = qrot.dup if cnt == 1
82
+ sum_qrot = 0.0
83
+ if cnt > 1
84
+ qrot.row_size.times do |r|
85
+ sum_qrot += (qrot[r,r]-s_old[r,r]).abs if (qrot[r,r]-s_old[r,r]).abs > 0.001
86
+ end
87
+ s_old = qrot.dup
88
+ end
89
+ break if (sum_qrot <= 0.001 and cnt > 1) or cnt >= maxSweeps
90
+ end # of do while true
91
+ s = []
92
+ qrot.row_size.times do |r|
93
+ s << Math.sqrt(qrot[r,r])
94
+ end
95
+ #puts "cnt = #{cnt}"
96
+ if self.row_size >= self.column_size
97
+ mu = self * v * Matrix.diagonal(*s).inverse
98
+ return [mu, v, s]
99
+ else
100
+ puts v.row_size
101
+ puts v.column_size
102
+ puts self.row_size
103
+ puts self.column_size
104
+ puts s.size
105
+
106
+ mu = (self.trans * v * Matrix.diagonal(*s).inverse)
107
+ return [mu, v, s]
108
+ end
109
+ end
110
+ def []=(i,j,val)
111
+ @rows[i][j] = val
112
+ end
113
+ end
@@ -0,0 +1,20 @@
1
+ module GSL
2
+
3
+ class Vector
4
+ def _dump(v)
5
+ Marshal.dump( self.to_a )
6
+ end
7
+
8
+ def self._load(arr)
9
+ arry = Marshal.load(arr)
10
+ return GSL::Vector.alloc(arry)
11
+ end
12
+
13
+ end
14
+
15
+ class Matrix
16
+ class <<self
17
+ alias :diag :diagonal
18
+ end
19
+ end
20
+ end
@@ -0,0 +1,129 @@
1
+ # Author:: Lucas Carlson (mailto:lucas@rufy.com)
2
+ # Copyright:: Copyright (c) 2005 Lucas Carlson
3
+ # License:: LGPL
4
+
5
+ # These are extensions to the String class to provide convenience
6
+ # methods for the Classifier package.
7
+ require 'lingua/stemmer'
8
+
9
+ class String
10
+
11
+ # Removes common punctuation symbols, returning a new string.
12
+ # E.g.,
13
+ # "Hello (greeting's), with {braces} < >...?".without_punctuation
14
+ # => "Hello greetings with braces "
15
+ def without_punctuation
16
+ tr( ',?.!;:"@#$%^&*()_=+[]{}\|<>/`~', " " ) .tr( "'\-", "")
17
+ end
18
+
19
+ # Return a Hash of strings => ints. Each word in the string is stemmed,
20
+ # interned, and indexes to its frequency in the document.
21
+ def word_hash(stemmer)
22
+ word_hash_for_words(gsub(/[^\w\s]/,"").split + gsub(/[\w]/," ").split, stemmer)
23
+ end
24
+
25
+ # Return a word hash without extra punctuation or short symbols, just stemmed words
26
+ def clean_word_hash
27
+ word_hash_for_words gsub(/[^\w\s]/,"").split
28
+ end
29
+
30
+ private
31
+
32
+ def word_hash_for_words(words, stemmer)
33
+ d = Hash.new
34
+ words.each do |word|
35
+ word.downcase! if word =~ /[\w]+/
36
+ #key = word.stem.intern
37
+ key = stemmer.stem(word).intern
38
+ if word =~ /[^\w]/ || ! CORPUS_SKIP_WORDS.include?(word) && word.length > 2
39
+ d[key] ||= 0
40
+ d[key] += 1
41
+ end
42
+ end
43
+ return d
44
+ end
45
+
46
+ # TODO! Actualize for each language
47
+ CORPUS_SKIP_WORDS = [
48
+ "a",
49
+ "again",
50
+ "all",
51
+ "along",
52
+ "are",
53
+ "also",
54
+ "an",
55
+ "and",
56
+ "as",
57
+ "at",
58
+ "but",
59
+ "by",
60
+ "came",
61
+ "can",
62
+ "cant",
63
+ "couldnt",
64
+ "did",
65
+ "didn",
66
+ "didnt",
67
+ "do",
68
+ "doesnt",
69
+ "dont",
70
+ "ever",
71
+ "first",
72
+ "from",
73
+ "have",
74
+ "her",
75
+ "here",
76
+ "him",
77
+ "how",
78
+ "i",
79
+ "if",
80
+ "in",
81
+ "into",
82
+ "is",
83
+ "isnt",
84
+ "it",
85
+ "itll",
86
+ "just",
87
+ "last",
88
+ "least",
89
+ "like",
90
+ "most",
91
+ "my",
92
+ "new",
93
+ "no",
94
+ "not",
95
+ "now",
96
+ "of",
97
+ "on",
98
+ "or",
99
+ "should",
100
+ "sinc",
101
+ "so",
102
+ "some",
103
+ "th",
104
+ "than",
105
+ "this",
106
+ "that",
107
+ "the",
108
+ "their",
109
+ "then",
110
+ "those",
111
+ "to",
112
+ "told",
113
+ "too",
114
+ "true",
115
+ "try",
116
+ "until",
117
+ "url",
118
+ "us",
119
+ "were",
120
+ "when",
121
+ "whether",
122
+ "while",
123
+ "with",
124
+ "within",
125
+ "yes",
126
+ "you",
127
+ "youll",
128
+ ]
129
+ end
@@ -0,0 +1,72 @@
1
+ # Author:: David Fayram (mailto:dfayram@lensmen.net)
2
+ # Copyright:: Copyright (c) 2005 David Fayram II
3
+ # License:: LGPL
4
+
5
+ module Classifier
6
+
7
+ # This is an internal data structure class for the LSI node. Save for
8
+ # raw_vector_with, it should be fairly straightforward to understand.
9
+ # You should never have to use it directly.
10
+ class ContentNode
11
+ attr_accessor :raw_vector, :raw_norm,
12
+ :lsi_vector, :lsi_norm,
13
+ :categories
14
+
15
+ attr_reader :word_hash
16
+ # If text_proc is not specified, the source will be duck-typed
17
+ # via source.to_s
18
+ def initialize( word_hash, *categories )
19
+ @categories = categories || []
20
+ @word_hash = word_hash
21
+ end
22
+
23
+ # Use this to fetch the appropriate search vector.
24
+ def search_vector
25
+ @lsi_vector || @raw_vector
26
+ end
27
+
28
+ # Use this to fetch the appropriate search vector in normalized form.
29
+ def search_norm
30
+ @lsi_norm || @raw_norm
31
+ end
32
+
33
+ # Creates the raw vector out of word_hash using word_list as the
34
+ # key for mapping the vector space.
35
+ def raw_vector_with( word_list )
36
+ if $GSL
37
+ vec = GSL::Vector.alloc(word_list.size)
38
+ else
39
+ vec = Array.new(word_list.size, 0)
40
+ end
41
+
42
+ @word_hash.each_key do |word|
43
+ vec[word_list[word]] = @word_hash[word] if word_list[word]
44
+ end
45
+
46
+ # Perform the scaling transform
47
+ total_words = vec.a_sum
48
+
49
+ # Perform first-order association transform if this vector has more
50
+ # than one word in it.
51
+ if total_words > 1.0
52
+ weighted_total = 0.0
53
+ vec.each do |term|
54
+ if ( term > 0 )
55
+ weighted_total += (( term / total_words ) * Math.log( term / total_words ))
56
+ end
57
+ end
58
+ vec = vec.collect { |val| Math.log( val + 1 ) / -weighted_total }
59
+ end
60
+
61
+ if $GSL
62
+ @raw_norm = vec.normalize
63
+ @raw_vector = vec
64
+ else
65
+ @raw_norm = Vector[*vec].normalize
66
+ @raw_vector = Vector[*vec]
67
+ end
68
+ end
69
+
70
+ end
71
+
72
+ end
@@ -0,0 +1,31 @@
1
+ # Author:: Lucas Carlson (mailto:lucas@rufy.com)
2
+ # Copyright:: Copyright (c) 2005 Lucas Carlson
3
+ # License:: LGPL
4
+
5
+ class String
6
+ def summary( count=10, separator=" [...] " )
7
+ perform_lsi split_sentences, count, separator
8
+ end
9
+
10
+ def paragraph_summary( count=1, separator=" [...] " )
11
+ perform_lsi split_paragraphs, count, separator
12
+ end
13
+
14
+ def split_sentences
15
+ split /(\.|\!|\?)/ # TODO: make this less primitive
16
+ end
17
+
18
+ def split_paragraphs
19
+ split /(\n\n|\r\r|\r\n\r\n)/ # TODO: make this less primitive
20
+ end
21
+
22
+ private
23
+
24
+ def perform_lsi(chunks, count, separator)
25
+ lsi = Classifier::LSI.new :auto_rebuild => false
26
+ chunks.each { |chunk| lsi << chunk unless chunk.strip.empty? || chunk.strip.split.size == 1 }
27
+ lsi.build_index
28
+ summaries = lsi.highest_relative_content count
29
+ return summaries.reject { |chunk| !summaries.include? chunk }.map { |x| x.strip }.join(separator)
30
+ end
31
+ end
@@ -0,0 +1,36 @@
1
+ # Author:: David Fayram (mailto:dfayram@lensmen.net)
2
+ # Copyright:: Copyright (c) 2005 David Fayram II
3
+ # License:: LGPL
4
+
5
+ module Classifier
6
+ # This class keeps a word => index mapping. It is used to map stemmed words
7
+ # to dimensions of a vector.
8
+
9
+ class WordList
10
+ def initialize
11
+ @location_table = Hash.new
12
+ end
13
+
14
+ # Adds a word (if it is new) and assigns it a unique dimension.
15
+ def add_word(word)
16
+ term = word
17
+ @location_table[term] = @location_table.size unless @location_table[term]
18
+ end
19
+
20
+ # Returns the dimension of the word or nil if the word is not in the space.
21
+ def [](lookup)
22
+ term = lookup
23
+ @location_table[term]
24
+ end
25
+
26
+ def word_for_index(ind)
27
+ @location_table.invert[ind]
28
+ end
29
+
30
+ # Returns the number of words mapped.
31
+ def size
32
+ @location_table.size
33
+ end
34
+
35
+ end
36
+ end
@@ -0,0 +1,318 @@
1
+ # Author:: David Fayram (mailto:dfayram@lensmen.net)
2
+ # Copyright:: Copyright (c) 2005 David Fayram II
3
+ # License:: LGPL
4
+
5
+ begin
6
+ raise LoadError if ENV['NATIVE_VECTOR'] == "true" # to test the native vector class, try `rake test NATIVE_VECTOR=true`
7
+
8
+ require 'gsl' # requires http://rb-gsl.rubyforge.org/
9
+ require 'classifier/extensions/vector_serialize'
10
+ $GSL = true
11
+
12
+ rescue LoadError
13
+ warn "Notice: for 10x faster LSI support, please install http://rb-gsl.rubyforge.org/"
14
+ require 'classifier/extensions/vector'
15
+ end
16
+
17
+ require 'classifier/lsi/word_list'
18
+ require 'classifier/lsi/content_node'
19
+ require 'classifier/lsi/summary'
20
+
21
+ module Classifier
22
+
23
+ # This class implements a Latent Semantic Indexer, which can search, classify and cluster
24
+ # data based on underlying semantic relations. For more information on the algorithms used,
25
+ # please consult Wikipedia[http://en.wikipedia.org/wiki/Latent_Semantic_Indexing].
26
+ class LSI
27
+
28
+ attr_reader :word_list
29
+ attr_accessor :auto_rebuild
30
+
31
+ # Create a fresh index.
32
+ # If you want to call #build_index manually, use
33
+ # Classifier::LSI.new :auto_rebuild => false
34
+ #
35
+ def initialize(options = {})
36
+ @auto_rebuild = true unless options[:auto_rebuild] == false
37
+ @word_list, @items = WordList.new, {}
38
+ @version, @built_at_version = 0, -1
39
+ end
40
+
41
+ # Returns true if the index needs to be rebuilt. The index needs
42
+ # to be built after all informaton is added, but before you start
43
+ # using it for search, classification and cluster detection.
44
+ def needs_rebuild?
45
+ (@items.keys.size > 1) && (@version != @built_at_version)
46
+ end
47
+
48
+ # Adds an item to the index. item is assumed to be a string, but
49
+ # any item may be indexed so long as it responds to #to_s or if
50
+ # you provide an optional block explaining how the indexer can
51
+ # fetch fresh string data. This optional block is passed the item,
52
+ # so the item may only be a reference to a URL or file name.
53
+ #
54
+ # For example:
55
+ # lsi = Classifier::LSI.new
56
+ # lsi.add_item "This is just plain text"
57
+ # lsi.add_item "/home/me/filename.txt" { |x| File.read x }
58
+ # ar = ActiveRecordObject.find( :all )
59
+ # lsi.add_item ar, *ar.categories { |x| ar.content }
60
+ #
61
+ def add_item( item, *categories, &block )
62
+ clean_word_hash = block ? block.call(item).clean_word_hash : item.to_s.clean_word_hash
63
+ @items[item] = ContentNode.new(clean_word_hash, *categories)
64
+ @version += 1
65
+ build_index if @auto_rebuild
66
+ end
67
+
68
+ # A less flexible shorthand for add_item that assumes
69
+ # you are passing in a string with no categorries. item
70
+ # will be duck typed via to_s .
71
+ #
72
+ def <<( item )
73
+ add_item item
74
+ end
75
+
76
+ # Returns the categories for a given indexed items. You are free to add and remove
77
+ # items from this as you see fit. It does not invalide an index to change its categories.
78
+ def categories_for(item)
79
+ return [] unless @items[item]
80
+ return @items[item].categories
81
+ end
82
+
83
+ # Removes an item from the database, if it is indexed.
84
+ #
85
+ def remove_item( item )
86
+ if @items.keys.contain? item
87
+ @items.remove item
88
+ @version += 1
89
+ end
90
+ end
91
+
92
+ # Returns an array of items that are indexed.
93
+ def items
94
+ @items.keys
95
+ end
96
+
97
+ # Returns the categories for a given indexed items. You are free to add and remove
98
+ # items from this as you see fit. It does not invalide an index to change its categories.
99
+ def categories_for(item)
100
+ return [] unless @items[item]
101
+ return @items[item].categories
102
+ end
103
+
104
+ # This function rebuilds the index if needs_rebuild? returns true.
105
+ # For very large document spaces, this indexing operation may take some
106
+ # time to complete, so it may be wise to place the operation in another
107
+ # thread.
108
+ #
109
+ # As a rule, indexing will be fairly swift on modern machines until
110
+ # you have well over 500 documents indexed, or have an incredibly diverse
111
+ # vocabulary for your documents.
112
+ #
113
+ # The optional parameter "cutoff" is a tuning parameter. When the index is
114
+ # built, a certain number of s-values are discarded from the system. The
115
+ # cutoff parameter tells the indexer how many of these values to keep.
116
+ # A value of 1 for cutoff means that no semantic analysis will take place,
117
+ # turning the LSI class into a simple vector search engine.
118
+ def build_index( cutoff=0.75 )
119
+ return unless needs_rebuild?
120
+ make_word_list
121
+
122
+ doc_list = @items.values
123
+ tda = doc_list.collect { |node| node.raw_vector_with( @word_list ) }
124
+
125
+ if $GSL
126
+ tdm = GSL::Matrix.alloc(*tda).trans
127
+ ntdm = build_reduced_matrix(tdm, cutoff)
128
+
129
+ ntdm.size[1].times do |col|
130
+ vec = GSL::Vector.alloc( ntdm.column(col) ).row
131
+ doc_list[col].lsi_vector = vec
132
+ doc_list[col].lsi_norm = vec.normalize
133
+ end
134
+ else
135
+ tdm = Matrix.rows(tda).trans
136
+ ntdm = build_reduced_matrix(tdm, cutoff)
137
+
138
+ ntdm.row_size.times do |col|
139
+ doc_list[col].lsi_vector = ntdm.column(col) if doc_list[col]
140
+ doc_list[col].lsi_norm = ntdm.column(col).normalize if doc_list[col]
141
+ end
142
+ end
143
+
144
+ @built_at_version = @version
145
+ end
146
+
147
+ # This method returns max_chunks entries, ordered by their average semantic rating.
148
+ # Essentially, the average distance of each entry from all other entries is calculated,
149
+ # the highest are returned.
150
+ #
151
+ # This can be used to build a summary service, or to provide more information about
152
+ # your dataset's general content. For example, if you were to use categorize on the
153
+ # results of this data, you could gather information on what your dataset is generally
154
+ # about.
155
+ def highest_relative_content( max_chunks=10 )
156
+ return [] if needs_rebuild?
157
+
158
+ avg_density = Hash.new
159
+ @items.each_key { |x| avg_density[x] = proximity_array_for_content(x).inject(0.0) { |x,y| x + y[1]} }
160
+
161
+ avg_density.keys.sort_by { |x| avg_density[x] }.reverse[0..max_chunks-1].map
162
+ end
163
+
164
+ # This function is the primitive that find_related and classify
165
+ # build upon. It returns an array of 2-element arrays. The first element
166
+ # of this array is a document, and the second is its "score", defining
167
+ # how "close" it is to other indexed items.
168
+ #
169
+ # These values are somewhat arbitrary, having to do with the vector space
170
+ # created by your content, so the magnitude is interpretable but not always
171
+ # meaningful between indexes.
172
+ #
173
+ # The parameter doc is the content to compare. If that content is not
174
+ # indexed, you can pass an optional block to define how to create the
175
+ # text data. See add_item for examples of how this works.
176
+ def proximity_array_for_content( doc, &block )
177
+ return [] if needs_rebuild?
178
+
179
+ content_node = node_for_content( doc, &block )
180
+ result =
181
+ @items.keys.collect do |item|
182
+ if $GSL
183
+ val = content_node.search_vector * @items[item].search_vector.col
184
+ else
185
+ val = (Matrix[content_node.search_vector] * @items[item].search_vector)[0]
186
+ end
187
+ [item, val]
188
+ end
189
+ result.sort_by { |x| x[1] }.reverse
190
+ end
191
+
192
+ # Similar to proximity_array_for_content, this function takes similar
193
+ # arguments and returns a similar array. However, it uses the normalized
194
+ # calculated vectors instead of their full versions. This is useful when
195
+ # you're trying to perform operations on content that is much smaller than
196
+ # the text you're working with. search uses this primitive.
197
+ def proximity_norms_for_content( doc, &block )
198
+ return [] if needs_rebuild?
199
+
200
+ content_node = node_for_content( doc, &block )
201
+ result =
202
+ @items.keys.collect do |item|
203
+ if $GSL
204
+ val = content_node.search_norm * @items[item].search_norm.col
205
+ else
206
+ val = (Matrix[content_node.search_norm] * @items[item].search_norm)[0]
207
+ end
208
+ [item, val]
209
+ end
210
+ result.sort_by { |x| x[1] }.reverse
211
+ end
212
+
213
+ # This function allows for text-based search of your index. Unlike other functions
214
+ # like find_related and classify, search only takes short strings. It will also ignore
215
+ # factors like repeated words. It is best for short, google-like search terms.
216
+ # A search will first priortize lexical relationships, then semantic ones.
217
+ #
218
+ # While this may seem backwards compared to the other functions that LSI supports,
219
+ # it is actually the same algorithm, just applied on a smaller document.
220
+ def search( string, max_nearest=3 )
221
+ return [] if needs_rebuild?
222
+ carry = proximity_norms_for_content( string )
223
+ result = carry.collect { |x| x[0] }
224
+ return result[0..max_nearest-1]
225
+ end
226
+
227
+ # This function takes content and finds other documents
228
+ # that are semantically "close", returning an array of documents sorted
229
+ # from most to least relavant.
230
+ # max_nearest specifies the number of documents to return. A value of
231
+ # 0 means that it returns all the indexed documents, sorted by relavence.
232
+ #
233
+ # This is particularly useful for identifing clusters in your document space.
234
+ # For example you may want to identify several "What's Related" items for weblog
235
+ # articles, or find paragraphs that relate to each other in an essay.
236
+ def find_related( doc, max_nearest=3, &block )
237
+ carry =
238
+ proximity_array_for_content( doc, &block ).reject { |pair| pair[0] == doc }
239
+ result = carry.collect { |x| x[0] }
240
+ return result[0..max_nearest-1]
241
+ end
242
+
243
+ # This function uses a voting system to categorize documents, based on
244
+ # the categories of other documents. It uses the same logic as the
245
+ # find_related function to find related documents, then returns the
246
+ # most obvious category from this list.
247
+ #
248
+ # cutoff signifies the number of documents to consider when clasifying
249
+ # text. A cutoff of 1 means that every document in the index votes on
250
+ # what category the document is in. This may not always make sense.
251
+ #
252
+ def classify( doc, cutoff=0.30, &block )
253
+ icutoff = (@items.size * cutoff).round
254
+ carry = proximity_array_for_content( doc, &block )
255
+ carry = carry[0..icutoff-1]
256
+ votes = {}
257
+ carry.each do |pair|
258
+ categories = @items[pair[0]].categories
259
+ categories.each do |category|
260
+ votes[category] ||= 0.0
261
+ votes[category] += pair[1]
262
+ end
263
+ end
264
+
265
+ ranking = votes.keys.sort_by { |x| votes[x] }
266
+ return ranking[-1]
267
+ end
268
+
269
+ # Prototype, only works on indexed documents.
270
+ # I have no clue if this is going to work, but in theory
271
+ # it's supposed to.
272
+ def highest_ranked_stems( doc, count=3 )
273
+ raise "Requested stem ranking on non-indexed content!" unless @items[doc]
274
+ arr = node_for_content(doc).lsi_vector.to_a
275
+ top_n = arr.sort.reverse[0..count-1]
276
+ return top_n.collect { |x| @word_list.word_for_index(arr.index(x))}
277
+ end
278
+
279
+ private
280
+ def build_reduced_matrix( matrix, cutoff=0.75 )
281
+ # TODO: Check that M>=N on these dimensions! Transpose helps assure this
282
+ u, v, s = matrix.SV_decomp
283
+
284
+ # TODO: Better than 75% term, please. :\
285
+ s_cutoff = s.sort.reverse[(s.size * cutoff).round - 1]
286
+ s.size.times do |ord|
287
+ s[ord] = 0.0 if s[ord] < s_cutoff
288
+ end
289
+ # Reconstruct the term document matrix, only with reduced rank
290
+ u * ($GSL ? GSL::Matrix : ::Matrix).diag( s ) * v.trans
291
+ end
292
+
293
+ def node_for_content(item, &block)
294
+ if @items[item]
295
+ return @items[item]
296
+ else
297
+ clean_word_hash = block ? block.call(item).clean_word_hash : item.to_s.clean_word_hash
298
+
299
+ cn = ContentNode.new(clean_word_hash, &block) # make the node and extract the data
300
+
301
+ unless needs_rebuild?
302
+ cn.raw_vector_with( @word_list ) # make the lsi raw and norm vectors
303
+ end
304
+ end
305
+
306
+ return cn
307
+ end
308
+
309
+ def make_word_list
310
+ @word_list = WordList.new
311
+ @items.each_value do |node|
312
+ node.word_hash.each_key { |key| @word_list.add_word key }
313
+ end
314
+ end
315
+
316
+ end
317
+ end
318
+
@@ -0,0 +1,125 @@
1
+ module Classifier
2
+ require 'redis'
3
+
4
+ #if !String.instance_methods.include?(:underscore)
5
# Back-port of Rails' String#underscore: converts CamelCase to
# snake_case and namespace separators to path separators
# ("Foo::BarBaz" -> "foo/bar_baz").
#
# FIX: this reopen sits lexically inside `module Classifier`, so a bare
# `class String` defines a brand-new Classifier::String rather than
# patching the core class — which is what the (commented-out)
# instance_methods guard around it clearly intended. `class ::String`
# targets the top-level String explicitly.
class ::String
  def underscore
    gsub(/::/, '/').
      gsub(/([A-Z]+)([A-Z][a-z])/,'\1_\2').
      gsub(/([a-z\d])([A-Z])/,'\1_\2').
      tr("-", "_").
      downcase
  end
end
14
+ #end
15
+
16
# Redis-backed persistence for Bayes word counts.
#
# Each (category, word) counter lives under the key
# "<lang>:<category>:<word>"; a single "redis_bayes_store_<lang>" key
# holds the running total of trained words.
# NOTE(review): connects via the $redis global to a default local Redis,
# matching the original implementation — confirm that is intended.
class RedisStore
  include Enumerable

  # Normalized category names (via String#prepare_category_name,
  # defined in the extensions — TODO confirm it symbolizes/cleans).
  attr_accessor :names

  # lang       - language tag used to namespace every key.
  # categories - list of category names to track.
  def initialize(lang, categories)
    $redis = Redis.new
    @lang = lang
    # The original used each_with_index but never used the index.
    @names = categories.map { |category| category.prepare_category_name }
  end

  # Creates the counter for (category, word) at zero unless it exists.
  def init(category, word)
    insert(category, word, 0) unless key_for?(category, word)
  end

  # Resets the global word total to zero.
  def init_total
    $redis.set redis_total_key, 0
  end

  # Total number of words trained across all categories.
  def total_words
    $redis.get(redis_total_key).to_i
  end

  # True when a counter exists for (category, word).
  def key_for?(category, word)
    $redis.exists(redis_key(category, word))
  end

  alias :has_word? :key_for?

  # Stores val (stringified) as the count for (category, word).
  def insert(category, word, val)
    $redis.set(redis_key(category, word), "#{val}")
  end

  # Integer count for (category, word), or nil when absent.
  def get(category, word)
    val = $redis.get redis_key(category, word)
    val.nil? ? nil : val.to_i
  end

  # Deletes the counter for (category, word).
  def remove(category, word)
    $redis.del redis_key(category, word)
  end

  # Atomically increments the (category, word) counter by count.
  def incr(category, word, count)
    $redis.incrby redis_key(category, word), count.to_i
  end

  # Atomically increments the global total by count.
  def incr_total(count)
    $redis.incrby redis_total_key, count.to_i
  end

  # Atomically decrements the (category, word) counter by count.
  # FIX: the original `def decr` took no parameters yet referenced
  # `category`, `word` and `count`, so every call raised NameError.
  # The signature now mirrors #incr.
  def decr(category, word, count)
    $redis.decrby redis_key(category, word), count.to_i
  end

  # Atomically decrements the global total by count.
  def decr_total(count)
    $redis.decrby redis_total_key, count.to_i
  end

  # Yields (category, counts_for_category) to an explicit block, or just
  # the category name otherwise, for each known category.
  def each(&block)
    @names.each do |category|
      if block_given?
        block.call(category, get_by_wild_keys(category))
      else
        yield category
      end
    end
  end

  # Key for a single (category, word) counter.
  def redis_key(category, word)
    "#{escape_lang}:#{escape_category(category)}:#{escape_word(word)}"
  end

  # Key holding the per-language total word count.
  def redis_total_key
    "redis_bayes_store_#{@lang}"
  end

  def escape_category(category)
    category.to_s.gsub(" ", "_").downcase
  end

  def escape_word(word)
    word.to_s.force_encoding('UTF-8')
  end

  def escape_lang
    @lang.to_s.downcase
  end

  # All counts stored under a category.
  # FIX: the original scanned "<category>:*", which can never match the
  # keys written by #redis_key ("<lang>:<category>:<word>") — include the
  # language prefix so the scan actually finds them.
  def get_by_wild_keys(category)
    $redis.keys("#{escape_lang}:#{escape_category(category)}:*").map do |key|
      get_by_key(key).to_i
    end
  end

  # FIX: the original ran eval() on whatever string came back from
  # Redis — an arbitrary-code-execution hole on stored data. Values are
  # always integer counts, so parse numerically instead.
  def get_by_key(key)
    val = $redis.get(key)
    val && val.to_i
  end
end
125
+ end
data/lib/classifier.rb ADDED
@@ -0,0 +1,31 @@
1
+ #--
2
+ # Copyright (c) 2005 Lucas Carlson
3
+ #
4
+ # Permission is hereby granted, free of charge, to any person obtaining
5
+ # a copy of this software and associated documentation files (the
6
+ # "Software"), to deal in the Software without restriction, including
7
+ # without limitation the rights to use, copy, modify, merge, publish,
8
+ # distribute, sublicense, and/or sell copies of the Software, and to
9
+ # permit persons to whom the Software is furnished to do so, subject to
10
+ # the following conditions:
11
+ #
12
+ # The above copyright notice and this permission notice shall be
13
+ # included in all copies or substantial portions of the Software.
14
+ #
15
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16
+ # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17
+ # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
18
+ # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
19
+ # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
20
+ # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
21
+ # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
22
+ #++
23
+ # Author:: Lucas Carlson (mailto:lucas@rufy.com)
24
+ # Copyright:: Copyright (c) 2005 Lucas Carlson
25
+ # License:: LGPL
26
+
27
+ require 'rubygems'
28
+ require 'classifier/extensions/string'
29
+ require 'classifier/bayes'
30
+ require 'classifier/lsi'
31
+ require 'classifier/redis_store'
@@ -0,0 +1,33 @@
1
+ require File.dirname(__FILE__) + '/../test_helper'
2
# Unit tests for the Bayes classifier: training, category management
# and end-to-end classification.
class BayesianTest < Test::Unit::TestCase
  # Fresh two-category classifier for every test.
  def setup
    @classifier = Classifier::Bayes.new 'Interesting', 'Uninteresting'
  end

  # Training an existing category works.
  def test_good_training
    assert_nothing_raised { @classifier.train_interesting "love" }
  end

  # Training an unknown category raises.
  def test_bad_training
    assert_raise(StandardError) { @classifier.train_no_category "words" }
  end

  # Completely unknown methods still raise NoMethodError.
  def test_bad_method
    assert_raise(NoMethodError) { @classifier.forget_everything_you_know "" }
  end

  # The constructor's categories are reported back.
  def test_categories
    expected = ['Interesting', 'Uninteresting']
    assert_equal expected.sort, @classifier.categories.sort
  end

  # Categories can be added after construction.
  def test_add_category
    @classifier.add_category 'Test'
    expected = ['Test', 'Interesting', 'Uninteresting']
    assert_equal expected.sort, @classifier.categories.sort
  end

  # After minimal training, classification picks the closer category.
  def test_classification
    @classifier.train_interesting "here are some good words. I hope you love them"
    @classifier.train_uninteresting "here are some bad words, I hate you"
    assert_equal 'Uninteresting', @classifier.classify("I hate bad words and you")
  end
end
metadata ADDED
@@ -0,0 +1,57 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: classifier_atsukamoto
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - Lucas Carlson
8
+ - Afonso Tsukamoto
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2013-12-13 00:00:00.000000000 Z
13
+ dependencies: []
14
+ description: Classifier with redis
15
+ email: atsukamoto@faber-ventures.com
16
+ executables: []
17
+ extensions: []
18
+ extra_rdoc_files: []
19
+ files:
20
+ - Rakefile
21
+ - lib/classifier/extensions/string.rb
22
+ - lib/classifier/extensions/vector.rb
23
+ - lib/classifier/extensions/vector_serialize.rb
24
+ - lib/classifier/extensions/word_hash.rb
25
+ - lib/classifier/lsi/content_node.rb
26
+ - lib/classifier/lsi/summary.rb
27
+ - lib/classifier/lsi/word_list.rb
28
+ - lib/classifier/bayes.rb
29
+ - lib/classifier/lsi.rb
30
+ - lib/classifier/redis_store.rb
31
+ - lib/classifier.rb
32
+ - test/bayes/bayesian_test.rb
33
+ homepage: http://rubygems.org/gems/classifier_atsukamoto
34
+ licenses:
35
+ - LGPL
36
+ metadata: {}
37
+ post_install_message:
38
+ rdoc_options: []
39
+ require_paths:
40
+ - lib
41
+ required_ruby_version: !ruby/object:Gem::Requirement
42
+ requirements:
43
+ - - '>='
44
+ - !ruby/object:Gem::Version
45
+ version: '0'
46
+ required_rubygems_version: !ruby/object:Gem::Requirement
47
+ requirements:
48
+ - - '>='
49
+ - !ruby/object:Gem::Version
50
+ version: '0'
51
+ requirements: []
52
+ rubyforge_project:
53
+ rubygems_version: 2.1.11
54
+ signing_key:
55
+ specification_version: 4
56
+ summary: Classifier with Redis
57
+ test_files: []