RubyGems - otherinbox-classifier - Versions diffs - 1.3.1.1 - Mend

otherinbox-classifier 1.3.1.1

Files changed (20) hide show

data/LICENSE +429 -0
data/README +88 -0
data/Rakefile +96 -0
data/bin/bayes.rb +36 -0
data/bin/summarize.rb +16 -0
data/lib/classifier.rb +30 -0
data/lib/classifier/bayes.rb +172 -0
data/lib/classifier/extensions/string.rb +16 -0
data/lib/classifier/extensions/vector.rb +106 -0
data/lib/classifier/extensions/vector_serialize.rb +20 -0
data/lib/classifier/extensions/word_hash.rb +154 -0
data/lib/classifier/lsi.rb +318 -0
data/lib/classifier/lsi/content_node.rb +72 -0
data/lib/classifier/lsi/summary.rb +31 -0
data/lib/classifier/lsi/word_list.rb +36 -0
data/test/bayes/bayesian_test.rb +33 -0
data/test/extensions/word_hash_test.rb +14 -0
data/test/lsi/lsi_test.rb +123 -0
data/test/test_helper.rb +4 -0
metadata +85 -0

data/lib/classifier/extensions/vector_serialize.rb ADDED

@@ -0,0 +1,20 @@
+module GSL
+  class Vector
+    def _dump(v)
+      Marshal.dump( self.to_a )
+    end
+    def self._load(arr)
+      arry = Marshal.load(arr)
+      return GSL::Vector.alloc(arry)
+    end
+  end
+  class Matrix
+     class <<self
+        alias :diag :diagonal
+     end
+  end
+end

data/lib/classifier/extensions/word_hash.rb ADDED

@@ -0,0 +1,154 @@
+# Author::    Lucas Carlson  (mailto:lucas@rufy.com)
+# Copyright:: Copyright (c) 2005 Lucas Carlson
+# License::   LGPL
+# These are extensions to the String class to provide convenience
+# methods for the Classifier package.
+class String
+  # Removes common punctuation symbols, returning a new string.
+  # E.g.,
+  #   "Hello (greeting's), with {braces} < >...?".without_punctuation
+  #   => "Hello  greetings   with  braces         "
+  def without_punctuation
+    tr( ',?.!;:"@#$%^&*()_=+[]{}\|<>/`~', " " ) .tr( "'\-", "")
+  end
+  # Return a Hash of strings => ints. Each word in the string is stemmed,
+  # interned, and indexes to its frequency in the document.
+	def word_hash
+		word_hash_for_words(gsub(/[^\w\s]/,"").split + gsub(/[\w]/," ").split)
+	end
+	# Return a word hash without extra punctuation or short symbols, just stemmed words
+	def clean_word_hash
+		word_hash_for_words gsub(/[^\w\s]/,"").split
+	end
+	private
+	def word_hash_for_words(words)
+		d = Hash.new
+		words.each do |word|
+			word.downcase! if word =~ /[\w]+/
+      # Don't stem words -- most sources I've read say that it's more hurtful than helpful and my tests found the same thing
+			#key = word.stem.intern
+      # Ignore words if they have no word chars, are in the skip list, all numbers or length <= 2
+			if word =~ /\w/ && word !~ /\d+/ && word.length > 2 && !CORPUS_SKIP_WORDS.include?(word)
+        key = word.intern
+				d[key] ||= 0
+				d[key] += 1
+			end
+		end
+		return d
+	end
+	CORPUS_SKIP_WORDS = [
+      # Hopefully all of the HTML is gone by this point but because broken HTML is out there
+      # we're not able to easily get rid of it all. This really messes up the classifier
+      # so we're doing a last stand and removing HTML artifacts here.
+      # TODO replace with a definitive and less haphazard list
+      "href",
+      "http",
+      "https",
+      "alt",
+      "coords",
+      "nbsp",
+      "target",
+      "com",
+      "net",
+      "org",
+      "shape",
+      "rect",
+      "apos",
+      "quot",
+      "bull",
+      "html",
+      "www",
+      # These are OI specific but are still showing up as a result of broken HTML
+      "otherinbox",
+      "beta",
+      "blank",
+      "a",
+      "again",
+      "all",
+      "along",
+      "are",
+      "also",
+      "an",
+      "and",
+      "as",
+      "at",
+      "but",
+      "by",
+      "came",
+      "can",
+      "cant",
+      "couldnt",
+      "did",
+      "didn",
+      "didnt",
+      "do",
+      "doesnt",
+      "dont",
+      "ever",
+      "first",
+      "from",
+      "have",
+      "her",
+      "here",
+      "him",
+      "how",
+      "i",
+      "if",
+      "in",
+      "into",
+      "is",
+      "isnt",
+      "it",
+      "itll",
+      "just",
+      "last",
+      "least",
+      "like",
+      "most",
+      "my",
+      "new",
+      "no",
+      "not",
+      "now",
+      "of",
+      "on",
+      "or",
+      "should",
+      "sinc",
+      "so",
+      "some",
+      "th",
+      "than",
+      "this",
+      "that",
+      "the",
+      "their",
+      "then",
+      "those",
+      "to",
+      "told",
+      "too",
+      "true",
+      "try",
+      "until",
+      "url",
+      "us",
+      "were",
+      "when",
+      "whether",
+      "while",
+      "with",
+      "within",
+      "yes",
+      "you",
+      "youll",
+      ].to_set
+end

data/lib/classifier/lsi.rb ADDED

@@ -0,0 +1,318 @@
+# Author::    David Fayram  (mailto:dfayram@lensmen.net)
+# Copyright:: Copyright (c) 2005 David Fayram II
+# License::   LGPL
+begin
+   raise LoadError if ENV['NATIVE_VECTOR'] == "true" # to test the native vector class, try `rake test NATIVE_VECTOR=true`
+   require 'gsl' # requires http://rb-gsl.rubyforge.org/
+   require 'classifier/extensions/vector_serialize'
+   $GSL = true
+rescue LoadError
+	warn "Notice: for 10x faster LSI support, please install http://rb-gsl.rubyforge.org/"
+	require 'classifier/extensions/vector'
+end
+require 'classifier/lsi/word_list'
+require 'classifier/lsi/content_node'
+require 'classifier/lsi/summary'
+module Classifier
+  # This class implements a Latent Semantic Indexer, which can search, classify and cluster
+  # data based on underlying semantic relations. For more information on the algorithms used,
+  # please consult Wikipedia[http://en.wikipedia.org/wiki/Latent_Semantic_Indexing].
+  class LSI
+    attr_reader :word_list
+    attr_accessor :auto_rebuild
+    # Create a fresh index.
+    # If you want to call #build_index manually, use
+    #      Classifier::LSI.new :auto_rebuild => false
+    #
+    def initialize(options = {})
+      @auto_rebuild = true unless options[:auto_rebuild] == false
+      @word_list, @items = WordList.new, {}
+      @version, @built_at_version = 0, -1
+    end
+    # Returns true if the index needs to be rebuilt.  The index needs
+    # to be built after all informaton is added, but before you start
+    # using it for search, classification and cluster detection.
+    def needs_rebuild?
+      (@items.keys.size > 1) && (@version != @built_at_version)
+    end
+    # Adds an item to the index. item is assumed to be a string, but
+    # any item may be indexed so long as it responds to #to_s or if
+    # you provide an optional block explaining how the indexer can
+    # fetch fresh string data. This optional block is passed the item,
+    # so the item may only be a reference to a URL or file name.
+    #
+    # For example:
+    #   lsi = Classifier::LSI.new
+    #   lsi.add_item "This is just plain text"
+    #   lsi.add_item "/home/me/filename.txt" { |x| File.read x }
+    #   ar = ActiveRecordObject.find( :all )
+    #   lsi.add_item ar, *ar.categories { |x| ar.content }
+    #
+    def add_item( item, *categories, &block )
+      clean_word_hash = block ? block.call(item).clean_word_hash : item.to_s.clean_word_hash
+      @items[item] = ContentNode.new(clean_word_hash, *categories)
+      @version += 1
+      build_index if @auto_rebuild
+    end
+    # A less flexible shorthand for add_item that assumes
+    # you are passing in a string with no categorries. item
+    # will be duck typed via to_s .
+    #
+    def <<( item )
+      add_item item
+    end
+    # Returns the categories for a given indexed items. You are free to add and remove
+    # items from this as you see fit. It does not invalide an index to change its categories.
+    def categories_for(item)
+      return [] unless @items[item]
+      return @items[item].categories
+    end
+    # Removes an item from the database, if it is indexed.
+    #
+    def remove_item( item )
+      if @items.keys.contain? item
+        @items.remove item
+        @version += 1
+      end
+    end
+    # Returns an array of items that are indexed.
+    def items
+      @items.keys
+    end
+    # Returns the categories for a given indexed items. You are free to add and remove
+    # items from this as you see fit. It does not invalide an index to change its categories.
+    def categories_for(item)
+      return [] unless @items[item]
+      return @items[item].categories
+    end
+    # This function rebuilds the index if needs_rebuild? returns true.
+    # For very large document spaces, this indexing operation may take some
+    # time to complete, so it may be wise to place the operation in another
+    # thread.
+    #
+    # As a rule, indexing will be fairly swift on modern machines until
+    # you have well over 500 documents indexed, or have an incredibly diverse
+    # vocabulary for your documents.
+    #
+    # The optional parameter "cutoff" is a tuning parameter. When the index is
+    # built, a certain number of s-values are discarded from the system. The
+    # cutoff parameter tells the indexer how many of these values to keep.
+    # A value of 1 for cutoff means that no semantic analysis will take place,
+    # turning the LSI class into a simple vector search engine.
+    def build_index( cutoff=0.75 )
+      return unless needs_rebuild?
+      make_word_list
+      doc_list = @items.values
+      tda = doc_list.collect { |node| node.raw_vector_with( @word_list ) }
+      if $GSL
+         tdm = GSL::Matrix.alloc(*tda).trans
+         ntdm = build_reduced_matrix(tdm, cutoff)
+         ntdm.size[1].times do |col|
+           vec = GSL::Vector.alloc( ntdm.column(col) ).row
+           doc_list[col].lsi_vector = vec
+           doc_list[col].lsi_norm = vec.normalize
+         end
+      else
+         tdm = Matrix.rows(tda).trans
+         ntdm = build_reduced_matrix(tdm, cutoff)
+         ntdm.row_size.times do |col|
+           doc_list[col].lsi_vector = ntdm.column(col) if doc_list[col]
+           doc_list[col].lsi_norm = ntdm.column(col).normalize  if doc_list[col]
+         end
+      end
+      @built_at_version = @version
+    end
+    # This method returns max_chunks entries, ordered by their average semantic rating.
+    # Essentially, the average distance of each entry from all other entries is calculated,
+    # the highest are returned.
+    #
+    # This can be used to build a summary service, or to provide more information about
+    # your dataset's general content. For example, if you were to use categorize on the
+    # results of this data, you could gather information on what your dataset is generally
+    # about.
+    def highest_relative_content( max_chunks=10 )
+       return [] if needs_rebuild?
+       avg_density = Hash.new
+       @items.each_key { |x| avg_density[x] = proximity_array_for_content(x).inject(0.0) { |x,y| x + y[1]} }
+       avg_density.keys.sort_by { |x| avg_density[x] }.reverse[0..max_chunks-1].map
+    end
+    # This function is the primitive that find_related and classify
+    # build upon. It returns an array of 2-element arrays. The first element
+    # of this array is a document, and the second is its "score", defining
+    # how "close" it is to other indexed items.
+    #
+    # These values are somewhat arbitrary, having to do with the vector space
+    # created by your content, so the magnitude is interpretable but not always
+    # meaningful between indexes.
+    #
+    # The parameter doc is the content to compare. If that content is not
+    # indexed, you can pass an optional block to define how to create the
+    # text data. See add_item for examples of how this works.
+    def proximity_array_for_content( doc, &block )
+      return [] if needs_rebuild?
+      content_node = node_for_content( doc, &block )
+      result =
+        @items.keys.collect do |item|
+          if $GSL
+             val = content_node.search_vector * @items[item].search_vector.col
+          else
+             val = (Matrix[content_node.search_vector] * @items[item].search_vector)[0]
+          end
+          [item, val]
+        end
+      result.sort_by { |x| x[1] }.reverse
+    end
+    # Similar to proximity_array_for_content, this function takes similar
+    # arguments and returns a similar array. However, it uses the normalized
+    # calculated vectors instead of their full versions. This is useful when
+    # you're trying to perform operations on content that is much smaller than
+    # the text you're working with. search uses this primitive.
+    def proximity_norms_for_content( doc, &block )
+      return [] if needs_rebuild?
+      content_node = node_for_content( doc, &block )
+      result =
+        @items.keys.collect do |item|
+          if $GSL
+            val = content_node.search_norm * @items[item].search_norm.col
+          else
+            val = (Matrix[content_node.search_norm] * @items[item].search_norm)[0]
+          end
+          [item, val]
+        end
+      result.sort_by { |x| x[1] }.reverse
+    end
+    # This function allows for text-based search of your index. Unlike other functions
+    # like find_related and classify, search only takes short strings. It will also ignore
+    # factors like repeated words. It is best for short, google-like search terms.
+    # A search will first priortize lexical relationships, then semantic ones.
+    #
+    # While this may seem backwards compared to the other functions that LSI supports,
+    # it is actually the same algorithm, just applied on a smaller document.
+    def search( string, max_nearest=3 )
+      return [] if needs_rebuild?
+      carry = proximity_norms_for_content( string )
+      result = carry.collect { |x| x[0] }
+      return result[0..max_nearest-1]
+    end
+    # This function takes content and finds other documents
+    # that are semantically "close", returning an array of documents sorted
+    # from most to least relavant.
+    # max_nearest specifies the number of documents to return. A value of
+    # 0 means that it returns all the indexed documents, sorted by relavence.
+    #
+    # This is particularly useful for identifing clusters in your document space.
+    # For example you may want to identify several "What's Related" items for weblog
+    # articles, or find paragraphs that relate to each other in an essay.
+    def find_related( doc, max_nearest=3, &block )
+      carry =
+        proximity_array_for_content( doc, &block ).reject { |pair| pair[0] == doc }
+      result = carry.collect { |x| x[0] }
+      return result[0..max_nearest-1]
+    end
+    # This function uses a voting system to categorize documents, based on
+    # the categories of other documents. It uses the same logic as the
+    # find_related function to find related documents, then returns the
+    # most obvious category from this list.
+    #
+    # cutoff signifies the number of documents to consider when clasifying
+    # text. A cutoff of 1 means that every document in the index votes on
+    # what category the document is in. This may not always make sense.
+    #
+    def classify( doc, cutoff=0.30, &block )
+      icutoff = (@items.size * cutoff).round
+      carry = proximity_array_for_content( doc, &block )
+      carry = carry[0..icutoff-1]
+      votes = {}
+      carry.each do |pair|
+        categories = @items[pair[0]].categories
+        categories.each do |category|
+          votes[category] ||= 0.0
+          votes[category] += pair[1]
+        end
+      end
+      ranking = votes.keys.sort_by { |x| votes[x] }
+      return ranking[-1]
+    end
+    # Prototype, only works on indexed documents.
+    # I have no clue if this is going to work, but in theory
+    # it's supposed to.
+    def highest_ranked_stems( doc, count=3 )
+      raise "Requested stem ranking on non-indexed content!" unless @items[doc]
+      arr = node_for_content(doc).lsi_vector.to_a
+      top_n = arr.sort.reverse[0..count-1]
+      return top_n.collect { |x| @word_list.word_for_index(arr.index(x))}
+    end
+    private
+    def build_reduced_matrix( matrix, cutoff=0.75 )
+      # TODO: Check that M>=N on these dimensions! Transpose helps assure this
+      u, v, s = matrix.SV_decomp
+      # TODO: Better than 75% term, please. :\
+      s_cutoff = s.sort.reverse[(s.size * cutoff).round - 1]
+      s.size.times do |ord|
+        s[ord] = 0.0 if s[ord] < s_cutoff
+      end
+      # Reconstruct the term document matrix, only with reduced rank
+      u * Matrix.diag( s ) * v.trans
+    end
+    def node_for_content(item, &block)
+      if @items[item]
+        return @items[item]
+      else
+        clean_word_hash = block ? block.call(item).clean_word_hash : item.to_s.clean_word_hash
+        cn = ContentNode.new(clean_word_hash, &block) # make the node and extract the data
+        unless needs_rebuild?
+          cn.raw_vector_with( @word_list ) # make the lsi raw and norm vectors
+        end
+      end
+      return cn
+    end
+    def make_word_list
+      @word_list = WordList.new
+      @items.each_value do |node|
+        node.word_hash.each_key { |key| @word_list.add_word key }
+      end
+    end
+  end
+end