rjspotter-basset 1.0.5

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,93 @@
1
+ # This file contains extensions to built in Ruby classes.
2
+
3
+ require 'rubygems'
4
+ require 'stemmer'
5
+
6
# Extensions to the array class.
class Array
  # Returns a new array that contains everything except the first element
  # of this one (just like "rest"/cdr in lisp). Returns self when empty.
  def rest
    return self if empty?
    slice(1, size)
  end

  # Returns the second item in the array (nil if there is none).
  def second
    self[1]
  end

  # Returns a random item from the array (nil when empty).
  # Delegates to the built-in Array#sample instead of indexing with rand.
  def pick_random
    sample
  end

  # Returns a new array with the elements in random order.
  # Uses the built-in Array#shuffle instead of sort_by { rand }.
  def randomize
    shuffle
  end

  # Sums the elements, starting from 0.
  # NOTE(review): this shadows the built-in Array#sum on Ruby >= 2.4;
  # kept for backward compatibility with older Rubies this gem targeted.
  def sum
    inject(0) { |total, value| total + value }
  end

  # Randomizes the array in place and returns self.
  def randomize!
    replace(randomize)
  end
end
38
+
39
class Float
  # Returns the float as a string truncated (not rounded) to exactly
  # +decimal_places+ digits, e.g. 3.14159.to_s_decimal_places(2) => "3.14".
  #
  # Note: like the original implementation, this raises NoMethodError (nil[0])
  # when the float's string form has fewer decimal digits than requested.
  def to_s_decimal_places(decimal_places)
    # Bug fix: the old code built the pattern from the string "[0-9]*\.",
    # but in a double-quoted Ruby string "\." is just ".", so the regex dot
    # matched ANY character. Use a real regex with an escaped dot.
    pattern = /[0-9]*\.[0-9]{#{decimal_places}}/
    to_s.match(pattern)[0]
  end
end
46
+
47
class Symbol
  # Backport of Symbol#to_proc for very old Rubies (pre 1.8.7) that
  # lack it. On modern Rubies the guard is false and the built-in
  # definition is left untouched.
  unless public_method_defined? :to_proc
    def to_proc
      proc { |*args| args.shift.__send__(self, *args) }
    end
  end
end
54
+
55
# Extensions to the string class.
# We're just including the stemmable module into string. This adds the .stem method.
# Stemmable comes from the `stemmer` gem required at the top of this file;
# presumably a Porter stemmer (e.g. "running" -> "run") — confirm against the gem.
class String
  include Stemmable
end
60
+
61
module Math

  # Population variance, computed in a single pass with Welford's
  # online algorithm (numerically stable running mean).
  # accepts: an enumerable of numbers
  # returns: the variance as a Float
  def variance(population)
    count = 0
    running_mean = 0.0
    aggregate = 0.0
    population.each do |value|
      count += 1
      delta = value - running_mean
      running_mean += delta / count
      aggregate += delta * (value - running_mean)
    end
    # if you want to calculate std deviation
    # of a sample change this to "aggregate / (count - 1)"
    aggregate / count
  end

  # calculate the standard deviation of a population
  # accepts: an array, the population
  # returns: the standard deviation
  def stddev(population)
    sqrt(variance(population))
  end

  # Arithmetic mean of the population, always as a Float.
  def avg(pop)
    pop.inject(0) { |total, value| total + value }.to_f / pop.count
  end

  module_function :variance, :avg, :stddev

end
@@ -0,0 +1,84 @@
1
+ require 'uri'
2
+
3
+ module Basset
4
+
5
# A class for representing a document as a vector of features. It takes the text
# of the document and the classification. The vector of features representation is
# just a basic bag of words approach.
class Document
  attr_reader :text, :classification

  # initialize the object with document text. Set an explicit classification
  # to use the document as training data.
  def initialize(text, classification = nil)
    @text = text
    @classification = classification
    @tokens = stemmed_words
  end

  # returns an array of feature (token) vectors, which are instances of Feature
  def vector_of_features
    @feature_vector ||= vector_of_features_from_terms_hash(terms_hash_from_words_array(@tokens))
  end

  # Alias for #vector_of_features
  def feature_vectors
    vector_of_features
  end

  private

  # returns a hash with each word as a key and the value is the number of times
  # the word appears in the passed in words array
  def terms_hash_from_words_array(words)
    words.inject(Hash.new(0)) do |counts, term|
      counts[term] += 1
      counts
    end
  end

  # Turns a term => frequency hash into an array of Feature objects.
  def vector_of_features_from_terms_hash(terms)
    terms.map { |term, frequency| Feature.new(term, frequency) }
  end

  # Stemmed, lower-cased tokens of the cleaned document text.
  # (String#stem comes from the stemmer gem.)
  def stemmed_words
    words.map { |word| word.stem.downcase }
  end

  # Whitespace-separated tokens of the cleaned text.
  def words
    clean_text.split(" ")
  end

  # Remove punctuation, numbers and symbols
  def clean_text
    text.tr("'@_", '').gsub(/\W/, ' ').gsub(/[0-9]/, '')
  end

end
64
+
65
#
# Subclass of Document intended to be used to classify URIs
class UriDocument < Document

  # uri: the URI string to represent; classification as in Document.
  # Tokens are URI components rather than stemmed words.
  def initialize(uri, classification = nil)
    @text = uri
    @classification = classification
    @tokens = uri_tokens
  end

  # Bag-of-words feature vector over the URI tokens (no stemming).
  def vector_of_features
    @feature_vector ||= vector_of_features_from_terms_hash(terms_hash_from_words_array(@tokens))
  end

  # Percent-decodes the URI and splits it on common URI delimiters
  # (&, ?, \, /, =, [, ], ., ..), keeping each delimiter as its own token.
  def uri_tokens
    # Bug fix: URI.decode was deprecated and removed in Ruby 3.0.
    # URI::DEFAULT_PARSER.unescape performs the same plain percent-decoding
    # (unlike CGI.unescape, it does not convert "+" to a space).
    decoded = URI::DEFAULT_PARSER.unescape(@text)
    decoded.gsub(/(\&|\?|\\\\|\\|\/\/|\/|\=|\[|\]|\.\.|\.)/) { |delim| " " + delim + " " }.split
  end

end
83
+
84
+ end
@@ -0,0 +1,11 @@
1
module Basset
  # This class is an example for how to do custom document representations. In this
  # example, I change the way text is cleaned and don't stem the words. It would also
  # be easy to put in additional hard coded features.
  # The important thing to note is that the new document class only needs one function: vector_of_features
  class DocumentOverrideExample < Document
    # Bag-of-words features over the raw text split on non-word characters
    # and whitespace — no stemming, no digit stripping. Memoized.
    def vector_of_features
      return @vector_of_features if @vector_of_features

      tokens = text.gsub(/\W/, ' ').split(' ')
      @vector_of_features = vector_of_features_from_terms_hash(terms_hash_from_words_array(tokens))
    end
  end
end
@@ -0,0 +1,26 @@
1
module Basset

  # A class to hold a feature which consists of a name and a value. In the basic sense
  # of document classification the name would be the word and the value would be the
  # number of times that word appeared in the document.
  class Feature
    # The class already defined <=>; including Comparable makes the ordering
    # usable via <, <=, >, >= and between? as well (backward compatible).
    include Comparable

    attr_accessor :name, :value

    # name:  the feature identifier (e.g. a word)
    # value: its weight/count (defaults to 0)
    def initialize(name, value = 0)
      @name = name
      @value = value
    end

    # Orders by name first, then by value.
    def <=>(other)
      ret = name <=> other.name
      ret = value <=> other.value if ret.zero?
      ret
    end

    # Features are equal when both name and value match.
    def ==(other)
      name == other.name && value == other.value
    end
  end
end
@@ -0,0 +1,52 @@
1
+ require File.join(File.dirname(__FILE__), "yaml_serialization")
2
+
3
module Basset

  # Extracts features from a document. On initialization it expects the set of features that
  # are to be extracted from documents. The extracted features will just be numbered in
  # ascending order. This makes it easy to output feature sets for libraries like svmlight.
  class FeatureExtractor
    include YamlSerialization

    # feature_names: an array of feature names. These are the features that
    # will be extracted from documents (all others are ignored); each is
    # assigned a 1-based integer identifier in the order given.
    def initialize(feature_names)
      @feature_names = {}
      feature_names.each_with_index do |name, index|
        @feature_names[name] = index + 1
      end
    end

    # Number of features this extractor knows about.
    def number_of_features
      @feature_names.size
    end

    # returns an array of features, but with their names replaced with an integer identifier,
    # sorted in ascending identifier order. This is a generic representation that works
    # well with other machine learning packages like svm_light.
    def extract_numbered(document)
      extract(document).map do |feature|
        Feature.new(@feature_names[feature.name], feature.value)
      end.sort
    end

    # just returns the features from the document that the extractor is interested in
    def extract(document)
      document.vector_of_features.select { |feature| @feature_names[feature.name] }
    end

    # def extract_with_duplicate_removal(document)
    #   features = extract(document)
    #   # kept from original (commented out): remove unigram features that are
    #   # duplicated inside bigram (FOO_AND_BAR) features before returning.
    # end

  end
end
@@ -0,0 +1,126 @@
1
module Basset

  # This class is the feature selector. All documents in the training set should be added
  # to the selector. Once they are in, a number of features may be selected based on the
  # chi square value. When in doubt just call feature_with_chi_value_greater_than with an
  # empty hash. It will return all features that have at least some statistical significance
  # and occur in more than one document.
  class FeatureSelector
    # docs: total number of documents added so far.
    attr_reader :docs

    def initialize
      @docs = 0
      # classification => number of documents with that classification
      @docs_in_class = Hash.new(0)
      # feature name => FeatureValues (per-class document counts); the block
      # form gives each new key its OWN FeatureValues instance.
      @features = Hash.new { |h, k| h[k] = FeatureValues.new }
    end

    # Adds a document to the feature selector. The document should respond_to a
    # method vector_of_features which returns a vector of unique features.
    # (Counts are per-document: each feature counts once per document it appears in.)
    def add_document(document)
      @docs += 1
      @docs_in_class[document.classification] += 1

      document.vector_of_features.each do |feature|
        @features[feature.name].add_document_with_class(document.classification)
      end
    end

    # returns all features, regardless of chi_square or frequency
    def all_feature_names
      @features.keys
    end

    # Number of distinct feature names seen so far.
    def number_of_features
      @features.size
    end

    # returns an array of the best features for a given classification
    # (top +count+ names by chi-square, minimum chi value 1.0).
    def best_features(count = 10, classification = nil)
      select_features(1.0, classification).first(count)
    end

    # Returns one Feature per known feature name whose value is the
    # chi-square score of that feature for +classification+.
    def features_with_chi(classification)
      @features.keys.map do |feature_name|
        Feature.new(feature_name, chi_squared(feature_name, classification))
      end
    end

    # returns an array of feature NAMES that have a minimum or better chi_square
    # value and occur in more than one document, sorted by descending chi-square.
    def select_features(chi_value = 1.0, classification = nil)
      # Default to the first classification seen if none is given.
      classification ||= @docs_in_class.keys.first

      selected_features = features_with_chi(classification).select do |feature|
        (docs_with_feature(feature.name) > 1) && (feature.value >= chi_value)
      end

      selected_features.sort_by(&:value).reverse.collect(&:name)
    end

    private

    # Contingency-table helpers for the chi-square computation. NOTE(review):
    # parameter order is NOT uniform — docs_with_class_and_not_feature takes
    # (classification, feature_name) while the others take
    # (feature_name, classification). Take care when calling.

    # N(feature present, class matches)
    def docs_with_feature_and_class(feature_name, classification)
      @features[feature_name].docs_with_class(classification)
    end

    # N(feature present, class differs)
    def docs_with_feature_and_not_class(feature_name, classification)
      @features[feature_name].docs_with_feature - @features[feature_name].docs_with_class(classification)
    end

    # N(feature absent, class matches) — note the reversed parameter order.
    def docs_with_class_and_not_feature(classification, feature_name)
      @docs_in_class[classification] - @features[feature_name].docs_with_class(classification)
    end

    # N(feature absent, class differs)
    def docs_without_feature_or_class(feature_name, classification)
      @docs - @docs_in_class[classification] - docs_with_feature_and_not_class(feature_name, classification)
    end

    # Number of documents containing the feature, any class.
    def docs_with_feature(feature_name)
      @features[feature_name].docs_with_feature
    end

    # Number of documents with the given classification.
    def docs_with_class(classification)
      @docs_in_class[classification]
    end

    # Returns the chi_squared value for this feature with the passed classification
    # This is formula 13.14 on page 215 of An Introduction to Information Retrieval by
    # Christopher D. Manning, Prabhakar Raghavan and Hinrich Schütze.
    def chi_squared(feature_name, classification)
      chi_squared_algo(
        docs_with_feature_and_class(feature_name, classification),
        docs_with_class_and_not_feature(classification, feature_name),
        docs_with_feature_and_not_class(feature_name, classification),
        docs_without_feature_or_class(feature_name, classification)
      )
    end

    # o11/o10/o01/o00 are the observed counts of the 2x2 contingency table
    # (feature present/absent x class matches/differs).
    def chi_squared_algo(o11, o10, o01, o00)
      denominator = ((o11 + o01) * (o11 + o10) * (o10 + o00) * (o01 + o00))
      numerator = ((o11 + o10 + o01 + o00) * ((o11 * o00 - o10 * o01)**2))
      # Checking zero to avoid producing Infinity
      denominator.zero? ? 0.0 : numerator.to_f / denominator.to_f
    end

    # A class to hold the values associated with a feature. These values are
    # important for feature selection.
    class FeatureValues
      # Total number of documents this feature appeared in.
      attr_accessor :docs_with_feature

      def initialize()
        # classification => number of documents of that class containing the feature
        @classes = Hash.new(0)
        @docs_with_feature = 0
      end

      # Record one more document of +classification+ containing this feature.
      def add_document_with_class(classification)
        @classes[classification] += 1
        @docs_with_feature += 1
      end

      # Number of documents of +classification+ containing this feature.
      def docs_with_class(classification)
        @classes[classification]
      end

    end

  end
end
@@ -0,0 +1,151 @@
1
+ require File.join(File.dirname(__FILE__), "yaml_serialization")
2
+
3
module Basset

  # A class for running Naive Bayes classification.
  # Documents are added to the classifier. Once they are added
  # it can be used to classify new documents.
  class NaiveBayes
    include YamlSerialization

    attr_reader :total_docs, :total_docs_in_class, :feature_counts

    def initialize
      @total_docs = 0
      # classification => number of training documents of that class
      @total_docs_in_class = Hash.new(0)
      # feature name => FeatureCount (per-class occurrence totals)
      @feature_counts = {}
      # classification => cached sum of all feature counts for that class;
      # invalidated whenever a new document is added.
      @occurrences_of_all_features_in_class = {}
    end

    # takes a classification which can be a string and
    # a vector of features.
    def add_document(classification, feature_vector)
      reset_cached_probabilities

      @total_docs_in_class[classification] += 1
      @total_docs += 1

      feature_vector.each do |feature|
        @feature_counts[feature.name] ||= FeatureCount.new(feature.name)
        @feature_counts[feature.name].add_count_for_class(feature.value, classification)
      end
    end

    # All classifications seen in training, in insertion order.
    def classes
      @total_docs_in_class.keys
    end

    # returns the most likely class given a vector of features, as a
    # [log_probability, classification] pair.
    # Pass :normalize_classes => false to skip weighting by class priors.
    def classify(feature_vectors, opts={:normalize_classes=>true})
      class_probabilities = []

      classes.each do |classification|
        class_probability = 0
        class_probability += Math.log10(probability_of_class(classification)) if opts[:normalize_classes]
        class_probability += probability_of_vectors_for_class(feature_vectors, classification)
        class_probabilities << [class_probability, classification]
      end

      # this next bit picks a random item first
      # this covers the case that all the class probabilities are equal and we need to randomly select a class
      # (pick_random is this gem's Array extension, defined in its core-class extensions file)
      max = class_probabilities.pick_random
      class_probabilities.each do |cp|
        max = cp if cp.first > max.first
      end
      max
    end

    #
    # Gives a score for probability of _feature_vector_ being in
    # class _classification_.
    #
    # This score can be normalized to the number of feature vectors by passing
    # :normalize => true for the third argument.
    #
    # Score is not normalized for the relatives probabilities of each class.
    def probability_of_vectors_for_class(feature_vectors, classification, opts={:normalize=>false})
      probability = 0
      feature_vectors.each do |feature_vector|
        probability += probability_of_vector_for_class(feature_vector, classification)
      end
      if opts[:normalize]
        probability / feature_vectors.count.to_f
      else
        probability
      end
    end

    # returns the probability (log10) of a feature given the class.
    # The +0.1 is additive smoothing so unseen features never yield log10(0).
    def probability_of_vector_for_class(feature_vector, classification)
      # the reason the rescue 0 is in there is tricky
      # because of the removal of redundant unigrams, it's possible that one of the features is never used/initialized
      # (i.e. @feature_counts has no entry for this name, so count_for_class would be called on nil)
      decimal_probability = (((@feature_counts[feature_vector.name].count_for_class(classification) rescue 0) + 0.1)/ occurrences_of_all_features_in_class(classification).to_f) * feature_vector.value
      Math.log10(decimal_probability)
    end

    # The sum total of times all features occurs for a given class.
    # Cached per classification until the next add_document call.
    def occurrences_of_all_features_in_class(classification)
      # return the cached value, if there is one
      return @occurrences_of_all_features_in_class[classification] if @occurrences_of_all_features_in_class[classification]

      @feature_counts.each_value do |feature_count|
        @occurrences_of_all_features_in_class[classification] ||= 0
        @occurrences_of_all_features_in_class[classification] += feature_count.count_for_class(classification)
      end
      @occurrences_of_all_features_in_class[classification]
    end

    # Classifiers are equal when trained on identical data (same totals
    # and identical per-feature counts).
    def ==(other)
      other.is_a?(self.class) && other.total_docs == total_docs &&
        other.total_docs_in_class == total_docs_in_class && other.feature_counts == feature_counts
    end

    private

    # probabilities are cached when the classification is run. This method resets
    # the cached probabities.
    def reset_cached_probabilities
      @occurrences_of_all_features_in_class.clear
    end

    # returns the probability of a given class (its prior: fraction of
    # training documents with that classification)
    def probability_of_class(classification)
      @total_docs_in_class[classification] / @total_docs.to_f
    end

    # A class to store feature counts
    class FeatureCount
      attr_reader :classes, :name

      # Optionally seeds an initial count for one classification.
      def initialize(feature_name=nil, classification=nil, count=0)
        @name, @classes = feature_name, {}
        add_count_for_class(count, classification) if classification
      end

      # Add +count+ occurrences of this feature under +classification+.
      def add_count_for_class(count, classification)
        @classes[classification] ||= 0
        @classes[classification] += count
      end

      # Occurrences of this feature in the given class (0 if never seen).
      def count_for_class(classification)
        #@classes[classification] || 1 um, what?
        @classes[classification] || 0
      end

      # Total occurrences across all classes.
      # (Relies on Array#sum — built-in on Ruby >= 2.4, and also defined by
      # this gem's Array extensions.)
      def count
        @classes.values.sum
      end

      def ==(other)
        other.kind_of?(FeatureCount) && other.classes == @classes && other.name == @name
      end

      # Compact one-line representation; pass :verbose => true for the default
      # Object#inspect output. NOTE(review): non-standard — Ruby's own inspect
      # takes no arguments.
      def inspect(opts={:verbose=>false})
        return super if opts[:verbose]
        "#<FeatureCount for ``" + @name.to_s + "''" + " --> " + @classes.inspect + " > "
      end

    end

  end
end