danielsdeleo-basset 1.0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,93 @@
1
+ # This file contains extensions to built in Ruby classes.
2
+
3
+ require 'rubygems'
4
+ require 'stemmer'
5
+
6
# Extensions to the array class.
class Array
  # Returns a new array holding everything after the first element, like
  # lisp's cdr. An empty array is returned unchanged (same object).
  def rest
    empty? ? self : drop(1)
  end

  # Returns the element at index 1 (nil if the array is shorter than that).
  def second
    at(1)
  end

  # Returns an element chosen uniformly at random (nil for an empty array).
  def pick_random
    self[rand(size)]
  end

  # Returns a new array with the elements shuffled into random order.
  def randomize
    sort_by { rand }
  end

  # Sums the elements, starting from 0 (so an empty array sums to 0).
  def sum
    inject(0) { |total, element| total + element }
  end

  # Shuffles the array in place; returns self.
  def randomize!
    replace(randomize)
  end
end
38
+
39
class Float
  # Returns this float's decimal string truncated (not rounded) to
  # +decimal_places+ digits after the decimal point. The decimal point is
  # always included, matching the historic behaviour
  # (e.g. 3.0.to_s_decimal_places(0) => "3.").
  #
  # Fixes over the original regex-based version:
  # * the '.' in the pattern was unescaped (matched any character), which
  #   mis-truncated negative numbers (-1.55 with 1 place returned "-1");
  # * asking for more places than Float#to_s produced raised NoMethodError
  #   on the nil match — now the fraction is zero-padded instead.
  #
  # NOTE(review): values whose #to_s is scientific notation (e.g. 1.0e+20)
  # were garbage before and are still not meaningfully supported.
  def to_s_decimal_places(decimal_places)
    whole, fraction = to_s.split(".", 2)
    fraction = (fraction || "").ljust(decimal_places, "0")
    "#{whole}.#{fraction[0, decimal_places]}"
  end
end
46
+
47
# Backport of Symbol#to_proc for Rubies that predate it in core (added in
# Ruby 1.8.7). Enables the `collection.map(&:method_name)` shorthand.
class Symbol
  # Only define the shim when the runtime lacks a native (faster) version,
  # so we never clobber the built-in implementation.
  unless public_method_defined? :to_proc
    # Builds a Proc that sends this symbol to its first argument, forwarding
    # any remaining arguments, e.g. :+.to_proc.call(1, 2) == 3.
    def to_proc
      Proc.new { |*args| args.shift.__send__(self, *args) }
    end
  end
end
54
+
55
# Extensions to the String class.
# We're just including the Stemmable module (provided by the `stemmer` gem
# required at the top of this file) into String. This adds the String#stem
# method, which presumably reduces a word to its stem — see the stemmer gem
# docs for the exact algorithm.
class String
  include Stemmable
end
60
+
61
module Math

  # Population variance of a collection of numbers, computed in a single
  # pass with Welford's online algorithm for numerical stability.
  # (To get the variance of a *sample*, the final division would use
  # count - 1 instead of count.)
  def variance(population)
    count = 0
    running_mean = 0.0
    aggregate = 0.0
    population.each do |value|
      count += 1
      delta = value - running_mean
      running_mean += delta / count
      aggregate += delta * (value - running_mean)
    end
    aggregate / count
  end

  # Standard deviation of a population.
  # accepts: an array, the population
  # returns: the standard deviation (square root of the population variance)
  def stddev(population)
    sqrt(variance(population))
  end

  # Arithmetic mean of the collection, always as a Float.
  def avg(pop)
    total = pop.inject(0) { |running_total, value| running_total + value }
    total.to_f / pop.count.to_f
  end

  module_function :variance, :avg, :stddev

end
@@ -0,0 +1,84 @@
1
+ require 'uri'
2
+
3
module Basset

  # A class for representing a document as a vector of features. It takes the
  # text of the document and an optional classification. The vector-of-features
  # representation is a basic bag-of-words approach.
  class Document
    attr_reader :text, :classification

    # Initialize with the document text. Set an explicit classification to use
    # the document as training data.
    def initialize(text, classification = nil)
      @text, @classification = text, classification
      @tokens = stemmed_words
    end

    # Returns an array of feature (token) vectors — instances of Feature.
    # Memoized so the terms are only counted once.
    def vector_of_features
      @feature_vector ||= vector_of_features_from_terms_hash(terms_hash_from_words_array(@tokens))
    end

    # Alias for #vector_of_features.
    def feature_vectors
      vector_of_features
    end

    private

    # Returns a hash mapping each word to the number of times it appears in
    # the passed-in words array.
    def terms_hash_from_words_array(words)
      terms = Hash.new(0)
      words.each do |term|
        terms[term] += 1
      end
      terms
    end

    # Turns a {term => frequency} hash into an array of Feature objects.
    def vector_of_features_from_terms_hash(terms)
      terms.collect do |term, frequency|
        Feature.new(term, frequency)
      end
    end

    # Downcased, stemmed tokens (String#stem comes from the stemmer gem).
    def stemmed_words
      words.map { |w| w.stem.downcase }
    end

    # Whitespace-split tokens of the cleaned text.
    def words
      clean_text.split(" ")
    end

    # Remove punctuation, numbers and symbols: apostrophes, '@' and '_' are
    # stripped outright, every other non-word character becomes a space, and
    # digits are deleted.
    def clean_text
      text.tr("'@_", '').gsub(/\W/, ' ').gsub(/[0-9]/, '')
    end

  end

  # Subclass of Document intended to be used to classify URIs. The raw URI is
  # percent-decoded and split on URI delimiter characters instead of being
  # cleaned and stemmed like free text.
  class UriDocument < Document

    def initialize(uri, classification = nil)
      @text, @classification = uri, classification
      @tokens = uri_tokens
    end

    def vector_of_features
      @feature_vector ||= vector_of_features_from_terms_hash(terms_hash_from_words_array(@tokens))
    end

    # Percent-decodes the URI and splits it into tokens, keeping the URI
    # delimiter characters themselves as separate tokens.
    def uri_tokens
      # URI.decode was deprecated and removed in Ruby 3.0;
      # URI::DEFAULT_PARSER.unescape performs the same plain %XX decoding.
      URI::DEFAULT_PARSER.unescape(@text).gsub(/(\&|\?|\\\\|\\|\/\/|\/|\=|\[|\]|\.\.|\.)/) { |char| " " + char + " " }.split
    end

  end

end
@@ -0,0 +1,11 @@
1
module Basset
  # This class is an example of how to do custom document representations. In
  # this example the text cleaning is changed and the words are not stemmed.
  # It would also be easy to put in additional hard-coded features.
  # The important thing to note is that a custom document class only needs one
  # method: vector_of_features.
  class DocumentOverrideExample < Document
    # Bag-of-words features over raw (unstemmed) tokens; memoized.
    def vector_of_features
      return @vector_of_features if @vector_of_features

      tokens = text.gsub(/\W/, ' ').split(' ')
      @vector_of_features = vector_of_features_from_terms_hash(terms_hash_from_words_array(tokens))
    end
  end
end
@@ -0,0 +1,26 @@
1
module Basset

  # A class to hold a feature, which consists of a name and a value. In the
  # basic sense of document classification the name would be the word and the
  # value would be the number of times that word appeared in the document.
  class Feature
    # Comparable supplies <, <=, >, >=, between? on top of <=> below.
    include Comparable

    attr_accessor :name, :value

    # name  - the feature's identifier (typically a token String)
    # value - the feature's magnitude (typically a term count), default 0
    def initialize(name, value = 0)
      @name = name
      @value = value
    end

    # Orders features by name, breaking ties on value.
    def <=>(other)
      ret = self.name <=> other.name
      ret = self.value <=> other.value if ret.zero?
      ret
    end

    # Two features are equal when both name and value match.
    def ==(other)
      ret = self.name == other.name
      ret = self.value == other.value if ret
      ret
    end

    # Keep Hash-key equality consistent with #== (Ruby requires eql?/hash to
    # agree for objects used as hash keys).
    alias_method :eql?, :==

    def hash
      [name, value].hash
    end
  end
end
@@ -0,0 +1,52 @@
1
+ require File.join(File.dirname(__FILE__), "yaml_serialization")
2
+
3
module Basset

  # Extracts features from a document. On initialization it expects the set of
  # features that are to be extracted from documents. The extracted features
  # are numbered in ascending order, which makes it easy to output feature
  # sets for libraries like svmlight.
  class FeatureExtractor
    include YamlSerialization

    # Takes an array of feature names — the only features that will be
    # extracted from documents; all others are ignored. Each name is assigned
    # a stable 1-based integer identifier in the order given.
    def initialize(feature_names)
      @feature_names = {}
      feature_names.each_with_index do |feature_name, index|
        @feature_names[feature_name] = index + 1
      end
    end

    # Number of features this extractor knows about.
    def number_of_features
      @feature_names.size
    end

    # Returns the document's interesting features with their names replaced by
    # the integer identifier, sorted in ascending identifier order. This is a
    # generic representation that works well with other machine learning
    # packages like svm_light.
    def extract_numbered(document)
      numbered = extract(document).map do |feature|
        Feature.new(@feature_names[feature.name], feature.value)
      end
      numbered.sort
    end

    # Just the features from the document that this extractor is interested in.
    def extract(document)
      document.vector_of_features.select { |feature| @feature_names[feature.name] }
    end

    # def extract_with_duplicate_removal(document)
    #   features = extract(document)
    #   # # now remove the unigrams that dupe bigram features
    #   # # first grab an array of the bigram ones
    #   # bigram_features = []
    #   # sorted_features.each {|feature| bigram_features << feature if feature.name =~ /.*_AND_.*/}
    #   # # now remove all the ones that have a match in the bigram features
    #   # sorted_features.each_with_index do |feature, index|
    #   #   sorted_features.delete_at(index) if (feature.name !~ /_AND_/ and bigram_features.detect {|bf| bf.name =~ /^#{feature.name}_|_#{feature.name}$/})
    #   # end
    # end

  end
end
@@ -0,0 +1,126 @@
1
module Basset

  # The feature selector. All documents in the training set should be added to
  # the selector; afterwards, features may be selected by chi-square value.
  # When in doubt, call select_features with the defaults — it returns every
  # feature that has at least some statistical significance and occurs in more
  # than one document.
  class FeatureSelector
    attr_reader :docs

    def initialize
      @docs = 0
      @docs_in_class = Hash.new(0)
      # Per-key default so each feature gets its own counter object.
      @features = Hash.new { |hash, feature_name| hash[feature_name] = FeatureValues.new }
    end

    # Adds a document to the feature selector. The document must respond to
    # #vector_of_features (returning unique features) and #classification.
    def add_document(document)
      @docs += 1
      @docs_in_class[document.classification] += 1

      document.vector_of_features.each do |feature|
        @features[feature.name].add_document_with_class(document.classification)
      end
    end

    # All feature names seen so far, regardless of chi-square or frequency.
    def all_feature_names
      @features.keys
    end

    def number_of_features
      @features.size
    end

    # The +count+ best-scoring feature names for a classification.
    def best_features(count = 10, classification = nil)
      select_features(1.0, classification).first(count)
    end

    # One Feature per known name, valued with its chi-square score against
    # +classification+.
    def features_with_chi(classification)
      @features.keys.collect do |feature_name|
        Feature.new(feature_name, chi_squared(feature_name, classification))
      end
    end

    # Names of features scoring at least +chi_value+ that occur in more than
    # one document, ordered from most to least significant.
    def select_features(chi_value = 1.0, classification = nil)
      classification ||= @docs_in_class.keys.first

      kept = features_with_chi(classification).reject do |feature|
        docs_with_feature(feature.name) <= 1 || feature.value < chi_value
      end

      kept.sort_by { |feature| feature.value }.reverse.map { |feature| feature.name }
    end

    private

    # N11: documents containing the feature that belong to the class.
    def docs_with_feature_and_class(feature_name, classification)
      @features[feature_name].docs_with_class(classification)
    end

    # N10: documents containing the feature that do NOT belong to the class.
    def docs_with_feature_and_not_class(feature_name, classification)
      @features[feature_name].docs_with_feature - @features[feature_name].docs_with_class(classification)
    end

    # N01: documents in the class that lack the feature.
    def docs_with_class_and_not_feature(classification, feature_name)
      @docs_in_class[classification] - @features[feature_name].docs_with_class(classification)
    end

    # N00: documents with neither the feature nor the class.
    def docs_without_feature_or_class(feature_name, classification)
      @docs - @docs_in_class[classification] - docs_with_feature_and_not_class(feature_name, classification)
    end

    def docs_with_feature(feature_name)
      @features[feature_name].docs_with_feature
    end

    def docs_with_class(classification)
      @docs_in_class[classification]
    end

    # Returns the chi-square value for this feature with the passed
    # classification. This is formula 13.14 on page 215 of An Introduction to
    # Information Retrieval by Christopher D. Manning, Prabhakar Raghavan and
    # Hinrich Schütze.
    def chi_squared(feature_name, classification)
      chi_squared_algo(
        docs_with_feature_and_class(feature_name, classification),
        docs_with_class_and_not_feature(classification, feature_name),
        docs_with_feature_and_not_class(feature_name, classification),
        docs_without_feature_or_class(feature_name, classification)
      )
    end

    # o11/o10/o01/o00 are the four observed feature/class contingency counts.
    def chi_squared_algo(o11, o10, o01, o00)
      numerator = (o11 + o10 + o01 + o00) * ((o11 * o00 - o10 * o01)**2)
      denominator = (o11 + o01) * (o11 + o10) * (o10 + o00) * (o01 + o00)
      # Guard the degenerate case so we never divide by zero (Infinity/NaN).
      return 0.0 if denominator.zero?
      numerator.to_f / denominator.to_f
    end

    # Holds the per-feature counts that feature selection needs.
    class FeatureValues
      attr_accessor :docs_with_feature

      def initialize
        @classes = Hash.new(0)
        @docs_with_feature = 0
      end

      # Record one more document of +classification+ containing the feature.
      def add_document_with_class(classification)
        @classes[classification] += 1
        @docs_with_feature += 1
      end

      # How many documents of +classification+ contained the feature.
      def docs_with_class(classification)
        @classes[classification]
      end

    end

  end
end
@@ -0,0 +1,151 @@
1
+ require File.join(File.dirname(__FILE__), "yaml_serialization")
2
+
3
module Basset

  # A class for running Naive Bayes classification.
  # Documents are added to the classifier; once added, it can be used to
  # classify new documents.
  class NaiveBayes
    include YamlSerialization

    attr_reader :total_docs, :total_docs_in_class, :feature_counts

    def initialize
      @total_docs = 0
      @total_docs_in_class = Hash.new(0)
      @feature_counts = {}
      @occurrences_of_all_features_in_class = {}
    end

    # Takes a classification (which can be a string) and a vector of features.
    def add_document(classification, feature_vector)
      # Counts are changing, so any cached per-class totals are now stale.
      reset_cached_probabilities

      @total_docs_in_class[classification] += 1
      @total_docs += 1

      feature_vector.each do |feature|
        @feature_counts[feature.name] ||= FeatureCount.new(feature.name)
        @feature_counts[feature.name].add_count_for_class(feature.value, classification)
      end
    end

    def classes
      @total_docs_in_class.keys
    end

    # Returns a [score, classification] pair for the most likely class given a
    # vector of features. Pass :normalize_classes => false to ignore each
    # class's prior probability.
    def classify(feature_vectors, opts = {})
      # Merge over the defaults so a caller supplying a partial options hash
      # doesn't silently lose the other defaults (the old literal-default
      # signature replaced the whole hash).
      opts = { :normalize_classes => true }.merge(opts)
      class_probabilities = []

      classes.each do |classification|
        class_probability = 0
        class_probability += Math.log10(probability_of_class(classification)) if opts[:normalize_classes]
        class_probability += probability_of_vectors_for_class(feature_vectors, classification)
        class_probabilities << [class_probability, classification]
      end

      # Pick a random item first: this covers the case where all class
      # probabilities are equal and we need to randomly select a class.
      # (Array#pick_random is the monkey-patch from this gem's extensions.)
      max = class_probabilities.pick_random
      class_probabilities.each do |cp|
        max = cp if cp.first > max.first
      end
      max
    end

    # Gives a score for the probability of _feature_vectors_ being in class
    # _classification_.
    #
    # The score can be normalized to the number of feature vectors by passing
    # :normalize => true in the options hash.
    #
    # The score is not normalized for the relative probabilities of each class.
    def probability_of_vectors_for_class(feature_vectors, classification, opts = {})
      # Same defaults-merge fix as #classify.
      opts = { :normalize => false }.merge(opts)
      probability = 0
      feature_vectors.each do |feature_vector|
        probability += probability_of_vector_for_class(feature_vector, classification)
      end
      if opts[:normalize]
        probability / feature_vectors.count.to_f
      else
        probability
      end
    end

    # Returns the log10 probability of a feature given the class.
    def probability_of_vector_for_class(feature_vector, classification)
      # The `rescue 0` handles features never seen during training (because of
      # redundant-unigram removal a feature may never be initialized), and the
      # + 0.1 is additive smoothing so log10 never sees zero.
      decimal_probability = (((@feature_counts[feature_vector.name].count_for_class(classification) rescue 0) + 0.1) / occurrences_of_all_features_in_class(classification).to_f) * feature_vector.value
      Math.log10(decimal_probability)
    end

    # The sum total of times all features occur for a given class.
    # Cached until the next add_document call.
    def occurrences_of_all_features_in_class(classification)
      # Return the cached value, if there is one.
      return @occurrences_of_all_features_in_class[classification] if @occurrences_of_all_features_in_class[classification]

      @feature_counts.each_value do |feature_count|
        @occurrences_of_all_features_in_class[classification] ||= 0
        @occurrences_of_all_features_in_class[classification] += feature_count.count_for_class(classification)
      end
      @occurrences_of_all_features_in_class[classification]
    end

    def ==(other)
      other.is_a?(self.class) && other.total_docs == total_docs &&
        other.total_docs_in_class == total_docs_in_class && other.feature_counts == feature_counts
    end

    private

    # Probabilities are cached when classification is run; this resets the
    # cached totals.
    def reset_cached_probabilities
      @occurrences_of_all_features_in_class.clear
    end

    # Returns the prior probability of a given class.
    def probability_of_class(classification)
      @total_docs_in_class[classification] / @total_docs.to_f
    end

    # A class to store per-class occurrence counts for a single feature.
    class FeatureCount
      attr_reader :classes, :name

      def initialize(feature_name = nil, classification = nil, count = 0)
        @name, @classes = feature_name, {}
        add_count_for_class(count, classification) if classification
      end

      def add_count_for_class(count, classification)
        @classes[classification] ||= 0
        @classes[classification] += count
      end

      # Occurrences of this feature in documents of +classification+
      # (0 when the class was never seen).
      def count_for_class(classification)
        @classes[classification] || 0
      end

      # Total occurrences of this feature across all classes.
      def count
        @classes.values.sum
      end

      def ==(other)
        other.kind_of?(FeatureCount) && other.classes == @classes && other.name == @name
      end

      def inspect(opts = { :verbose => false })
        # super() with explicit empty parens: Object#inspect takes no
        # arguments, so a bare `super` (which forwards opts) would raise.
        return super() if opts[:verbose]
        "#<FeatureCount for ``" + @name.to_s + "''" + " --> " + @classes.inspect + " > "
      end

    end

  end
end