RubyGems - danielsdeleo-basset - Versions diffs - 1.0.4 - Mend

danielsdeleo-basset 1.0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (31) hide show

data/History.txt +7 -0
data/License.txt +20 -0
data/Manifest.txt +21 -0
data/README.rdoc +58 -0
data/Rakefile +63 -0
data/VERSION.yml +4 -0
data/basset.gemspec +32 -0
data/examples/example.rb +25 -0
data/lib/basset.rb +9 -0
data/lib/basset/classification_evaluator.rb +170 -0
data/lib/basset/classifier.rb +188 -0
data/lib/basset/core_extensions.rb +93 -0
data/lib/basset/document.rb +84 -0
data/lib/basset/document_override_example.rb +11 -0
data/lib/basset/feature.rb +26 -0
data/lib/basset/feature_extractor.rb +52 -0
data/lib/basset/feature_selector.rb +126 -0
data/lib/basset/naive_bayes.rb +151 -0
data/lib/basset/svm.rb +180 -0
data/lib/basset/yaml_serialization.rb +41 -0
data/spec/spec.opts +1 -0
data/spec/spec_helper.rb +12 -0
data/spec/unit/classifier_spec.rb +166 -0
data/spec/unit/core_extension_spec.rb +33 -0
data/spec/unit/document_spec.rb +59 -0
data/spec/unit/feature_extractor_spec.rb +33 -0
data/spec/unit/feature_selector_spec.rb +108 -0
data/spec/unit/feature_spec.rb +40 -0
data/spec/unit/naive_bayes_spec.rb +119 -0
data/spec/unit/svm_spec.rb +83 -0
metadata +94 -0

data/lib/basset/svm.rb ADDED Viewed

@@ -0,0 +1,180 @@
+require "svm" #File.dirname(__FILE__) + "/../../../libsvm-ruby-swig/lib/svm"
+require "bloomfilter" #  igrigorik-bloomfilter (github)
+module Basset
+  # =Overview
+  # A class for SVM document classification.  Follows the same basic interface
+  # as NaiveBayes; add labeled training documents to the classifier, then
+  # use it to classify unlabeled documents.  Test your accuracy before using
+  # the classifier in production; there are a lot of knobs to tweak.  When
+  # testing, it is usually best to use a separate set of documents, i.e.,
+  # not the training set.
+  # =Learning Resources
+  # SVM can be tricky to understand at first, try the following references:
+  # http://en.wikipedia.org/wiki/Support_vector_machine
+  # http://www.igvita.com/2008/01/07/support-vector-machines-svm-in-ruby/
+  # http://www.csie.ntu.edu.tw/~cjlin/papers/guide/guide.pdf
+  # =Implementation
+  # This class wraps libsvm-ruby-swig, which is itself a swig based wrapper for
+  # libsvm.
+  # libsvm-ruby-swig: http://github.com/tomz/libsvm-ruby-swig
+  # libsvm: http://www.csie.ntu.edu.tw/~cjlin/libsvm
+  # verbose version:
+  # Chih-Chung Chang and Chih-Jen Lin, LIBSVM : a library for support vector machines, 2001. Software available at http://www.csie.ntu.edu.tw/~cjlin/libsvm
+  #
+  # There is also the libsvm-ruby implementation.  It was originally available from
+  # http://debian.cilibrar.com/debian/pool/main/libs/libsvm-ruby/libsvm-ruby_2.8.4.orig.tar.gz
+  # but was not available from there when I last checked.  The Ubuntu package
+  # was still available as of this writing.
+  class Svm
+    #include YamlSerialization
+    attr_reader :class_labels, :feature_dictionary
+    def initialize
+      @total_classes = 0
+      @feature_dictionary = []
+      @class_labels = {}
+      @documents_for_class = Hash.new {|docs_hash,key| docs_hash[key] = []}
+      @svm_parameter = default_svm_parameter
+    end
+    # Adds a new document to the training set.
+    def add_document(classification, feature_vectors)
+      update_class_labels_with_new(classification) if new_class?(classification)
+      @feature_dictionary += feature_vectors.map { |fv| fv.name }
+      @feature_dictionary.uniq!
+      @documents_for_class[classification] << feature_vectors.map { |fv| fv.name }
+      reset_memoized_vars!
+    end
+    # Gives the vector representation of the training documents of class
+    # _classification_
+    def vectorized_docs(classification)
+      # hardwired to binary representation
+      @documents_for_class[classification].map do |features|
+        vectorize_doc(features)
+        #@feature_dictionary.map { |dict_feature| features.include?(dict_feature) ? 1 : 0}
+      end
+    end
+    # Returns the vectorized representation of the training data, suitable for
+    # use in the constructor for the libsvm Problem class.
+    def labels_and_document_vectors
+      # {labels => [features1-label, features2-label, ...], :features => [features1, features2, ...]}
+      labels_features = {:labels => [], :features => []}
+      @class_labels.each do |classification, label|
+        vectorized_docs(classification).each do |document_vector|
+          labels_features[:labels] << label
+          labels_features[:features] << document_vector
+        end
+      end
+      labels_features
+    end
+    def classify(feature_vectors)
+      class_of_label(model.predict(vectorize_doc(feature_vectors.map { |fv| fv.name })))
+    end
+    def classes
+      @class_labels.keys
+    end
+    # Exposes the libsvm-ruby-swig Parameter object.  If given
+    # a block, the parameter object is yielded, otherwise,
+    # it's returned.
+    #
+    # For example, to set parameters to their default values:
+    #
+    #   basset_svm_obj.parameters do |param|
+    #     param.C = 100
+    #     param.svm_type = NU_SVC
+    #     param.degree = 1
+    #     param.coef0 = 0
+    #     param.eps= 0.001
+    #     param.kernel_type = RBF
+    #   end
+    #
+    # To access one value:
+    #   basset_svm_obj.parameters.svm_type
+    #   => NU_SVC
+    def parameters
+      if block_given?
+        yield @svm_parameter
+      else
+        @svm_parameter
+      end
+    end
+    private
+    def vectorize_doc(features)
+      vectorized_doc = Array.new(@feature_dictionary.size, 0)
+      features.each do |feature|
+        if index = feature_dictionary_hash[feature]
+          vectorized_doc[index] = 1
+        end
+      end
+      vectorized_doc
+    end
+    def feature_dictionary_hash
+      unless @memoized_feature_dictionary_hash
+        m = 15 * @feature_dictionary.count  # bloom filter size (bytes)
+        @memoized_feature_dictionary_hash = BloomFilter.new(m,3,23)
+        @feature_dictionary.each_index do |i|
+          @memoized_feature_dictionary_hash[@feature_dictionary[i]] = i
+        end
+      end
+      @memoized_feature_dictionary_hash
+    end
+    def reset_memoized_vars!
+      @memoized_model, @memoized_problem, @memoized_feature_dictionary_hash = nil, nil, nil
+      @memoized_inverted_class_labels = nil
+    end
+    def model
+      @memoized_model ||= Model.new(problem, @svm_parameter)
+    end
+    def problem
+      unless @memoized_problem
+        labels_features = labels_and_document_vectors
+        @memoized_problem = Problem.new(labels_features[:labels], labels_features[:features])
+      end
+      @memoized_problem
+    end
+    def new_class?(classification)
+      !@class_labels.keys.include?(classification)
+    end
+    def default_svm_parameter
+      param = ::Parameter.new
+      param.C = 100
+      param.svm_type = NU_SVC
+      param.degree = 1
+      param.coef0 = 0
+      param.eps= 0.001
+      param.nu = 0.5          #?! this blows up on my dataset...
+      param.kernel_type = RBF
+      param
+    end
+    def update_class_labels_with_new(classification)
+      #@class_labels.each_value { |vector| vector << 0 }
+      @class_labels[classification] = @total_classes  #Array.new(@total_classes, 0) << 1
+      @total_classes += 1
+    end
+    def class_of_label(label)
+      unless @memoized_inverted_class_labels
+        @memoized_inverted_class_labels = @class_labels.invert
+      end
+      @memoized_inverted_class_labels[label.to_i]
+    end
+  end
+end

data/lib/basset/yaml_serialization.rb ADDED Viewed

@@ -0,0 +1,41 @@
+require "yaml"
+module YamlSerialization
+  def self.included(base)
+    base.extend ClassMethods
+  end
+  module ClassMethods
+    def load_from_file(file_name)
+      YAML.load_file(file_name)
+    end
+  end
+  def save_to_file(file_name)
+    File.open(file_name, 'w') do |file|
+      YAML.dump(self, file)
+    end
+  end
+end
+class ::Class
+  yaml_as "tag:ruby.yaml.org,2002:class"
+  def Class.yaml_new( klass, tag, val )
+    if String === val
+      val.split(/::/).inject(Object) {|m, n| m.const_get(n)}
+    else
+      raise YAML::TypeError, "Invalid Class: " + val.inspect
+    end
+  end
+  def to_yaml( opts = {} )
+    YAML::quick_emit( nil, opts ) { |out|
+      out.scalar( "tag:ruby.yaml.org,2002:class", self.name, :plain )
+    }
+  end
+end

data/spec/spec.opts ADDED Viewed

	@@ -0,0 +1 @@
1	+ -c

data/spec/spec_helper.rb ADDED Viewed

@@ -0,0 +1,12 @@
+require 'spec'
+require File.join(File.dirname(__FILE__), "..", "lib", "basset")
+class Array
+  def sort_to_s
+    self.map { |item| item.to_s }.sort
+  end
+end
+include Basset

data/spec/unit/classifier_spec.rb ADDED Viewed

@@ -0,0 +1,166 @@
+require File.dirname(__FILE__) + '/../spec_helper'
+describe Classifier do
+  before(:each) do
+    @classifier = Classifier.new
+  end
+  it "should automagically determine the ruby class of the classifier engine a la rails' constantize" do
+    Classifier.new(:type => :naive_bayes).engine.class.should == NaiveBayes
+  end
+  it "should automagically determine the ruby class of the document type" do
+    Classifier.new(:type => :naive_bayes, :doctype => :document).doctype.should == Document
+  end
+  it "should default to NaiveBayes engine and Document doctype" do
+    classifier = Classifier.new()
+    classifier.engine.class.should == NaiveBayes
+    classifier.doctype.should == Document
+  end
+  it "should accept training docs as plain strings, extracting features automatically" do
+    @classifier.train(:hip, "that hipster has an asymmetrical haircut")
+    @classifier.train(:unhip, "that dude is a frat boy")
+    @classifier.engine.classes.should == [:hip, :unhip]
+    @classifier.engine.occurrences_of_all_features_in_class(:unhip).should == 6
+  end
+  it "should classify documents" do
+    @classifier.train(:hip, "that hipster has an asymmetrical haircut")
+    @classifier.train(:unhip, "that dude is a frat boy")
+    @classifier.classify("hipsters").should == :hip
+  end
+  it "should train iteratively for speed learning" do
+    @classifier.train(:hip, "turntables", "techno music", "DJs with turntables", "techno DJs")
+    @classifier.train(:unhip, "rock music", "guitar bass drums", "guitar rock", "guitar players")
+    @classifier.classify("guitar music").should == :unhip
+    # now everyone likes rock music again! retrain fast! cf LCD Soundsystem
+    @classifier.train_iterative(:hip, "guitars") # takes 3 iterations
+    @classifier.classify("guitars").should == :hip
+  end
+  it "should give document scores for a class" do
+    @classifier.train(:hip, "turntables", "techno music", "DJs with turntables", "techno DJs")
+    @classifier.train(:unhip, "rock music", "guitar bass drums", "guitar rock", "guitar players")
+    @classifier.similarity_score(:hip, "guitars").should be_a Float
+  end
+end
+describe AnomalyDetector do
+  YAML_FILE_BASENAME = "/tmp/basset_anomaly_detector_rspec_savefile"
+  def tmp_file
+    YAML_FILE_BASENAME + rand(2 ** 32).to_s(16)
+  end
+  before(:each) do
+    @detector = AnomalyDetector.new
+  end
+  after(:each) do
+    Dir.glob(YAML_FILE_BASENAME + '*').each  {|file| File.delete file}
+  end
+  def train_detector_on_code_love
+    @detector.train("coding all night and loving it", "coding and drinking jolt")
+  end
+  it "should train on the normal set only" do
+    @detector.train("coding all night", "coding and drinking jolt")
+    @detector.engine.classes.should == [:normal]
+    @detector.engine.occurrences_of_all_features_in_class(:normal).should == 7
+  end
+  it "should give a score for the probability of a document to be in the ``normal'' set" do
+    train_detector_on_code_love
+    score = @detector.similarity_score("I love coding and jolt")
+    score.should be_a Float
+    score.should be_close(-1, 2)
+  end
+  it "should give a list of the probability scores for the training set" do
+    train_detector_on_code_love
+    @detector.scores_for_training_set.should have(2).items
+    @detector.scores_for_training_set.each do |score|
+      score.should be_close(-1, 1)
+    end
+  end
+  it "should compute the average probability score for training set" do
+    train_detector_on_code_love
+    @detector.avg_score_of_training_set.should be_close(-0.841, 0.001)
+  end
+  it "should give the range of probability scores for the training set" do
+    train_detector_on_code_love
+    @detector.score_range_of_training_set.should be_a Range
+    @detector.score_range_of_training_set.first.should be_close(-0.864, 0.001)
+    @detector.score_range_of_training_set.last.should be_close(-0.818, 0.001)
+  end
+  it "should give the standard deviation of probability scores for the training set" do
+    train_detector_on_code_love
+    @detector.stddev_of_scores_of_training_set.should be_close(0.0234, 0.001) #0.0234
+  end
+  it "should use the average minus 4 times the stddev as the lower bound for normal" do
+    train_detector_on_code_love
+    expected = -0.841 - (4 * 0.0234)
+    @detector.minimum_acceptable_score.should be_close(expected, 0.001)
+  end
+  it "should say if text is anomalous or not" do
+    train_detector_on_code_love
+    @detector.should be_anomalous("watching tv")
+    @detector.should_not be_anomalous("code and jolt")
+  end
+  it "should classify text as anomalous or normal" do
+    train_detector_on_code_love
+    @detector.classify("watching tv").should == :anomalous
+    @detector.classify("code and jolt").should == :normal
+  end
+  it "should give an anomaly score based on the std deviations from mean" do
+    train_detector_on_code_love
+    @detector.anomaly_score("watching_tv").should be_close( 80, 10)
+  end
+  it "should train iteratively" do
+    train_detector_on_code_love
+    50.times {@detector.train("coding drinking jolt")}
+    @detector.train_iterative("watching tv")
+    @detector.should be_normal("watching tv")
+  end
+  it "should reset memoized values to nil after retraining" do
+    train_detector_on_code_love
+    @detector.score_range_of_training_set
+    @detector.scores_for_training_set.should_not be_nil
+    train_detector_on_code_love
+    @detector.instance_variable_get(:@scores_for_training_set).should be_nil
+  end
+  it "should serialize itself to YAML" do
+    train_detector_on_code_love
+    file = tmp_file
+    @detector.save_to_file(file)
+    File.exist?(file).should be_true
+  end
+  it "should load itself from YAML" do
+    train_detector_on_code_love
+    file = tmp_file
+    @detector.save_to_file(file)
+    reloaded_detector = AnomalyDetector.load_from_file(file)
+    reloaded_detector.should == @detector
+  end
+end

data/spec/unit/core_extension_spec.rb ADDED Viewed

@@ -0,0 +1,33 @@
+require File.dirname(__FILE__) + '/../spec_helper'
+describe Array, "with Basset extensions" do
+  it "should give the tail of an array like FP lists do" do
+    [1,2,3].rest.should == [2,3]
+  end
+  it "should not choke when giving the tail of an empty list" do
+    [].rest.should == []
+  end
+  it "should return a random element" do
+    srand(123456)
+    [1,2,3,4].pick_random.should == 2
+  end
+  it "should randomly rearrange itself" do
+    srand(123456)
+    [1,2,3,4].randomize.should == [1,3,4,2]
+  end
+  it "should sum itself" do
+    [1,2,3,4].sum.should == 10
+  end
+end
+describe Float, "with Basset extensions" do
+  it "should convert itself to a string with variable precsion" do
+    1.23456.to_s_decimal_places(3).should == "1.234"
+  end
+end

data/spec/unit/document_spec.rb ADDED Viewed

@@ -0,0 +1,59 @@
+require File.join(File.dirname(__FILE__), "..", "spec_helper")
+describe Document do
+  it "should remove punctuation from words" do
+    Document.new("abc.").vector_of_features.should == [Feature.new("abc", 1)]
+  end
+  it "should remove numbers from words" do
+    Document.new("abc1").vector_of_features.should == [Feature.new("abc", 1)]
+  end
+  it "should remove symbols from words" do
+    Document.new("abc%").vector_of_features.should == [Feature.new("abc", 1)]
+  end
+  it "should lowercase text" do
+    Document.new("ABC").vector_of_features.should == [Feature.new("abc", 1)]
+  end
+  it "should stem words" do
+    Document.new("testing").vector_of_features.should == [Feature.new("test", 1)]
+  end
+  it "should count feature occurances" do
+    Document.new("test doc test", :test).vector_of_features.should ==
+      [Feature.new("doc", 1), Feature.new("test", 2)]
+  end
+end
+describe UriDocument do
+  def single_features(*uris)
+    uris.flatten.map { |uri| Feature.new(uri.to_s, 1) }
+  end
+  it "should extract URI token separators &, ?, \\, /, =, [, ], and . separately" do
+    expected_features = [:a,:b,:c,:d,:e,:f,:g,:h, :i, '&', '?', "\\", '/', '=', '[', ']', '.']
+    expected = single_features(expected_features).sort
+    UriDocument.new('a&b?c\d/e=f[g]h.i').feature_vectors.sort.should == expected
+  end
+  it "should extract two dots as a single feature instead of two dots" do
+    UriDocument.new('..').feature_vectors.should == [Feature.new("..", 1)]
+  end
+  it "should extract two slashes as a single feature" do
+    UriDocument.new('//').feature_vectors.should == [Feature.new('//', 1)]
+    UriDocument.new("\\\\").feature_vectors.should == [Feature.new('\\\\', 1)]
+  end
+  it "should not stem words" do
+    UriDocument.new("testing").feature_vectors.should == [Feature.new("testing", 1)]
+  end
+  it "should URI decode encoded strings" do
+    UriDocument.new("%23%25").feature_vectors.should == [Feature.new("#%", 1)]
+  end
+end