rjspotter-basset 1.0.5

data/lib/basset/svm.rb ADDED
@@ -0,0 +1,180 @@
+ require "svm" # File.dirname(__FILE__) + "/../../../libsvm-ruby-swig/lib/svm"
+
+ require "bloomfilter" # igrigorik-bloomfilter (github)
+
+ module Basset
+   # =Overview
+   # A class for SVM document classification. Follows the same basic interface
+   # as NaiveBayes; add labeled training documents to the classifier, then
+   # use it to classify unlabeled documents. Do test your accuracy before
+   # using the classifier in production; there are a lot of knobs to tweak.
+   # When testing, it is usually best to use a separate set of documents, i.e.,
+   # not the training set.
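+   # =Example
+   # A minimal usage sketch (hypothetical labels and strings; it assumes feature
+   # vectors built by something like Document#vector_of_features, and the actual
+   # prediction depends on your training data and parameters):
+   #
+   #   svm = Basset::Svm.new
+   #   svm.add_document(:spam, Document.new("buy cheap pills now").vector_of_features)
+   #   svm.add_document(:ham,  Document.new("meeting notes attached").vector_of_features)
+   #   svm.classify(Document.new("cheap pills").vector_of_features)  # => :spam, ideally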
+   # =Learning Resources
+   # SVM can be tricky to understand at first; try the following references:
+   # http://en.wikipedia.org/wiki/Support_vector_machine
+   # http://www.igvita.com/2008/01/07/support-vector-machines-svm-in-ruby/
+   # http://www.csie.ntu.edu.tw/~cjlin/papers/guide/guide.pdf
+   # =Implementation
+   # This class wraps libsvm-ruby-swig, which is itself a SWIG-based wrapper for
+   # libsvm.
+   # libsvm-ruby-swig: http://github.com/tomz/libsvm-ruby-swig
+   # libsvm: http://www.csie.ntu.edu.tw/~cjlin/libsvm
+   # Full citation:
+   # Chih-Chung Chang and Chih-Jen Lin, LIBSVM: a library for support vector machines, 2001. Software available at http://www.csie.ntu.edu.tw/~cjlin/libsvm
+   #
+   # There is also the libsvm-ruby implementation. It was originally available from
+   # http://debian.cilibrar.com/debian/pool/main/libs/libsvm-ruby/libsvm-ruby_2.8.4.orig.tar.gz
+   # but was not available from there when I last checked. The Ubuntu package
+   # was still available as of this writing.
+   class Svm
+     #include YamlSerialization
+     attr_reader :class_labels, :feature_dictionary
+
+     def initialize
+       @total_classes = 0
+       @feature_dictionary = []
+       @class_labels = {}
+       @documents_for_class = Hash.new { |docs_hash, key| docs_hash[key] = [] }
+       @svm_parameter = default_svm_parameter
+     end
+
+     # Adds a new document to the training set. feature_vectors should be a
+     # collection of features that respond to #name.
+     def add_document(classification, feature_vectors)
+       update_class_labels_with_new(classification) if new_class?(classification)
+       @feature_dictionary += feature_vectors.map { |fv| fv.name }
+       @feature_dictionary.uniq!
+       @documents_for_class[classification] << feature_vectors.map { |fv| fv.name }
+       reset_memoized_vars!
+     end
+
+     # Gives the vector representation of the training documents of class
+     # _classification_.
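+     # For example, with a (hypothetical) feature dictionary of
+     # ["cat", "dog", "fish"], a training document containing "cat" and "fish"
+     # is represented as [1, 0, 1].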
+     def vectorized_docs(classification)
+       # hardwired to binary representation
+       @documents_for_class[classification].map do |features|
+         vectorize_doc(features)
+         #@feature_dictionary.map { |dict_feature| features.include?(dict_feature) ? 1 : 0}
+       end
+     end
+
+     # Returns the vectorized representation of the training data, suitable for
+     # use in the constructor for the libsvm Problem class.
+     def labels_and_document_vectors
+       # {:labels => [features1-label, features2-label, ...], :features => [features1, features2, ...]}
+       labels_features = {:labels => [], :features => []}
+       @class_labels.each do |classification, label|
+         vectorized_docs(classification).each do |document_vector|
+           labels_features[:labels] << label
+           labels_features[:features] << document_vector
+         end
+       end
+       labels_features
+     end
+
+     # Classifies a document, given as a collection of features responding to
+     # #name. Returns one of #classes.
+     def classify(feature_vectors)
+       class_of_label(model.predict(vectorize_doc(feature_vectors.map { |fv| fv.name })))
+     end
+
+     # The classification labels seen so far.
+     def classes
+       @class_labels.keys
+     end
+
+     # Exposes the libsvm-ruby-swig Parameter object. If given
+     # a block, the parameter object is yielded; otherwise,
+     # it's returned.
+     #
+     # For example, to set parameters to their default values:
+     #
+     #   basset_svm_obj.parameters do |param|
+     #     param.C = 100
+     #     param.svm_type = NU_SVC
+     #     param.degree = 1
+     #     param.coef0 = 0
+     #     param.eps = 0.001
+     #     param.kernel_type = RBF
+     #   end
+     #
+     # To access one value:
+     #
+     #   basset_svm_obj.parameters.svm_type
+     #   # => NU_SVC
+     def parameters
+       if block_given?
+         yield @svm_parameter
+       else
+         @svm_parameter
+       end
+     end
+
+     private
+
+     def vectorize_doc(features)
+       vectorized_doc = Array.new(@feature_dictionary.size, 0)
+       features.each do |feature|
+         if index = feature_dictionary_hash[feature]
+           vectorized_doc[index] = 1
+         end
+       end
+       vectorized_doc
+     end
+
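+     # Memoized lookup from feature name to its index in @feature_dictionary,
+     # built on the bloomfilter gem's hash-like interface (presumably chosen
+     # over a plain Hash to save memory on large dictionaries).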
+     def feature_dictionary_hash
+       unless @memoized_feature_dictionary_hash
+         m = 15 * @feature_dictionary.count # bloom filter size (bytes)
+         @memoized_feature_dictionary_hash = BloomFilter.new(m, 3, 23)
+
+         @feature_dictionary.each_index do |i|
+           @memoized_feature_dictionary_hash[@feature_dictionary[i]] = i
+         end
+       end
+       @memoized_feature_dictionary_hash
+     end
+
+     def reset_memoized_vars!
+       @memoized_model, @memoized_problem, @memoized_feature_dictionary_hash = nil, nil, nil
+       @memoized_inverted_class_labels = nil
+     end
+
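+     # The libsvm Model is built lazily from the current training set and
+     # memoized until the next add_document call resets it.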
+     def model
+       @memoized_model ||= Model.new(problem, @svm_parameter)
+     end
+
+     def problem
+       unless @memoized_problem
+         labels_features = labels_and_document_vectors
+         @memoized_problem = Problem.new(labels_features[:labels], labels_features[:features])
+       end
+       @memoized_problem
+     end
+
+     def new_class?(classification)
+       !@class_labels.keys.include?(classification)
+     end
+
+     def default_svm_parameter
+       param = ::Parameter.new
+       param.C = 100
+       param.svm_type = NU_SVC
+       param.degree = 1
+       param.coef0 = 0
+       param.eps = 0.001
+       param.nu = 0.5 #?! this blows up on my dataset...
+       param.kernel_type = RBF
+       param
+     end
+
+     def update_class_labels_with_new(classification)
+       #@class_labels.each_value { |vector| vector << 0 }
+       @class_labels[classification] = @total_classes #Array.new(@total_classes, 0) << 1
+       @total_classes += 1
+     end
+
+     def class_of_label(label)
+       unless @memoized_inverted_class_labels
+         @memoized_inverted_class_labels = @class_labels.invert
+       end
+       @memoized_inverted_class_labels[label.to_i]
+     end
+
+   end
+ end
@@ -0,0 +1,41 @@
+ require "yaml"
+
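+ # Mixin that adds YAML persistence to a class. A minimal usage sketch
+ # (hypothetical class name):
+ #
+ #   class MyClassifier
+ #     include YamlSerialization
+ #   end
+ #
+ #   MyClassifier.new.save_to_file("/tmp/classifier.yml")
+ #   classifier = MyClassifier.load_from_file("/tmp/classifier.yml")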
+ module YamlSerialization
+
+   def self.included(base)
+     base.extend ClassMethods
+   end
+
+   module ClassMethods
+     def load_from_file(file_name)
+       YAML.load_file(file_name)
+     end
+   end
+
+   def save_to_file(file_name)
+     File.open(file_name, 'w') do |file|
+       YAML.dump(self, file)
+     end
+   end
+
+ end
+
+ # Teach YAML how to dump and load Class objects by name.
+ class ::Class
+   yaml_as "tag:ruby.yaml.org,2002:class"
+
+   def Class.yaml_new(klass, tag, val)
+     if String === val
+       val.split(/::/).inject(Object) { |m, n| m.const_get(n) }
+     else
+       raise YAML::TypeError, "Invalid Class: " + val.inspect
+     end
+   end
+
+   def to_yaml(opts = {})
+     YAML::quick_emit(nil, opts) { |out|
+       out.scalar("tag:ruby.yaml.org,2002:class", self.name, :plain)
+     }
+   end
+
+ end
data/spec/spec.opts ADDED
@@ -0,0 +1 @@
+ -c
@@ -0,0 +1,12 @@
+ require 'spec'
+ require File.join(File.dirname(__FILE__), "..", "lib", "basset")
+
+ class Array
+
+   # Spec convenience: the elements as strings, sorted.
+   def sort_to_s
+     self.map { |item| item.to_s }.sort
+   end
+
+ end
+
+ include Basset
@@ -0,0 +1,166 @@
+ require File.dirname(__FILE__) + '/../spec_helper'
+
+ describe Classifier do
+
+   before(:each) do
+     @classifier = Classifier.new
+   end
+
+   it "should automagically determine the ruby class of the classifier engine a la rails' constantize" do
+     Classifier.new(:type => :naive_bayes).engine.class.should == NaiveBayes
+   end
+
+   it "should automagically determine the ruby class of the document type" do
+     Classifier.new(:type => :naive_bayes, :doctype => :document).doctype.should == Document
+   end
+
+   it "should default to NaiveBayes engine and Document doctype" do
+     classifier = Classifier.new
+     classifier.engine.class.should == NaiveBayes
+     classifier.doctype.should == Document
+   end
+
+   it "should accept training docs as plain strings, extracting features automatically" do
+     @classifier.train(:hip, "that hipster has an asymmetrical haircut")
+     @classifier.train(:unhip, "that dude is a frat boy")
+     @classifier.engine.classes.should == [:hip, :unhip]
+     @classifier.engine.occurrences_of_all_features_in_class(:unhip).should == 6
+   end
+
+   it "should classify documents" do
+     @classifier.train(:hip, "that hipster has an asymmetrical haircut")
+     @classifier.train(:unhip, "that dude is a frat boy")
+     @classifier.classify("hipsters").should == :hip
+   end
+
+   it "should train iteratively for speed learning" do
+     @classifier.train(:hip, "turntables", "techno music", "DJs with turntables", "techno DJs")
+     @classifier.train(:unhip, "rock music", "guitar bass drums", "guitar rock", "guitar players")
+     @classifier.classify("guitar music").should == :unhip
+     # now everyone likes rock music again! retrain fast! cf LCD Soundsystem
+     @classifier.train_iterative(:hip, "guitars") # takes 3 iterations
+     @classifier.classify("guitars").should == :hip
+   end
+
+   it "should give document scores for a class" do
+     @classifier.train(:hip, "turntables", "techno music", "DJs with turntables", "techno DJs")
+     @classifier.train(:unhip, "rock music", "guitar bass drums", "guitar rock", "guitar players")
+     @classifier.similarity_score(:hip, "guitars").should be_a Float
+   end
+
+ end
+
+
+ describe AnomalyDetector do
+
+   YAML_FILE_BASENAME = "/tmp/basset_anomaly_detector_rspec_savefile"
+
+   def tmp_file
+     YAML_FILE_BASENAME + rand(2 ** 32).to_s(16)
+   end
+
+   before(:each) do
+     @detector = AnomalyDetector.new
+   end
+
+   after(:each) do
+     Dir.glob(YAML_FILE_BASENAME + '*').each { |file| File.delete(file) }
+   end
+
+   def train_detector_on_code_love
+     @detector.train("coding all night and loving it", "coding and drinking jolt")
+   end
+
+   it "should train on the normal set only" do
+     @detector.train("coding all night", "coding and drinking jolt")
+     @detector.engine.classes.should == [:normal]
+     @detector.engine.occurrences_of_all_features_in_class(:normal).should == 7
+   end
+
+   it "should give a score for the probability that a document is in the 'normal' set" do
+     train_detector_on_code_love
+     score = @detector.similarity_score("I love coding and jolt")
+     score.should be_a Float
+     score.should be_close(-1, 2)
+   end
+
+   it "should give a list of the probability scores for the training set" do
+     train_detector_on_code_love
+     @detector.scores_for_training_set.should have(2).items
+     @detector.scores_for_training_set.each do |score|
+       score.should be_close(-1, 1)
+     end
+   end
+
+   it "should compute the average probability score for the training set" do
+     train_detector_on_code_love
+     @detector.avg_score_of_training_set.should be_close(-0.841, 0.001)
+   end
+
+   it "should give the range of probability scores for the training set" do
+     train_detector_on_code_love
+     @detector.score_range_of_training_set.should be_a Range
+     @detector.score_range_of_training_set.first.should be_close(-0.864, 0.001)
+     @detector.score_range_of_training_set.last.should be_close(-0.818, 0.001)
+   end
+
+   it "should give the standard deviation of probability scores for the training set" do
+     train_detector_on_code_love
+     @detector.stddev_of_scores_of_training_set.should be_close(0.0234, 0.001) #0.0234
+   end
+
+   it "should use the average minus 4 times the stddev as the lower bound for normal" do
+     train_detector_on_code_love
+     expected = -0.841 - (4 * 0.0234)
+     @detector.minimum_acceptable_score.should be_close(expected, 0.001)
+   end
+
+   it "should say if text is anomalous or not" do
+     train_detector_on_code_love
+     @detector.should be_anomalous("watching tv")
+     @detector.should_not be_anomalous("code and jolt")
+   end
+
+   it "should classify text as anomalous or normal" do
+     train_detector_on_code_love
+     @detector.classify("watching tv").should == :anomalous
+     @detector.classify("code and jolt").should == :normal
+   end
+
+   it "should give an anomaly score based on the std deviations from mean" do
+     train_detector_on_code_love
+     @detector.anomaly_score("watching_tv").should be_close(80, 10)
+   end
+
+   it "should train iteratively" do
+     train_detector_on_code_love
+     50.times { @detector.train("coding drinking jolt") }
+     @detector.train_iterative("watching tv")
+     @detector.should be_normal("watching tv")
+   end
+
+   it "should reset memoized values to nil after retraining" do
+     train_detector_on_code_love
+     @detector.score_range_of_training_set
+     @detector.scores_for_training_set.should_not be_nil
+     train_detector_on_code_love
+     @detector.instance_variable_get(:@scores_for_training_set).should be_nil
+   end
+
+   it "should serialize itself to YAML" do
+     train_detector_on_code_love
+     file = tmp_file
+     @detector.save_to_file(file)
+     File.exist?(file).should be_true
+   end
+
+   it "should load itself from YAML" do
+     train_detector_on_code_love
+     file = tmp_file
+     @detector.save_to_file(file)
+     reloaded_detector = AnomalyDetector.load_from_file(file)
+     reloaded_detector.should == @detector
+   end
+
+ end
@@ -0,0 +1,33 @@
+ require File.dirname(__FILE__) + '/../spec_helper'
+
+ describe Array, "with Basset extensions" do
+
+   it "should give the tail of an array like FP lists do" do
+     [1, 2, 3].rest.should == [2, 3]
+   end
+
+   it "should not choke when giving the tail of an empty list" do
+     [].rest.should == []
+   end
+
+   it "should return a random element" do
+     srand(123456)
+     [1, 2, 3, 4].pick_random.should == 2
+   end
+
+   it "should randomly rearrange itself" do
+     srand(123456)
+     [1, 2, 3, 4].randomize.should == [1, 3, 4, 2]
+   end
+
+   it "should sum itself" do
+     [1, 2, 3, 4].sum.should == 10
+   end
+
+ end
+
+ describe Float, "with Basset extensions" do
+   it "should convert itself to a string with variable precision" do
+     1.23456.to_s_decimal_places(3).should == "1.234"
+   end
+ end
@@ -0,0 +1,59 @@
+ require File.join(File.dirname(__FILE__), "..", "spec_helper")
+
+ describe Document do
+   it "should remove punctuation from words" do
+     Document.new("abc.").vector_of_features.should == [Feature.new("abc", 1)]
+   end
+
+   it "should remove numbers from words" do
+     Document.new("abc1").vector_of_features.should == [Feature.new("abc", 1)]
+   end
+
+   it "should remove symbols from words" do
+     Document.new("abc%").vector_of_features.should == [Feature.new("abc", 1)]
+   end
+
+   it "should lowercase text" do
+     Document.new("ABC").vector_of_features.should == [Feature.new("abc", 1)]
+   end
+
+   it "should stem words" do
+     Document.new("testing").vector_of_features.should == [Feature.new("test", 1)]
+   end
+
+   it "should count feature occurrences" do
+     Document.new("test doc test", :test).vector_of_features.should ==
+       [Feature.new("doc", 1), Feature.new("test", 2)]
+   end
+ end
+
+ describe UriDocument do
+
+   def single_features(*uris)
+     uris.flatten.map { |uri| Feature.new(uri.to_s, 1) }
+   end
+
+   it "should extract URI token separators &, ?, \\, /, =, [, ], and . separately" do
+     expected_features = [:a, :b, :c, :d, :e, :f, :g, :h, :i, '&', '?', "\\", '/', '=', '[', ']', '.']
+     expected = single_features(expected_features).sort
+     UriDocument.new('a&b?c\d/e=f[g]h.i').feature_vectors.sort.should == expected
+   end
+
+   it "should extract two dots as a single feature instead of two separate dots" do
+     UriDocument.new('..').feature_vectors.should == [Feature.new("..", 1)]
+   end
+
+   it "should extract two slashes as a single feature" do
+     UriDocument.new('//').feature_vectors.should == [Feature.new('//', 1)]
+     UriDocument.new("\\\\").feature_vectors.should == [Feature.new('\\\\', 1)]
+   end
+
+   it "should not stem words" do
+     UriDocument.new("testing").feature_vectors.should == [Feature.new("testing", 1)]
+   end
+
+   it "should URI decode encoded strings" do
+     UriDocument.new("%23%25").feature_vectors.should == [Feature.new("#%", 1)]
+   end
+
+ end