rjspotter-basset 1.0.5

data/lib/basset/svm.rb ADDED
@@ -0,0 +1,180 @@
+ require "svm" # File.dirname(__FILE__) + "/../../../libsvm-ruby-swig/lib/svm"
+
+ require "bloomfilter" # igrigorik-bloomfilter (github)
+
+ module Basset
+   # =Overview
+   # A class for SVM document classification. Follows the same basic interface
+   # as NaiveBayes; add labeled training documents to the classifier, then
+   # use it to classify unlabeled documents. Do test your accuracy before
+   # using the classifier in production; there are a lot of knobs to tweak.
+   # When testing, it is usually best to use a separate set of documents, i.e.,
+   # not the training set.
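+   # =Example
+   # A minimal usage sketch (hypothetical labels and strings; it assumes feature
+   # vectors built by something like Document#vector_of_features, and the actual
+   # prediction depends on your training data and parameters):
+   #
+   #   svm = Basset::Svm.new
+   #   svm.add_document(:spam, Document.new("buy cheap pills now").vector_of_features)
+   #   svm.add_document(:ham,  Document.new("meeting notes attached").vector_of_features)
+   #   svm.classify(Document.new("cheap pills").vector_of_features)  # => :spam, ideally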
+   # =Learning Resources
+   # SVM can be tricky to understand at first; try the following references:
+   # http://en.wikipedia.org/wiki/Support_vector_machine
+   # http://www.igvita.com/2008/01/07/support-vector-machines-svm-in-ruby/
+   # http://www.csie.ntu.edu.tw/~cjlin/papers/guide/guide.pdf
+   # =Implementation
+   # This class wraps libsvm-ruby-swig, which is itself a SWIG-based wrapper for
+   # libsvm.
+   # libsvm-ruby-swig: http://github.com/tomz/libsvm-ruby-swig
+   # libsvm: http://www.csie.ntu.edu.tw/~cjlin/libsvm
+   # Full citation:
+   # Chih-Chung Chang and Chih-Jen Lin, LIBSVM: a library for support vector machines, 2001. Software available at http://www.csie.ntu.edu.tw/~cjlin/libsvm
+   #
+   # There is also the libsvm-ruby implementation. It was originally available from
+   # http://debian.cilibrar.com/debian/pool/main/libs/libsvm-ruby/libsvm-ruby_2.8.4.orig.tar.gz
+   # but was not available from there when I last checked. The Ubuntu package
+   # was still available as of this writing.
+   class Svm
+     #include YamlSerialization
+     attr_reader :class_labels, :feature_dictionary
+
+     def initialize
+       @total_classes = 0
+       @feature_dictionary = []
+       @class_labels = {}
+       @documents_for_class = Hash.new { |docs_hash, key| docs_hash[key] = [] }
+       @svm_parameter = default_svm_parameter
+     end
+
+     # Adds a new document to the training set. feature_vectors should be a
+     # collection of features that respond to #name.
+     def add_document(classification, feature_vectors)
+       update_class_labels_with_new(classification) if new_class?(classification)
+       @feature_dictionary += feature_vectors.map { |fv| fv.name }
+       @feature_dictionary.uniq!
+       @documents_for_class[classification] << feature_vectors.map { |fv| fv.name }
+       reset_memoized_vars!
+     end
+
+     # Gives the vector representation of the training documents of class
+     # _classification_.
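+     # For example, with a (hypothetical) feature dictionary of
+     # ["cat", "dog", "fish"], a training document containing "cat" and "fish"
+     # is represented as [1, 0, 1].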
+     def vectorized_docs(classification)
+       # hardwired to binary representation
+       @documents_for_class[classification].map do |features|
+         vectorize_doc(features)
+         #@feature_dictionary.map { |dict_feature| features.include?(dict_feature) ? 1 : 0}
+       end
+     end
+
+     # Returns the vectorized representation of the training data, suitable for
+     # use in the constructor for the libsvm Problem class.
+     def labels_and_document_vectors
+       # {:labels => [features1-label, features2-label, ...], :features => [features1, features2, ...]}
+       labels_features = {:labels => [], :features => []}
+       @class_labels.each do |classification, label|
+         vectorized_docs(classification).each do |document_vector|
+           labels_features[:labels] << label
+           labels_features[:features] << document_vector
+         end
+       end
+       labels_features
+     end
+
+     # Classifies a document, given as a collection of features responding to
+     # #name. Returns one of #classes.
+     def classify(feature_vectors)
+       class_of_label(model.predict(vectorize_doc(feature_vectors.map { |fv| fv.name })))
+     end
+
+     # The classification labels seen so far.
+     def classes
+       @class_labels.keys
+     end
+
+     # Exposes the libsvm-ruby-swig Parameter object. If given
+     # a block, the parameter object is yielded; otherwise,
+     # it's returned.
+     #
+     # For example, to set parameters to their default values:
+     #
+     #   basset_svm_obj.parameters do |param|
+     #     param.C = 100
+     #     param.svm_type = NU_SVC
+     #     param.degree = 1
+     #     param.coef0 = 0
+     #     param.eps = 0.001
+     #     param.kernel_type = RBF
+     #   end
+     #
+     # To access one value:
+     #
+     #   basset_svm_obj.parameters.svm_type
+     #   # => NU_SVC
+     def parameters
+       if block_given?
+         yield @svm_parameter
+       else
+         @svm_parameter
+       end
+     end
+
+     private
+
+     def vectorize_doc(features)
+       vectorized_doc = Array.new(@feature_dictionary.size, 0)
+       features.each do |feature|
+         if index = feature_dictionary_hash[feature]
+           vectorized_doc[index] = 1
+         end
+       end
+       vectorized_doc
+     end
+
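+     # Memoized lookup from feature name to its index in @feature_dictionary,
+     # built on the bloomfilter gem's hash-like interface (presumably chosen
+     # over a plain Hash to save memory on large dictionaries).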
+     def feature_dictionary_hash
+       unless @memoized_feature_dictionary_hash
+         m = 15 * @feature_dictionary.count # bloom filter size (bytes)
+         @memoized_feature_dictionary_hash = BloomFilter.new(m, 3, 23)
+
+         @feature_dictionary.each_index do |i|
+           @memoized_feature_dictionary_hash[@feature_dictionary[i]] = i
+         end
+       end
+       @memoized_feature_dictionary_hash
+     end
+
+     def reset_memoized_vars!
+       @memoized_model, @memoized_problem, @memoized_feature_dictionary_hash = nil, nil, nil
+       @memoized_inverted_class_labels = nil
+     end
+
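+     # The libsvm Model is built lazily from the current training set and
+     # memoized until the next add_document call resets it.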
+     def model
+       @memoized_model ||= Model.new(problem, @svm_parameter)
+     end
+
+     def problem
+       unless @memoized_problem
+         labels_features = labels_and_document_vectors
+         @memoized_problem = Problem.new(labels_features[:labels], labels_features[:features])
+       end
+       @memoized_problem
+     end
+
+     def new_class?(classification)
+       !@class_labels.keys.include?(classification)
+     end
+
+     def default_svm_parameter
+       param = ::Parameter.new
+       param.C = 100
+       param.svm_type = NU_SVC
+       param.degree = 1
+       param.coef0 = 0
+       param.eps = 0.001
+       param.nu = 0.5 #?! this blows up on my dataset...
+       param.kernel_type = RBF
+       param
+     end
+
+     def update_class_labels_with_new(classification)
+       #@class_labels.each_value { |vector| vector << 0 }
+       @class_labels[classification] = @total_classes #Array.new(@total_classes, 0) << 1
+       @total_classes += 1
+     end
+
+     def class_of_label(label)
+       unless @memoized_inverted_class_labels
+         @memoized_inverted_class_labels = @class_labels.invert
+       end
+       @memoized_inverted_class_labels[label.to_i]
+     end
+
+   end
+ end
@@ -0,0 +1,41 @@
+ require "yaml"
+
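+ # Mixin that adds YAML persistence to a class. A minimal usage sketch
+ # (hypothetical class name):
+ #
+ #   class MyClassifier
+ #     include YamlSerialization
+ #   end
+ #
+ #   MyClassifier.new.save_to_file("/tmp/classifier.yml")
+ #   classifier = MyClassifier.load_from_file("/tmp/classifier.yml")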
+ module YamlSerialization
+
+   def self.included(base)
+     base.extend ClassMethods
+   end
+
+   module ClassMethods
+     def load_from_file(file_name)
+       YAML.load_file(file_name)
+     end
+   end
+
+   def save_to_file(file_name)
+     File.open(file_name, 'w') do |file|
+       YAML.dump(self, file)
+     end
+   end
+
+ end
+
+ # Teach YAML how to dump and load Class objects by name.
+ class ::Class
+   yaml_as "tag:ruby.yaml.org,2002:class"
+
+   def Class.yaml_new(klass, tag, val)
+     if String === val
+       val.split(/::/).inject(Object) { |m, n| m.const_get(n) }
+     else
+       raise YAML::TypeError, "Invalid Class: " + val.inspect
+     end
+   end
+
+   def to_yaml(opts = {})
+     YAML::quick_emit(nil, opts) { |out|
+       out.scalar("tag:ruby.yaml.org,2002:class", self.name, :plain)
+     }
+   end
+
+ end
data/spec/spec.opts ADDED
@@ -0,0 +1 @@
+ -c
@@ -0,0 +1,12 @@
+ require 'spec'
+ require File.join(File.dirname(__FILE__), "..", "lib", "basset")
+
+ class Array
+
+   # Spec convenience: the elements as strings, sorted.
+   def sort_to_s
+     self.map { |item| item.to_s }.sort
+   end
+
+ end
+
+ include Basset
@@ -0,0 +1,166 @@
+ require File.dirname(__FILE__) + '/../spec_helper'
+
+ describe Classifier do
+
+   before(:each) do
+     @classifier = Classifier.new
+   end
+
+   it "should automagically determine the ruby class of the classifier engine a la rails' constantize" do
+     Classifier.new(:type => :naive_bayes).engine.class.should == NaiveBayes
+   end
+
+   it "should automagically determine the ruby class of the document type" do
+     Classifier.new(:type => :naive_bayes, :doctype => :document).doctype.should == Document
+   end
+
+   it "should default to NaiveBayes engine and Document doctype" do
+     classifier = Classifier.new
+     classifier.engine.class.should == NaiveBayes
+     classifier.doctype.should == Document
+   end
+
+   it "should accept training docs as plain strings, extracting features automatically" do
+     @classifier.train(:hip, "that hipster has an asymmetrical haircut")
+     @classifier.train(:unhip, "that dude is a frat boy")
+     @classifier.engine.classes.should == [:hip, :unhip]
+     @classifier.engine.occurrences_of_all_features_in_class(:unhip).should == 6
+   end
+
+   it "should classify documents" do
+     @classifier.train(:hip, "that hipster has an asymmetrical haircut")
+     @classifier.train(:unhip, "that dude is a frat boy")
+     @classifier.classify("hipsters").should == :hip
+   end
+
+   it "should train iteratively for speed learning" do
+     @classifier.train(:hip, "turntables", "techno music", "DJs with turntables", "techno DJs")
+     @classifier.train(:unhip, "rock music", "guitar bass drums", "guitar rock", "guitar players")
+     @classifier.classify("guitar music").should == :unhip
+     # now everyone likes rock music again! retrain fast! cf LCD Soundsystem
+     @classifier.train_iterative(:hip, "guitars") # takes 3 iterations
+     @classifier.classify("guitars").should == :hip
+   end
+
+   it "should give document scores for a class" do
+     @classifier.train(:hip, "turntables", "techno music", "DJs with turntables", "techno DJs")
+     @classifier.train(:unhip, "rock music", "guitar bass drums", "guitar rock", "guitar players")
+     @classifier.similarity_score(:hip, "guitars").should be_a Float
+   end
+
+ end
+
+
+ describe AnomalyDetector do
+
+   YAML_FILE_BASENAME = "/tmp/basset_anomaly_detector_rspec_savefile"
+
+   def tmp_file
+     YAML_FILE_BASENAME + rand(2 ** 32).to_s(16)
+   end
+
+   before(:each) do
+     @detector = AnomalyDetector.new
+   end
+
+   after(:each) do
+     Dir.glob(YAML_FILE_BASENAME + '*').each { |file| File.delete(file) }
+   end
+
+   def train_detector_on_code_love
+     @detector.train("coding all night and loving it", "coding and drinking jolt")
+   end
+
+   it "should train on the normal set only" do
+     @detector.train("coding all night", "coding and drinking jolt")
+     @detector.engine.classes.should == [:normal]
+     @detector.engine.occurrences_of_all_features_in_class(:normal).should == 7
+   end
+
+   it "should give a score for the probability that a document is in the 'normal' set" do
+     train_detector_on_code_love
+     score = @detector.similarity_score("I love coding and jolt")
+     score.should be_a Float
+     score.should be_close(-1, 2)
+   end
+
+   it "should give a list of the probability scores for the training set" do
+     train_detector_on_code_love
+     @detector.scores_for_training_set.should have(2).items
+     @detector.scores_for_training_set.each do |score|
+       score.should be_close(-1, 1)
+     end
+   end
+
+   it "should compute the average probability score for the training set" do
+     train_detector_on_code_love
+     @detector.avg_score_of_training_set.should be_close(-0.841, 0.001)
+   end
+
+   it "should give the range of probability scores for the training set" do
+     train_detector_on_code_love
+     @detector.score_range_of_training_set.should be_a Range
+     @detector.score_range_of_training_set.first.should be_close(-0.864, 0.001)
+     @detector.score_range_of_training_set.last.should be_close(-0.818, 0.001)
+   end
+
+   it "should give the standard deviation of probability scores for the training set" do
+     train_detector_on_code_love
+     @detector.stddev_of_scores_of_training_set.should be_close(0.0234, 0.001) #0.0234
+   end
+
+   it "should use the average minus 4 times the stddev as the lower bound for normal" do
+     train_detector_on_code_love
+     expected = -0.841 - (4 * 0.0234)
+     @detector.minimum_acceptable_score.should be_close(expected, 0.001)
+   end
+
+   it "should say if text is anomalous or not" do
+     train_detector_on_code_love
+     @detector.should be_anomalous("watching tv")
+     @detector.should_not be_anomalous("code and jolt")
+   end
+
+   it "should classify text as anomalous or normal" do
+     train_detector_on_code_love
+     @detector.classify("watching tv").should == :anomalous
+     @detector.classify("code and jolt").should == :normal
+   end
+
+   it "should give an anomaly score based on the std deviations from mean" do
+     train_detector_on_code_love
+     @detector.anomaly_score("watching_tv").should be_close(80, 10)
+   end
+
+   it "should train iteratively" do
+     train_detector_on_code_love
+     50.times { @detector.train("coding drinking jolt") }
+     @detector.train_iterative("watching tv")
+     @detector.should be_normal("watching tv")
+   end
+
+   it "should reset memoized values to nil after retraining" do
+     train_detector_on_code_love
+     @detector.score_range_of_training_set
+     @detector.scores_for_training_set.should_not be_nil
+     train_detector_on_code_love
+     @detector.instance_variable_get(:@scores_for_training_set).should be_nil
+   end
+
+   it "should serialize itself to YAML" do
+     train_detector_on_code_love
+     file = tmp_file
+     @detector.save_to_file(file)
+     File.exist?(file).should be_true
+   end
+
+   it "should load itself from YAML" do
+     train_detector_on_code_love
+     file = tmp_file
+     @detector.save_to_file(file)
+     reloaded_detector = AnomalyDetector.load_from_file(file)
+     reloaded_detector.should == @detector
+   end
+
+ end
@@ -0,0 +1,33 @@
+ require File.dirname(__FILE__) + '/../spec_helper'
+
+ describe Array, "with Basset extensions" do
+
+   it "should give the tail of an array like FP lists do" do
+     [1, 2, 3].rest.should == [2, 3]
+   end
+
+   it "should not choke when giving the tail of an empty list" do
+     [].rest.should == []
+   end
+
+   it "should return a random element" do
+     srand(123456)
+     [1, 2, 3, 4].pick_random.should == 2
+   end
+
+   it "should randomly rearrange itself" do
+     srand(123456)
+     [1, 2, 3, 4].randomize.should == [1, 3, 4, 2]
+   end
+
+   it "should sum itself" do
+     [1, 2, 3, 4].sum.should == 10
+   end
+
+ end
+
+ describe Float, "with Basset extensions" do
+   it "should convert itself to a string with variable precision" do
+     1.23456.to_s_decimal_places(3).should == "1.234"
+   end
+ end
@@ -0,0 +1,59 @@
+ require File.join(File.dirname(__FILE__), "..", "spec_helper")
+
+ describe Document do
+   it "should remove punctuation from words" do
+     Document.new("abc.").vector_of_features.should == [Feature.new("abc", 1)]
+   end
+
+   it "should remove numbers from words" do
+     Document.new("abc1").vector_of_features.should == [Feature.new("abc", 1)]
+   end
+
+   it "should remove symbols from words" do
+     Document.new("abc%").vector_of_features.should == [Feature.new("abc", 1)]
+   end
+
+   it "should lowercase text" do
+     Document.new("ABC").vector_of_features.should == [Feature.new("abc", 1)]
+   end
+
+   it "should stem words" do
+     Document.new("testing").vector_of_features.should == [Feature.new("test", 1)]
+   end
+
+   it "should count feature occurrences" do
+     Document.new("test doc test", :test).vector_of_features.should ==
+       [Feature.new("doc", 1), Feature.new("test", 2)]
+   end
+ end
+
+ describe UriDocument do
+
+   def single_features(*uris)
+     uris.flatten.map { |uri| Feature.new(uri.to_s, 1) }
+   end
+
+   it "should extract URI token separators &, ?, \\, /, =, [, ], and . separately" do
+     expected_features = [:a, :b, :c, :d, :e, :f, :g, :h, :i, '&', '?', "\\", '/', '=', '[', ']', '.']
+     expected = single_features(expected_features).sort
+     UriDocument.new('a&b?c\d/e=f[g]h.i').feature_vectors.sort.should == expected
+   end
+
+   it "should extract two dots as a single feature instead of two separate dots" do
+     UriDocument.new('..').feature_vectors.should == [Feature.new("..", 1)]
+   end
+
+   it "should extract two slashes as a single feature" do
+     UriDocument.new('//').feature_vectors.should == [Feature.new('//', 1)]
+     UriDocument.new("\\\\").feature_vectors.should == [Feature.new('\\\\', 1)]
+   end
+
+   it "should not stem words" do
+     UriDocument.new("testing").feature_vectors.should == [Feature.new("testing", 1)]
+   end
+
+   it "should URI decode encoded strings" do
+     UriDocument.new("%23%25").feature_vectors.should == [Feature.new("#%", 1)]
+   end
+
+ end