danielsdeleo-basset 1.0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/lib/basset/svm.rb ADDED
@@ -0,0 +1,180 @@
1
+ require "svm" #File.dirname(__FILE__) + "/../../../libsvm-ruby-swig/lib/svm"
2
+
3
+ require "bloomfilter" # igrigorik-bloomfilter (github)
4
+
5
+ module Basset
6
+ # =Overview
7
+ # A class for SVM document classification. Follows the same basic interface
8
+ # as NaiveBayes; add labeled training documents to the classifier, then
9
+ # use it to classify unlabeled documents. Do test your accuracy before
10
+ # using the classifier in production; there are a lot of knobs to tweak.
11
+ # When testing, it is usually best to use a separate set of documents, i.e.,
12
+ # not the training set.
13
+ # =Learning Resources
14
+ # SVM can be tricky to understand at first; try the following references:
15
+ # http://en.wikipedia.org/wiki/Support_vector_machine
16
+ # http://www.igvita.com/2008/01/07/support-vector-machines-svm-in-ruby/
17
+ # http://www.csie.ntu.edu.tw/~cjlin/papers/guide/guide.pdf
18
+ # =Implementation
19
+ # This class wraps libsvm-ruby-swig, which is itself a SWIG-based wrapper for
20
+ # libsvm.
21
+ # libsvm-ruby-swig: http://github.com/tomz/libsvm-ruby-swig
22
+ # libsvm: http://www.csie.ntu.edu.tw/~cjlin/libsvm
23
+ # verbose version:
24
+ # Chih-Chung Chang and Chih-Jen Lin, LIBSVM : a library for support vector machines, 2001. Software available at http://www.csie.ntu.edu.tw/~cjlin/libsvm
25
+ #
26
+ # There is also the libsvm-ruby implementation. It was originally available from
27
+ # http://debian.cilibrar.com/debian/pool/main/libs/libsvm-ruby/libsvm-ruby_2.8.4.orig.tar.gz
28
+ # but was not available from there when I last checked. The Ubuntu package
29
+ # was still available as of this writing.
30
+ class Svm
31
+ #include YamlSerialization
32
+ attr_reader :class_labels, :feature_dictionary
33
+
34
+ def initialize
35
+ @total_classes = 0
36
+ @feature_dictionary = []
37
+ @class_labels = {}
38
+ @documents_for_class = Hash.new {|docs_hash,key| docs_hash[key] = []}
39
+ @svm_parameter = default_svm_parameter
40
+ end
41
+
42
+ # Adds a new document to the training set.
43
+ def add_document(classification, feature_vectors)
44
+ update_class_labels_with_new(classification) if new_class?(classification)
45
+ @feature_dictionary += feature_vectors.map { |fv| fv.name }
46
+ @feature_dictionary.uniq!
47
+ @documents_for_class[classification] << feature_vectors.map { |fv| fv.name }
48
+ reset_memoized_vars!
49
+ end
50
+
51
+ # Gives the vector representation of the training documents of class
52
+ # _classification_
53
+ def vectorized_docs(classification)
54
+ # hardwired to binary representation
55
+ @documents_for_class[classification].map do |features|
56
+ vectorize_doc(features)
57
+ #@feature_dictionary.map { |dict_feature| features.include?(dict_feature) ? 1 : 0}
58
+ end
59
+ end
60
+
61
+ # Returns the vectorized representation of the training data, suitable for
62
+ # use in the constructor for the libsvm Problem class.
63
+ def labels_and_document_vectors
64
+ # {:labels => [features1-label, features2-label, ...], :features => [features1, features2, ...]}
65
+ labels_features = {:labels => [], :features => []}
66
+ @class_labels.each do |classification, label|
67
+ vectorized_docs(classification).each do |document_vector|
68
+ labels_features[:labels] << label
69
+ labels_features[:features] << document_vector
70
+ end
71
+ end
72
+ labels_features
73
+ end
74
+
75
+ def classify(feature_vectors)
76
+ class_of_label(model.predict(vectorize_doc(feature_vectors.map { |fv| fv.name })))
77
+ end
78
+
79
+ def classes
80
+ @class_labels.keys
81
+ end
82
+
83
+ # Exposes the libsvm-ruby-swig Parameter object. If given
84
+ # a block, the parameter object is yielded, otherwise,
85
+ # it's returned.
86
+ #
87
+ # For example, to set parameters to their default values:
88
+ #
89
+ # basset_svm_obj.parameters do |param|
90
+ # param.C = 100
91
+ # param.svm_type = NU_SVC
92
+ # param.degree = 1
93
+ # param.coef0 = 0
94
+ # param.eps= 0.001
95
+ # param.kernel_type = RBF
96
+ # end
97
+ #
98
+ # To access one value:
99
+ # basset_svm_obj.parameters.svm_type
100
+ # => NU_SVC
101
+ def parameters
102
+ if block_given?
103
+ yield @svm_parameter
104
+ else
105
+ @svm_parameter
106
+ end
107
+ end
108
+
109
+ private
110
+
111
+ def vectorize_doc(features)
112
+ vectorized_doc = Array.new(@feature_dictionary.size, 0)
113
+ features.each do |feature|
114
+ if index = feature_dictionary_hash[feature]
115
+ vectorized_doc[index] = 1
116
+ end
117
+ end
118
+ vectorized_doc
119
+ end
120
+
121
+ def feature_dictionary_hash
122
+ unless @memoized_feature_dictionary_hash
123
+ m = 15 * @feature_dictionary.count # bloom filter size (bytes)
124
+ @memoized_feature_dictionary_hash = BloomFilter.new(m,3,23)
125
+
126
+ @feature_dictionary.each_index do |i|
127
+ @memoized_feature_dictionary_hash[@feature_dictionary[i]] = i
128
+ end
129
+ end
130
+ @memoized_feature_dictionary_hash
131
+ end
132
+
133
+ def reset_memoized_vars!
134
+ @memoized_model, @memoized_problem, @memoized_feature_dictionary_hash = nil, nil, nil
135
+ @memoized_inverted_class_labels = nil
136
+ end
137
+
138
+ def model
139
+ @memoized_model ||= Model.new(problem, @svm_parameter)
140
+ end
141
+
142
+ def problem
143
+ unless @memoized_problem
144
+ labels_features = labels_and_document_vectors
145
+ @memoized_problem = Problem.new(labels_features[:labels], labels_features[:features])
146
+ end
147
+ @memoized_problem
148
+ end
149
+
150
+ def new_class?(classification)
151
+ !@class_labels.keys.include?(classification)
152
+ end
153
+
154
+ def default_svm_parameter
155
+ param = ::Parameter.new
156
+ param.C = 100
157
+ param.svm_type = NU_SVC
158
+ param.degree = 1
159
+ param.coef0 = 0
160
+ param.eps= 0.001
161
+ param.nu = 0.5 #?! this blows up on my dataset...
162
+ param.kernel_type = RBF
163
+ param
164
+ end
165
+
166
+ def update_class_labels_with_new(classification)
167
+ #@class_labels.each_value { |vector| vector << 0 }
168
+ @class_labels[classification] = @total_classes #Array.new(@total_classes, 0) << 1
169
+ @total_classes += 1
170
+ end
171
+
172
+ def class_of_label(label)
173
+ unless @memoized_inverted_class_labels
174
+ @memoized_inverted_class_labels = @class_labels.invert
175
+ end
176
+ @memoized_inverted_class_labels[label.to_i]
177
+ end
178
+
179
+ end
180
+ end
@@ -0,0 +1,41 @@
1
+ require "yaml"
2
+
3
+ module YamlSerialization
4
+
5
+ def self.included(base)
6
+ base.extend ClassMethods
7
+ end
8
+
9
+
10
+ module ClassMethods
11
+ def load_from_file(file_name)
12
+ YAML.load_file(file_name)
13
+ end
14
+ end
15
+
16
+ def save_to_file(file_name)
17
+ File.open(file_name, 'w') do |file|
18
+ YAML.dump(self, file)
19
+ end
20
+ end
21
+
22
+ end
23
+
24
+ class ::Class
25
+ yaml_as "tag:ruby.yaml.org,2002:class"
26
+
27
+ def Class.yaml_new( klass, tag, val )
28
+ if String === val
29
+ val.split(/::/).inject(Object) {|m, n| m.const_get(n)}
30
+ else
31
+ raise YAML::TypeError, "Invalid Class: " + val.inspect
32
+ end
33
+ end
34
+
35
+ def to_yaml( opts = {} )
36
+ YAML::quick_emit( nil, opts ) { |out|
37
+ out.scalar( "tag:ruby.yaml.org,2002:class", self.name, :plain )
38
+ }
39
+ end
40
+
41
+ end
data/spec/spec.opts ADDED
@@ -0,0 +1 @@
1
+ -c
@@ -0,0 +1,12 @@
1
+ require 'spec'
2
+ require File.join(File.dirname(__FILE__), "..", "lib", "basset")
3
+
4
+ class Array
5
+
6
+ def sort_to_s
7
+ self.map { |item| item.to_s }.sort
8
+ end
9
+
10
+ end
11
+
12
+ include Basset
@@ -0,0 +1,166 @@
1
+ require File.dirname(__FILE__) + '/../spec_helper'
2
+
3
+ describe Classifier do
4
+
5
+ before(:each) do
6
+ @classifier = Classifier.new
7
+ end
8
+
9
+ it "should automagically determine the ruby class of the classifier engine a la rails' constantize" do
10
+ Classifier.new(:type => :naive_bayes).engine.class.should == NaiveBayes
11
+ end
12
+
13
+ it "should automagically determine the ruby class of the document type" do
14
+ Classifier.new(:type => :naive_bayes, :doctype => :document).doctype.should == Document
15
+ end
16
+
17
+ it "should default to NaiveBayes engine and Document doctype" do
18
+ classifier = Classifier.new()
19
+ classifier.engine.class.should == NaiveBayes
20
+ classifier.doctype.should == Document
21
+ end
22
+
23
+
24
+ it "should accept training docs as plain strings, extracting features automatically" do
25
+ @classifier.train(:hip, "that hipster has an asymmetrical haircut")
26
+ @classifier.train(:unhip, "that dude is a frat boy")
27
+ @classifier.engine.classes.should == [:hip, :unhip]
28
+ @classifier.engine.occurrences_of_all_features_in_class(:unhip).should == 6
29
+ end
30
+
31
+ it "should classify documents" do
32
+ @classifier.train(:hip, "that hipster has an asymmetrical haircut")
33
+ @classifier.train(:unhip, "that dude is a frat boy")
34
+ @classifier.classify("hipsters").should == :hip
35
+ end
36
+
37
+ it "should train iteratively for speed learning" do
38
+ @classifier.train(:hip, "turntables", "techno music", "DJs with turntables", "techno DJs")
39
+ @classifier.train(:unhip, "rock music", "guitar bass drums", "guitar rock", "guitar players")
40
+ @classifier.classify("guitar music").should == :unhip
41
+ # now everyone likes rock music again! retrain fast! cf LCD Soundsystem
42
+ @classifier.train_iterative(:hip, "guitars") # takes 3 iterations
43
+ @classifier.classify("guitars").should == :hip
44
+ end
45
+
46
+ it "should give document scores for a class" do
47
+ @classifier.train(:hip, "turntables", "techno music", "DJs with turntables", "techno DJs")
48
+ @classifier.train(:unhip, "rock music", "guitar bass drums", "guitar rock", "guitar players")
49
+ @classifier.similarity_score(:hip, "guitars").should be_a Float
50
+ end
51
+
52
+ end
53
+
54
+
55
+ describe AnomalyDetector do
56
+
57
+ YAML_FILE_BASENAME = "/tmp/basset_anomaly_detector_rspec_savefile"
58
+
59
+ def tmp_file
60
+ YAML_FILE_BASENAME + rand(2 ** 32).to_s(16)
61
+ end
62
+
63
+ before(:each) do
64
+ @detector = AnomalyDetector.new
65
+ end
66
+
67
+ after(:each) do
68
+ Dir.glob(YAML_FILE_BASENAME + '*').each {|file| File.delete file}
69
+ end
70
+
71
+ def train_detector_on_code_love
72
+ @detector.train("coding all night and loving it", "coding and drinking jolt")
73
+ end
74
+
75
+ it "should train on the normal set only" do
76
+ @detector.train("coding all night", "coding and drinking jolt")
77
+ @detector.engine.classes.should == [:normal]
78
+ @detector.engine.occurrences_of_all_features_in_class(:normal).should == 7
79
+ end
80
+
81
+ it "should give a score for the probability of a document to be in the ``normal'' set" do
82
+ train_detector_on_code_love
83
+ score = @detector.similarity_score("I love coding and jolt")
84
+ score.should be_a Float
85
+ score.should be_close(-1, 2)
86
+ end
87
+
88
+ it "should give a list of the probability scores for the training set" do
89
+ train_detector_on_code_love
90
+ @detector.scores_for_training_set.should have(2).items
91
+ @detector.scores_for_training_set.each do |score|
92
+ score.should be_close(-1, 1)
93
+ end
94
+ end
95
+
96
+ it "should compute the average probability score for training set" do
97
+ train_detector_on_code_love
98
+ @detector.avg_score_of_training_set.should be_close(-0.841, 0.001)
99
+ end
100
+
101
+ it "should give the range of probability scores for the training set" do
102
+ train_detector_on_code_love
103
+ @detector.score_range_of_training_set.should be_a Range
104
+ @detector.score_range_of_training_set.first.should be_close(-0.864, 0.001)
105
+ @detector.score_range_of_training_set.last.should be_close(-0.818, 0.001)
106
+ end
107
+
108
+ it "should give the standard deviation of probability scores for the training set" do
109
+ train_detector_on_code_love
110
+ @detector.stddev_of_scores_of_training_set.should be_close(0.0234, 0.001) #0.0234
111
+ end
112
+
113
+ it "should use the average minus 4 times the stddev as the lower bound for normal" do
114
+ train_detector_on_code_love
115
+ expected = -0.841 - (4 * 0.0234)
116
+ @detector.minimum_acceptable_score.should be_close(expected, 0.001)
117
+ end
118
+
119
+ it "should say if text is anomalous or not" do
120
+ train_detector_on_code_love
121
+ @detector.should be_anomalous("watching tv")
122
+ @detector.should_not be_anomalous("code and jolt")
123
+ end
124
+
125
+ it "should classify text as anomalous or normal" do
126
+ train_detector_on_code_love
127
+ @detector.classify("watching tv").should == :anomalous
128
+ @detector.classify("code and jolt").should == :normal
129
+ end
130
+
131
+ it "should give an anomaly score based on the std deviations from mean" do
132
+ train_detector_on_code_love
133
+ @detector.anomaly_score("watching_tv").should be_close( 80, 10)
134
+ end
135
+
136
+ it "should train iteratively" do
137
+ train_detector_on_code_love
138
+ 50.times {@detector.train("coding drinking jolt")}
139
+ @detector.train_iterative("watching tv")
140
+ @detector.should be_normal("watching tv")
141
+ end
142
+
143
+ it "should reset memoized values to nil after retraining" do
144
+ train_detector_on_code_love
145
+ @detector.score_range_of_training_set
146
+ @detector.scores_for_training_set.should_not be_nil
147
+ train_detector_on_code_love
148
+ @detector.instance_variable_get(:@scores_for_training_set).should be_nil
149
+ end
150
+
151
+ it "should serialize itself to YAML" do
152
+ train_detector_on_code_love
153
+ file = tmp_file
154
+ @detector.save_to_file(file)
155
+ File.exist?(file).should be_true
156
+ end
157
+
158
+ it "should load itself from YAML" do
159
+ train_detector_on_code_love
160
+ file = tmp_file
161
+ @detector.save_to_file(file)
162
+ reloaded_detector = AnomalyDetector.load_from_file(file)
163
+ reloaded_detector.should == @detector
164
+ end
165
+
166
+ end
@@ -0,0 +1,33 @@
1
+ require File.dirname(__FILE__) + '/../spec_helper'
2
+
3
+ describe Array, "with Basset extensions" do
4
+
5
+ it "should give the tail of an array like FP lists do" do
6
+ [1,2,3].rest.should == [2,3]
7
+ end
8
+
9
+ it "should not choke when giving the tail of an empty list" do
10
+ [].rest.should == []
11
+ end
12
+
13
+ it "should return a random element" do
14
+ srand(123456)
15
+ [1,2,3,4].pick_random.should == 2
16
+ end
17
+
18
+ it "should randomly rearrange itself" do
19
+ srand(123456)
20
+ [1,2,3,4].randomize.should == [1,3,4,2]
21
+ end
22
+
23
+ it "should sum itself" do
24
+ [1,2,3,4].sum.should == 10
25
+ end
26
+
27
+ end
28
+
29
+ describe Float, "with Basset extensions" do
30
+ it "should convert itself to a string with variable precsion" do
31
+ 1.23456.to_s_decimal_places(3).should == "1.234"
32
+ end
33
+ end
@@ -0,0 +1,59 @@
1
+ require File.join(File.dirname(__FILE__), "..", "spec_helper")
2
+
3
+ describe Document do
4
+ it "should remove punctuation from words" do
5
+ Document.new("abc.").vector_of_features.should == [Feature.new("abc", 1)]
6
+ end
7
+
8
+ it "should remove numbers from words" do
9
+ Document.new("abc1").vector_of_features.should == [Feature.new("abc", 1)]
10
+ end
11
+
12
+ it "should remove symbols from words" do
13
+ Document.new("abc%").vector_of_features.should == [Feature.new("abc", 1)]
14
+ end
15
+
16
+ it "should lowercase text" do
17
+ Document.new("ABC").vector_of_features.should == [Feature.new("abc", 1)]
18
+ end
19
+
20
+ it "should stem words" do
21
+ Document.new("testing").vector_of_features.should == [Feature.new("test", 1)]
22
+ end
23
+
24
+ it "should count feature occurances" do
25
+ Document.new("test doc test", :test).vector_of_features.should ==
26
+ [Feature.new("doc", 1), Feature.new("test", 2)]
27
+ end
28
+ end
29
+
30
+ describe UriDocument do
31
+
32
+ def single_features(*uris)
33
+ uris.flatten.map { |uri| Feature.new(uri.to_s, 1) }
34
+ end
35
+
36
+ it "should extract URI token separators &, ?, \\, /, =, [, ], and . separately" do
37
+ expected_features = [:a,:b,:c,:d,:e,:f,:g,:h, :i, '&', '?', "\\", '/', '=', '[', ']', '.']
38
+ expected = single_features(expected_features).sort
39
+ UriDocument.new('a&b?c\d/e=f[g]h.i').feature_vectors.sort.should == expected
40
+ end
41
+
42
+ it "should extract two dots as a single feature instead of two dots" do
43
+ UriDocument.new('..').feature_vectors.should == [Feature.new("..", 1)]
44
+ end
45
+
46
+ it "should extract two slashes as a single feature" do
47
+ UriDocument.new('//').feature_vectors.should == [Feature.new('//', 1)]
48
+ UriDocument.new("\\\\").feature_vectors.should == [Feature.new('\\\\', 1)]
49
+ end
50
+
51
+ it "should not stem words" do
52
+ UriDocument.new("testing").feature_vectors.should == [Feature.new("testing", 1)]
53
+ end
54
+
55
+ it "should URI decode encoded strings" do
56
+ UriDocument.new("%23%25").feature_vectors.should == [Feature.new("#%", 1)]
57
+ end
58
+
59
+ end