rjspotter-basset 1.0.5
- data/History.txt +7 -0
- data/License.txt +20 -0
- data/Manifest.txt +21 -0
- data/README.rdoc +58 -0
- data/Rakefile +63 -0
- data/VERSION.yml +4 -0
- data/basset.gemspec +38 -0
- data/examples/example.rb +25 -0
- data/lib/basset.rb +9 -0
- data/lib/basset/classification_evaluator.rb +170 -0
- data/lib/basset/classifier.rb +188 -0
- data/lib/basset/core_extensions.rb +93 -0
- data/lib/basset/document.rb +84 -0
- data/lib/basset/document_override_example.rb +11 -0
- data/lib/basset/feature.rb +26 -0
- data/lib/basset/feature_extractor.rb +52 -0
- data/lib/basset/feature_selector.rb +126 -0
- data/lib/basset/naive_bayes.rb +151 -0
- data/lib/basset/svm.rb +180 -0
- data/lib/basset/yaml_serialization.rb +41 -0
- data/spec/spec.opts +1 -0
- data/spec/spec_helper.rb +12 -0
- data/spec/unit/classifier_spec.rb +166 -0
- data/spec/unit/core_extension_spec.rb +33 -0
- data/spec/unit/document_spec.rb +59 -0
- data/spec/unit/feature_extractor_spec.rb +33 -0
- data/spec/unit/feature_selector_spec.rb +108 -0
- data/spec/unit/feature_spec.rb +40 -0
- data/spec/unit/naive_bayes_spec.rb +119 -0
- data/spec/unit/svm_spec.rb +83 -0
- metadata +115 -0
data/History.txt
ADDED
data/License.txt
ADDED
@@ -0,0 +1,20 @@
Copyright (c) 2007 Paul Dix

Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:

The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/Manifest.txt
ADDED
@@ -0,0 +1,21 @@
History.txt
License.txt
Manifest.txt
README.txt
Rakefile
lib/basset.rb
lib/basset/classification_evaluator.rb
lib/basset/core_extensions.rb
lib/basset/document.rb
lib/basset/document_override_example.rb
lib/basset/feature.rb
lib/basset/feature_extractor.rb
lib/basset/feature_selector.rb
lib/basset/naive_bayes.rb
lib/basset/yaml_serialization.rb
spec/spec.opts
spec/spec_helper.rb
spec/unit/document_spec.rb
spec/unit/feature_extractor_spec.rb
spec/unit/feature_selector_spec.rb
spec/unit/feature_spec.rb
data/README.rdoc
ADDED
@@ -0,0 +1,58 @@
Author:: Paul Dix (mailto:paul@pauldix.net)

=Summary
This is Daniel DeLeo's fork of Paul Dix's basset[http://github.com/pauldix/basset/], a library for machine learning.

Basset includes a generic document representation class, a feature selector, a feature extractor, naive Bayes and SVM classifiers, and a classification evaluator for running tests. The goal is to create a general framework that is easy to modify for specific problems. It is designed to be extensible, so it should be easy to add more classification and clustering algorithms.

==Additions
I have added lots of tests (TATFT![http://smartic.us/2008/8/15/tatft-i-feel-a-revolution-coming-on]), a document class for URIs, a high-level classifier class, SVM support using libsvm-ruby-swig[http://github.com/tomz/libsvm-ruby-swig], and an anomaly detector (one-class classifier). I have also modified the naive Bayes classifier so that it no longer requires the feature selector and extractor. This should help you get encouraging results when you first start. Once you have a large training dataset, you can use the feature selector and extractor for better performance.

=Usage
The most popular task is spam identification, though there are many others, such as document retrieval, security/intrusion detection, and applications in biology and astrophysics.

To build a classifier, you'll first need a set of training documents. For a spam/non-spam classifier, this would consist of a number of documents that you have labeled as either spam or not. With training sets, bigger is better. You should have at least 100 of each type (spam and not spam). Really, 1,000 of each type would be better, and 10,000 of each would be super sweet.

==Simple Example
The Classifier class takes care of all of the messy details for you. It defaults to a naive Bayes classifier and to the Document class (best for Western natural language) for representing documents.

  # This example is based on the song "Losing My Edge" by LCD Soundsystem

  classifier = Basset::Classifier.new # default options

  classifier.train(:hip, "turntables", "techno music", "DJs with turntables", "techno DJs")
  classifier.train(:unhip, "rock music", "guitar bass drums", "guitar rock", "guitar players")
  classifier.classify("guitar music")
  => :unhip

  # now everyone likes rock music again! retrain the classifier fast!

  classifier.train_iterative(:hip, "guitars") # takes 3 iterations
  classifier.classify("guitars")
  => :hip

==Full Control
For more control over the various stages of the training and classification process, you can create document, feature selector, feature extractor, and document classifier objects directly. The process is as follows (an end-to-end sketch follows these lists):

* Create each training document as a Document (a class in this library)
* Pass those documents into the FeatureSelector
* Get the best features and pass those into the FeatureExtractor
* Now extract features from each document using the extractor and
* Pass those extracted features to NaiveBayes or Svm as part of the training set
* Now you can save the FeatureExtractor and NaiveBayes (or Svm) to a file

That represents the process of selecting features and training the classifier. Once you've done that, you can predict whether a new, previously unseen document is spam or not by doing the following:

* Load the feature extractor and document classifier from their files
* Create a new document object from your new, unseen document
* Extract the features from that document using the feature extractor and
* Pass those to the classify method of the naive Bayes classifier
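
Putting those steps together, and borrowing the calls shown in examples/example.rb and the sources shipped with this gem, a minimal end-to-end sketch might look like the following. Here spam_texts and ham_texts are placeholder arrays of strings, and the YAML save/load step is left out:

  # Build labeled documents for each class.
  documents = spam_texts.map { |text| Basset::Document.new(text, :spam) } +
              ham_texts.map  { |text| Basset::Document.new(text, :not_spam) }

  # Feature selection: feed every training document to the selector.
  feature_selector = Basset::FeatureSelector.new
  documents.each { |doc| feature_selector.add_document(doc) }

  # Feature extraction and training: build the extractor from the best
  # features, then train the classifier on the extracted vectors.
  feature_extractor = Basset::FeatureExtractor.new(feature_selector.best_features)
  naive_bayes = Basset::NaiveBayes.new
  documents.each do |doc|
    naive_bayes.add_document(doc.classification, feature_extractor.extract_numbered(doc))
  end

  # Classify an unseen document; classify returns a [score, class] pair.
  unseen = Basset::Document.new("cheap watches, guaranteed delivery")
  score, predicted_class = naive_bayes.classify(feature_extractor.extract_numbered(unseen))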

Something you'll probably want to do before doing real classification is to test things. Use the ClassificationEvaluator for this. You can pass your training documents in and have it run through a series of tests to estimate how successful the classifier will be at predicting unseen documents. Easy classification tasks will generally be > 90% accurate, while others can be much harder. Each classification task is different, and most of the time you won't know until you actually test it out.
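
Based on the ClassificationEvaluator source included later on this page, a cross-validation run could be driven roughly like the sketch below. Here spam_docs and ham_docs are assumed to be arrays of Basset::Document objects grouped by class, and the block stands in for an external classifier to compare against (it must return how many test documents it classified correctly):

  evaluator = Basset::ClassificationEvaluator.new([spam_docs, ham_docs], :folding_amount => 10)
  results = evaluator.compare_against_basset_classifiers([Basset::NaiveBayes.new]) do |training_set, test_set|
    0 # train and test an external classifier here, returning its number of correct answers
  end
  # results maps each classifier (plus "External") to its :correct, :total, and :time numbers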

=Contact
I love machine learning and classification, so if you have a problem that is giving you trouble, don't hesitate to get hold of me. The same applies for anyone who wants to write additional classifiers, better document representations, or just to tell me my code is amateur.

Author:: Paul Dix (mailto:paul@pauldix.net)
Site:: http://www.pauldix.net
Freenode:: pauldix in #nyc.rb
data/Rakefile
ADDED
@@ -0,0 +1,63 @@
require 'rubygems'
require 'rake/rdoctask'
require './lib/basset.rb'
require "spec/rake/spectask"
require "rake/clean"
require "rake/rdoctask"

desc "Run all of the specs"
Spec::Rake::SpecTask.new do |t|
  t.spec_opts = ['--options', "\"spec/spec.opts\""]
  t.fail_on_error = false
end

namespace :spec do

  desc "Run all spec with RCov"
  Spec::Rake::SpecTask.new(:rcov) do |t|
    t.rcov = true
    t.rcov_dir = 'doc/tools/coverage/'
    t.rcov_opts = ['--exclude', 'spec']
  end

  desc "Generate HTML report for failing examples"
  Spec::Rake::SpecTask.new('report') do |t|
    t.spec_files = FileList['failing_examples/**/*.rb']
    t.spec_opts = ["--format", "html:doc/tools/reports/failing_examples.html", "--diff", '--options', '"spec/spec.opts"']
    t.fail_on_error = false
  end

end


Rake::RDocTask.new do |rdt|
  rdt.rdoc_dir = "doc"
  rdt.main = "README.rdoc"
  rdt.rdoc_files.include("README.rdoc", "lib/*", "ext/*/*.yy.c")
end

begin
  require 'jeweler'
  Jeweler::Tasks.new do |s|
    s.name = 'basset'
    s.summary = 'A library for machine learning and classification'
    s.description = s.summary
    s.email = 'ddeleo@basecommander.net'
    s.homepage = "http://github.com/danielsdeleo/basset"
    s.platform = Gem::Platform::RUBY
    s.has_rdoc = true
    s.extra_rdoc_files = ["README.rdoc"]
    s.require_path = ["lib"]
    s.authors = ['Paul Dix', 'Bryan Helmkamp', 'Daniel DeLeo']
    s.add_dependency('stemmer', '>= 1.0.1')
    # ruby -rpp -e' pp `git ls-files`.split("\n") '
    s.files = `git ls-files`.split("\n").reject {|f| f =~ /git/}
  end
rescue LoadError
  puts "Jeweler not available. Install it with: sudo gem install technicalpickles-jeweler -s http://gems.github.com"
end

desc "outputs a list of files suitable for use with the gemspec"
task :list_files do
  sh %q{ruby -rpp -e' pp `git ls-files`.split("\n").reject {|f| f =~ /git/} '}
end
data/VERSION.yml
ADDED
data/basset.gemspec
ADDED
@@ -0,0 +1,38 @@
# -*- encoding: utf-8 -*-

Gem::Specification.new do |s|
  s.name = %q{basset}
  s.version = "1.0.5"

  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
  s.authors = ["Paul Dix", "Bryan Helmkamp", "Daniel DeLeo", "R. Potter"]
  s.date = %q{2009-05-09}
  s.description = %q{A library for machine learning and classification}
  s.email = %q{rjspotter@gmail.com}
  s.extra_rdoc_files = ["README.rdoc"]
  s.files = ["History.txt", "License.txt", "Manifest.txt", "README.rdoc", "Rakefile", "VERSION.yml", "basset.gemspec", "examples/example.rb", "lib/basset.rb", "lib/basset/classification_evaluator.rb", "lib/basset/classifier.rb", "lib/basset/core_extensions.rb", "lib/basset/document.rb", "lib/basset/document_override_example.rb", "lib/basset/feature.rb", "lib/basset/feature_extractor.rb", "lib/basset/feature_selector.rb", "lib/basset/naive_bayes.rb", "lib/basset/svm.rb", "lib/basset/yaml_serialization.rb", "spec/spec.opts", "spec/spec_helper.rb", "spec/unit/classifier_spec.rb", "spec/unit/core_extension_spec.rb", "spec/unit/document_spec.rb", "spec/unit/feature_extractor_spec.rb", "spec/unit/feature_selector_spec.rb", "spec/unit/feature_spec.rb", "spec/unit/naive_bayes_spec.rb", "spec/unit/svm_spec.rb"]
  s.homepage = %q{http://github.com/danielsdeleo/basset}
  s.rdoc_options = ["--inline-source", "--charset=UTF-8"]
  s.require_paths = [["lib"]]
  s.rubygems_version = %q{1.3.3}
  s.summary = %q{A library for machine learning and classification}

  if s.respond_to? :specification_version then
    current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
    s.specification_version = 3

    if Gem::Version.new(Gem::RubyGemsVersion) >= Gem::Version.new('1.2.0') then
      s.add_runtime_dependency(%q<stemmer>, [">= 1.0.1"])
      s.add_runtime_dependency(%q<tomz-libsvm-ruby-swig>, [">= 0.3.3"])
      s.add_runtime_dependency(%q<igrigorik-bloomfilter>, [">= 0.1.2"])
    else
      s.add_dependency(%q<stemmer>, [">= 1.0.1"])
      s.add_dependency(%q<tomz-libsvm-ruby-swig>, [">= 0.3.3"])
      s.add_dependency(%q<igrigorik-bloomfilter>, [">= 0.1.2"])
    end
  else
    s.add_dependency(%q<stemmer>, [">= 1.0.1"])
    s.add_dependency(%q<tomz-libsvm-ruby-swig>, [">= 0.3.3"])
    s.add_dependency(%q<igrigorik-bloomfilter>, [">= 0.1.2"])
  end
end
data/examples/example.rb
ADDED
@@ -0,0 +1,25 @@
#!/usr/bin/env ruby
require File.dirname(__FILE__) + '/../lib/basset.rb'

documents = [
  Basset::Document.new("ruby is awesome", :ruby),
  Basset::Document.new("python is good", :python),
  Basset::Document.new("ruby is fun", :ruby),
  Basset::Document.new("python is boring", :python)]

# first add the docs to the feature selector
# The feature selector is tricky. It messes with this kind of toy example since it throws
# out features that don't occur in enough documents.
feature_selector = Basset::FeatureSelector.new
documents.each {|doc| feature_selector.add_document(doc)}

# now create a feature extractor, which expects an array of features on init. This comes
# from the feature selector
feature_extractor = Basset::FeatureExtractor.new(feature_selector.best_features)

# now we're ready to set up the classifier
naive_bayes = Basset::NaiveBayes.new
documents.each {|doc| naive_bayes.add_document(doc.classification, feature_extractor.extract_numbered(doc))}

test_doc = Basset::Document.new("I like ruby")
puts naive_bayes.classify(test_doc.vector_of_features).inspect
data/lib/basset.rb
ADDED
data/lib/basset/classification_evaluator.rb
ADDED
@@ -0,0 +1,170 @@
module Basset
  # Class for running evaluation tests on a classifier, and document
  # representation.
  # Takes the training_documents, which should be an array of objects that can return a vector of features (like Basset::Document)
  # The args hash has two optional keys {:output => true, :folding_amount => 10} where folding_amount is the amount of cross validation.
  class ClassificationEvaluator
    def initialize(training_documents, args = {})
      args[:output] = true unless args.has_key?(:output)
      @output_to_console = args[:output]
      @folding_amount = (args[:folding_amount] or 10)
      @total_documents_trained = 0
      @document_sets = split_documents_into_cross_validation_sets(training_documents, @folding_amount)
    end

    # Classifiers should be an array of basset classifier objects to run cross validation tests on
    def test_with_basset_classifiers(classifiers)
    end

    # Classifiers should be an array of basset classifier objects to run cross validation tests on.
    # chi_value will be passed on to the feature_selector. The default value of 0 will select all features.
    # The block will get called and passed in each training_set and test_set from the document_sets. It should
    # run some external classifier and return the number of documents from the test_set that were correctly classified.
    def compare_against_basset_classifiers(classifiers, chi_value = 0, &block)
      # initialize the results hash
      results = {"External" => {:correct => 0, :total => 0, :time => 0.0}}
      classifiers.each {|classifier| results[classifier.class] = {:correct => 0, :total => 0, :time => 0.0}}

      # run on each doc set
      @document_sets.each_with_index do |document_set, iteration|
        puts "iteration #{iteration + 1} of #{@document_sets.size}" if @output_to_console
        feature_extractor = nil
        feature_extractor_time = time_execution { feature_extractor = create_feature_extractor(document_set[:training_set], chi_value) }
        number_of_test_documents = document_set[:test_set].size

        # do a test run on each classifier
        classifiers.each do |classifier|
          correct = 0
          time = time_execution { correct = test_run(document_set[:training_set], document_set[:test_set], feature_extractor, classifier) } + feature_extractor_time
          results[classifier.class][:time] += time
          results[classifier.class][:correct] += correct
          results[classifier.class][:total] += number_of_test_documents
          output_results(correct, number_of_test_documents, time, classifier.class) if @output_to_console
        end

        # now run the external and gather results
        correct = 0
        time = time_execution { correct = block.call(document_set[:training_set], document_set[:test_set]) }
        results["External"][:time] += time
        results["External"][:correct] += correct
        results["External"][:total] += number_of_test_documents
        output_results(correct, number_of_test_documents, time, "External") if @output_to_console
      end

      puts "\nFinal Results\n---------------------------------------------------------------------------------------" if @output_to_console
      puts "Trained on #{@total_documents_trained} documents on #{@folding_amount} cross validation runs." if @output_to_console
      if @output_to_console
        results.each_pair {|classifier, results_numbers| output_results(results_numbers[:correct], results_numbers[:total], results_numbers[:time], classifier)}
      end

      return results
    end

    # It will then feature select and train on 9 and test on
    # the other. Iterate 10 times using each block as the test set and the others as the
    # training and combine the results.
    def test_with_cross_validation(training_document_names, folding_amount = 10)
      # make sure it's not in some order
      training_document_names.each {|class_documents| class_documents.randomize!}

      # the folding amount determines how big the test set size is. for 10 fold it's 10% and we run 10 times
      total_correct, total_documents = 0, 0

      # there's some tricky code here to make sure that the training and test sets have an equal percentage
      # of docs from each class for each iteration.
      folding_amount.times do |iteration|
        puts "iteration #{iteration + 1} of #{folding_amount}" if @output_to_console
        test_set = []
        training_document_names.each do |class_document_names|
          test_set_size = (class_document_names.size / folding_amount).to_i
          test_set << class_document_names.slice(iteration * test_set_size, test_set_size)
        end
        training_set = []
        training_document_names.each_with_index {|class_document_names, i| training_set += (class_document_names - test_set[i])}
        test_set = test_set.flatten

        correct, total = test_run(training_set, test_set)
        total_correct += correct
        total_documents += total
      end

      output_results(total_correct, total_documents) if @output_to_console
      return [total_correct, total_documents]
    end

    private
    # Splits entire set. The goal here is to test classification accuracy
    # using cross validation. 10 fold is the default. So it will split the training set
    # into 10 equal size chunks.
    # training_documents is actually an array of arrays. each class to be considered
    # has an array of documents.
    def split_documents_into_cross_validation_sets(training_documents, folding_amount = 10)
      document_sets = []
      # make sure it's not in some order
      training_documents.each {|class_documents| class_documents.randomize!}

      # the folding amount determines how big the test set size is. for 10 fold it's 10% and we run 10 times
      # there's some tricky code here to make sure that the training and test sets have an equal percentage
      # of docs from each class for each iteration.
      folding_amount.times do |iteration|
        test_set = []
        training_documents.each do |class_documents|
          test_set_size = (class_documents.size / folding_amount).to_i
          test_set << class_documents.slice(iteration * test_set_size, test_set_size)
        end
        training_set = []
        training_documents.each_with_index {|class_documents, i| training_set += (class_documents - test_set[i])}
        test_set = test_set.flatten
        @total_documents_trained += training_set.size
        document_sets << {:training_set => training_set, :test_set => test_set}
      end
      return document_sets
    end

    # this method returns a feature extractor for the passed in training_set using the chi_value
    def create_feature_extractor(training_set, chi_value)
      feature_selector = FeatureSelector.new
      # select features based on training set
      training_set.each do |document|
        feature_selector.add_document(document)
      end
      if chi_value == 0
        selected_features = feature_selector.all_feature_names
      else
        selected_features = feature_selector.select_features(chi_value)
      end
      puts "selected #{selected_features.size} of #{feature_selector.number_of_features} features for this iteration" if @output_to_console
      return FeatureExtractor.new(selected_features)
    end

    # this is a single run on a training and test set. It will run feature_selection, the feature_extraction, then training, then testing
    def test_run(training_set, testing_set, feature_extractor, classifier)
      puts "training #{classifier.class} on #{training_set.size} documents..." if @output_to_console
      # now train the classifier
      training_set.each do |document|
        classifier.add_document(document.classification, feature_extractor.extract_numbered(document) )
      end

      puts "running #{classifier.class} on #{testing_set.size} documents..." if @output_to_console
      # now classify test set
      number_correctly_classified = 0
      testing_set.each do |document|
        score, predicted_classification = classifier.classify(feature_extractor.extract_numbered(document))
        number_correctly_classified += 1 if document.classification == predicted_classification
      end

      return number_correctly_classified
    end

    def output_results(correct, total, time, classifier_name)
      puts "#{classifier_name} classified #{correct} of #{total} correctly for #{(correct/total.to_f * 100).to_s_decimal_places(2)}% accurcy. Executed run in #{time.to_s_decimal_places(1)} seconds."
    end

    def time_execution(&block)
      start_time = Time.now
      yield
      end_time = Time.now
      return end_time - start_time
    end
  end
end
data/lib/basset/classifier.rb
ADDED
@@ -0,0 +1,188 @@
require "basset/yaml_serialization"

module Basset

  #
  # Classifier wraps up all of the operations spread between Document and friends,
  # FeatureExtractor, FeatureSelector, and specific classifiers such as
  # NaiveBayes into one convenient interface.
  #
  class Classifier
    include YamlSerialization

    DEFAULTS = {:type => "naive_bayes", :doctype => "document"}

    attr_reader :engine, :doctype

    #
    # Create a new classifier object. You can specify the type of classifier
    # and kind of documents with the options. The defaults are
    # :type => :naive_bayes, :doctype => :document; There is also a uri_document, ie.
    # opts: {:type => :naive_bayes, :doctype => :uri_document }
    def initialize(opts={})
      @engine = constanize_opt(opts[:type] || DEFAULTS[:type]).new
      @doctype = constanize_opt(opts[:doctype] || DEFAULTS[:doctype])
    end

    #
    # Trains the classifier with _texts_ of class _classification_.
    # _texts_ gets flattened, so you can pass in an array without breaking
    # anything.
    def train(classification, *texts)
      texts.flatten.each do |text|
        train_with_features(classification, features_of(text, classification))
      end
    end

    #
    # Trains the classifier on a text repeatedly until the classifier recognizes
    # it as being in class _classification_ (up to a maximum of 5 retrainings).
    # Handy for training the classifier quickly or when it has been mistrained.
    def train_iterative(classification, text)
      (1 .. 5).each do |i|
        train(classification, text)
        break if classify(text) == classification
      end
    end

    #
    # Classifies _text_ based on training
    def classify(text)
      classify_features(features_of(text)).last
    end

    #
    # Gives a numeric value for the similarity of _text_ to previously seen
    # texts of class _classification_. For a Naive Bayes filter, this will
    # be the log10 of the probabilities of each token in _text_ occuring in
    # a text of class _classification_, normalized for the number of tokens.
    def similarity_score(classification, text)
      similarity_score_for_features(classification, features_of(text))
    end

    def ==(other)
      other.is_a?(self.class) && other.engine == engine && other.doctype == doctype
    end

    private

    def train_with_features(classification, features)
      @engine.add_document(classification, features)
    end

    def classify_features(features)
      @engine.classify(features)
    end

    def similarity_score_for_features(classification, features)
      @engine.probability_of_vectors_for_class(features, classification, :normalize => true)
    end

    def features_of(text, classification=nil)
      @doctype.new(text, classification).feature_vectors
    end

    # poor man's version of Rails' String#classify.constantize
    def constanize_opt(option)
      class_name = option.to_s.split('_').map { |word| word.capitalize }.join('')
      Basset.class_eval class_name
    end

  end

  #
  # A class for anomaly detection.
  #
  # The purpose of this is to enable a statistical machine learning approach
  # even when you can't/don't want to assume that "abnormal" documents will
  # have certain features or fit nicely into classes.
  #
  # An example use case is an anomaly based IDS where you don't want to classify
  # different kinds of attacks but instead want to find all events that deviate
  # from an established baseline.
  #
  # With the default NaiveBayes classification method, uses the log10 of the
  # Bayesian probability of a document belonging to the normal behavior group
  # as a distance measurement; any document with a distance measurement higher
  # than a given threshold is considered anomalous.
  class AnomalyDetector < Classifier
    include YamlSerialization

    def initialize(opts={})
      @training_features=[]
      @updated = true
      super(opts)
    end

    def classify(text)
      anomalous?(text) ? :anomalous : :normal
    end

    def anomalous?(text)
      minimum_acceptable_score > similarity_score(text)
    end

    def normal?(text)
      !anomalous?(text)
    end

    def train(*texts)
      texts.flatten.each do |text|
        features = features_of(text)
        @training_features << features
        train_with_features(:normal, features)
      end
      reset_memoized_values
    end

    def similarity_score(text)
      super(:normal, text)
    end

    # Gives the number of standard deviations from average
    def anomaly_score(text)
      -1 * similarity_score(text) / stddev_of_scores_of_training_set
    end

    def scores_for_training_set
      unless @scores_for_training_set
        @scores_for_training_set = @training_features.map { |feature_set| similarity_score_for_features(:normal, feature_set)}
        stddev_of_scores_of_training_set
      end
      @scores_for_training_set
    end

    def avg_score_of_training_set
      scores_for_training_set.inject(0) { |sum, score| sum += score } / scores_for_training_set.length.to_f
    end

    def score_range_of_training_set
      scores_for_training_set.min .. scores_for_training_set.max
    end

    def stddev_of_scores_of_training_set
      unless @stddev_of_scores_of_training_set
        @stddev_of_scores_of_training_set = Math.stddev(scores_for_training_set)
      end
      @stddev_of_scores_of_training_set
    end

    def minimum_acceptable_score
      avg_score_of_training_set - (4 * stddev_of_scores_of_training_set)
    end

    def train_iterative(text)
      (1 .. 5).each do
        train(text)
        break if normal?(text)
      end
    end

    def reset_memoized_values
      @memoized_vals_stale = true
      @stddev_of_scores_of_training_set = nil
      @scores_for_training_set = nil
    end

  end
end
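
As a rough illustration of the AnomalyDetector defined above: it is trained only on examples of normal behavior and then asked whether new texts fall outside that baseline. The log lines and outcomes below are made up, and with a training set this small the mean-minus-four-standard-deviations threshold will not necessarily behave this neatly:

  detector = Basset::AnomalyDetector.new   # naive_bayes engine and Document doctype by default
  detector.train("GET /index.html 200", "GET /about.html 200", "GET /feed.rss 200")

  detector.classify("GET /archive.html 200")     # => :normal or :anomalous
  detector.anomalous?("DROP TABLE users;--")     # true when the similarity score falls below the threshold
  detector.anomaly_score("DROP TABLE users;--")  # standard deviations from the training-set average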