basset 1.0.1 → 2.0.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,48 @@
1
+ h1. Basset
2
+
3
+ "http://github.com/pauldix/basset":http://github.com/pauldix/basset
4
+
5
+ h2. Summary
6
+
7
+ A wonderful hound that finds patterns in your data using machine learning.
8
+
9
+ h2. Description
10
+
11
+ This library is under construction. I decided to reboot it from its former version. If for some reason you're still using the old version it can still be found at "http://github.com/pauldix/basset/tree/1.0.1":http://github.com/pauldix/basset/tree/1.0.1
12
+
13
+ h2. Installation
14
+
15
+ <pre>
16
+ gem install basset --source http://gemcutter.org
17
+ </pre>
18
+
19
+ h2. Use
20
+
21
+ awesomeness goes here
22
+
23
+ h2. LICENSE
24
+
25
+ (The MIT License)
26
+
27
+ Copyright (c) 2009:
28
+
29
+ "Paul Dix":http://pauldix.net
30
+
31
+ Permission is hereby granted, free of charge, to any person obtaining
32
+ a copy of this software and associated documentation files (the
33
+ 'Software'), to deal in the Software without restriction, including
34
+ without limitation the rights to use, copy, modify, merge, publish,
35
+ distribute, sublicense, and/or sell copies of the Software, and to
36
+ permit persons to whom the Software is furnished to do so, subject to
37
+ the following conditions:
38
+
39
+ The above copyright notice and this permission notice shall be
40
+ included in all copies or substantial portions of the Software.
41
+
42
+ THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND,
43
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
44
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
45
+ IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
46
+ CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
47
+ TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
48
+ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -1,7 +1,11 @@
1
- Dir[File.join(File.dirname(__FILE__), "basset", "*.rb")].each do |file|
2
- require file
3
- end
1
+ $LOAD_PATH.unshift(File.dirname(__FILE__)) unless $LOAD_PATH.include?(File.dirname(__FILE__))
2
+
3
+ module Basset; end;
4
+
5
+ require 'basset/parser'
6
+ require 'basset/feature_collection'
7
+ require 'basset/vector_collection'
4
8
 
5
9
  module Basset
6
- VERSION = "1.0.1"
10
+ VERSION = "2.0.1"
7
11
  end
@@ -0,0 +1,19 @@
1
+ class Basset::Parser
2
+ def self.parse(text, options = {})
3
+ unigrams = clean_text(text).split
4
+
5
+ ngrams = (options[:ngrams] || 1)
6
+ (unigrams + (2..ngrams).map {|n| ngrams(unigrams, n)}).flatten
7
+ end
8
+
9
+ def self.ngrams(unigrams, n)
10
+ grams = []
11
+ unigrams.each_cons(n) {|a| grams << a.join("_")}
12
+ grams
13
+ end
14
+
15
+ def self.clean_text(text)
16
+ #text.tr(',"#$%^&*()_=+[]{}\|<>/`~\—', " ") .tr("@'\-\'\”\‘\’0123456789", "")
17
+ text.gsub(/\W/, ' ').gsub(/\d/, ' ').tr('_', ' ').downcase
18
+ end
19
+ end
@@ -0,0 +1,19 @@
1
+ require File.dirname(__FILE__) + '/../spec_helper'
2
+
3
+ describe "parsing" do
4
+ it "should parse out punctuation" do
5
+ Basset::Parser.parse("hello! there").should == %w[hello there]
6
+ end
7
+
8
+ it "should parse out numbers" do
9
+ Basset::Parser.parse("this 234 number3").should == %w[this number]
10
+ end
11
+
12
+ it "should optionally return bigrams" do
13
+ Basset::Parser.parse("hi there paul", :ngrams => 2).should == %w[hi there paul hi_there there_paul]
14
+ end
15
+
16
+ it "should downcase everything" do
17
+ Basset::Parser.parse("HelLo").should == %w[hello]
18
+ end
19
+ end
@@ -0,0 +1,2 @@
1
+ --diff
2
+ --color
@@ -1,4 +1,10 @@
1
- require 'spec'
2
- require File.join(File.dirname(__FILE__), "..", "lib", "basset")
1
+ require "rubygems"
2
+ require "spec"
3
3
 
4
- include Basset
4
+ # gem install redgreen for colored test output
5
+ begin require "redgreen" unless ENV['TM_CURRENT_LINE']; rescue LoadError; end
6
+
7
+ path = File.expand_path(File.dirname(__FILE__) + "/../lib/")
8
+ $LOAD_PATH.unshift(path) unless $LOAD_PATH.include?(path)
9
+
10
+ require "lib/basset"
metadata CHANGED
@@ -1,88 +1,60 @@
1
1
  --- !ruby/object:Gem::Specification
2
- rubygems_version: 0.9.4
3
- specification_version: 1
4
2
  name: basset
5
3
  version: !ruby/object:Gem::Version
6
- version: 1.0.1
7
- date: 2008-01-08 00:00:00 +00:00
8
- summary: A library for running machine learning algorithms for classification, feature selection and evaluation
9
- require_paths:
10
- - lib
11
- email: paul@pauldix.net
12
- homepage: http://basset.rubyforge.org/
13
- rubyforge_project: basset
14
- description: "=What You Could Use This For Just in case you don't have a clue what machine learning or classification is, here's a quick example scenario and an explanation of the process. The most popular task is spam identification. To do this you'll first need a set of training documents. This would consist of a number of documents which you have labeled as either spam or not. With training sets, bigger is better. You should probably have at least 100 of each type (spam and not spam). Really 1,000 of each type would be better and 10,000 of each would be super sweet. Once you have the training set the process with this library flows like this: * Create each as a Document (a class in this library) * Pass those documents into the FeatureSelector * Get the best features and pass those into the FeatureExtractor * Now extract features from each document using the extractor and * Pass those extracted features to NaiveBayes as part of the training set * Now you can save the FeatureExtractor and NaiveBayes to a file That represents the process of selecting features and training the classifier. Once you've done that you can predict if a new previously unseen document is spam or not by just doing the following: * Load the feature extractor and naive bayes from their files * Create a new document object from your new unseen document * Extract the features from that document using the feature extractor and * Pass those to the classify method of the naive bayes classifier"
15
- autorequire:
16
- default_executable:
17
- bindir: bin
18
- has_rdoc: true
19
- required_ruby_version: !ruby/object:Gem::Version::Requirement
20
- requirements:
21
- - - ">"
22
- - !ruby/object:Gem::Version
23
- version: 0.0.0
24
- version:
4
+ version: 2.0.1
25
5
  platform: ruby
26
- signing_key:
27
- cert_chain:
28
- post_install_message:
29
6
  authors:
30
7
  - Paul Dix
31
- - Bryan Helmkamp
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+
12
+ date: 2009-09-27 00:00:00 -04:00
13
+ default_executable:
14
+ dependencies: []
15
+
16
+ description:
17
+ email: paul@pauldix.net
18
+ executables: []
19
+
20
+ extensions: []
21
+
22
+ extra_rdoc_files: []
23
+
32
24
  files:
33
- - History.txt
34
- - License.txt
35
- - Manifest.txt
36
- - README.txt
37
- - Rakefile
38
25
  - lib/basset.rb
39
- - lib/basset/classification_evaluator.rb
40
- - lib/basset/core_extensions.rb
41
- - lib/basset/document.rb
42
- - lib/basset/document_override_example.rb
43
- - lib/basset/feature.rb
44
- - lib/basset/feature_extractor.rb
45
- - lib/basset/feature_selector.rb
46
- - lib/basset/naive_bayes.rb
47
- - lib/basset/yaml_serialization.rb
26
+ - lib/basset/parser.rb
27
+ - README.textile
48
28
  - spec/spec.opts
49
29
  - spec/spec_helper.rb
50
- - spec/unit/document_spec.rb
51
- - spec/unit/feature_extractor_spec.rb
52
- - spec/unit/feature_selector_spec.rb
53
- - spec/unit/feature_spec.rb
54
- test_files: []
55
-
56
- rdoc_options:
57
- - --main
58
- - README.txt
59
- extra_rdoc_files:
60
- - History.txt
61
- - License.txt
62
- - Manifest.txt
63
- - README.txt
64
- executables: []
30
+ - spec/basset/parser_spec.rb
31
+ has_rdoc: true
32
+ homepage: http://github.com/pauldix/basset
33
+ licenses: []
65
34
 
66
- extensions: []
35
+ post_install_message:
36
+ rdoc_options: []
67
37
 
38
+ require_paths:
39
+ - lib
40
+ required_ruby_version: !ruby/object:Gem::Requirement
41
+ requirements:
42
+ - - ">="
43
+ - !ruby/object:Gem::Version
44
+ version: "0"
45
+ version:
46
+ required_rubygems_version: !ruby/object:Gem::Requirement
47
+ requirements:
48
+ - - ">="
49
+ - !ruby/object:Gem::Version
50
+ version: "0"
51
+ version:
68
52
  requirements: []
69
53
 
70
- dependencies:
71
- - !ruby/object:Gem::Dependency
72
- name: stemmer
73
- version_requirement:
74
- version_requirements: !ruby/object:Gem::Version::Requirement
75
- requirements:
76
- - - ">="
77
- - !ruby/object:Gem::Version
78
- version: 1.0.1
79
- version:
80
- - !ruby/object:Gem::Dependency
81
- name: hoe
82
- version_requirement:
83
- version_requirements: !ruby/object:Gem::Version::Requirement
84
- requirements:
85
- - - ">="
86
- - !ruby/object:Gem::Version
87
- version: 1.4.0
88
- version:
54
+ rubyforge_project:
55
+ rubygems_version: 1.3.5
56
+ signing_key:
57
+ specification_version: 2
58
+ summary: A wonderful hound that finds patterns in your data using machine learning.
59
+ test_files: []
60
+
@@ -1,7 +0,0 @@
1
- == 1.0.0 / 2008-01-08
2
-
3
- * Initial release
4
-
5
- == 1.0.1 / 2008-01-08
6
-
7
- * Updated release with new code actually checked into rubyforge.
@@ -1,20 +0,0 @@
1
- Copyright (c) 2007 Paul Dix
2
-
3
- Permission is hereby granted, free of charge, to any person obtaining
4
- a copy of this software and associated documentation files (the
5
- "Software"), to deal in the Software without restriction, including
6
- without limitation the rights to use, copy, modify, merge, publish,
7
- distribute, sublicense, and/or sell copies of the Software, and to
8
- permit persons to whom the Software is furnished to do so, subject to
9
- the following conditions:
10
-
11
- The above copyright notice and this permission notice shall be
12
- included in all copies or substantial portions of the Software.
13
-
14
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
- EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
- MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
- NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
- LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
- OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
- WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -1,21 +0,0 @@
1
- History.txt
2
- License.txt
3
- Manifest.txt
4
- README.txt
5
- Rakefile
6
- lib/basset.rb
7
- lib/basset/classification_evaluator.rb
8
- lib/basset/core_extensions.rb
9
- lib/basset/document.rb
10
- lib/basset/document_override_example.rb
11
- lib/basset/feature.rb
12
- lib/basset/feature_extractor.rb
13
- lib/basset/feature_selector.rb
14
- lib/basset/naive_bayes.rb
15
- lib/basset/yaml_serialization.rb
16
- spec/spec.opts
17
- spec/spec_helper.rb
18
- spec/unit/document_spec.rb
19
- spec/unit/feature_extractor_spec.rb
20
- spec/unit/feature_selector_spec.rb
21
- spec/unit/feature_spec.rb
data/README.txt DELETED
@@ -1,31 +0,0 @@
1
- Author:: Paul Dix (mailto:paul@pauldix.net)
2
-
3
- =Summary
4
- This is a library for running machine learning tasks.
5
- These include a generic document representation class, a feature selector, a feature extractor, a naive bayes classifier, and a classification evaluator for running tests. The goal was to create a general framework that would be easy to modify for specific problems. I also tried to design the system to be extensible so I could add more classification and clustering algorithms as I get deeper into my studies on machine learning.
6
-
7
- =What You Could Use This For
8
- Just in case you don't have a clue what machine learning or classification is, here's a quick example scenario and an explanation of the process. The most popular task is spam identification. To do this you'll first need a set of training documents. This would consist of a number of documents which you have labeled as either spam or not. With training sets, bigger is better. You should probably have at least 100 of each type (spam and not spam). Really 1,000 of each type would be better and 10,000 of each would be super sweet. Once you have the training set the process with this library flows like this:
9
-
10
- * Create each as a Document (a class in this library)
11
- * Pass those documents into the FeatureSelector
12
- * Get the best features and pass those into the FeatureExtractor
13
- * Now extract features from each document using the extractor and
14
- * Pass those extracted features to NaiveBayes as part of the training set
15
- * Now you can save the FeatureExtractor and NaiveBayes to a file
16
-
17
- That represents the process of selecting features and training the classifier. Once you've done that you can predict if a new previously unseen document is spam or not by just doing the following:
18
-
19
- * Load the feature extractor and naive bayes from their files
20
- * Create a new document object from your new unseen document
21
- * Extract the features from that document using the feature extractor and
22
- * Pass those to the classify method of the naive bayes classifier
23
-
24
- Something that you'll probably want to do before doing real classification is to test things. Use the ClassificationEvaluator for this. Using the evaluator you can pass your training documents in and have it run through a series of tests to give you an estimate of how successful the classifier will be at predicting unseen documents. Easy classification tasks will generally be > 90% accurate while others can be much harder. Each classification task is different and most of the time you won't know until you actually test it out.
25
-
26
- =Contact
27
- I love machine learning and classification so if you have a problem that is giving you trouble don't hesitate to get a hold of me. The same applies for anyone who wants to write additional classifiers, better document representations, or just to tell my my code is amateur.
28
-
29
- Author:: Paul Dix (mailto:paul@pauldix.net)
30
- Site:: http://www.pauldix.net
31
- Freenode:: pauldix in #nyc.rb
data/Rakefile DELETED
@@ -1,30 +0,0 @@
1
- # -*- ruby -*-
2
-
3
- require 'rubygems'
4
- require 'hoe'
5
- require 'spec/rake/spectask'
6
- require './lib/basset.rb'
7
-
8
- Hoe.new('basset', Basset::VERSION) do |p|
9
- p.summary = 'A library for running machine learning algorithms for classification, feature selection and evaluation'
10
- p.url = 'http://basset.rubyforge.org/'
11
-
12
- p.author = ['Paul Dix', 'Bryan Helmkamp']
13
- p.email = 'paul@pauldix.net'
14
-
15
- p.description = p.paragraphs_of('README.txt', 2..5).join("\n\n")
16
- p.changes = p.paragraphs_of('History.txt', 0..1).join("\n\n")
17
- p.remote_rdoc_dir = '' # Release to root
18
- p.extra_deps << ['stemmer', '>= 1.0.1']
19
- end
20
-
21
- desc "Run all of the specs"
22
- Spec::Rake::SpecTask.new do |t|
23
- t.spec_opts = ['--options', "\"spec/spec.opts\""]
24
- end
25
-
26
- desc "Run all spec with RCov"
27
- Spec::Rake::SpecTask.new(:coverage) do |t|
28
- t.rcov = true
29
- t.rcov_opts = ['--exclude', 'spec']
30
- end
@@ -1,170 +0,0 @@
1
- module Basset
2
- # Class for running evaluation tests on a classifier, and document
3
- # representation.
4
- # Takes the training_documents, which should be an array of objects that can return a vector of features (like Basset::Document)
5
- # The args hash has two optional keys {:output => true, :folding_amount => 10} where folding_amount is the amount of cross validation.
6
- class ClassificationEvaluator
7
- def initialize(training_documents, args = {})
8
- args[:output] = true unless args.has_key?(:output)
9
- @output_to_console = args[:output]
10
- @folding_amount = (args[:folding_amount] or 10)
11
- @total_documents_trained = 0
12
- @document_sets = split_documents_into_cross_validation_sets(training_documents, @folding_amount)
13
- end
14
-
15
- # Classifiers should be an array of basset classifier objects to run cross validation tests on
16
- def test_with_basset_classifiers(classifiers)
17
- end
18
-
19
- # Classifiers should be an array of basset classifier objects to run cross validation tests on.
20
- # chi_value will be passed on to the feature_selector. The default value of 0 will select all features.
21
- # The block will get called and passed in each training_set and test_set from the document_sets. It should
22
- # run some external classifier and return the number of documents from the test_set that were correctly classified.
23
- def compare_against_basset_classifiers(classifiers, chi_value = 0, &block)
24
- # initialize the results hash
25
- results = {"External" => {:correct => 0, :total => 0, :time => 0.0}}
26
- classifiers.each {|classifier| results[classifier.class] = {:correct => 0, :total => 0, :time => 0.0}}
27
-
28
- # run on each doc set
29
- @document_sets.each_with_index do |document_set, iteration|
30
- puts "iteration #{iteration + 1} of #{@document_sets.size}" if @output_to_console
31
- feature_extractor = nil
32
- feature_extractor_time = time_execution { feature_extractor = create_feature_extractor(document_set[:training_set], chi_value) }
33
- number_of_test_documents = document_set[:test_set].size
34
-
35
- # do a test run on each classifier
36
- classifiers.each do |classifier|
37
- correct = 0
38
- time = time_execution { correct = test_run(document_set[:training_set], document_set[:test_set], feature_extractor, classifier) } + feature_extractor_time
39
- results[classifier.class][:time] += time
40
- results[classifier.class][:correct] += correct
41
- results[classifier.class][:total] += number_of_test_documents
42
- output_results(correct, number_of_test_documents, time, classifier.class) if @output_to_console
43
- end
44
-
45
- # now run the external and gather results
46
- correct = 0
47
- time = time_execution { correct = block.call(document_set[:training_set], document_set[:test_set]) }
48
- results["External"][:time] += time
49
- results["External"][:correct] += correct
50
- results["External"][:total] += number_of_test_documents
51
- output_results(correct, number_of_test_documents, time, "External") if @output_to_console
52
- end
53
-
54
- puts "\nFinal Results\n---------------------------------------------------------------------------------------" if @output_to_console
55
- puts "Trained on #{@total_documents_trained} documents on #{@folding_amount} cross validation runs." if @output_to_console
56
- if @output_to_console
57
- results.each_pair {|classifier, results_numbers| output_results(results_numbers[:correct], results_numbers[:total], results_numbers[:time], classifier)}
58
- end
59
-
60
- return results
61
- end
62
-
63
- # It will then feature select and train on 9 and test on
64
- # the other. Iterate 10 times using each block as the test set and the others as the
65
- # training and combine the results.
66
- def test_with_cross_validation(training_document_names, folding_amount = 10)
67
- # make sure it's not in some order
68
- training_document_names.each {|class_documents| class_documents.randomize!}
69
-
70
- # the folding amount determines how big the test set size is. for 10 fold it's 10% and we run 10 times
71
- total_correct, total_documents = 0, 0
72
-
73
- # there's some tricky code here to make sure that the training and test sets have an equal percentage
74
- # of docs from each class for each iteration.
75
- folding_amount.times do |iteration|
76
- puts "iteration #{iteration + 1} of #{folding_amount}" if @output_to_console
77
- test_set = []
78
- training_document_names.each do |class_document_names|
79
- test_set_size = (class_document_names.size / folding_amount).to_i
80
- test_set << class_document_names.slice(iteration * test_set_size, test_set_size)
81
- end
82
- training_set = []
83
- training_document_names.each_with_index {|class_document_names, i| training_set += (class_document_names - test_set[i])}
84
- test_set = test_set.flatten
85
-
86
- correct, total = test_run(training_set, test_set)
87
- total_correct += correct
88
- total_documents += total
89
- end
90
-
91
- output_results(total_correct, total_documents) if @output_to_console
92
- return [total_correct, total_documents]
93
- end
94
-
95
- private
96
- # Splits entire set. The goal here is to test classification accuracy
97
- # using cross validation. 10 fold is the default. So it will split the training set
98
- # into 10 equal size chunks.
99
- # training_documents is actually an array of arrays. each class to be considered
100
- # has an array of documents.
101
- def split_documents_into_cross_validation_sets(training_documents, folding_amount = 10)
102
- document_sets = []
103
- # make sure it's not in some order
104
- training_documents.each {|class_documents| class_documents.randomize!}
105
-
106
- # the folding amount determines how big the test set size is. for 10 fold it's 10% and we run 10 times
107
- # there's some tricky code here to make sure that the training and test sets have an equal percentage
108
- # of docs from each class for each iteration.
109
- folding_amount.times do |iteration|
110
- test_set = []
111
- training_documents.each do |class_documents|
112
- test_set_size = (class_documents.size / folding_amount).to_i
113
- test_set << class_documents.slice(iteration * test_set_size, test_set_size)
114
- end
115
- training_set = []
116
- training_documents.each_with_index {|class_documents, i| training_set += (class_documents - test_set[i])}
117
- test_set = test_set.flatten
118
- @total_documents_trained += training_set.size
119
- document_sets << {:training_set => training_set, :test_set => test_set}
120
- end
121
- return document_sets
122
- end
123
-
124
- # this method returns a feature extractor for the passed in training_set using the chi_value
125
- def create_feature_extractor(training_set, chi_value)
126
- feature_selector = FeatureSelector.new
127
- # select features based on training set
128
- training_set.each do |document|
129
- feature_selector.add_document(document)
130
- end
131
- if chi_value == 0
132
- selected_features = feature_selector.all_feature_names
133
- else
134
- selected_features = feature_selector.select_features(chi_value)
135
- end
136
- puts "selected #{selected_features.size} of #{feature_selector.number_of_features} features for this iteration" if @output_to_console
137
- return FeatureExtractor.new(selected_features)
138
- end
139
-
140
- # this is a single run on a training and test set. It will run feature_selection, the feature_extraction, then training, then testing
141
- def test_run(training_set, testing_set, feature_extractor, classifier)
142
- puts "training #{classifier.class} on #{training_set.size} documents..." if @output_to_console
143
- # now train the classifier
144
- training_set.each do |document|
145
- classifier.add_document(document.classification, feature_extractor.extract_numbered(document) )
146
- end
147
-
148
- puts "running #{classifier.class} on #{testing_set.size} documents..." if @output_to_console
149
- # now classify test set
150
- number_correctly_classified = 0
151
- testing_set.each do |document|
152
- score, predicted_classification = classifier.classify(feature_extractor.extract_numbered(document))
153
- number_correctly_classified += 1 if document.classification == predicted_classification
154
- end
155
-
156
- return number_correctly_classified
157
- end
158
-
159
- def output_results(correct, total, time, classifier_name)
160
- puts "#{classifier_name} classified #{correct} of #{total} correctly for #{(correct/total.to_f * 100).to_s_decimal_places(2)}% accurcy. Executed run in #{time.to_s_decimal_places(1)} seconds."
161
- end
162
-
163
- def time_execution(&block)
164
- start_time = Time.now
165
- yield
166
- end_time = Time.now
167
- return end_time - start_time
168
- end
169
- end
170
- end
@@ -1,56 +0,0 @@
1
- # This file contains extensions to built in Ruby classes.
2
-
3
- require 'rubygems'
4
- require 'stemmer'
5
-
6
- # Extensions to the array class.
7
- class Array
8
- # Returns a new array that contains everything except the first element of this one. (just like in lisp)
9
- def rest
10
- self.slice(1, size)
11
- end
12
-
13
- # Returns the second item in the array
14
- def second
15
- self[1]
16
- end
17
-
18
- # Returns a random item from the array
19
- def pick_random
20
- self[rand(self.size)]
21
- end
22
-
23
- # Returns a randomized array
24
- def randomize
25
- self.sort_by { rand }
26
- end
27
-
28
- def sum
29
- inject(0) { |sum, val| sum + val }
30
- end
31
-
32
- # Randomizes array in place
33
- def randomize!
34
- self.replace(self.randomize)
35
- end
36
- end
37
-
38
- class Float
39
- def to_s_decimal_places(decimal_places)
40
- pattern = "[0-9]*\."
41
- decimal_places.times { pattern << "[0-9]"}
42
- return self.to_s.match(pattern)[0]
43
- end
44
- end
45
-
46
- class Symbol
47
- def to_proc
48
- proc { |obj, *args| obj.send(self, *args) }
49
- end
50
- end
51
-
52
- # Extensions to the string class.
53
- # We're just including the stemmable module into string. This adds the .stem method.
54
- class String
55
- include Stemmable
56
- end
@@ -1,51 +0,0 @@
1
- module Basset
2
-
3
- # A class for representing a document as a vector of features. It takes the text
4
- # of the document and the classification. The vector of features representation is
5
- # just a basic bag of words approach.
6
- class Document
7
- attr_reader :text, :classification
8
-
9
- def initialize(text, classification = nil)
10
- @text = text
11
- @classification = classification
12
- end
13
-
14
- def vector_of_features
15
- @feature_vector ||= vector_of_features_from_terms_hash( terms_hash_from_words_array( stemmed_words ) )
16
- end
17
-
18
- private
19
-
20
- # returns a hash with each word as a key and the value is the number of times
21
- # the word appears in the passed in words array
22
- def terms_hash_from_words_array(words)
23
- terms = Hash.new(0)
24
- stemmed_words.each do |term|
25
- terms[term] += 1
26
- end
27
- return terms
28
- end
29
-
30
- def vector_of_features_from_terms_hash(terms)
31
- terms.collect do |term, frequency|
32
- Feature.new(term, frequency)
33
- end
34
- end
35
-
36
- def stemmed_words
37
- words.collect(&:stem)
38
- end
39
-
40
- def words
41
- clean_text.split(" ")
42
- end
43
-
44
- # Remove punctuation, numbers and symbols
45
- def clean_text
46
- text.tr("'@_", '').gsub(/\W/, ' ').gsub(/[0-9]/, '')
47
- # text.tr( ',?.!;:"#$%^&*()_=+[]{}\|<>/`~', " " ) .tr( "@'\-", "")
48
- end
49
-
50
- end
51
- end
@@ -1,11 +0,0 @@
1
- module Basset
2
- # This class is an example for how to do custom document representations. In this
3
- # example, I change the way text is cleaned and don't stem the words. It would also
4
- # be easy to put in additional hard coded features.
5
- # The important thing to note is that the new document class only needs one function: vector_of_features
6
- class DocumentOverrideExample < Document
7
- def vector_of_features
8
- @vector_of_features ||= vector_of_features_from_terms_hash( terms_hash_from_words_array( text.gsub(/\W/, ' ').split(' ') ) )
9
- end
10
- end
11
- end
@@ -1,26 +0,0 @@
1
- module Basset
2
-
3
- # A class to hold a feature which consists of a name and a value. In the basic sense
4
- # of document classification the name would be the word and the value would be the
5
- # number of times that word appeared in the document.
6
- class Feature
7
- attr_accessor :name, :value
8
-
9
- def initialize(name, value = 0)
10
- @name = name
11
- @value = value
12
- end
13
-
14
- def <=>(other)
15
- ret = self.name <=> other.name
16
- ret = self.value <=> other.value if ret.zero?
17
- ret
18
- end
19
-
20
- def ==(other)
21
- ret = self.name == other.name
22
- ret = self.value == other.value if ret
23
- ret
24
- end
25
- end
26
- end
@@ -1,52 +0,0 @@
1
- require File.join(File.dirname(__FILE__), "yaml_serialization")
2
-
3
- module Basset
4
-
5
- # Extracts features from a document. On initialization it expects the set of features that
6
- # are to be extracted from documents. The extracted features will just be numbered in
7
- # ascending order. This makes it easy to output feature sets for libraries like svmlight.
8
- class FeatureExtractor
9
- include YamlSerialization
10
-
11
- # the constructor takes an array of feature names. These are the features that will be
12
- # extracted from documents. All others will be ignored.
13
- def initialize(feature_names)
14
- @feature_names = {}
15
- feature_names.each_with_index {|feature_name, index| @feature_names[feature_name] = index + 1}
16
- end
17
-
18
- def number_of_features
19
- @feature_names.size
20
- end
21
-
22
- # returns an array of features, but with their names replaced with an integer identifier.
23
- # They should be sorted in ascending identifier order. This is a generic representation that works
24
- # well with other machine learning packages like svm_light.
25
- def extract_numbered(document)
26
- numbered_features = extract(document).collect do |feature|
27
- Feature.new(@feature_names[feature.name], feature.value)
28
- end
29
- numbered_features.sort
30
- end
31
-
32
- # just returns the features from the document that the extractor is interested in
33
- def extract(document)
34
- document.vector_of_features.find_all do |feature|
35
- @feature_names[feature.name]
36
- end
37
- end
38
-
39
- # def extract_with_duplicate_removal(document)
40
- # features = extract(document)
41
- # # # now remove the unigrams that dupe bigram features
42
- # # # first grab an array of the bigram ones
43
- # # bigram_features = []
44
- # # sorted_features.each {|feature| bigram_features << feature if feature.name =~ /.*_AND_.*/}
45
- # # # now remove all the ones that have a match in the bigram features
46
- # # sorted_features.each_with_index do |feature, index|
47
- # # sorted_features.delete_at(index) if (feature.name !~ /_AND_/ and bigram_features.detect {|bf| bf.name =~ /^#{feature.name}_|_#{feature.name}$/})
48
- # # end
49
- # end
50
-
51
- end
52
- end
@@ -1,126 +0,0 @@
1
module Basset

  # The feature selector. Every document in the training set should be added
  # to the selector; afterwards features can be chosen by chi-square value.
  # When in doubt just call select_features with its defaults — it returns all
  # features that have at least some statistical significance and occur in
  # more than one document.
  class FeatureSelector
    attr_reader :docs

    def initialize
      @docs = 0
      @docs_in_class = Hash.new(0)
      # Lazily creates a FeatureValues record the first time a name is seen.
      @features = Hash.new { |lookup, name| lookup[name] = FeatureValues.new }
    end

    # Adds a document to the feature selector. The document should respond to
    # #vector_of_features (returning a vector of unique features) and
    # #classification.
    def add_document(document)
      @docs += 1
      @docs_in_class[document.classification] += 1

      document.vector_of_features.each do |feature|
        @features[feature.name].add_document_with_class(document.classification)
      end
    end

    # Every feature name seen so far, regardless of chi-square or frequency.
    def all_feature_names
      @features.keys
    end

    def number_of_features
      @features.size
    end

    # Returns an array of the +count+ best features for a given classification.
    def best_features(count = 10, classification = nil)
      select_features(1.0, classification).take(count)
    end

    # Pairs each feature name with its chi-square value for +classification+.
    def features_with_chi(classification)
      @features.keys.map do |feature_name|
        Feature.new(feature_name, chi_squared(feature_name, classification))
      end
    end

    # Names of features whose chi-square meets +chi_value+ and which occur in
    # more than one document, sorted by chi-square descending.
    def select_features(chi_value = 1.0, classification = nil)
      classification ||= @docs_in_class.keys.first

      significant = features_with_chi(classification).select do |feature|
        docs_with_feature(feature.name) > 1 && feature.value >= chi_value
      end

      significant.sort_by(&:value).reverse.map(&:name)
    end

    private

    def docs_with_feature_and_class(feature_name, classification)
      @features[feature_name].docs_with_class(classification)
    end

    def docs_with_feature_and_not_class(feature_name, classification)
      @features[feature_name].docs_with_feature - docs_with_feature_and_class(feature_name, classification)
    end

    def docs_with_class_and_not_feature(classification, feature_name)
      @docs_in_class[classification] - docs_with_feature_and_class(feature_name, classification)
    end

    def docs_without_feature_or_class(feature_name, classification)
      @docs - @docs_in_class[classification] - docs_with_feature_and_not_class(feature_name, classification)
    end

    def docs_with_feature(feature_name)
      @features[feature_name].docs_with_feature
    end

    def docs_with_class(classification)
      @docs_in_class[classification]
    end

    # Chi-square of +feature_name+ against +classification+. This is formula
    # 13.14 on page 215 of "An Introduction to Information Retrieval" by
    # Christopher D. Manning, Prabhakar Raghavan and Hinrich Schütze.
    def chi_squared(feature_name, classification)
      chi_squared_algo(
        docs_with_feature_and_class(feature_name, classification),
        docs_with_class_and_not_feature(classification, feature_name),
        docs_with_feature_and_not_class(feature_name, classification),
        docs_without_feature_or_class(feature_name, classification)
      )
    end

    # o11: feature & class, o10: class without feature,
    # o01: feature without class, o00: neither.
    def chi_squared_algo(o11, o10, o01, o00)
      denominator = (o11 + o01) * (o11 + o10) * (o10 + o00) * (o01 + o00)
      # A zero denominator would otherwise produce Infinity.
      return 0.0 if denominator.zero?

      total = o11 + o10 + o01 + o00
      (total * (o11 * o00 - o10 * o01)**2).to_f / denominator
    end

    # Holds the per-feature counts that feature selection needs.
    class FeatureValues
      attr_accessor :docs_with_feature

      def initialize
        @classes = Hash.new(0)
        @docs_with_feature = 0
      end

      def add_document_with_class(classification)
        @docs_with_feature += 1
        @classes[classification] += 1
      end

      def docs_with_class(classification)
        @classes[classification]
      end

    end

  end
end
@@ -1,109 +0,0 @@
1
- require File.join(File.dirname(__FILE__), "yaml_serialization")
2
-
3
module Basset

  # A class for running Naive Bayes classification.
  # Documents are added to the classifier; once they are added it can be used
  # to classify new documents.
  class NaiveBayes
    include YamlSerialization

    def initialize
      @number_of_documents = 0
      @number_of_documents_in_class = Hash.new(0)
      # Indexed by numbered feature name — presumably the Integer ids produced
      # by FeatureExtractor — so an Array serves as the lookup table and slot 0
      # stays unused.
      @features = []
      reset_cached_probabilities
    end

    # Records one training document. +classification+ can be a string;
    # +feature_vector+ holds numbered features (name/value pairs).
    def add_document(classification, feature_vector)
      reset_cached_probabilities

      @number_of_documents += 1
      @number_of_documents_in_class[classification] += 1

      feature_vector.each do |feature|
        (@features[feature.name] ||= FeatureCount.new)
          .add_count_for_class(feature.value, classification)
      end
    end

    # Returns the most likely class for the feature vector as a
    # [log10_probability, classification] pair.
    def classify(feature_vector)
      scored = @number_of_documents_in_class.keys.map do |classification|
        score = Math.log10(probability_of_class(classification))
        feature_vector.each do |feature|
          score += feature.value * Math.log10(probability_of_feature_given_class(feature.name, classification))
        end
        [score, classification]
      end

      # Seed with a random entry so a complete tie still yields a random
      # class; only a strictly greater score replaces the seed.
      best = scored.pick_random
      scored.each { |candidate| best = candidate if candidate.first > best.first }
      best
    end

    private

    # Cached per-class totals are invalidated whenever training data changes.
    def reset_cached_probabilities
      @occurences_of_every_feature_in_class = {}
    end

    # Total occurrences of every feature within +classification+, memoized.
    # Array#rest is presumably a project core extension that drops the unused
    # slot 0 — TODO confirm; compact skips numbered features that were never
    # initialized (possible after redundant-unigram removal).
    def number_of_occurences_of_every_feature_in_class(classification)
      @occurences_of_every_feature_in_class[classification] ||=
        @features.rest.compact.sum { |feature_count| feature_count.count_for_class(classification) }
    end

    # P(class): the fraction of training documents with this label.
    def probability_of_class(classification)
      @number_of_documents_in_class[classification] / @number_of_documents.to_f
    end

    # P(feature | class) with add-one smoothing. The rescue covers numbered
    # features that were never initialized (possible after redundant-unigram
    # removal), treating their count as zero.
    def probability_of_feature_given_class(feature, classification)
      count = (@features[feature].count_for_class(classification) rescue 0)
      (count + 1) / number_of_occurences_of_every_feature_in_class(classification).to_f
    end

    # Per-feature tally of occurrence counts keyed by class.
    class FeatureCount

      def initialize
        @classes = {}
      end

      def add_count_for_class(count, classification)
        @classes[classification] = (@classes[classification] || 0) + count
      end

      # NOTE(review): returns 1 (not 0) for a class never seen — confirm this
      # default is intentional; it interacts with the +1 smoothing above.
      def count_for_class(classification)
        @classes.fetch(classification, 1)
      end

      def count
        @classes.values.sum
      end

    end

  end
end
@@ -1,21 +0,0 @@
1
require "yaml"

# Mixin providing YAML-based persistence: including classes gain an
# instance-level #save_to_file and a class-level .load_from_file.
module YamlSerialization

  # Include hook — gives the including class the ClassMethods below.
  def self.included(base)
    base.extend(ClassMethods)
  end

  module ClassMethods
    # Reads an object back from the YAML file at +file_name+.
    # NOTE(review): with Psych 4 (Ruby 3.1+) YAML.load_file safe-loads by
    # default and rejects arbitrary object types — confirm the target Ruby.
    def load_from_file(file_name)
      YAML.load_file(file_name)
    end
  end

  # Serializes self as YAML to the file at +file_name+.
  def save_to_file(file_name)
    File.open(file_name, 'w') { |file| YAML.dump(self, file) }
  end

end
@@ -1,28 +0,0 @@
1
require File.join(File.dirname(__FILE__), "..", "spec_helper")

# Specs for Document's feature extraction pipeline: tokens are stripped of
# punctuation/numbers/symbols, lowercased, stemmed, and tallied into
# Feature objects carrying occurrence counts.
describe Document do
  it "should remove punctuation from words" do
    Document.new("abc.").vector_of_features.should == [Feature.new("abc", 1)]
  end

  it "should remove numbers from words" do
    Document.new("abc1").vector_of_features.should == [Feature.new("abc", 1)]
  end

  it "should remove symbols from words" do
    Document.new("abc%").vector_of_features.should == [Feature.new("abc", 1)]
  end

  it "should lowercase text" do
    Document.new("ABC").vector_of_features.should == [Feature.new("abc", 1)]
  end

  it "should stem words" do
    Document.new("testing").vector_of_features.should == [Feature.new("test", 1)]
  end

  # The second constructor argument is the document's classification.
  it "should count feature occurances" do
    Document.new("test doc test", :test).vector_of_features.should ==
      [Feature.new("doc", 1), Feature.new("test", 2)]
  end
end
@@ -1,32 +0,0 @@
1
require File.join(File.dirname(__FILE__), "..", "spec_helper")

# Specs for FeatureExtractor: it keeps only the features it was constructed
# with, renumbers them with 1-based integer ids, and sorts numbered output.
describe FeatureExtractor do
  # Stand-in document: only needs to respond to #vector_of_features.
  DocumentMock = Struct.new(:vector_of_features)

  # Pending examples for the YamlSerialization mixin.
  it "should save to file"
  it "should be loadable from file"

  it "should return number of features" do
    FeatureExtractor.new(%w[one two]).number_of_features.should == 2
  end

  it "should throw away extra features" do
    doc = DocumentMock.new([Feature.new("keep"), Feature.new("throwaway")])
    FeatureExtractor.new(%w[keep]).extract(doc).should == [Feature.new("keep")]
  end

  it "should extract no features from a doc with no features" do
    FeatureExtractor.new(%w[keep]).extract(DocumentMock.new([])).should == []
  end

  it "should extract numbered features" do
    doc = DocumentMock.new([Feature.new("keep", 0)])
    FeatureExtractor.new(%w[keep]).extract_numbered(doc).should == [Feature.new(1, 0)]
  end

  # Numbered output is ordered by identifier, not by input order.
  it "should sort extracted numbered features" do
    feature_extractor = FeatureExtractor.new(%w[keep1 keep2])
    doc = DocumentMock.new([Feature.new("keep2", 10), Feature.new("keep1", 20)])
    feature_extractor.extract_numbered(doc).should == [Feature.new(1, 20), Feature.new(2, 10)]
  end
end
@@ -1,108 +0,0 @@
1
require File.join(File.dirname(__FILE__), "..", "spec_helper")

# Specs for FeatureSelector: document counting, per-feature bookkeeping,
# chi-square computation, and significance-based feature selection.
describe FeatureSelector do
  DocumentMock = Struct.new(:vector_of_features, :classification)

  it "should count documents" do
    feature_selector = FeatureSelector.new
    feature_selector.docs.should == 0
    feature_selector.add_document(DocumentMock.new([]))
    feature_selector.docs.should == 1
  end

  it "should return all feature names" do
    feature_selector = FeatureSelector.new
    feature_selector.all_feature_names.should == []
    feature_selector.add_document(DocumentMock.new([Feature.new("a")]))
    feature_selector.add_document(DocumentMock.new([Feature.new("b")]))
    feature_selector.all_feature_names.should == %w[a b]
  end

  # TODO
  # it "should return_all_features_as_best
  #   feature_selector = FeatureSelector.new
  #   feature_selector.add_document(DocumentMock.new([Feature.new("a")], :test))
  #   assert_equal %w[a], feature_selector.best_features_for_classification(:test, 10)
  # end

  # The four doc-count helpers exercised below are declared private on
  # FeatureSelector, so they are invoked via #send — calling them with an
  # explicit receiver would raise NoMethodError.
  it "should count docs with feature and class" do
    feature_selector = FeatureSelector.new
    feature_selector.add_document(doc([Feature.new("viagra", 1)], :spam))
    feature_selector.add_document(doc([Feature.new("puppy", 1)], :ham))
    feature_selector.send(:docs_with_feature_and_class, "viagra", :spam).should == 1
    feature_selector.send(:docs_with_feature_and_class, "viagra", :ham).should == 0
  end

  it "should count docs with feature and not class" do
    feature_selector = FeatureSelector.new
    feature_selector.add_document(doc([Feature.new("viagra", 1)], :spam))
    feature_selector.add_document(doc([Feature.new("puppy", 1)], :ham))
    feature_selector.send(:docs_with_feature_and_not_class, "puppy", :spam).should == 1
    feature_selector.send(:docs_with_feature_and_not_class, "puppy", :ham).should == 0
  end

  it "should count docs with class and not feature" do
    feature_selector = FeatureSelector.new
    feature_selector.add_document(doc([Feature.new("viagra", 1)], :spam))
    feature_selector.add_document(doc([Feature.new("puppy", 1)], :ham))
    feature_selector.send(:docs_with_class_and_not_feature, :spam, "puppy").should == 1
    feature_selector.send(:docs_with_class_and_not_feature, :spam, "viagra").should == 0
  end

  it "should count docs without feature or class" do
    feature_selector = FeatureSelector.new
    feature_selector.add_document(doc([Feature.new("viagra", 1)], :spam))
    feature_selector.add_document(doc([Feature.new("puppy", 1)], :ham))
    feature_selector.send(:docs_without_feature_or_class, "viagra", :spam).should == 1
    feature_selector.send(:docs_without_feature_or_class, "viagra", :ham).should == 0
  end

  # A feature present in every document carries no information.
  it "should return zero chi if all docs contain feature" do
    feature_selector = FeatureSelector.new
    the = Feature.new("the", 1)
    feature_selector.add_document(doc([the], :spam))
    feature_selector.add_document(doc([the], :ham))
    feature_selector.features_with_chi(:spam).should == [Feature.new("the", 0.0)]
  end

  it "should compute chi squared" do
    feature_selector = FeatureSelector.new
    feature_selector.add_document(doc([Feature.new("viagra", 1)], :spam))
    feature_selector.add_document(doc([Feature.new("puppy", 1)], :ham))
    feature_selector.features_with_chi(:spam).should == [Feature.new("viagra", 2.0), Feature.new("puppy", 2.0)]
  end

  it "should not select any features if they are all insignificant" do
    feature_selector = FeatureSelector.new
    feature_selector.add_document(doc([Feature.new("viagra", 1)], :spam))
    feature_selector.add_document(doc([Feature.new("puppy", 1)], :ham))
    feature_selector.select_features.should == []
  end

  it "should not select features in only one doc" do
    feature_selector = FeatureSelector.new
    the = Feature.new("the", 1)
    feature_selector.add_document(doc([the, Feature.new("viagra", 1)], :spam))
    feature_selector.add_document(doc([the, Feature.new("puppy", 1)], :ham))
    feature_selector.select_features.should == []
  end

  it "should select significant features occuring in more than one doc" do
    feature_selector = FeatureSelector.new
    the = Feature.new("the", 1)
    feature_selector.add_document(doc([the, Feature.new("viagra", 1)], :spam))
    feature_selector.add_document(doc([Feature.new("viagra", 1)], :spam))
    feature_selector.add_document(doc([the, Feature.new("puppy", 1)], :ham))
    feature_selector.select_features.should == %w[viagra]
  end

  # Pending examples documenting select_features defaults and ordering.
  it "should return selected features sorted by chi squared descending"
  it "should select based on first feature by default"
  it "should select with a chi squared of 1 by default"

  private

  # Shorthand for building a mock document from a feature vector and class.
  def doc(*args)
    DocumentMock.new(*args)
  end
end
@@ -1,40 +0,0 @@
1
require File.join(File.dirname(__FILE__), "..", "spec_helper")

# Specs for the Feature value object: name/value storage, equality on both
# fields, and natural sort order (name ascending, then value ascending).
describe Feature do
  it "should store name" do
    Feature.new("hello").name.should == "hello"
  end

  it "should require name" do
    lambda { Feature.new }.should raise_error(ArgumentError)
  end

  it "should store values" do
    Feature.new("name", 2).value.should == 2
  end

  it "should default value to zero" do
    Feature.new("name").value.should == 0
  end

  it "should be equal with same name and no value" do
    Feature.new("hello").should == Feature.new("hello")
  end

  it "should be equal with same name and same value" do
    Feature.new("hello", 1).should == Feature.new("hello", 1)
  end

  it "should not be equal with different name" do
    Feature.new("hello").should_not == Feature.new("test")
  end

  it "should not be equal with same name and different value" do
    Feature.new("hello", 1).should_not == Feature.new("hello", 2)
  end

  it "should sort by name ascending then value ascending" do
    [Feature.new("b", 3), Feature.new("a", 2), Feature.new("a", 1)].sort.should ==
      [Feature.new("a", 1), Feature.new("a", 2), Feature.new("b", 3)]
  end
end