rjspotter-basset 1.0.5

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,33 @@
1
+ require File.join(File.dirname(__FILE__), "..", "spec_helper")
2
+
3
+ describe FeatureExtractor do
4
+ DocumentMock = Struct.new(:vector_of_features)
5
+
6
+ it "should save to file"
7
+
8
+ it "should be loadable from file"
9
+
10
+ it "should return number of features" do
11
+ FeatureExtractor.new(%w[one two]).number_of_features.should == 2
12
+ end
13
+
14
+ it "should throw away extra features" do
15
+ doc = DocumentMock.new([Feature.new("keep"), Feature.new("throwaway")])
16
+ FeatureExtractor.new(%w[keep]).extract(doc).should == [Feature.new("keep")]
17
+ end
18
+
19
+ it "should extract no features from a doc with no features" do
20
+ FeatureExtractor.new(%w[keep]).extract(DocumentMock.new([])).should == []
21
+ end
22
+
23
+ it "should extract numbered features" do
24
+ doc = DocumentMock.new([Feature.new("keep", 0)])
25
+ FeatureExtractor.new(%w[keep]).extract_numbered(doc).should == [Feature.new(1, 0)]
26
+ end
27
+
28
+ it "should sort extracted numbered features" do
29
+ feature_extractor = FeatureExtractor.new(%w[keep1 keep2])
30
+ doc = DocumentMock.new([Feature.new("keep2", 10), Feature.new("keep1", 20)])
31
+ feature_extractor.extract_numbered(doc).should == [Feature.new(1, 20), Feature.new(2, 10)]
32
+ end
33
+ end
@@ -0,0 +1,108 @@
1
+ require File.join(File.dirname(__FILE__), "..", "spec_helper")
2
+
3
+ describe FeatureSelector do
4
+ DocumentMock = Struct.new(:vector_of_features, :classification)
5
+
6
+ it "should count documents" do
7
+ feature_selector = FeatureSelector.new
8
+ feature_selector.docs.should == 0
9
+ feature_selector.add_document(DocumentMock.new([]))
10
+ feature_selector.docs.should == 1
11
+ end
12
+
13
+ it "should return all feature names" do
14
+ feature_selector = FeatureSelector.new
15
+ feature_selector.all_feature_names.should == []
16
+ feature_selector.add_document(DocumentMock.new([Feature.new("a")]))
17
+ feature_selector.add_document(DocumentMock.new([Feature.new("b")]))
18
+ feature_selector.all_feature_names.should == %w[a b]
19
+ end
20
+
21
+ # TODO
22
+ # it "should return all features as best" do
23
+ # feature_selector = FeatureSelector.new
24
+ # feature_selector.add_document(DocumentMock.new([Feature.new("a")], :test))
25
+ # assert_equal %w[a], feature_selector.best_features_for_classification(:test, 10)
26
+ # end
27
+
28
+ it "should count docs with feature and class" do
29
+ feature_selector = FeatureSelector.new
30
+ feature_selector.add_document(doc([Feature.new("viagra", 1)], :spam))
31
+ feature_selector.add_document(doc([Feature.new("puppy", 1)], :ham))
32
+ feature_selector.__send__(:docs_with_feature_and_class, "viagra", :spam).should == 1
33
+ feature_selector.__send__(:docs_with_feature_and_class, "viagra", :ham).should == 0
34
+ end
35
+
36
+ it "should count docs with feature and not class" do
37
+ feature_selector = FeatureSelector.new
38
+ feature_selector.add_document(doc([Feature.new("viagra", 1)], :spam))
39
+ feature_selector.add_document(doc([Feature.new("puppy", 1)], :ham))
40
+ feature_selector.__send__(:docs_with_feature_and_not_class, "puppy", :spam).should == 1
41
+ feature_selector.__send__(:docs_with_feature_and_not_class,"puppy", :ham).should == 0
42
+ end
43
+
44
+ it "should count docs with class and not feature" do
45
+ feature_selector = FeatureSelector.new
46
+ feature_selector.add_document(doc([Feature.new("viagra", 1)], :spam))
47
+ feature_selector.add_document(doc([Feature.new("puppy", 1)], :ham))
48
+ feature_selector.__send__(:docs_with_class_and_not_feature, :spam, "puppy").should == 1
49
+ feature_selector.__send__(:docs_with_class_and_not_feature, :spam, "viagra").should == 0
50
+ end
51
+
52
+ it "should count docs without feature or class" do
53
+ feature_selector = FeatureSelector.new
54
+ feature_selector.add_document(doc([Feature.new("viagra", 1)], :spam))
55
+ feature_selector.add_document(doc([Feature.new("puppy", 1)], :ham))
56
+ feature_selector.__send__(:docs_without_feature_or_class, "viagra", :spam).should == 1
57
+ feature_selector.__send__(:docs_without_feature_or_class, "viagra", :ham).should == 0
58
+ end
59
+
60
+ it "should return zero chi if all docs contain feature" do
61
+ feature_selector = FeatureSelector.new
62
+ the = Feature.new("the", 1)
63
+ feature_selector.add_document(doc([the], :spam))
64
+ feature_selector.add_document(doc([the], :ham))
65
+ feature_selector.features_with_chi(:spam).should == [Feature.new("the", 0.0)]
66
+ end
67
+
68
+ it "should compute chi squared" do
69
+ feature_selector = FeatureSelector.new
70
+ feature_selector.add_document(doc([Feature.new("viagra", 1)], :spam))
71
+ feature_selector.add_document(doc([Feature.new("puppy", 1)], :ham))
72
+ feature_selector.features_with_chi(:spam).should == [Feature.new("viagra", 2.0), Feature.new("puppy", 2.0)]
73
+ end
74
+
75
+ it "should not select any features if they are all insignificant" do
76
+ feature_selector = FeatureSelector.new
77
+ feature_selector.add_document(doc([Feature.new("viagra", 1)], :spam))
78
+ feature_selector.add_document(doc([Feature.new("puppy", 1)], :ham))
79
+ feature_selector.select_features.should == []
80
+ end
81
+
82
+ it "should not select features in only one doc" do
83
+ feature_selector = FeatureSelector.new
84
+ the = Feature.new("the", 1)
85
+ feature_selector.add_document(doc([the, Feature.new("viagra", 1)], :spam))
86
+ feature_selector.add_document(doc([the, Feature.new("puppy", 1)], :ham))
87
+ feature_selector.select_features.should == []
88
+ end
89
+
90
+ it "should select significant features occuring in more than one doc" do
91
+ feature_selector = FeatureSelector.new
92
+ the = Feature.new("the", 1)
93
+ feature_selector.add_document(doc([the, Feature.new("viagra", 1)], :spam))
94
+ feature_selector.add_document(doc([Feature.new("viagra", 1)], :spam))
95
+ feature_selector.add_document(doc([the, Feature.new("puppy", 1)], :ham))
96
+ feature_selector.select_features.should == %w[viagra]
97
+ end
98
+
99
+ it "should return selected features sorted by chi squared descending"
100
+ it "should select based on first feature by default"
101
+ it "should select with a chi squared of 1 by default"
102
+
103
+ private
104
+
105
+ def doc(*args)
106
+ DocumentMock.new(*args)
107
+ end
108
+ end
@@ -0,0 +1,40 @@
1
+ require File.join(File.dirname(__FILE__), "..", "spec_helper")
2
+
3
+ describe Feature do
4
+ it "should store name" do
5
+ Feature.new("hello").name.should == "hello"
6
+ end
7
+
8
+ it "should require name" do
9
+ lambda { Feature.new }.should raise_error(ArgumentError)
10
+ end
11
+
12
+ it "should store values" do
13
+ Feature.new("name", 2).value.should ==2
14
+ end
15
+
16
+ it "should default value to zero" do
17
+ Feature.new("name").value.should == 0
18
+ end
19
+
20
+ it "should be equal with same name and no value" do
21
+ Feature.new("hello").should == Feature.new("hello")
22
+ end
23
+
24
+ it "should be equal with same name and same value" do
25
+ Feature.new("hello", 1).should == Feature.new("hello", 1)
26
+ end
27
+
28
+ it "should not be equal with different name" do
29
+ Feature.new("hello").should_not == Feature.new("test")
30
+ end
31
+
32
+ it "should not be equal with same name and different value" do
33
+ Feature.new("hello", 1).should_not == Feature.new("hello", 2)
34
+ end
35
+
36
+ it "should sort by name ascending then value ascending" do
37
+ [Feature.new("b", 3), Feature.new("a", 2), Feature.new("a", 1)].sort.should ==
38
+ [Feature.new("a", 1), Feature.new("a", 2), Feature.new("b", 3)]
39
+ end
40
+ end
@@ -0,0 +1,119 @@
1
+ require File.dirname(__FILE__) + '/../spec_helper'
2
+
3
+ describe NaiveBayes::FeatureCount do
4
+
5
+ it "should be equal to another feature count if the feature name and counts per class are equal" do
6
+ NaiveBayes::FeatureCount.new("rspec", :sweet, 1).should == NaiveBayes::FeatureCount.new("rspec", :sweet, 1)
7
+ end
8
+
9
+ it "should give the sum of all occurrences of a feature for a given class" do
10
+ fc = NaiveBayes::FeatureCount.new("rspec", :sweet, 1)
11
+ fc.add_count_for_class(2, :sweet)
12
+ fc.add_count_for_class(6, :super_sweet)
13
+ fc.count_for_class(:sweet).should == 3
14
+ fc.count_for_class(:super_sweet).should == 6
15
+ end
16
+
17
+ end
18
+
19
+ describe NaiveBayes do
20
+
21
+ before(:each) do
22
+ @nbayes = NaiveBayes.new
23
+ @doc = Document.new("here are some interesting words", :interesting)
24
+ @feature_vectors = @doc.feature_vectors
25
+ @other_vectors = Document.new("these words are interesting", :interesting).feature_vectors
26
+ @test_vectors = Document.new("this word seems interesting", :interesting).feature_vectors
27
+ end
28
+
29
+ def feature_counts_for(classification, *feature_count_tuples)
30
+ feature_counts = {}
31
+ feature_count_tuples.each do |tuple|
32
+ feature_counts[tuple.first] = NaiveBayes::FeatureCount.new(tuple.first, classification, tuple.last)
33
+ end
34
+ feature_counts
35
+ end
36
+
37
+ def add_interesting_docs
38
+ @nbayes.add_document(:interesting, @feature_vectors)
39
+ @nbayes.add_document(:interesting, @other_vectors)
40
+ end
41
+
42
+ def add_boring_docs
43
+ @nbayes.add_document(:boring, Document.new("yawn lets go flame").feature_vectors)
44
+ @nbayes.add_document(:boring, Document.new("yawn lets flame and troll").feature_vectors)
45
+ end
46
+
47
+ it "should keep track of the total docs and total docs for class when adding new docs" do
48
+ @nbayes.add_document(:interesting, @feature_vectors)
49
+ @nbayes.total_docs.should == 1
50
+ @nbayes.total_docs_in_class[:interesting].should == 1
51
+ end
52
+
53
+ it "should create a feature count for each feature with the # of occurances and class" do
54
+ @nbayes.add_document(:interesting, @feature_vectors)
55
+ expected = feature_counts_for(:interesting, ["here", 1], ["ar", 1],["some", 1],["interest", 1],["word", 1])
56
+ @nbayes.feature_counts.should == expected
57
+ end
58
+
59
+ it "should sum the number of all occurances of all features for a given class" do
60
+ @nbayes.add_document(:interesting, @feature_vectors)
61
+ @nbayes.occurrences_of_all_features_in_class(:interesting).should == 5
62
+ add_boring_docs
63
+ @nbayes.occurrences_of_all_features_in_class(:interesting).should == 5
64
+ end
65
+
66
+ it "should give a list of classes it knows about" do
67
+ @nbayes.add_document(:interesting, @feature_vectors)
68
+ @nbayes.add_document(:kinda_interesting, @feature_vectors)
69
+ @nbayes.classes.sort_to_s.should == [:kinda_interesting, :interesting].sort_to_s
70
+ end
71
+
72
+ it "should not divide by zero or nil when determining the probability of a feature vector for a class" do
73
+ @nbayes.occurrences_of_all_features_in_class(:interesting).should be_nil
74
+ probability_computation = lambda {@nbayes.probability_of_vector_for_class(@test_vectors.first, :interesting)}
75
+ probability_computation.should_not raise_error
76
+ end
77
+
78
+ it "should compute the probability that a given (singular) feature vector belongs to a given class" do
79
+ @nbayes.add_document(:interesting, @feature_vectors)
80
+ probablity = @nbayes.probability_of_vector_for_class(@test_vectors.first, :interesting)
81
+ probablity.should be_a Float
82
+ probablity.round.should == -1
83
+ end
84
+
85
+ it "should compute the probability that a given set of feature vectors belongs to a given class" do
86
+ add_interesting_docs
87
+ probability = @nbayes.probability_of_vectors_for_class(@test_vectors, :interesting)
88
+ probability.should be_a(Float)
89
+ probability.round.should == -5
90
+ end
91
+
92
+ it "should compute a probability of a class for a set of vectors normalized by the number of features" do
93
+ add_interesting_docs
94
+ probability = @nbayes.probability_of_vectors_for_class(@test_vectors, :interesting, :normalize => true)
95
+ probability.should be_a Float
96
+ probability.round.should == -1
97
+ end
98
+
99
+ it "should determine the most likely class of a set feature vectors" do
100
+ add_interesting_docs
101
+ add_boring_docs
102
+ test_vectors = Document.new("some interesting words").feature_vectors
103
+ classification = @nbayes.classify(test_vectors, :normalize_classes => false)
104
+ classification.last.should == :interesting
105
+ classification.first.should be_a Float
106
+ classification.first.round.should == -2
107
+ end
108
+
109
+ it "should account for the relative probabilities of classes by default when classifying" do
110
+ add_interesting_docs
111
+ add_boring_docs
112
+ vectors = Document.new("some interesting words").feature_vectors
113
+ classification = @nbayes.classify(vectors)
114
+ classification.last.should == :interesting
115
+ classification.first.should be_a Float
116
+ classification.first.round.should == -2
117
+ end
118
+
119
+ end
@@ -0,0 +1,83 @@
1
+ require File.dirname(__FILE__) + '/../spec_helper'
2
+
3
+ describe Svm do
4
+
5
+ before(:each) do
6
+ @svm = Svm.new
7
+ @doc = Document.new("here are some interesting words", :interesting)
8
+ @feature_vectors = @doc.feature_vectors
9
+ @negative_doc = Document.new("let's eat some spamiagra", :uninteresting)
10
+ @negative_fvs = @negative_doc.feature_vectors
11
+ end
12
+
13
+ def add_simple_docs_to_svm
14
+ @svm.add_document(:interesting, @feature_vectors)
15
+ @svm.add_document(:uninteresting, @negative_fvs)
16
+ end
17
+
18
+ it "should list document classes it knows about" do
19
+ add_simple_docs_to_svm
20
+ @svm.classes.sort_to_s.should == [:interesting, :uninteresting].sort_to_s
21
+ end
22
+
23
+ it "should express classes as SVM friendly unit integer labels"do
24
+ add_simple_docs_to_svm
25
+ spam_doc = Document.new("make your junk repulsive to women with free v14gra")
26
+ @svm.add_document(:uninteresting, spam_doc.feature_vectors)
27
+ @svm.class_labels.should == {:interesting => 0, :uninteresting => 1}
28
+ end
29
+
30
+ it "should create a feature dictionary based on training docs" do
31
+ add_simple_docs_to_svm
32
+ expected = (@feature_vectors + @negative_fvs).map { |fv| fv.name }.uniq
33
+ @svm.feature_dictionary.should == expected
34
+ end
35
+
36
+ it "should express documents as SVM friendly vectors using the binary method" do
37
+ # for a brief but usable description of binary, frequency, tf-idf, and
38
+ # Hadamard vector representations of documents, see section 2.1 (2nd page) of:
39
+ # http://jmlr.csail.mit.edu/papers/volume2/manevitz01a/manevitz01a.pdf
40
+ add_simple_docs_to_svm
41
+ @svm.vectorized_docs(:interesting).first.should == [1,1,1,1,1,0,0,0]
42
+ @svm.vectorized_docs(:uninteresting).first.should == [0,0,0,0,1,1,1,1]
43
+ end
44
+
45
+ it "should set SVM parameters to reasonable defaults and allow access via a block" do
46
+ @svm.parameters do |params|
47
+ params.C.should == 100
48
+ params.svm_type.should == NU_SVC
49
+ params.degree.should == 1
50
+ params.coef0.should == 0
51
+ params.eps.should == 0.001
52
+ params.kernel_type.should == RBF
53
+ end
54
+ end
55
+
56
+ it "should construct the list of labels and document feature vectors" do
57
+ add_simple_docs_to_svm
58
+ result = @svm.labels_and_document_vectors
59
+ result[:labels].sort.should == [0,1]
60
+ result[:features].sort.should == [[0,0,0,0,1,1,1,1], [1,1,1,1,1,0,0,0]]
61
+ # can't count on consistent Hash#each ordering, hence this:
62
+ expected_result_with_hash_ordering_workaround = {1 => [0,0,0,0,1,1,1,1], 0 => [1,1,1,1,1,0,0,0]}
63
+ stabilized_actual_result = {result[:labels].first => result[:features].first,
64
+ result[:labels].last => result[:features].last}
65
+ stabilized_actual_result.should == expected_result_with_hash_ordering_workaround
66
+ end
67
+
68
+ it "should classify unlabeled documents" do
69
+ # examples from http://www.igvita.com/2008/01/07/support-vector-machines-svm-in-ruby/
70
+ non_spam_texts = ["Peter and Stewie are hilarious", "New episode rocks, Peter and Stewie are hilarious",
71
+ "Peter is my fav!"]
72
+ spam_texts = ["FREE NATIONAL TREASURE", "FREE TV for EVERY visitor", "AS SEEN ON NATIONAL TV",
73
+ "FREE drugs"]
74
+ non_spam_texts.each { |t| @svm.add_document(:nonspam, Document.new(t, :nonspam).feature_vectors) }
75
+ spam_texts.each { |t| @svm.add_document(:spam, Document.new(t, :spam).feature_vectors) }
76
+ test_non_spams = ["Stewie is hilarious", "Poor Peter is hilarious"]
77
+ test_spam = "FREE lotterry for the NATIONAL TREASURE !!!"
78
+ @svm.classify(Document.new(test_non_spams.first).feature_vectors).should == :nonspam
79
+ @svm.classify(Document.new(test_non_spams.last).feature_vectors).should == :nonspam
80
+ @svm.classify(Document.new(test_spam).feature_vectors).should == :spam
81
+ end
82
+
83
+ end
metadata ADDED
@@ -0,0 +1,115 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: rjspotter-basset
3
+ version: !ruby/object:Gem::Version
4
+ version: 1.0.5
5
+ platform: ruby
6
+ authors:
7
+ - Paul Dix
8
+ - Bryan Helmkamp
9
+ - Daniel DeLeo
10
+ - R. Potter
11
+ autorequire:
12
+ bindir: bin
13
+ cert_chain: []
14
+
15
+ date: 2009-05-09 00:00:00 -07:00
16
+ default_executable:
17
+ dependencies:
18
+ - !ruby/object:Gem::Dependency
19
+ name: stemmer
20
+ type: :runtime
21
+ version_requirement:
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ">="
25
+ - !ruby/object:Gem::Version
26
+ version: 1.0.1
27
+ version:
28
+ - !ruby/object:Gem::Dependency
29
+ name: tomz-libsvm-ruby-swig
30
+ type: :runtime
31
+ version_requirement:
32
+ version_requirements: !ruby/object:Gem::Requirement
33
+ requirements:
34
+ - - ">="
35
+ - !ruby/object:Gem::Version
36
+ version: 0.3.3
37
+ version:
38
+ - !ruby/object:Gem::Dependency
39
+ name: igrigorik-bloomfilter
40
+ type: :runtime
41
+ version_requirement:
42
+ version_requirements: !ruby/object:Gem::Requirement
43
+ requirements:
44
+ - - ">="
45
+ - !ruby/object:Gem::Version
46
+ version: 0.1.2
47
+ version:
48
+ description: A library for machine learning and classification
49
+ email: rjspotter@gmail.com
50
+ executables: []
51
+
52
+ extensions: []
53
+
54
+ extra_rdoc_files:
55
+ - README.rdoc
56
+ files:
57
+ - History.txt
58
+ - License.txt
59
+ - Manifest.txt
60
+ - README.rdoc
61
+ - Rakefile
62
+ - VERSION.yml
63
+ - basset.gemspec
64
+ - examples/example.rb
65
+ - lib/basset.rb
66
+ - lib/basset/classification_evaluator.rb
67
+ - lib/basset/classifier.rb
68
+ - lib/basset/core_extensions.rb
69
+ - lib/basset/document.rb
70
+ - lib/basset/document_override_example.rb
71
+ - lib/basset/feature.rb
72
+ - lib/basset/feature_extractor.rb
73
+ - lib/basset/feature_selector.rb
74
+ - lib/basset/naive_bayes.rb
75
+ - lib/basset/svm.rb
76
+ - lib/basset/yaml_serialization.rb
77
+ - spec/spec.opts
78
+ - spec/spec_helper.rb
79
+ - spec/unit/classifier_spec.rb
80
+ - spec/unit/core_extension_spec.rb
81
+ - spec/unit/document_spec.rb
82
+ - spec/unit/feature_extractor_spec.rb
83
+ - spec/unit/feature_selector_spec.rb
84
+ - spec/unit/feature_spec.rb
85
+ - spec/unit/naive_bayes_spec.rb
86
+ - spec/unit/svm_spec.rb
87
+ has_rdoc: false
88
+ homepage: http://github.com/danielsdeleo/basset
89
+ post_install_message:
90
+ rdoc_options:
91
+ - --inline-source
92
+ - --charset=UTF-8
93
+ require_paths:
94
+ - - lib
95
+ required_ruby_version: !ruby/object:Gem::Requirement
96
+ requirements:
97
+ - - ">="
98
+ - !ruby/object:Gem::Version
99
+ version: "0"
100
+ version:
101
+ required_rubygems_version: !ruby/object:Gem::Requirement
102
+ requirements:
103
+ - - ">="
104
+ - !ruby/object:Gem::Version
105
+ version: "0"
106
+ version:
107
+ requirements: []
108
+
109
+ rubyforge_project:
110
+ rubygems_version: 1.2.0
111
+ signing_key:
112
+ specification_version: 3
113
+ summary: A library for machine learning and classification
114
+ test_files: []
115
+