danielsdeleo-basset 1.0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,33 @@
1
+ require File.join(File.dirname(__FILE__), "..", "spec_helper")
2
+
3
+ describe FeatureExtractor do
4
+ DocumentMock = Struct.new(:vector_of_features)
5
+
6
+ it "should save to file"
7
+
8
+ it "should be loadable from file"
9
+
10
+ it "should return number of features" do
11
+ FeatureExtractor.new(%w[one two]).number_of_features.should == 2
12
+ end
13
+
14
+ it "should throw away extra features" do
15
+ doc = DocumentMock.new([Feature.new("keep"), Feature.new("throwaway")])
16
+ FeatureExtractor.new(%w[keep]).extract(doc).should == [Feature.new("keep")]
17
+ end
18
+
19
+ it "should extract no features from a doc with no features" do
20
+ FeatureExtractor.new(%w[keep]).extract(DocumentMock.new([])).should == []
21
+ end
22
+
23
+ it "should extract numbered features" do
24
+ doc = DocumentMock.new([Feature.new("keep", 0)])
25
+ FeatureExtractor.new(%w[keep]).extract_numbered(doc).should == [Feature.new(1, 0)]
26
+ end
27
+
28
+ it "should sort extracted numbered features" do
29
+ feature_extractor = FeatureExtractor.new(%w[keep1 keep2])
30
+ doc = DocumentMock.new([Feature.new("keep2", 10), Feature.new("keep1", 20)])
31
+ feature_extractor.extract_numbered(doc).should == [Feature.new(1, 20), Feature.new(2, 10)]
32
+ end
33
+ end
@@ -0,0 +1,108 @@
1
+ require File.join(File.dirname(__FILE__), "..", "spec_helper")
2
+
3
+ describe FeatureSelector do
4
+ DocumentMock = Struct.new(:vector_of_features, :classification)
5
+
6
+ it "should count documents" do
7
+ feature_selector = FeatureSelector.new
8
+ feature_selector.docs.should == 0
9
+ feature_selector.add_document(DocumentMock.new([]))
10
+ feature_selector.docs.should == 1
11
+ end
12
+
13
+ it "should return all feature names" do
14
+ feature_selector = FeatureSelector.new
15
+ feature_selector.all_feature_names.should == []
16
+ feature_selector.add_document(DocumentMock.new([Feature.new("a")]))
17
+ feature_selector.add_document(DocumentMock.new([Feature.new("b")]))
18
+ feature_selector.all_feature_names.should == %w[a b]
19
+ end
20
+
21
+ # TODO: disabled example below still uses assert_equal instead of `should ==`
22
+ # it "should return all features as best" do
23
+ # feature_selector = FeatureSelector.new
24
+ # feature_selector.add_document(DocumentMock.new([Feature.new("a")], :test))
25
+ # assert_equal %w[a], feature_selector.best_features_for_classification(:test, 10)
26
+ # end
27
+
28
+ it "should count docs with feature and class" do
29
+ feature_selector = FeatureSelector.new
30
+ feature_selector.add_document(doc([Feature.new("viagra", 1)], :spam))
31
+ feature_selector.add_document(doc([Feature.new("puppy", 1)], :ham))
32
+ feature_selector.__send__(:docs_with_feature_and_class, "viagra", :spam).should == 1
33
+ feature_selector.__send__(:docs_with_feature_and_class, "viagra", :ham).should == 0
34
+ end
35
+
36
+ it "should count docs with feature and not class" do
37
+ feature_selector = FeatureSelector.new
38
+ feature_selector.add_document(doc([Feature.new("viagra", 1)], :spam))
39
+ feature_selector.add_document(doc([Feature.new("puppy", 1)], :ham))
40
+ feature_selector.__send__(:docs_with_feature_and_not_class, "puppy", :spam).should == 1
41
+ feature_selector.__send__(:docs_with_feature_and_not_class,"puppy", :ham).should == 0
42
+ end
43
+
44
+ it "should count docs with class and not feature" do
45
+ feature_selector = FeatureSelector.new
46
+ feature_selector.add_document(doc([Feature.new("viagra", 1)], :spam))
47
+ feature_selector.add_document(doc([Feature.new("puppy", 1)], :ham))
48
+ feature_selector.__send__(:docs_with_class_and_not_feature, :spam, "puppy").should == 1
49
+ feature_selector.__send__(:docs_with_class_and_not_feature, :spam, "viagra").should == 0
50
+ end
51
+
52
+ it "should count docs without feature or class" do
53
+ feature_selector = FeatureSelector.new
54
+ feature_selector.add_document(doc([Feature.new("viagra", 1)], :spam))
55
+ feature_selector.add_document(doc([Feature.new("puppy", 1)], :ham))
56
+ feature_selector.__send__(:docs_without_feature_or_class, "viagra", :spam).should == 1
57
+ feature_selector.__send__(:docs_without_feature_or_class, "viagra", :ham).should == 0
58
+ end
59
+
60
+ it "should return zero chi if all docs contain feature" do
61
+ feature_selector = FeatureSelector.new
62
+ the = Feature.new("the", 1)
63
+ feature_selector.add_document(doc([the], :spam))
64
+ feature_selector.add_document(doc([the], :ham))
65
+ feature_selector.features_with_chi(:spam).should == [Feature.new("the", 0.0)]
66
+ end
67
+
68
+ it "should compute chi squared" do
69
+ feature_selector = FeatureSelector.new
70
+ feature_selector.add_document(doc([Feature.new("viagra", 1)], :spam))
71
+ feature_selector.add_document(doc([Feature.new("puppy", 1)], :ham))
72
+ feature_selector.features_with_chi(:spam).should == [Feature.new("viagra", 2.0), Feature.new("puppy", 2.0)]
73
+ end
74
+
75
+ it "should not select any features if they are all insignificant" do
76
+ feature_selector = FeatureSelector.new
77
+ feature_selector.add_document(doc([Feature.new("viagra", 1)], :spam))
78
+ feature_selector.add_document(doc([Feature.new("puppy", 1)], :ham))
79
+ feature_selector.select_features.should == []
80
+ end
81
+
82
+ it "should not select features in only one doc" do
83
+ feature_selector = FeatureSelector.new
84
+ the = Feature.new("the", 1)
85
+ feature_selector.add_document(doc([the, Feature.new("viagra", 1)], :spam))
86
+ feature_selector.add_document(doc([the, Feature.new("puppy", 1)], :ham))
87
+ feature_selector.select_features.should == []
88
+ end
89
+
90
+ it "should select significant features occuring in more than one doc" do
91
+ feature_selector = FeatureSelector.new
92
+ the = Feature.new("the", 1)
93
+ feature_selector.add_document(doc([the, Feature.new("viagra", 1)], :spam))
94
+ feature_selector.add_document(doc([Feature.new("viagra", 1)], :spam))
95
+ feature_selector.add_document(doc([the, Feature.new("puppy", 1)], :ham))
96
+ feature_selector.select_features.should == %w[viagra]
97
+ end
98
+
99
+ it "should return selected features sorted by chi squared descending"
100
+ it "should select based on first feature by default"
101
+ it "should select with a chi squared of 1 by default"
102
+
103
+ private
104
+
105
+ def doc(*args)
106
+ DocumentMock.new(*args)
107
+ end
108
+ end
@@ -0,0 +1,40 @@
1
+ require File.join(File.dirname(__FILE__), "..", "spec_helper")
2
+
3
+ describe Feature do
4
+ it "should store name" do
5
+ Feature.new("hello").name.should == "hello"
6
+ end
7
+
8
+ it "should require name" do
9
+ lambda { Feature.new }.should raise_error(ArgumentError)
10
+ end
11
+
12
+ it "should store values" do
13
+ Feature.new("name", 2).value.should ==2
14
+ end
15
+
16
+ it "should default value to zero" do
17
+ Feature.new("name").value.should == 0
18
+ end
19
+
20
+ it "should be equal with same name and no value" do
21
+ Feature.new("hello").should == Feature.new("hello")
22
+ end
23
+
24
+ it "should be equal with same name and same value" do
25
+ Feature.new("hello", 1).should == Feature.new("hello", 1)
26
+ end
27
+
28
+ it "should not be equal with different name" do
29
+ Feature.new("hello").should_not == Feature.new("test")
30
+ end
31
+
32
+ it "should not be equal with same name and different value" do
33
+ Feature.new("hello", 1).should_not == Feature.new("hello", 2)
34
+ end
35
+
36
+ it "should sort by name ascending then value ascending" do
37
+ [Feature.new("b", 3), Feature.new("a", 2), Feature.new("a", 1)].sort.should ==
38
+ [Feature.new("a", 1), Feature.new("a", 2), Feature.new("b", 3)]
39
+ end
40
+ end
@@ -0,0 +1,119 @@
1
+ require File.dirname(__FILE__) + '/../spec_helper'
2
+
3
+ describe NaiveBayes::FeatureCount do
4
+
5
+ it "should be equal to another feature count if the feature name and counts per class are equal" do
6
+ NaiveBayes::FeatureCount.new("rspec", :sweet, 1).should == NaiveBayes::FeatureCount.new("rspec", :sweet, 1)
7
+ end
8
+
9
+ it "should give the sum of all occurrences of a feature for a given class" do
10
+ fc = NaiveBayes::FeatureCount.new("rspec", :sweet, 1)
11
+ fc.add_count_for_class(2, :sweet)
12
+ fc.add_count_for_class(6, :super_sweet)
13
+ fc.count_for_class(:sweet).should == 3
14
+ fc.count_for_class(:super_sweet).should == 6
15
+ end
16
+
17
+ end
18
+
19
+ describe NaiveBayes do
20
+
21
+ before(:each) do
22
+ @nbayes = NaiveBayes.new
23
+ @doc = Document.new("here are some interesting words", :interesting)
24
+ @feature_vectors = @doc.feature_vectors
25
+ @other_vectors = Document.new("these words are interesting", :interesting).feature_vectors
26
+ @test_vectors = Document.new("this word seems interesting", :interesting).feature_vectors
27
+ end
28
+
29
+ def feature_counts_for(classification, *feature_count_tuples)
30
+ feature_counts = {}
31
+ feature_count_tuples.each do |tuple|
32
+ feature_counts[tuple.first] = NaiveBayes::FeatureCount.new(tuple.first, classification, tuple.last)
33
+ end
34
+ feature_counts
35
+ end
36
+
37
+ def add_interesting_docs
38
+ @nbayes.add_document(:interesting, @feature_vectors)
39
+ @nbayes.add_document(:interesting, @other_vectors)
40
+ end
41
+
42
+ def add_boring_docs
43
+ @nbayes.add_document(:boring, Document.new("yawn lets go flame").feature_vectors)
44
+ @nbayes.add_document(:boring, Document.new("yawn lets flame and troll").feature_vectors)
45
+ end
46
+
47
+ it "should keep track of the total docs and total docs for class when adding new docs" do
48
+ @nbayes.add_document(:interesting, @feature_vectors)
49
+ @nbayes.total_docs.should == 1
50
+ @nbayes.total_docs_in_class[:interesting].should == 1
51
+ end
52
+
53
+ it "should create a feature count for each feature with the # of occurances and class" do
54
+ @nbayes.add_document(:interesting, @feature_vectors)
55
+ expected = feature_counts_for(:interesting, ["here", 1], ["ar", 1],["some", 1],["interest", 1],["word", 1])
56
+ @nbayes.feature_counts.should == expected
57
+ end
58
+
59
+ it "should sum the number of all occurances of all features for a given class" do
60
+ @nbayes.add_document(:interesting, @feature_vectors)
61
+ @nbayes.occurrences_of_all_features_in_class(:interesting).should == 5
62
+ add_boring_docs
63
+ @nbayes.occurrences_of_all_features_in_class(:interesting).should == 5
64
+ end
65
+
66
+ it "should give a list of classes it knows about" do
67
+ @nbayes.add_document(:interesting, @feature_vectors)
68
+ @nbayes.add_document(:kinda_interesting, @feature_vectors)
69
+ @nbayes.classes.sort_to_s.should == [:kinda_interesting, :interesting].sort_to_s
70
+ end
71
+
72
+ it "should not divide by zero or nil when determining the probability of a feature vector for a class" do
73
+ @nbayes.occurrences_of_all_features_in_class(:interesting).should be_nil
74
+ probability_computation = lambda {@nbayes.probability_of_vector_for_class(@test_vectors.first, :interesting)}
75
+ probability_computation.should_not raise_error
76
+ end
77
+
78
+ it "should compute the probability that a given (singular) feature vector belongs to a given class" do
79
+ @nbayes.add_document(:interesting, @feature_vectors)
80
+ probablity = @nbayes.probability_of_vector_for_class(@test_vectors.first, :interesting)
81
+ probablity.should be_a Float
82
+ probablity.round.should == -1
83
+ end
84
+
85
+ it "should compute the probability that a given set of feature vectors belongs to a given class" do
86
+ add_interesting_docs
87
+ probability = @nbayes.probability_of_vectors_for_class(@test_vectors, :interesting)
88
+ probability.should be_a(Float)
89
+ probability.round.should == -5
90
+ end
91
+
92
+ it "should compute a probability of a class for a set of vectors normalized by the number of features" do
93
+ add_interesting_docs
94
+ probability = @nbayes.probability_of_vectors_for_class(@test_vectors, :interesting, :normalize => true)
95
+ probability.should be_a Float
96
+ probability.round.should == -1
97
+ end
98
+
99
+ it "should determine the most likely class of a set feature vectors" do
100
+ add_interesting_docs
101
+ add_boring_docs
102
+ test_vectors = Document.new("some interesting words").feature_vectors
103
+ classification = @nbayes.classify(test_vectors, :normalize_classes => false)
104
+ classification.last.should == :interesting
105
+ classification.first.should be_a Float
106
+ classification.first.round.should == -2
107
+ end
108
+
109
+ it "should account for the relative probabilities of classes by default when classifying" do
110
+ add_interesting_docs
111
+ add_boring_docs
112
+ vectors = Document.new("some interesting words").feature_vectors
113
+ classification = @nbayes.classify(vectors)
114
+ classification.last.should == :interesting
115
+ classification.first.should be_a Float
116
+ classification.first.round.should == -2
117
+ end
118
+
119
+ end
@@ -0,0 +1,83 @@
1
+ require File.dirname(__FILE__) + '/../spec_helper'
2
+
3
+ describe Svm do
4
+
5
+ before(:each) do
6
+ @svm = Svm.new
7
+ @doc = Document.new("here are some interesting words", :interesting)
8
+ @feature_vectors = @doc.feature_vectors
9
+ @negative_doc = Document.new("let's eat some spamiagra", :uninteresting)
10
+ @negative_fvs = @negative_doc.feature_vectors
11
+ end
12
+
13
+ def add_simple_docs_to_svm
14
+ @svm.add_document(:interesting, @feature_vectors)
15
+ @svm.add_document(:uninteresting, @negative_fvs)
16
+ end
17
+
18
+ it "should list document classes it knows about" do
19
+ add_simple_docs_to_svm
20
+ @svm.classes.sort_to_s.should == [:interesting, :uninteresting].sort_to_s
21
+ end
22
+
23
+ it "should express classes as SVM friendly unit integer labels"do
24
+ add_simple_docs_to_svm
25
+ spam_doc = Document.new("make your junk repulsive to women with free v14gra")
26
+ @svm.add_document(:uninteresting, spam_doc.feature_vectors)
27
+ @svm.class_labels.should == {:interesting => 0, :uninteresting => 1}
28
+ end
29
+
30
+ it "should create a feature dictionary based on training docs" do
31
+ add_simple_docs_to_svm
32
+ expected = (@feature_vectors + @negative_fvs).map { |fv| fv.name }.uniq
33
+ @svm.feature_dictionary.should == expected
34
+ end
35
+
36
+ it "should express documents as SVM friendly vectors using the binary method" do
37
+ # for a brief but usable description of binary, frequency, tf-idf, and
38
+ # Hadamard vector representations of documents, see section 2.1 (2nd page) of:
39
+ # http://jmlr.csail.mit.edu/papers/volume2/manevitz01a/manevitz01a.pdf
40
+ add_simple_docs_to_svm
41
+ @svm.vectorized_docs(:interesting).first.should == [1,1,1,1,1,0,0,0]
42
+ @svm.vectorized_docs(:uninteresting).first.should == [0,0,0,0,1,1,1,1]
43
+ end
44
+
45
+ it "should set SVM parameters to reasonable defaults and allow access via a block" do
46
+ @svm.parameters do |params|
47
+ params.C.should == 100
48
+ params.svm_type.should == NU_SVC
49
+ params.degree.should == 1
50
+ params.coef0.should == 0
51
+ params.eps.should == 0.001
52
+ params.kernel_type.should == RBF
53
+ end
54
+ end
55
+
56
+ it "should construct the list of labels and document feature vectors" do
57
+ add_simple_docs_to_svm
58
+ result = @svm.labels_and_document_vectors
59
+ result[:labels].sort.should == [0,1]
60
+ result[:features].sort.should == [[0,0,0,0,1,1,1,1], [1,1,1,1,1,0,0,0]]
61
+ # can't count on consistent Hash#each ordering, hence this:
62
+ expected_result_with_hash_ordering_workaround = {1 => [0,0,0,0,1,1,1,1], 0 => [1,1,1,1,1,0,0,0]}
63
+ stabilized_actual_result = {result[:labels].first => result[:features].first,
64
+ result[:labels].last => result[:features].last}
65
+ stabilized_actual_result.should == expected_result_with_hash_ordering_workaround
66
+ end
67
+
68
+ it "should classify unlabeled documents" do
69
+ # examples from http://www.igvita.com/2008/01/07/support-vector-machines-svm-in-ruby/
70
+ non_spam_texts = ["Peter and Stewie are hilarious", "New episode rocks, Peter and Stewie are hilarious",
71
+ "Peter is my fav!"]
72
+ spam_texts = ["FREE NATIONAL TREASURE", "FREE TV for EVERY visitor", "AS SEEN ON NATIONAL TV",
73
+ "FREE drugs"]
74
+ non_spam_texts.each { |t| @svm.add_document(:nonspam, Document.new(t, :nonspam).feature_vectors) }
75
+ spam_texts.each { |t| @svm.add_document(:spam, Document.new(t, :spam).feature_vectors) }
76
+ test_non_spams = ["Stewie is hilarious", "Poor Peter is hilarious"]
77
+ test_spam = "FREE lotterry for the NATIONAL TREASURE !!!"
78
+ @svm.classify(Document.new(test_non_spams.first).feature_vectors).should == :nonspam
79
+ @svm.classify(Document.new(test_non_spams.last).feature_vectors).should == :nonspam
80
+ @svm.classify(Document.new(test_spam).feature_vectors).should == :spam
81
+ end
82
+
83
+ end
metadata ADDED
@@ -0,0 +1,94 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: danielsdeleo-basset
3
+ version: !ruby/object:Gem::Version
4
+ version: 1.0.4
5
+ platform: ruby
6
+ authors:
7
+ - Paul Dix
8
+ - Bryan Helmkamp
9
+ - Daniel DeLeo
10
+ autorequire:
11
+ bindir: bin
12
+ cert_chain: []
13
+
14
+ date: 2009-05-09 00:00:00 -07:00
15
+ default_executable:
16
+ dependencies:
17
+ - !ruby/object:Gem::Dependency
18
+ name: stemmer
19
+ type: :runtime
20
+ version_requirement:
21
+ version_requirements: !ruby/object:Gem::Requirement
22
+ requirements:
23
+ - - ">="
24
+ - !ruby/object:Gem::Version
25
+ version: 1.0.1
26
+ version:
27
+ description: A library for machine learning and classification
28
+ email: ddeleo@basecommander.net
29
+ executables: []
30
+
31
+ extensions: []
32
+
33
+ extra_rdoc_files:
34
+ - README.rdoc
35
+ files:
36
+ - History.txt
37
+ - License.txt
38
+ - Manifest.txt
39
+ - README.rdoc
40
+ - Rakefile
41
+ - VERSION.yml
42
+ - basset.gemspec
43
+ - examples/example.rb
44
+ - lib/basset.rb
45
+ - lib/basset/classification_evaluator.rb
46
+ - lib/basset/classifier.rb
47
+ - lib/basset/core_extensions.rb
48
+ - lib/basset/document.rb
49
+ - lib/basset/document_override_example.rb
50
+ - lib/basset/feature.rb
51
+ - lib/basset/feature_extractor.rb
52
+ - lib/basset/feature_selector.rb
53
+ - lib/basset/naive_bayes.rb
54
+ - lib/basset/svm.rb
55
+ - lib/basset/yaml_serialization.rb
56
+ - spec/spec.opts
57
+ - spec/spec_helper.rb
58
+ - spec/unit/classifier_spec.rb
59
+ - spec/unit/core_extension_spec.rb
60
+ - spec/unit/document_spec.rb
61
+ - spec/unit/feature_extractor_spec.rb
62
+ - spec/unit/feature_selector_spec.rb
63
+ - spec/unit/feature_spec.rb
64
+ - spec/unit/naive_bayes_spec.rb
65
+ - spec/unit/svm_spec.rb
66
+ has_rdoc: false
67
+ homepage: http://github.com/danielsdeleo/basset
68
+ post_install_message:
69
+ rdoc_options:
70
+ - --inline-source
71
+ - --charset=UTF-8
72
+ require_paths:
73
+ - - lib
74
+ required_ruby_version: !ruby/object:Gem::Requirement
75
+ requirements:
76
+ - - ">="
77
+ - !ruby/object:Gem::Version
78
+ version: "0"
79
+ version:
80
+ required_rubygems_version: !ruby/object:Gem::Requirement
81
+ requirements:
82
+ - - ">="
83
+ - !ruby/object:Gem::Version
84
+ version: "0"
85
+ version:
86
+ requirements: []
87
+
88
+ rubyforge_project:
89
+ rubygems_version: 1.2.0
90
+ signing_key:
91
+ specification_version: 3
92
+ summary: A library for machine learning and classification
93
+ test_files: []
94
+