RubyGems - danielsdeleo-basset - Versions diffs - 1.0.4 - Mend

danielsdeleo-basset 1.0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (31) hide show

data/History.txt +7 -0
data/License.txt +20 -0
data/Manifest.txt +21 -0
data/README.rdoc +58 -0
data/Rakefile +63 -0
data/VERSION.yml +4 -0
data/basset.gemspec +32 -0
data/examples/example.rb +25 -0
data/lib/basset.rb +9 -0
data/lib/basset/classification_evaluator.rb +170 -0
data/lib/basset/classifier.rb +188 -0
data/lib/basset/core_extensions.rb +93 -0
data/lib/basset/document.rb +84 -0
data/lib/basset/document_override_example.rb +11 -0
data/lib/basset/feature.rb +26 -0
data/lib/basset/feature_extractor.rb +52 -0
data/lib/basset/feature_selector.rb +126 -0
data/lib/basset/naive_bayes.rb +151 -0
data/lib/basset/svm.rb +180 -0
data/lib/basset/yaml_serialization.rb +41 -0
data/spec/spec.opts +1 -0
data/spec/spec_helper.rb +12 -0
data/spec/unit/classifier_spec.rb +166 -0
data/spec/unit/core_extension_spec.rb +33 -0
data/spec/unit/document_spec.rb +59 -0
data/spec/unit/feature_extractor_spec.rb +33 -0
data/spec/unit/feature_selector_spec.rb +108 -0
data/spec/unit/feature_spec.rb +40 -0
data/spec/unit/naive_bayes_spec.rb +119 -0
data/spec/unit/svm_spec.rb +83 -0
metadata +94 -0

data/spec/unit/feature_extractor_spec.rb ADDED Viewed

@@ -0,0 +1,33 @@
+require File.join(File.dirname(__FILE__), "..", "spec_helper")
+describe FeatureExtractor do # Specs for FeatureExtractor: feature count, filtering by name, and numbered extraction.
+  DocumentMock = Struct.new(:vector_of_features) # document stand-in with a single member; NOTE(review): feature_selector_spec.rb also assigns DocumentMock (with an extra :classification member) - presumably the same top-level constant, confirm when both specs load together
+  it "should save to file" # no block given: placeholder for a spec that is not written yet
+  it "should be loadable from file" # no block given: placeholder for a spec that is not written yet
+  it "should return number of features" do
+    FeatureExtractor.new(%w[one two]).number_of_features.should == 2 # two names passed in, two features reported
+  end
+  it "should throw away extra features" do
+    doc = DocumentMock.new([Feature.new("keep"), Feature.new("throwaway")])
+    FeatureExtractor.new(%w[keep]).extract(doc).should == [Feature.new("keep")] # only the feature whose name was given to the extractor is expected back
+  end
+  it "should extract no features from a doc with no features" do
+    FeatureExtractor.new(%w[keep]).extract(DocumentMock.new([])).should == [] # empty feature vector in, empty array out
+  end
+  it "should extract numbered features" do
+    doc = DocumentMock.new([Feature.new("keep", 0)])
+    FeatureExtractor.new(%w[keep]).extract_numbered(doc).should == [Feature.new(1, 0)] # the name "keep" is expected to become the number 1; the value 0 is carried over
+  end
+  it "should sort extracted numbered features" do
+    feature_extractor = FeatureExtractor.new(%w[keep1 keep2])
+    doc = DocumentMock.new([Feature.new("keep2", 10), Feature.new("keep1", 20)]) # document lists the features in the reverse of the extractor's order
+    feature_extractor.extract_numbered(doc).should == [Feature.new(1, 20), Feature.new(2, 10)] # result is expected in ascending feature-number order
+  end
+end

data/spec/unit/feature_selector_spec.rb ADDED Viewed

@@ -0,0 +1,108 @@
+require File.join(File.dirname(__FILE__), "..", "spec_helper")
+describe FeatureSelector do # Specs for FeatureSelector: document counting, per-class feature counts, chi-squared scores and feature selection.
+  DocumentMock = Struct.new(:vector_of_features, :classification) # document stand-in; NOTE(review): feature_extractor_spec.rb also assigns DocumentMock (one member only) - presumably the same top-level constant, confirm when both specs load together
+  it "should count documents" do
+    feature_selector = FeatureSelector.new
+    feature_selector.docs.should == 0
+    feature_selector.add_document(DocumentMock.new([])) # classification member is left nil here
+    feature_selector.docs.should == 1
+  end
+  it "should return all feature names" do
+    feature_selector = FeatureSelector.new
+    feature_selector.all_feature_names.should == []
+    feature_selector.add_document(DocumentMock.new([Feature.new("a")]))
+    feature_selector.add_document(DocumentMock.new([Feature.new("b")]))
+    feature_selector.all_feature_names.should == %w[a b]
+  end
+  # TODO
+  # it "should return_all_features_as_best
+  #   feature_selector = FeatureSelector.new
+  #   feature_selector.add_document(DocumentMock.new([Feature.new("a")], :test))
+  #   assert_equal %w[a], feature_selector.best_features_for_classification(:test, 10)
+  # end
+  it "should count docs with feature and class" do
+    feature_selector = FeatureSelector.new
+    feature_selector.add_document(doc([Feature.new("viagra", 1)], :spam))
+    feature_selector.add_document(doc([Feature.new("puppy", 1)], :ham))
+    feature_selector.__send__(:docs_with_feature_and_class, "viagra", :spam).should == 1 # reached via __send__; presumably the counter is not public - confirm in feature_selector.rb
+    feature_selector.__send__(:docs_with_feature_and_class, "viagra", :ham).should == 0
+  end
+  it "should count docs with feature and not class" do
+    feature_selector = FeatureSelector.new
+    feature_selector.add_document(doc([Feature.new("viagra", 1)], :spam))
+    feature_selector.add_document(doc([Feature.new("puppy", 1)], :ham))
+    feature_selector.__send__(:docs_with_feature_and_not_class, "puppy", :spam).should == 1 # the one "puppy" doc is :ham, i.e. not :spam
+    feature_selector.__send__(:docs_with_feature_and_not_class,"puppy", :ham).should == 0
+  end
+  it "should count docs with class and not feature" do
+    feature_selector = FeatureSelector.new
+    feature_selector.add_document(doc([Feature.new("viagra", 1)], :spam))
+    feature_selector.add_document(doc([Feature.new("puppy", 1)], :ham))
+    feature_selector.__send__(:docs_with_class_and_not_feature, :spam, "puppy").should == 1 # note the argument order here: class first, then feature name
+    feature_selector.__send__(:docs_with_class_and_not_feature, :spam, "viagra").should == 0
+  end
+  it "should count docs without feature or class" do
+    feature_selector = FeatureSelector.new
+    feature_selector.add_document(doc([Feature.new("viagra", 1)], :spam))
+    feature_selector.add_document(doc([Feature.new("puppy", 1)], :ham))
+    feature_selector.__send__(:docs_without_feature_or_class, "viagra", :spam).should == 1 # the :ham "puppy" doc has neither the feature nor the class
+    feature_selector.__send__(:docs_without_feature_or_class, "viagra", :ham).should == 0
+  end
+  it "should return zero chi if all docs contain feature" do
+    feature_selector = FeatureSelector.new
+    the = Feature.new("the", 1) # same feature object is placed in both documents
+    feature_selector.add_document(doc([the], :spam))
+    feature_selector.add_document(doc([the], :ham))
+    feature_selector.features_with_chi(:spam).should == [Feature.new("the", 0.0)] # result is a Feature whose value holds the score
+  end
+  it "should compute chi squared" do
+    feature_selector = FeatureSelector.new
+    feature_selector.add_document(doc([Feature.new("viagra", 1)], :spam))
+    feature_selector.add_document(doc([Feature.new("puppy", 1)], :ham))
+    feature_selector.features_with_chi(:spam).should == [Feature.new("viagra", 2.0), Feature.new("puppy", 2.0)] # both features are expected to score 2.0 for :spam
+  end
+  it "should not select any features if they are all insignificant" do
+    feature_selector = FeatureSelector.new
+    feature_selector.add_document(doc([Feature.new("viagra", 1)], :spam))
+    feature_selector.add_document(doc([Feature.new("puppy", 1)], :ham))
+    feature_selector.select_features.should == [] # called with no arguments, so whatever defaults select_features has apply
+  end
+  it "should not select features in only one doc" do
+    feature_selector = FeatureSelector.new
+    the = Feature.new("the", 1)
+    feature_selector.add_document(doc([the, Feature.new("viagra", 1)], :spam))
+    feature_selector.add_document(doc([the, Feature.new("puppy", 1)], :ham))
+    feature_selector.select_features.should == [] # "viagra" and "puppy" each occur in a single doc; "the" occurs in both
+  end
+  it "should select significant features occuring in more than one doc" do
+    feature_selector = FeatureSelector.new
+    the = Feature.new("the", 1)
+    feature_selector.add_document(doc([the, Feature.new("viagra", 1)], :spam))
+    feature_selector.add_document(doc([Feature.new("viagra", 1)], :spam))
+    feature_selector.add_document(doc([the, Feature.new("puppy", 1)], :ham))
+    feature_selector.select_features.should == %w[viagra] # result is an array of feature names (strings), not Feature objects
+  end
+  it "should return selected features sorted by chi squared descending" # no block given: placeholder for a spec that is not written yet
+  it "should select based on first feature by default" # no block given: placeholder for a spec that is not written yet
+  it "should select with a chi squared of 1 by default" # no block given: placeholder for a spec that is not written yet
+private # the method below is support code for the examples above, not an example itself
+  def doc(*args) # shorthand that forwards its arguments to DocumentMock.new
+    DocumentMock.new(*args)
+  end
+end

data/spec/unit/feature_spec.rb ADDED Viewed

@@ -0,0 +1,40 @@
+require File.join(File.dirname(__FILE__), "..", "spec_helper")
+describe Feature do # Specs for Feature: name/value storage, default value, equality and sort order.
+  it "should store name" do
+    Feature.new("hello").name.should == "hello"
+  end
+  it "should require name" do
+    lambda { Feature.new }.should raise_error(ArgumentError) # constructing with no arguments at all must raise
+  end
+  it "should store values" do
+    Feature.new("name", 2).value.should ==2 # second constructor argument is the value
+  end
+  it "should default value to zero" do
+    Feature.new("name").value.should == 0
+  end
+  it "should be equal with same name and no value" do
+    Feature.new("hello").should == Feature.new("hello")
+  end
+  it "should be equal with same name and same value" do
+    Feature.new("hello", 1).should == Feature.new("hello", 1)
+  end
+  it "should not be equal with different name" do
+    Feature.new("hello").should_not == Feature.new("test")
+  end
+  it "should not be equal with same name and different value" do
+    Feature.new("hello", 1).should_not == Feature.new("hello", 2) # equality takes the value into account, not just the name
+  end
+  it "should sort by name ascending then value ascending" do
+    [Feature.new("b", 3), Feature.new("a", 2), Feature.new("a", 1)].sort.should ==
+    [Feature.new("a", 1), Feature.new("a", 2), Feature.new("b", 3)] # the two "a" features are ordered by value, then "b" follows
+  end
+end

data/spec/unit/naive_bayes_spec.rb ADDED Viewed

@@ -0,0 +1,119 @@
+require File.dirname(__FILE__) + '/../spec_helper'
+describe NaiveBayes::FeatureCount do # Specs for NaiveBayes::FeatureCount: equality and per-class occurrence totals.
+  it "should be equal to another feature count if the feature name and counts per class are equal" do
+    NaiveBayes::FeatureCount.new("rspec", :sweet, 1).should == NaiveBayes::FeatureCount.new("rspec", :sweet, 1) # constructor arguments as used here: feature name, class, count
+  end
+  it "should give the sum of all occurrences of a feature for a given class" do
+    fc = NaiveBayes::FeatureCount.new("rspec", :sweet, 1)
+    fc.add_count_for_class(2, :sweet) # argument order here is count first, then class
+    fc.add_count_for_class(6, :super_sweet)
+    fc.count_for_class(:sweet).should == 3 # 1 from the constructor plus the 2 added above
+    fc.count_for_class(:super_sweet).should == 6
+  end
+end
+describe NaiveBayes do # Specs for NaiveBayes: training bookkeeping, per-class probabilities and classification.
+  before(:each) do # every example starts with a fresh classifier and the same fixture vectors
+    @nbayes = NaiveBayes.new
+    @doc = Document.new("here are some interesting words", :interesting)
+    @feature_vectors = @doc.feature_vectors
+    @other_vectors = Document.new("these words are interesting", :interesting).feature_vectors
+    @test_vectors = Document.new("this word seems interesting", :interesting).feature_vectors
+  end
+  def feature_counts_for(classification, *feature_count_tuples) # builds a { name => FeatureCount } hash from [name, count] pairs, all under one class
+    feature_counts = {}
+    feature_count_tuples.each do |tuple|
+      feature_counts[tuple.first] = NaiveBayes::FeatureCount.new(tuple.first, classification, tuple.last)
+    end
+    feature_counts
+  end
+  def add_interesting_docs # trains @nbayes with the two :interesting fixture vectors
+    @nbayes.add_document(:interesting, @feature_vectors)
+    @nbayes.add_document(:interesting, @other_vectors)
+  end
+  def add_boring_docs # trains @nbayes with two :boring documents
+    @nbayes.add_document(:boring, Document.new("yawn lets go flame").feature_vectors) # Document is built without a classification argument here
+    @nbayes.add_document(:boring, Document.new("yawn lets flame and troll").feature_vectors)
+  end
+  it "should keep track of the total docs and total docs for class when adding new docs" do
+    @nbayes.add_document(:interesting, @feature_vectors)
+    @nbayes.total_docs.should == 1
+    @nbayes.total_docs_in_class[:interesting].should == 1 # total_docs_in_class is indexed by class
+  end
+  it "should create a feature count for each feature with the # of occurances and class" do
+    @nbayes.add_document(:interesting, @feature_vectors)
+    expected = feature_counts_for(:interesting, ["here", 1], ["ar", 1],["some", 1],["interest", 1],["word", 1]) # names look stemmed ("are" -> "ar"); presumably Document stems words - confirm in document.rb
+    @nbayes.feature_counts.should == expected
+  end
+  it "should sum the number of all occurances of all features for a given class" do
+    @nbayes.add_document(:interesting, @feature_vectors)
+    @nbayes.occurrences_of_all_features_in_class(:interesting).should == 5
+    add_boring_docs
+    @nbayes.occurrences_of_all_features_in_class(:interesting).should == 5 # training :boring docs must not change the :interesting total
+  end
+  it "should give a list of classes it knows about" do
+    @nbayes.add_document(:interesting, @feature_vectors)
+    @nbayes.add_document(:kinda_interesting, @feature_vectors)
+    @nbayes.classes.sort_to_s.should == [:kinda_interesting, :interesting].sort_to_s # sort_to_s is not defined in this file; presumably a project core extension used to compare regardless of order - confirm
+  end
+  it "should not divide by zero or nil when determining the probability of a feature vector for a class" do
+    @nbayes.occurrences_of_all_features_in_class(:interesting).should be_nil # nothing has been trained yet, so the total is expected to be nil
+    probability_computation = lambda {@nbayes.probability_of_vector_for_class(@test_vectors.first, :interesting)}
+    probability_computation.should_not raise_error # the untrained case must still be computable
+  end
+  it "should compute the probability that a given (singular) feature vector belongs to a given class" do
+    @nbayes.add_document(:interesting, @feature_vectors)
+    probablity = @nbayes.probability_of_vector_for_class(@test_vectors.first, :interesting)
+    probablity.should be_a Float
+    probablity.round.should == -1 # negative result; presumably a log-probability - confirm in naive_bayes.rb
+  end
+  it "should compute the probability that a given set of feature vectors belongs to a given class" do
+    add_interesting_docs
+    probability = @nbayes.probability_of_vectors_for_class(@test_vectors, :interesting)
+    probability.should be_a(Float)
+    probability.round.should == -5 # only the rounded value is pinned, not the exact Float
+  end
+  it "should compute a probability of a class for a set of vectors normalized by the number of features" do
+    add_interesting_docs
+    probability = @nbayes.probability_of_vectors_for_class(@test_vectors, :interesting, :normalize => true) # same call as the previous example plus :normalize => true
+    probability.should be_a Float
+    probability.round.should == -1
+  end
+  it "should determine the most likely class of a set feature vectors" do
+    add_interesting_docs
+    add_boring_docs
+    test_vectors = Document.new("some interesting words").feature_vectors
+    classification = @nbayes.classify(test_vectors, :normalize_classes => false) # class normalization is switched off explicitly
+    classification.last.should == :interesting # result is read as a pair: a Float score first, the class last
+    classification.first.should be_a Float
+    classification.first.round.should == -2
+  end
+  it "should account for the relative probabilities of classes by default when classifying" do
+    add_interesting_docs
+    add_boring_docs
+    vectors = Document.new("some interesting words").feature_vectors
+    classification = @nbayes.classify(vectors) # no options passed, so classify's defaults apply
+    classification.last.should == :interesting
+    classification.first.should be_a Float
+    classification.first.round.should == -2
+  end
+end

data/spec/unit/svm_spec.rb ADDED Viewed

@@ -0,0 +1,83 @@
+require File.dirname(__FILE__) + '/../spec_helper'
+describe Svm do # Specs for Svm: class labels, feature dictionary, document vectors, parameters and classification.
+  before(:each) do # every example starts with a fresh Svm plus one :interesting and one :uninteresting fixture document
+    @svm = Svm.new
+    @doc = Document.new("here are some interesting words", :interesting)
+    @feature_vectors = @doc.feature_vectors
+    @negative_doc = Document.new("let's eat some spamiagra", :uninteresting)
+    @negative_fvs = @negative_doc.feature_vectors
+  end
+  def add_simple_docs_to_svm # trains @svm with the two fixture documents, one per class
+    @svm.add_document(:interesting, @feature_vectors)
+    @svm.add_document(:uninteresting, @negative_fvs)
+  end
+  it "should list document classes it knows about" do
+    add_simple_docs_to_svm
+    @svm.classes.sort_to_s.should == [:interesting, :uninteresting].sort_to_s # sort_to_s is not defined in this file; presumably a project core extension used to compare regardless of order - confirm
+  end
+  it "should express classes as SVM friendly unit integer labels"do
+    add_simple_docs_to_svm
+    spam_doc = Document.new("make your junk repulsive to women with free v14gra")
+    @svm.add_document(:uninteresting, spam_doc.feature_vectors) # a second document for an already-known class
+    @svm.class_labels.should == {:interesting => 0, :uninteresting => 1} # still two labels: the extra :uninteresting doc adds no new one
+  end
+  it "should create a feature dictionary based on training docs" do
+    add_simple_docs_to_svm
+    expected = (@feature_vectors + @negative_fvs).map { |fv| fv.name }.uniq # unique feature names, positive document's features first
+    @svm.feature_dictionary.should == expected
+  end
+  it "should express documents as SVM friendly vectors using the binary method" do
+    # for a brief but usable description of binary, frequency, tf-idf, and
+    # Hadamard vector representations of documents, see section 2.1 (2nd page) of:
+    # http://jmlr.csail.mit.edu/papers/volume2/manevitz01a/manevitz01a.pdf
+    add_simple_docs_to_svm
+    @svm.vectorized_docs(:interesting).first.should == [1,1,1,1,1,0,0,0] # eight slots; both expected vectors have a 1 in the fifth slot, presumably the shared word "some" - confirm
+    @svm.vectorized_docs(:uninteresting).first.should == [0,0,0,0,1,1,1,1]
+  end
+  it "should set SVM parameters to reasonable defaults and allow access via a block" do
+    @svm.parameters do |params| # the expectations below only run if parameters yields to this block
+      params.C.should == 100
+      params.svm_type.should == NU_SVC # NU_SVC and RBF are constants not defined in this file
+      params.degree.should == 1
+      params.coef0.should == 0
+      params.eps.should == 0.001
+      params.kernel_type.should == RBF
+    end
+  end
+  it "should construct the list of labels and document feature vectors" do
+    add_simple_docs_to_svm
+    result = @svm.labels_and_document_vectors # read below as a hash with :labels and :features entries
+    result[:labels].sort.should == [0,1]
+    result[:features].sort.should == [[0,0,0,0,1,1,1,1], [1,1,1,1,1,0,0,0]]
+    # can't count on consistent Hash#each ordering, hence this:
+    expected_result_with_hash_ordering_workaround = {1 => [0,0,0,0,1,1,1,1], 0 => [1,1,1,1,1,0,0,0]}
+    stabilized_actual_result = {result[:labels].first => result[:features].first, # pair each label with the vector at the same position
+      result[:labels].last => result[:features].last}
+    stabilized_actual_result.should == expected_result_with_hash_ordering_workaround
+  end
+  it "should classify unlabeled documents" do
+    # examples from http://www.igvita.com/2008/01/07/support-vector-machines-svm-in-ruby/
+    non_spam_texts = ["Peter and Stewie are hilarious", "New episode rocks, Peter and Stewie are hilarious",
+      "Peter is my fav!"]
+    spam_texts = ["FREE NATIONAL TREASURE", "FREE TV for EVERY visitor", "AS SEEN ON NATIONAL TV",
+      "FREE drugs"]
+    non_spam_texts.each { |t| @svm.add_document(:nonspam, Document.new(t, :nonspam).feature_vectors) }
+    spam_texts.each { |t| @svm.add_document(:spam, Document.new(t, :spam).feature_vectors) }
+    test_non_spams = ["Stewie is hilarious", "Poor Peter is hilarious"]
+    test_spam = "FREE lotterry for the NATIONAL TREASURE !!!"
+    @svm.classify(Document.new(test_non_spams.first).feature_vectors).should == :nonspam # test documents are built without a classification argument
+    @svm.classify(Document.new(test_non_spams.last).feature_vectors).should == :nonspam
+    @svm.classify(Document.new(test_spam).feature_vectors).should == :spam # classify is expected to return the class symbol itself
+  end
+end

metadata ADDED Viewed

@@ -0,0 +1,94 @@
+--- !ruby/object:Gem::Specification
+name: danielsdeleo-basset
+version: !ruby/object:Gem::Version
+  version: 1.0.4
+platform: ruby
+authors:
+- Paul Dix
+- Bryan Helmkamp
+- Daniel DeLeo
+autorequire:
+bindir: bin
+cert_chain: []
+date: 2009-05-09 00:00:00 -07:00
+default_executable:
+dependencies:
+- !ruby/object:Gem::Dependency
+  name: stemmer
+  type: :runtime
+  version_requirement:
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: 1.0.1
+    version:
+description: A library for machine learning and classification
+email: ddeleo@basecommander.net
+executables: []
+extensions: []
+extra_rdoc_files:
+- README.rdoc
+files:
+- History.txt
+- License.txt
+- Manifest.txt
+- README.rdoc
+- Rakefile
+- VERSION.yml
+- basset.gemspec
+- examples/example.rb
+- lib/basset.rb
+- lib/basset/classification_evaluator.rb
+- lib/basset/classifier.rb
+- lib/basset/core_extensions.rb
+- lib/basset/document.rb
+- lib/basset/document_override_example.rb
+- lib/basset/feature.rb
+- lib/basset/feature_extractor.rb
+- lib/basset/feature_selector.rb
+- lib/basset/naive_bayes.rb
+- lib/basset/svm.rb
+- lib/basset/yaml_serialization.rb
+- spec/spec.opts
+- spec/spec_helper.rb
+- spec/unit/classifier_spec.rb
+- spec/unit/core_extension_spec.rb
+- spec/unit/document_spec.rb
+- spec/unit/feature_extractor_spec.rb
+- spec/unit/feature_selector_spec.rb
+- spec/unit/feature_spec.rb
+- spec/unit/naive_bayes_spec.rb
+- spec/unit/svm_spec.rb
+has_rdoc: false
+homepage: http://github.com/danielsdeleo/basset
+post_install_message:
+rdoc_options:
+- --inline-source
+- --charset=UTF-8
+require_paths:
+- - lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: "0"
+  version:
+required_rubygems_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: "0"
+  version:
+requirements: []
+rubyforge_project:
+rubygems_version: 1.2.0
+signing_key:
+specification_version: 3
+summary: A library for machine learning and classification
+test_files: []