RubyGems - green_midget - Versions diffs - 0.0.3 → 0.1.0 - Mend

green_midget 0.0.3 → 0.1.0

Files changed (35) hide show

data/.travis.yml +6 -0
data/Gemfile +13 -1
data/README.md +26 -10
data/Rakefile +6 -1
data/benchmark/benchmark.rb +44 -0
data/benchmark/test.rb +33 -0
data/green_midget.gemspec +0 -8
data/lib/green_midget.rb +14 -3
data/lib/green_midget/base.rb +52 -53
data/lib/green_midget/constants.rb +23 -21
data/lib/green_midget/db/migrate/create_green_midget_records.rb +13 -11
data/lib/green_midget/default_features.rb +40 -0
data/lib/green_midget/errors/feature_method_not_implemented.rb +11 -0
data/lib/green_midget/errors/no_examples_given.rb +9 -0
data/lib/green_midget/errors/no_text_found.rb +10 -0
data/lib/green_midget/heuristic_checks.rb +23 -0
data/lib/green_midget/models/countable.rb +27 -11
data/lib/green_midget/models/examples.rb +25 -12
data/lib/green_midget/models/features.rb +11 -1
data/lib/green_midget/models/records.rb +61 -0
data/lib/green_midget/models/words.rb +4 -1
data/lib/green_midget/url_detection.rb +2 -2
data/lib/green_midget/version.rb +1 -1
data/lib/tasks/green_midget.rake +10 -11
data/spec/base_spec.rb +42 -34
data/spec/examples_spec.rb +19 -19
data/spec/features_spec.rb +6 -6
data/spec/green_midget_records_spec.rb +38 -33
data/spec/spec_helper.rb +10 -7
data/spec/tester.rb +1 -1
data/spec/words_spec.rb +2 -2
metadata +14 -30
data/.document +0 -5
data/lib/green_midget/green_midget.rb +0 -6
data/lib/green_midget/models/green_midget_records.rb +0 -49

data/.travis.yml ADDED Viewed

@@ -0,0 +1,6 @@
+bundler_args: --without development
+language: ruby
+rvm:
+  - 1.8.7
+  - 1.9.2
+  - 1.9.3

data/Gemfile CHANGED Viewed

@@ -1,4 +1,16 @@
 # Copyright (c) 2011, SoundCloud Ltd., Nikola Chochkov
 source "http://rubygems.org"
-gemspec
+gem 'activerecord'
+group :development do
+  gem 'pry'
+  gem 'pry-doc'
+end
+group :test do
+  gem 'rspec', '>=2.4.0'
+  gem 'sqlite3'
+  gem 'rake'
+end

data/README.md CHANGED Viewed

@@ -1,3 +1,6 @@
+[![Build
+Status](https://secure.travis-ci.org/chochkov/GreenMidget.png)](http://travis-ci.org/chochkov/GreenMidget)
 On Bayesian Classification
 ----------
@@ -62,9 +65,9 @@ If the above functionality is not enough for you and you want to add custom logi
 * Implement heuristics logic, which will directly classify incoming object as a given category. Example:
-        def pass_ham_heuristics?
-          words.count > 5 || url_in_text?
-        end
+    def pass_ham_heuristics?
+      words.count > 5 || url_in_text?
+    end
   This method will be `true` for longer text or for text that contains an external URL. In this case the classifier would go on to the actual testing procedure. If `false`, however, the procedure will not be done and the classifier will return the ham category as a result. Note the native `GreenMidget::Base#words` and `GreenMidget::Base#url_in_text?`
@@ -74,21 +77,21 @@ If the above functionality is not enough for you and you want to add custom logi
   By default GreenMidget comes with two feature definitions `url_in_text` and `email_in_text`, but you can implement as many more as you want by writing a boolean method that checks for the feature:
-         def regular_user?
-           @user.sign_up_count > 10
-         end
+    def regular_user?
+      @user.sign_up_count > 10
+    end
   and then implement a `features` method that returns an array with your custom feature names:
-         def features
-           ['regular_user', .... ]
-         end
+    def features
+      ['regular_user', .... ]
+    end
   (do make sure that the array entry is the same as the name of the method that would be checking for this feature)
   The GreenMidget feature definitions carry more weight on shorter texts and less weight on longer ones; thus they provide a ground source of evidence for GreenMidget's classification.
-If that's not enough too, you're welcome to [browse the code][green_midget_github] and either extend more parts of it or simply make your own fork of the project.
+If that's still not enough, see the Contribute section below.
 Benchmarking
 ----------
@@ -114,5 +117,18 @@ Classification Efficiency
 TODO: give test results; provide a web interface to a trained classifier using some of SoundCloud's spam and legit data; give production experience from DigitaleSeiten.
+Contribute
+----------
+Let me know about any feedback or feature requests. If you want to hack on the
+code, just do that!
+  * Make a fork
+    * `git clone git@github.com:chochkov/GreenMidget.git`
+    * `bundle`
+    * `bundle exec rake` to run the specs
+  * Make a patch
+  * Send a Pull Request
 [green_midget_github]: http://github.com/chochkov/GreenMidget "Github repository"
 [guidelines]: http://soundcloud.com/community-guidelines "Community guidelines"

data/Rakefile CHANGED Viewed

@@ -1 +1,6 @@
-require "bundler/gem_tasks"
+require 'rspec/core/rake_task'
+RSpec::Core::RakeTask.new do |t|
+end
+task :default => :spec

data/benchmark/benchmark.rb ADDED Viewed

@@ -0,0 +1,44 @@
+# Measures training times and classification times over arbitrary message lengths
+# Don't run this on a database that already has training data - this script will pollute it.
+# TODO: move this to a rake task
+include GreenMidget
+TRAININGS               = 90
+CLASSIFICATIONS         = 1
+MESSAGE_LENGTH          = 1000
+@training_times         = []
+@classification_times   = []
+records_count_at_start  = Records.count
+def generate_text(message_length = 1)
+  message ||= []
+  while message.count < message_length do
+    word = ''
+    (rand(7) + 3).times { word += ('a'..'z').to_a[rand(26)] }
+    message << word unless message.include?(word)
+  end
+  text = message.join(' ')
+end
+TRAININGS.times do
+  a = GreenMidget::Classifier.new generate_text(MESSAGE_LENGTH)
+  @training_times << Benchmark.measure { a.classify_as! [ ALTERNATIVE, NULL ][rand(2)] }.real
+end
+CLASSIFICATIONS.times do
+  a = GreenMidget::Classifier.new generate_text(MESSAGE_LENGTH)
+  @classification_times << Benchmark.measure { a.classify }.real
+end
+puts " ------------------------------- "
+puts " Average seconds from #{TRAININGS} trainings and #{CLASSIFICATIONS} classifications. #{MESSAGE_LENGTH} words per message:"
+puts " Number of records at start: #{records_count_at_start} and at the end: #{Records.count}"
+puts " ------------------------------- "
+puts " Training times:                 #{(@training_times.sum.to_f/TRAININGS).round(4)}"
+puts " ------------------------------- "
+puts " Classification times:           #{(@classification_times.sum.to_f/CLASSIFICATIONS).round(4)}"
+puts " ------------------------------- "

data/benchmark/test.rb ADDED Viewed

@@ -0,0 +1,33 @@
+# This is a cross validation script
+# TODO: move it to a rake task
+require 'sqlite3'
+require File.join(File.dirname(__FILE__), '..', 'spec', 'tester')
+include GreenMidget
+ActiveRecord::Base.establish_connection(:adapter => 'sqlite3', :database => '~/sc/user_backup/data.db')
+@spam = [ 'messages', 'comments', 'posts' ].map { |table| ActiveRecord::Base.connection.execute("select body from #{table} limit 1500").inject([]) { |memo, hash| memo << hash["body"] } }
+ActiveRecord::Base.establish_connection(:adapter => 'mysql', :username => 'root', :password => 'root', :database => 'soundcloud_development_temp')
+@ham  = [ 'messages', 'comments', 'posts' ].map { |table| Records.find_by_sql("select body from #{table} limit 1500").to_a.inject([]) { |memo, hash| memo << hash["body"] } }
+ActiveRecord::Base.establish_connection(:adapter => 'mysql', :username => 'root', :password => 'root', :database => 'classifier_development_weird')
+#
+# # ------ I. PERFORM TRAINING
+# puts Benchmark.measure {
+#   @spam.each { |src|
+#     src.each {|body|
+#       klass = Tester.new(body);klass.classify_as! :spam
+#     }
+#   };true
+# }
+#
+# puts Benchmark.measure {
+#   @ham.each { |src|
+#     src.each {|body|
+#       klass = Tester.new(body);klass.classify_as! :ham
+#     }
+#   };true
+# }

data/green_midget.gemspec CHANGED Viewed

@@ -14,18 +14,10 @@ Gem::Specification.new do |s|
   s.summary     = %q{Bayesian Text Classifier}
   s.description = %q{Naive Bayesian Classifier with customizable features}
-  s.rubyforge_project = "green_midget"
   s.files         = `git ls-files`.split("\n")
   s.test_files    = `git ls-files -- {test,spec,features}/*`.split("\n")
   s.executables   = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
   s.require_paths = ["lib"]
   s.add_runtime_dependency "activerecord"
-  s.add_development_dependency "rspec"
-  s.add_development_dependency "bundler"
-  # specify any dependencies here; for example:
-  # s.add_development_dependency "rspec"
-  # s.add_runtime_dependency "rest-client"
 end

data/lib/green_midget.rb CHANGED Viewed

@@ -1,15 +1,26 @@
 # Copyright (c) 2011, SoundCloud Ltd., Nikola Chochkov
 require 'active_record'
-require 'green_midget/green_midget'
+require 'green_midget/constants'
+require 'green_midget/url_detection'
+require 'green_midget/logger'
+require 'green_midget/heuristic_checks'
+require 'green_midget/default_features'
 require 'green_midget/base'
 require 'green_midget/models/countable'
 require 'green_midget/models/examples'
 require 'green_midget/models/features'
-require 'green_midget/models/green_midget_records'
+require 'green_midget/models/records'
 require 'green_midget/models/words'
+require 'green_midget/errors/no_text_found'
+require 'green_midget/errors/feature_method_not_implemented'
+require 'green_midget/errors/no_examples_given'
 require 'green_midget/extensions/classifier'
-if (classifier = Gem.searcher.find('green_midget'))
+if classifier = Gem.searcher.find('green_midget')
   path = classifier.full_gem_path
   Dir["#{path}/lib/tasks/*.rake"].each { |ext| load ext }
 end

data/lib/green_midget/base.rb CHANGED Viewed

@@ -1,75 +1,61 @@
 # Copyright (c) 2011, SoundCloud Ltd., Nikola Chochkov
-%w(logger constants url_detection).each do |file|
-  require File.join(File.dirname(__FILE__), file)
-end
 module GreenMidget
   class Base
-    include Logger
-    include Constants
+    include DefaultFeatures
+    include HeuristicChecks
+    # Get classification for unknown messages based on history
+    #
+    #   Examples:
+    #
+    #   result = GreenMidget::Classifier.new(unknown_text).classify
+    #   # result is now one of -1, 0, 1, meaning respectively
+    #   # ham (not spam), no answer, spam
+    #
     def classify
-      CATEGORIES.each do |category|
-        if respond_to?(:"pass_#{category}_heuristics?") && send(:"pass_#{category}_heuristics?")
-          classify_as!(category)
-          return HYPOTHESES[category]
-        end
+      if respond_to?(:heuristic_checks, true) && response = heuristic_checks
+        return response
       end
-      GreenMidgetRecords.fetch_all(words)
-      register_classification
+      # load all relevant records in one go
+      Records.fetch_all(words)
       factor = log_ratio
       case
       when factor >= ACCEPT_ALTERNATIVE_MIN
-        ALTERNATIVE_RESPONSE
+        RESPONSES[ALTERNATIVE]
       when factor >= REJECT_ALTERNATIVE_MAX
-        DUNNO
+        RESPONSES[:dunno]
       else
-        NULL_RESPONSE
+        RESPONSES[NULL]
       end
     end
+    # Public method used to train the classifier with examples
+    # belonging to a known `category`.
+    #
+    #   Examples:
+    #
+    #   classifier = GreenMidget::Classifier.new(known_good_text)
+    #   classifier.classify_as!(:ham)
+    #   # increases the chances for similar text to pass the check next time
+    #
+    #   classifier = GreenMidget::Classifier.new(known_spam_text)
+    #   classifier.classify_as!(:spam)
+    #   # increases the chances for similar text to fail the check next time
+    #
     def classify_as!(category)
-      keys = [ Words.objects(words), Features.objects(present_features), Examples.objects(features, true) ].flatten.map do |object|
-        object.record_key(category)
-      end
+      keys = [
+        Words.objects(words),
+        Features.objects(present_features),
+        Examples.objects(features, true)
+      ].flatten.map { |object| object.record_key(category) }
-      GreenMidgetRecords.increment(keys)
-      register_training
+      !! Records.increment(keys)
     end
     private
-    # ------ Features --------
-    def features
-      FEATURES
-    end
-    def present_features
-      features.select { |feature| feature_present?(feature) }
-    end
-    def feature_present?(feature)
-      method = :"#{ feature }?"
-      if respond_to?(method, true)
-        send(method)
-      else
-        raise("You must implement method #{ method } or remove feature #{ feature }.")
-      end
-    end
-    def url_in_text?
-      UrlDetection.new(text).any?
-    end
-    def email_in_text?
-      text.scan(EMAIL_REGEX).size > 0
-    end
-    # ------ Words --------
     def words
       strip_external_links.scan(WORDS_SPLIT_REGEX).uniq.
         map(&:downcase).
@@ -81,11 +67,24 @@ module GreenMidget
     end
     def text
-      @text || raise('You should either implement the text method or provide an instance variable at this point.')
+      @text || raise(NoTextFound)
     end
+    # Calculate the log ratio between the scores for both categories.
+    # It takes into account the Examples counts (i.e. how much history
+    # there is for each category), the Words counts (i.e. how much history for
+    # each word in each category) and, if any other Features are present,
+    # accounts for them as well.
     def log_ratio
-      Examples.log_ratio + words.map{ |word| Words[word].log_ratio }.sum + present_features.map{ |feature| Features[feature].log_ratio }.sum
+      result = Examples.log_ratio
+      result += words.map{ |word| Words[word].log_ratio }.sum
+      if respond_to?(:features, true)
+        result += present_features.map{ |feature| Features[feature].log_ratio }.sum
+      end
+      result
     end
   end
 end

data/lib/green_midget/constants.rb CHANGED Viewed

@@ -1,31 +1,33 @@
 # Copyright (c) 2011, SoundCloud Ltd., Nikola Chochkov
 module GreenMidget
-  module Constants
-    TOLERATED_URLS  = /(soundcloud.com)|(facebook.com)|(myspace.com)|(twitter.com)/
+  TOLERATED_URLS  = /(soundcloud.com)|(facebook.com)|(myspace.com)|(twitter.com)/
-    EMAIL_REGEX     = /[a-zA-Z][\w\.-]*[a-zA-Z0-9]@[a-zA-Z0-9][\w\.-]*[a-zA-Z0-9]\.[a-zA-Z][a-zA-Z\.]*[a-zA-Z]/
-    URL_REGEX       = /(?i)\b((?:[a-z][\w-]+:(?:\/{1,3}|[a-z0-9%])|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}\/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'".,<>?]))/
+  EMAIL_REGEX     = /[a-zA-Z][\w\.-]*[a-zA-Z0-9]@[a-zA-Z0-9][\w\.-]*[a-zA-Z0-9]\.[a-zA-Z][a-zA-Z\.]*[a-zA-Z]/
+  URL_REGEX       = /(?i)\b((?:[a-z][\w-]+:(?:\/{1,3}|[a-z0-9%])|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}\/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'".,<>?]))/
-    EXTERNAL_LINK_REGEX     = Regexp.new(/(#{ EMAIL_REGEX })|(#{ URL_REGEX })/)
+  EXTERNAL_LINK_REGEX     = Regexp.new(/(#{EMAIL_REGEX})|(#{URL_REGEX})/)
-    STOP_WORDS              = %w()
+  STOP_WORDS              = %w()
-    MIN_CHARACTERS_IN_WORD  = 3
-    MAX_CHARACTERS_IN_WORD  = 20
+  MIN_CHARACTERS_IN_WORD  = 3
+  MAX_CHARACTERS_IN_WORD  = 20
-    WORDS_SPLIT_REGEX       = Regexp.new(/\w{#{ MIN_CHARACTERS_IN_WORD },#{ MAX_CHARACTERS_IN_WORD }}/)
-    FEATURES                = %w(url_in_text email_in_text)
+  WORDS_SPLIT_REGEX       = Regexp.new(/\w{#{ MIN_CHARACTERS_IN_WORD },#{ MAX_CHARACTERS_IN_WORD }}/)
+  FEATURES                = %w(url_in_text email_in_text)
-    # Decision making: Log(Pr(alternative | text)) - Log(Pr(null | text)) <=> [ REJECT_ALTERNATIVE_MAX, ACCEPT_ALTERNATIVE_MIN ]
-    ACCEPT_ALTERNATIVE_MIN  = Math::log(3.0)
-    REJECT_ALTERNATIVE_MAX  = 0.0
+  # Decision making:
+  # Log(Pr(alternative | text)) - Log(Pr(null | text)) <=>
+  # ( REJECT_ALTERNATIVE_MAX..ACCEPT_ALTERNATIVE_MIN )
+  #
+  ACCEPT_ALTERNATIVE_MIN  = Math::log(3.0)
+  REJECT_ALTERNATIVE_MAX  = 0.0
-    ALTERNATIVE_RESPONSE    = 1
-    DUNNO                   = 0
-    NULL_RESPONSE           = -1
-    NULL                    = :ham
-    ALTERNATIVE             = :spam
-    CATEGORIES              = [ NULL, ALTERNATIVE ]
-  end
+  NULL                    = :ham
+  ALTERNATIVE             = :spam
+  CATEGORIES              = [ NULL, ALTERNATIVE ]
+  RESPONSES               = {
+    NULL        => -1,
+    :dunno      => 0,
+    ALTERNATIVE => 1,
+  }
 end

data/lib/green_midget/db/migrate/create_green_midget_records.rb CHANGED Viewed

@@ -1,16 +1,18 @@
 # Copyright (c) 2011, SoundCloud Ltd., Nikola Chochkov
-class CreateGreenMidgetRecords < ActiveRecord::Migration
-  def self.up
-    create_table :green_midget_records do |t|
-      t.string   :key
-      t.string   :value
-      t.datetime :updated_at
+module GreenMidget
+  class CreateGreenMidgetRecords < ActiveRecord::Migration
+    def self.up
+      create_table :green_midget_records do |t|
+        t.string   :key
+        t.integer  :value
+        t.datetime :updated_at
+      end
+      add_index :green_midget_records, :key
+      add_index :green_midget_records, :updated_at
     end
-    add_index :green_midget_records, :key
-    add_index :green_midget_records, :updated_at
-  end
-  def self.down
-    drop_table :green_midget_records
+    def self.down
+      drop_table :green_midget_records
+    end
   end
 end