green_midget 0.0.3 → 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,40 @@
1
+ # A mixin that implements feature checks and allows Base subclasses
2
+ # to define their own features for spam/ham detection.
3
+ #
4
+ # By default texts are checked for presence of external URL or email
5
+ # references. An example of an additional feature would be the presence of particular
6
+ # words or expressions.
7
+ #
8
+ # See the example in `lib/green_midget/extensions/sample.rb`
9
+ #
10
+ module GreenMidget
11
+ module DefaultFeatures
12
+
13
+ private
14
+
15
+ def features
16
+ FEATURES
17
+ end
18
+
19
+ def present_features
20
+ features.select { |feature| feature_present?(feature) }
21
+ end
22
+
23
+ def feature_present?(feature)
24
+ method = :"#{feature}?"
25
+ if respond_to?(method, true)
26
+ send(method)
27
+ else
28
+ raise FeatureMethodNotImplemented.new(feature, method)
29
+ end
30
+ end
31
+
32
+ def url_in_text?
33
+ UrlDetection.new(text).any?
34
+ end
35
+
36
+ def email_in_text?
37
+ text.scan(EMAIL_REGEX).size > 0
38
+ end
39
+ end
40
+ end
@@ -0,0 +1,11 @@
1
+ module GreenMidget
2
+ class FeatureMethodNotImplemented < StandardError
3
+ def initialize(feature, method_name)
4
+ super <<-MSG
5
+ Method #{method_name.inspect} not found. Either implement it or
6
+ delete feature #{feature} from your features list.
7
+ MSG
8
+ end
9
+ end
10
+ end
11
+
@@ -0,0 +1,9 @@
1
+ module GreenMidget
2
+ class NoExamplesGiven < StandardError
3
+ def initialize
4
+ super <<-MSG
5
+ Training examples must be provided for all categories before classification.
6
+ MSG
7
+ end
8
+ end
9
+ end
@@ -0,0 +1,10 @@
1
+ module GreenMidget
2
+ class NoTextFound < StandardError
3
+ def initialize
4
+ super <<-MSG
5
+ You should either implement the text method or provide an instance variable at this point.
6
+ MSG
7
+ end
8
+ end
9
+ end
10
+
@@ -0,0 +1,23 @@
1
+ # A mixin that implements heuristic checks for both categories.
2
+ # If there are conditions under which a spammable object can be
3
+ # classified directly as one of the classification categories,
4
+ # that logic can be implemented using heuristic checks in your subclasses.
5
+ #
6
+ # See the example in `lib/green_midget/extensions/sample.rb`
7
+ #
8
+ module GreenMidget
9
+ module HeuristicChecks
10
+
11
+ private
12
+
13
+ def heuristic_checks
14
+ CATEGORIES.each do |category|
15
+ if respond_to?(:"pass_#{category}_heuristics?") && send(:"pass_#{category}_heuristics?")
16
+ classify_as!(category)
17
+ return RESPONSES[category]
18
+ end
19
+ end
20
+ return false
21
+ end
22
+ end
23
+ end
@@ -1,25 +1,41 @@
1
1
  # Copyright (c) 2011, SoundCloud Ltd., Nikola Chochkov
2
+ #
3
+ # This is an abstraction over Words, Examples and Features. It provides common
4
+ # methods for building the record keys for individual countables in any
5
+ # category.
6
+ #
7
+ # For example the data record key for the word 'legit' in Spam category would
8
+ # be something like "word::legit::spam_count". The record key for a feature
9
+ # 'url_present' in Ham would be something like "feature::url_present::ham_count"
10
+ # The count of all training examples given for category Spam would be
10
+ # "examples_with_feature::any::spam_count"
12
+ #
13
+ # The example counts for individual features are stored as well. For example, for
14
+ # 'url_present' we will have two records: "examples_with_feature::url_present::spam_count" and
15
+ # "examples_with_feature::url_present::ham_count". They store the information about how
16
+ # much training GreenMidget received for this feature in each category.
17
+ #
18
+ # This class is the link between countable and the Records data store adapter
19
+ #
2
20
  module GreenMidget
3
21
  class Countable
4
- include Constants
5
22
  attr_accessor :key
23
+ class_attribute :prefix
6
24
 
7
- class << self; attr_accessor :prefix end
8
-
9
- def self.[](key)
10
- new(key)
25
+ def initialize(key)
26
+ @key = self.class.prefix + key
11
27
  end
12
28
 
13
- def self.objects(keys)
14
- keys.map { |key| new(key) }
15
- end
29
+ class << self
30
+ alias :[] :new
16
31
 
17
- def initialize(key)
18
- @key = self.class.prefix + key
32
+ def objects(keys)
33
+ keys.map { |key| new(key) }
34
+ end
19
35
  end
20
36
 
21
37
  def [](category)
22
- GreenMidgetRecords[record_key(category)].to_f
38
+ Records[record_key(category)].to_f
23
39
  end
24
40
 
25
41
  def log_ratio
@@ -1,25 +1,26 @@
1
1
  # Copyright (c) 2011, SoundCloud Ltd., Nikola Chochkov
2
+ #
3
+ # A model for Examples used in GreenMidget. Examples represent the counts for
4
+ # how much training GreenMidget received in each respective category.
5
+ #
6
+ # Examples['url_present'][:spam]
7
+ # # the number of spam training examples having a URL
8
+ #
9
+ # Examples['any'][:ham]
10
+ # # the number of total Ham training examples
11
+ #
12
+ # See Countable
13
+ #
2
14
  module GreenMidget
3
15
  class Examples < Countable
4
- NO_EXAMPLES_GIVEN_ERROR = 'Training examples must be provided for all categories before classification.'
5
16
  GENERAL_FEATURE_NAME = 'any'
6
17
  self.prefix = 'examples_with_feature::'
7
18
 
8
- class_eval(<<-EVAL, __FILE__, __LINE__ + 1)
9
- def self.#{ ALTERNATIVE } # def self.ham
10
- @@alternative ||= self[GENERAL_FEATURE_NAME][ALTERNATIVE] # @@alternative ||= self[GENERAL_FEATURE_NAME][ALTERNATIVE]
11
- end # end
12
-
13
- def self.#{ NULL } # def self.spam
14
- @@null ||= self[GENERAL_FEATURE_NAME][NULL] # @@null ||= self[GENERAL_FEATURE_NAME][NULL]
15
- end # end
16
- EVAL
17
-
18
19
  def self.[](feature)
19
20
  object = super(feature)
20
21
 
21
22
  if object.no_examples? && (feature == GENERAL_FEATURE_NAME)
22
- raise NO_EXAMPLES_GIVEN_ERROR
23
+ raise NoExamplesGiven
23
24
  elsif object.no_examples?
24
25
  super GENERAL_FEATURE_NAME
25
26
  else
@@ -51,5 +52,17 @@ module GreenMidget
51
52
  def no_examples?
52
53
  CATEGORIES.inject(1) { |memo, category| memo *= self[category] } == 0
53
54
  end
55
+
56
+ # These methods return (and memoize) the total ham and spam example counts
57
+ #
58
+ class_eval(<<-EVAL, __FILE__, __LINE__ + 1)
59
+ def self.#{ ALTERNATIVE } # def self.ham
60
+ @@alternative ||= self[GENERAL_FEATURE_NAME][ALTERNATIVE] # @@alternative ||= self[GENERAL_FEATURE_NAME][ALTERNATIVE]
61
+ end # end
62
+
63
+ def self.#{ NULL } # def self.spam
64
+ @@null ||= self[GENERAL_FEATURE_NAME][NULL] # @@null ||= self[GENERAL_FEATURE_NAME][NULL]
65
+ end # end
66
+ EVAL
54
67
  end
55
68
  end
@@ -1,4 +1,14 @@
1
1
  # Copyright (c) 2011, SoundCloud Ltd., Nikola Chochkov
2
+ #
3
+ # A model for Features used in GreenMidget. Features can be defined by the user.
4
+ # An example would be 'url_in_text', which is true for spammable
5
+ # objects that have a URL in their text and false otherwise.
6
+ #
7
+ # Features['url_in_text'][:spam]
8
+ # # the count of spam messages that have the feature
9
+ #
10
+ # See Countable
11
+ #
2
12
  module GreenMidget
3
13
  class Features < Countable
4
14
  self.prefix = 'feature::'
@@ -8,7 +18,7 @@ module GreenMidget
8
18
  end
9
19
 
10
20
  def feature
11
- key.gsub(/(^#{ self.class.prefix })|(::\w+_count$)/, '')
21
+ key.gsub(/(^#{self.class.prefix})|(::\w+_count$)/, '')
12
22
  end
13
23
  end
14
24
  end
@@ -0,0 +1,61 @@
1
+ # Copyright (c) 2011, SoundCloud Ltd., Nikola Chochkov
2
+ #
3
+ # GreenMidget's simple data store adapter with only three public methods.
4
+ # It is currently ActiveRecord-based, but the plan is to add a
5
+ # Redis-based extension as well.
6
+ #
7
+ module GreenMidget
8
+ class Records < ActiveRecord::Base
9
+ self.table_name = :green_midget_records
10
+
11
+ # Does a multi-get of the necessary count records for the given words.
12
+ # If no words are given, then only Examples and Features counts are taken
13
+ def self.fetch_all(words = [])
14
+ words_keys = Words.record_keys(words)
15
+
16
+ pairs = where(arel_table[:key].in(words_keys).
17
+ or(arel_table[:key].matches("#{Features.prefix}%")).
18
+ or(arel_table[:key].matches("#{Examples.prefix}%"))).
19
+ select(:key).select(:value)
20
+
21
+ @@cache = pairs.inject({}) do |memo, pair|
22
+ memo[pair['key']] = pair['value']
23
+ memo
24
+ end
25
+
26
+ words_keys.inject(@@cache) do |memo, word|
27
+ memo[word] ||= ''
28
+ memo
29
+ end
30
+ end
31
+
32
+ # Reads the value for a given key looking in the cache first and doing a
33
+ # database call if nothing is found.
34
+ def self.[](key)
35
+ key = key.to_s
36
+ @@cache ||= {}
37
+ @@cache[key] ||= where(:key => key).select(:value).map(&:value).first || ''
38
+ end
39
+
40
+ # Increment the values for given keys. The AR implementation increments each
41
+ # record individually, but implementing a multi-set is possible within this
42
+ # method.
43
+ def self.increment(keys)
44
+ keys = Array(keys)
45
+
46
+ @@objects = where(:key => keys).inject({}) do |memo, record|
47
+ memo[record.key] = record
48
+ memo
49
+ end
50
+
51
+ keys.inject(@@objects) do |memo, key|
52
+ memo[key] ||= new(:key => key, :value => 0)
53
+ memo
54
+ end
55
+
56
+ @@objects.each { |key, record| record.increment!(:value) }
57
+ @@objects = {}
58
+ end
59
+ end
60
+ end
61
+
@@ -1,11 +1,14 @@
1
1
  # Copyright (c) 2011, SoundCloud Ltd., Nikola Chochkov
2
+ #
3
+ # A model for Words used in GreenMidget. See Countable
4
+ #
2
5
  module GreenMidget
3
6
  class Words < Countable
4
7
  self.prefix = 'word::'
5
8
 
6
9
  def self.record_keys(words, category = nil)
7
10
  words.map do |word|
8
- Array(category || GreenMidget::CATEGORIES).map{ |category| Words[word].record_key(category) }
11
+ Array(category || CATEGORIES).map{ |category| Words[word].record_key(category) }
9
12
  end.flatten
10
13
  end
11
14
 
@@ -12,12 +12,12 @@ module GreenMidget
12
12
  private
13
13
 
14
14
  def urls
15
- @text.scan(GreenMidget::URL_REGEX).flatten.reject(&:nil?)
15
+ @text.scan(URL_REGEX).flatten.reject(&:nil?)
16
16
  end
17
17
 
18
18
  def non_tolerated_urls
19
19
  urls.reject do |url|
20
- url.to_s.downcase =~ GreenMidget::TOLERATED_URLS
20
+ url.to_s.downcase =~ TOLERATED_URLS
21
21
  end
22
22
  end
23
23
  end
@@ -1,3 +1,3 @@
1
1
  module GreenMidget
2
- VERSION = '0.0.3'
2
+ VERSION = '0.1.0'
3
3
  end
@@ -1,5 +1,4 @@
1
1
  # Copyright (c) 2011, SoundCloud Ltd., Nikola Chochkov
2
- require 'fileutils'
3
2
  require 'rake'
4
3
  require 'green_midget/db/migrate/create_green_midget_records'
5
4
 
@@ -9,24 +8,24 @@ namespace :green_midget do
9
8
  task :active_record => :environment do
10
9
  include GreenMidget
11
10
 
12
- unless GreenMidgetRecords.table_exists?
11
+ unless Records.table_exists?
13
12
  CreateGreenMidgetRecords.up
14
13
  end
15
14
 
16
15
  keys = [ ALTERNATIVE, NULL ].map do |hypothesis|
17
- [
18
- "feature::url_in_text::#{hypothesis}_count",
19
- "feature::email_in_text::#{hypothesis}_count",
20
- "examples::any::#{hypothesis}_count",
21
- "examples::url_in_text::#{hypothesis}_count",
22
- "examples::email_in_text::#{hypothesis}_count",
23
- ]
16
+ FEATURES.map do |feature|
17
+ [
18
+ "#{Features.prefix}#{feature}::#{hypothesis}_count",
19
+ "#{Examples.prefix}#{feature}::#{hypothesis}_count",
20
+ "#{Examples.prefix}any::#{hypothesis}_count",
21
+ ]
22
+ end
24
23
  end.flatten
25
24
 
26
25
  puts '== Creating records ==='
27
26
  keys.each { |key|
28
- unless GreenMidgetRecords.find_by_key(key)
29
- GreenMidgetRecords.create(key)
27
+ unless Records.find_by_key(key)
28
+ Records.create(:key => key, :value => 0)
30
29
  puts "-- Created #{key}"
31
30
  end
32
31
  }
data/spec/base_spec.rb CHANGED
@@ -1,41 +1,41 @@
1
1
  # Copyright (c) 2011, SoundCloud Ltd., Nikola Chochkov
2
2
  require 'spec_helper'
3
- require File.join(File.dirname(__FILE__), 'tester')
3
+ require 'tester'
4
4
 
5
5
  describe GreenMidget::Base do
6
6
  include GreenMidget
7
7
 
8
8
  before(:each) do
9
- GreenMidgetRecords.delete_all
9
+ Records.delete_all
10
10
  [
11
- {:key => "#{ Words.prefix }this::#{ ALTERNATIVE }_count", :value => 701.0 },
12
- {:key => "#{ Words.prefix }this::#{ NULL }_count", :value => 11.0 },
13
- {:key => "#{ Words.prefix }test::#{ ALTERNATIVE }_count", :value => 9.0 },
14
- {:key => "#{ Words.prefix }test::#{ NULL }_count", :value => 71.0 },
15
- {:key => "#{ Words.prefix }goes::#{ ALTERNATIVE }_count", :value => 90.0 },
16
- {:key => "#{ Words.prefix }goes::#{ NULL }_count", :value => 90.0 },
17
- {:key => "#{ Words.prefix }rid::#{ ALTERNATIVE }_count", :value => 311.0 },
18
- {:key => "#{ Words.prefix }rid::#{ NULL }_count", :value => 290.0 },
19
- {:key => "#{ Words.prefix }dirty::#{ ALTERNATIVE }_count", :value => 222.0 },
20
- {:key => "#{ Words.prefix }dirty::#{ NULL }_count", :value => 45.0 },
21
- {:key => "#{ Words.prefix }spam::#{ ALTERNATIVE }_count", :value => 11.0 },
22
- {:key => "#{ Words.prefix }spam::#{ NULL }_count", :value => 133.0 },
23
- {:key => "#{ Words.prefix }words::#{ ALTERNATIVE }_count", :value => 6.0 },
24
- {:key => "#{ Words.prefix }words::#{ NULL }_count", :value => 811.0 },
25
- {:key => "#{ Words.prefix }zero::#{ ALTERNATIVE }_count", :value => 0.0 },
26
- {:key => "#{ Words.prefix }zero::#{ NULL }_count", :value => 0.0 },
27
- {:key => "#{ Features.prefix }url_in_text::#{ ALTERNATIVE }_count", :value => 440.0 },
28
- {:key => "#{ Features.prefix }url_in_text::#{ NULL }_count", :value => 40.0 },
29
- {:key => "#{ Features.prefix }email_in_text::#{ ALTERNATIVE }_count", :value => 112.0 },
30
- {:key => "#{ Features.prefix }email_in_text::#{ NULL }_count", :value => 9.0 },
31
- {:key => "#{ Examples.prefix }any::#{ ALTERNATIVE }_count", :value => 1000.0 },
32
- {:key => "#{ Examples.prefix }any::#{ NULL }_count", :value => 1000.0 },
33
- {:key => "#{ Examples.prefix }url_in_text::#{ ALTERNATIVE }_count", :value => 1000.0 },
34
- {:key => "#{ Examples.prefix }url_in_text::#{ NULL }_count", :value => 1000.0 },
35
- {:key => "#{ Examples.prefix }email_in_text::#{ ALTERNATIVE }_count", :value => 1000.0 },
36
- {:key => "#{ Examples.prefix }email_in_text::#{ NULL }_count", :value => 1000.0 },
11
+ {:key => "#{ Words.prefix }this::#{ ALTERNATIVE }_count", :value => 701 },
12
+ {:key => "#{ Words.prefix }this::#{ NULL }_count", :value => 11 },
13
+ {:key => "#{ Words.prefix }test::#{ ALTERNATIVE }_count", :value => 9 },
14
+ {:key => "#{ Words.prefix }test::#{ NULL }_count", :value => 71 },
15
+ {:key => "#{ Words.prefix }goes::#{ ALTERNATIVE }_count", :value => 90 },
16
+ {:key => "#{ Words.prefix }goes::#{ NULL }_count", :value => 90 },
17
+ {:key => "#{ Words.prefix }rid::#{ ALTERNATIVE }_count", :value => 311 },
18
+ {:key => "#{ Words.prefix }rid::#{ NULL }_count", :value => 290 },
19
+ {:key => "#{ Words.prefix }dirty::#{ ALTERNATIVE }_count", :value => 222 },
20
+ {:key => "#{ Words.prefix }dirty::#{ NULL }_count", :value => 45 },
21
+ {:key => "#{ Words.prefix }spam::#{ ALTERNATIVE }_count", :value => 11 },
22
+ {:key => "#{ Words.prefix }spam::#{ NULL }_count", :value => 133 },
23
+ {:key => "#{ Words.prefix }words::#{ ALTERNATIVE }_count", :value => 6 },
24
+ {:key => "#{ Words.prefix }words::#{ NULL }_count", :value => 811 },
25
+ {:key => "#{ Words.prefix }zero::#{ ALTERNATIVE }_count", :value => 0 },
26
+ {:key => "#{ Words.prefix }zero::#{ NULL }_count", :value => 0 },
27
+ {:key => "#{ Features.prefix }url_in_text::#{ ALTERNATIVE }_count", :value => 440 },
28
+ {:key => "#{ Features.prefix }url_in_text::#{ NULL }_count", :value => 40 },
29
+ {:key => "#{ Features.prefix }email_in_text::#{ ALTERNATIVE }_count", :value => 112 },
30
+ {:key => "#{ Features.prefix }email_in_text::#{ NULL }_count", :value => 9 },
31
+ {:key => "#{ Examples.prefix }any::#{ ALTERNATIVE }_count", :value => 1000 },
32
+ {:key => "#{ Examples.prefix }any::#{ NULL }_count", :value => 1000 },
33
+ {:key => "#{ Examples.prefix }url_in_text::#{ ALTERNATIVE }_count", :value => 1000 },
34
+ {:key => "#{ Examples.prefix }url_in_text::#{ NULL }_count", :value => 1000 },
35
+ {:key => "#{ Examples.prefix }email_in_text::#{ ALTERNATIVE }_count", :value => 1000 },
36
+ {:key => "#{ Examples.prefix }email_in_text::#{ NULL }_count", :value => 1000 },
37
37
  ].each do |entry|
38
- GreenMidgetRecords.create(entry)
38
+ Records.create(entry)
39
39
  end
40
40
  end
41
41
 
@@ -106,23 +106,27 @@ describe GreenMidget::Base do
106
106
  it "should increase the index counts of the classified words" do
107
107
  lambda {
108
108
  Tester.new('zero').classify_as!(NULL)
109
- }.should change { GreenMidgetRecords.find_by_key(Words['zero'].record_key(NULL)).value.to_f }.by(1)
109
+ }.should change { Records.find_by_key(Words['zero'].record_key(NULL)).value.to_f }.by(1)
110
110
  end
111
+
111
112
  it "should increment the learning examples count for all features" do
112
113
  FEATURES.each do |feature|
113
114
  lambda {
114
115
  Tester.new('zero').classify_as!(NULL)
115
- }.should change { GreenMidgetRecords.find_by_key(Examples[feature].record_key(NULL)).value.to_f }.by(1)
116
+ }.should change { Records.find_by_key(Examples[feature].record_key(NULL)).value.to_f }.by(1)
116
117
  end
117
118
  end
119
+
118
120
  it "should not add new records for known keys" do
119
121
  a = Tester.new 'stuff unknown sofar'
122
+
120
123
  lambda {
121
124
  a.classify_as! ALTERNATIVE
122
- }.should change { GreenMidgetRecords.count }.by(3)
125
+ }.should change { Records.count }.by(3)
126
+
123
127
  lambda {
124
128
  a.classify_as! ALTERNATIVE
125
- }.should_not change { GreenMidgetRecords.count }
129
+ }.should_not change { Records.count }
126
130
  end
127
131
  end
128
132
 
@@ -130,15 +134,19 @@ describe GreenMidget::Base do
130
134
  it "should ignore words less than 3 characters" do
131
135
  Tester.new('is 2 ch').words.should == []
132
136
  end
137
+
133
138
  it "should break large character strings into chunks of 20 bytes" do
134
139
  Tester.new('s'*20 + '111').words.should == ['s'*20, '111']
135
140
  end
141
+
136
142
  it "should bring uppercase to lowcase" do
137
143
  Tester.new('HOWBIG').words.should == ['howbig']
138
144
  end
145
+
139
146
  it "should not consider parts of email address as individual words" do
140
147
  Tester.new('friend@soundcloud.com').words.should == []
141
148
  end
149
+
142
150
  it "should not consider parts of website url as individual words" do
143
151
  Tester.new('www.myguy.com http://weargeil.org').words.should == []
144
152
  end
@@ -150,7 +158,7 @@ describe GreenMidget::Base do
150
158
  # pending('todo')
151
159
  # end
152
160
  # it "throw an exception if no training examples were given, but it's asked for classification" do
153
- # # if GreenMidgetRecords.count(ALTERNATIVE) or GreenMidgetRecords.count(NULL) is 0.0 => throw an exception
161
+ # # if Records.count(ALTERNATIVE) or Records.count(NULL) is 0.0 => throw an exception
154
162
  # pending('todo')
155
163
  # end
156
164
  # end