green_midget 0.0.3 → 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/.travis.yml +6 -0
- data/Gemfile +13 -1
- data/README.md +26 -10
- data/Rakefile +6 -1
- data/benchmark/benchmark.rb +44 -0
- data/benchmark/test.rb +33 -0
- data/green_midget.gemspec +0 -8
- data/lib/green_midget.rb +14 -3
- data/lib/green_midget/base.rb +52 -53
- data/lib/green_midget/constants.rb +23 -21
- data/lib/green_midget/db/migrate/create_green_midget_records.rb +13 -11
- data/lib/green_midget/default_features.rb +40 -0
- data/lib/green_midget/errors/feature_method_not_implemented.rb +11 -0
- data/lib/green_midget/errors/no_examples_given.rb +9 -0
- data/lib/green_midget/errors/no_text_found.rb +10 -0
- data/lib/green_midget/heuristic_checks.rb +23 -0
- data/lib/green_midget/models/countable.rb +27 -11
- data/lib/green_midget/models/examples.rb +25 -12
- data/lib/green_midget/models/features.rb +11 -1
- data/lib/green_midget/models/records.rb +61 -0
- data/lib/green_midget/models/words.rb +4 -1
- data/lib/green_midget/url_detection.rb +2 -2
- data/lib/green_midget/version.rb +1 -1
- data/lib/tasks/green_midget.rake +10 -11
- data/spec/base_spec.rb +42 -34
- data/spec/examples_spec.rb +19 -19
- data/spec/features_spec.rb +6 -6
- data/spec/green_midget_records_spec.rb +38 -33
- data/spec/spec_helper.rb +10 -7
- data/spec/tester.rb +1 -1
- data/spec/words_spec.rb +2 -2
- metadata +14 -30
- data/.document +0 -5
- data/lib/green_midget/green_midget.rb +0 -6
- data/lib/green_midget/models/green_midget_records.rb +0 -49
data/.travis.yml
ADDED
data/Gemfile
CHANGED
@@ -1,4 +1,16 @@
|
|
1
1
|
# Copyright (c) 2011, SoundCloud Ltd., Nikola Chochkov
|
2
2
|
source "http://rubygems.org"
|
3
3
|
|
4
|
-
|
4
|
+
gem 'activerecord'
|
5
|
+
|
6
|
+
group :development do
|
7
|
+
gem 'pry'
|
8
|
+
gem 'pry-doc'
|
9
|
+
end
|
10
|
+
|
11
|
+
group :test do
|
12
|
+
gem 'rspec', '>=2.4.0'
|
13
|
+
gem 'sqlite3'
|
14
|
+
gem 'rake'
|
15
|
+
end
|
16
|
+
|
data/README.md
CHANGED
@@ -1,3 +1,6 @@
|
|
1
|
+
[![Build
|
2
|
+
Status](https://secure.travis-ci.org/chochkov/GreenMidget.png)](http://travis-ci.org/chochkov/GreenMidget)
|
3
|
+
|
1
4
|
On Bayesian Classification
|
2
5
|
----------
|
3
6
|
|
@@ -62,9 +65,9 @@ If the above functionality is not enough for you and you want to add custom logi
|
|
62
65
|
|
63
66
|
* Implement heuristics logic, which will directly classify an incoming object as a given category. Example:
|
64
67
|
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
+
def pass_ham_heuristics?
|
69
|
+
words.count > 5 || url_in_text?
|
70
|
+
end
|
68
71
|
|
69
72
|
This method returns `true` for longer texts or for texts that contain an external URL. In that case the classifier goes on to the actual testing procedure. If it returns `false`, however, the procedure is skipped and the classifier returns the ham category as the result. Note the use of the native `GreenMidget::Base#words` and `GreenMidget::Base#url_in_text?` methods.
|
70
73
|
|
@@ -74,21 +77,21 @@ If the above functionality is not enough for you and you want to add custom logi
|
|
74
77
|
|
75
78
|
By default GreenMidget comes with two feature definitions `url_in_text` and `email_in_text`, but you can implement as many more as you want by writing a boolean method that checks for the feature:
|
76
79
|
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
+
def regular_user?
|
81
|
+
@user.sign_up_count > 10
|
82
|
+
end
|
80
83
|
|
81
84
|
and then implement a `features` method that returns an array with your custom feature names:
|
82
85
|
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
+
def features
|
87
|
+
['regular_user', .... ]
|
88
|
+
end
|
86
89
|
|
87
90
|
(do make sure that the array entry is the same as the name of the method that would be checking for this feature)
|
88
91
|
|
89
92
|
GreenMidget's feature definitions carry more weight on shorter texts and less weight on longer ones, so they provide a ground source of evidence for GreenMidget's classification.
|
90
93
|
|
91
|
-
If that's not enough too,
|
94
|
+
If that's still not enough, see the Contribute section below.
|
92
95
|
|
93
96
|
Benchmarking
|
94
97
|
----------
|
@@ -114,5 +117,18 @@ Classification Efficiency
|
|
114
117
|
|
115
118
|
TODO: give test results; provide a web interface to a trained classifier using some of SoundCloud's spam and legit data; give production experience from DigitaleSeiten.
|
116
119
|
|
120
|
+
Contribute
|
121
|
+
----------
|
122
|
+
|
123
|
+
Let me know about any feedback or feature requests. If you want to hack on the
|
124
|
+
code, just do that!
|
125
|
+
|
126
|
+
* Make a fork
|
127
|
+
* `git clone git@github.com:chochkov/GreenMidget.git`
|
128
|
+
* `bundle`
|
129
|
+
* `bundle exec rake` to run the specs
|
130
|
+
* Make a patch
|
131
|
+
* Send a Pull Request
|
132
|
+
|
117
133
|
[green_midget_github]: http://github.com/chochkov/GreenMidget "Github repository"
|
118
134
|
[guidelines]: http://soundcloud.com/community-guidelines "Community guidelines"
|
data/Rakefile
CHANGED
@@ -0,0 +1,44 @@
|
|
1
|
+
# Measures training times and classification times over arbitrary message lengths
|
2
|
+
# Don't run this on a database that already has training data - this script will pollute it.
|
3
|
+
# TODO: move this to a rake task
|
4
|
+
|
5
|
+
include GreenMidget
|
6
|
+
|
7
|
+
TRAININGS = 90
|
8
|
+
CLASSIFICATIONS = 1
|
9
|
+
|
10
|
+
MESSAGE_LENGTH = 1000
|
11
|
+
|
12
|
+
@training_times = []
|
13
|
+
@classification_times = []
|
14
|
+
|
15
|
+
records_count_at_start = Records.count
|
16
|
+
|
17
|
+
def generate_text(message_length = 1)
|
18
|
+
message ||= []
|
19
|
+
while message.count < message_length do
|
20
|
+
word = ''
|
21
|
+
(rand(7) + 3).times { word += ('a'..'z').to_a[rand(26)] }
|
22
|
+
message << word unless message.include?(word)
|
23
|
+
end
|
24
|
+
text = message.join(' ')
|
25
|
+
end
|
26
|
+
|
27
|
+
TRAININGS.times do
|
28
|
+
a = GreenMidget::Classifier.new generate_text(MESSAGE_LENGTH)
|
29
|
+
@training_times << Benchmark.measure { a.classify_as! [ ALTERNATIVE, NULL ][rand(2)] }.real
|
30
|
+
end
|
31
|
+
|
32
|
+
CLASSIFICATIONS.times do
|
33
|
+
a = GreenMidget::Classifier.new generate_text(MESSAGE_LENGTH)
|
34
|
+
@classification_times << Benchmark.measure { a.classify }.real
|
35
|
+
end
|
36
|
+
|
37
|
+
puts " ------------------------------- "
|
38
|
+
puts " Average seconds from #{TRAININGS} trainings and #{CLASSIFICATIONS} classifications. #{MESSAGE_LENGTH} words per message:"
|
39
|
+
puts " Number of records at start: #{records_count_at_start} and at the end: #{Records.count}"
|
40
|
+
puts " ------------------------------- "
|
41
|
+
puts " Training times: #{(@training_times.sum.to_f/TRAININGS).round(4)}"
|
42
|
+
puts " ------------------------------- "
|
43
|
+
puts " Classification times: #{(@classification_times.sum.to_f/CLASSIFICATIONS).round(4)}"
|
44
|
+
puts " ------------------------------- "
|
data/benchmark/test.rb
ADDED
@@ -0,0 +1,33 @@
|
|
1
|
+
# This is a cross validation script
|
2
|
+
# TODO: move it to a rake task
|
3
|
+
require 'sqlite3'
|
4
|
+
|
5
|
+
require File.join(File.dirname(__FILE__), '..', 'spec', 'tester')
|
6
|
+
include GreenMidget
|
7
|
+
|
8
|
+
ActiveRecord::Base.establish_connection(:adapter => 'sqlite3', :database => '~/sc/user_backup/data.db')
|
9
|
+
|
10
|
+
@spam = [ 'messages', 'comments', 'posts' ].map { |table| ActiveRecord::Base.connection.execute("select body from #{table} limit 1500").inject([]) { |memo, hash| memo << hash["body"] } }
|
11
|
+
|
12
|
+
ActiveRecord::Base.establish_connection(:adapter => 'mysql', :username => 'root', :password => 'root', :database => 'soundcloud_development_temp')
|
13
|
+
|
14
|
+
@ham = [ 'messages', 'comments', 'posts' ].map { |table| Records.find_by_sql("select body from #{table} limit 1500").to_a.inject([]) { |memo, hash| memo << hash["body"] } }
|
15
|
+
|
16
|
+
ActiveRecord::Base.establish_connection(:adapter => 'mysql', :username => 'root', :password => 'root', :database => 'classifier_development_weird')
|
17
|
+
#
|
18
|
+
# # ------ I. PERFORM TRAINING
|
19
|
+
# puts Benchmark.measure {
|
20
|
+
# @spam.each { |src|
|
21
|
+
# src.each {|body|
|
22
|
+
# klass = Tester.new(body);klass.classify_as! :spam
|
23
|
+
# }
|
24
|
+
# };true
|
25
|
+
# }
|
26
|
+
#
|
27
|
+
# puts Benchmark.measure {
|
28
|
+
# @ham.each { |src|
|
29
|
+
# src.each {|body|
|
30
|
+
# klass = Tester.new(body);klass.classify_as! :ham
|
31
|
+
# }
|
32
|
+
# };true
|
33
|
+
# }
|
data/green_midget.gemspec
CHANGED
@@ -14,18 +14,10 @@ Gem::Specification.new do |s|
|
|
14
14
|
s.summary = %q{Bayesian Text Classifier}
|
15
15
|
s.description = %q{Naive Bayesian Classifier with customizable features}
|
16
16
|
|
17
|
-
s.rubyforge_project = "green_midget"
|
18
|
-
|
19
17
|
s.files = `git ls-files`.split("\n")
|
20
18
|
s.test_files = `git ls-files -- {test,spec,features}/*`.split("\n")
|
21
19
|
s.executables = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
|
22
20
|
s.require_paths = ["lib"]
|
23
21
|
|
24
22
|
s.add_runtime_dependency "activerecord"
|
25
|
-
s.add_development_dependency "rspec"
|
26
|
-
s.add_development_dependency "bundler"
|
27
|
-
|
28
|
-
# specify any dependencies here; for example:
|
29
|
-
# s.add_development_dependency "rspec"
|
30
|
-
# s.add_runtime_dependency "rest-client"
|
31
23
|
end
|
data/lib/green_midget.rb
CHANGED
@@ -1,15 +1,26 @@
|
|
1
1
|
# Copyright (c) 2011, SoundCloud Ltd., Nikola Chochkov
|
2
2
|
require 'active_record'
|
3
|
-
|
3
|
+
|
4
|
+
require 'green_midget/constants'
|
5
|
+
require 'green_midget/url_detection'
|
6
|
+
require 'green_midget/logger'
|
7
|
+
require 'green_midget/heuristic_checks'
|
8
|
+
require 'green_midget/default_features'
|
4
9
|
require 'green_midget/base'
|
10
|
+
|
5
11
|
require 'green_midget/models/countable'
|
6
12
|
require 'green_midget/models/examples'
|
7
13
|
require 'green_midget/models/features'
|
8
|
-
require 'green_midget/models/
|
14
|
+
require 'green_midget/models/records'
|
9
15
|
require 'green_midget/models/words'
|
16
|
+
|
17
|
+
require 'green_midget/errors/no_text_found'
|
18
|
+
require 'green_midget/errors/feature_method_not_implemented'
|
19
|
+
require 'green_midget/errors/no_examples_given'
|
20
|
+
|
10
21
|
require 'green_midget/extensions/classifier'
|
11
22
|
|
12
|
-
if
|
23
|
+
if classifier = Gem.searcher.find('green_midget')
|
13
24
|
path = classifier.full_gem_path
|
14
25
|
Dir["#{path}/lib/tasks/*.rake"].each { |ext| load ext }
|
15
26
|
end
|
data/lib/green_midget/base.rb
CHANGED
@@ -1,75 +1,61 @@
|
|
1
1
|
# Copyright (c) 2011, SoundCloud Ltd., Nikola Chochkov
|
2
|
-
%w(logger constants url_detection).each do |file|
|
3
|
-
require File.join(File.dirname(__FILE__), file)
|
4
|
-
end
|
5
|
-
|
6
2
|
module GreenMidget
|
7
3
|
class Base
|
8
|
-
include
|
9
|
-
include
|
10
|
-
|
4
|
+
include DefaultFeatures
|
5
|
+
include HeuristicChecks
|
6
|
+
|
7
|
+
# Get classification for unknown messages based on history
|
8
|
+
#
|
9
|
+
# Examples:
|
10
|
+
#
|
11
|
+
# result = GreenMidget::Classifier.new(unknown_text).classify
|
12
|
+
# # result is now in -1, 0, 1 meaning respectively
|
13
|
+
# # no_spam, no_answer, spam
|
14
|
+
#
|
11
15
|
def classify
|
12
|
-
|
13
|
-
|
14
|
-
classify_as!(category)
|
15
|
-
return HYPOTHESES[category]
|
16
|
-
end
|
16
|
+
if respond_to?(:heuristic_checks, true) && response = heuristic_checks
|
17
|
+
return response
|
17
18
|
end
|
18
19
|
|
19
|
-
|
20
|
-
|
20
|
+
# load all relevant records in one go
|
21
|
+
Records.fetch_all(words)
|
21
22
|
|
22
23
|
factor = log_ratio
|
23
24
|
case
|
24
25
|
when factor >= ACCEPT_ALTERNATIVE_MIN
|
25
|
-
|
26
|
+
RESPONSES[ALTERNATIVE]
|
26
27
|
when factor >= REJECT_ALTERNATIVE_MAX
|
27
|
-
|
28
|
+
RESPONSES[:dunno]
|
28
29
|
else
|
29
|
-
|
30
|
+
RESPONSES[NULL]
|
30
31
|
end
|
31
32
|
end
|
32
33
|
|
34
|
+
# Public method used to train the classifier with examples
|
35
|
+
# belonging to a known `category`.
|
36
|
+
#
|
37
|
+
# Examples:
|
38
|
+
#
|
39
|
+
# classifier = GreenMidget::Classifier.new(known_good_text)
|
40
|
+
# classifier.classify_as!(:ham)
|
41
|
+
# # increases the chances for similar text to pass the check next time
|
42
|
+
#
|
43
|
+
# classifier = GreenMidget::Classifier.new(known_spam_text)
|
44
|
+
# classifier.classify_as!(:spam)
|
45
|
+
# # increases the chances for similar text to fail the check next time
|
46
|
+
#
|
33
47
|
def classify_as!(category)
|
34
|
-
keys = [
|
35
|
-
|
36
|
-
|
48
|
+
keys = [
|
49
|
+
Words.objects(words),
|
50
|
+
Features.objects(present_features),
|
51
|
+
Examples.objects(features, true)
|
52
|
+
].flatten.map { |object| object.record_key(category) }
|
37
53
|
|
38
|
-
|
39
|
-
register_training
|
54
|
+
!! Records.increment(keys)
|
40
55
|
end
|
41
56
|
|
42
57
|
private
|
43
58
|
|
44
|
-
# ------ Features --------
|
45
|
-
|
46
|
-
def features
|
47
|
-
FEATURES
|
48
|
-
end
|
49
|
-
|
50
|
-
def present_features
|
51
|
-
features.select { |feature| feature_present?(feature) }
|
52
|
-
end
|
53
|
-
|
54
|
-
def feature_present?(feature)
|
55
|
-
method = :"#{ feature }?"
|
56
|
-
if respond_to?(method, true)
|
57
|
-
send(method)
|
58
|
-
else
|
59
|
-
raise("You must implement method #{ method } or remove feature #{ feature }.")
|
60
|
-
end
|
61
|
-
end
|
62
|
-
|
63
|
-
def url_in_text?
|
64
|
-
UrlDetection.new(text).any?
|
65
|
-
end
|
66
|
-
|
67
|
-
def email_in_text?
|
68
|
-
text.scan(EMAIL_REGEX).size > 0
|
69
|
-
end
|
70
|
-
|
71
|
-
# ------ Words --------
|
72
|
-
|
73
59
|
def words
|
74
60
|
strip_external_links.scan(WORDS_SPLIT_REGEX).uniq.
|
75
61
|
map(&:downcase).
|
@@ -81,11 +67,24 @@ module GreenMidget
|
|
81
67
|
end
|
82
68
|
|
83
69
|
def text
|
84
|
-
@text || raise(
|
70
|
+
@text || raise(NoTextFound)
|
85
71
|
end
|
86
72
|
|
73
|
+
# Calculate the log ratio between the scores for both categories.
|
74
|
+
# It takes into account the Examples counts ( i.e. how much history
|
75
|
+
# there is for each category ), the Words count ( i.e. how much history for
|
76
|
+
# each word in each category ) and if any other Features are there -
|
77
|
+
# accounts for them as well.
|
87
78
|
def log_ratio
|
88
|
-
|
79
|
+
result = Examples.log_ratio
|
80
|
+
|
81
|
+
result += words.map{ |word| Words[word].log_ratio }.sum
|
82
|
+
|
83
|
+
if respond_to?(:features, true)
|
84
|
+
result += present_features.map{ |feature| Features[feature].log_ratio }.sum
|
85
|
+
end
|
86
|
+
|
87
|
+
result
|
89
88
|
end
|
90
89
|
end
|
91
90
|
end
|
@@ -1,31 +1,33 @@
|
|
1
1
|
# Copyright (c) 2011, SoundCloud Ltd., Nikola Chochkov
|
2
2
|
module GreenMidget
|
3
|
-
|
4
|
-
TOLERATED_URLS = /(soundcloud.com)|(facebook.com)|(myspace.com)|(twitter.com)/
|
3
|
+
TOLERATED_URLS = /(soundcloud.com)|(facebook.com)|(myspace.com)|(twitter.com)/
|
5
4
|
|
6
|
-
|
7
|
-
|
5
|
+
EMAIL_REGEX = /[a-zA-Z][\w\.-]*[a-zA-Z0-9]@[a-zA-Z0-9][\w\.-]*[a-zA-Z0-9]\.[a-zA-Z][a-zA-Z\.]*[a-zA-Z]/
|
6
|
+
URL_REGEX = /(?i)\b((?:[a-z][\w-]+:(?:\/{1,3}|[a-z0-9%])|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}\/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'".,<>?]))/
|
8
7
|
|
9
|
-
|
8
|
+
EXTERNAL_LINK_REGEX = Regexp.new(/(#{EMAIL_REGEX})|(#{URL_REGEX})/)
|
10
9
|
|
11
|
-
|
10
|
+
STOP_WORDS = %w()
|
12
11
|
|
13
|
-
|
14
|
-
|
12
|
+
MIN_CHARACTERS_IN_WORD = 3
|
13
|
+
MAX_CHARACTERS_IN_WORD = 20
|
15
14
|
|
16
|
-
|
17
|
-
|
15
|
+
WORDS_SPLIT_REGEX = Regexp.new(/\w{#{ MIN_CHARACTERS_IN_WORD },#{ MAX_CHARACTERS_IN_WORD }}/)
|
16
|
+
FEATURES = %w(url_in_text email_in_text)
|
18
17
|
|
19
|
-
|
20
|
-
|
21
|
-
|
18
|
+
# Decision making:
|
19
|
+
# Log(Pr(alternative | text)) - Log(Pr(null | text)) <=>
|
20
|
+
# ( REJECT_ALTERNATIVE_MAX..ACCEPT_ALTERNATIVE_MIN )
|
21
|
+
#
|
22
|
+
ACCEPT_ALTERNATIVE_MIN = Math::log(3.0)
|
23
|
+
REJECT_ALTERNATIVE_MAX = 0.0
|
22
24
|
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
NULL
|
28
|
-
|
29
|
-
|
30
|
-
|
25
|
+
NULL = :ham
|
26
|
+
ALTERNATIVE = :spam
|
27
|
+
CATEGORIES = [ NULL, ALTERNATIVE ]
|
28
|
+
RESPONSES = {
|
29
|
+
NULL => -1,
|
30
|
+
:dunno => 0,
|
31
|
+
ALTERNATIVE => 1,
|
32
|
+
}
|
31
33
|
end
|
@@ -1,16 +1,18 @@
|
|
1
1
|
# Copyright (c) 2011, SoundCloud Ltd., Nikola Chochkov
|
2
|
-
|
3
|
-
|
4
|
-
|
5
|
-
t
|
6
|
-
|
7
|
-
|
2
|
+
module GreenMidget
|
3
|
+
class CreateGreenMidgetRecords < ActiveRecord::Migration
|
4
|
+
def self.up
|
5
|
+
create_table :green_midget_records do |t|
|
6
|
+
t.string :key
|
7
|
+
t.integer :value
|
8
|
+
t.datetime :updated_at
|
9
|
+
end
|
10
|
+
add_index :green_midget_records, :key
|
11
|
+
add_index :green_midget_records, :updated_at
|
8
12
|
end
|
9
|
-
add_index :green_midget_records, :key
|
10
|
-
add_index :green_midget_records, :updated_at
|
11
|
-
end
|
12
13
|
|
13
|
-
|
14
|
-
|
14
|
+
def self.down
|
15
|
+
drop_table :green_midget_records
|
16
|
+
end
|
15
17
|
end
|
16
18
|
end
|