green_midget 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.document +5 -0
- data/.gitignore +4 -0
- data/.rspec +1 -0
- data/.rvmrc +1 -0
- data/Gemfile +19 -0
- data/Gemfile.lock +45 -0
- data/LICENSE.txt +15 -0
- data/README.md +128 -0
- data/Rakefile +1 -0
- data/benchmark/benchmark.rb +40 -0
- data/benchmark/test.rb +31 -0
- data/db/migrate/create_green_midget_records.rb +16 -0
- data/green_midget.gemspec +31 -0
- data/lib/extensions/green_midget_check.rb +8 -0
- data/lib/extensions/sample.rb +19 -0
- data/lib/green_midget/base.rb +91 -0
- data/lib/green_midget/constants.rb +31 -0
- data/lib/green_midget/green_midget.rb +6 -0
- data/lib/green_midget/logger.rb +16 -0
- data/lib/green_midget/models/countable.rb +33 -0
- data/lib/green_midget/models/examples.rb +55 -0
- data/lib/green_midget/models/features.rb +14 -0
- data/lib/green_midget/models/green_midget_records.rb +49 -0
- data/lib/green_midget/models/words.rb +21 -0
- data/lib/green_midget/url_detection.rb +24 -0
- data/lib/green_midget/version.rb +3 -0
- data/lib/green_midget.rb +16 -0
- data/lib/tasks/green_midget.rake +32 -0
- data/spec/base_spec.rb +163 -0
- data/spec/examples_spec.rb +85 -0
- data/spec/features_spec.rb +24 -0
- data/spec/green_midget_records_spec.rb +80 -0
- data/spec/green_midget_url_detection_spec.rb +14 -0
- data/spec/spec_helper.rb +13 -0
- data/spec/tester.rb +17 -0
- data/spec/words_spec.rb +24 -0
- metadata +125 -0
data/.document
ADDED
data/.gitignore
ADDED
data/.rspec
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
--color
|
data/.rvmrc
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
rvm use ruby-1.9.2@green_midget
|
data/Gemfile
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
# Copyright (c) 2011, SoundCloud Ltd., Nikola Chochkov
|
|
2
|
+
source "http://rubygems.org"
|
|
3
|
+
# Add dependencies required to use your gem here.
|
|
4
|
+
# Example:
|
|
5
|
+
# gem "activesupport", ">= 2.3.5"
|
|
6
|
+
|
|
7
|
+
gem "activerecord"
|
|
8
|
+
|
|
9
|
+
# remove this dependency after testing.
|
|
10
|
+
gem 'jberkel-mysql-ruby', '= 2.8.1', :require => 'mysql' # Ruby 1.9 fixes
|
|
11
|
+
|
|
12
|
+
# Add dependencies to develop your gem here.
|
|
13
|
+
# Include everything needed to run rake, tests, features, etc.
|
|
14
|
+
group :development do
|
|
15
|
+
gem "rspec", "~> 2.3.0"
|
|
16
|
+
gem "bundler", "~> 1.0.0"
|
|
17
|
+
gem "jeweler", "~> 1.5.2"
|
|
18
|
+
gem "rcov", ">= 0"
|
|
19
|
+
end
|
data/Gemfile.lock
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
GEM
|
|
2
|
+
remote: http://rubygems.org/
|
|
3
|
+
specs:
|
|
4
|
+
activemodel (3.0.7)
|
|
5
|
+
activesupport (= 3.0.7)
|
|
6
|
+
builder (~> 2.1.2)
|
|
7
|
+
i18n (~> 0.5.0)
|
|
8
|
+
activerecord (3.0.7)
|
|
9
|
+
activemodel (= 3.0.7)
|
|
10
|
+
activesupport (= 3.0.7)
|
|
11
|
+
arel (~> 2.0.2)
|
|
12
|
+
tzinfo (~> 0.3.23)
|
|
13
|
+
activesupport (3.0.7)
|
|
14
|
+
arel (2.0.9)
|
|
15
|
+
builder (2.1.2)
|
|
16
|
+
diff-lcs (1.1.2)
|
|
17
|
+
git (1.2.5)
|
|
18
|
+
i18n (0.5.0)
|
|
19
|
+
jberkel-mysql-ruby (2.8.1)
|
|
20
|
+
jeweler (1.5.2)
|
|
21
|
+
bundler (~> 1.0.0)
|
|
22
|
+
git (>= 1.2.5)
|
|
23
|
+
rake
|
|
24
|
+
rake (0.8.7)
|
|
25
|
+
rcov (0.9.9)
|
|
26
|
+
rspec (2.3.0)
|
|
27
|
+
rspec-core (~> 2.3.0)
|
|
28
|
+
rspec-expectations (~> 2.3.0)
|
|
29
|
+
rspec-mocks (~> 2.3.0)
|
|
30
|
+
rspec-core (2.3.1)
|
|
31
|
+
rspec-expectations (2.3.0)
|
|
32
|
+
diff-lcs (~> 1.1.2)
|
|
33
|
+
rspec-mocks (2.3.0)
|
|
34
|
+
tzinfo (0.3.27)
|
|
35
|
+
|
|
36
|
+
PLATFORMS
|
|
37
|
+
ruby
|
|
38
|
+
|
|
39
|
+
DEPENDENCIES
|
|
40
|
+
activerecord
|
|
41
|
+
bundler (~> 1.0.0)
|
|
42
|
+
jberkel-mysql-ruby (= 2.8.1)
|
|
43
|
+
jeweler (~> 1.5.2)
|
|
44
|
+
rcov
|
|
45
|
+
rspec (~> 2.3.0)
|
data/LICENSE.txt
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
Copyright (c) 2011, SoundCloud Ltd., Nikola Chochkov
|
|
2
|
+
|
|
3
|
+
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
|
4
|
+
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING,
|
|
5
|
+
BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
|
|
6
|
+
AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
|
|
7
|
+
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS
|
|
8
|
+
BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY,
|
|
9
|
+
OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
|
10
|
+
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
|
|
11
|
+
OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
|
12
|
+
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR
|
|
13
|
+
TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
|
|
14
|
+
OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY
|
|
15
|
+
OF SUCH DAMAGE.
|
data/README.md
ADDED
|
@@ -0,0 +1,128 @@
|
|
|
1
|
+
On Bayesian Classification
|
|
2
|
+
----------
|
|
3
|
+
|
|
4
|
+
Using SoundCloud's private messaging means that you can effectively reach out to everyone on the Cloud. On top of that, you have track commenting, groups posting, forum topics, track sharing - we care about your voice being heard! And read.
|
|
5
|
+
|
|
6
|
+
I'll put this in some perspective and say that we're now having daily text exchange volume in the order of hundreds of thousands. And it's also rapidly going up.
|
|
7
|
+
|
|
8
|
+
And while most of this runs smoother than Berliner beer on a SoundCloud Friday, violations of our [Community guidelines][guidelines] are starting to be less and less of an exception. So I've been given the task to address this and build a system that progressively learns how to tell good community behaviour from less good - welcome to the:
|
|
9
|
+
|
|
10
|
+
GreenMidget
|
|
11
|
+
----------
|
|
12
|
+
|
|
13
|
+
GreenMidget is a trainable, feature-full Bayesian text classifier. Out of the box it's super straightforward to use, but it also offers easy customisation options. It's a Ruby gem and today we're open sourcing it, so you can start with it within a minute after the:
|
|
14
|
+
|
|
15
|
+
Installation
|
|
16
|
+
----------
|
|
17
|
+
|
|
18
|
+
You are very likely (but not necessarily) gonna be on a Rails app, so just add
|
|
19
|
+
|
|
20
|
+
gem 'green_midget'
|
|
21
|
+
|
|
22
|
+
to your Gemfile and run
|
|
23
|
+
|
|
24
|
+
bundle install
|
|
25
|
+
|
|
26
|
+
then add
|
|
27
|
+
|
|
28
|
+
require 'green_midget'
|
|
29
|
+
|
|
30
|
+
to your Rakefile and run
|
|
31
|
+
|
|
32
|
+
rake green_midget:setup
|
|
33
|
+
|
|
34
|
+
You're now done.
|
|
35
|
+
|
|
36
|
+
How it works
|
|
37
|
+
----------
|
|
38
|
+
|
|
39
|
+
GreenMidget
|
|
40
|
+
|
|
41
|
+
GreenMidget is a learner, so you can expect effective classification from it only once it has received sufficient training. Training it means providing examples of messages for each category (spam and ham).
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
Use it
|
|
46
|
+
----------
|
|
47
|
+
|
|
48
|
+
`GreenMidget` exposes two public methods as a start: `GreenMidget#classify_as!` and `GreenMidget#classify`. Let's illustrate them with a three-line classification session.
|
|
49
|
+
|
|
50
|
+
We'll start training `GreenMidget` with a spammy example
|
|
51
|
+
|
|
52
|
+
GreenMidget.new(known_spam_text).classify_as! :spam
|
|
53
|
+
|
|
54
|
+
Similarly for legitimate examples
|
|
55
|
+
|
|
56
|
+
GreenMidget.new(known_legit_text).classify_as! :ham
|
|
57
|
+
|
|
58
|
+
To get a classification decision we would
|
|
59
|
+
|
|
60
|
+
decision = GreenMidget.new(new_text).classify
|
|
61
|
+
|
|
62
|
+
`decision` is now one of `[-1, 0, 1]` meaning respectively 'No spam', 'Not enough evidence', 'Spam'.
|
|
63
|
+
|
|
64
|
+
Extend it
|
|
65
|
+
----------
|
|
66
|
+
|
|
67
|
+
If the above functionality is not enough for you and you want to add custom logic to GreenMidget, you can do that by extending the `GreenMidget::Base` class (check `extensions/sample.rb` in the [code][green_midget_github] for an example):
|
|
68
|
+
|
|
69
|
+
* Implement heuristics logic, which will directly classify incoming object as a given category. Example:
|
|
70
|
+
|
|
71
|
+
def pass_ham_heuristics?
|
|
72
|
+
words.count > 5 || url_in_text?
|
|
73
|
+
end
|
|
74
|
+
|
|
75
|
+
This method will be `true` for longer texts or texts that contain an external URL. In this case the classifier goes on to the actual testing procedure. If it is `false`, however, the procedure is skipped and the classifier returns the ham category as a result. Note the use of the native `GreenMidget::Base#words` and `GreenMidget::Base#url_in_text?` methods.
|
|
76
|
+
|
|
77
|
+
All heuristic checks return `true` by default so it's up to you whether you will define and use heuristics at all or not. However, using them can help you integrate your application context and decrease classification error chance especially at the edge cases.
|
|
78
|
+
|
|
79
|
+
* Expand the source of evidence. Traditionally, _naive_ Bayesian text classifiers see individual words as evidence and calculate category-likelihoods for each word. But there could be more than that in your application context, e.g. user's data or specific text features.
|
|
80
|
+
|
|
81
|
+
By default GreenMidget comes with two feature definitions `url_in_text` and `email_in_text`, but you can implement as many more as you want by writing a boolean method that checks for the feature:
|
|
82
|
+
|
|
83
|
+
def regular_user?
|
|
84
|
+
@user.sign_up_count > 10
|
|
85
|
+
end
|
|
86
|
+
|
|
87
|
+
and then implement a `features` method that returns an array with your custom feature names:
|
|
88
|
+
|
|
89
|
+
def features
|
|
90
|
+
['regular_user', .... ]
|
|
91
|
+
end
|
|
92
|
+
|
|
93
|
+
(do make sure that the array entry is the same as the name of the method that would be checking for this feature)
|
|
94
|
+
|
|
95
|
+
The GreenMidget feature definitions carry more weight on shorter texts and less weight on longer ones; thus they provide a ground source of evidence for GreenMidget's classification.
|
|
96
|
+
|
|
97
|
+
If that's not enough too, you're welcome to [browse the code][green_midget_github] and either extend more parts of it or simply make your own fork of the project.
|
|
98
|
+
|
|
99
|
+
Benchmarking
|
|
100
|
+
----------
|
|
101
|
+
|
|
102
|
+
1. GreenMidget is optimised for classification operations (`classify` method), on which it's very efficient. The results below were obtained from classification on randomly generated messages of length _1 000 words_ (that's _very_ long for SoundCloud). Since GreenMidget runs on a relational database (through ActiveRecord) by default the table size impacts data fetch and write:
|
|
103
|
+
|
|
104
|
+
* on ~ 10 000 table rows = 0.0703 seconds / message
|
|
105
|
+
* on ~ 100 000 rows = 0.2082 sec / message
|
|
106
|
+
* on ~ 500 000 rows = 0.6505 sec / message
|
|
107
|
+
* on ~ 1 000 000 rows = 0.6773 sec / message
|
|
108
|
+
|
|
109
|
+
2. Training operations (`classify_as!`) are, however, less performant because they invoke a database write per word. Under the same conditions as above, the training times for randomly generated messages are as follows:
|
|
110
|
+
|
|
111
|
+
* on ~ 10 000 table rows = 1.5984 seconds / message
|
|
112
|
+
* on ~ 100 000 rows = 0.1303 sec / message
|
|
113
|
+
* on ~ 500 000 rows = 1.7185 sec / message
|
|
114
|
+
* on ~ 1 000 000 rows = 2.5335 sec / message
|
|
115
|
+
|
|
116
|
+
Classification Efficiency
|
|
117
|
+
----------
|
|
118
|
+
|
|
119
|
+
Trained on s
|
|
120
|
+
|
|
121
|
+
Benchmarks = > data
|
|
122
|
+
|
|
123
|
+
Efficiency = > for the sake of this article I ran a small off-production test to show some results on real data - I used 150 000 text items from records which we marked as good and records which we marked as not-the-best!
|
|
124
|
+
|
|
125
|
+
We'll be next building our own SoundCloud extensions to GreenMidget and use it, so expect to hear more from the student! Meanwhile, I'll be happy to answer everything concerning the project so do feel free to get in touch.
|
|
126
|
+
|
|
127
|
+
[green_midget_github]: http://github.com/chochkov/green_midget "Github repository"
|
|
128
|
+
[guidelines]: http://soundcloud.com/community-guidelines "Community guidelines"
|
data/Rakefile
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
require "bundler/gem_tasks"
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
include GreenMidget
|
|
2
|
+
|
|
3
|
+
TRAININGS = 90
|
|
4
|
+
CLASSIFICATIONS = 1
|
|
5
|
+
|
|
6
|
+
MESSAGE_LENGTH = 1000
|
|
7
|
+
|
|
8
|
+
@training_times = []
|
|
9
|
+
@classification_times = []
|
|
10
|
+
|
|
11
|
+
records_count_at_start = GreenMidgetRecords.count
|
|
12
|
+
|
|
13
|
+
# Builds a random benchmark message.
#
# message_length - number of distinct words the message must contain
#                  (defaults to 1; 0 yields an empty string).
#
# Each word is 3 to 9 random lowercase ASCII letters. Words are unique within
# the message and appear in the order they were first generated.
#
# Returns the words joined by single spaces.
def generate_text(message_length = 1)
  letters = ('a'..'z').to_a
  # Hash keys give constant-time uniqueness checks while keeping insertion
  # order, replacing the linear Array#include? scan per generated word.
  seen = {}
  while seen.size < message_length
    word = Array.new(rand(7) + 3) { letters[rand(26)] }.join
    seen[word] = true
  end
  seen.keys.join(' ')
end
|
|
22
|
+
|
|
23
|
+
TRAININGS.times do
|
|
24
|
+
a = GreenMidgetCheck.new generate_text(MESSAGE_LENGTH)
|
|
25
|
+
@training_times << Benchmark.measure{ a.classify_as! [ ALTERNATIVE, NULL ][rand(2)] }.real
|
|
26
|
+
end
|
|
27
|
+
|
|
28
|
+
CLASSIFICATIONS.times do
|
|
29
|
+
a = GreenMidgetCheck.new generate_text(MESSAGE_LENGTH)
|
|
30
|
+
@classification_times << Benchmark.measure{ a.classify }.real
|
|
31
|
+
end
|
|
32
|
+
|
|
33
|
+
puts " ------------------------------- "
|
|
34
|
+
puts " Average seconds from #{ TRAININGS } trainings and #{ CLASSIFICATIONS } classifications. #{ MESSAGE_LENGTH } words per message:"
|
|
35
|
+
puts " Number of records at start: #{ records_count_at_start } and at the end: #{ GreenMidgetRecords.count }"
|
|
36
|
+
puts " ------------------------------- "
|
|
37
|
+
puts " Training times: #{ (@training_times.sum.to_f/TRAININGS).round(4) }"
|
|
38
|
+
puts " ------------------------------- "
|
|
39
|
+
puts " Classification times: #{ (@classification_times.sum.to_f/CLASSIFICATIONS).round(4) }"
|
|
40
|
+
puts " ------------------------------- "
|
data/benchmark/test.rb
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
require 'sqlite3'
|
|
2
|
+
|
|
3
|
+
require File.join(File.dirname(__FILE__), '..', 'spec', 'tester')
|
|
4
|
+
include GreenMidget
|
|
5
|
+
|
|
6
|
+
ActiveRecord::Base.establish_connection(:adapter => 'sqlite3', :database => '/sc/user_backup/data.db')
|
|
7
|
+
|
|
8
|
+
@spam = [ 'messages', 'comments', 'posts' ].map { |table| ActiveRecord::Base.connection.execute("select body from #{table} limit 1500").inject([]) { |memo, hash| memo << hash["body"] } }
|
|
9
|
+
|
|
10
|
+
ActiveRecord::Base.establish_connection(:adapter => 'mysql', :username => 'root', :password => 'root', :database => 'soundcloud_development_temp')
|
|
11
|
+
|
|
12
|
+
@ham = [ 'messages', 'comments', 'posts' ].map { |table| GreenMidgetRecords.find_by_sql("select body from #{table} limit 1500").to_a.inject([]) { |memo, hash| memo << hash["body"] } }
|
|
13
|
+
|
|
14
|
+
ActiveRecord::Base.establish_connection(:adapter => 'mysql', :username => 'root', :password => 'root', :database => 'classifier_development_weird')
|
|
15
|
+
#
|
|
16
|
+
# # ------ I. PERFORM TRAINING
|
|
17
|
+
# puts Benchmark.measure {
|
|
18
|
+
# @spam.each { |src|
|
|
19
|
+
# src.each {|body|
|
|
20
|
+
# klass = Tester.new(body);klass.classify_as! :spam
|
|
21
|
+
# }
|
|
22
|
+
# };true
|
|
23
|
+
# }
|
|
24
|
+
#
|
|
25
|
+
# puts Benchmark.measure {
|
|
26
|
+
# @ham.each { |src|
|
|
27
|
+
# src.each {|body|
|
|
28
|
+
# klass = Tester.new(body);klass.classify_as! :ham
|
|
29
|
+
# }
|
|
30
|
+
# };true
|
|
31
|
+
# }
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
# Copyright (c) 2011, SoundCloud Ltd., Nikola Chochkov
|
|
2
|
+
class CreateGreenMidgetRecords < ActiveRecord::Migration
|
|
3
|
+
def self.up
|
|
4
|
+
create_table :green_midget_records do |t|
|
|
5
|
+
t.string :key
|
|
6
|
+
t.string :value
|
|
7
|
+
t.datetime :updated_at
|
|
8
|
+
end
|
|
9
|
+
add_index :green_midget_records, :key
|
|
10
|
+
add_index :green_midget_records, :updated_at
|
|
11
|
+
end
|
|
12
|
+
|
|
13
|
+
def self.down
|
|
14
|
+
drop_table :green_midget_records
|
|
15
|
+
end
|
|
16
|
+
end
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
|
2
|
+
$:.push File.expand_path("../lib", __FILE__)
|
|
3
|
+
require "green_midget/version"
|
|
4
|
+
|
|
5
|
+
# Gem specification for green_midget. The version comes from
# GreenMidget::VERSION (required above from green_midget/version).
Gem::Specification.new do |s|
  s.name        = "green_midget"
  s.version     = GreenMidget::VERSION
  s.authors     = ["nikola chochkov"]
  s.email       = ["nikola@howkul.info"]
  s.homepage    = "http://github.com/chochkov/GreenMidget"
  s.licenses    = ["MIT"]
  s.rubygems_version = %q{1.7.2}
  s.summary     = %q{Bayesian Text Classifier}
  s.description = %q{Naive Bayesian Classifier with customizable features}

  s.rubyforge_project = "green_midget"

  # The packaged file lists are taken from whatever git currently tracks, so
  # the gem must be built from inside a git checkout.
  s.files         = `git ls-files`.split("\n")
  s.test_files    = `git ls-files -- {test,spec,features}/*`.split("\n")
  s.executables   = `git ls-files -- bin/*`.split("\n").map { |f| File.basename(f) }
  # Assigned once here (the original set the same value twice).
  s.require_paths = ["lib"]

  s.add_runtime_dependency "activerecord"
  s.add_development_dependency "rspec"
  s.add_development_dependency "bundler"
end
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
# Copyright (c) 2011, SoundCloud Ltd., Nikola Chochkov
|
|
2
|
+
# Example of extending GreenMidget::Base with an application-specific feature.
# On top of the features inherited from the base class it registers a
# 'regular_user' feature, which is checked by the private #regular_user?
# predicate below.
class Sample < GreenMidget::Base
  attr_accessor :user

  # text - the String to train on or classify.
  # user - the object describing the author of the text.
  def initialize(text, user)
    @text = text
    @user = user
  end

  private

  # Names of all features checked for this classifier: the custom one plus
  # whatever the parent class declares. Every entry must have a matching
  # "<name>?" predicate method.
  def features
    %w(regular_user) + super
  end

  # Whether the author counts as a regular user.
  #
  # The original body was empty, so the predicate always returned nil and the
  # feature was never recorded.
  # NOTE(review): the rule below is taken from the README example and assumes
  # the user object responds to #sign_up_count - confirm against the real
  # user model before relying on it.
  def regular_user?
    return false unless user
    user.sign_up_count > 10
  end
end
|
|
@@ -0,0 +1,91 @@
|
|
|
1
|
+
# Copyright (c) 2011, SoundCloud Ltd., Nikola Chochkov
|
|
2
|
+
require 'green_midget/constants'
|
|
3
|
+
require 'green_midget/logger'
|
|
4
|
+
require 'green_midget/url_detection'
|
|
5
|
+
|
|
6
|
+
module GreenMidget
  # Core of the classifier. Subclasses provide the text (via @text or by
  # overriding #text) and may add features and heuristics.
  #
  # Public API:
  #   classify           - returns ALTERNATIVE_RESPONSE, DUNNO or NULL_RESPONSE
  #   classify_as!(cat)  - trains the classifier with the text as an example
  #                        of the given category
  class Base
    include Logger
    include Constants

    # Classifies the text.
    #
    # For each category, if the instance responds to
    # "pass_<category>_heuristics?" and it returns a truthy value, the text is
    # trained as that category and that category's response is returned
    # without running the Bayesian test.
    # NOTE(review): the README describes the opposite polarity (a heuristic
    # returning false short-circuits) - confirm which one is intended.
    #
    # Otherwise the summed log-ratio is compared against the thresholds
    # ACCEPT_ALTERNATIVE_MIN and REJECT_ALTERNATIVE_MAX.
    def classify
      CATEGORIES.each do |category|
        heuristic = :"pass_#{category}_heuristics?"
        if respond_to?(heuristic) && send(heuristic)
          classify_as!(category)
          # The original returned HYPOTHESES[category], a constant that is not
          # defined in Constants, which raised NameError on this path.
          return response_for(category)
        end
      end

      GreenMidgetRecords.fetch_all(words)
      register_classification

      factor = log_ratio
      case
      when factor >= ACCEPT_ALTERNATIVE_MIN
        ALTERNATIVE_RESPONSE
      when factor >= REJECT_ALTERNATIVE_MAX
        DUNNO
      else
        NULL_RESPONSE
      end
    end

    # Trains the classifier: increments the counters of every word, every
    # present feature and every example bucket (all features plus the general
    # one) for the given category.
    #
    # category - one of CATEGORIES.
    def classify_as!(category)
      keys = [ Words.objects(words), Features.objects(present_features), Examples.objects(features, true) ].flatten.map do |object|
        object.record_key(category)
      end

      GreenMidgetRecords.increment(keys)
      register_training
    end

    private

    # Maps a category to the response code #classify returns for it.
    def response_for(category)
      category == ALTERNATIVE ? ALTERNATIVE_RESPONSE : NULL_RESPONSE
    end

    # ------ Features --------

    # Names of the features to check; override to add custom ones.
    def features
      FEATURES
    end

    # The subset of #features whose predicate holds for this text.
    def present_features
      features.select { |feature| feature_present?(feature) }
    end

    # Calls the "<feature>?" predicate (private methods included).
    # Raises RuntimeError when no such predicate is implemented.
    def feature_present?(feature)
      method = :"#{ feature }?"
      if respond_to?(method, true)
        send(method)
      else
        raise("You must implement method #{ method } or remove feature #{ feature }.")
      end
    end

    def url_in_text?
      UrlDetection.new(text).any?
    end

    def email_in_text?
      text.scan(EMAIL_REGEX).size > 0
    end

    # ------ Words --------

    # Distinct lowercase words of the text, with external links removed and
    # stop words dropped. Downcasing happens before de-duplication so that
    # "Spam" and "spam" count as a single word (the original de-duplicated
    # first and could therefore list the same word twice).
    def words
      strip_external_links.scan(WORDS_SPLIT_REGEX).
        map(&:downcase).
        uniq.
        reject { |word| STOP_WORDS.include?(word) }
    end

    # The text with everything matching EXTERNAL_LINK_REGEX removed.
    def strip_external_links
      text.gsub(EXTERNAL_LINK_REGEX, '')
    end

    # The text under classification. Raises RuntimeError when @text is unset
    # and the method has not been overridden.
    def text
      @text || raise('You should either implement the text method or provide an instance variable at this point.')
    end

    # Sum of the log-ratios contributed by the example counts, every word and
    # every present feature.
    def log_ratio
      Examples.log_ratio + words.map{ |word| Words[word].log_ratio }.sum + present_features.map{ |feature| Features[feature].log_ratio }.sum
    end
  end
end
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
# Copyright (c) 2011, SoundCloud Ltd., Nikola Chochkov
|
|
2
|
+
module GreenMidget
  # Tunable constants shared by the classifier classes.
  module Constants
    # Hosts whose links are tolerated. Dots are escaped so that only a literal
    # "." matches (the original pattern also matched e.g. "soundcloudxcom").
    TOLERATED_URLS = /(soundcloud\.com)|(facebook\.com)|(myspace\.com)|(twitter\.com)/

    EMAIL_REGEX = /[a-zA-Z][\w\.-]*[a-zA-Z0-9]@[a-zA-Z0-9][\w\.-]*[a-zA-Z0-9]\.[a-zA-Z][a-zA-Z\.]*[a-zA-Z]/
    URL_REGEX   = /(?i)\b((?:[a-z][\w-]+:(?:\/{1,3}|[a-z0-9%])|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}\/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'".,<>?]))/

    # Matches either an email address or a URL.
    EXTERNAL_LINK_REGEX = /(#{ EMAIL_REGEX })|(#{ URL_REGEX })/

    # Words excluded from the evidence (none by default).
    STOP_WORDS = %w()

    MIN_CHARACTERS_IN_WORD = 3
    MAX_CHARACTERS_IN_WORD = 20

    # A word is a run of word characters within the length bounds above.
    WORDS_SPLIT_REGEX = /\w{#{ MIN_CHARACTERS_IN_WORD },#{ MAX_CHARACTERS_IN_WORD }}/
    # Feature names; each one needs a matching "<name>?" predicate.
    FEATURES = %w(url_in_text email_in_text)

    # Decision making: Log(Pr(alternative | text)) - Log(Pr(null | text)) <=> [ REJECT_ALTERNATIVE_MAX, ACCEPT_ALTERNATIVE_MIN ]
    ACCEPT_ALTERNATIVE_MIN = Math::log(3.0)
    REJECT_ALTERNATIVE_MAX = 0.0

    # Response codes returned by the classifier.
    ALTERNATIVE_RESPONSE = 1
    DUNNO = 0
    NULL_RESPONSE = -1

    # The null hypothesis is ham; the alternative hypothesis is spam.
    NULL = :ham
    ALTERNATIVE = :spam
    CATEGORIES = [ NULL, ALTERNATIVE ]
  end
end
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
# Copyright (c) 2011, SoundCloud Ltd., Nikola Chochkov
|
|
2
|
+
module GreenMidget
|
|
3
|
+
module Logger
|
|
4
|
+
private
|
|
5
|
+
|
|
6
|
+
def register_classification
|
|
7
|
+
# TODO: implement default Logging system
|
|
8
|
+
true
|
|
9
|
+
end
|
|
10
|
+
|
|
11
|
+
def register_training
|
|
12
|
+
# TODO: the same... (obviously)
|
|
13
|
+
true
|
|
14
|
+
end
|
|
15
|
+
end
|
|
16
|
+
end
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
# Copyright (c) 2011, SoundCloud Ltd., Nikola Chochkov
|
|
2
|
+
module GreenMidget
|
|
3
|
+
class Countable
|
|
4
|
+
include Constants
|
|
5
|
+
attr_accessor :key
|
|
6
|
+
|
|
7
|
+
class << self; attr_accessor :prefix end
|
|
8
|
+
|
|
9
|
+
def self.[](key)
|
|
10
|
+
new(key)
|
|
11
|
+
end
|
|
12
|
+
|
|
13
|
+
def self.objects(keys)
|
|
14
|
+
keys.map { |key| new(key) }
|
|
15
|
+
end
|
|
16
|
+
|
|
17
|
+
def initialize(key)
|
|
18
|
+
@key = self.class.prefix + key
|
|
19
|
+
end
|
|
20
|
+
|
|
21
|
+
def [](category)
|
|
22
|
+
GreenMidgetRecords[record_key(category)].to_f
|
|
23
|
+
end
|
|
24
|
+
|
|
25
|
+
def log_ratio
|
|
26
|
+
Math::log(probability_for(ALTERNATIVE) / probability_for(NULL))
|
|
27
|
+
end
|
|
28
|
+
|
|
29
|
+
def record_key(category)
|
|
30
|
+
self.key + "::#{category}_count"
|
|
31
|
+
end
|
|
32
|
+
end
|
|
33
|
+
end
|
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
# Copyright (c) 2011, SoundCloud Ltd., Nikola Chochkov
|
|
2
|
+
module GreenMidget
  # Counts of training examples, per feature and per category. The special
  # feature GENERAL_FEATURE_NAME counts every training example.
  class Examples < Countable
    NO_EXAMPLES_GIVEN_ERROR = 'Training examples must be provided for all categories before classification.'
    GENERAL_FEATURE_NAME    = 'any'
    self.prefix = 'examples_with_feature::'

    # Defines one memoized class-level reader per category, named after the
    # category (ALTERNATIVE is :spam, NULL is :ham).
    # NOTE(review): the class variables are never reset in this file, so the
    # cached counts may go stale after further training - confirm intended.
    class_eval(<<-EVAL, __FILE__, __LINE__ + 1)
      def self.#{ ALTERNATIVE }                                        # def self.spam
        @@alternative ||= self[GENERAL_FEATURE_NAME][ALTERNATIVE]
      end

      def self.#{ NULL }                                               # def self.ham
        @@null ||= self[GENERAL_FEATURE_NAME][NULL]
      end
    EVAL

    # Returns the example counter for +feature+. When that feature lacks
    # examples in some category, falls back to the general counter; raises
    # RuntimeError (NO_EXAMPLES_GIVEN_ERROR) when the general counter itself
    # lacks examples.
    def self.[](feature)
      object = super(feature)

      if object.no_examples? && (feature == GENERAL_FEATURE_NAME)
        raise NO_EXAMPLES_GIVEN_ERROR
      elsif object.no_examples?
        super GENERAL_FEATURE_NAME
      else
        object
      end
    end

    # Builds counters for +features+, optionally adding the general one.
    # The caller's array is not modified.
    def self.objects(features, with_general = false)
      features += with_general ? [ GENERAL_FEATURE_NAME ] : []
      super(features)
    end

    # Log-ratio of the general example counts.
    def self.log_ratio
      self[GENERAL_FEATURE_NAME].log_ratio
    end

    # Memoized total number of examples in the general counter.
    def self.total
      @@total ||= self[GENERAL_FEATURE_NAME].total
    end

    # Share of this counter's examples that belong to +category+.
    def probability_for(category)
      self[category] / total
    end

    # Sum of the counts over all categories.
    def total
      CATEGORIES.inject(0) { |memo, category| memo + self[category] }
    end

    # True when at least one category has a zero count.
    def no_examples?
      CATEGORIES.any? { |category| self[category] == 0 }
    end
  end
end
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
# Copyright (c) 2011, SoundCloud Ltd., Nikola Chochkov
|
|
2
|
+
module GreenMidget
|
|
3
|
+
class Features < Countable
|
|
4
|
+
self.prefix = 'feature::'
|
|
5
|
+
|
|
6
|
+
def probability_for(category)
|
|
7
|
+
self[category] / Examples[feature][category]
|
|
8
|
+
end
|
|
9
|
+
|
|
10
|
+
def feature
|
|
11
|
+
key.gsub(/(^#{ self.class.prefix })|(::\w+_count$)/, '')
|
|
12
|
+
end
|
|
13
|
+
end
|
|
14
|
+
end
|