green_midget 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.document +5 -0
- data/.gitignore +4 -0
- data/.rspec +1 -0
- data/.rvmrc +1 -0
- data/Gemfile +19 -0
- data/Gemfile.lock +45 -0
- data/LICENSE.txt +15 -0
- data/README.md +128 -0
- data/Rakefile +1 -0
- data/benchmark/benchmark.rb +40 -0
- data/benchmark/test.rb +31 -0
- data/db/migrate/create_green_midget_records.rb +16 -0
- data/green_midget.gemspec +31 -0
- data/lib/extensions/green_midget_check.rb +8 -0
- data/lib/extensions/sample.rb +19 -0
- data/lib/green_midget/base.rb +91 -0
- data/lib/green_midget/constants.rb +31 -0
- data/lib/green_midget/green_midget.rb +6 -0
- data/lib/green_midget/logger.rb +16 -0
- data/lib/green_midget/models/countable.rb +33 -0
- data/lib/green_midget/models/examples.rb +55 -0
- data/lib/green_midget/models/features.rb +14 -0
- data/lib/green_midget/models/green_midget_records.rb +49 -0
- data/lib/green_midget/models/words.rb +21 -0
- data/lib/green_midget/url_detection.rb +24 -0
- data/lib/green_midget/version.rb +3 -0
- data/lib/green_midget.rb +16 -0
- data/lib/tasks/green_midget.rake +32 -0
- data/spec/base_spec.rb +163 -0
- data/spec/examples_spec.rb +85 -0
- data/spec/features_spec.rb +24 -0
- data/spec/green_midget_records_spec.rb +80 -0
- data/spec/green_midget_url_detection_spec.rb +14 -0
- data/spec/spec_helper.rb +13 -0
- data/spec/tester.rb +17 -0
- data/spec/words_spec.rb +24 -0
- metadata +125 -0
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
# Copyright (c) 2011, SoundCloud Ltd., Nikola Chochkov
|
|
2
|
+
module GreenMidget
  # ActiveRecord model over the green_midget_records table, used as a
  # key/value store. Reads go through a class-level in-memory cache
  # (@@cache) that maps record keys (Strings) to their stored values.
  #
  # The cache stays a class variable named @@cache because the specs read
  # and reset it under exactly that name.
  class GreenMidgetRecords < ActiveRecord::Base
    set_table_name :green_midget_records

    # Rebuilds the cache from the data store in a single query.
    #
    # Fetches the records for every key of +words+ (as produced by
    # Words.record_keys) plus every record whose key starts with the
    # Features or Examples prefix. Word keys that have no stored record
    # are added to the cache with the value ''.
    #
    # Returns the rebuilt cache Hash.
    def self.fetch_all(words = [])
      words_keys   = Words.record_keys(words)
      key_column   = connection.quote_column_name('key')
      value_column = connection.quote_column_name('value')

      # Every value is quoted by the adapter instead of being interpolated
      # raw, so a key containing a quote character cannot break the query.
      conditions = []
      unless words_keys.empty?
        quoted_keys = words_keys.map { |key| connection.quote(key) }
        conditions << "#{ key_column } IN (#{ quoted_keys.join(', ') })"
      end
      [ Features.prefix, Examples.prefix ].each do |prefix|
        conditions << "#{ key_column } LIKE #{ connection.quote("#{ prefix }%") }"
      end

      pairs = connection.select_rows(
        "SELECT #{ key_column }, #{ value_column } FROM #{ quoted_table_name } WHERE #{ conditions.join(' OR ') }"
      )

      @@cache = pairs.inject({}) do |memo, pair|
        memo[pair.first] = pair.last
        memo
      end

      # Unknown words still get a cache entry, so later look-ups for them
      # do not go back to the data store.
      words_keys.each { |key| @@cache[key] ||= '' }
      @@cache
    end

    # Returns the value stored under +key+ (converted with to_s).
    #
    # Looks in the cache first, then in the data store; when neither has a
    # value, '' is returned. Whatever is returned is also written to the
    # cache.
    def self.[](key)
      key = key.to_s
      @@cache ||= {}
      @@cache[key] ||= begin
        stored = connection.select_value(
          "SELECT #{ connection.quote_column_name('value') } FROM #{ quoted_table_name } " \
          "WHERE #{ connection.quote_column_name('key') } = #{ connection.quote(key) }"
        )
        stored || ''
      end
    end

    # Adds 1 to the stored value of each key in +keys+ (a single key or an
    # Array of keys). Keys without a record get one, counted from '0.0'.
    #
    # The records are held in a local Hash rather than in a shared class
    # variable, so overlapping calls cannot overwrite each other's working
    # set.
    def self.increment(keys)
      keys    = Array(keys)
      records = all(:conditions => [ "#{ connection.quote_column_name('key') } IN (?)", keys ])

      records_by_key = records.inject({}) do |memo, record|
        memo[record.key] = record
        memo
      end

      keys.each do |key|
        records_by_key[key] ||= new(:key => key, :value => '0.0')
      end

      records_by_key.each_value do |record|
        record.update_attribute(:value, record.value.to_f + 1)
      end

      # Kept for backward compatibility: the class variable is left empty
      # after the call and {} remains the return value, as before.
      @@objects = {}
    end
  end
end
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
# Copyright (c) 2011, SoundCloud Ltd., Nikola Chochkov
|
|
2
|
+
module GreenMidget
  # Countable whose record keys are namespaced with the 'word::' prefix.
  class Words < Countable
    self.prefix = 'word::'

    # Returns a flat Array with the record key of every word in +words+
    # for each requested category.
    #
    # words    - Array of words.
    # category - a single category, or nil to use every entry of
    #            GreenMidget::CATEGORIES.
    def self.record_keys(words, category = nil)
      nested = words.map do |word|
        categories = Array(category || GreenMidget::CATEGORIES)
        # The block parameter has its own name so it neither shadows nor
        # (on Ruby 1.8) overwrites the +category+ argument.
        categories.map { |name| Words[word].record_key(name) }
      end
      nested.flatten
    end

    # Returns this word's count for +category+ divided by
    # Examples.send(category). A count equal to 0.0 yields
    # 1.0 / Examples.total instead.
    def probability_for(category)
      count = self[category]
      return 1.0 / Examples.total if count == 0.0

      count / Examples.send(category)
    end
  end
end
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
# Copyright (c) 2011, SoundCloud Ltd., Nikola Chochkov
|
|
2
|
+
module GreenMidget
  # Looks for URLs in a piece of text, ignoring the ones matched by
  # GreenMidget::TOLERATED_URLS.
  class UrlDetection
    def initialize(text)
      @text = text
    end

    # true when the text contains at least one URL that is not tolerated,
    # false otherwise.
    def any?
      !non_tolerated_urls.empty?
    end

    private

    # Every non-nil capture that GreenMidget::URL_REGEX finds in the text.
    def urls
      captures = @text.scan(GreenMidget::URL_REGEX)
      captures.flatten.compact
    end

    # The found URLs whose downcased form does not match TOLERATED_URLS.
    def non_tolerated_urls
      urls.select do |candidate|
        (candidate.to_s.downcase =~ GreenMidget::TOLERATED_URLS).nil?
      end
    end
  end
end
|
data/lib/green_midget.rb
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
# Copyright (c) 2011, SoundCloud Ltd., Nikola Chochkov
|
|
2
|
+
require 'active_record'
|
|
3
|
+
require 'green_midget/green_midget'
|
|
4
|
+
require 'green_midget/base'
|
|
5
|
+
require 'green_midget/models/countable'
|
|
6
|
+
require 'green_midget/models/examples'
|
|
7
|
+
require 'green_midget/models/features'
|
|
8
|
+
require 'green_midget/models/green_midget_records'
|
|
9
|
+
require 'green_midget/models/words'
|
|
10
|
+
require 'extensions/green_midget_check'
|
|
11
|
+
|
|
12
|
+
# Load the rake tasks shipped with the gem when green_midget is running as
# an activated gem. Gem.loaded_specs is used for the look-up because
# Gem.searcher is deprecated and no longer available in newer RubyGems.
if (classifier = Gem.loaded_specs['green_midget'])
  path = classifier.full_gem_path
  Dir["#{path}/lib/tasks/*.rake"].each { |ext| load ext }
end
|
|
16
|
+
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
# Copyright (c) 2011, SoundCloud Ltd., Nikola Chochkov
|
|
2
|
+
require 'fileutils'
|
|
3
|
+
require 'rake'
|
|
4
|
+
require File.join(File.dirname(__FILE__), '..', '..', 'db', 'migrate', 'create_green_midget_records')
|
|
5
|
+
|
|
6
|
+
namespace :green_midget do
  desc "prepare this project for a world without spam"
  # Creates the green_midget_records table when it is missing and makes
  # sure a record exists for every feature and example counter key.
  task :setup => :environment do
    # Mixes the GreenMidget constants (Features, Examples, ALTERNATIVE,
    # NULL, ...) into the top-level object so they can be used unqualified.
    include GreenMidget

    CreateGreenMidgetRecords.up unless GreenMidgetRecords.table_exists?

    keys = ["url_in_text", "email_in_text"].map do |feature|
      [ Features[feature].record_key(ALTERNATIVE), Features[feature].record_key(NULL) ]
    end.flatten

    keys += [Examples::GENERAL_FEATURE_NAME, "url_in_text", "email_in_text"].map do |feature|
      [ Examples[feature].record_key(ALTERNATIVE), Examples[feature].record_key(NULL) ]
    end.flatten

    puts '== Creating records ==='
    keys.each do |key|
      next if GreenMidgetRecords.find_by_key(key)

      # create expects an attributes hash; a bare String would not be
      # stored as the record's key.
      GreenMidgetRecords.create(:key => key)
      puts "-- Created #{key}"
    end
    puts '== Done ==='
  end
end
|
data/spec/base_spec.rb
ADDED
|
@@ -0,0 +1,163 @@
|
|
|
1
|
+
# Copyright (c) 2011, SoundCloud Ltd., Nikola Chochkov
|
|
2
|
+
require 'spec_helper'
|
|
3
|
+
require File.join(File.dirname(__FILE__), 'tester')
|
|
4
|
+
|
|
5
|
+
describe GreenMidget::Base do
  include GreenMidget

  # Every example starts from the same set of word, feature and example
  # counters.
  before(:each) do
    GreenMidgetRecords.delete_all

    # [ countable, name, ALTERNATIVE count, NULL count ]
    [
      [ Words,    'this',          701.0,  11.0   ],
      [ Words,    'test',          9.0,    71.0   ],
      [ Words,    'goes',          90.0,   90.0   ],
      [ Words,    'rid',           311.0,  290.0  ],
      [ Words,    'dirty',         222.0,  45.0   ],
      [ Words,    'spam',          11.0,   133.0  ],
      [ Words,    'words',         6.0,    811.0  ],
      [ Words,    'zero',          0.0,    0.0    ],
      [ Features, 'url_in_text',   440.0,  40.0   ],
      [ Features, 'email_in_text', 112.0,  9.0    ],
      [ Examples, 'any',           1000.0, 1000.0 ],
      [ Examples, 'url_in_text',   1000.0, 1000.0 ],
      [ Examples, 'email_in_text', 1000.0, 1000.0 ]
    ].each do |countable, name, alternative_count, null_count|
      GreenMidgetRecords.create(:key => "#{ countable.prefix }#{ name }::#{ ALTERNATIVE }_count", :value => alternative_count)
      GreenMidgetRecords.create(:key => "#{ countable.prefix }#{ name }::#{ NULL }_count", :value => null_count)
    end
  end

  describe "#log_ratio" do
    it "should be for 'this words'" do
      Tester.new('this words').log_ratio.should == Math::log((701.0/1000) / (11.0/1000)) + Math::log((6.0/1000) / (811.0/1000)) + Math::log((1000.0/2000) / (1000.0/2000))
    end

    it "should be smaller for a smaller number of spammy words" do
      Tester.new('this dirty test').log_ratio.should > Tester.new('this test').log_ratio
    end

    it "considers 'test goes words' ham" do
      Tester.new('test goes words').log_ratio.should < REJECT_ALTERNATIVE_MAX
    end

    it "considers 'rid goes dirty' spam" do
      Tester.new('rid goes dirty').log_ratio.should >= ACCEPT_ALTERNATIVE_MIN
    end

    it "doesn't know whether 'zero goes rid' is spam or not" do
      Tester.new('zero goes rid').log_ratio.between?(REJECT_ALTERNATIVE_MAX, ACCEPT_ALTERNATIVE_MIN).should be_true
    end

    it "thinks of 'test boss@offshore.com' as more spam than just 'test'" do
      Tester.new('test boss@offshore.com').log_ratio.
        should > Tester.new('test').log_ratio
    end

    it "thinks of 'test www.offshore.com' as more spam than just 'test'" do
      Tester.new('test www.offshore.com').log_ratio.
        should > Tester.new('test').log_ratio
    end

    it "will tolerate urls coming from known sites" do
      Tester.new('test www.offshore.com').log_ratio.should >
        Tester.new('test www.soundcloud.com').log_ratio
    end

    it "should say DUNNO if it doesnt have neither ALTERNATIVE nor NULL score for a message" do
      Tester.new('zero newword heuristicspass').log_ratio.between?(REJECT_ALTERNATIVE_MAX, ACCEPT_ALTERNATIVE_MIN).should be_true
    end

    it "should say ALTERNATIVE if it has spam score for a message and doesn't have ham score for it" do
      a = Tester.new('nosuchword nowordsuch heuristicspass')
      a.log_ratio.between?(REJECT_ALTERNATIVE_MAX, ACCEPT_ALTERNATIVE_MIN).should be_true
      a.classify_as!(ALTERNATIVE)
      a.log_ratio.should >= ACCEPT_ALTERNATIVE_MIN
    end

    it "should say NULL if it has ham score for a message and doesn't have spam score for it" do
      a = Tester.new('suchwordno nowordsuch heuristicspasss')
      a.log_ratio.between?(REJECT_ALTERNATIVE_MAX, ACCEPT_ALTERNATIVE_MIN).should be_true
      a.classify_as!(NULL)
      a.log_ratio.should < REJECT_ALTERNATIVE_MAX
    end
  end

  describe "#classify" do
    it "should add unknown words to the dictionary before classification" do
      Tester.new('newword needs to pass heuristics').classify
      # Check both categories; the ALTERNATIVE count used to be asserted twice.
      Words['newword'][ALTERNATIVE].should == 0
      Words['newword'][NULL].should == 0
    end
  end

  describe "#classify_as!" do
    it "should increase the index counts of the classified words" do
      lambda {
        Tester.new('zero').classify_as!(NULL)
      }.should change { GreenMidgetRecords.find_by_key(Words['zero'].record_key(NULL)).value.to_f }.by(1)
    end

    it "should increment the learning examples count for all features" do
      FEATURES.each do |feature|
        lambda {
          Tester.new('zero').classify_as!(NULL)
        }.should change { GreenMidgetRecords.find_by_key(Examples[feature].record_key(NULL)).value.to_f }.by(1)
      end
    end

    it "should not add new records for known keys" do
      a = Tester.new 'stuff unknown sofar'
      lambda {
        a.classify_as! ALTERNATIVE
      }.should change { GreenMidgetRecords.count }.by(3)
      lambda {
        a.classify_as! ALTERNATIVE
      }.should_not change { GreenMidgetRecords.count }
    end
  end

  describe "#words" do
    it "should ignore words less than 3 characters" do
      Tester.new('is 2 ch').words.should == []
    end

    it "should break large character strings into chunks of 20 bytes" do
      Tester.new('s'*20 + '111').words.should == ['s'*20, '111']
    end

    it "should bring uppercase to lowcase" do
      Tester.new('HOWBIG').words.should == ['howbig']
    end

    it "should not consider parts of email address as individual words" do
      Tester.new('friend@soundcloud.com').words.should == []
    end

    it "should not consider parts of website url as individual words" do
      Tester.new('www.myguy.com http://weargeil.org').words.should == []
    end
  end

  # describe "extreme cases" do
  #   it "should fallback to training_examples_with_feature::any if there're no examples in the database for a particular feature" do
  #     # a new feature should be added with no examples and make sure the classifier won't break
  #     pending('todo')
  #   end
  #   it "throw an exception if no training examples were given, but it's asked for classification" do
  #     # if GreenMidgetRecords.count(ALTERNATIVE) or GreenMidgetRecords.count(NULL) is 0.0 => throw an exception
  #     pending('todo')
  #   end
  # end
  #
  # describe "#feature_present?" do
  #   it "should throw NoMethodError if a feature look-up method has not been implemented" do
  #     pending('')
  #   end
  # end
end
|
|
@@ -0,0 +1,85 @@
|
|
|
1
|
+
# Copyright (c) 2011, SoundCloud Ltd., Nikola Chochkov
|
|
2
|
+
require 'spec_helper'
|
|
3
|
+
|
|
4
|
+
describe GreenMidget::Examples do
  include GreenMidget

  # Start every example with an empty table and an empty record cache.
  before(:each) do
    GreenMidgetRecords.delete_all
    GreenMidgetRecords.class_variable_set("@@cache", {})
  end

  # Record key of the general feature's example counter for +category+.
  def general_examples_key(category)
    Examples.prefix + Examples::GENERAL_FEATURE_NAME + "::#{ category }_count"
  end

  describe "#[]()" do
    before do
      @call_any = lambda { Examples[Examples::GENERAL_FEATURE_NAME] }
    end

    it "should return the general feature examples if passed a (new) feature key that has no examples yet" do
      [ NULL, ALTERNATIVE ].each do |category|
        GreenMidgetRecords.create(:key => general_examples_key(category), :value => 1000)
      end
      missing_record = GreenMidgetRecords.find_by_key(Examples.prefix + "new::#{ NULL }_count")
      missing_record.should == nil
      GreenMidgetRecords.fetch_all
      CATEGORIES.each do |category|
        Examples['new'][category].should == Examples[Examples::GENERAL_FEATURE_NAME][category]
      end
    end

    it "should return the feature's own example counts if these exist" do
      [ [ NULL, 3 ], [ ALTERNATIVE, 1 ] ].each do |category, count|
        GreenMidgetRecords.create(:key => Examples.prefix + "new::#{ category }_count", :value => count)
      end
      Examples['new'][NULL].should == 3
    end

    it "should throw an error if the general feature examples isn't found" do
      @call_any.should raise_error
    end

    it "should throw an error if the general feature examples has a zero spam_count and ham_count" do
      GreenMidgetRecords.create(:key => general_examples_key(NULL))
      @call_any.should raise_error
    end

    it "should throw an error if the general feature examples has a zero spam_count or ham_count" do
      GreenMidgetRecords.create(:key => general_examples_key(NULL), :value => 0)
      @call_any.should raise_error
    end

    it "should not throw an error if both columns are positive" do
      [ [ NULL, 2 ], [ ALTERNATIVE, 1 ] ].each do |category, count|
        GreenMidgetRecords.create(:key => general_examples_key(category), :value => count)
      end
      @call_any.should_not raise_error
    end
  end

  describe "#probability_for" do
    it "should return the probability of a feature falling into category as: Examples[feature][category] / (Examples[feature][ALTERNATIVE] + Examples[feature][NULL])" do
      [ [ NULL, 1000 ], [ ALTERNATIVE, 150 ] ].each do |category, count|
        GreenMidgetRecords.create(:key => Examples['url_in_text'].record_key(category), :value => count)
      end
      Examples['url_in_text'].probability_for(ALTERNATIVE).should == 150.0/(1000 + 150)
    end
  end

  describe "#no_examples?" do
    # Both counters exist but carry no explicit value.
    before(:each) do
      [ ALTERNATIVE, NULL ].each do |category|
        GreenMidgetRecords.create(:key => Examples['url_in_text'].record_key(category))
      end
      @object = Examples['url_in_text']
    end

    it "should return true if spam_count and ham_count are zero" do
      @object.no_examples?.should be_true
    end

    it "should return true if spam_count or ham_count are zero" do
      null_record = GreenMidgetRecords.find_by_key(@object.record_key(NULL))
      null_record.update_attribute(:value, 1)
      @object.no_examples?.should be_true
    end

    it "should should return false if both spam_count and ham_count are positive" do
      [ NULL, ALTERNATIVE ].each do |category|
        GreenMidgetRecords.find_by_key(@object.record_key(category)).update_attribute(:value, 1)
      end
      @object.no_examples?.should be_false
    end
  end
end
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
# Copyright (c) 2011, SoundCloud Ltd., Nikola Chochkov
|
|
2
|
+
require 'spec_helper'
|
|
3
|
+
|
|
4
|
+
describe GreenMidget::Features do
  include GreenMidget

  # Start every example with an empty table and an empty record cache.
  before(:each) do
    GreenMidgetRecords.delete_all
    GreenMidgetRecords.class_variable_set("@@cache", {})
  end

  describe "#probability_for" do
    it "should return Feature[feature] / Examples[feature]" do
      # [ countable, category, stored count ]
      [
        [ Features, NULL,        20   ],
        [ Features, ALTERNATIVE, 10   ],
        [ Examples, NULL,        100  ],
        [ Examples, ALTERNATIVE, 1000 ]
      ].each do |countable, category, count|
        GreenMidgetRecords.create(:key => countable['url_in_text'].record_key(category), :value => count)
      end

      feature = Features['url_in_text']
      feature.probability_for(NULL).should == 20.0/100
      Features['url_in_text'].probability_for(ALTERNATIVE).should == 10.0/1000
    end
  end
end
|
|
@@ -0,0 +1,80 @@
|
|
|
1
|
+
# Copyright (c) 2011, SoundCloud Ltd., Nikola Chochkov
|
|
2
|
+
require 'spec_helper'
|
|
3
|
+
|
|
4
|
+
describe GreenMidget::GreenMidgetRecords do
  include GreenMidget

  before(:each) do
    GreenMidgetRecords.delete_all
  end

  describe "#[]()" do
    it "should take words from data store if not found in the cache" do
      word_key, phrase_key = [ 'word', 'phrase' ].map { |w| Words[w].record_key(NULL) }
      GreenMidgetRecords.fetch_all([ 'word' ])
      GreenMidgetRecords.create(:key => phrase_key)
      GreenMidgetRecords.find_by_key(word_key).should == nil
      GreenMidgetRecords.find_by_key(phrase_key).should_not == nil
      GreenMidgetRecords[phrase_key].should == ''
    end

    it "should add a {key => ''} to the cache if key not found in cache and in the data store" do
      key = Words['nonexisting'].record_key(NULL)
      GreenMidgetRecords[key].should == ''
      GreenMidgetRecords.find_by_key(key).should == nil
    end
  end

  describe "#fetch_all" do
    it "should empty cache before fetching" do
      bar_key = Words['bar'].record_key(ALTERNATIVE)
      GreenMidgetRecords.fetch_all([ 'foo', 'bar' ])
      GreenMidgetRecords.class_variable_get("@@cache").key?(bar_key).should be_true
      GreenMidgetRecords.fetch_all([ 'foo', 'newbar' ])
      GreenMidgetRecords.class_variable_get("@@cache").key?(bar_key).should be_false
    end

    it "does a multi get on all words and keys" do
      cache = GreenMidgetRecords.fetch_all([ 'foo', 'bar' ])
      # `x.should.eql? y` is not an expectation and always passes, so
      # assert that every requested record key is in the returned cache.
      Words.record_keys([ 'foo', 'bar' ]).each do |key|
        cache.key?(key).should be_true
      end
    end

    it "should fetch the system keys along with the given words" do
      key = Examples.prefix + Examples::GENERAL_FEATURE_NAME + "::#{ NULL }_count"
      GreenMidgetRecords.create(:key => key)
      GreenMidgetRecords.fetch_all([])
      cache = GreenMidgetRecords.class_variable_get("@@cache")
      cache.key?(key).should be_true
      cache.count.should == 1
    end

    it "words with zero examples or no record in the database should be present in the cache" do
      GreenMidgetRecords.create(:key => Words['kotoba'].record_key(NULL))
      GreenMidgetRecords.fetch_all(['kotoba'])
      GreenMidgetRecords.class_variable_get("@@cache").key?(Words['kotoba'].record_key(ALTERNATIVE)).should be_true
      GreenMidgetRecords.create(:key => Words['mouichidou'].record_key(NULL), :value => 0)
      GreenMidgetRecords.create(:key => Words['mouichidou'].record_key(ALTERNATIVE), :value => 3)
      GreenMidgetRecords.fetch_all(['mouichidou'])
      GreenMidgetRecords.class_variable_get("@@cache")[Words['mouichidou'].record_key(NULL)].should_not == nil
      GreenMidgetRecords.class_variable_get("@@cache")[Words['mouichidou'].record_key(ALTERNATIVE)].should_not == nil
    end

    it "the cache should be a hash; its keys should be strings" do
      GreenMidgetRecords.create(:key => Examples.prefix + Examples::GENERAL_FEATURE_NAME + "::#{ NULL }_count")
      GreenMidgetRecords.create(:key => Features.prefix + "url_in_text::#{ NULL }_count")
      GreenMidgetRecords.fetch_all([])
      cache = GreenMidgetRecords.class_variable_get("@@cache")
      # Real matchers instead of `.should.eql?`, which never fails.
      cache.should be_a_kind_of(Hash)
      cache.count.should == 2
      cache.keys.each do |key|
        key.should be_a_kind_of(String)
      end
    end
  end

  describe "#increment" do
    it "should increment the stored count for the given key by one" do
      record_key = Words['stuff'].record_key(NULL)
      GreenMidgetRecords.create(:key => record_key)

      lambda {
        GreenMidgetRecords.increment(record_key)
      }.should change { GreenMidgetRecords.find_by_key(record_key).value.to_f }.by(1)
    end
  end
end
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
# Copyright (c) 2011, SoundCloud Ltd., Nikola Chochkov
|
|
2
|
+
require 'spec_helper'
|
|
3
|
+
|
|
4
|
+
# Mixes GreenMidget into the top-level object so that UrlDetection can be
# referenced without its namespace below.
include GreenMidget

describe UrlDetection do
  it 'should not detect a url' do
    detector = UrlDetection.new('not a url')
    detector.any?.should_not be_true
  end

  it 'should detect a url' do
    detector = UrlDetection.new('http://foo.de/')
    detector.any?.should be_true
  end
end
|
data/spec/spec_helper.rb
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
# Copyright (c) 2011, SoundCloud Ltd., Nikola Chochkov
|
|
2
|
+
# Make both the library (../lib) and the spec directory itself requirable.
spec_dir = File.dirname(__FILE__)
$LOAD_PATH.unshift(File.join(spec_dir, '..', 'lib'))
$LOAD_PATH.unshift(spec_dir)

require 'rspec'
require 'green_midget'

# Pull in every support file (custom matchers, macros, ...) found under
# ./support/ and its subdirectories.
Dir["#{spec_dir}/support/**/*.rb"].each do |support_file|
  require support_file
end

# No custom RSpec configuration yet.
RSpec.configure do |config|

end
|
data/spec/tester.rb
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
# Copyright (c) 2011, SoundCloud Ltd., Nikola Chochkov
|
|
2
|
+
# Minimal GreenMidget::Base subclass for the specs: it classifies a plain
# string held in +text+.
class Tester < GreenMidget::Base
  attr_accessor :text

  def initialize(text = '')
    @text = text
  end

  # Re-declared so the specs can call it directly.
  # NOTE(review): presumably Base#words is not public - confirm.
  def words
    super
  end

  # Reloads the record cache for this text's words, then defers to
  # Base#log_ratio.
  def log_ratio
    GreenMidgetRecords.fetch_all(words)
    super
  end
end
|
data/spec/words_spec.rb
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
# Copyright (c) 2011, SoundCloud Ltd., Nikola Chochkov
|
|
2
|
+
require 'spec_helper'
|
|
3
|
+
|
|
4
|
+
describe GreenMidget::Words do
  include GreenMidget

  # Every example starts with an empty records table.
  before(:each) { GreenMidgetRecords.delete_all }

  describe "self.record_keys" do
    it "takes an array of words and optionally a category, returns an array of corresponding record keys wrt category" do
      null_key        = "#{ Words.prefix }one::#{ NULL }_count"
      alternative_key = "#{ Words.prefix }one::#{ ALTERNATIVE }_count"

      Words.record_keys([ 'one' ]).should == [ null_key, alternative_key ]
      Words.record_keys([ 'one' ], NULL).should == [ null_key ]
    end
  end

  describe "#probability_for" do
    it "should return the smoother constant if the word has zero examples" do
      alternative_key = Words['word'].record_key(ALTERNATIVE)
      GreenMidgetRecords[alternative_key].should == ''

      smoother = 1.0 / Examples.total
      Words['word'].probability_for(ALTERNATIVE).should == smoother
    end
  end
end
|