green_midget 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,49 @@
1
+ # Copyright (c) 2011, SoundCloud Ltd., Nikola Chochkov
2
module GreenMidget
  # Rows of the +green_midget_records+ table (a +key+ / +value+ pair each),
  # fronted by a class-level cache (+@@cache+, a Hash of key => stored value).
  class GreenMidgetRecords < ActiveRecord::Base
    set_table_name :green_midget_records

    # Rebuilds the cache from the store with a single query.
    #
    # Fetches the rows whose key is one of the record keys of +words+ (as
    # built by Words.record_keys) plus every row whose key starts with
    # Features.prefix or Examples.prefix. Word keys without a truthy stored
    # value are put into the cache with an empty-string value.
    #
    # words - Array of words (default: []).
    #
    # Returns the freshly built cache Hash.
    def self.fetch_all(words = [])
      words_keys = Words.record_keys(words)

      # Keys are built from caller-supplied words, so every value is quoted by
      # the connection instead of being spliced into the SQL verbatim.
      conditions = []
      unless words_keys.empty?
        quoted_keys = words_keys.map { |key| connection.quote(key) }.join(', ')
        conditions << "`key` IN (#{ quoted_keys })"
      end
      [ Features.prefix, Examples.prefix ].each do |prefix|
        conditions << "`key` LIKE #{ connection.quote("#{ prefix }%") }"
      end

      pairs = connection.select_rows(
        "SELECT `key`, `value` FROM #{ table_name } WHERE #{ conditions.join(' OR ') }"
      )

      cache = {}
      pairs.each { |pair| cache[pair.first] = pair.last }
      # Requested words always get an entry, so later lookups do not hit the store.
      words_keys.each { |key| cache[key] ||= '' }

      @@cache = cache
    end

    # Looks up the value stored under +key+.
    #
    # Served from the cache when a truthy entry exists; otherwise the store is
    # queried and the result (or '' when no value is found) is cached.
    #
    # key - anything responding to to_s.
    #
    # Returns the cached or stored value, or '' when there is none.
    def self.[](key)
      key = key.to_s
      @@cache ||= {}
      @@cache[key] ||= connection.select_value(
        "SELECT `value` FROM #{ table_name } WHERE `key` = #{ connection.quote(key) }"
      ) || ''
    end

    # Adds 1 to the stored value of every given key, creating records (starting
    # from '0.0') for keys that have none yet.
    #
    # keys - a single key or an Array of keys.
    #
    # NOTE: +@@objects+ is class-level scratch state that is reset at the end
    # of every call; the cache used by [] and fetch_all is not touched here.
    def self.increment(keys)
      keys = Array(keys)
      records = all(:conditions => [ "`key` IN (?)", keys ])

      @@objects = {}
      records.each { |record| @@objects[record.key] = record }
      keys.each { |key| @@objects[key] ||= new(:key => key, :value => '0.0') }

      @@objects.each_value { |record| record.update_attribute(:value, record.value.to_f + 1) }
      @@objects = {}
    end
  end
end
@@ -0,0 +1,21 @@
1
+ # Copyright (c) 2011, SoundCloud Ltd., Nikola Chochkov
2
module GreenMidget
  # Countable whose record keys carry the 'word::' prefix.
  class Words < Countable
    self.prefix = 'word::'

    # Builds the record keys for a list of words.
    #
    # words    - Array of words.
    # category - optional category (or Array of categories); when nil every
    #            entry of GreenMidget::CATEGORIES is used.
    #
    # Returns a flat Array of record keys, grouped word by word.
    def self.record_keys(words, category = nil)
      categories = Array(category || GreenMidget::CATEGORIES)

      keys_per_word = words.map do |word|
        categories.map { |current| Words[word].record_key(current) }
      end
      keys_per_word.flatten
    end

    # Probability figure of this word for +category+.
    #
    # Returns 1.0 / Examples.total when the word's count for the category
    # equals 0.0, otherwise the count divided by Examples.send(category).
    def probability_for(category)
      count = self[category]
      return 1.0 / Examples.total if count == 0.0

      count / Examples.send(category)
    end
  end
end
@@ -0,0 +1,24 @@
1
+ # Copyright (c) 2011, SoundCloud Ltd., Nikola Chochkov
2
module GreenMidget
  # Finds URLs in a text (via GreenMidget::URL_REGEX) and reports whether any
  # of them is not covered by GreenMidget::TOLERATED_URLS.
  class UrlDetection
    # text - the text to inspect; nil is treated as an empty text.
    def initialize(text)
      @text = text.to_s
    end

    # Returns true when the text holds at least one URL that is not tolerated,
    # false otherwise.
    def any?
      non_tolerated_urls.any?
    end

    private

    # Every URL_REGEX match in the text. scan yields one array per match when
    # the pattern has groups, with nil for groups that did not take part,
    # hence flatten + compact.
    def urls
      @text.scan(GreenMidget::URL_REGEX).flatten.compact
    end

    # The detected URLs minus those whose lowercased form matches TOLERATED_URLS.
    def non_tolerated_urls
      urls.reject { |url| url.to_s.downcase =~ GreenMidget::TOLERATED_URLS }
    end
  end
end
@@ -0,0 +1,3 @@
1
module GreenMidget
  # Version of the green_midget gem; frozen so the shared constant cannot be
  # mutated in place by accident.
  VERSION = '0.0.1'.freeze
end
@@ -0,0 +1,16 @@
1
+ # Copyright (c) 2011, SoundCloud Ltd., Nikola Chochkov
2
+ require 'active_record'
3
+ require 'green_midget/green_midget'
4
+ require 'green_midget/base'
5
+ require 'green_midget/models/countable'
6
+ require 'green_midget/models/examples'
7
+ require 'green_midget/models/features'
8
+ require 'green_midget/models/green_midget_records'
9
+ require 'green_midget/models/words'
10
+ require 'extensions/green_midget_check'
11
+
12
# Load the rake tasks shipped with the gem when it is running as an activated gem.
# Gem.searcher was deprecated in RubyGems 1.8 and is not available in current
# releases, so the specification is taken from Gem.loaded_specs instead.
if (classifier = Gem.loaded_specs['green_midget'])
  path = classifier.full_gem_path
  Dir["#{path}/lib/tasks/*.rake"].each { |ext| load ext }
end
16
+
@@ -0,0 +1,32 @@
1
+ # Copyright (c) 2011, SoundCloud Ltd., Nikola Chochkov
2
+ require 'fileutils'
3
+ require 'rake'
4
+ require File.join(File.dirname(__FILE__), '..', '..', 'db', 'migrate', 'create_green_midget_records')
5
+
6
namespace :green_midget do
  # Creates the green_midget_records table when it is missing, then makes sure
  # a record exists for every feature and example counter key listed below.
  # Existing records are left untouched, so the task can be re-run safely.
  desc "prepare this project for a world without spam"
  task :setup => :environment do
    include GreenMidget

    CreateGreenMidgetRecords.up unless GreenMidgetRecords.table_exists?

    keys = ["url_in_text", "email_in_text"].map do |feature|
      [ Features[feature].record_key(ALTERNATIVE), Features[feature].record_key(NULL) ]
    end.flatten

    keys += [Examples::GENERAL_FEATURE_NAME, "url_in_text", "email_in_text"].map do |feature|
      [ Examples[feature].record_key(ALTERNATIVE), Examples[feature].record_key(NULL) ]
    end.flatten

    puts '== Creating records ==='
    keys.each do |key|
      next if GreenMidgetRecords.find_by_key(key)

      # create expects an attributes hash; a bare String does not set the key column.
      GreenMidgetRecords.create(:key => key)
      puts "-- Created #{key}"
    end
    puts '== Done ==='
  end
end
data/spec/base_spec.rb ADDED
@@ -0,0 +1,163 @@
1
+ # Copyright (c) 2011, SoundCloud Ltd., Nikola Chochkov
2
+ require 'spec_helper'
3
+ require File.join(File.dirname(__FILE__), 'tester')
4
+
5
+ describe GreenMidget::Base do
6
+ include GreenMidget
7
+
8
# Reset the table and seed the word, feature and example counters used by
# every example in this file.
before(:each) do
  GreenMidgetRecords.delete_all

  # Each row: key prefix, name, ALTERNATIVE count, NULL count.
  [
    [ Words.prefix,    'this',          701.0,  11.0   ],
    [ Words.prefix,    'test',          9.0,    71.0   ],
    [ Words.prefix,    'goes',          90.0,   90.0   ],
    [ Words.prefix,    'rid',           311.0,  290.0  ],
    [ Words.prefix,    'dirty',         222.0,  45.0   ],
    [ Words.prefix,    'spam',          11.0,   133.0  ],
    [ Words.prefix,    'words',         6.0,    811.0  ],
    [ Words.prefix,    'zero',          0.0,    0.0    ],
    [ Features.prefix, 'url_in_text',   440.0,  40.0   ],
    [ Features.prefix, 'email_in_text', 112.0,  9.0    ],
    [ Examples.prefix, 'any',           1000.0, 1000.0 ],
    [ Examples.prefix, 'url_in_text',   1000.0, 1000.0 ],
    [ Examples.prefix, 'email_in_text', 1000.0, 1000.0 ],
  ].each do |prefix, name, alternative_count, null_count|
    [ [ ALTERNATIVE, alternative_count ], [ NULL, null_count ] ].each do |category, count|
      GreenMidgetRecords.create(:key => "#{ prefix }#{ name }::#{ category }_count", :value => count)
    end
  end
end
41
+
42
describe "#log_ratio" do
  # Log ratio of a fresh Tester built from +text+.
  def ratio_for(text)
    Tester.new(text).log_ratio
  end

  it "should be for 'this words'" do
    this_term  = Math::log((701.0/1000) / (11.0/1000))
    words_term = Math::log((6.0/1000) / (811.0/1000))
    prior_term = Math::log((1000.0/2000) / (1000.0/2000))

    ratio_for('this words').should == this_term + words_term + prior_term
  end

  it "should be smaller for a smaller number of spammy words" do
    ratio_for('this dirty test').should > ratio_for('this test')
  end

  it "considers 'test goes words' ham" do
    ratio_for('test goes words').should < REJECT_ALTERNATIVE_MAX
  end

  it "considers 'rid goes dirty' spam" do
    ratio_for('rid goes dirty').should >= ACCEPT_ALTERNATIVE_MIN
  end

  it "doesn't know whether 'zero goes rid' is spam or not" do
    ratio = ratio_for('zero goes rid')
    ratio.between?(REJECT_ALTERNATIVE_MAX, ACCEPT_ALTERNATIVE_MIN).should be_true
  end

  it "thinks of 'test boss@offshore.com' as more spam than just 'test'" do
    ratio_for('test boss@offshore.com').should > ratio_for('test')
  end

  it "thinks of 'test www.offshore.com' as more spam than just 'test'" do
    ratio_for('test www.offshore.com').should > ratio_for('test')
  end

  it "will tolerate urls coming from known sites" do
    ratio_for('test www.offshore.com').should > ratio_for('test www.soundcloud.com')
  end

  it "should say DUNNO if it doesnt have neither ALTERNATIVE nor NULL score for a message" do
    ratio = ratio_for('zero newword heuristicspass')
    ratio.between?(REJECT_ALTERNATIVE_MAX, ACCEPT_ALTERNATIVE_MIN).should be_true
  end

  it "should say ALTERNATIVE if it has spam score for a message and doesn't have ham score for it" do
    tester = Tester.new('nosuchword nowordsuch heuristicspass')
    # Undecided before any training ...
    tester.log_ratio.between?(REJECT_ALTERNATIVE_MAX, ACCEPT_ALTERNATIVE_MIN).should be_true
    # ... and accepted once the text was learned as ALTERNATIVE.
    tester.classify_as!(ALTERNATIVE)
    tester.log_ratio.should >= ACCEPT_ALTERNATIVE_MIN
  end

  it "should say NULL if it has ham score for a message and doesn't have spam score for it" do
    tester = Tester.new('suchwordno nowordsuch heuristicspasss')
    # Undecided before any training ...
    tester.log_ratio.between?(REJECT_ALTERNATIVE_MAX, ACCEPT_ALTERNATIVE_MIN).should be_true
    # ... and rejected once the text was learned as NULL.
    tester.classify_as!(NULL)
    tester.log_ratio.should < REJECT_ALTERNATIVE_MAX
  end
end
96
+
97
+ describe "#classify" do
98
+ it "should add unknown words to the dictionary before classification" do
99
+ Tester.new('newword needs to pass heuristics').classify
100
+ Words['newword'][ALTERNATIVE].should == 0
101
+ Words['newword'][ALTERNATIVE].should == 0
102
+ end
103
+ end
104
+
105
describe "#classify_as!" do
  it "should increase the index counts of the classified words" do
    learn_zero = lambda { Tester.new('zero').classify_as!(NULL) }

    learn_zero.should change {
      GreenMidgetRecords.find_by_key(Words['zero'].record_key(NULL)).value.to_f
    }.by(1)
  end

  it "should increment the learning examples count for all features" do
    FEATURES.each do |feature|
      learn_zero = lambda { Tester.new('zero').classify_as!(NULL) }

      learn_zero.should change {
        GreenMidgetRecords.find_by_key(Examples[feature].record_key(NULL)).value.to_f
      }.by(1)
    end
  end

  it "should not add new records for known keys" do
    tester = Tester.new 'stuff unknown sofar'
    learn  = lambda { tester.classify_as! ALTERNATIVE }

    # First pass stores the three unknown words, the second pass reuses them.
    learn.should change { GreenMidgetRecords.count }.by(3)
    learn.should_not change { GreenMidgetRecords.count }
  end
end
128
+
129
describe "#words" do
  # Each row: example description, input text, expected result of #words.
  [
    [ "should ignore words less than 3 characters",
      'is 2 ch', [] ],
    [ "should break large character strings into chunks of 20 bytes",
      's'*20 + '111', ['s'*20, '111'] ],
    [ "should bring uppercase to lowcase",
      'HOWBIG', ['howbig'] ],
    [ "should not consider parts of email address as individual words",
      'friend@soundcloud.com', [] ],
    [ "should not consider parts of website url as individual words",
      'www.myguy.com http://weargeil.org', [] ],
  ].each do |behaviour, text, expected_words|
    it behaviour do
      Tester.new(text).words.should == expected_words
    end
  end
end
146
+
147
+ # describe "extreme cases" do
148
+ # it "should fallback to training_examples_with_feature::any if there're no examples in the database for a particular feature" do
149
+ # # a new feature should be added with no examples and make sure the classifier won't break
150
+ # pending('todo')
151
+ # end
152
+ # it "throw an exception if no training examples were given, but it's asked for classification" do
153
+ # # if GreenMidgetRecords.count(ALTERNATIVE) or GreenMidgetRecords.count(NULL) is 0.0 => throw an exception
154
+ # pending('todo')
155
+ # end
156
+ # end
157
+ #
158
+ # describe "#feature_present?" do
159
+ # it "should throw NoMethodError if a feature look-up method has not been implemented" do
160
+ # pending('')
161
+ # end
162
+ # end
163
+ end
@@ -0,0 +1,85 @@
1
+ # Copyright (c) 2011, SoundCloud Ltd., Nikola Chochkov
2
+ require 'spec_helper'
3
+
4
describe GreenMidget::Examples do
  include GreenMidget

  # Record key of the general-feature example counter for a category.
  general_key = lambda do |category|
    "#{ Examples.prefix }#{ Examples::GENERAL_FEATURE_NAME }::#{ category }_count"
  end

  # Record key of the example counter of the 'new' feature for a category.
  new_feature_key = lambda do |category|
    "#{ Examples.prefix }new::#{ category }_count"
  end

  # Every example starts from an empty table and an empty record cache.
  before(:each) do
    GreenMidgetRecords.delete_all
    GreenMidgetRecords.class_variable_set("@@cache", {})
  end

  describe "#[]()" do
    before do
      @call_any = lambda { Examples[Examples::GENERAL_FEATURE_NAME] }
    end

    it "should return the general feature examples if passed a (new) feature key that has no examples yet" do
      GreenMidgetRecords.create(:key => general_key.call(NULL), :value => 1000)
      GreenMidgetRecords.create(:key => general_key.call(ALTERNATIVE), :value => 1000)
      GreenMidgetRecords.find_by_key(new_feature_key.call(NULL)).should == nil
      GreenMidgetRecords.fetch_all

      CATEGORIES.each do |category|
        Examples['new'][category].should == Examples[Examples::GENERAL_FEATURE_NAME][category]
      end
    end

    it "should return the feature's own example counts if these exist" do
      GreenMidgetRecords.create(:key => new_feature_key.call(NULL), :value => 3)
      GreenMidgetRecords.create(:key => new_feature_key.call(ALTERNATIVE), :value => 1)

      Examples['new'][NULL].should == 3
    end

    it "should throw an error if the general feature examples isn't found" do
      @call_any.should raise_error
    end

    it "should throw an error if the general feature examples has a zero spam_count and ham_count" do
      GreenMidgetRecords.create(:key => general_key.call(NULL))

      @call_any.should raise_error
    end

    it "should throw an error if the general feature examples has a zero spam_count or ham_count" do
      GreenMidgetRecords.create(:key => general_key.call(NULL), :value => 0)

      @call_any.should raise_error
    end

    it "should not throw an error if both columns are positive" do
      GreenMidgetRecords.create(:key => general_key.call(NULL), :value => 2)
      GreenMidgetRecords.create(:key => general_key.call(ALTERNATIVE), :value => 1)

      @call_any.should_not raise_error
    end
  end

  describe "#probability_for" do
    it "should return the probability of a feature falling into category as: Examples[feature][category] / (Examples[feature][ALTERNATIVE] + Examples[feature][NULL])" do
      GreenMidgetRecords.create(:key => Examples['url_in_text'].record_key(NULL), :value => 1000)
      GreenMidgetRecords.create(:key => Examples['url_in_text'].record_key(ALTERNATIVE), :value => 150 )

      probability = Examples['url_in_text'].probability_for(ALTERNATIVE)
      probability.should == 150.0/(1000 + 150)
    end
  end

  describe "#no_examples?" do
    # Both counters exist but carry no explicit value when an example starts.
    before(:each) do
      GreenMidgetRecords.create(:key => Examples['url_in_text'].record_key(ALTERNATIVE))
      GreenMidgetRecords.create(:key => Examples['url_in_text'].record_key(NULL))
      @object = Examples['url_in_text']
    end

    it "should return true if spam_count and ham_count are zero" do
      @object.no_examples?.should be_true
    end

    it "should return true if spam_count or ham_count are zero" do
      null_record = GreenMidgetRecords.find_by_key(@object.record_key(NULL))
      null_record.update_attribute(:value, 1)

      @object.no_examples?.should be_true
    end

    it "should should return false if both spam_count and ham_count are positive" do
      null_record = GreenMidgetRecords.find_by_key(@object.record_key(NULL))
      null_record.update_attribute(:value, 1)
      alternative_record = GreenMidgetRecords.find_by_key(@object.record_key(ALTERNATIVE))
      alternative_record.update_attribute(:value, 1)

      @object.no_examples?.should be_false
    end
  end
end
@@ -0,0 +1,24 @@
1
+ # Copyright (c) 2011, SoundCloud Ltd., Nikola Chochkov
2
+ require 'spec_helper'
3
+
4
describe GreenMidget::Features do
  include GreenMidget

  # Every example starts from an empty table and an empty record cache.
  before(:each) do
    GreenMidgetRecords.delete_all
    GreenMidgetRecords.class_variable_set("@@cache", {})
  end

  describe "#probability_for" do
    it "should return Feature[feature] / Examples[feature]" do
      # Each row: countable class, category, stored count.
      [
        [ Features, NULL,        20   ],
        [ Features, ALTERNATIVE, 10   ],
        [ Examples, NULL,        100  ],
        [ Examples, ALTERNATIVE, 1000 ],
      ].each do |countable, category, count|
        GreenMidgetRecords.create(:key => countable['url_in_text'].record_key(category), :value => count)
      end

      Features['url_in_text'].probability_for(NULL).should == 20.0/100
      Features['url_in_text'].probability_for(ALTERNATIVE).should == 10.0/1000
    end
  end
end
@@ -0,0 +1,80 @@
1
+ # Copyright (c) 2011, SoundCloud Ltd., Nikola Chochkov
2
+ require 'spec_helper'
3
+
4
describe GreenMidget::GreenMidgetRecords do
  include GreenMidget

  # Every example starts from an empty table.
  before(:each) do
    GreenMidgetRecords.delete_all
  end

  describe "#[]()" do
    it "should take words from data store if not found in the cache" do
      word_key, phrase_key = [ 'word', 'phrase' ].map { |w| Words[w].record_key(NULL) }
      GreenMidgetRecords.fetch_all([ 'word' ])
      GreenMidgetRecords.create(:key => phrase_key)
      GreenMidgetRecords.find_by_key(word_key).should == nil
      GreenMidgetRecords.find_by_key(phrase_key).should_not == nil
      GreenMidgetRecords[phrase_key].should == ''
    end

    it "should add a {key => ''} to the cache if key not found in cache and in the data store" do
      key = Words['nonexisting'].record_key(NULL)
      GreenMidgetRecords[key].should == ''
      GreenMidgetRecords.find_by_key(key).should == nil
    end
  end

  describe "#fetch_all" do
    it "should empty cache before fetching" do
      bar_key = Words['bar'].record_key(ALTERNATIVE)
      GreenMidgetRecords.fetch_all([ 'foo', 'bar' ])
      GreenMidgetRecords.class_variable_get("@@cache").key?(bar_key).should be_true
      GreenMidgetRecords.fetch_all([ 'foo', 'newbar' ])
      GreenMidgetRecords.class_variable_get("@@cache").key?(bar_key).should be_false
    end

    it "does a multi get on all words and keys" do
      cache = GreenMidgetRecords.fetch_all([ 'foo', 'bar' ])
      # The former `cache['foo'].should.eql? ...` never invoked a matcher and so
      # asserted nothing. fetch_all returns the cache keyed by record key, with
      # an entry for every requested word, so that is what gets checked.
      Words.record_keys([ 'foo', 'bar' ]).each do |key|
        cache.key?(key).should be_true
      end
    end

    it "should fetch the system keys along with the given words" do
      key = Examples.prefix + Examples::GENERAL_FEATURE_NAME + "::#{ NULL }_count"
      GreenMidgetRecords.create(:key => key)
      GreenMidgetRecords.fetch_all([])
      cache = GreenMidgetRecords.class_variable_get("@@cache")
      cache.key?(key).should be_true
      cache.count.should == 1
    end

    it "words with zero examples or no record in the database should be present in the cache" do
      GreenMidgetRecords.create(:key => Words['kotoba'].record_key(NULL))
      GreenMidgetRecords.fetch_all(['kotoba'])
      GreenMidgetRecords.class_variable_get("@@cache").key?(Words['kotoba'].record_key(ALTERNATIVE)).should be_true
      GreenMidgetRecords.create(:key => Words['mouichidou'].record_key(NULL), :value => 0)
      GreenMidgetRecords.create(:key => Words['mouichidou'].record_key(ALTERNATIVE), :value => 3)
      GreenMidgetRecords.fetch_all(['mouichidou'])
      GreenMidgetRecords.class_variable_get("@@cache")[Words['mouichidou'].record_key(NULL)].should_not == nil
      GreenMidgetRecords.class_variable_get("@@cache")[Words['mouichidou'].record_key(ALTERNATIVE)].should_not == nil
    end

    it "the cache should be a hash; its keys should be strings" do
      GreenMidgetRecords.create(:key => Examples.prefix + Examples::GENERAL_FEATURE_NAME + "::#{ NULL }_count")
      GreenMidgetRecords.create(:key => Features.prefix + "url_in_text::#{ NULL }_count")
      GreenMidgetRecords.fetch_all([])
      cache = GreenMidgetRecords.class_variable_get("@@cache")
      # `x.should.eql? y` only called Object#eql? on the matcher proxy and could
      # never fail; compare the classes with a real operator matcher instead.
      cache.class.should == Hash
      cache.count.should == 2
      cache.keys.each do |key|
        key.class.should == String
      end
    end
  end

  describe "#increment" do
    it "should increment counts first in cache and write! to store only if explicitly called" do
      record_key = Words['stuff'].record_key(NULL)
      GreenMidgetRecords.create(:key => record_key)

      lambda {
        GreenMidgetRecords.increment(record_key)
      }.should change { GreenMidgetRecords.find_by_key(record_key).value.to_f }.by(1)
    end
  end
end
@@ -0,0 +1,14 @@
1
+ # Copyright (c) 2011, SoundCloud Ltd., Nikola Chochkov
2
+ require 'spec_helper'
3
+
4
+ include GreenMidget
5
+
6
describe UrlDetection do
  it 'should not detect a url' do
    detector = UrlDetection.new('not a url')
    detector.any?.should_not be_true
  end

  it 'should detect a url' do
    detector = UrlDetection.new('http://foo.de/')
    detector.any?.should be_true
  end
end
@@ -0,0 +1,13 @@
1
+ # Copyright (c) 2011, SoundCloud Ltd., Nikola Chochkov
2
# Put lib/ and then spec/ itself at the front of the load path before the
# library under test is required.
spec_dir = File.dirname(__FILE__)
$LOAD_PATH.unshift(File.join(spec_dir, '..', 'lib'))
$LOAD_PATH.unshift(spec_dir)
require 'rspec'
require 'green_midget'

# Load every supporting file (custom matchers, macros, ...) found in
# ./support/ and its subdirectories.
Dir["#{spec_dir}/support/**/*.rb"].each do |support_file|
  require support_file
end

# No suite-wide RSpec settings are configured.
RSpec.configure do |config|
end
data/spec/tester.rb ADDED
@@ -0,0 +1,17 @@
1
+ # Copyright (c) 2011, SoundCloud Ltd., Nikola Chochkov
2
# GreenMidget::Base subclass used by the specs; it classifies a plain string
# held in +text+.
class Tester < GreenMidget::Base
  attr_accessor :text

  # text - the string to classify (default: '').
  def initialize(text = '')
    @text = text
  end

  # Delegates straight to the parent implementation.
  # NOTE(review): presumably re-declared to make Base#words callable from the
  # specs — confirm against GreenMidget::Base.
  def words
    super()
  end

  # Rebuilds the record cache for this text's words, then defers to the
  # parent implementation.
  def log_ratio
    GreenMidgetRecords.fetch_all(words)
    super()
  end
end
@@ -0,0 +1,24 @@
1
+ # Copyright (c) 2011, SoundCloud Ltd., Nikola Chochkov
2
+ require 'spec_helper'
3
+
4
describe GreenMidget::Words do
  include GreenMidget

  # Every example starts from an empty table.
  before(:each) { GreenMidgetRecords.delete_all }

  describe "self.record_keys" do
    it "takes an array of words and optionally a category, returns an array of corresponding record keys wrt category" do
      null_key        = "#{ Words.prefix }one::#{ NULL }_count"
      alternative_key = "#{ Words.prefix }one::#{ ALTERNATIVE }_count"

      Words.record_keys([ 'one' ]).should == [ null_key, alternative_key ]
      Words.record_keys([ 'one' ], NULL).should == [ null_key ]
    end
  end

  describe "#probability_for" do
    it "should return the smoother constant if the word has zero examples" do
      stored_value = GreenMidgetRecords[Words['word'].record_key(ALTERNATIVE)]
      stored_value.should == ''

      probability = Words['word'].probability_for(ALTERNATIVE)
      probability.should == (1.0 / Examples.total)
    end
  end
end