green_midget 0.0.3 → 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.travis.yml +6 -0
- data/Gemfile +13 -1
- data/README.md +26 -10
- data/Rakefile +6 -1
- data/benchmark/benchmark.rb +44 -0
- data/benchmark/test.rb +33 -0
- data/green_midget.gemspec +0 -8
- data/lib/green_midget.rb +14 -3
- data/lib/green_midget/base.rb +52 -53
- data/lib/green_midget/constants.rb +23 -21
- data/lib/green_midget/db/migrate/create_green_midget_records.rb +13 -11
- data/lib/green_midget/default_features.rb +40 -0
- data/lib/green_midget/errors/feature_method_not_implemented.rb +11 -0
- data/lib/green_midget/errors/no_examples_given.rb +9 -0
- data/lib/green_midget/errors/no_text_found.rb +10 -0
- data/lib/green_midget/heuristic_checks.rb +23 -0
- data/lib/green_midget/models/countable.rb +27 -11
- data/lib/green_midget/models/examples.rb +25 -12
- data/lib/green_midget/models/features.rb +11 -1
- data/lib/green_midget/models/records.rb +61 -0
- data/lib/green_midget/models/words.rb +4 -1
- data/lib/green_midget/url_detection.rb +2 -2
- data/lib/green_midget/version.rb +1 -1
- data/lib/tasks/green_midget.rake +10 -11
- data/spec/base_spec.rb +42 -34
- data/spec/examples_spec.rb +19 -19
- data/spec/features_spec.rb +6 -6
- data/spec/green_midget_records_spec.rb +38 -33
- data/spec/spec_helper.rb +10 -7
- data/spec/tester.rb +1 -1
- data/spec/words_spec.rb +2 -2
- metadata +14 -30
- data/.document +0 -5
- data/lib/green_midget/green_midget.rb +0 -6
- data/lib/green_midget/models/green_midget_records.rb +0 -49
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
# A mixin that implements features check and allows Base subclasses
|
|
2
|
+
# to define their own features for spam/ham detection.
|
|
3
|
+
#
|
|
4
|
+
# By default texts are checked for presence of external URL or email
|
|
5
|
+
# references. An example of an additional feature would be presence of particular
|
|
6
|
+
# words or expressions.
|
|
7
|
+
#
|
|
8
|
+
# See the example in `lib/green_midget/extensions/sample.rb`
|
|
9
|
+
#
|
|
10
|
+
module GreenMidget
|
|
11
|
+
module DefaultFeatures
|
|
12
|
+
|
|
13
|
+
private
|
|
14
|
+
|
|
15
|
+
def features
|
|
16
|
+
FEATURES
|
|
17
|
+
end
|
|
18
|
+
|
|
19
|
+
def present_features
|
|
20
|
+
features.select { |feature| feature_present?(feature) }
|
|
21
|
+
end
|
|
22
|
+
|
|
23
|
+
def feature_present?(feature)
|
|
24
|
+
method = :"#{feature}?"
|
|
25
|
+
if respond_to?(method, true)
|
|
26
|
+
send(method)
|
|
27
|
+
else
|
|
28
|
+
raise FeatureMethodNotImplemented.new(feature, method)
|
|
29
|
+
end
|
|
30
|
+
end
|
|
31
|
+
|
|
32
|
+
def url_in_text?
|
|
33
|
+
UrlDetection.new(text).any?
|
|
34
|
+
end
|
|
35
|
+
|
|
36
|
+
def email_in_text?
|
|
37
|
+
text.scan(EMAIL_REGEX).size > 0
|
|
38
|
+
end
|
|
39
|
+
end
|
|
40
|
+
end
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
module GreenMidget
|
|
2
|
+
class FeatureMethodNotImplemented < StandardError
|
|
3
|
+
def initialize(feature, method_name)
|
|
4
|
+
super <<-MSG
|
|
5
|
+
Method #{method_name.inspect} not found. Either implement it or
|
|
6
|
+
delete feature #{feature} from your features list.
|
|
7
|
+
MSG
|
|
8
|
+
end
|
|
9
|
+
end
|
|
10
|
+
end
|
|
11
|
+
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
# A mixin that implements heuristic checks for both categories.
|
|
2
|
+
# If there're some conditions under which a spammable object could
|
|
3
|
+
# directly be classified as one of the classification categories
|
|
4
|
+
# the logic could be implemented using heuristic checks in your subclasses
|
|
5
|
+
#
|
|
6
|
+
# See the example in `lib/green_midget/extensions/sample.rb`
|
|
7
|
+
#
|
|
8
|
+
module GreenMidget
|
|
9
|
+
module HeuristicChecks
|
|
10
|
+
|
|
11
|
+
private
|
|
12
|
+
|
|
13
|
+
def heuristic_checks
|
|
14
|
+
CATEGORIES.each do |category|
|
|
15
|
+
if respond_to?(:"pass_#{category}_heuristics?") && send(:"pass_#{category}_heuristics?")
|
|
16
|
+
classify_as!(category)
|
|
17
|
+
return RESPONSES[category]
|
|
18
|
+
end
|
|
19
|
+
end
|
|
20
|
+
return false
|
|
21
|
+
end
|
|
22
|
+
end
|
|
23
|
+
end
|
|
@@ -1,25 +1,41 @@
|
|
|
1
1
|
# Copyright (c) 2011, SoundCloud Ltd., Nikola Chochkov
|
|
2
|
+
#
|
|
3
|
+
# This is an abstraction from Words, Examples and Features. It provides common
|
|
4
|
+
# methods for building the record keys for individual countables in any
|
|
5
|
+
# category.
|
|
6
|
+
#
|
|
7
|
+
# For example the data record key for the word 'legit' in Spam category would
|
|
8
|
+
# be something like "word::legit::spam_count". The record key for a feature
|
|
9
|
+
# 'url_present' in Ham would be something like "feature::url_present::ham_count"
|
|
10
|
+
# The count of all training examples given for category Spam would be
|
|
11
|
+
# "example::any::spam_count"
|
|
12
|
+
#
|
|
13
|
+
# The example counts for individual features are stored as well. For example for
|
|
14
|
+
# 'url_present' we will have two records: "example::url_present::spam_count" and
|
|
15
|
+
# "example::url_present::ham_count". They will store the information about how
|
|
16
|
+
# much training the GreenMidget received for this feature in each category.
|
|
17
|
+
#
|
|
18
|
+
# This class is the link between countable and the Records data store adapter
|
|
19
|
+
#
|
|
2
20
|
module GreenMidget
|
|
3
21
|
class Countable
|
|
4
|
-
include Constants
|
|
5
22
|
attr_accessor :key
|
|
23
|
+
class_attribute :prefix
|
|
6
24
|
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
def self.[](key)
|
|
10
|
-
new(key)
|
|
25
|
+
def initialize(key)
|
|
26
|
+
@key = self.class.prefix + key
|
|
11
27
|
end
|
|
12
28
|
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
end
|
|
29
|
+
class << self
|
|
30
|
+
alias :[] :new
|
|
16
31
|
|
|
17
|
-
|
|
18
|
-
|
|
32
|
+
def objects(keys)
|
|
33
|
+
keys.map { |key| new(key) }
|
|
34
|
+
end
|
|
19
35
|
end
|
|
20
36
|
|
|
21
37
|
def [](category)
|
|
22
|
-
|
|
38
|
+
Records[record_key(category)].to_f
|
|
23
39
|
end
|
|
24
40
|
|
|
25
41
|
def log_ratio
|
|
@@ -1,25 +1,26 @@
|
|
|
1
1
|
# Copyright (c) 2011, SoundCloud Ltd., Nikola Chochkov
|
|
2
|
+
#
|
|
3
|
+
# A model for Examples used in GreenMidget. Examples represent the counts for
|
|
4
|
+
# how much training GreenMidget received in each respective category.
|
|
5
|
+
#
|
|
6
|
+
# Example['url_present'][:spam]
|
|
7
|
+
# # the number of spam training examples having an URL
|
|
8
|
+
#
|
|
9
|
+
# Example['any'][:ham]
|
|
10
|
+
# # the number of total Ham training examples
|
|
11
|
+
#
|
|
12
|
+
# See Countable
|
|
13
|
+
#
|
|
2
14
|
module GreenMidget
|
|
3
15
|
class Examples < Countable
|
|
4
|
-
NO_EXAMPLES_GIVEN_ERROR = 'Training examples must be provided for all categories before classification.'
|
|
5
16
|
GENERAL_FEATURE_NAME = 'any'
|
|
6
17
|
self.prefix = 'examples_with_feature::'
|
|
7
18
|
|
|
8
|
-
class_eval(<<-EVAL, __FILE__, __LINE__ + 1)
|
|
9
|
-
def self.#{ ALTERNATIVE } # def self.ham
|
|
10
|
-
@@alternative ||= self[GENERAL_FEATURE_NAME][ALTERNATIVE] # @@alternative ||= self[GENERAL_FEATURE_NAME][ALTERNATIVE]
|
|
11
|
-
end # end
|
|
12
|
-
|
|
13
|
-
def self.#{ NULL } # def self.spam
|
|
14
|
-
@@null ||= self[GENERAL_FEATURE_NAME][NULL] # @@null ||= self[GENERAL_FEATURE_NAME][NULL]
|
|
15
|
-
end # end
|
|
16
|
-
EVAL
|
|
17
|
-
|
|
18
19
|
def self.[](feature)
|
|
19
20
|
object = super(feature)
|
|
20
21
|
|
|
21
22
|
if object.no_examples? && (feature == GENERAL_FEATURE_NAME)
|
|
22
|
-
raise
|
|
23
|
+
raise NoExamplesGiven
|
|
23
24
|
elsif object.no_examples?
|
|
24
25
|
super GENERAL_FEATURE_NAME
|
|
25
26
|
else
|
|
@@ -51,5 +52,17 @@ module GreenMidget
|
|
|
51
52
|
def no_examples?
|
|
52
53
|
CATEGORIES.inject(1) { |memo, category| memo *= self[category] } == 0
|
|
53
54
|
end
|
|
55
|
+
|
|
56
|
+
# These methods store the total ham and spam examples count
|
|
57
|
+
#
|
|
58
|
+
class_eval(<<-EVAL, __FILE__, __LINE__ + 1)
|
|
59
|
+
def self.#{ ALTERNATIVE } # def self.ham
|
|
60
|
+
@@alternative ||= self[GENERAL_FEATURE_NAME][ALTERNATIVE] # @@alternative ||= self[GENERAL_FEATURE_NAME][ALTERNATIVE]
|
|
61
|
+
end # end
|
|
62
|
+
|
|
63
|
+
def self.#{ NULL } # def self.spam
|
|
64
|
+
@@null ||= self[GENERAL_FEATURE_NAME][NULL] # @@null ||= self[GENERAL_FEATURE_NAME][NULL]
|
|
65
|
+
end # end
|
|
66
|
+
EVAL
|
|
54
67
|
end
|
|
55
68
|
end
|
|
@@ -1,4 +1,14 @@
|
|
|
1
1
|
# Copyright (c) 2011, SoundCloud Ltd., Nikola Chochkov
|
|
2
|
+
#
|
|
3
|
+
# A model for Features used in GreenMidget. A Feature could be defined by user.
|
|
4
|
+
# An example would be 'url_found_in_text' which will be true for spammable
|
|
5
|
+
# objects that have url in their text and false otherwise.
|
|
6
|
+
#
|
|
7
|
+
# Features['url_in_text'][:spam]
|
|
8
|
+
# # the count of spam messages that have the feature
|
|
9
|
+
#
|
|
10
|
+
# See Countable
|
|
11
|
+
#
|
|
2
12
|
module GreenMidget
|
|
3
13
|
class Features < Countable
|
|
4
14
|
self.prefix = 'feature::'
|
|
@@ -8,7 +18,7 @@ module GreenMidget
|
|
|
8
18
|
end
|
|
9
19
|
|
|
10
20
|
def feature
|
|
11
|
-
key.gsub(/(^#{
|
|
21
|
+
key.gsub(/(^#{self.class.prefix})|(::\w+_count$)/, '')
|
|
12
22
|
end
|
|
13
23
|
end
|
|
14
24
|
end
|
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
# Copyright (c) 2011, SoundCloud Ltd., Nikola Chochkov
|
|
2
|
+
#
|
|
3
|
+
# GreenMidget's simple data store adapter with only three public methods.
|
|
4
|
+
# It's currently ActiveRecord based but a plan is to make a
|
|
5
|
+
# Redis based extension as well.
|
|
6
|
+
#
|
|
7
|
+
module GreenMidget
|
|
8
|
+
class Records < ActiveRecord::Base
|
|
9
|
+
self.table_name = :green_midget_records
|
|
10
|
+
|
|
11
|
+
# Does a multi-get of the necessary count records for the given words.
|
|
12
|
+
# If no words are given, then only Examples and Features counts are taken
|
|
13
|
+
def self.fetch_all(words = [])
|
|
14
|
+
words_keys = Words.record_keys(words)
|
|
15
|
+
|
|
16
|
+
pairs = where(arel_table[:key].in(words_keys).
|
|
17
|
+
or(arel_table[:key].matches("#{Features.prefix}%")).
|
|
18
|
+
or(arel_table[:key].matches("#{Examples.prefix}%"))).
|
|
19
|
+
select(:key).select(:value)
|
|
20
|
+
|
|
21
|
+
@@cache = pairs.inject({}) do |memo, pair|
|
|
22
|
+
memo[pair['key']] = pair['value']
|
|
23
|
+
memo
|
|
24
|
+
end
|
|
25
|
+
|
|
26
|
+
words_keys.inject(@@cache) do |memo, word|
|
|
27
|
+
memo[word] ||= ''
|
|
28
|
+
memo
|
|
29
|
+
end
|
|
30
|
+
end
|
|
31
|
+
|
|
32
|
+
# Reads the value for a given key looking in the cache first and doing a
|
|
33
|
+
# database call if nothing is found.
|
|
34
|
+
def self.[](key)
|
|
35
|
+
key = key.to_s
|
|
36
|
+
@@cache ||= {}
|
|
37
|
+
@@cache[key] ||= where(:key => key).select(:value).map(&:value).first || ''
|
|
38
|
+
end
|
|
39
|
+
|
|
40
|
+
# Increment the values for given keys. The AR implementation increments each
|
|
41
|
+
# record individually, but implementing a multi-set is possible within this
|
|
42
|
+
# method.
|
|
43
|
+
def self.increment(keys)
|
|
44
|
+
keys = Array(keys)
|
|
45
|
+
|
|
46
|
+
@@objects = where(:key => keys).inject({}) do |memo, record|
|
|
47
|
+
memo[record.key] = record
|
|
48
|
+
memo
|
|
49
|
+
end
|
|
50
|
+
|
|
51
|
+
keys.inject(@@objects) do |memo, key|
|
|
52
|
+
memo[key] ||= new(:key => key, :value => 0)
|
|
53
|
+
memo
|
|
54
|
+
end
|
|
55
|
+
|
|
56
|
+
@@objects.each { |key, record| record.increment!(:value) }
|
|
57
|
+
@@objects = {}
|
|
58
|
+
end
|
|
59
|
+
end
|
|
60
|
+
end
|
|
61
|
+
|
|
@@ -1,11 +1,14 @@
|
|
|
1
1
|
# Copyright (c) 2011, SoundCloud Ltd., Nikola Chochkov
|
|
2
|
+
#
|
|
3
|
+
# A model for Words used in GreenMidget. See Countable
|
|
4
|
+
#
|
|
2
5
|
module GreenMidget
|
|
3
6
|
class Words < Countable
|
|
4
7
|
self.prefix = 'word::'
|
|
5
8
|
|
|
6
9
|
def self.record_keys(words, category = nil)
|
|
7
10
|
words.map do |word|
|
|
8
|
-
Array(category ||
|
|
11
|
+
Array(category || CATEGORIES).map{ |category| Words[word].record_key(category) }
|
|
9
12
|
end.flatten
|
|
10
13
|
end
|
|
11
14
|
|
|
@@ -12,12 +12,12 @@ module GreenMidget
|
|
|
12
12
|
private
|
|
13
13
|
|
|
14
14
|
def urls
|
|
15
|
-
@text.scan(
|
|
15
|
+
@text.scan(URL_REGEX).flatten.reject(&:nil?)
|
|
16
16
|
end
|
|
17
17
|
|
|
18
18
|
def non_tolerated_urls
|
|
19
19
|
urls.reject do |url|
|
|
20
|
-
url.to_s.downcase =~
|
|
20
|
+
url.to_s.downcase =~ TOLERATED_URLS
|
|
21
21
|
end
|
|
22
22
|
end
|
|
23
23
|
end
|
data/lib/green_midget/version.rb
CHANGED
data/lib/tasks/green_midget.rake
CHANGED
|
@@ -1,5 +1,4 @@
|
|
|
1
1
|
# Copyright (c) 2011, SoundCloud Ltd., Nikola Chochkov
|
|
2
|
-
require 'fileutils'
|
|
3
2
|
require 'rake'
|
|
4
3
|
require 'green_midget/db/migrate/create_green_midget_records'
|
|
5
4
|
|
|
@@ -9,24 +8,24 @@ namespace :green_midget do
|
|
|
9
8
|
task :active_record => :environment do
|
|
10
9
|
include GreenMidget
|
|
11
10
|
|
|
12
|
-
unless
|
|
11
|
+
unless Records.table_exists?
|
|
13
12
|
CreateGreenMidgetRecords.up
|
|
14
13
|
end
|
|
15
14
|
|
|
16
15
|
keys = [ ALTERNATIVE, NULL ].map do |hypothesis|
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
16
|
+
FEATURES.map do |feature|
|
|
17
|
+
[
|
|
18
|
+
"#{Features.prefix}#{feature}::#{hypothesis}_count",
|
|
19
|
+
"#{Examples.prefix}#{feature}::#{hypothesis}_count",
|
|
20
|
+
"#{Examples.prefix}any::#{hypothesis}_count",
|
|
21
|
+
]
|
|
22
|
+
end
|
|
24
23
|
end.flatten
|
|
25
24
|
|
|
26
25
|
puts '== Creating records ==='
|
|
27
26
|
keys.each { |key|
|
|
28
|
-
unless
|
|
29
|
-
|
|
27
|
+
unless Records.find_by_key(key)
|
|
28
|
+
Records.create(:key => key, :value => 0)
|
|
30
29
|
puts "-- Created #{key}"
|
|
31
30
|
end
|
|
32
31
|
}
|
data/spec/base_spec.rb
CHANGED
|
@@ -1,41 +1,41 @@
|
|
|
1
1
|
# Copyright (c) 2011, SoundCloud Ltd., Nikola Chochkov
|
|
2
2
|
require 'spec_helper'
|
|
3
|
-
require
|
|
3
|
+
require 'tester'
|
|
4
4
|
|
|
5
5
|
describe GreenMidget::Base do
|
|
6
6
|
include GreenMidget
|
|
7
7
|
|
|
8
8
|
before(:each) do
|
|
9
|
-
|
|
9
|
+
Records.delete_all
|
|
10
10
|
[
|
|
11
|
-
{:key => "#{ Words.prefix }this::#{ ALTERNATIVE }_count", :value => 701
|
|
12
|
-
{:key => "#{ Words.prefix }this::#{ NULL }_count", :value => 11
|
|
13
|
-
{:key => "#{ Words.prefix }test::#{ ALTERNATIVE }_count", :value => 9
|
|
14
|
-
{:key => "#{ Words.prefix }test::#{ NULL }_count", :value => 71
|
|
15
|
-
{:key => "#{ Words.prefix }goes::#{ ALTERNATIVE }_count", :value => 90
|
|
16
|
-
{:key => "#{ Words.prefix }goes::#{ NULL }_count", :value => 90
|
|
17
|
-
{:key => "#{ Words.prefix }rid::#{ ALTERNATIVE }_count", :value => 311
|
|
18
|
-
{:key => "#{ Words.prefix }rid::#{ NULL }_count", :value => 290
|
|
19
|
-
{:key => "#{ Words.prefix }dirty::#{ ALTERNATIVE }_count", :value => 222
|
|
20
|
-
{:key => "#{ Words.prefix }dirty::#{ NULL }_count", :value => 45
|
|
21
|
-
{:key => "#{ Words.prefix }spam::#{ ALTERNATIVE }_count", :value => 11
|
|
22
|
-
{:key => "#{ Words.prefix }spam::#{ NULL }_count", :value => 133
|
|
23
|
-
{:key => "#{ Words.prefix }words::#{ ALTERNATIVE }_count", :value => 6
|
|
24
|
-
{:key => "#{ Words.prefix }words::#{ NULL }_count", :value => 811
|
|
25
|
-
{:key => "#{ Words.prefix }zero::#{ ALTERNATIVE }_count", :value => 0
|
|
26
|
-
{:key => "#{ Words.prefix }zero::#{ NULL }_count", :value => 0
|
|
27
|
-
{:key => "#{ Features.prefix }url_in_text::#{ ALTERNATIVE }_count", :value => 440
|
|
28
|
-
{:key => "#{ Features.prefix }url_in_text::#{ NULL }_count", :value => 40
|
|
29
|
-
{:key => "#{ Features.prefix }email_in_text::#{ ALTERNATIVE }_count", :value => 112
|
|
30
|
-
{:key => "#{ Features.prefix }email_in_text::#{ NULL }_count", :value => 9
|
|
31
|
-
{:key => "#{ Examples.prefix }any::#{ ALTERNATIVE }_count", :value => 1000
|
|
32
|
-
{:key => "#{ Examples.prefix }any::#{ NULL }_count", :value => 1000
|
|
33
|
-
{:key => "#{ Examples.prefix }url_in_text::#{ ALTERNATIVE }_count", :value => 1000
|
|
34
|
-
{:key => "#{ Examples.prefix }url_in_text::#{ NULL }_count", :value => 1000
|
|
35
|
-
{:key => "#{ Examples.prefix }email_in_text::#{ ALTERNATIVE }_count", :value => 1000
|
|
36
|
-
{:key => "#{ Examples.prefix }email_in_text::#{ NULL }_count", :value => 1000
|
|
11
|
+
{:key => "#{ Words.prefix }this::#{ ALTERNATIVE }_count", :value => 701 },
|
|
12
|
+
{:key => "#{ Words.prefix }this::#{ NULL }_count", :value => 11 },
|
|
13
|
+
{:key => "#{ Words.prefix }test::#{ ALTERNATIVE }_count", :value => 9 },
|
|
14
|
+
{:key => "#{ Words.prefix }test::#{ NULL }_count", :value => 71 },
|
|
15
|
+
{:key => "#{ Words.prefix }goes::#{ ALTERNATIVE }_count", :value => 90 },
|
|
16
|
+
{:key => "#{ Words.prefix }goes::#{ NULL }_count", :value => 90 },
|
|
17
|
+
{:key => "#{ Words.prefix }rid::#{ ALTERNATIVE }_count", :value => 311 },
|
|
18
|
+
{:key => "#{ Words.prefix }rid::#{ NULL }_count", :value => 290 },
|
|
19
|
+
{:key => "#{ Words.prefix }dirty::#{ ALTERNATIVE }_count", :value => 222 },
|
|
20
|
+
{:key => "#{ Words.prefix }dirty::#{ NULL }_count", :value => 45 },
|
|
21
|
+
{:key => "#{ Words.prefix }spam::#{ ALTERNATIVE }_count", :value => 11 },
|
|
22
|
+
{:key => "#{ Words.prefix }spam::#{ NULL }_count", :value => 133 },
|
|
23
|
+
{:key => "#{ Words.prefix }words::#{ ALTERNATIVE }_count", :value => 6 },
|
|
24
|
+
{:key => "#{ Words.prefix }words::#{ NULL }_count", :value => 811 },
|
|
25
|
+
{:key => "#{ Words.prefix }zero::#{ ALTERNATIVE }_count", :value => 0 },
|
|
26
|
+
{:key => "#{ Words.prefix }zero::#{ NULL }_count", :value => 0 },
|
|
27
|
+
{:key => "#{ Features.prefix }url_in_text::#{ ALTERNATIVE }_count", :value => 440 },
|
|
28
|
+
{:key => "#{ Features.prefix }url_in_text::#{ NULL }_count", :value => 40 },
|
|
29
|
+
{:key => "#{ Features.prefix }email_in_text::#{ ALTERNATIVE }_count", :value => 112 },
|
|
30
|
+
{:key => "#{ Features.prefix }email_in_text::#{ NULL }_count", :value => 9 },
|
|
31
|
+
{:key => "#{ Examples.prefix }any::#{ ALTERNATIVE }_count", :value => 1000 },
|
|
32
|
+
{:key => "#{ Examples.prefix }any::#{ NULL }_count", :value => 1000 },
|
|
33
|
+
{:key => "#{ Examples.prefix }url_in_text::#{ ALTERNATIVE }_count", :value => 1000 },
|
|
34
|
+
{:key => "#{ Examples.prefix }url_in_text::#{ NULL }_count", :value => 1000 },
|
|
35
|
+
{:key => "#{ Examples.prefix }email_in_text::#{ ALTERNATIVE }_count", :value => 1000 },
|
|
36
|
+
{:key => "#{ Examples.prefix }email_in_text::#{ NULL }_count", :value => 1000 },
|
|
37
37
|
].each do |entry|
|
|
38
|
-
|
|
38
|
+
Records.create(entry)
|
|
39
39
|
end
|
|
40
40
|
end
|
|
41
41
|
|
|
@@ -106,23 +106,27 @@ describe GreenMidget::Base do
|
|
|
106
106
|
it "should increase the index counts of the classified words" do
|
|
107
107
|
lambda {
|
|
108
108
|
Tester.new('zero').classify_as!(NULL)
|
|
109
|
-
}.should change {
|
|
109
|
+
}.should change { Records.find_by_key(Words['zero'].record_key(NULL)).value.to_f }.by(1)
|
|
110
110
|
end
|
|
111
|
+
|
|
111
112
|
it "should increment the learning examples count for all features" do
|
|
112
113
|
FEATURES.each do |feature|
|
|
113
114
|
lambda {
|
|
114
115
|
Tester.new('zero').classify_as!(NULL)
|
|
115
|
-
}.should change {
|
|
116
|
+
}.should change { Records.find_by_key(Examples[feature].record_key(NULL)).value.to_f }.by(1)
|
|
116
117
|
end
|
|
117
118
|
end
|
|
119
|
+
|
|
118
120
|
it "should not add new records for known keys" do
|
|
119
121
|
a = Tester.new 'stuff unknown sofar'
|
|
122
|
+
|
|
120
123
|
lambda {
|
|
121
124
|
a.classify_as! ALTERNATIVE
|
|
122
|
-
}.should change {
|
|
125
|
+
}.should change { Records.count }.by(3)
|
|
126
|
+
|
|
123
127
|
lambda {
|
|
124
128
|
a.classify_as! ALTERNATIVE
|
|
125
|
-
}.should_not change {
|
|
129
|
+
}.should_not change { Records.count }
|
|
126
130
|
end
|
|
127
131
|
end
|
|
128
132
|
|
|
@@ -130,15 +134,19 @@ describe GreenMidget::Base do
|
|
|
130
134
|
it "should ignore words less than 3 characters" do
|
|
131
135
|
Tester.new('is 2 ch').words.should == []
|
|
132
136
|
end
|
|
137
|
+
|
|
133
138
|
it "should break large character strings into chunks of 20 bytes" do
|
|
134
139
|
Tester.new('s'*20 + '111').words.should == ['s'*20, '111']
|
|
135
140
|
end
|
|
141
|
+
|
|
136
142
|
it "should bring uppercase to lowcase" do
|
|
137
143
|
Tester.new('HOWBIG').words.should == ['howbig']
|
|
138
144
|
end
|
|
145
|
+
|
|
139
146
|
it "should not consider parts of email address as individual words" do
|
|
140
147
|
Tester.new('friend@soundcloud.com').words.should == []
|
|
141
148
|
end
|
|
149
|
+
|
|
142
150
|
it "should not consider parts of website url as individual words" do
|
|
143
151
|
Tester.new('www.myguy.com http://weargeil.org').words.should == []
|
|
144
152
|
end
|
|
@@ -150,7 +158,7 @@ describe GreenMidget::Base do
|
|
|
150
158
|
# pending('todo')
|
|
151
159
|
# end
|
|
152
160
|
# it "throw an exception if no training examples were given, but it's asked for classification" do
|
|
153
|
-
# # if
|
|
161
|
+
# # if Records.count(ALTERNATIVE) or Records.count(NULL) is 0.0 => throw an exception
|
|
154
162
|
# pending('todo')
|
|
155
163
|
# end
|
|
156
164
|
# end
|