dwarf 0.0.4 → 0.0.5
Sign up to get free protection for your applications and to get access to all the features.
- data/Gemfile.lock +5 -1
- data/Rakefile +1 -0
- data/TODO.taskpaper +22 -0
- data/dwarf.gemspec +2 -0
- data/lib/dwarf.rb +3 -1
- data/lib/dwarf/classifier.rb +111 -66
- data/lib/dwarf/example_management.rb +32 -0
- data/lib/dwarf/information.rb +61 -0
- data/lib/dwarf/version.rb +1 -1
- data/spec/dwarf/classifier_spec.rb +197 -0
- data/spec/dwarf/information_spec.rb +157 -0
- data/spec/frawd.rb +105 -0
- data/spec/spec_helper.rb +59 -0
- data/specs.watchr +60 -0
- metadata +39 -4
- data/spec/classifier_spec.rb +0 -80
data/Gemfile.lock
CHANGED
@@ -1,13 +1,14 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
dwarf (0.0.
|
4
|
+
dwarf (0.0.4)
|
5
5
|
rubytree (>= 0.8.1)
|
6
6
|
|
7
7
|
GEM
|
8
8
|
remote: http://rubygems.org/
|
9
9
|
specs:
|
10
10
|
diff-lcs (1.1.2)
|
11
|
+
faker (0.3.1)
|
11
12
|
rspec (2.0.1)
|
12
13
|
rspec-core (~> 2.0.1)
|
13
14
|
rspec-expectations (~> 2.0.1)
|
@@ -19,6 +20,7 @@ GEM
|
|
19
20
|
rspec-core (~> 2.0.1)
|
20
21
|
rspec-expectations (~> 2.0.1)
|
21
22
|
rubytree (0.8.1)
|
23
|
+
watchr (0.7)
|
22
24
|
|
23
25
|
PLATFORMS
|
24
26
|
ruby
|
@@ -26,5 +28,7 @@ PLATFORMS
|
|
26
28
|
DEPENDENCIES
|
27
29
|
bundler (>= 1.0.0)
|
28
30
|
dwarf!
|
31
|
+
faker (>= 0.3.1)
|
29
32
|
rspec (>= 2.0.1)
|
30
33
|
rubytree (>= 0.8.1)
|
34
|
+
watchr (>= 0.7)
|
data/Rakefile
CHANGED
data/TODO.taskpaper
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
|
2
|
+
Dwarf 1.0 Features:
|
3
|
+
|
4
|
+
Find all instances with a given classification given a world. @alex @priority(3)
|
5
|
+
- Create queries to find all instances of a given classification. @priority(1)
|
6
|
+
- Make logic of a query for a given classification readable (as Ruby, or SQL, or ...) @priority(1)
|
7
|
+
- Generate large worlds with consistent instances to test against. @muness @priority(2)
|
8
|
+
- Handle nested features (e.g. example.engine.cylinders) @priority(1)
|
9
|
+
Handle messy data well (nil examples, examples with nil features, examples with nil subfeatures) @alex @priority(2)
|
10
|
+
Resolve weird behavior when all examples are missing some attribute. @alex @sam @priority(1)
|
11
|
+
- When attribute.nil? bisects a heterogeneous group, we should probably split on that attribute. @priority(1)
|
12
|
+
Refactor information theory methods out to enable unit testing. @alex @priority(3)
|
13
|
+
Eliminate features which are unique across all examples @sam @alex @priority(2)
|
14
|
+
- Treat hashes as nested features. @priority(1)
|
15
|
+
|
16
|
+
Dwarf Nice To Haves:
|
17
|
+
|
18
|
+
- meta-features based on type, e.g. parity(car.engine.cylinders) can be :even or :odd @priority(2)
|
19
|
+
- Bayesian classification of text fields. @priority(1)
|
20
|
+
- Junk uniquely identifying features (implicit in info gain calculations? add tests to verify!) @priority(3)
|
21
|
+
- Modular feature enumeration and feature fetching code. (Don't rely on attributes and example.attribute to be your only duck type checks!) @priority(2)
|
22
|
+
- A world-generation tool to create internally consistent data sets to measure dwarf's learning against. Maybe we can call it "frawd". @priority(1)
|
data/dwarf.gemspec
CHANGED
@@ -17,6 +17,8 @@ Gem::Specification.new do |s|
|
|
17
17
|
s.add_dependency "rubytree", ">= 0.8.1"
|
18
18
|
s.add_development_dependency "bundler", ">= 1.0.0"
|
19
19
|
s.add_development_dependency "rspec", ">= 2.0.1"
|
20
|
+
s.add_development_dependency "watchr", ">= 0.7"
|
21
|
+
s.add_development_dependency "faker", ">= 0.3.1"
|
20
22
|
|
21
23
|
s.files = `git ls-files`.split("\n")
|
22
24
|
s.executables = `git ls-files`.split("\n").map{|f| f =~ /^bin\/(.*)/ ? $1 : nil}.compact
|
data/lib/dwarf.rb
CHANGED
data/lib/dwarf/classifier.rb
CHANGED
@@ -1,12 +1,15 @@
|
|
1
1
|
module Dwarf
|
2
2
|
class Classifier
|
3
|
+
|
3
4
|
attr_accessor :examples
|
4
5
|
attr_accessor :example_attributes
|
5
6
|
attr_accessor :classifier_logic
|
7
|
+
attr_reader :decision_tree
|
6
8
|
|
7
9
|
def initialize()
|
8
10
|
@examples, @example_attributes = {}, []
|
9
11
|
@decision_tree = TreeNode.new("ROOT")
|
12
|
+
@nil_name = Object.new.to_s
|
10
13
|
end
|
11
14
|
|
12
15
|
def add_examples(example_hash)
|
@@ -17,7 +20,7 @@ module Dwarf
|
|
17
20
|
|
18
21
|
def add_example(example_record, classification)
|
19
22
|
@examples[example_record]=classification
|
20
|
-
@example_attributes |= example_record.
|
23
|
+
@example_attributes |= example_record.attribute_names
|
21
24
|
end
|
22
25
|
|
23
26
|
def classify(example)
|
@@ -26,37 +29,120 @@ module Dwarf
|
|
26
29
|
|
27
30
|
def learn!
|
28
31
|
@decision_tree.examples = @examples.keys
|
32
|
+
converge_tree
|
33
|
+
self.classifier_logic = codify_tree(@decision_tree)
|
34
|
+
implement_classify
|
35
|
+
end
|
36
|
+
|
37
|
+
def find_by_classification(world, classification)
|
38
|
+
matches = []
|
39
|
+
world.each do |instance|
|
40
|
+
if classify(instance) == classification
|
41
|
+
matches << instance
|
42
|
+
end
|
43
|
+
end
|
44
|
+
matches
|
45
|
+
end
|
46
|
+
|
47
|
+
private
|
48
|
+
|
49
|
+
include ExampleManagement
|
50
|
+
|
51
|
+
def converge_tree
|
29
52
|
pending = []
|
30
53
|
pending.push @decision_tree
|
31
|
-
used_attributes = []
|
32
54
|
until pending.empty?
|
33
55
|
node = pending.pop
|
34
56
|
if classification = homogenous_examples(node)
|
35
57
|
node.classification = classification
|
36
58
|
elsif no_valuable_attributes?(node) && node.parent
|
37
|
-
|
59
|
+
if split_nil_children = check_nil_split(node)
|
60
|
+
split_nil_children.each {|child_node| pending.push(child_node)}
|
61
|
+
else
|
62
|
+
create_expected_value(node)
|
63
|
+
end
|
38
64
|
elsif no_valuable_attributes?(node)
|
39
|
-
|
65
|
+
node.classification = expected_value(node.examples)
|
40
66
|
elsif false #stub branch
|
41
67
|
#C4.5 would also allow for previously unseen classifications
|
42
|
-
#dwarf
|
43
|
-
#
|
68
|
+
#dwarf needs to correctly handle a pre-existing tree when
|
69
|
+
#learn! is called
|
44
70
|
else
|
45
|
-
|
46
|
-
|
47
|
-
infogains[information_gain(node.examples,example_attribute)] = example_attribute
|
48
|
-
end
|
49
|
-
best_gain = infogains.keys.sort[0]
|
50
|
-
best_attribute = infogains[best_gain]
|
51
|
-
split(node,best_attribute).each {|child_node| pending.push(child_node)}
|
52
|
-
used_attributes << best_attribute
|
71
|
+
split_children = homogenize_children(node)
|
72
|
+
split_children.each {|child_node| pending.push(child_node)}
|
53
73
|
end
|
54
74
|
end
|
55
|
-
self.classifier_logic = codify_tree(@decision_tree)
|
56
|
-
implement_classify
|
57
75
|
end
|
58
76
|
|
59
|
-
|
77
|
+
def check_nil_split(node)
|
78
|
+
infogains = {}
|
79
|
+
|
80
|
+
used_attributes = used_attributes(node)
|
81
|
+
(filtered_attributes-used_attributes).each do |example_attribute|
|
82
|
+
infogains[Information::unfiltered_information_gain(node.examples,example_attribute,@examples)] =
|
83
|
+
example_attribute
|
84
|
+
end
|
85
|
+
best_gain = infogains.keys.sort[0]
|
86
|
+
best_attribute = infogains[best_gain]
|
87
|
+
if best_gain > 0.0
|
88
|
+
return split(node, best_attribute)
|
89
|
+
end
|
90
|
+
|
91
|
+
end
|
92
|
+
|
93
|
+
def create_expected_value(node)
|
94
|
+
new_node = TreeNode.new(node.name)
|
95
|
+
expected_value = expected_value(node.examples)
|
96
|
+
new_node.classification = expected_value
|
97
|
+
parent = node.parent
|
98
|
+
parent.remove! node
|
99
|
+
parent << new_node
|
100
|
+
new_node << node
|
101
|
+
end
|
102
|
+
|
103
|
+
def used_attributes(node)
|
104
|
+
if node.parentage
|
105
|
+
node.parentage.map { |parent| parent.attribute }
|
106
|
+
else
|
107
|
+
[]
|
108
|
+
end
|
109
|
+
end
|
110
|
+
|
111
|
+
def attribute_homogeneous?(example_subset, attribute)
|
112
|
+
invert_with_dups(attribute_map(example_subset, attribute)).keys.size == 1
|
113
|
+
end
|
114
|
+
|
115
|
+
def heterogeneous_attributes
|
116
|
+
@example_attributes.reject { |attr| attribute_homogeneous?(@examples.keys, attr) }
|
117
|
+
end
|
118
|
+
|
119
|
+
def attribute_clusters?(example_subset, attribute)
|
120
|
+
invert_with_dups(attribute_map(example_subset, attribute)).keys.size == example_subset.size
|
121
|
+
end
|
122
|
+
|
123
|
+
def clustering_attributes
|
124
|
+
@example_attributes.select {|attr| attribute_clusters?(@examples.keys, attr) }
|
125
|
+
end
|
126
|
+
|
127
|
+
def filtered_attributes
|
128
|
+
clustering_attributes | heterogeneous_attributes
|
129
|
+
end
|
130
|
+
|
131
|
+
def homogenize_children(node)
|
132
|
+
infogains = {}
|
133
|
+
|
134
|
+
used_attributes = used_attributes(node)
|
135
|
+
|
136
|
+
(filtered_attributes-used_attributes).each do |example_attribute|
|
137
|
+
infogains[Information::information_gain(node.examples,example_attribute,@examples)] =
|
138
|
+
example_attribute
|
139
|
+
end
|
140
|
+
|
141
|
+
best_gain = infogains.keys.sort[0]
|
142
|
+
best_attribute = infogains[best_gain]
|
143
|
+
|
144
|
+
return split(node,best_attribute)
|
145
|
+
end
|
60
146
|
|
61
147
|
def implement_classify
|
62
148
|
classify_impl = "def classify(example)\n#{self.classifier_logic}\nend"
|
@@ -85,6 +171,7 @@ module Dwarf
|
|
85
171
|
|
86
172
|
def codify_literal(object)
|
87
173
|
case object
|
174
|
+
when @nil_name then "nil"
|
88
175
|
when Symbol then ":#{object}"
|
89
176
|
when String then "\"#{object}\""
|
90
177
|
else
|
@@ -97,6 +184,9 @@ module Dwarf
|
|
97
184
|
example_subset = node.examples
|
98
185
|
examples_inversion = invert_with_dups(attribute_map(example_subset,attribute))
|
99
186
|
examples_inversion.each do |key, value|
|
187
|
+
if key.nil?
|
188
|
+
key = @nil_name
|
189
|
+
end
|
100
190
|
child_node = TreeNode.new(key)
|
101
191
|
child_node.examples = value
|
102
192
|
node << child_node
|
@@ -106,21 +196,20 @@ module Dwarf
|
|
106
196
|
end
|
107
197
|
|
108
198
|
def expected_value(example_subset)
|
109
|
-
examples_inversion = invert_with_dups(classification_map(example_subset))
|
199
|
+
examples_inversion = invert_with_dups(classification_map(example_subset, @examples))
|
110
200
|
occurrences = examples_inversion.merge(examples_inversion) { |key, value| value.length }
|
111
201
|
occurrences.keys.sort { |key| occurrences[key] }[0]
|
112
202
|
end
|
113
203
|
|
114
204
|
def no_valuable_attributes?(node)
|
115
|
-
|
116
|
-
information_gain(node.examples, example_attribute)}.each {|info_gain|
|
205
|
+
filtered_attributes.map {|example_attribute|
|
206
|
+
Information::information_gain(node.examples, example_attribute, @examples)}.each {|info_gain|
|
117
207
|
return false if info_gain != 0}
|
118
208
|
return true
|
119
209
|
end
|
120
210
|
|
121
|
-
|
122
211
|
def homogenous_examples(node)
|
123
|
-
classifications =
|
212
|
+
classifications = filter_classifications(@examples, node.examples)
|
124
213
|
if classifications.length == 1
|
125
214
|
return classifications[0]
|
126
215
|
else
|
@@ -128,49 +217,5 @@ module Dwarf
|
|
128
217
|
end
|
129
218
|
end
|
130
219
|
|
131
|
-
def entropy(example_subset)
|
132
|
-
set_size = example_subset.length.to_f
|
133
|
-
examples_inversion = invert_with_dups(classification_map(example_subset))
|
134
|
-
occurences = examples_inversion.merge(examples_inversion) { |key, value| value.length.to_f }
|
135
|
-
0.0 - classifications(example_subset).inject(0.0) do |sum, classification|
|
136
|
-
sum + ((occurences[classification]/set_size)* Math.log2((occurences[classification]/set_size)))
|
137
|
-
end
|
138
|
-
end
|
139
|
-
|
140
|
-
def information_gain(example_subset,attribute)
|
141
|
-
set_size = example_subset.length.to_f
|
142
|
-
examples_inversion = invert_with_dups(attribute_map(example_subset,attribute))
|
143
|
-
occurrences = examples_inversion.merge(examples_inversion) { |key, value| value.length }
|
144
|
-
entropy(example_subset) - attribute_values(example_subset,attribute).inject(0.0) do |sum, attribute_value|
|
145
|
-
sum + (occurrences[attribute_value]/set_size) * entropy(examples_inversion[attribute_value])
|
146
|
-
end
|
147
|
-
end
|
148
|
-
|
149
|
-
def classifications(example_subset)
|
150
|
-
example_subset.map {|example| @examples[example]}.compact
|
151
|
-
end
|
152
|
-
|
153
|
-
def classification_map(example_subset)
|
154
|
-
classification_map = {}
|
155
|
-
example_subset.each {|example| classification_map[example] = @examples[example]}
|
156
|
-
classification_map
|
157
|
-
end
|
158
|
-
|
159
|
-
def attribute_values(example_subset, attribute)
|
160
|
-
example_subset.map {|example| example.method(attribute.to_sym).call}.compact
|
161
|
-
end
|
162
|
-
|
163
|
-
def attribute_map(example_subset, attribute)
|
164
|
-
example_map = {}
|
165
|
-
example_subset.each {|example| example_map[example] = example.method(attribute.to_sym).call}
|
166
|
-
example_map
|
167
|
-
end
|
168
|
-
|
169
|
-
def invert_with_dups(hash)
|
170
|
-
inversion = {}
|
171
|
-
hash.values.each {|value| inversion[value] = []}
|
172
|
-
hash.keys.each {|key| inversion[hash[key]] << key}
|
173
|
-
inversion
|
174
|
-
end
|
175
220
|
end
|
176
221
|
end
|
@@ -0,0 +1,32 @@
|
|
1
|
+
module Dwarf
|
2
|
+
module ExampleManagement
|
3
|
+
|
4
|
+
def classification_map(example_subset, classifications)
|
5
|
+
classification_map = {}
|
6
|
+
example_subset.each {|example| classification_map[example] = classifications[example]}
|
7
|
+
classification_map
|
8
|
+
end
|
9
|
+
|
10
|
+
def invert_with_dups(hash)
|
11
|
+
inversion = { }
|
12
|
+
hash.values.each {|value| inversion[value] = []}
|
13
|
+
hash.keys.each {|key| inversion[hash[key]] << key}
|
14
|
+
inversion
|
15
|
+
end
|
16
|
+
|
17
|
+
def eval_attribute(example,attribute)
|
18
|
+
example.method(attribute.to_sym).call
|
19
|
+
end
|
20
|
+
|
21
|
+
def attribute_map(example_subset, attribute)
|
22
|
+
example_map = {}
|
23
|
+
example_subset.each {|example| example_map[example] = eval_attribute(example, attribute)}
|
24
|
+
example_map
|
25
|
+
end
|
26
|
+
|
27
|
+
def filter_classifications(classifications,example_subset)
|
28
|
+
example_subset.map {|example| classifications[example]}.uniq
|
29
|
+
end
|
30
|
+
|
31
|
+
end
|
32
|
+
end
|
@@ -0,0 +1,61 @@
|
|
1
|
+
module Dwarf
|
2
|
+
module Information
|
3
|
+
|
4
|
+
class<< self
|
5
|
+
include ExampleManagement
|
6
|
+
|
7
|
+
def entropy(example_subset, classifications)
|
8
|
+
seen_classifications = filter_classifications(classifications, example_subset)
|
9
|
+
return 0.0 if seen_classifications.length == 1
|
10
|
+
set_size = example_subset.length.to_f
|
11
|
+
examples_inversion = invert_with_dups(classification_map(example_subset, classifications))
|
12
|
+
occurrences = occurrences(examples_inversion)
|
13
|
+
sum_over(seen_classifications) do |classification|
|
14
|
+
frequency = occurrences[classification]/set_size
|
15
|
+
- frequency * Math.log(frequency,seen_classifications.length)
|
16
|
+
end
|
17
|
+
end
|
18
|
+
|
19
|
+
def information_gain(example_subset, attribute, classifications)
|
20
|
+
filtered_example_subset = filter_for_missing_attribute(example_subset, attribute)
|
21
|
+
unfiltered_information_gain(filtered_example_subset, attribute, classifications)
|
22
|
+
end
|
23
|
+
|
24
|
+
def unfiltered_information_gain(example_subset, attribute, classifications)
|
25
|
+
set_size = example_subset.length.to_f
|
26
|
+
examples_inversion = invert_with_dups(attribute_map(example_subset,attribute))
|
27
|
+
occurrences = occurrences(examples_inversion)
|
28
|
+
heterogeneous_entropy = entropy(example_subset, classifications)
|
29
|
+
seen_attribute_values = attribute_values(example_subset,attribute)
|
30
|
+
heterogeneous_entropy -
|
31
|
+
sum_over(seen_attribute_values) do |attribute_value|
|
32
|
+
frequency = occurrences[attribute_value]/set_size
|
33
|
+
frequency * entropy(examples_inversion[attribute_value], classifications)
|
34
|
+
end
|
35
|
+
end
|
36
|
+
|
37
|
+
private
|
38
|
+
|
39
|
+
def sum_over(collection)
|
40
|
+
collection.inject(0.0) do |sum, classification|
|
41
|
+
sum + yield(classification)
|
42
|
+
end
|
43
|
+
end
|
44
|
+
|
45
|
+
def occurrences(examples_inversion)
|
46
|
+
examples_inversion.merge(examples_inversion) { |key, value| value.length.to_f }
|
47
|
+
end
|
48
|
+
|
49
|
+
def filter_for_missing_attribute(example_subset, attribute)
|
50
|
+
example_subset.reject { |example| eval_attribute(example,attribute).nil? }
|
51
|
+
end
|
52
|
+
|
53
|
+
def attribute_values(example_subset, attribute)
|
54
|
+
example_subset.map {|example| eval_attribute(example, attribute)}.uniq
|
55
|
+
end
|
56
|
+
|
57
|
+
end
|
58
|
+
|
59
|
+
end
|
60
|
+
|
61
|
+
end
|
data/lib/dwarf/version.rb
CHANGED
@@ -0,0 +1,197 @@
|
|
1
|
+
require File.join(File.dirname(__FILE__), *%w[.. spec_helper.rb])
|
2
|
+
|
3
|
+
describe Dwarf::Classifier do
|
4
|
+
|
5
|
+
before(:each) do
|
6
|
+
@classifier = Dwarf::Classifier.new()
|
7
|
+
end
|
8
|
+
|
9
|
+
def mock_car_examples
|
10
|
+
@example1 = FakeCar.new(:body_style => :boxy,
|
11
|
+
:cylinders => 4,
|
12
|
+
:transmission => :manual)
|
13
|
+
@example2 = FakeCar.new(:body_style => :swoopy,
|
14
|
+
:cylinders => 6,
|
15
|
+
:transmission => :manual)
|
16
|
+
@example3 = FakeCar.new(:body_style => :angry,
|
17
|
+
:cylinders => 8,
|
18
|
+
:transmission => :manual)
|
19
|
+
@example4 = FakeCar.new(:body_style => :swoopy,
|
20
|
+
:cylinders => 8,
|
21
|
+
:transmission => :manual)
|
22
|
+
@example5 = FakeCar.new(:body_style => nil,
|
23
|
+
:cylinders => 6,
|
24
|
+
:transmission => :manual)
|
25
|
+
@example6 = FakeCar.new(:body_style => :sleek,
|
26
|
+
:cylinders => nil,
|
27
|
+
:transmission => :manual)
|
28
|
+
end
|
29
|
+
|
30
|
+
context "add_example" do
|
31
|
+
|
32
|
+
it "accepts example classifications" do
|
33
|
+
@classifier.should respond_to(:add_example)
|
34
|
+
end
|
35
|
+
|
36
|
+
it "stores examples" do
|
37
|
+
@example3 = double('example3')
|
38
|
+
@example3.stub(:attribute_names) { [] }
|
39
|
+
@classifier.add_example(@example3, :irish)
|
40
|
+
@classifier.examples.should include(@example3)
|
41
|
+
end
|
42
|
+
|
43
|
+
it "enumerate example attributes" do
|
44
|
+
@example_with_attributes = double('attrs')
|
45
|
+
@example_with_attributes.stub(:attribute_names) { ["height", "branch_density"] }
|
46
|
+
@classifier.add_example(@example_with_attributes, :pine)
|
47
|
+
@classifier.example_attributes.should include("height", "branch_density")
|
48
|
+
end
|
49
|
+
|
50
|
+
it "gracefully accepts examples with nil attributes" do
|
51
|
+
@example_with_nil_attributes = double('nils')
|
52
|
+
@example_with_nil_attributes.stub(:attribute_names) { ["height", "branch_density"] }
|
53
|
+
@example_with_nil_attributes.stub(:height) { nil }
|
54
|
+
@example_with_nil_attributes.stub(:branch_density) { :high }
|
55
|
+
lambda {@classifier.add_example(@example_with_nil_attributes, :pine)}.should_not raise_exception
|
56
|
+
end
|
57
|
+
|
58
|
+
end
|
59
|
+
|
60
|
+
context "learn! and classify" do
|
61
|
+
|
62
|
+
it "only implements classify on the learning instance" do
|
63
|
+
@example = double('example3')
|
64
|
+
@example.stub(:attribute_names) { [] }
|
65
|
+
@class2 = Dwarf::Classifier.new()
|
66
|
+
@classifier.add_example(@example, :round)
|
67
|
+
@classifier.learn!
|
68
|
+
@classifier.classify(@example) == :round
|
69
|
+
@class2.classify(@example).should == nil
|
70
|
+
end
|
71
|
+
|
72
|
+
context "frawd is dwarf backwards" do
|
73
|
+
before(:each) do
|
74
|
+
@frawd = Frawd.new(1,100)
|
75
|
+
end
|
76
|
+
|
77
|
+
it "is totally awesome" do
|
78
|
+
@frawd.training.each do |example, classification|
|
79
|
+
@classifier.add_example(example, classification)
|
80
|
+
end
|
81
|
+
@classifier.learn!
|
82
|
+
success = 0
|
83
|
+
@frawd.testing.each do |example, classification|
|
84
|
+
success += 1 if @classifier.classify(example) == classification
|
85
|
+
end
|
86
|
+
success.should == @frawd.testing.size
|
87
|
+
end
|
88
|
+
end
|
89
|
+
|
90
|
+
context "classifying cars" do
|
91
|
+
|
92
|
+
before(:each) do
|
93
|
+
mock_car_examples
|
94
|
+
@classifier.add_example(@example1, :japanese)
|
95
|
+
@classifier.add_example(@example2, :german)
|
96
|
+
@classifier.add_example(@example3, :american)
|
97
|
+
end
|
98
|
+
|
99
|
+
it "classifies in a trivial case" do
|
100
|
+
@classifier.learn!
|
101
|
+
@classifier.classify(@example1).should == :japanese
|
102
|
+
@classifier.classify(@example2).should == :german
|
103
|
+
@classifier.classify(@example3).should == :american
|
104
|
+
end
|
105
|
+
|
106
|
+
it "classifies when multiple predicates required" do
|
107
|
+
@classifier.add_example(@example4, :german)
|
108
|
+
@classifier.learn!
|
109
|
+
@classifier.classify(@example1).should == :japanese
|
110
|
+
@classifier.classify(@example2).should == :german
|
111
|
+
@classifier.classify(@example3).should == :american
|
112
|
+
@classifier.classify(@example4).should == :german
|
113
|
+
end
|
114
|
+
|
115
|
+
it "handles nils gracefully" do
|
116
|
+
@classifier.add_examples(@example4 => :german,
|
117
|
+
@example5 => :japanese,
|
118
|
+
@example6 => :japanese)
|
119
|
+
lambda{@classifier.learn!}.should_not raise_exception
|
120
|
+
end
|
121
|
+
|
122
|
+
it "handles nils correctly" do
|
123
|
+
@classifier.add_examples(@example4 => :german,
|
124
|
+
@example5 => :japanese,
|
125
|
+
@example6 => :japanese)
|
126
|
+
@classifier.learn!
|
127
|
+
@classifier.classify(@example1).should == :japanese
|
128
|
+
@classifier.classify(@example2).should == :german
|
129
|
+
@classifier.classify(@example3).should == :american
|
130
|
+
@classifier.classify(@example4).should == :german
|
131
|
+
@classifier.classify(@example5).should == :japanese
|
132
|
+
@classifier.classify(@example6).should == :japanese
|
133
|
+
end
|
134
|
+
|
135
|
+
it "handles a feature missing from all examples correctly" do
|
136
|
+
@classifier.add_examples(@example4 => :german,
|
137
|
+
@example5 => :japanese,
|
138
|
+
@example6 => :japanese)
|
139
|
+
@classifier.learn!
|
140
|
+
open = [@classifier.decision_tree]
|
141
|
+
until open.empty?
|
142
|
+
current = open.pop
|
143
|
+
current.attribute.should_not == "wheel_diameter"
|
144
|
+
current.children.each {|child| open.push child}
|
145
|
+
end
|
146
|
+
end
|
147
|
+
|
148
|
+
it "does not use a feature which is identical across all examples" do
|
149
|
+
@classifier.add_examples(@example4 => :german,
|
150
|
+
@example5 => :japanese,
|
151
|
+
@example6 => :japanese)
|
152
|
+
@classifier.learn!
|
153
|
+
open = [@classifier.decision_tree]
|
154
|
+
until open.empty?
|
155
|
+
current = open.pop
|
156
|
+
current.attribute.should_not == "transmission"
|
157
|
+
current.children.each {|child| open.push child}
|
158
|
+
end
|
159
|
+
end
|
160
|
+
|
161
|
+
it "does not use a feature unique to each example" do
|
162
|
+
@classifier.add_examples(@example4 => :german,
|
163
|
+
@example5 => :japanese,
|
164
|
+
@example6 => :japanese)
|
165
|
+
@classifier.learn!
|
166
|
+
open = [@classifier.decision_tree]
|
167
|
+
until open.empty?
|
168
|
+
current = open.pop
|
169
|
+
current.attribute.should_not == "vin"
|
170
|
+
current.children.each {|child| open.push child}
|
171
|
+
end
|
172
|
+
|
173
|
+
end
|
174
|
+
|
175
|
+
end
|
176
|
+
|
177
|
+
end
|
178
|
+
|
179
|
+
context "find_by_classification" do
|
180
|
+
|
181
|
+
it "returns sets of cars based on class" do
|
182
|
+
mock_car_examples
|
183
|
+
@classifier.add_examples(@example1 => :japanese,
|
184
|
+
@example2 => :german,
|
185
|
+
@example3 => :american,
|
186
|
+
@example4 => :german)
|
187
|
+
@classifier.learn!
|
188
|
+
all_cars = [@example1, @example2, @example3, @example4]
|
189
|
+
japanese_cars = @classifier.find_by_classification(all_cars, :japanese)
|
190
|
+
japanese_cars.should == [@example1]
|
191
|
+
end
|
192
|
+
|
193
|
+
|
194
|
+
end
|
195
|
+
|
196
|
+
|
197
|
+
end
|
@@ -0,0 +1,157 @@
|
|
1
|
+
require File.join(File.dirname(__FILE__), *%w[.. spec_helper.rb])
|
2
|
+
|
3
|
+
describe Dwarf::Information do
|
4
|
+
|
5
|
+
class Deck
|
6
|
+
|
7
|
+
def initialize()
|
8
|
+
@draw = (1..52).map{|v| v}
|
9
|
+
@draw.shuffle!
|
10
|
+
@discard = []
|
11
|
+
end
|
12
|
+
|
13
|
+
def sample
|
14
|
+
unless @draw.empty?
|
15
|
+
card = @draw.pop
|
16
|
+
@discard.push card
|
17
|
+
return card
|
18
|
+
else
|
19
|
+
@draw = @discard
|
20
|
+
@draw.shuffle
|
21
|
+
@discard = []
|
22
|
+
return self.sample
|
23
|
+
end
|
24
|
+
end
|
25
|
+
|
26
|
+
end
|
27
|
+
|
28
|
+
class Coin
|
29
|
+
|
30
|
+
def initialize(weighting)
|
31
|
+
@weighting = weighting
|
32
|
+
@faces = [:heads, :tails]
|
33
|
+
end
|
34
|
+
|
35
|
+
def attributes
|
36
|
+
"weighting"
|
37
|
+
end
|
38
|
+
|
39
|
+
def weighting
|
40
|
+
@weighting
|
41
|
+
end
|
42
|
+
|
43
|
+
def sample
|
44
|
+
case @weighting
|
45
|
+
when :fair then @faces.sample
|
46
|
+
when :heads then :heads
|
47
|
+
when :tails then :tails
|
48
|
+
end
|
49
|
+
end
|
50
|
+
|
51
|
+
end
|
52
|
+
|
53
|
+
context "entropy" do
|
54
|
+
it "calculates correctly for heads and tails" do
|
55
|
+
examples = []
|
56
|
+
classifications = {}
|
57
|
+
coin = Coin.new(:fair)
|
58
|
+
1000.times do
|
59
|
+
obj = Object.new
|
60
|
+
examples << obj
|
61
|
+
classifications[obj] = coin.sample
|
62
|
+
end
|
63
|
+
entropy = Dwarf::Information.entropy(examples, classifications)
|
64
|
+
entropy.should > 0.99
|
65
|
+
entropy.should <= 1.0
|
66
|
+
end
|
67
|
+
|
68
|
+
it "calculates correctly for 1d6" do
|
69
|
+
examples = []
|
70
|
+
classifications = {}
|
71
|
+
die = (1..6).map{|v| v}
|
72
|
+
1000.times do
|
73
|
+
obj = Object.new
|
74
|
+
examples << obj
|
75
|
+
classifications[obj] = die.sample
|
76
|
+
end
|
77
|
+
entropy = Dwarf::Information.entropy(examples, classifications)
|
78
|
+
entropy.should > 0.99
|
79
|
+
entropy.should <= 1.0
|
80
|
+
end
|
81
|
+
|
82
|
+
it "calculates correctly for a deck of cards" do
|
83
|
+
examples = []
|
84
|
+
classifications = {}
|
85
|
+
deck = Deck.new
|
86
|
+
1000.times do
|
87
|
+
obj = Object.new
|
88
|
+
examples << obj
|
89
|
+
classifications[obj] = deck.sample
|
90
|
+
end
|
91
|
+
entropy = Dwarf::Information.entropy(examples, classifications)
|
92
|
+
entropy.should > 0.99
|
93
|
+
entropy.should <= 1.0
|
94
|
+
end
|
95
|
+
|
96
|
+
it "calculates correctly with a weighted coin" do
|
97
|
+
examples = []
|
98
|
+
classifications = {}
|
99
|
+
1000.times do
|
100
|
+
obj = Object.new
|
101
|
+
examples << obj
|
102
|
+
classifications[obj] = (rand(100) == 99) ? :heads : :tails
|
103
|
+
end
|
104
|
+
entropy = Dwarf::Information.entropy(examples,classifications)
|
105
|
+
entropy.should < 0.101 #With a perfect 99:1 distribution, entropy should == 0.0807...
|
106
|
+
entropy.should >= 0.04
|
107
|
+
end
|
108
|
+
|
109
|
+
it "calculates correctly with a homogenous set" do
|
110
|
+
examples = []
|
111
|
+
classifications = {}
|
112
|
+
1000.times do
|
113
|
+
obj = Object.new
|
114
|
+
examples << obj
|
115
|
+
classifications[obj] = :heads
|
116
|
+
end
|
117
|
+
entropy = Dwarf::Information.entropy(examples,classifications)
|
118
|
+
entropy.should == 0.0
|
119
|
+
end
|
120
|
+
|
121
|
+
end
|
122
|
+
|
123
|
+
context "information_gain" do
|
124
|
+
|
125
|
+
it "calculates correctly splitting perfectly weighted coins" do
|
126
|
+
examples = []
|
127
|
+
classifications = {}
|
128
|
+
500.times do
|
129
|
+
coin = Coin.new(:heads)
|
130
|
+
examples << coin
|
131
|
+
classifications[coin] = coin.sample
|
132
|
+
end
|
133
|
+
500.times do
|
134
|
+
coin = Coin.new(:tails)
|
135
|
+
examples << coin
|
136
|
+
classifications[coin] = coin.sample
|
137
|
+
end
|
138
|
+
information_gain = Dwarf::Information.information_gain(examples, "weighting", classifications)
|
139
|
+
information_gain.should == 1.0
|
140
|
+
end
|
141
|
+
|
142
|
+
it "calculates worthless infogame for fair weighted coins" do
|
143
|
+
examples = []
|
144
|
+
classifications = {}
|
145
|
+
coin = Coin.new(:fair)
|
146
|
+
1000.times do
|
147
|
+
coin = Coin.new(:fair)
|
148
|
+
examples << coin
|
149
|
+
classifications[coin] = coin.sample
|
150
|
+
end
|
151
|
+
information_gain = Dwarf::Information.information_gain(examples, "weighting", classifications)
|
152
|
+
information_gain.should == 0.0
|
153
|
+
end
|
154
|
+
|
155
|
+
end
|
156
|
+
|
157
|
+
end
|
data/spec/frawd.rb
ADDED
@@ -0,0 +1,105 @@
|
|
1
|
+
#require File.join(File.dirname(__FILE__), *%w[. spec_helper.rb])
|
2
|
+
require 'rspec/mocks'
|
3
|
+
require 'faker'
|
4
|
+
require 'digest'
|
5
|
+
|
6
|
+
class Frawd
|
7
|
+
attr_reader :rules
|
8
|
+
|
9
|
+
def initialize(depth = 10, sample_sizes = 1000)
|
10
|
+
@depth = depth
|
11
|
+
@sample_sizes = sample_sizes
|
12
|
+
initialize_attributes
|
13
|
+
@leaves = []
|
14
|
+
@rules = build_rules
|
15
|
+
@rules.each_leaf do |leaf|
|
16
|
+
@leaves << leaf
|
17
|
+
end
|
18
|
+
end
|
19
|
+
|
20
|
+
def types
|
21
|
+
[:enum, :number, :text]
|
22
|
+
end
|
23
|
+
|
24
|
+
def enums
|
25
|
+
unless @enums
|
26
|
+
@enums = [[:true, :false],
|
27
|
+
[:baz, :bar, :zot],
|
28
|
+
[:baz, :bar, :zot, :quux]]
|
29
|
+
(1..rand(10)).each do
|
30
|
+
@enums << Faker::Lorem.words(rand(10)).uniq.map(&:to_sym)
|
31
|
+
end
|
32
|
+
end
|
33
|
+
@enums
|
34
|
+
end
|
35
|
+
|
36
|
+
def classifications
|
37
|
+
@classifications ||= (1..rand(10)).map {|x| "classification#{x}".to_sym }
|
38
|
+
end
|
39
|
+
|
40
|
+
def initialize_attributes
|
41
|
+
@attributes = []
|
42
|
+
num_attributes = 10#rand(100)
|
43
|
+
(1..num_attributes).each do |number|
|
44
|
+
type = types.sample
|
45
|
+
values = enums.sample if type == :enum
|
46
|
+
@attributes << ["attribute#{number}", type, values]
|
47
|
+
end
|
48
|
+
end
|
49
|
+
|
50
|
+
def filtered_attributes
|
51
|
+
@attributes.select {|a| a[1] == :enum}
|
52
|
+
end
|
53
|
+
|
54
|
+
def build_rules(node = Dwarf::TreeNode.new("ROOT"), attributes = filtered_attributes)
|
55
|
+
parents = node.parentage || []
|
56
|
+
if (rand(@depth) < parents.length) || attributes.empty?
|
57
|
+
node.classification = classifications.sample
|
58
|
+
else
|
59
|
+
attribute = attributes.sample
|
60
|
+
node.attribute = attribute[0]
|
61
|
+
attribute[2].each do |value|
|
62
|
+
child = Dwarf::TreeNode.new(value.to_s)
|
63
|
+
node << child
|
64
|
+
build_rules(child,attributes-[attribute[0]])
|
65
|
+
end
|
66
|
+
end
|
67
|
+
node
|
68
|
+
end
|
69
|
+
|
70
|
+
def generate_example
|
71
|
+
node = @leaves.sample
|
72
|
+
example_classification = node.classification
|
73
|
+
example = RSpec::Mocks::Mock.new('example')
|
74
|
+
node.parentage.unshift(node).each_cons(2) do |child, parent|
|
75
|
+
example.stub!(parent.attribute.to_sym) { child.name }
|
76
|
+
example.stub!(:attribute_names) { @attributes.map {|a| a[0]} }
|
77
|
+
end
|
78
|
+
@attributes.each do |attribute|
|
79
|
+
unless example.respond_to? attribute[0].to_sym
|
80
|
+
val = case attribute[1]
|
81
|
+
when :enum then attribute[2].sample
|
82
|
+
when :number then rand((2**(0.size * 8 -2) -1))
|
83
|
+
when :text then Faker::Lorem.paragraphs
|
84
|
+
end
|
85
|
+
example.stub!(attribute[0].to_sym) { val }
|
86
|
+
end
|
87
|
+
end
|
88
|
+
[ example, example_classification ]
|
89
|
+
end
|
90
|
+
|
91
|
+
def generate_examples(count)
|
92
|
+
examples = Array.new(count)
|
93
|
+
(0...count).each { |index| examples[index] = generate_example }
|
94
|
+
examples
|
95
|
+
end
|
96
|
+
|
97
|
+
def training
|
98
|
+
@training ||= generate_examples(@sample_sizes)
|
99
|
+
end
|
100
|
+
|
101
|
+
def testing
|
102
|
+
@testing ||= generate_examples(@sample_sizes)
|
103
|
+
end
|
104
|
+
|
105
|
+
end
|
data/spec/spec_helper.rb
CHANGED
@@ -1 +1,60 @@
|
|
1
1
|
require File.join(File.dirname(__FILE__), *%w[.. lib dwarf])
|
2
|
+
require File.join(File.dirname(__FILE__), *%w[. frawd])
|
3
|
+
|
4
|
+
# http://blog.jayfields.com/2007/04/ruby-assigning-instance-variables-in.html
|
5
|
+
class Module
|
6
|
+
def initializer(*args, &block)
|
7
|
+
define_method :initialize do |*ctor_args|
|
8
|
+
ctor_named_args = (ctor_args.last.is_a?(Hash) ? ctor_args.pop : {})
|
9
|
+
(0..args.size).each do |index|
|
10
|
+
instance_variable_set("@#{args[index]}", ctor_args[index])
|
11
|
+
end
|
12
|
+
ctor_named_args.each_pair do |param_name, param_value|
|
13
|
+
instance_variable_set("@#{param_name}", param_value)
|
14
|
+
end
|
15
|
+
end
|
16
|
+
end
|
17
|
+
end
|
18
|
+
|
19
|
+
class FakeCar
|
20
|
+
initializer :body_style, :cylinders, :wheel_diameter, :transmission
|
21
|
+
attr_accessor :body_style, :cylinders, :wheel_diameter, :transmission
|
22
|
+
|
23
|
+
@@vin_counter = 0
|
24
|
+
|
25
|
+
def vin
|
26
|
+
@vin ||= @@vin_counter+=1
|
27
|
+
end
|
28
|
+
|
29
|
+
def attributes
|
30
|
+
["body_style", "cylinders", "wheel_diameter", "transmission", "vin"]
|
31
|
+
end
|
32
|
+
|
33
|
+
alias_method :attribute_names, :attributes
|
34
|
+
|
35
|
+
def to_s
|
36
|
+
"#{body_style} with #{cylinders} cylinders"
|
37
|
+
end
|
38
|
+
|
39
|
+
def self.valid_body_styles
|
40
|
+
[:boxy, :swoopy, :angry, :boring]
|
41
|
+
end
|
42
|
+
|
43
|
+
def self.valid_cylinders
|
44
|
+
[4, 6, 8]
|
45
|
+
end
|
46
|
+
|
47
|
+
def self.fake
|
48
|
+
new(:body_style => valid_body_styles.sample,
|
49
|
+
:cylinders => valid_cylinders.sample)
|
50
|
+
end
|
51
|
+
|
52
|
+
def self.multiple_fakes(how_many=5)
|
53
|
+
array = []
|
54
|
+
how_many.times do
|
55
|
+
array << fake
|
56
|
+
end
|
57
|
+
array
|
58
|
+
end
|
59
|
+
end
|
60
|
+
|
data/specs.watchr
ADDED
@@ -0,0 +1,60 @@
|
|
1
|
+
# Run me with:
|
2
|
+
#
|
3
|
+
# $ watchr specs.watchr
|
4
|
+
|
5
|
+
# --------------------------------------------------
|
6
|
+
# Convenience Methods
|
7
|
+
# --------------------------------------------------
|
8
|
+
def all_spec_files
|
9
|
+
Dir['spec/**/*_spec.rb']
|
10
|
+
end
|
11
|
+
|
12
|
+
def run_spec_matching(thing_to_match)
|
13
|
+
matches = all_spec_files.grep(/#{thing_to_match}/i)
|
14
|
+
if matches.empty?
|
15
|
+
puts "Sorry, thanks for playing, but there were no matches for #{thing_to_match}"
|
16
|
+
else
|
17
|
+
run matches.join(' ')
|
18
|
+
end
|
19
|
+
end
|
20
|
+
|
21
|
+
def run(files_to_run)
|
22
|
+
puts("Running: #{files_to_run}")
|
23
|
+
system("clear;rspec -cfs #{files_to_run}")
|
24
|
+
no_int_for_you
|
25
|
+
end
|
26
|
+
|
27
|
+
def run_all_specs
|
28
|
+
run(all_spec_files.join(' '))
|
29
|
+
end
|
30
|
+
|
31
|
+
# --------------------------------------------------
|
32
|
+
# Watchr Rules
|
33
|
+
# --------------------------------------------------
|
34
|
+
watch('^spec/(.*)_spec\.rb') { |m| run_spec_matching(m[1]) }
|
35
|
+
watch('^lib/(.*)\.rb') { |m| run_spec_matching(m[1]) }
|
36
|
+
watch('^spec/spec_helper\.rb') { run_all_specs }
|
37
|
+
watch('^spec/frawd\.rb') { run_all_specs }
|
38
|
+
watch('^spec/support/.*\.rb') { run_all_specs }
|
39
|
+
|
40
|
+
# --------------------------------------------------
|
41
|
+
# Signal Handling
|
42
|
+
# --------------------------------------------------
|
43
|
+
|
44
|
+
def no_int_for_you
|
45
|
+
@sent_an_int = nil
|
46
|
+
end
|
47
|
+
|
48
|
+
Signal.trap 'INT' do
|
49
|
+
if @sent_an_int then
|
50
|
+
puts " A second INT? Ok, I get the message. Shutting down now."
|
51
|
+
exit
|
52
|
+
else
|
53
|
+
puts " Did you just send me an INT? Ugh. I'll quit for real if you do it again."
|
54
|
+
@sent_an_int = true
|
55
|
+
Kernel.sleep 1.5
|
56
|
+
run_all_specs
|
57
|
+
end
|
58
|
+
end
|
59
|
+
|
60
|
+
# vim:ft=ruby
|
metadata
CHANGED
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
|
|
5
5
|
segments:
|
6
6
|
- 0
|
7
7
|
- 0
|
8
|
-
-
|
9
|
-
version: 0.0.
|
8
|
+
- 5
|
9
|
+
version: 0.0.5
|
10
10
|
platform: ruby
|
11
11
|
authors:
|
12
12
|
- Alex Redington
|
@@ -14,7 +14,7 @@ autorequire:
|
|
14
14
|
bindir: bin
|
15
15
|
cert_chain: []
|
16
16
|
|
17
|
-
date: 2010-
|
17
|
+
date: 2010-11-05 00:00:00 -04:00
|
18
18
|
default_executable:
|
19
19
|
dependencies:
|
20
20
|
- !ruby/object:Gem::Dependency
|
@@ -62,6 +62,35 @@ dependencies:
|
|
62
62
|
version: 2.0.1
|
63
63
|
type: :development
|
64
64
|
version_requirements: *id003
|
65
|
+
- !ruby/object:Gem::Dependency
|
66
|
+
name: watchr
|
67
|
+
prerelease: false
|
68
|
+
requirement: &id004 !ruby/object:Gem::Requirement
|
69
|
+
none: false
|
70
|
+
requirements:
|
71
|
+
- - ">="
|
72
|
+
- !ruby/object:Gem::Version
|
73
|
+
segments:
|
74
|
+
- 0
|
75
|
+
- 7
|
76
|
+
version: "0.7"
|
77
|
+
type: :development
|
78
|
+
version_requirements: *id004
|
79
|
+
- !ruby/object:Gem::Dependency
|
80
|
+
name: faker
|
81
|
+
prerelease: false
|
82
|
+
requirement: &id005 !ruby/object:Gem::Requirement
|
83
|
+
none: false
|
84
|
+
requirements:
|
85
|
+
- - ">="
|
86
|
+
- !ruby/object:Gem::Version
|
87
|
+
segments:
|
88
|
+
- 0
|
89
|
+
- 3
|
90
|
+
- 1
|
91
|
+
version: 0.3.1
|
92
|
+
type: :development
|
93
|
+
version_requirements: *id005
|
65
94
|
description: Dwarf is an implementation of decision tree learning algorithms targeted for use in the Rails 3 console environment for classifying ActiveRecord objects.
|
66
95
|
email:
|
67
96
|
- aredington@gmail.com
|
@@ -77,13 +106,19 @@ files:
|
|
77
106
|
- Gemfile.lock
|
78
107
|
- README.md
|
79
108
|
- Rakefile
|
109
|
+
- TODO.taskpaper
|
80
110
|
- dwarf.gemspec
|
81
111
|
- lib/dwarf.rb
|
82
112
|
- lib/dwarf/classifier.rb
|
113
|
+
- lib/dwarf/example_management.rb
|
114
|
+
- lib/dwarf/information.rb
|
83
115
|
- lib/dwarf/tree_node.rb
|
84
116
|
- lib/dwarf/version.rb
|
85
|
-
- spec/classifier_spec.rb
|
117
|
+
- spec/dwarf/classifier_spec.rb
|
118
|
+
- spec/dwarf/information_spec.rb
|
119
|
+
- spec/frawd.rb
|
86
120
|
- spec/spec_helper.rb
|
121
|
+
- specs.watchr
|
87
122
|
has_rdoc: true
|
88
123
|
homepage: http://github.com/aredington/dwarf
|
89
124
|
licenses: []
|
data/spec/classifier_spec.rb
DELETED
@@ -1,80 +0,0 @@
|
|
1
|
-
require File.join(File.dirname(__FILE__), *%w[spec_helper])
|
2
|
-
|
3
|
-
describe Dwarf::Classifier do
|
4
|
-
|
5
|
-
before(:each) do
|
6
|
-
@classifier = Dwarf::Classifier.new()
|
7
|
-
end
|
8
|
-
|
9
|
-
it "accepts example classifications" do
|
10
|
-
@classifier.should respond_to(:add_example)
|
11
|
-
end
|
12
|
-
|
13
|
-
it "stores examples" do
|
14
|
-
@example3 = double('example3')
|
15
|
-
@example3.stub(:attributes) { [] }
|
16
|
-
@classifier.add_example(@example3, :irish)
|
17
|
-
@classifier.examples.should include(@example3)
|
18
|
-
end
|
19
|
-
|
20
|
-
it "only implements classify on the learning instance" do
|
21
|
-
@example = double('example3')
|
22
|
-
@example.stub(:attributes) { [] }
|
23
|
-
@class2 = Dwarf::Classifier.new()
|
24
|
-
@classifier.add_example(@example, :round)
|
25
|
-
@classifier.learn!
|
26
|
-
@classifier.classify(@example).should eq(:round)
|
27
|
-
@class2.classify(@example).should eq(nil)
|
28
|
-
end
|
29
|
-
|
30
|
-
context "classifying cars" do
|
31
|
-
def mock_car_examples
|
32
|
-
@example1 = double('example1')
|
33
|
-
@example1.stub(:body_style) { :boxy }
|
34
|
-
@example1.stub(:cylinders) { 4 }
|
35
|
-
@example1.stub(:attributes) { ["body_style", "cylinders"] }
|
36
|
-
@example2 = double('example2')
|
37
|
-
@example2.stub(:body_style) { :swoopy }
|
38
|
-
@example2.stub(:cylinders) { 6 }
|
39
|
-
@example2.stub(:attributes) { ["body_style", "cylinders"] }
|
40
|
-
@example3 = double('example3')
|
41
|
-
@example3.stub(:body_style) { :angry }
|
42
|
-
@example3.stub(:cylinders) { 8 }
|
43
|
-
@example3.stub(:attributes) { ["body_style", "cylinders"] }
|
44
|
-
@example4 = double('example4')
|
45
|
-
@example4.stub(:body_style) {:swoopy}
|
46
|
-
@example4.stub(:cylinders) {8}
|
47
|
-
@example4.stub(:attributes) { ["body_style", "cylinders"] }
|
48
|
-
end
|
49
|
-
|
50
|
-
it "enumerate example attributes" do
|
51
|
-
mock_car_examples
|
52
|
-
@classifier.add_example(@example1, :japanese)
|
53
|
-
@classifier.example_attributes.should include("body_style", "cylinders")
|
54
|
-
end
|
55
|
-
|
56
|
-
it "classifies in a trivial case" do
|
57
|
-
mock_car_examples
|
58
|
-
@classifier.add_example(@example1, :japanese)
|
59
|
-
@classifier.add_example(@example2, :german)
|
60
|
-
@classifier.add_example(@example3, :american)
|
61
|
-
@classifier.learn!
|
62
|
-
@classifier.classify(@example1).should eq(:japanese)
|
63
|
-
@classifier.classify(@example2).should eq(:german)
|
64
|
-
@classifier.classify(@example3).should eq(:american)
|
65
|
-
end
|
66
|
-
|
67
|
-
it "classifies when multiple predicates required" do
|
68
|
-
mock_car_examples
|
69
|
-
@classifier.add_examples(@example1 => :japanese, @example2 => :german, @example3 => :american, @example4 => :german)
|
70
|
-
@classifier.learn!
|
71
|
-
@classifier.classify(@example1).should eq(:japanese)
|
72
|
-
@classifier.classify(@example2).should eq(:german)
|
73
|
-
@classifier.classify(@example3).should eq(:american)
|
74
|
-
@classifier.classify(@example4).should eq(:german)
|
75
|
-
end
|
76
|
-
|
77
|
-
end
|
78
|
-
|
79
|
-
|
80
|
-
end
|