dwarf 0.0.4 → 0.0.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Gemfile.lock +5 -1
- data/Rakefile +1 -0
- data/TODO.taskpaper +22 -0
- data/dwarf.gemspec +2 -0
- data/lib/dwarf.rb +3 -1
- data/lib/dwarf/classifier.rb +111 -66
- data/lib/dwarf/example_management.rb +32 -0
- data/lib/dwarf/information.rb +61 -0
- data/lib/dwarf/version.rb +1 -1
- data/spec/dwarf/classifier_spec.rb +197 -0
- data/spec/dwarf/information_spec.rb +157 -0
- data/spec/frawd.rb +105 -0
- data/spec/spec_helper.rb +59 -0
- data/specs.watchr +60 -0
- metadata +39 -4
- data/spec/classifier_spec.rb +0 -80
data/Gemfile.lock
CHANGED
@@ -1,13 +1,14 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
dwarf (0.0.
|
4
|
+
dwarf (0.0.4)
|
5
5
|
rubytree (>= 0.8.1)
|
6
6
|
|
7
7
|
GEM
|
8
8
|
remote: http://rubygems.org/
|
9
9
|
specs:
|
10
10
|
diff-lcs (1.1.2)
|
11
|
+
faker (0.3.1)
|
11
12
|
rspec (2.0.1)
|
12
13
|
rspec-core (~> 2.0.1)
|
13
14
|
rspec-expectations (~> 2.0.1)
|
@@ -19,6 +20,7 @@ GEM
|
|
19
20
|
rspec-core (~> 2.0.1)
|
20
21
|
rspec-expectations (~> 2.0.1)
|
21
22
|
rubytree (0.8.1)
|
23
|
+
watchr (0.7)
|
22
24
|
|
23
25
|
PLATFORMS
|
24
26
|
ruby
|
@@ -26,5 +28,7 @@ PLATFORMS
|
|
26
28
|
DEPENDENCIES
|
27
29
|
bundler (>= 1.0.0)
|
28
30
|
dwarf!
|
31
|
+
faker (>= 0.3.1)
|
29
32
|
rspec (>= 2.0.1)
|
30
33
|
rubytree (>= 0.8.1)
|
34
|
+
watchr (>= 0.7)
|
data/Rakefile
CHANGED
data/TODO.taskpaper
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
|
2
|
+
Dwarf 1.0 Features:
|
3
|
+
|
4
|
+
Find all instances with a given classification given a world. @alex @priority(3)
|
5
|
+
- Create queries to find all instances of a given classification. @priority(1)
|
6
|
+
- Make logic of a query for a given classification readable (as Ruby, or SQL, or ...) @priority(1)
|
7
|
+
- Generate large worlds with consistent instances to test against. @muness @priority(2)
|
8
|
+
- Handle nested features (e.g. example.engine.cylinders) @priority(1)
|
9
|
+
Handle messy data well (nil examples, examples with nil features, examples with nil subfeatures) @alex @priority(2)
|
10
|
+
Resolve weird behavior when all examples are missing some attribute. @alex @sam @priority(1)
|
11
|
+
- When attribute.nil? bisects a heterogeneous group, we should probably split on that attribute. @priority(1)
|
12
|
+
Refactor information theory methods out to enable unit testing. @alex @priority(3)
|
13
|
+
Eliminate features which are unique across all examples @sam @alex @priority(2)
|
14
|
+
- Treat hashes as nested features. @priority(1)
|
15
|
+
|
16
|
+
Dwarf Nice To Haves:
|
17
|
+
|
18
|
+
- meta-features based on type, e.g. parity(car.engine.cylinders) can be :even or :odd @priority(2)
|
19
|
+
- Bayesian classification of text fields. @priority(1)
|
20
|
+
- Junk uniquely identifying features (implicit in info gain calculations? add tests to verify!) @priority(3)
|
21
|
+
- Modular feature enumeration and feature fetching code. (Don't rely on attributes and example.attribute to be your only duck type checks!) @priority(2)
|
22
|
+
- A world-generation tool to create internally consistent data sets to measure dwarf's learning against. Maybe we can call it "frawd". @priority(1)
|
data/dwarf.gemspec
CHANGED
@@ -17,6 +17,8 @@ Gem::Specification.new do |s|
|
|
17
17
|
s.add_dependency "rubytree", ">= 0.8.1"
|
18
18
|
s.add_development_dependency "bundler", ">= 1.0.0"
|
19
19
|
s.add_development_dependency "rspec", ">= 2.0.1"
|
20
|
+
s.add_development_dependency "watchr", ">= 0.7"
|
21
|
+
s.add_development_dependency "faker", ">= 0.3.1"
|
20
22
|
|
21
23
|
s.files = `git ls-files`.split("\n")
|
22
24
|
s.executables = `git ls-files`.split("\n").map{|f| f =~ /^bin\/(.*)/ ? $1 : nil}.compact
|
data/lib/dwarf.rb
CHANGED
data/lib/dwarf/classifier.rb
CHANGED
@@ -1,12 +1,15 @@
|
|
1
1
|
module Dwarf
|
2
2
|
class Classifier
|
3
|
+
|
3
4
|
attr_accessor :examples
|
4
5
|
attr_accessor :example_attributes
|
5
6
|
attr_accessor :classifier_logic
|
7
|
+
attr_reader :decision_tree
|
6
8
|
|
7
9
|
def initialize()
|
8
10
|
@examples, @example_attributes = {}, []
|
9
11
|
@decision_tree = TreeNode.new("ROOT")
|
12
|
+
@nil_name = Object.new.to_s
|
10
13
|
end
|
11
14
|
|
12
15
|
def add_examples(example_hash)
|
@@ -17,7 +20,7 @@ module Dwarf
|
|
17
20
|
|
18
21
|
def add_example(example_record, classification)
|
19
22
|
@examples[example_record]=classification
|
20
|
-
@example_attributes |= example_record.
|
23
|
+
@example_attributes |= example_record.attribute_names
|
21
24
|
end
|
22
25
|
|
23
26
|
def classify(example)
|
@@ -26,37 +29,120 @@ module Dwarf
|
|
26
29
|
|
27
30
|
def learn!
|
28
31
|
@decision_tree.examples = @examples.keys
|
32
|
+
converge_tree
|
33
|
+
self.classifier_logic = codify_tree(@decision_tree)
|
34
|
+
implement_classify
|
35
|
+
end
|
36
|
+
|
37
|
+
def find_by_classification(world, classification)
|
38
|
+
matches = []
|
39
|
+
world.each do |instance|
|
40
|
+
if classify(instance) == classification
|
41
|
+
matches << instance
|
42
|
+
end
|
43
|
+
end
|
44
|
+
matches
|
45
|
+
end
|
46
|
+
|
47
|
+
private
|
48
|
+
|
49
|
+
include ExampleManagement
|
50
|
+
|
51
|
+
def converge_tree
|
29
52
|
pending = []
|
30
53
|
pending.push @decision_tree
|
31
|
-
used_attributes = []
|
32
54
|
until pending.empty?
|
33
55
|
node = pending.pop
|
34
56
|
if classification = homogenous_examples(node)
|
35
57
|
node.classification = classification
|
36
58
|
elsif no_valuable_attributes?(node) && node.parent
|
37
|
-
|
59
|
+
if split_nil_children = check_nil_split(node)
|
60
|
+
split_nil_children.each {|child_node| pending.push(child_node)}
|
61
|
+
else
|
62
|
+
create_expected_value(node)
|
63
|
+
end
|
38
64
|
elsif no_valuable_attributes?(node)
|
39
|
-
|
65
|
+
node.classification = expected_value(node.examples)
|
40
66
|
elsif false #stub branch
|
41
67
|
#C4.5 would also allow for previously unseen classifications
|
42
|
-
#dwarf
|
43
|
-
#
|
68
|
+
#dwarf needs to correctly handle a pre-existing tree when
|
69
|
+
#learn! is called
|
44
70
|
else
|
45
|
-
|
46
|
-
|
47
|
-
infogains[information_gain(node.examples,example_attribute)] = example_attribute
|
48
|
-
end
|
49
|
-
best_gain = infogains.keys.sort[0]
|
50
|
-
best_attribute = infogains[best_gain]
|
51
|
-
split(node,best_attribute).each {|child_node| pending.push(child_node)}
|
52
|
-
used_attributes << best_attribute
|
71
|
+
split_children = homogenize_children(node)
|
72
|
+
split_children.each {|child_node| pending.push(child_node)}
|
53
73
|
end
|
54
74
|
end
|
55
|
-
self.classifier_logic = codify_tree(@decision_tree)
|
56
|
-
implement_classify
|
57
75
|
end
|
58
76
|
|
59
|
-
|
77
|
+
def check_nil_split(node)
|
78
|
+
infogains = {}
|
79
|
+
|
80
|
+
used_attributes = used_attributes(node)
|
81
|
+
(filtered_attributes-used_attributes).each do |example_attribute|
|
82
|
+
infogains[Information::unfiltered_information_gain(node.examples,example_attribute,@examples)] =
|
83
|
+
example_attribute
|
84
|
+
end
|
85
|
+
best_gain = infogains.keys.sort[0]
|
86
|
+
best_attribute = infogains[best_gain]
|
87
|
+
if best_gain > 0.0
|
88
|
+
return split(node, best_attribute)
|
89
|
+
end
|
90
|
+
|
91
|
+
end
|
92
|
+
|
93
|
+
def create_expected_value(node)
|
94
|
+
new_node = TreeNode.new(node.name)
|
95
|
+
expected_value = expected_value(node.examples)
|
96
|
+
new_node.classification = expected_value
|
97
|
+
parent = node.parent
|
98
|
+
parent.remove! node
|
99
|
+
parent << new_node
|
100
|
+
new_node << node
|
101
|
+
end
|
102
|
+
|
103
|
+
def used_attributes(node)
|
104
|
+
if node.parentage
|
105
|
+
node.parentage.map { |parent| parent.attribute }
|
106
|
+
else
|
107
|
+
[]
|
108
|
+
end
|
109
|
+
end
|
110
|
+
|
111
|
+
def attribute_homogeneous?(example_subset, attribute)
|
112
|
+
invert_with_dups(attribute_map(example_subset, attribute)).keys.size == 1
|
113
|
+
end
|
114
|
+
|
115
|
+
def heterogeneous_attributes
|
116
|
+
@example_attributes.reject { |attr| attribute_homogeneous?(@examples.keys, attr) }
|
117
|
+
end
|
118
|
+
|
119
|
+
def attribute_clusters?(example_subset, attribute)
|
120
|
+
invert_with_dups(attribute_map(example_subset, attribute)).keys.size == example_subset.size
|
121
|
+
end
|
122
|
+
|
123
|
+
def clustering_attributes
|
124
|
+
@example_attributes.select {|attr| attribute_clusters?(@examples.keys, attr) }
|
125
|
+
end
|
126
|
+
|
127
|
+
def filtered_attributes
|
128
|
+
clustering_attributes | heterogeneous_attributes
|
129
|
+
end
|
130
|
+
|
131
|
+
def homogenize_children(node)
|
132
|
+
infogains = {}
|
133
|
+
|
134
|
+
used_attributes = used_attributes(node)
|
135
|
+
|
136
|
+
(filtered_attributes-used_attributes).each do |example_attribute|
|
137
|
+
infogains[Information::information_gain(node.examples,example_attribute,@examples)] =
|
138
|
+
example_attribute
|
139
|
+
end
|
140
|
+
|
141
|
+
best_gain = infogains.keys.sort[0]
|
142
|
+
best_attribute = infogains[best_gain]
|
143
|
+
|
144
|
+
return split(node,best_attribute)
|
145
|
+
end
|
60
146
|
|
61
147
|
def implement_classify
|
62
148
|
classify_impl = "def classify(example)\n#{self.classifier_logic}\nend"
|
@@ -85,6 +171,7 @@ module Dwarf
|
|
85
171
|
|
86
172
|
def codify_literal(object)
|
87
173
|
case object
|
174
|
+
when @nil_name then "nil"
|
88
175
|
when Symbol then ":#{object}"
|
89
176
|
when String then "\"#{object}\""
|
90
177
|
else
|
@@ -97,6 +184,9 @@ module Dwarf
|
|
97
184
|
example_subset = node.examples
|
98
185
|
examples_inversion = invert_with_dups(attribute_map(example_subset,attribute))
|
99
186
|
examples_inversion.each do |key, value|
|
187
|
+
if key.nil?
|
188
|
+
key = @nil_name
|
189
|
+
end
|
100
190
|
child_node = TreeNode.new(key)
|
101
191
|
child_node.examples = value
|
102
192
|
node << child_node
|
@@ -106,21 +196,20 @@ module Dwarf
|
|
106
196
|
end
|
107
197
|
|
108
198
|
def expected_value(example_subset)
|
109
|
-
examples_inversion = invert_with_dups(classification_map(example_subset))
|
199
|
+
examples_inversion = invert_with_dups(classification_map(example_subset, @examples))
|
110
200
|
occurrences = examples_inversion.merge(examples_inversion) { |key, value| value.length }
|
111
201
|
occurrences.keys.sort { |key| occurrences[key] }[0]
|
112
202
|
end
|
113
203
|
|
114
204
|
def no_valuable_attributes?(node)
|
115
|
-
|
116
|
-
information_gain(node.examples, example_attribute)}.each {|info_gain|
|
205
|
+
filtered_attributes.map {|example_attribute|
|
206
|
+
Information::information_gain(node.examples, example_attribute, @examples)}.each {|info_gain|
|
117
207
|
return false if info_gain != 0}
|
118
208
|
return true
|
119
209
|
end
|
120
210
|
|
121
|
-
|
122
211
|
def homogenous_examples(node)
|
123
|
-
classifications =
|
212
|
+
classifications = filter_classifications(@examples, node.examples)
|
124
213
|
if classifications.length == 1
|
125
214
|
return classifications[0]
|
126
215
|
else
|
@@ -128,49 +217,5 @@ module Dwarf
|
|
128
217
|
end
|
129
218
|
end
|
130
219
|
|
131
|
-
def entropy(example_subset)
|
132
|
-
set_size = example_subset.length.to_f
|
133
|
-
examples_inversion = invert_with_dups(classification_map(example_subset))
|
134
|
-
occurences = examples_inversion.merge(examples_inversion) { |key, value| value.length.to_f }
|
135
|
-
0.0 - classifications(example_subset).inject(0.0) do |sum, classification|
|
136
|
-
sum + ((occurences[classification]/set_size)* Math.log2((occurences[classification]/set_size)))
|
137
|
-
end
|
138
|
-
end
|
139
|
-
|
140
|
-
def information_gain(example_subset,attribute)
|
141
|
-
set_size = example_subset.length.to_f
|
142
|
-
examples_inversion = invert_with_dups(attribute_map(example_subset,attribute))
|
143
|
-
occurrences = examples_inversion.merge(examples_inversion) { |key, value| value.length }
|
144
|
-
entropy(example_subset) - attribute_values(example_subset,attribute).inject(0.0) do |sum, attribute_value|
|
145
|
-
sum + (occurrences[attribute_value]/set_size) * entropy(examples_inversion[attribute_value])
|
146
|
-
end
|
147
|
-
end
|
148
|
-
|
149
|
-
def classifications(example_subset)
|
150
|
-
example_subset.map {|example| @examples[example]}.compact
|
151
|
-
end
|
152
|
-
|
153
|
-
def classification_map(example_subset)
|
154
|
-
classification_map = {}
|
155
|
-
example_subset.each {|example| classification_map[example] = @examples[example]}
|
156
|
-
classification_map
|
157
|
-
end
|
158
|
-
|
159
|
-
def attribute_values(example_subset, attribute)
|
160
|
-
example_subset.map {|example| example.method(attribute.to_sym).call}.compact
|
161
|
-
end
|
162
|
-
|
163
|
-
def attribute_map(example_subset, attribute)
|
164
|
-
example_map = {}
|
165
|
-
example_subset.each {|example| example_map[example] = example.method(attribute.to_sym).call}
|
166
|
-
example_map
|
167
|
-
end
|
168
|
-
|
169
|
-
def invert_with_dups(hash)
|
170
|
-
inversion = {}
|
171
|
-
hash.values.each {|value| inversion[value] = []}
|
172
|
-
hash.keys.each {|key| inversion[hash[key]] << key}
|
173
|
-
inversion
|
174
|
-
end
|
175
220
|
end
|
176
221
|
end
|
@@ -0,0 +1,32 @@
|
|
1
|
+
module Dwarf
|
2
|
+
module ExampleManagement
|
3
|
+
|
4
|
+
def classification_map(example_subset, classifications)
|
5
|
+
classification_map = {}
|
6
|
+
example_subset.each {|example| classification_map[example] = classifications[example]}
|
7
|
+
classification_map
|
8
|
+
end
|
9
|
+
|
10
|
+
def invert_with_dups(hash)
|
11
|
+
inversion = { }
|
12
|
+
hash.values.each {|value| inversion[value] = []}
|
13
|
+
hash.keys.each {|key| inversion[hash[key]] << key}
|
14
|
+
inversion
|
15
|
+
end
|
16
|
+
|
17
|
+
def eval_attribute(example,attribute)
|
18
|
+
example.method(attribute.to_sym).call
|
19
|
+
end
|
20
|
+
|
21
|
+
def attribute_map(example_subset, attribute)
|
22
|
+
example_map = {}
|
23
|
+
example_subset.each {|example| example_map[example] = eval_attribute(example, attribute)}
|
24
|
+
example_map
|
25
|
+
end
|
26
|
+
|
27
|
+
def filter_classifications(classifications,example_subset)
|
28
|
+
example_subset.map {|example| classifications[example]}.uniq
|
29
|
+
end
|
30
|
+
|
31
|
+
end
|
32
|
+
end
|
@@ -0,0 +1,61 @@
|
|
1
|
+
module Dwarf
|
2
|
+
module Information
|
3
|
+
|
4
|
+
class<< self
|
5
|
+
include ExampleManagement
|
6
|
+
|
7
|
+
def entropy(example_subset, classifications)
|
8
|
+
seen_classifications = filter_classifications(classifications, example_subset)
|
9
|
+
return 0.0 if seen_classifications.length == 1
|
10
|
+
set_size = example_subset.length.to_f
|
11
|
+
examples_inversion = invert_with_dups(classification_map(example_subset, classifications))
|
12
|
+
occurrences = occurrences(examples_inversion)
|
13
|
+
sum_over(seen_classifications) do |classification|
|
14
|
+
frequency = occurrences[classification]/set_size
|
15
|
+
- frequency * Math.log(frequency,seen_classifications.length)
|
16
|
+
end
|
17
|
+
end
|
18
|
+
|
19
|
+
def information_gain(example_subset, attribute, classifications)
|
20
|
+
filtered_example_subset = filter_for_missing_attribute(example_subset, attribute)
|
21
|
+
unfiltered_information_gain(filtered_example_subset, attribute, classifications)
|
22
|
+
end
|
23
|
+
|
24
|
+
def unfiltered_information_gain(example_subset, attribute, classifications)
|
25
|
+
set_size = example_subset.length.to_f
|
26
|
+
examples_inversion = invert_with_dups(attribute_map(example_subset,attribute))
|
27
|
+
occurrences = occurrences(examples_inversion)
|
28
|
+
heterogeneous_entropy = entropy(example_subset, classifications)
|
29
|
+
seen_attribute_values = attribute_values(example_subset,attribute)
|
30
|
+
heterogeneous_entropy -
|
31
|
+
sum_over(seen_attribute_values) do |attribute_value|
|
32
|
+
frequency = occurrences[attribute_value]/set_size
|
33
|
+
frequency * entropy(examples_inversion[attribute_value], classifications)
|
34
|
+
end
|
35
|
+
end
|
36
|
+
|
37
|
+
private
|
38
|
+
|
39
|
+
def sum_over(collection)
|
40
|
+
collection.inject(0.0) do |sum, classification|
|
41
|
+
sum + yield(classification)
|
42
|
+
end
|
43
|
+
end
|
44
|
+
|
45
|
+
def occurrences(examples_inversion)
|
46
|
+
examples_inversion.merge(examples_inversion) { |key, value| value.length.to_f }
|
47
|
+
end
|
48
|
+
|
49
|
+
def filter_for_missing_attribute(example_subset, attribute)
|
50
|
+
example_subset.reject { |example| eval_attribute(example,attribute).nil? }
|
51
|
+
end
|
52
|
+
|
53
|
+
def attribute_values(example_subset, attribute)
|
54
|
+
example_subset.map {|example| eval_attribute(example, attribute)}.uniq
|
55
|
+
end
|
56
|
+
|
57
|
+
end
|
58
|
+
|
59
|
+
end
|
60
|
+
|
61
|
+
end
|
data/lib/dwarf/version.rb
CHANGED
@@ -0,0 +1,197 @@
|
|
1
|
+
require File.join(File.dirname(__FILE__), *%w[.. spec_helper.rb])
|
2
|
+
|
3
|
+
describe Dwarf::Classifier do
|
4
|
+
|
5
|
+
before(:each) do
|
6
|
+
@classifier = Dwarf::Classifier.new()
|
7
|
+
end
|
8
|
+
|
9
|
+
def mock_car_examples
|
10
|
+
@example1 = FakeCar.new(:body_style => :boxy,
|
11
|
+
:cylinders => 4,
|
12
|
+
:transmission => :manual)
|
13
|
+
@example2 = FakeCar.new(:body_style => :swoopy,
|
14
|
+
:cylinders => 6,
|
15
|
+
:transmission => :manual)
|
16
|
+
@example3 = FakeCar.new(:body_style => :angry,
|
17
|
+
:cylinders => 8,
|
18
|
+
:transmission => :manual)
|
19
|
+
@example4 = FakeCar.new(:body_style => :swoopy,
|
20
|
+
:cylinders => 8,
|
21
|
+
:transmission => :manual)
|
22
|
+
@example5 = FakeCar.new(:body_style => nil,
|
23
|
+
:cylinders => 6,
|
24
|
+
:transmission => :manual)
|
25
|
+
@example6 = FakeCar.new(:body_style => :sleek,
|
26
|
+
:cylinders => nil,
|
27
|
+
:transmission => :manual)
|
28
|
+
end
|
29
|
+
|
30
|
+
context "add_example" do
|
31
|
+
|
32
|
+
it "accepts example classifications" do
|
33
|
+
@classifier.should respond_to(:add_example)
|
34
|
+
end
|
35
|
+
|
36
|
+
it "stores examples" do
|
37
|
+
@example3 = double('example3')
|
38
|
+
@example3.stub(:attribute_names) { [] }
|
39
|
+
@classifier.add_example(@example3, :irish)
|
40
|
+
@classifier.examples.should include(@example3)
|
41
|
+
end
|
42
|
+
|
43
|
+
it "enumerate example attributes" do
|
44
|
+
@example_with_attributes = double('attrs')
|
45
|
+
@example_with_attributes.stub(:attribute_names) { ["height", "branch_density"] }
|
46
|
+
@classifier.add_example(@example_with_attributes, :pine)
|
47
|
+
@classifier.example_attributes.should include("height", "branch_density")
|
48
|
+
end
|
49
|
+
|
50
|
+
it "gracefully accepts examples with nil attributes" do
|
51
|
+
@example_with_nil_attributes = double('nils')
|
52
|
+
@example_with_nil_attributes.stub(:attribute_names) { ["height", "branch_density"] }
|
53
|
+
@example_with_nil_attributes.stub(:height) { nil }
|
54
|
+
@example_with_nil_attributes.stub(:branch_density) { :high }
|
55
|
+
lambda {@classifier.add_example(@example_with_nil_attributes, :pine)}.should_not raise_exception
|
56
|
+
end
|
57
|
+
|
58
|
+
end
|
59
|
+
|
60
|
+
context "learn! and classify" do
|
61
|
+
|
62
|
+
it "only implements classify on the learning instance" do
|
63
|
+
@example = double('example3')
|
64
|
+
@example.stub(:attribute_names) { [] }
|
65
|
+
@class2 = Dwarf::Classifier.new()
|
66
|
+
@classifier.add_example(@example, :round)
|
67
|
+
@classifier.learn!
|
68
|
+
@classifier.classify(@example) == :round
|
69
|
+
@class2.classify(@example).should == nil
|
70
|
+
end
|
71
|
+
|
72
|
+
context "frawd is dwarf backwards" do
|
73
|
+
before(:each) do
|
74
|
+
@frawd = Frawd.new(1,100)
|
75
|
+
end
|
76
|
+
|
77
|
+
it "is totally awesome" do
|
78
|
+
@frawd.training.each do |example, classification|
|
79
|
+
@classifier.add_example(example, classification)
|
80
|
+
end
|
81
|
+
@classifier.learn!
|
82
|
+
success = 0
|
83
|
+
@frawd.testing.each do |example, classification|
|
84
|
+
success += 1 if @classifier.classify(example) == classification
|
85
|
+
end
|
86
|
+
success.should == @frawd.testing.size
|
87
|
+
end
|
88
|
+
end
|
89
|
+
|
90
|
+
context "classifying cars" do
|
91
|
+
|
92
|
+
before(:each) do
|
93
|
+
mock_car_examples
|
94
|
+
@classifier.add_example(@example1, :japanese)
|
95
|
+
@classifier.add_example(@example2, :german)
|
96
|
+
@classifier.add_example(@example3, :american)
|
97
|
+
end
|
98
|
+
|
99
|
+
it "classifies in a trivial case" do
|
100
|
+
@classifier.learn!
|
101
|
+
@classifier.classify(@example1).should == :japanese
|
102
|
+
@classifier.classify(@example2).should == :german
|
103
|
+
@classifier.classify(@example3).should == :american
|
104
|
+
end
|
105
|
+
|
106
|
+
it "classifies when multiple predicates required" do
|
107
|
+
@classifier.add_example(@example4, :german)
|
108
|
+
@classifier.learn!
|
109
|
+
@classifier.classify(@example1).should == :japanese
|
110
|
+
@classifier.classify(@example2).should == :german
|
111
|
+
@classifier.classify(@example3).should == :american
|
112
|
+
@classifier.classify(@example4).should == :german
|
113
|
+
end
|
114
|
+
|
115
|
+
it "handles nils gracefully" do
|
116
|
+
@classifier.add_examples(@example4 => :german,
|
117
|
+
@example5 => :japanese,
|
118
|
+
@example6 => :japanese)
|
119
|
+
lambda{@classifier.learn!}.should_not raise_exception
|
120
|
+
end
|
121
|
+
|
122
|
+
it "handles nils correctly" do
|
123
|
+
@classifier.add_examples(@example4 => :german,
|
124
|
+
@example5 => :japanese,
|
125
|
+
@example6 => :japanese)
|
126
|
+
@classifier.learn!
|
127
|
+
@classifier.classify(@example1).should == :japanese
|
128
|
+
@classifier.classify(@example2).should == :german
|
129
|
+
@classifier.classify(@example3).should == :american
|
130
|
+
@classifier.classify(@example4).should == :german
|
131
|
+
@classifier.classify(@example5).should == :japanese
|
132
|
+
@classifier.classify(@example6).should == :japanese
|
133
|
+
end
|
134
|
+
|
135
|
+
it "handles a feature missing from all examples correctly" do
|
136
|
+
@classifier.add_examples(@example4 => :german,
|
137
|
+
@example5 => :japanese,
|
138
|
+
@example6 => :japanese)
|
139
|
+
@classifier.learn!
|
140
|
+
open = [@classifier.decision_tree]
|
141
|
+
until open.empty?
|
142
|
+
current = open.pop
|
143
|
+
current.attribute.should_not == "wheel_diameter"
|
144
|
+
current.children.each {|child| open.push child}
|
145
|
+
end
|
146
|
+
end
|
147
|
+
|
148
|
+
it "does not use a feature which is identical across all examples" do
|
149
|
+
@classifier.add_examples(@example4 => :german,
|
150
|
+
@example5 => :japanese,
|
151
|
+
@example6 => :japanese)
|
152
|
+
@classifier.learn!
|
153
|
+
open = [@classifier.decision_tree]
|
154
|
+
until open.empty?
|
155
|
+
current = open.pop
|
156
|
+
current.attribute.should_not == "transmission"
|
157
|
+
current.children.each {|child| open.push child}
|
158
|
+
end
|
159
|
+
end
|
160
|
+
|
161
|
+
it "does not use a feature unique to each example" do
|
162
|
+
@classifier.add_examples(@example4 => :german,
|
163
|
+
@example5 => :japanese,
|
164
|
+
@example6 => :japanese)
|
165
|
+
@classifier.learn!
|
166
|
+
open = [@classifier.decision_tree]
|
167
|
+
until open.empty?
|
168
|
+
current = open.pop
|
169
|
+
current.attribute.should_not == "vin"
|
170
|
+
current.children.each {|child| open.push child}
|
171
|
+
end
|
172
|
+
|
173
|
+
end
|
174
|
+
|
175
|
+
end
|
176
|
+
|
177
|
+
end
|
178
|
+
|
179
|
+
context "find_by_classification" do
|
180
|
+
|
181
|
+
it "returns sets of cars based on class" do
|
182
|
+
mock_car_examples
|
183
|
+
@classifier.add_examples(@example1 => :japanese,
|
184
|
+
@example2 => :german,
|
185
|
+
@example3 => :american,
|
186
|
+
@example4 => :german)
|
187
|
+
@classifier.learn!
|
188
|
+
all_cars = [@example1, @example2, @example3, @example4]
|
189
|
+
japanese_cars = @classifier.find_by_classification(all_cars, :japanese)
|
190
|
+
japanese_cars.should == [@example1]
|
191
|
+
end
|
192
|
+
|
193
|
+
|
194
|
+
end
|
195
|
+
|
196
|
+
|
197
|
+
end
|
@@ -0,0 +1,157 @@
|
|
1
|
+
require File.join(File.dirname(__FILE__), *%w[.. spec_helper.rb])
|
2
|
+
|
3
|
+
describe Dwarf::Information do
|
4
|
+
|
5
|
+
class Deck
|
6
|
+
|
7
|
+
def initialize()
|
8
|
+
@draw = (1..52).map{|v| v}
|
9
|
+
@draw.shuffle!
|
10
|
+
@discard = []
|
11
|
+
end
|
12
|
+
|
13
|
+
def sample
|
14
|
+
unless @draw.empty?
|
15
|
+
card = @draw.pop
|
16
|
+
@discard.push card
|
17
|
+
return card
|
18
|
+
else
|
19
|
+
@draw = @discard
|
20
|
+
@draw.shuffle
|
21
|
+
@discard = []
|
22
|
+
return self.sample
|
23
|
+
end
|
24
|
+
end
|
25
|
+
|
26
|
+
end
|
27
|
+
|
28
|
+
class Coin
|
29
|
+
|
30
|
+
def initialize(weighting)
|
31
|
+
@weighting = weighting
|
32
|
+
@faces = [:heads, :tails]
|
33
|
+
end
|
34
|
+
|
35
|
+
def attributes
|
36
|
+
"weighting"
|
37
|
+
end
|
38
|
+
|
39
|
+
def weighting
|
40
|
+
@weighting
|
41
|
+
end
|
42
|
+
|
43
|
+
def sample
|
44
|
+
case @weighting
|
45
|
+
when :fair then @faces.sample
|
46
|
+
when :heads then :heads
|
47
|
+
when :tails then :tails
|
48
|
+
end
|
49
|
+
end
|
50
|
+
|
51
|
+
end
|
52
|
+
|
53
|
+
context "entropy" do
|
54
|
+
it "calculates correctly for heads and tails" do
|
55
|
+
examples = []
|
56
|
+
classifications = {}
|
57
|
+
coin = Coin.new(:fair)
|
58
|
+
1000.times do
|
59
|
+
obj = Object.new
|
60
|
+
examples << obj
|
61
|
+
classifications[obj] = coin.sample
|
62
|
+
end
|
63
|
+
entropy = Dwarf::Information.entropy(examples, classifications)
|
64
|
+
entropy.should > 0.99
|
65
|
+
entropy.should <= 1.0
|
66
|
+
end
|
67
|
+
|
68
|
+
it "calculates correctly for 1d6" do
|
69
|
+
examples = []
|
70
|
+
classifications = {}
|
71
|
+
die = (1..6).map{|v| v}
|
72
|
+
1000.times do
|
73
|
+
obj = Object.new
|
74
|
+
examples << obj
|
75
|
+
classifications[obj] = die.sample
|
76
|
+
end
|
77
|
+
entropy = Dwarf::Information.entropy(examples, classifications)
|
78
|
+
entropy.should > 0.99
|
79
|
+
entropy.should <= 1.0
|
80
|
+
end
|
81
|
+
|
82
|
+
it "calculates correctly for a deck of cards" do
|
83
|
+
examples = []
|
84
|
+
classifications = {}
|
85
|
+
deck = Deck.new
|
86
|
+
1000.times do
|
87
|
+
obj = Object.new
|
88
|
+
examples << obj
|
89
|
+
classifications[obj] = deck.sample
|
90
|
+
end
|
91
|
+
entropy = Dwarf::Information.entropy(examples, classifications)
|
92
|
+
entropy.should > 0.99
|
93
|
+
entropy.should <= 1.0
|
94
|
+
end
|
95
|
+
|
96
|
+
it "calculates correctly with a weighted coin" do
|
97
|
+
examples = []
|
98
|
+
classifications = {}
|
99
|
+
1000.times do
|
100
|
+
obj = Object.new
|
101
|
+
examples << obj
|
102
|
+
classifications[obj] = (rand(100) == 99) ? :heads : :tails
|
103
|
+
end
|
104
|
+
entropy = Dwarf::Information.entropy(examples,classifications)
|
105
|
+
entropy.should < 0.101 #With a perfect 99:1 distribution, entropy should == 0.0807...
|
106
|
+
entropy.should >= 0.04
|
107
|
+
end
|
108
|
+
|
109
|
+
it "calculates correctly with a homogenous set" do
|
110
|
+
examples = []
|
111
|
+
classifications = {}
|
112
|
+
1000.times do
|
113
|
+
obj = Object.new
|
114
|
+
examples << obj
|
115
|
+
classifications[obj] = :heads
|
116
|
+
end
|
117
|
+
entropy = Dwarf::Information.entropy(examples,classifications)
|
118
|
+
entropy.should == 0.0
|
119
|
+
end
|
120
|
+
|
121
|
+
end
|
122
|
+
|
123
|
+
context "information_gain" do
|
124
|
+
|
125
|
+
it "calculates correctly splitting perfectly weighted coins" do
|
126
|
+
examples = []
|
127
|
+
classifications = {}
|
128
|
+
500.times do
|
129
|
+
coin = Coin.new(:heads)
|
130
|
+
examples << coin
|
131
|
+
classifications[coin] = coin.sample
|
132
|
+
end
|
133
|
+
500.times do
|
134
|
+
coin = Coin.new(:tails)
|
135
|
+
examples << coin
|
136
|
+
classifications[coin] = coin.sample
|
137
|
+
end
|
138
|
+
information_gain = Dwarf::Information.information_gain(examples, "weighting", classifications)
|
139
|
+
information_gain.should == 1.0
|
140
|
+
end
|
141
|
+
|
142
|
+
it "calculates worthless infogame for fair weighted coins" do
|
143
|
+
examples = []
|
144
|
+
classifications = {}
|
145
|
+
coin = Coin.new(:fair)
|
146
|
+
1000.times do
|
147
|
+
coin = Coin.new(:fair)
|
148
|
+
examples << coin
|
149
|
+
classifications[coin] = coin.sample
|
150
|
+
end
|
151
|
+
information_gain = Dwarf::Information.information_gain(examples, "weighting", classifications)
|
152
|
+
information_gain.should == 0.0
|
153
|
+
end
|
154
|
+
|
155
|
+
end
|
156
|
+
|
157
|
+
end
|
data/spec/frawd.rb
ADDED
@@ -0,0 +1,105 @@
|
|
1
|
+
#require File.join(File.dirname(__FILE__), *%w[. spec_helper.rb])
|
2
|
+
require 'rspec/mocks'
|
3
|
+
require 'faker'
|
4
|
+
require 'digest'
|
5
|
+
|
6
|
+
class Frawd
|
7
|
+
attr_reader :rules
|
8
|
+
|
9
|
+
def initialize(depth = 10, sample_sizes = 1000)
|
10
|
+
@depth = depth
|
11
|
+
@sample_sizes = sample_sizes
|
12
|
+
initialize_attributes
|
13
|
+
@leaves = []
|
14
|
+
@rules = build_rules
|
15
|
+
@rules.each_leaf do |leaf|
|
16
|
+
@leaves << leaf
|
17
|
+
end
|
18
|
+
end
|
19
|
+
|
20
|
+
def types
|
21
|
+
[:enum, :number, :text]
|
22
|
+
end
|
23
|
+
|
24
|
+
def enums
|
25
|
+
unless @enums
|
26
|
+
@enums = [[:true, :false],
|
27
|
+
[:baz, :bar, :zot],
|
28
|
+
[:baz, :bar, :zot, :quux]]
|
29
|
+
(1..rand(10)).each do
|
30
|
+
@enums << Faker::Lorem.words(rand(10)).uniq.map(&:to_sym)
|
31
|
+
end
|
32
|
+
end
|
33
|
+
@enums
|
34
|
+
end
|
35
|
+
|
36
|
+
def classifications
|
37
|
+
@classifications ||= (1..rand(10)).map {|x| "classification#{x}".to_sym }
|
38
|
+
end
|
39
|
+
|
40
|
+
def initialize_attributes
|
41
|
+
@attributes = []
|
42
|
+
num_attributes = 10#rand(100)
|
43
|
+
(1..num_attributes).each do |number|
|
44
|
+
type = types.sample
|
45
|
+
values = enums.sample if type == :enum
|
46
|
+
@attributes << ["attribute#{number}", type, values]
|
47
|
+
end
|
48
|
+
end
|
49
|
+
|
50
|
+
def filtered_attributes
|
51
|
+
@attributes.select {|a| a[1] == :enum}
|
52
|
+
end
|
53
|
+
|
54
|
+
def build_rules(node = Dwarf::TreeNode.new("ROOT"), attributes = filtered_attributes)
|
55
|
+
parents = node.parentage || []
|
56
|
+
if (rand(@depth) < parents.length) || attributes.empty?
|
57
|
+
node.classification = classifications.sample
|
58
|
+
else
|
59
|
+
attribute = attributes.sample
|
60
|
+
node.attribute = attribute[0]
|
61
|
+
attribute[2].each do |value|
|
62
|
+
child = Dwarf::TreeNode.new(value.to_s)
|
63
|
+
node << child
|
64
|
+
build_rules(child,attributes-[attribute[0]])
|
65
|
+
end
|
66
|
+
end
|
67
|
+
node
|
68
|
+
end
|
69
|
+
|
70
|
+
def generate_example
|
71
|
+
node = @leaves.sample
|
72
|
+
example_classification = node.classification
|
73
|
+
example = RSpec::Mocks::Mock.new('example')
|
74
|
+
node.parentage.unshift(node).each_cons(2) do |child, parent|
|
75
|
+
example.stub!(parent.attribute.to_sym) { child.name }
|
76
|
+
example.stub!(:attribute_names) { @attributes.map {|a| a[0]} }
|
77
|
+
end
|
78
|
+
@attributes.each do |attribute|
|
79
|
+
unless example.respond_to? attribute[0].to_sym
|
80
|
+
val = case attribute[1]
|
81
|
+
when :enum then attribute[2].sample
|
82
|
+
when :number then rand((2**(0.size * 8 -2) -1))
|
83
|
+
when :text then Faker::Lorem.paragraphs
|
84
|
+
end
|
85
|
+
example.stub!(attribute[0].to_sym) { val }
|
86
|
+
end
|
87
|
+
end
|
88
|
+
[ example, example_classification ]
|
89
|
+
end
|
90
|
+
|
91
|
+
def generate_examples(count)
|
92
|
+
examples = Array.new(count)
|
93
|
+
(0...count).each { |index| examples[index] = generate_example }
|
94
|
+
examples
|
95
|
+
end
|
96
|
+
|
97
|
+
def training
|
98
|
+
@training ||= generate_examples(@sample_sizes)
|
99
|
+
end
|
100
|
+
|
101
|
+
def testing
|
102
|
+
@testing ||= generate_examples(@sample_sizes)
|
103
|
+
end
|
104
|
+
|
105
|
+
end
|
data/spec/spec_helper.rb
CHANGED
@@ -1 +1,60 @@
|
|
1
1
|
require File.join(File.dirname(__FILE__), *%w[.. lib dwarf])
|
2
|
+
require File.join(File.dirname(__FILE__), *%w[. frawd])
|
3
|
+
|
4
|
+
# http://blog.jayfields.com/2007/04/ruby-assigning-instance-variables-in.html
|
5
|
+
class Module
  # Defines an +initialize+ method that copies constructor arguments into
  # instance variables.
  #
  # args  - names of the instance variables filled, in order, from the
  #         positional constructor arguments (a missing position is set to
  #         nil).
  # block - accepted but not used.
  #
  # If the last constructor argument is a Hash it is treated as named
  # arguments: each key/value pair is assigned to the instance variable named
  # by the key, after the positional assignments and therefore overriding
  # them.
  def initializer(*args, &block)
    define_method :initialize do |*ctor_args|
      ctor_named_args = (ctor_args.last.is_a?(Hash) ? ctor_args.pop : {})
      # Walk the declared names only. The former inclusive range
      # (0..args.size) went one index too far and tried to set an instance
      # variable named "@", which raises NameError.
      args.each_with_index do |name, index|
        instance_variable_set("@#{name}", ctor_args[index])
      end
      ctor_named_args.each_pair do |param_name, param_value|
        instance_variable_set("@#{param_name}", param_value)
      end
    end
  end
end
|
18
|
+
|
19
|
+
# Plain-Ruby stand-in for a car record, used by the specs.
class FakeCar
  initializer :body_style, :cylinders, :wheel_diameter, :transmission
  attr_accessor :body_style, :cylinders, :wheel_diameter, :transmission

  # Last VIN handed out; shared by all instances.
  @@vin_counter = 0

  class << self
    # Body styles used by fake.
    def valid_body_styles
      [:boxy, :swoopy, :angry, :boring]
    end

    # Cylinder counts used by fake.
    def valid_cylinders
      [4, 6, 8]
    end

    # Builds one car with a random body style and cylinder count.
    def fake
      style = valid_body_styles.sample
      count = valid_cylinders.sample
      new(:body_style => style, :cylinders => count)
    end

    # Builds an Array of +how_many+ random cars.
    def multiple_fakes(how_many=5)
      how_many.times.map { fake }
    end
  end

  # Sequential identifier, assigned the first time it is requested.
  def vin
    return @vin if @vin
    @@vin_counter += 1
    @vin = @@vin_counter
  end

  # Names of the attributes this object exposes.
  def attributes
    %w[body_style cylinders wheel_diameter transmission vin]
  end

  alias_method :attribute_names, :attributes

  def to_s
    "#{body_style} with #{cylinders} cylinders"
  end
end
|
60
|
+
|
data/specs.watchr
ADDED
@@ -0,0 +1,60 @@
|
|
1
|
+
# Run me with:
|
2
|
+
#
|
3
|
+
# $ watchr specs.watchr
|
4
|
+
|
5
|
+
# --------------------------------------------------
|
6
|
+
# Convenience Methods
|
7
|
+
# --------------------------------------------------
|
8
|
+
# Lists every file under spec/ (at any depth) whose name ends in _spec.rb.
def all_spec_files
  Dir.glob('spec/**/*_spec.rb')
end
|
11
|
+
|
12
|
+
# Runs every spec file whose path contains +thing_to_match+, ignoring case.
#
# thing_to_match - path fragment to look for. It is matched literally:
#                  regex metacharacters in it are escaped, so a fragment such
#                  as "a+b" only selects files that really contain "a+b".
#
# Prints a notice instead of running anything when no spec file matches.
def run_spec_matching(thing_to_match)
  pattern = /#{Regexp.escape(thing_to_match.to_s)}/i
  matches = all_spec_files.grep(pattern)
  if matches.empty?
    puts "Sorry, thanks for playing, but there were no matches for #{thing_to_match}"
  else
    run matches.join(' ')
  end
end
|
20
|
+
|
21
|
+
# Announces the given spec files, clears the terminal and runs them through
# rspec, then resets the interrupt flag via no_int_for_you.
def run(files_to_run)
  puts "Running: #{files_to_run}"
  command = "clear;rspec -cfs #{files_to_run}"
  system(command)
  no_int_for_you
end
|
26
|
+
|
27
|
+
# Runs every spec file returned by all_spec_files in a single invocation.
def run_all_specs
  spec_list = all_spec_files.join(' ')
  run(spec_list)
end
|
30
|
+
|
31
|
+
# --------------------------------------------------
|
32
|
+
# Watchr Rules
|
33
|
+
# --------------------------------------------------
|
34
|
+
# A changed spec or lib file re-runs the specs whose path matches the
# captured part of its name.
watch('^spec/(.*)_spec\.rb') do |match|
  run_spec_matching(match[1])
end
watch('^lib/(.*)\.rb') do |match|
  run_spec_matching(match[1])
end

# A change to shared spec infrastructure re-runs the whole suite.
[
  '^spec/spec_helper\.rb',
  '^spec/frawd\.rb',
  '^spec/support/.*\.rb'
].each do |pattern|
  watch(pattern) { run_all_specs }
end
|
39
|
+
|
40
|
+
# --------------------------------------------------
|
41
|
+
# Signal Handling
|
42
|
+
# --------------------------------------------------
|
43
|
+
|
44
|
+
# Clears the "an INT was already received" flag.
def no_int_for_you
  @sent_an_int = nil
end

# First INT: warn, set the flag, pause, and re-run every spec.
# A second INT while the flag is still set: say goodbye and exit.
Signal.trap('INT') do
  if @sent_an_int
    puts " A second INT?  Ok, I get the message.  Shutting down now."
    exit
  end

  puts " Did you just send me an INT? Ugh.  I'll quit for real if you do it again."
  @sent_an_int = true
  Kernel.sleep 1.5
  run_all_specs
end
|
59
|
+
|
60
|
+
# vim:ft=ruby
|
metadata
CHANGED
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
|
|
5
5
|
segments:
|
6
6
|
- 0
|
7
7
|
- 0
|
8
|
-
-
|
9
|
-
version: 0.0.
|
8
|
+
- 5
|
9
|
+
version: 0.0.5
|
10
10
|
platform: ruby
|
11
11
|
authors:
|
12
12
|
- Alex Redington
|
@@ -14,7 +14,7 @@ autorequire:
|
|
14
14
|
bindir: bin
|
15
15
|
cert_chain: []
|
16
16
|
|
17
|
-
date: 2010-
|
17
|
+
date: 2010-11-05 00:00:00 -04:00
|
18
18
|
default_executable:
|
19
19
|
dependencies:
|
20
20
|
- !ruby/object:Gem::Dependency
|
@@ -62,6 +62,35 @@ dependencies:
|
|
62
62
|
version: 2.0.1
|
63
63
|
type: :development
|
64
64
|
version_requirements: *id003
|
65
|
+
- !ruby/object:Gem::Dependency
|
66
|
+
name: watchr
|
67
|
+
prerelease: false
|
68
|
+
requirement: &id004 !ruby/object:Gem::Requirement
|
69
|
+
none: false
|
70
|
+
requirements:
|
71
|
+
- - ">="
|
72
|
+
- !ruby/object:Gem::Version
|
73
|
+
segments:
|
74
|
+
- 0
|
75
|
+
- 7
|
76
|
+
version: "0.7"
|
77
|
+
type: :development
|
78
|
+
version_requirements: *id004
|
79
|
+
- !ruby/object:Gem::Dependency
|
80
|
+
name: faker
|
81
|
+
prerelease: false
|
82
|
+
requirement: &id005 !ruby/object:Gem::Requirement
|
83
|
+
none: false
|
84
|
+
requirements:
|
85
|
+
- - ">="
|
86
|
+
- !ruby/object:Gem::Version
|
87
|
+
segments:
|
88
|
+
- 0
|
89
|
+
- 3
|
90
|
+
- 1
|
91
|
+
version: 0.3.1
|
92
|
+
type: :development
|
93
|
+
version_requirements: *id005
|
65
94
|
description: Dwarf is an implementation of decision tree learning algorithms targeted for use in the Rails 3 console environment for classifying ActiveRecord objects.
|
66
95
|
email:
|
67
96
|
- aredington@gmail.com
|
@@ -77,13 +106,19 @@ files:
|
|
77
106
|
- Gemfile.lock
|
78
107
|
- README.md
|
79
108
|
- Rakefile
|
109
|
+
- TODO.taskpaper
|
80
110
|
- dwarf.gemspec
|
81
111
|
- lib/dwarf.rb
|
82
112
|
- lib/dwarf/classifier.rb
|
113
|
+
- lib/dwarf/example_management.rb
|
114
|
+
- lib/dwarf/information.rb
|
83
115
|
- lib/dwarf/tree_node.rb
|
84
116
|
- lib/dwarf/version.rb
|
85
|
-
- spec/classifier_spec.rb
|
117
|
+
- spec/dwarf/classifier_spec.rb
|
118
|
+
- spec/dwarf/information_spec.rb
|
119
|
+
- spec/frawd.rb
|
86
120
|
- spec/spec_helper.rb
|
121
|
+
- specs.watchr
|
87
122
|
has_rdoc: true
|
88
123
|
homepage: http://github.com/aredington/dwarf
|
89
124
|
licenses: []
|
data/spec/classifier_spec.rb
DELETED
@@ -1,80 +0,0 @@
|
|
1
|
-
require File.join(File.dirname(__FILE__), *%w[spec_helper])
|
2
|
-
|
3
|
-
# Specs for Dwarf::Classifier: storing examples, learning, and classifying.
describe Dwarf::Classifier do

  before(:each) do
    @classifier = Dwarf::Classifier.new
  end

  it "accepts example classifications" do
    @classifier.should respond_to(:add_example)
  end

  it "stores examples" do
    @example3 = double('example3')
    @example3.stub(:attributes) { [] }
    @classifier.add_example(@example3, :irish)
    @classifier.examples.should include(@example3)
  end

  it "only implements classify on the learning instance" do
    @example = double('example3')
    @example.stub(:attributes) { [] }
    @class2 = Dwarf::Classifier.new
    @classifier.add_example(@example, :round)
    @classifier.learn!
    @classifier.classify(@example).should eq(:round)
    @class2.classify(@example).should eq(nil)
  end

  context "classifying cars" do
    # Builds a double that answers body_style, cylinders and attributes.
    def car_double(name, body_style, cylinders)
      car = double(name)
      car.stub(:body_style) { body_style }
      car.stub(:cylinders) { cylinders }
      car.stub(:attributes) { ["body_style", "cylinders"] }
      car
    end

    # Populates @example1 through @example4 with car doubles.
    def mock_car_examples
      @example1 = car_double('example1', :boxy, 4)
      @example2 = car_double('example2', :swoopy, 6)
      @example3 = car_double('example3', :angry, 8)
      @example4 = car_double('example4', :swoopy, 8)
    end

    it "enumerate example attributes" do
      mock_car_examples
      @classifier.add_example(@example1, :japanese)
      @classifier.example_attributes.should include("body_style", "cylinders")
    end

    it "classifies in a trivial case" do
      mock_car_examples
      labelled = [[@example1, :japanese], [@example2, :german], [@example3, :american]]
      labelled.each { |car, origin| @classifier.add_example(car, origin) }
      @classifier.learn!
      labelled.each { |car, origin| @classifier.classify(car).should eq(origin) }
    end

    it "classifies when multiple predicates required" do
      mock_car_examples
      @classifier.add_examples(@example1 => :japanese, @example2 => :german, @example3 => :american, @example4 => :german)
      @classifier.learn!
      expectations = [
        [@example1, :japanese],
        [@example2, :german],
        [@example3, :american],
        [@example4, :german]
      ]
      expectations.each { |car, origin| @classifier.classify(car).should eq(origin) }
    end

  end

end
|