dwarf 0.0.4 → 0.0.5

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,13 +1,14 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- dwarf (0.0.3)
4
+ dwarf (0.0.4)
5
5
  rubytree (>= 0.8.1)
6
6
 
7
7
  GEM
8
8
  remote: http://rubygems.org/
9
9
  specs:
10
10
  diff-lcs (1.1.2)
11
+ faker (0.3.1)
11
12
  rspec (2.0.1)
12
13
  rspec-core (~> 2.0.1)
13
14
  rspec-expectations (~> 2.0.1)
@@ -19,6 +20,7 @@ GEM
19
20
  rspec-core (~> 2.0.1)
20
21
  rspec-expectations (~> 2.0.1)
21
22
  rubytree (0.8.1)
23
+ watchr (0.7)
22
24
 
23
25
  PLATFORMS
24
26
  ruby
@@ -26,5 +28,7 @@ PLATFORMS
26
28
  DEPENDENCIES
27
29
  bundler (>= 1.0.0)
28
30
  dwarf!
31
+ faker (>= 0.3.1)
29
32
  rspec (>= 2.0.1)
30
33
  rubytree (>= 0.8.1)
34
+ watchr (>= 0.7)
data/Rakefile CHANGED
@@ -5,3 +5,4 @@ Bundler::GemHelper.install_tasks
5
5
  RSpec::Core::RakeTask.new(:spec) do
6
6
  end
7
7
 
8
+ task :default => :spec
@@ -0,0 +1,22 @@
1
+
2
+ Dwarf 1.0 Features:
3
+
4
+ Find all instances with a given classification given a world. @alex @priority(3)
5
+ - Create queries to find all instances of a given classification. @priority(1)
6
+ - Make logic of a query for a given classification readable (as Ruby, or SQL, or ...) @priority(1)
7
+ - Generate large worlds with consistent instances to test against. @muness @priority(2)
8
+ - Handle nested features (e.g. example.engine.cylinders) @priority(1)
9
+ Handle messy data well (nil examples, examples with nil features, examples with nil subfeatures) @alex @priority(2)
10
+ Resolve weird behavior when all examples are missing some attribute. @alex @sam @priority(1)
11
+ - When attribute.nil? bisects a heterogeneous group, we should probably split on that attribute. @priority(1)
12
+ Refactor information theory methods out to enable unit testing. @alex @priority(3)
13
+ Eliminate features which are unique across all examples @sam @alex @priority(2)
14
+ - Treat hashes as nested features. @priority(1)
15
+
16
+ Dwarf Nice To Haves:
17
+
18
+ - meta-features based on type, e.g. parity(car.engine.cylinders) can be :even or :odd @priority(2)
19
+ - Bayesian classification of text fields. @priority(1)
20
+ - Junk uniquely identifying features (implicit in info gain calculations? add tests to verify!) @priority(3)
21
+ - Modular feature enumeration and feature fetching code. (Don't rely on attributes and example.attribute to be your only duck type checks!) @priority(2)
22
+ - A world-generation tool to create internally consistent data sets to measure dwarf's learning against. Maybe we can call it "frawd". @priority(1)
@@ -17,6 +17,8 @@ Gem::Specification.new do |s|
17
17
  s.add_dependency "rubytree", ">= 0.8.1"
18
18
  s.add_development_dependency "bundler", ">= 1.0.0"
19
19
  s.add_development_dependency "rspec", ">= 2.0.1"
20
+ s.add_development_dependency "watchr", ">= 0.7"
21
+ s.add_development_dependency "faker", ">= 0.3.1"
20
22
 
21
23
  s.files = `git ls-files`.split("\n")
22
24
  s.executables = `git ls-files`.split("\n").map{|f| f =~ /^bin\/(.*)/ ? $1 : nil}.compact
@@ -1,4 +1,6 @@
1
1
  module Dwarf
2
- require 'dwarf/classifier'
2
+ require 'dwarf/example_management'
3
+ require 'dwarf/information'
3
4
  require 'dwarf/tree_node'
5
+ require 'dwarf/classifier'
4
6
  end
@@ -1,12 +1,15 @@
1
1
  module Dwarf
2
2
  class Classifier
3
+
3
4
  attr_accessor :examples
4
5
  attr_accessor :example_attributes
5
6
  attr_accessor :classifier_logic
7
+ attr_reader :decision_tree
6
8
 
7
9
  def initialize()
8
10
  @examples, @example_attributes = {}, []
9
11
  @decision_tree = TreeNode.new("ROOT")
12
+ @nil_name = Object.new.to_s
10
13
  end
11
14
 
12
15
  def add_examples(example_hash)
@@ -17,7 +20,7 @@ module Dwarf
17
20
 
18
21
  def add_example(example_record, classification)
19
22
  @examples[example_record]=classification
20
- @example_attributes |= example_record.attributes
23
+ @example_attributes |= example_record.attribute_names
21
24
  end
22
25
 
23
26
  def classify(example)
@@ -26,37 +29,120 @@ module Dwarf
26
29
 
27
30
  def learn!
28
31
  @decision_tree.examples = @examples.keys
32
+ converge_tree
33
+ self.classifier_logic = codify_tree(@decision_tree)
34
+ implement_classify
35
+ end
36
+
37
+ def find_by_classification(world, classification)
38
+ matches = []
39
+ world.each do |instance|
40
+ if classify(instance) == classification
41
+ matches << instance
42
+ end
43
+ end
44
+ matches
45
+ end
46
+
47
+ private
48
+
49
+ include ExampleManagement
50
+
51
+ def converge_tree
29
52
  pending = []
30
53
  pending.push @decision_tree
31
- used_attributes = []
32
54
  until pending.empty?
33
55
  node = pending.pop
34
56
  if classification = homogenous_examples(node)
35
57
  node.classification = classification
36
58
  elsif no_valuable_attributes?(node) && node.parent
37
- node.parent.classification= expected_value(node.examples)
59
+ if split_nil_children = check_nil_split(node)
60
+ split_nil_children.each {|child_node| pending.push(child_node)}
61
+ else
62
+ create_expected_value(node)
63
+ end
38
64
  elsif no_valuable_attributes?(node)
39
- classifier_logic = expected_value(node.examples)
65
+ node.classification = expected_value(node.examples)
40
66
  elsif false #stub branch
41
67
  #C4.5 would also allow for previously unseen classifications
42
- #dwarf's API dictates all classifications are known before learning
43
- #starts
68
+ #dwarf needs to correctly handle a pre-existing tree when
69
+ #learn! is called
44
70
  else
45
- infogains = {}
46
- (@example_attributes-used_attributes).each do |example_attribute|
47
- infogains[information_gain(node.examples,example_attribute)] = example_attribute
48
- end
49
- best_gain = infogains.keys.sort[0]
50
- best_attribute = infogains[best_gain]
51
- split(node,best_attribute).each {|child_node| pending.push(child_node)}
52
- used_attributes << best_attribute
71
+ split_children = homogenize_children(node)
72
+ split_children.each {|child_node| pending.push(child_node)}
53
73
  end
54
74
  end
55
- self.classifier_logic = codify_tree(@decision_tree)
56
- implement_classify
57
75
  end
58
76
 
59
- private
77
+ def check_nil_split(node)
78
+ infogains = {}
79
+
80
+ used_attributes = used_attributes(node)
81
+ (filtered_attributes-used_attributes).each do |example_attribute|
82
+ infogains[Information::unfiltered_information_gain(node.examples,example_attribute,@examples)] =
83
+ example_attribute
84
+ end
85
+ best_gain = infogains.keys.sort[0]
86
+ best_attribute = infogains[best_gain]
87
+ if best_gain > 0.0
88
+ return split(node, best_attribute)
89
+ end
90
+
91
+ end
92
+
93
+ def create_expected_value(node)
94
+ new_node = TreeNode.new(node.name)
95
+ expected_value = expected_value(node.examples)
96
+ new_node.classification = expected_value
97
+ parent = node.parent
98
+ parent.remove! node
99
+ parent << new_node
100
+ new_node << node
101
+ end
102
+
103
+ def used_attributes(node)
104
+ if node.parentage
105
+ node.parentage.map { |parent| parent.attribute }
106
+ else
107
+ []
108
+ end
109
+ end
110
+
111
+ def attribute_homogeneous?(example_subset, attribute)
112
+ invert_with_dups(attribute_map(example_subset, attribute)).keys.size == 1
113
+ end
114
+
115
+ def heterogeneous_attributes
116
+ @example_attributes.reject { |attr| attribute_homogeneous?(@examples.keys, attr) }
117
+ end
118
+
119
+ def attribute_clusters?(example_subset, attribute)
120
+ invert_with_dups(attribute_map(example_subset, attribute)).keys.size == example_subset.size
121
+ end
122
+
123
+ def clustering_attributes
124
+ @example_attributes.select {|attr| attribute_clusters?(@examples.keys, attr) }
125
+ end
126
+
127
+ def filtered_attributes
128
+ clustering_attributes | heterogeneous_attributes
129
+ end
130
+
131
+ def homogenize_children(node)
132
+ infogains = {}
133
+
134
+ used_attributes = used_attributes(node)
135
+
136
+ (filtered_attributes-used_attributes).each do |example_attribute|
137
+ infogains[Information::information_gain(node.examples,example_attribute,@examples)] =
138
+ example_attribute
139
+ end
140
+
141
+ best_gain = infogains.keys.sort[0]
142
+ best_attribute = infogains[best_gain]
143
+
144
+ return split(node,best_attribute)
145
+ end
60
146
 
61
147
  def implement_classify
62
148
  classify_impl = "def classify(example)\n#{self.classifier_logic}\nend"
@@ -85,6 +171,7 @@ module Dwarf
85
171
 
86
172
  def codify_literal(object)
87
173
  case object
174
+ when @nil_name then "nil"
88
175
  when Symbol then ":#{object}"
89
176
  when String then "\"#{object}\""
90
177
  else
@@ -97,6 +184,9 @@ module Dwarf
97
184
  example_subset = node.examples
98
185
  examples_inversion = invert_with_dups(attribute_map(example_subset,attribute))
99
186
  examples_inversion.each do |key, value|
187
+ if key.nil?
188
+ key = @nil_name
189
+ end
100
190
  child_node = TreeNode.new(key)
101
191
  child_node.examples = value
102
192
  node << child_node
@@ -106,21 +196,20 @@ module Dwarf
106
196
  end
107
197
 
108
198
  def expected_value(example_subset)
109
- examples_inversion = invert_with_dups(classification_map(example_subset))
199
+ examples_inversion = invert_with_dups(classification_map(example_subset, @examples))
110
200
  occurrences = examples_inversion.merge(examples_inversion) { |key, value| value.length }
111
201
  occurrences.keys.sort { |key| occurrences[key] }[0]
112
202
  end
113
203
 
114
204
  def no_valuable_attributes?(node)
115
- @example_attributes.map {|example_attribute|
116
- information_gain(node.examples, example_attribute)}.each {|info_gain|
205
+ filtered_attributes.map {|example_attribute|
206
+ Information::information_gain(node.examples, example_attribute, @examples)}.each {|info_gain|
117
207
  return false if info_gain != 0}
118
208
  return true
119
209
  end
120
210
 
121
-
122
211
  def homogenous_examples(node)
123
- classifications = classifications(node.examples)
212
+ classifications = filter_classifications(@examples, node.examples)
124
213
  if classifications.length == 1
125
214
  return classifications[0]
126
215
  else
@@ -128,49 +217,5 @@ module Dwarf
128
217
  end
129
218
  end
130
219
 
131
- def entropy(example_subset)
132
- set_size = example_subset.length.to_f
133
- examples_inversion = invert_with_dups(classification_map(example_subset))
134
- occurences = examples_inversion.merge(examples_inversion) { |key, value| value.length.to_f }
135
- 0.0 - classifications(example_subset).inject(0.0) do |sum, classification|
136
- sum + ((occurences[classification]/set_size)* Math.log2((occurences[classification]/set_size)))
137
- end
138
- end
139
-
140
- def information_gain(example_subset,attribute)
141
- set_size = example_subset.length.to_f
142
- examples_inversion = invert_with_dups(attribute_map(example_subset,attribute))
143
- occurrences = examples_inversion.merge(examples_inversion) { |key, value| value.length }
144
- entropy(example_subset) - attribute_values(example_subset,attribute).inject(0.0) do |sum, attribute_value|
145
- sum + (occurrences[attribute_value]/set_size) * entropy(examples_inversion[attribute_value])
146
- end
147
- end
148
-
149
- def classifications(example_subset)
150
- example_subset.map {|example| @examples[example]}.compact
151
- end
152
-
153
- def classification_map(example_subset)
154
- classification_map = {}
155
- example_subset.each {|example| classification_map[example] = @examples[example]}
156
- classification_map
157
- end
158
-
159
- def attribute_values(example_subset, attribute)
160
- example_subset.map {|example| example.method(attribute.to_sym).call}.compact
161
- end
162
-
163
- def attribute_map(example_subset, attribute)
164
- example_map = {}
165
- example_subset.each {|example| example_map[example] = example.method(attribute.to_sym).call}
166
- example_map
167
- end
168
-
169
- def invert_with_dups(hash)
170
- inversion = {}
171
- hash.values.each {|value| inversion[value] = []}
172
- hash.keys.each {|key| inversion[hash[key]] << key}
173
- inversion
174
- end
175
220
  end
176
221
  end
@@ -0,0 +1,32 @@
1
+ module Dwarf
2
+ module ExampleManagement
3
+
4
+ def classification_map(example_subset, classifications)
5
+ classification_map = {}
6
+ example_subset.each {|example| classification_map[example] = classifications[example]}
7
+ classification_map
8
+ end
9
+
10
+ def invert_with_dups(hash)
11
+ inversion = { }
12
+ hash.values.each {|value| inversion[value] = []}
13
+ hash.keys.each {|key| inversion[hash[key]] << key}
14
+ inversion
15
+ end
16
+
17
+ def eval_attribute(example,attribute)
18
+ example.method(attribute.to_sym).call
19
+ end
20
+
21
+ def attribute_map(example_subset, attribute)
22
+ example_map = {}
23
+ example_subset.each {|example| example_map[example] = eval_attribute(example, attribute)}
24
+ example_map
25
+ end
26
+
27
+ def filter_classifications(classifications,example_subset)
28
+ example_subset.map {|example| classifications[example]}.uniq
29
+ end
30
+
31
+ end
32
+ end
@@ -0,0 +1,61 @@
1
+ module Dwarf
2
+ module Information
3
+
4
+ class<< self
5
+ include ExampleManagement
6
+
7
+ def entropy(example_subset, classifications)
8
+ seen_classifications = filter_classifications(classifications, example_subset)
9
+ return 0.0 if seen_classifications.length == 1
10
+ set_size = example_subset.length.to_f
11
+ examples_inversion = invert_with_dups(classification_map(example_subset, classifications))
12
+ occurrences = occurrences(examples_inversion)
13
+ sum_over(seen_classifications) do |classification|
14
+ frequency = occurrences[classification]/set_size
15
+ - frequency * Math.log(frequency,seen_classifications.length)
16
+ end
17
+ end
18
+
19
+ def information_gain(example_subset, attribute, classifications)
20
+ filtered_example_subset = filter_for_missing_attribute(example_subset, attribute)
21
+ unfiltered_information_gain(filtered_example_subset, attribute, classifications)
22
+ end
23
+
24
+ def unfiltered_information_gain(example_subset, attribute, classifications)
25
+ set_size = example_subset.length.to_f
26
+ examples_inversion = invert_with_dups(attribute_map(example_subset,attribute))
27
+ occurrences = occurrences(examples_inversion)
28
+ heterogeneous_entropy = entropy(example_subset, classifications)
29
+ seen_attribute_values = attribute_values(example_subset,attribute)
30
+ heterogeneous_entropy -
31
+ sum_over(seen_attribute_values) do |attribute_value|
32
+ frequency = occurrences[attribute_value]/set_size
33
+ frequency * entropy(examples_inversion[attribute_value], classifications)
34
+ end
35
+ end
36
+
37
+ private
38
+
39
+ def sum_over(collection)
40
+ collection.inject(0.0) do |sum, classification|
41
+ sum + yield(classification)
42
+ end
43
+ end
44
+
45
+ def occurrences(examples_inversion)
46
+ examples_inversion.merge(examples_inversion) { |key, value| value.length.to_f }
47
+ end
48
+
49
+ def filter_for_missing_attribute(example_subset, attribute)
50
+ example_subset.reject { |example| eval_attribute(example,attribute).nil? }
51
+ end
52
+
53
+ def attribute_values(example_subset, attribute)
54
+ example_subset.map {|example| eval_attribute(example, attribute)}.uniq
55
+ end
56
+
57
+ end
58
+
59
+ end
60
+
61
+ end
@@ -1,3 +1,3 @@
1
1
  module Dwarf
2
- VERSION = "0.0.4"
2
+ VERSION = "0.0.5"
3
3
  end
@@ -0,0 +1,197 @@
1
+ require File.join(File.dirname(__FILE__), *%w[.. spec_helper.rb])
2
+
3
+ describe Dwarf::Classifier do
4
+
5
+ before(:each) do
6
+ @classifier = Dwarf::Classifier.new()
7
+ end
8
+
9
+ def mock_car_examples
10
+ @example1 = FakeCar.new(:body_style => :boxy,
11
+ :cylinders => 4,
12
+ :transmission => :manual)
13
+ @example2 = FakeCar.new(:body_style => :swoopy,
14
+ :cylinders => 6,
15
+ :transmission => :manual)
16
+ @example3 = FakeCar.new(:body_style => :angry,
17
+ :cylinders => 8,
18
+ :transmission => :manual)
19
+ @example4 = FakeCar.new(:body_style => :swoopy,
20
+ :cylinders => 8,
21
+ :transmission => :manual)
22
+ @example5 = FakeCar.new(:body_style => nil,
23
+ :cylinders => 6,
24
+ :transmission => :manual)
25
+ @example6 = FakeCar.new(:body_style => :sleek,
26
+ :cylinders => nil,
27
+ :transmission => :manual)
28
+ end
29
+
30
+ context "add_example" do
31
+
32
+ it "accepts example classifications" do
33
+ @classifier.should respond_to(:add_example)
34
+ end
35
+
36
+ it "stores examples" do
37
+ @example3 = double('example3')
38
+ @example3.stub(:attribute_names) { [] }
39
+ @classifier.add_example(@example3, :irish)
40
+ @classifier.examples.should include(@example3)
41
+ end
42
+
43
+ it "enumerate example attributes" do
44
+ @example_with_attributes = double('attrs')
45
+ @example_with_attributes.stub(:attribute_names) { ["height", "branch_density"] }
46
+ @classifier.add_example(@example_with_attributes, :pine)
47
+ @classifier.example_attributes.should include("height", "branch_density")
48
+ end
49
+
50
+ it "gracefully accepts examples with nil attributes" do
51
+ @example_with_nil_attributes = double('nils')
52
+ @example_with_nil_attributes.stub(:attribute_names) { ["height", "branch_density"] }
53
+ @example_with_nil_attributes.stub(:height) { nil }
54
+ @example_with_nil_attributes.stub(:branch_density) { :high }
55
+ lambda {@classifier.add_example(@example_with_nil_attributes, :pine)}.should_not raise_exception
56
+ end
57
+
58
+ end
59
+
60
+ context "learn! and classify" do
61
+
62
+ it "only implements classify on the learning instance" do
63
+ @example = double('example3')
64
+ @example.stub(:attribute_names) { [] }
65
+ @class2 = Dwarf::Classifier.new()
66
+ @classifier.add_example(@example, :round)
67
+ @classifier.learn!
68
+ @classifier.classify(@example) == :round
69
+ @class2.classify(@example).should == nil
70
+ end
71
+
72
+ context "frawd is dwarf backwards" do
73
+ before(:each) do
74
+ @frawd = Frawd.new(1,100)
75
+ end
76
+
77
+ it "is totally awesome" do
78
+ @frawd.training.each do |example, classification|
79
+ @classifier.add_example(example, classification)
80
+ end
81
+ @classifier.learn!
82
+ success = 0
83
+ @frawd.testing.each do |example, classification|
84
+ success += 1 if @classifier.classify(example) == classification
85
+ end
86
+ success.should == @frawd.testing.size
87
+ end
88
+ end
89
+
90
+ context "classifying cars" do
91
+
92
+ before(:each) do
93
+ mock_car_examples
94
+ @classifier.add_example(@example1, :japanese)
95
+ @classifier.add_example(@example2, :german)
96
+ @classifier.add_example(@example3, :american)
97
+ end
98
+
99
+ it "classifies in a trivial case" do
100
+ @classifier.learn!
101
+ @classifier.classify(@example1).should == :japanese
102
+ @classifier.classify(@example2).should == :german
103
+ @classifier.classify(@example3).should == :american
104
+ end
105
+
106
+ it "classifies when multiple predicates required" do
107
+ @classifier.add_example(@example4, :german)
108
+ @classifier.learn!
109
+ @classifier.classify(@example1).should == :japanese
110
+ @classifier.classify(@example2).should == :german
111
+ @classifier.classify(@example3).should == :american
112
+ @classifier.classify(@example4).should == :german
113
+ end
114
+
115
+ it "handles nils gracefully" do
116
+ @classifier.add_examples(@example4 => :german,
117
+ @example5 => :japanese,
118
+ @example6 => :japanese)
119
+ lambda{@classifier.learn!}.should_not raise_exception
120
+ end
121
+
122
+ it "handles nils correctly" do
123
+ @classifier.add_examples(@example4 => :german,
124
+ @example5 => :japanese,
125
+ @example6 => :japanese)
126
+ @classifier.learn!
127
+ @classifier.classify(@example1).should == :japanese
128
+ @classifier.classify(@example2).should == :german
129
+ @classifier.classify(@example3).should == :american
130
+ @classifier.classify(@example4).should == :german
131
+ @classifier.classify(@example5).should == :japanese
132
+ @classifier.classify(@example6).should == :japanese
133
+ end
134
+
135
+ it "handles a feature missing from all examples correctly" do
136
+ @classifier.add_examples(@example4 => :german,
137
+ @example5 => :japanese,
138
+ @example6 => :japanese)
139
+ @classifier.learn!
140
+ open = [@classifier.decision_tree]
141
+ until open.empty?
142
+ current = open.pop
143
+ current.attribute.should_not == "wheel_diameter"
144
+ current.children.each {|child| open.push child}
145
+ end
146
+ end
147
+
148
+ it "does not use a feature which is identical across all examples" do
149
+ @classifier.add_examples(@example4 => :german,
150
+ @example5 => :japanese,
151
+ @example6 => :japanese)
152
+ @classifier.learn!
153
+ open = [@classifier.decision_tree]
154
+ until open.empty?
155
+ current = open.pop
156
+ current.attribute.should_not == "transmission"
157
+ current.children.each {|child| open.push child}
158
+ end
159
+ end
160
+
161
+ it "does not use a feature unique to each example" do
162
+ @classifier.add_examples(@example4 => :german,
163
+ @example5 => :japanese,
164
+ @example6 => :japanese)
165
+ @classifier.learn!
166
+ open = [@classifier.decision_tree]
167
+ until open.empty?
168
+ current = open.pop
169
+ current.attribute.should_not == "vin"
170
+ current.children.each {|child| open.push child}
171
+ end
172
+
173
+ end
174
+
175
+ end
176
+
177
+ end
178
+
179
+ context "find_by_classification" do
180
+
181
+ it "returns sets of cars based on class" do
182
+ mock_car_examples
183
+ @classifier.add_examples(@example1 => :japanese,
184
+ @example2 => :german,
185
+ @example3 => :american,
186
+ @example4 => :german)
187
+ @classifier.learn!
188
+ all_cars = [@example1, @example2, @example3, @example4]
189
+ japanese_cars = @classifier.find_by_classification(all_cars, :japanese)
190
+ japanese_cars.should == [@example1]
191
+ end
192
+
193
+
194
+ end
195
+
196
+
197
+ end
@@ -0,0 +1,157 @@
1
+ require File.join(File.dirname(__FILE__), *%w[.. spec_helper.rb])
2
+
3
+ describe Dwarf::Information do
4
+
5
+ class Deck
6
+
7
+ def initialize()
8
+ @draw = (1..52).map{|v| v}
9
+ @draw.shuffle!
10
+ @discard = []
11
+ end
12
+
13
+ def sample
14
+ unless @draw.empty?
15
+ card = @draw.pop
16
+ @discard.push card
17
+ return card
18
+ else
19
+ @draw = @discard
20
+ @draw.shuffle
21
+ @discard = []
22
+ return self.sample
23
+ end
24
+ end
25
+
26
+ end
27
+
28
+ class Coin
29
+
30
+ def initialize(weighting)
31
+ @weighting = weighting
32
+ @faces = [:heads, :tails]
33
+ end
34
+
35
+ def attributes
36
+ "weighting"
37
+ end
38
+
39
+ def weighting
40
+ @weighting
41
+ end
42
+
43
+ def sample
44
+ case @weighting
45
+ when :fair then @faces.sample
46
+ when :heads then :heads
47
+ when :tails then :tails
48
+ end
49
+ end
50
+
51
+ end
52
+
53
+ context "entropy" do
54
+ it "calculates correctly for heads and tails" do
55
+ examples = []
56
+ classifications = {}
57
+ coin = Coin.new(:fair)
58
+ 1000.times do
59
+ obj = Object.new
60
+ examples << obj
61
+ classifications[obj] = coin.sample
62
+ end
63
+ entropy = Dwarf::Information.entropy(examples, classifications)
64
+ entropy.should > 0.99
65
+ entropy.should <= 1.0
66
+ end
67
+
68
+ it "calculates correctly for 1d6" do
69
+ examples = []
70
+ classifications = {}
71
+ die = (1..6).map{|v| v}
72
+ 1000.times do
73
+ obj = Object.new
74
+ examples << obj
75
+ classifications[obj] = die.sample
76
+ end
77
+ entropy = Dwarf::Information.entropy(examples, classifications)
78
+ entropy.should > 0.99
79
+ entropy.should <= 1.0
80
+ end
81
+
82
+ it "calculates correctly for a deck of cards" do
83
+ examples = []
84
+ classifications = {}
85
+ deck = Deck.new
86
+ 1000.times do
87
+ obj = Object.new
88
+ examples << obj
89
+ classifications[obj] = deck.sample
90
+ end
91
+ entropy = Dwarf::Information.entropy(examples, classifications)
92
+ entropy.should > 0.99
93
+ entropy.should <= 1.0
94
+ end
95
+
96
+ it "calculates correctly with a weighted coin" do
97
+ examples = []
98
+ classifications = {}
99
+ 1000.times do
100
+ obj = Object.new
101
+ examples << obj
102
+ classifications[obj] = (rand(100) == 99) ? :heads : :tails
103
+ end
104
+ entropy = Dwarf::Information.entropy(examples,classifications)
105
+ entropy.should < 0.101 #With a perfect 99:1 distribution, entropy should == 0.0807...
106
+ entropy.should >= 0.04
107
+ end
108
+
109
+ it "calculates correctly with a homogenous set" do
110
+ examples = []
111
+ classifications = {}
112
+ 1000.times do
113
+ obj = Object.new
114
+ examples << obj
115
+ classifications[obj] = :heads
116
+ end
117
+ entropy = Dwarf::Information.entropy(examples,classifications)
118
+ entropy.should == 0.0
119
+ end
120
+
121
+ end
122
+
123
+ context "information_gain" do
124
+
125
+ it "calculates correctly splitting perfectly weighted coins" do
126
+ examples = []
127
+ classifications = {}
128
+ 500.times do
129
+ coin = Coin.new(:heads)
130
+ examples << coin
131
+ classifications[coin] = coin.sample
132
+ end
133
+ 500.times do
134
+ coin = Coin.new(:tails)
135
+ examples << coin
136
+ classifications[coin] = coin.sample
137
+ end
138
+ information_gain = Dwarf::Information.information_gain(examples, "weighting", classifications)
139
+ information_gain.should == 1.0
140
+ end
141
+
142
+ it "calculates worthless infogame for fair weighted coins" do
143
+ examples = []
144
+ classifications = {}
145
+ coin = Coin.new(:fair)
146
+ 1000.times do
147
+ coin = Coin.new(:fair)
148
+ examples << coin
149
+ classifications[coin] = coin.sample
150
+ end
151
+ information_gain = Dwarf::Information.information_gain(examples, "weighting", classifications)
152
+ information_gain.should == 0.0
153
+ end
154
+
155
+ end
156
+
157
+ end
@@ -0,0 +1,105 @@
1
+ #require File.join(File.dirname(__FILE__), *%w[. spec_helper.rb])
2
+ require 'rspec/mocks'
3
+ require 'faker'
4
+ require 'digest'
5
+
6
+ class Frawd
7
+ attr_reader :rules
8
+
9
+ def initialize(depth = 10, sample_sizes = 1000)
10
+ @depth = depth
11
+ @sample_sizes = sample_sizes
12
+ initialize_attributes
13
+ @leaves = []
14
+ @rules = build_rules
15
+ @rules.each_leaf do |leaf|
16
+ @leaves << leaf
17
+ end
18
+ end
19
+
20
+ def types
21
+ [:enum, :number, :text]
22
+ end
23
+
24
+ def enums
25
+ unless @enums
26
+ @enums = [[:true, :false],
27
+ [:baz, :bar, :zot],
28
+ [:baz, :bar, :zot, :quux]]
29
+ (1..rand(10)).each do
30
+ @enums << Faker::Lorem.words(rand(10)).uniq.map(&:to_sym)
31
+ end
32
+ end
33
+ @enums
34
+ end
35
+
36
+ def classifications
37
+ @classifications ||= (1..rand(10)).map {|x| "classification#{x}".to_sym }
38
+ end
39
+
40
+ def initialize_attributes
41
+ @attributes = []
42
+ num_attributes = 10#rand(100)
43
+ (1..num_attributes).each do |number|
44
+ type = types.sample
45
+ values = enums.sample if type == :enum
46
+ @attributes << ["attribute#{number}", type, values]
47
+ end
48
+ end
49
+
50
+ def filtered_attributes
51
+ @attributes.select {|a| a[1] == :enum}
52
+ end
53
+
54
+ def build_rules(node = Dwarf::TreeNode.new("ROOT"), attributes = filtered_attributes)
55
+ parents = node.parentage || []
56
+ if (rand(@depth) < parents.length) || attributes.empty?
57
+ node.classification = classifications.sample
58
+ else
59
+ attribute = attributes.sample
60
+ node.attribute = attribute[0]
61
+ attribute[2].each do |value|
62
+ child = Dwarf::TreeNode.new(value.to_s)
63
+ node << child
64
+ build_rules(child,attributes-[attribute[0]])
65
+ end
66
+ end
67
+ node
68
+ end
69
+
70
+ def generate_example
71
+ node = @leaves.sample
72
+ example_classification = node.classification
73
+ example = RSpec::Mocks::Mock.new('example')
74
+ node.parentage.unshift(node).each_cons(2) do |child, parent|
75
+ example.stub!(parent.attribute.to_sym) { child.name }
76
+ example.stub!(:attribute_names) { @attributes.map {|a| a[0]} }
77
+ end
78
+ @attributes.each do |attribute|
79
+ unless example.respond_to? attribute[0].to_sym
80
+ val = case attribute[1]
81
+ when :enum then attribute[2].sample
82
+ when :number then rand((2**(0.size * 8 -2) -1))
83
+ when :text then Faker::Lorem.paragraphs
84
+ end
85
+ example.stub!(attribute[0].to_sym) { val }
86
+ end
87
+ end
88
+ [ example, example_classification ]
89
+ end
90
+
91
+ def generate_examples(count)
92
+ examples = Array.new(count)
93
+ (0...count).each { |index| examples[index] = generate_example }
94
+ examples
95
+ end
96
+
97
+ def training
98
+ @training ||= generate_examples(@sample_sizes)
99
+ end
100
+
101
+ def testing
102
+ @testing ||= generate_examples(@sample_sizes)
103
+ end
104
+
105
+ end
@@ -1 +1,60 @@
1
1
  require File.join(File.dirname(__FILE__), *%w[.. lib dwarf])
2
+ require File.join(File.dirname(__FILE__), *%w[. frawd])
3
+
4
+ # http://blog.jayfields.com/2007/04/ruby-assigning-instance-variables-in.html
5
+ class Module
6
+ def initializer(*args, &block)
7
+ define_method :initialize do |*ctor_args|
8
+ ctor_named_args = (ctor_args.last.is_a?(Hash) ? ctor_args.pop : {})
9
+ (0..args.size).each do |index|
10
+ instance_variable_set("@#{args[index]}", ctor_args[index])
11
+ end
12
+ ctor_named_args.each_pair do |param_name, param_value|
13
+ instance_variable_set("@#{param_name}", param_value)
14
+ end
15
+ end
16
+ end
17
+ end
18
+
19
+ class FakeCar
20
+ initializer :body_style, :cylinders, :wheel_diameter, :transmission
21
+ attr_accessor :body_style, :cylinders, :wheel_diameter, :transmission
22
+
23
+ @@vin_counter = 0
24
+
25
+ def vin
26
+ @vin ||= @@vin_counter+=1
27
+ end
28
+
29
+ def attributes
30
+ ["body_style", "cylinders", "wheel_diameter", "transmission", "vin"]
31
+ end
32
+
33
+ alias_method :attribute_names, :attributes
34
+
35
+ def to_s
36
+ "#{body_style} with #{cylinders} cylinders"
37
+ end
38
+
39
+ def self.valid_body_styles
40
+ [:boxy, :swoopy, :angry, :boring]
41
+ end
42
+
43
+ def self.valid_cylinders
44
+ [4, 6, 8]
45
+ end
46
+
47
+ def self.fake
48
+ new(:body_style => valid_body_styles.sample,
49
+ :cylinders => valid_cylinders.sample)
50
+ end
51
+
52
+ def self.multiple_fakes(how_many=5)
53
+ array = []
54
+ how_many.times do
55
+ array << fake
56
+ end
57
+ array
58
+ end
59
+ end
60
+
@@ -0,0 +1,60 @@
1
+ # Run me with:
2
+ #
3
+ # $ watchr specs.watchr
4
+
5
+ # --------------------------------------------------
6
+ # Convenience Methods
7
+ # --------------------------------------------------
8
+ def all_spec_files
9
+ Dir['spec/**/*_spec.rb']
10
+ end
11
+
12
+ def run_spec_matching(thing_to_match)
13
+ matches = all_spec_files.grep(/#{thing_to_match}/i)
14
+ if matches.empty?
15
+ puts "Sorry, thanks for playing, but there were no matches for #{thing_to_match}"
16
+ else
17
+ run matches.join(' ')
18
+ end
19
+ end
20
+
21
+ def run(files_to_run)
22
+ puts("Running: #{files_to_run}")
23
+ system("clear;rspec -cfs #{files_to_run}")
24
+ no_int_for_you
25
+ end
26
+
27
+ def run_all_specs
28
+ run(all_spec_files.join(' '))
29
+ end
30
+
31
+ # --------------------------------------------------
32
+ # Watchr Rules
33
+ # --------------------------------------------------
34
+ watch('^spec/(.*)_spec\.rb') { |m| run_spec_matching(m[1]) }
35
+ watch('^lib/(.*)\.rb') { |m| run_spec_matching(m[1]) }
36
+ watch('^spec/spec_helper\.rb') { run_all_specs }
37
+ watch('^spec/frawd\.rb') { run_all_specs }
38
+ watch('^spec/support/.*\.rb') { run_all_specs }
39
+
40
+ # --------------------------------------------------
41
+ # Signal Handling
42
+ # --------------------------------------------------
43
+
44
+ def no_int_for_you
45
+ @sent_an_int = nil
46
+ end
47
+
48
+ Signal.trap 'INT' do
49
+ if @sent_an_int then
50
+ puts " A second INT? Ok, I get the message. Shutting down now."
51
+ exit
52
+ else
53
+ puts " Did you just send me an INT? Ugh. I'll quit for real if you do it again."
54
+ @sent_an_int = true
55
+ Kernel.sleep 1.5
56
+ run_all_specs
57
+ end
58
+ end
59
+
60
+ # vim:ft=ruby
metadata CHANGED
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
5
5
  segments:
6
6
  - 0
7
7
  - 0
8
- - 4
9
- version: 0.0.4
8
+ - 5
9
+ version: 0.0.5
10
10
  platform: ruby
11
11
  authors:
12
12
  - Alex Redington
@@ -14,7 +14,7 @@ autorequire:
14
14
  bindir: bin
15
15
  cert_chain: []
16
16
 
17
- date: 2010-10-22 00:00:00 -04:00
17
+ date: 2010-11-05 00:00:00 -04:00
18
18
  default_executable:
19
19
  dependencies:
20
20
  - !ruby/object:Gem::Dependency
@@ -62,6 +62,35 @@ dependencies:
62
62
  version: 2.0.1
63
63
  type: :development
64
64
  version_requirements: *id003
65
+ - !ruby/object:Gem::Dependency
66
+ name: watchr
67
+ prerelease: false
68
+ requirement: &id004 !ruby/object:Gem::Requirement
69
+ none: false
70
+ requirements:
71
+ - - ">="
72
+ - !ruby/object:Gem::Version
73
+ segments:
74
+ - 0
75
+ - 7
76
+ version: "0.7"
77
+ type: :development
78
+ version_requirements: *id004
79
+ - !ruby/object:Gem::Dependency
80
+ name: faker
81
+ prerelease: false
82
+ requirement: &id005 !ruby/object:Gem::Requirement
83
+ none: false
84
+ requirements:
85
+ - - ">="
86
+ - !ruby/object:Gem::Version
87
+ segments:
88
+ - 0
89
+ - 3
90
+ - 1
91
+ version: 0.3.1
92
+ type: :development
93
+ version_requirements: *id005
65
94
  description: Dwarf is an implementation of decision tree learning algorithms targeted for use in the Rails 3 console environment for classifying ActiveRecord objects.
66
95
  email:
67
96
  - aredington@gmail.com
@@ -77,13 +106,19 @@ files:
77
106
  - Gemfile.lock
78
107
  - README.md
79
108
  - Rakefile
109
+ - TODO.taskpaper
80
110
  - dwarf.gemspec
81
111
  - lib/dwarf.rb
82
112
  - lib/dwarf/classifier.rb
113
+ - lib/dwarf/example_management.rb
114
+ - lib/dwarf/information.rb
83
115
  - lib/dwarf/tree_node.rb
84
116
  - lib/dwarf/version.rb
85
- - spec/classifier_spec.rb
117
+ - spec/dwarf/classifier_spec.rb
118
+ - spec/dwarf/information_spec.rb
119
+ - spec/frawd.rb
86
120
  - spec/spec_helper.rb
121
+ - specs.watchr
87
122
  has_rdoc: true
88
123
  homepage: http://github.com/aredington/dwarf
89
124
  licenses: []
@@ -1,80 +0,0 @@
1
- require File.join(File.dirname(__FILE__), *%w[spec_helper])
2
-
3
- describe Dwarf::Classifier do
4
-
5
- before(:each) do
6
- @classifier = Dwarf::Classifier.new()
7
- end
8
-
9
- it "accepts example classifications" do
10
- @classifier.should respond_to(:add_example)
11
- end
12
-
13
- it "stores examples" do
14
- @example3 = double('example3')
15
- @example3.stub(:attributes) { [] }
16
- @classifier.add_example(@example3, :irish)
17
- @classifier.examples.should include(@example3)
18
- end
19
-
20
- it "only implements classify on the learning instance" do
21
- @example = double('example3')
22
- @example.stub(:attributes) { [] }
23
- @class2 = Dwarf::Classifier.new()
24
- @classifier.add_example(@example, :round)
25
- @classifier.learn!
26
- @classifier.classify(@example).should eq(:round)
27
- @class2.classify(@example).should eq(nil)
28
- end
29
-
30
- context "classifying cars" do
31
- def mock_car_examples
32
- @example1 = double('example1')
33
- @example1.stub(:body_style) { :boxy }
34
- @example1.stub(:cylinders) { 4 }
35
- @example1.stub(:attributes) { ["body_style", "cylinders"] }
36
- @example2 = double('example2')
37
- @example2.stub(:body_style) { :swoopy }
38
- @example2.stub(:cylinders) { 6 }
39
- @example2.stub(:attributes) { ["body_style", "cylinders"] }
40
- @example3 = double('example3')
41
- @example3.stub(:body_style) { :angry }
42
- @example3.stub(:cylinders) { 8 }
43
- @example3.stub(:attributes) { ["body_style", "cylinders"] }
44
- @example4 = double('example4')
45
- @example4.stub(:body_style) {:swoopy}
46
- @example4.stub(:cylinders) {8}
47
- @example4.stub(:attributes) { ["body_style", "cylinders"] }
48
- end
49
-
50
- it "enumerate example attributes" do
51
- mock_car_examples
52
- @classifier.add_example(@example1, :japanese)
53
- @classifier.example_attributes.should include("body_style", "cylinders")
54
- end
55
-
56
- it "classifies in a trivial case" do
57
- mock_car_examples
58
- @classifier.add_example(@example1, :japanese)
59
- @classifier.add_example(@example2, :german)
60
- @classifier.add_example(@example3, :american)
61
- @classifier.learn!
62
- @classifier.classify(@example1).should eq(:japanese)
63
- @classifier.classify(@example2).should eq(:german)
64
- @classifier.classify(@example3).should eq(:american)
65
- end
66
-
67
- it "classifies when multiple predicates required" do
68
- mock_car_examples
69
- @classifier.add_examples(@example1 => :japanese, @example2 => :german, @example3 => :american, @example4 => :german)
70
- @classifier.learn!
71
- @classifier.classify(@example1).should eq(:japanese)
72
- @classifier.classify(@example2).should eq(:german)
73
- @classifier.classify(@example3).should eq(:american)
74
- @classifier.classify(@example4).should eq(:german)
75
- end
76
-
77
- end
78
-
79
-
80
- end