dwarf 0.0.4 → 0.0.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,13 +1,14 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- dwarf (0.0.3)
4
+ dwarf (0.0.4)
5
5
  rubytree (>= 0.8.1)
6
6
 
7
7
  GEM
8
8
  remote: http://rubygems.org/
9
9
  specs:
10
10
  diff-lcs (1.1.2)
11
+ faker (0.3.1)
11
12
  rspec (2.0.1)
12
13
  rspec-core (~> 2.0.1)
13
14
  rspec-expectations (~> 2.0.1)
@@ -19,6 +20,7 @@ GEM
19
20
  rspec-core (~> 2.0.1)
20
21
  rspec-expectations (~> 2.0.1)
21
22
  rubytree (0.8.1)
23
+ watchr (0.7)
22
24
 
23
25
  PLATFORMS
24
26
  ruby
@@ -26,5 +28,7 @@ PLATFORMS
26
28
  DEPENDENCIES
27
29
  bundler (>= 1.0.0)
28
30
  dwarf!
31
+ faker (>= 0.3.1)
29
32
  rspec (>= 2.0.1)
30
33
  rubytree (>= 0.8.1)
34
+ watchr (>= 0.7)
data/Rakefile CHANGED
@@ -5,3 +5,4 @@ Bundler::GemHelper.install_tasks
5
5
  RSpec::Core::RakeTask.new(:spec) do
6
6
  end
7
7
 
8
+ task :default => :spec
@@ -0,0 +1,22 @@
1
+
2
+ Dwarf 1.0 Features:
3
+
4
+ Find all instances with a given classification given a world. @alex @priority(3)
5
+ - Create queries to find all instances of a given classification. @priority(1)
6
+ - Make logic of a query for a given classification readable (as Ruby, or SQL, or ...) @priority(1)
7
+ - Generate large worlds with consistent instances to test against. @muness @priority(2)
8
+ - Handle nested features (e.g. example.engine.cylinders) @priority(1)
9
+ Handle messy data well (nil examples, examples with nil features, examples with nil subfeatures) @alex @priority(2)
10
+ Resolve weird behavior when all examples are missing some attribute. @alex @sam @priority(1)
11
+ - When attribute.nil? bisects a heterogeneous group, we should probably split on that attribute. @priority(1)
12
+ Refactor information theory methods out to enable unit testing. @alex @priority(3)
13
+ Eliminate features which are unique across all examples @sam @alex @priority(2)
14
+ - Treat hashes as nested features. @priority(1)
15
+
16
+ Dwarf Nice To Haves:
17
+
18
+ - meta-features based on type, e.g. parity(car.engine.cylinders) can be :even or :odd @priority(2)
19
+ - Bayesian classification of text fields. @priority(1)
20
+ - Junk uniquely identifying features (implicit in info gain calculations? add tests to verify!) @priority(3)
21
+ - Modular feature enumeration and feature fetching code. (Don't rely on attributes and example.attribute to be your only duck type checks!) @priority(2)
22
+ - A world-generation tool to create internally consistent data sets to measure dwarf's learning against. Maybe we can call it "frawd". @priority(1)
@@ -17,6 +17,8 @@ Gem::Specification.new do |s|
17
17
  s.add_dependency "rubytree", ">= 0.8.1"
18
18
  s.add_development_dependency "bundler", ">= 1.0.0"
19
19
  s.add_development_dependency "rspec", ">= 2.0.1"
20
+ s.add_development_dependency "watchr", ">= 0.7"
21
+ s.add_development_dependency "faker", ">= 0.3.1"
20
22
 
21
23
  s.files = `git ls-files`.split("\n")
22
24
  s.executables = `git ls-files`.split("\n").map{|f| f =~ /^bin\/(.*)/ ? $1 : nil}.compact
@@ -1,4 +1,6 @@
1
1
  module Dwarf
2
- require 'dwarf/classifier'
2
+ require 'dwarf/example_management'
3
+ require 'dwarf/information'
3
4
  require 'dwarf/tree_node'
5
+ require 'dwarf/classifier'
4
6
  end
@@ -1,12 +1,15 @@
1
1
  module Dwarf
2
2
  class Classifier
3
+
3
4
  attr_accessor :examples
4
5
  attr_accessor :example_attributes
5
6
  attr_accessor :classifier_logic
7
+ attr_reader :decision_tree
6
8
 
7
9
  def initialize()
8
10
  @examples, @example_attributes = {}, []
9
11
  @decision_tree = TreeNode.new("ROOT")
12
+ @nil_name = Object.new.to_s
10
13
  end
11
14
 
12
15
  def add_examples(example_hash)
@@ -17,7 +20,7 @@ module Dwarf
17
20
 
18
21
  def add_example(example_record, classification)
19
22
  @examples[example_record]=classification
20
- @example_attributes |= example_record.attributes
23
+ @example_attributes |= example_record.attribute_names
21
24
  end
22
25
 
23
26
  def classify(example)
@@ -26,37 +29,120 @@ module Dwarf
26
29
 
27
30
  def learn!
28
31
  @decision_tree.examples = @examples.keys
32
+ converge_tree
33
+ self.classifier_logic = codify_tree(@decision_tree)
34
+ implement_classify
35
+ end
36
+
37
+ def find_by_classification(world, classification)
38
+ matches = []
39
+ world.each do |instance|
40
+ if classify(instance) == classification
41
+ matches << instance
42
+ end
43
+ end
44
+ matches
45
+ end
46
+
47
+ private
48
+
49
+ include ExampleManagement
50
+
51
+ def converge_tree
29
52
  pending = []
30
53
  pending.push @decision_tree
31
- used_attributes = []
32
54
  until pending.empty?
33
55
  node = pending.pop
34
56
  if classification = homogenous_examples(node)
35
57
  node.classification = classification
36
58
  elsif no_valuable_attributes?(node) && node.parent
37
- node.parent.classification= expected_value(node.examples)
59
+ if split_nil_children = check_nil_split(node)
60
+ split_nil_children.each {|child_node| pending.push(child_node)}
61
+ else
62
+ create_expected_value(node)
63
+ end
38
64
  elsif no_valuable_attributes?(node)
39
- classifier_logic = expected_value(node.examples)
65
+ node.classification = expected_value(node.examples)
40
66
  elsif false #stub branch
41
67
  #C4.5 would also allow for previously unseen classifications
42
- #dwarf's API dictates all classifications are known before learning
43
- #starts
68
+ #dwarf needs to correctly handle a pre-existing tree when
69
+ #learn! is called
44
70
  else
45
- infogains = {}
46
- (@example_attributes-used_attributes).each do |example_attribute|
47
- infogains[information_gain(node.examples,example_attribute)] = example_attribute
48
- end
49
- best_gain = infogains.keys.sort[0]
50
- best_attribute = infogains[best_gain]
51
- split(node,best_attribute).each {|child_node| pending.push(child_node)}
52
- used_attributes << best_attribute
71
+ split_children = homogenize_children(node)
72
+ split_children.each {|child_node| pending.push(child_node)}
53
73
  end
54
74
  end
55
- self.classifier_logic = codify_tree(@decision_tree)
56
- implement_classify
57
75
  end
58
76
 
59
- private
77
# Attempts to split +node+ using gains computed over all of its examples,
# including those whose value for the candidate attribute is nil.
#
# Candidates are filtered_attributes minus the attributes reported by
# used_attributes(node). Returns whatever split(node, attribute) returns
# when the selected gain is positive, and nil otherwise (including when
# there are no candidate attributes at all).
def check_nil_split(node)
  candidates = filtered_attributes - used_attributes(node)
  # With no candidates the gain table is empty; comparing the resulting nil
  # against 0.0 used to raise NoMethodError.
  return nil if candidates.empty?

  infogains = {}
  candidates.each do |example_attribute|
    gain = Information.unfiltered_information_gain(node.examples, example_attribute, @examples)
    # Keyed by gain, so attributes with equal gain overwrite each other.
    infogains[gain] = example_attribute
  end

  # NOTE(review): this selects the SMALLEST gain, exactly as the previous
  # keys.sort[0] did - confirm that is the intended choice.
  best_gain = infogains.keys.min
  return nil unless best_gain > 0.0

  split(node, infogains[best_gain])
end
92
+
93
+ def create_expected_value(node)
94
+ new_node = TreeNode.new(node.name)
95
+ expected_value = expected_value(node.examples)
96
+ new_node.classification = expected_value
97
+ parent = node.parent
98
+ parent.remove! node
99
+ parent << new_node
100
+ new_node << node
101
+ end
102
+
103
+ def used_attributes(node)
104
+ if node.parentage
105
+ node.parentage.map { |parent| parent.attribute }
106
+ else
107
+ []
108
+ end
109
+ end
110
+
111
+ def attribute_homogeneous?(example_subset, attribute)
112
+ invert_with_dups(attribute_map(example_subset, attribute)).keys.size == 1
113
+ end
114
+
115
+ def heterogeneous_attributes
116
+ @example_attributes.reject { |attr| attribute_homogeneous?(@examples.keys, attr) }
117
+ end
118
+
119
+ def attribute_clusters?(example_subset, attribute)
120
+ invert_with_dups(attribute_map(example_subset, attribute)).keys.size == example_subset.size
121
+ end
122
+
123
+ def clustering_attributes
124
+ @example_attributes.select {|attr| attribute_clusters?(@examples.keys, attr) }
125
+ end
126
+
127
+ def filtered_attributes
128
+ clustering_attributes | heterogeneous_attributes
129
+ end
130
+
131
+ def homogenize_children(node)
132
+ infogains = {}
133
+
134
+ used_attributes = used_attributes(node)
135
+
136
+ (filtered_attributes-used_attributes).each do |example_attribute|
137
+ infogains[Information::information_gain(node.examples,example_attribute,@examples)] =
138
+ example_attribute
139
+ end
140
+
141
+ best_gain = infogains.keys.sort[0]
142
+ best_attribute = infogains[best_gain]
143
+
144
+ return split(node,best_attribute)
145
+ end
60
146
 
61
147
  def implement_classify
62
148
  classify_impl = "def classify(example)\n#{self.classifier_logic}\nend"
@@ -85,6 +171,7 @@ module Dwarf
85
171
 
86
172
  def codify_literal(object)
87
173
  case object
174
+ when @nil_name then "nil"
88
175
  when Symbol then ":#{object}"
89
176
  when String then "\"#{object}\""
90
177
  else
@@ -97,6 +184,9 @@ module Dwarf
97
184
  example_subset = node.examples
98
185
  examples_inversion = invert_with_dups(attribute_map(example_subset,attribute))
99
186
  examples_inversion.each do |key, value|
187
+ if key.nil?
188
+ key = @nil_name
189
+ end
100
190
  child_node = TreeNode.new(key)
101
191
  child_node.examples = value
102
192
  node << child_node
@@ -106,21 +196,20 @@ module Dwarf
106
196
  end
107
197
 
108
198
  def expected_value(example_subset)
109
- examples_inversion = invert_with_dups(classification_map(example_subset))
199
+ examples_inversion = invert_with_dups(classification_map(example_subset, @examples))
110
200
  occurrences = examples_inversion.merge(examples_inversion) { |key, value| value.length }
111
201
  occurrences.keys.sort { |key| occurrences[key] }[0]
112
202
  end
113
203
 
114
204
  def no_valuable_attributes?(node)
115
- @example_attributes.map {|example_attribute|
116
- information_gain(node.examples, example_attribute)}.each {|info_gain|
205
+ filtered_attributes.map {|example_attribute|
206
+ Information::information_gain(node.examples, example_attribute, @examples)}.each {|info_gain|
117
207
  return false if info_gain != 0}
118
208
  return true
119
209
  end
120
210
 
121
-
122
211
  def homogenous_examples(node)
123
- classifications = classifications(node.examples)
212
+ classifications = filter_classifications(@examples, node.examples)
124
213
  if classifications.length == 1
125
214
  return classifications[0]
126
215
  else
@@ -128,49 +217,5 @@ module Dwarf
128
217
  end
129
218
  end
130
219
 
131
- def entropy(example_subset)
132
- set_size = example_subset.length.to_f
133
- examples_inversion = invert_with_dups(classification_map(example_subset))
134
- occurences = examples_inversion.merge(examples_inversion) { |key, value| value.length.to_f }
135
- 0.0 - classifications(example_subset).inject(0.0) do |sum, classification|
136
- sum + ((occurences[classification]/set_size)* Math.log2((occurences[classification]/set_size)))
137
- end
138
- end
139
-
140
- def information_gain(example_subset,attribute)
141
- set_size = example_subset.length.to_f
142
- examples_inversion = invert_with_dups(attribute_map(example_subset,attribute))
143
- occurrences = examples_inversion.merge(examples_inversion) { |key, value| value.length }
144
- entropy(example_subset) - attribute_values(example_subset,attribute).inject(0.0) do |sum, attribute_value|
145
- sum + (occurrences[attribute_value]/set_size) * entropy(examples_inversion[attribute_value])
146
- end
147
- end
148
-
149
- def classifications(example_subset)
150
- example_subset.map {|example| @examples[example]}.compact
151
- end
152
-
153
- def classification_map(example_subset)
154
- classification_map = {}
155
- example_subset.each {|example| classification_map[example] = @examples[example]}
156
- classification_map
157
- end
158
-
159
- def attribute_values(example_subset, attribute)
160
- example_subset.map {|example| example.method(attribute.to_sym).call}.compact
161
- end
162
-
163
- def attribute_map(example_subset, attribute)
164
- example_map = {}
165
- example_subset.each {|example| example_map[example] = example.method(attribute.to_sym).call}
166
- example_map
167
- end
168
-
169
- def invert_with_dups(hash)
170
- inversion = {}
171
- hash.values.each {|value| inversion[value] = []}
172
- hash.keys.each {|key| inversion[hash[key]] << key}
173
- inversion
174
- end
175
220
  end
176
221
  end
@@ -0,0 +1,32 @@
1
module Dwarf
  # Helpers that relate examples to their classifications and attribute
  # values. Mixed into the classifier and the Information module.
  module ExampleManagement

    # Maps each example in +example_subset+ to classifications[example].
    def classification_map(example_subset, classifications)
      example_subset.each_with_object({}) do |example, map|
        map[example] = classifications[example]
      end
    end

    # Inverts +hash+ without losing duplicates: every distinct value becomes
    # a key whose value is the array of original keys that mapped to it.
    def invert_with_dups(hash)
      hash.each_with_object({}) do |(key, value), inversion|
        (inversion[value] ||= []) << key
      end
    end

    # Reads +attribute+ (String or Symbol) from +example+ by calling the
    # method of that name with no arguments.
    #
    # Dispatches with __send__ rather than example.method(...).call so that
    # attributes answered through method_missing are readable as well.
    def eval_attribute(example, attribute)
      example.__send__(attribute.to_sym)
    end

    # Maps each example in +example_subset+ to its value for +attribute+.
    def attribute_map(example_subset, attribute)
      example_subset.each_with_object({}) do |example, map|
        map[example] = eval_attribute(example, attribute)
      end
    end

    # Returns the distinct classifications seen among +example_subset+, in
    # first-seen order. Note the argument order: classifications first.
    def filter_classifications(classifications, example_subset)
      example_subset.map { |example| classifications[example] }.uniq
    end

  end
end
@@ -0,0 +1,61 @@
1
+ module Dwarf
2
+ module Information
3
+
4
+ class<< self
5
+ include ExampleManagement
6
+
7
+ def entropy(example_subset, classifications)
8
+ seen_classifications = filter_classifications(classifications, example_subset)
9
+ return 0.0 if seen_classifications.length == 1
10
+ set_size = example_subset.length.to_f
11
+ examples_inversion = invert_with_dups(classification_map(example_subset, classifications))
12
+ occurrences = occurrences(examples_inversion)
13
+ sum_over(seen_classifications) do |classification|
14
+ frequency = occurrences[classification]/set_size
15
+ - frequency * Math.log(frequency,seen_classifications.length)
16
+ end
17
+ end
18
+
19
+ def information_gain(example_subset, attribute, classifications)
20
+ filtered_example_subset = filter_for_missing_attribute(example_subset, attribute)
21
+ unfiltered_information_gain(filtered_example_subset, attribute, classifications)
22
+ end
23
+
24
+ def unfiltered_information_gain(example_subset, attribute, classifications)
25
+ set_size = example_subset.length.to_f
26
+ examples_inversion = invert_with_dups(attribute_map(example_subset,attribute))
27
+ occurrences = occurrences(examples_inversion)
28
+ heterogeneous_entropy = entropy(example_subset, classifications)
29
+ seen_attribute_values = attribute_values(example_subset,attribute)
30
+ heterogeneous_entropy -
31
+ sum_over(seen_attribute_values) do |attribute_value|
32
+ frequency = occurrences[attribute_value]/set_size
33
+ frequency * entropy(examples_inversion[attribute_value], classifications)
34
+ end
35
+ end
36
+
37
+ private
38
+
39
+ def sum_over(collection)
40
+ collection.inject(0.0) do |sum, classification|
41
+ sum + yield(classification)
42
+ end
43
+ end
44
+
45
+ def occurrences(examples_inversion)
46
+ examples_inversion.merge(examples_inversion) { |key, value| value.length.to_f }
47
+ end
48
+
49
+ def filter_for_missing_attribute(example_subset, attribute)
50
+ example_subset.reject { |example| eval_attribute(example,attribute).nil? }
51
+ end
52
+
53
+ def attribute_values(example_subset, attribute)
54
+ example_subset.map {|example| eval_attribute(example, attribute)}.uniq
55
+ end
56
+
57
+ end
58
+
59
+ end
60
+
61
+ end
@@ -1,3 +1,3 @@
1
1
  module Dwarf
2
- VERSION = "0.0.4"
2
+ VERSION = "0.0.5"
3
3
  end
@@ -0,0 +1,197 @@
1
+ require File.join(File.dirname(__FILE__), *%w[.. spec_helper.rb])
2
+
3
+ describe Dwarf::Classifier do
4
+
5
+ before(:each) do
6
+ @classifier = Dwarf::Classifier.new()
7
+ end
8
+
9
+ def mock_car_examples
10
+ @example1 = FakeCar.new(:body_style => :boxy,
11
+ :cylinders => 4,
12
+ :transmission => :manual)
13
+ @example2 = FakeCar.new(:body_style => :swoopy,
14
+ :cylinders => 6,
15
+ :transmission => :manual)
16
+ @example3 = FakeCar.new(:body_style => :angry,
17
+ :cylinders => 8,
18
+ :transmission => :manual)
19
+ @example4 = FakeCar.new(:body_style => :swoopy,
20
+ :cylinders => 8,
21
+ :transmission => :manual)
22
+ @example5 = FakeCar.new(:body_style => nil,
23
+ :cylinders => 6,
24
+ :transmission => :manual)
25
+ @example6 = FakeCar.new(:body_style => :sleek,
26
+ :cylinders => nil,
27
+ :transmission => :manual)
28
+ end
29
+
30
+ context "add_example" do
31
+
32
+ it "accepts example classifications" do
33
+ @classifier.should respond_to(:add_example)
34
+ end
35
+
36
+ it "stores examples" do
37
+ @example3 = double('example3')
38
+ @example3.stub(:attribute_names) { [] }
39
+ @classifier.add_example(@example3, :irish)
40
+ @classifier.examples.should include(@example3)
41
+ end
42
+
43
+ it "enumerate example attributes" do
44
+ @example_with_attributes = double('attrs')
45
+ @example_with_attributes.stub(:attribute_names) { ["height", "branch_density"] }
46
+ @classifier.add_example(@example_with_attributes, :pine)
47
+ @classifier.example_attributes.should include("height", "branch_density")
48
+ end
49
+
50
+ it "gracefully accepts examples with nil attributes" do
51
+ @example_with_nil_attributes = double('nils')
52
+ @example_with_nil_attributes.stub(:attribute_names) { ["height", "branch_density"] }
53
+ @example_with_nil_attributes.stub(:height) { nil }
54
+ @example_with_nil_attributes.stub(:branch_density) { :high }
55
+ lambda {@classifier.add_example(@example_with_nil_attributes, :pine)}.should_not raise_exception
56
+ end
57
+
58
+ end
59
+
60
+ context "learn! and classify" do
61
+
62
# A classifier that has learned answers classify; a second, untrained
# instance must be unaffected by the first one's learn!.
it "only implements classify on the learning instance" do
  @example = double('example3')
  @example.stub(:attribute_names) { [] }
  @class2 = Dwarf::Classifier.new()
  @classifier.add_example(@example, :round)
  @classifier.learn!
  # This line previously lacked .should, so the comparison result was
  # discarded and nothing was asserted about the trained classifier.
  @classifier.classify(@example).should == :round
  @class2.classify(@example).should == nil
end
71
+
72
+ context "frawd is dwarf backwards" do
73
+ before(:each) do
74
+ @frawd = Frawd.new(1,100)
75
+ end
76
+
77
+ it "is totally awesome" do
78
+ @frawd.training.each do |example, classification|
79
+ @classifier.add_example(example, classification)
80
+ end
81
+ @classifier.learn!
82
+ success = 0
83
+ @frawd.testing.each do |example, classification|
84
+ success += 1 if @classifier.classify(example) == classification
85
+ end
86
+ success.should == @frawd.testing.size
87
+ end
88
+ end
89
+
90
+ context "classifying cars" do
91
+
92
+ before(:each) do
93
+ mock_car_examples
94
+ @classifier.add_example(@example1, :japanese)
95
+ @classifier.add_example(@example2, :german)
96
+ @classifier.add_example(@example3, :american)
97
+ end
98
+
99
+ it "classifies in a trivial case" do
100
+ @classifier.learn!
101
+ @classifier.classify(@example1).should == :japanese
102
+ @classifier.classify(@example2).should == :german
103
+ @classifier.classify(@example3).should == :american
104
+ end
105
+
106
+ it "classifies when multiple predicates required" do
107
+ @classifier.add_example(@example4, :german)
108
+ @classifier.learn!
109
+ @classifier.classify(@example1).should == :japanese
110
+ @classifier.classify(@example2).should == :german
111
+ @classifier.classify(@example3).should == :american
112
+ @classifier.classify(@example4).should == :german
113
+ end
114
+
115
+ it "handles nils gracefully" do
116
+ @classifier.add_examples(@example4 => :german,
117
+ @example5 => :japanese,
118
+ @example6 => :japanese)
119
+ lambda{@classifier.learn!}.should_not raise_exception
120
+ end
121
+
122
+ it "handles nils correctly" do
123
+ @classifier.add_examples(@example4 => :german,
124
+ @example5 => :japanese,
125
+ @example6 => :japanese)
126
+ @classifier.learn!
127
+ @classifier.classify(@example1).should == :japanese
128
+ @classifier.classify(@example2).should == :german
129
+ @classifier.classify(@example3).should == :american
130
+ @classifier.classify(@example4).should == :german
131
+ @classifier.classify(@example5).should == :japanese
132
+ @classifier.classify(@example6).should == :japanese
133
+ end
134
+
135
+ it "handles a feature missing from all examples correctly" do
136
+ @classifier.add_examples(@example4 => :german,
137
+ @example5 => :japanese,
138
+ @example6 => :japanese)
139
+ @classifier.learn!
140
+ open = [@classifier.decision_tree]
141
+ until open.empty?
142
+ current = open.pop
143
+ current.attribute.should_not == "wheel_diameter"
144
+ current.children.each {|child| open.push child}
145
+ end
146
+ end
147
+
148
+ it "does not use a feature which is identical across all examples" do
149
+ @classifier.add_examples(@example4 => :german,
150
+ @example5 => :japanese,
151
+ @example6 => :japanese)
152
+ @classifier.learn!
153
+ open = [@classifier.decision_tree]
154
+ until open.empty?
155
+ current = open.pop
156
+ current.attribute.should_not == "transmission"
157
+ current.children.each {|child| open.push child}
158
+ end
159
+ end
160
+
161
+ it "does not use a feature unique to each example" do
162
+ @classifier.add_examples(@example4 => :german,
163
+ @example5 => :japanese,
164
+ @example6 => :japanese)
165
+ @classifier.learn!
166
+ open = [@classifier.decision_tree]
167
+ until open.empty?
168
+ current = open.pop
169
+ current.attribute.should_not == "vin"
170
+ current.children.each {|child| open.push child}
171
+ end
172
+
173
+ end
174
+
175
+ end
176
+
177
+ end
178
+
179
+ context "find_by_classification" do
180
+
181
+ it "returns sets of cars based on class" do
182
+ mock_car_examples
183
+ @classifier.add_examples(@example1 => :japanese,
184
+ @example2 => :german,
185
+ @example3 => :american,
186
+ @example4 => :german)
187
+ @classifier.learn!
188
+ all_cars = [@example1, @example2, @example3, @example4]
189
+ japanese_cars = @classifier.find_by_classification(all_cars, :japanese)
190
+ japanese_cars.should == [@example1]
191
+ end
192
+
193
+
194
+ end
195
+
196
+
197
+ end
@@ -0,0 +1,157 @@
1
+ require File.join(File.dirname(__FILE__), *%w[.. spec_helper.rb])
2
+
3
+ describe Dwarf::Information do
4
+
5
# Spec helper: a 52-card deck (cards are the integers 1..52) drawn without
# replacement. When the draw pile runs out, the discard pile is reshuffled
# and becomes the new draw pile.
class Deck

  def initialize
    @draw = (1..52).to_a
    @draw.shuffle!
    @discard = []
  end

  # Removes and returns the next card, recycling the discard pile first if
  # the draw pile is empty.
  def sample
    if @draw.empty?
      @draw = @discard
      # shuffle! reorders in place; the plain shuffle used before returned a
      # shuffled copy that was thrown away, leaving the pile unshuffled.
      @draw.shuffle!
      @discard = []
    end
    card = @draw.pop
    @discard.push card
    card
  end

end
27
+
28
+ class Coin
29
+
30
+ def initialize(weighting)
31
+ @weighting = weighting
32
+ @faces = [:heads, :tails]
33
+ end
34
+
35
+ def attributes
36
+ "weighting"
37
+ end
38
+
39
+ def weighting
40
+ @weighting
41
+ end
42
+
43
+ def sample
44
+ case @weighting
45
+ when :fair then @faces.sample
46
+ when :heads then :heads
47
+ when :tails then :tails
48
+ end
49
+ end
50
+
51
+ end
52
+
53
+ context "entropy" do
54
+ it "calculates correctly for heads and tails" do
55
+ examples = []
56
+ classifications = {}
57
+ coin = Coin.new(:fair)
58
+ 1000.times do
59
+ obj = Object.new
60
+ examples << obj
61
+ classifications[obj] = coin.sample
62
+ end
63
+ entropy = Dwarf::Information.entropy(examples, classifications)
64
+ entropy.should > 0.99
65
+ entropy.should <= 1.0
66
+ end
67
+
68
+ it "calculates correctly for 1d6" do
69
+ examples = []
70
+ classifications = {}
71
+ die = (1..6).map{|v| v}
72
+ 1000.times do
73
+ obj = Object.new
74
+ examples << obj
75
+ classifications[obj] = die.sample
76
+ end
77
+ entropy = Dwarf::Information.entropy(examples, classifications)
78
+ entropy.should > 0.99
79
+ entropy.should <= 1.0
80
+ end
81
+
82
+ it "calculates correctly for a deck of cards" do
83
+ examples = []
84
+ classifications = {}
85
+ deck = Deck.new
86
+ 1000.times do
87
+ obj = Object.new
88
+ examples << obj
89
+ classifications[obj] = deck.sample
90
+ end
91
+ entropy = Dwarf::Information.entropy(examples, classifications)
92
+ entropy.should > 0.99
93
+ entropy.should <= 1.0
94
+ end
95
+
96
+ it "calculates correctly with a weighted coin" do
97
+ examples = []
98
+ classifications = {}
99
+ 1000.times do
100
+ obj = Object.new
101
+ examples << obj
102
+ classifications[obj] = (rand(100) == 99) ? :heads : :tails
103
+ end
104
+ entropy = Dwarf::Information.entropy(examples,classifications)
105
+ entropy.should < 0.101 #With a perfect 99:1 distribution, entropy should == 0.0807...
106
+ entropy.should >= 0.04
107
+ end
108
+
109
+ it "calculates correctly with a homogenous set" do
110
+ examples = []
111
+ classifications = {}
112
+ 1000.times do
113
+ obj = Object.new
114
+ examples << obj
115
+ classifications[obj] = :heads
116
+ end
117
+ entropy = Dwarf::Information.entropy(examples,classifications)
118
+ entropy.should == 0.0
119
+ end
120
+
121
+ end
122
+
123
+ context "information_gain" do
124
+
125
+ it "calculates correctly splitting perfectly weighted coins" do
126
+ examples = []
127
+ classifications = {}
128
+ 500.times do
129
+ coin = Coin.new(:heads)
130
+ examples << coin
131
+ classifications[coin] = coin.sample
132
+ end
133
+ 500.times do
134
+ coin = Coin.new(:tails)
135
+ examples << coin
136
+ classifications[coin] = coin.sample
137
+ end
138
+ information_gain = Dwarf::Information.information_gain(examples, "weighting", classifications)
139
+ information_gain.should == 1.0
140
+ end
141
+
142
+ it "calculates worthless infogame for fair weighted coins" do
143
+ examples = []
144
+ classifications = {}
145
+ coin = Coin.new(:fair)
146
+ 1000.times do
147
+ coin = Coin.new(:fair)
148
+ examples << coin
149
+ classifications[coin] = coin.sample
150
+ end
151
+ information_gain = Dwarf::Information.information_gain(examples, "weighting", classifications)
152
+ information_gain.should == 0.0
153
+ end
154
+
155
+ end
156
+
157
+ end
@@ -0,0 +1,105 @@
1
+ #require File.join(File.dirname(__FILE__), *%w[. spec_helper.rb])
2
+ require 'rspec/mocks'
3
+ require 'faker'
4
+ require 'digest'
5
+
6
+ class Frawd
7
+ attr_reader :rules
8
+
9
+ def initialize(depth = 10, sample_sizes = 1000)
10
+ @depth = depth
11
+ @sample_sizes = sample_sizes
12
+ initialize_attributes
13
+ @leaves = []
14
+ @rules = build_rules
15
+ @rules.each_leaf do |leaf|
16
+ @leaves << leaf
17
+ end
18
+ end
19
+
20
+ def types
21
+ [:enum, :number, :text]
22
+ end
23
+
24
+ def enums
25
+ unless @enums
26
+ @enums = [[:true, :false],
27
+ [:baz, :bar, :zot],
28
+ [:baz, :bar, :zot, :quux]]
29
+ (1..rand(10)).each do
30
+ @enums << Faker::Lorem.words(rand(10)).uniq.map(&:to_sym)
31
+ end
32
+ end
33
+ @enums
34
+ end
35
+
36
+ def classifications
37
+ @classifications ||= (1..rand(10)).map {|x| "classification#{x}".to_sym }
38
+ end
39
+
40
+ def initialize_attributes
41
+ @attributes = []
42
+ num_attributes = 10#rand(100)
43
+ (1..num_attributes).each do |number|
44
+ type = types.sample
45
+ values = enums.sample if type == :enum
46
+ @attributes << ["attribute#{number}", type, values]
47
+ end
48
+ end
49
+
50
+ def filtered_attributes
51
+ @attributes.select {|a| a[1] == :enum}
52
+ end
53
+
54
# Recursively grows a random rule tree beneath +node+.
#
# +attributes+ is an array of [name, type, values] entries. A node becomes a
# leaf with a randomly sampled classification when no attributes remain or
# when rand(@depth) is below its number of ancestors; otherwise it is tagged
# with a randomly sampled attribute and gets one child per value of that
# attribute. Returns +node+.
def build_rules(node = Dwarf::TreeNode.new("ROOT"), attributes = filtered_attributes)
  parents = node.parentage || []
  if (rand(@depth) < parents.length) || attributes.empty?
    node.classification = classifications.sample
  else
    attribute = attributes.sample
    node.attribute = attribute[0]
    # Remove the whole entry. Subtracting only the name (attribute[0]) never
    # matched an element of this array of triples, so the same attribute
    # could be chosen again further down the same path.
    remaining = attributes - [attribute]
    attribute[2].each do |value|
      child = Dwarf::TreeNode.new(value.to_s)
      node << child
      build_rules(child, remaining)
    end
  end
  node
end
69
+
70
+ def generate_example
71
+ node = @leaves.sample
72
+ example_classification = node.classification
73
+ example = RSpec::Mocks::Mock.new('example')
74
+ node.parentage.unshift(node).each_cons(2) do |child, parent|
75
+ example.stub!(parent.attribute.to_sym) { child.name }
76
+ example.stub!(:attribute_names) { @attributes.map {|a| a[0]} }
77
+ end
78
+ @attributes.each do |attribute|
79
+ unless example.respond_to? attribute[0].to_sym
80
+ val = case attribute[1]
81
+ when :enum then attribute[2].sample
82
+ when :number then rand((2**(0.size * 8 -2) -1))
83
+ when :text then Faker::Lorem.paragraphs
84
+ end
85
+ example.stub!(attribute[0].to_sym) { val }
86
+ end
87
+ end
88
+ [ example, example_classification ]
89
+ end
90
+
91
+ def generate_examples(count)
92
+ examples = Array.new(count)
93
+ (0...count).each { |index| examples[index] = generate_example }
94
+ examples
95
+ end
96
+
97
+ def training
98
+ @training ||= generate_examples(@sample_sizes)
99
+ end
100
+
101
+ def testing
102
+ @testing ||= generate_examples(@sample_sizes)
103
+ end
104
+
105
+ end
@@ -1 +1,60 @@
1
1
  require File.join(File.dirname(__FILE__), *%w[.. lib dwarf])
2
+ require File.join(File.dirname(__FILE__), *%w[. frawd])
3
+
4
+ # http://blog.jayfields.com/2007/04/ruby-assigning-instance-variables-in.html
5
class Module
  # Class-level macro that defines #initialize for the receiver.
  #
  # Each name in +args+ becomes an instance variable filled from the
  # constructor argument at the same position (nil when that argument is
  # absent). If the last constructor argument is a Hash it is treated as
  # named arguments: each key/value pair is assigned to the instance variable
  # named after the key, overriding any positional value.
  def initializer(*args, &block)
    define_method :initialize do |*ctor_args|
      ctor_named_args = (ctor_args.last.is_a?(Hash) ? ctor_args.pop : {})
      # Iterate over the declared names only. The former inclusive range
      # 0..args.size ran one index past the end and tried to set an instance
      # variable literally named "@".
      args.each_with_index do |name, index|
        instance_variable_set("@#{name}", ctor_args[index])
      end
      ctor_named_args.each_pair do |param_name, param_value|
        instance_variable_set("@#{param_name}", param_value)
      end
    end
  end
end
18
+
19
+ class FakeCar
20
+ initializer :body_style, :cylinders, :wheel_diameter, :transmission
21
+ attr_accessor :body_style, :cylinders, :wheel_diameter, :transmission
22
+
23
+ @@vin_counter = 0
24
+
25
+ def vin
26
+ @vin ||= @@vin_counter+=1
27
+ end
28
+
29
+ def attributes
30
+ ["body_style", "cylinders", "wheel_diameter", "transmission", "vin"]
31
+ end
32
+
33
+ alias_method :attribute_names, :attributes
34
+
35
+ def to_s
36
+ "#{body_style} with #{cylinders} cylinders"
37
+ end
38
+
39
+ def self.valid_body_styles
40
+ [:boxy, :swoopy, :angry, :boring]
41
+ end
42
+
43
+ def self.valid_cylinders
44
+ [4, 6, 8]
45
+ end
46
+
47
+ def self.fake
48
+ new(:body_style => valid_body_styles.sample,
49
+ :cylinders => valid_cylinders.sample)
50
+ end
51
+
52
+ def self.multiple_fakes(how_many=5)
53
+ array = []
54
+ how_many.times do
55
+ array << fake
56
+ end
57
+ array
58
+ end
59
+ end
60
+
@@ -0,0 +1,60 @@
1
+ # Run me with:
2
+ #
3
+ # $ watchr specs.watchr
4
+
5
+ # --------------------------------------------------
6
+ # Convenience Methods
7
+ # --------------------------------------------------
8
+ def all_spec_files
9
+ Dir['spec/**/*_spec.rb']
10
+ end
11
+
12
+ def run_spec_matching(thing_to_match)
13
+ matches = all_spec_files.grep(/#{thing_to_match}/i)
14
+ if matches.empty?
15
+ puts "Sorry, thanks for playing, but there were no matches for #{thing_to_match}"
16
+ else
17
+ run matches.join(' ')
18
+ end
19
+ end
20
+
21
+ def run(files_to_run)
22
+ puts("Running: #{files_to_run}")
23
+ system("clear;rspec -cfs #{files_to_run}")
24
+ no_int_for_you
25
+ end
26
+
27
+ def run_all_specs
28
+ run(all_spec_files.join(' '))
29
+ end
30
+
31
+ # --------------------------------------------------
32
+ # Watchr Rules
33
+ # --------------------------------------------------
34
+ watch('^spec/(.*)_spec\.rb') { |m| run_spec_matching(m[1]) }
35
+ watch('^lib/(.*)\.rb') { |m| run_spec_matching(m[1]) }
36
+ watch('^spec/spec_helper\.rb') { run_all_specs }
37
+ watch('^spec/frawd\.rb') { run_all_specs }
38
+ watch('^spec/support/.*\.rb') { run_all_specs }
39
+
40
+ # --------------------------------------------------
41
+ # Signal Handling
42
+ # --------------------------------------------------
43
+
44
+ def no_int_for_you
45
+ @sent_an_int = nil
46
+ end
47
+
48
+ Signal.trap 'INT' do
49
+ if @sent_an_int then
50
+ puts " A second INT? Ok, I get the message. Shutting down now."
51
+ exit
52
+ else
53
+ puts " Did you just send me an INT? Ugh. I'll quit for real if you do it again."
54
+ @sent_an_int = true
55
+ Kernel.sleep 1.5
56
+ run_all_specs
57
+ end
58
+ end
59
+
60
+ # vim:ft=ruby
metadata CHANGED
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
5
5
  segments:
6
6
  - 0
7
7
  - 0
8
- - 4
9
- version: 0.0.4
8
+ - 5
9
+ version: 0.0.5
10
10
  platform: ruby
11
11
  authors:
12
12
  - Alex Redington
@@ -14,7 +14,7 @@ autorequire:
14
14
  bindir: bin
15
15
  cert_chain: []
16
16
 
17
- date: 2010-10-22 00:00:00 -04:00
17
+ date: 2010-11-05 00:00:00 -04:00
18
18
  default_executable:
19
19
  dependencies:
20
20
  - !ruby/object:Gem::Dependency
@@ -62,6 +62,35 @@ dependencies:
62
62
  version: 2.0.1
63
63
  type: :development
64
64
  version_requirements: *id003
65
+ - !ruby/object:Gem::Dependency
66
+ name: watchr
67
+ prerelease: false
68
+ requirement: &id004 !ruby/object:Gem::Requirement
69
+ none: false
70
+ requirements:
71
+ - - ">="
72
+ - !ruby/object:Gem::Version
73
+ segments:
74
+ - 0
75
+ - 7
76
+ version: "0.7"
77
+ type: :development
78
+ version_requirements: *id004
79
+ - !ruby/object:Gem::Dependency
80
+ name: faker
81
+ prerelease: false
82
+ requirement: &id005 !ruby/object:Gem::Requirement
83
+ none: false
84
+ requirements:
85
+ - - ">="
86
+ - !ruby/object:Gem::Version
87
+ segments:
88
+ - 0
89
+ - 3
90
+ - 1
91
+ version: 0.3.1
92
+ type: :development
93
+ version_requirements: *id005
65
94
  description: Dwarf is an implementation of decision tree learning algorithms targeted for use in the Rails 3 console environment for classifying ActiveRecord objects.
66
95
  email:
67
96
  - aredington@gmail.com
@@ -77,13 +106,19 @@ files:
77
106
  - Gemfile.lock
78
107
  - README.md
79
108
  - Rakefile
109
+ - TODO.taskpaper
80
110
  - dwarf.gemspec
81
111
  - lib/dwarf.rb
82
112
  - lib/dwarf/classifier.rb
113
+ - lib/dwarf/example_management.rb
114
+ - lib/dwarf/information.rb
83
115
  - lib/dwarf/tree_node.rb
84
116
  - lib/dwarf/version.rb
85
- - spec/classifier_spec.rb
117
+ - spec/dwarf/classifier_spec.rb
118
+ - spec/dwarf/information_spec.rb
119
+ - spec/frawd.rb
86
120
  - spec/spec_helper.rb
121
+ - specs.watchr
87
122
  has_rdoc: true
88
123
  homepage: http://github.com/aredington/dwarf
89
124
  licenses: []
@@ -1,80 +0,0 @@
1
- require File.join(File.dirname(__FILE__), *%w[spec_helper])
2
-
3
- describe Dwarf::Classifier do
4
-
5
- before(:each) do
6
- @classifier = Dwarf::Classifier.new()
7
- end
8
-
9
- it "accepts example classifications" do
10
- @classifier.should respond_to(:add_example)
11
- end
12
-
13
- it "stores examples" do
14
- @example3 = double('example3')
15
- @example3.stub(:attributes) { [] }
16
- @classifier.add_example(@example3, :irish)
17
- @classifier.examples.should include(@example3)
18
- end
19
-
20
- it "only implements classify on the learning instance" do
21
- @example = double('example3')
22
- @example.stub(:attributes) { [] }
23
- @class2 = Dwarf::Classifier.new()
24
- @classifier.add_example(@example, :round)
25
- @classifier.learn!
26
- @classifier.classify(@example).should eq(:round)
27
- @class2.classify(@example).should eq(nil)
28
- end
29
-
30
- context "classifying cars" do
31
- def mock_car_examples
32
- @example1 = double('example1')
33
- @example1.stub(:body_style) { :boxy }
34
- @example1.stub(:cylinders) { 4 }
35
- @example1.stub(:attributes) { ["body_style", "cylinders"] }
36
- @example2 = double('example2')
37
- @example2.stub(:body_style) { :swoopy }
38
- @example2.stub(:cylinders) { 6 }
39
- @example2.stub(:attributes) { ["body_style", "cylinders"] }
40
- @example3 = double('example3')
41
- @example3.stub(:body_style) { :angry }
42
- @example3.stub(:cylinders) { 8 }
43
- @example3.stub(:attributes) { ["body_style", "cylinders"] }
44
- @example4 = double('example4')
45
- @example4.stub(:body_style) {:swoopy}
46
- @example4.stub(:cylinders) {8}
47
- @example4.stub(:attributes) { ["body_style", "cylinders"] }
48
- end
49
-
50
- it "enumerate example attributes" do
51
- mock_car_examples
52
- @classifier.add_example(@example1, :japanese)
53
- @classifier.example_attributes.should include("body_style", "cylinders")
54
- end
55
-
56
- it "classifies in a trivial case" do
57
- mock_car_examples
58
- @classifier.add_example(@example1, :japanese)
59
- @classifier.add_example(@example2, :german)
60
- @classifier.add_example(@example3, :american)
61
- @classifier.learn!
62
- @classifier.classify(@example1).should eq(:japanese)
63
- @classifier.classify(@example2).should eq(:german)
64
- @classifier.classify(@example3).should eq(:american)
65
- end
66
-
67
- it "classifies when multiple predicates required" do
68
- mock_car_examples
69
- @classifier.add_examples(@example1 => :japanese, @example2 => :german, @example3 => :american, @example4 => :german)
70
- @classifier.learn!
71
- @classifier.classify(@example1).should eq(:japanese)
72
- @classifier.classify(@example2).should eq(:german)
73
- @classifier.classify(@example3).should eq(:american)
74
- @classifier.classify(@example4).should eq(:german)
75
- end
76
-
77
- end
78
-
79
-
80
- end