dwarf 0.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,3 @@
1
+ pkg/*
2
+ *.gem
3
+ .bundle
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
# Gem source. The symbolic :gemcutter alias is deprecated by Bundler, so the
# registry is named by its explicit HTTPS URL instead.
source "https://rubygems.org"

# Specify your gem's dependencies in dwarf.gemspec
gemspec
@@ -0,0 +1,30 @@
1
+ PATH
2
+ remote: .
3
+ specs:
4
+ dwarf (0.0.3)
5
+ rubytree (>= 0.8.1)
6
+
7
+ GEM
8
+ remote: http://rubygems.org/
9
+ specs:
10
+ diff-lcs (1.1.2)
11
+ rspec (2.0.1)
12
+ rspec-core (~> 2.0.1)
13
+ rspec-expectations (~> 2.0.1)
14
+ rspec-mocks (~> 2.0.1)
15
+ rspec-core (2.0.1)
16
+ rspec-expectations (2.0.1)
17
+ diff-lcs (>= 1.1.2)
18
+ rspec-mocks (2.0.1)
19
+ rspec-core (~> 2.0.1)
20
+ rspec-expectations (~> 2.0.1)
21
+ rubytree (0.8.1)
22
+
23
+ PLATFORMS
24
+ ruby
25
+
26
+ DEPENDENCIES
27
+ bundler (>= 1.0.0)
28
+ dwarf!
29
+ rspec (>= 2.0.1)
30
+ rubytree (>= 0.8.1)
File without changes
@@ -0,0 +1,7 @@
1
require 'bundler'
require 'rspec/core/rake_task'

# Register the rake tasks provided by Bundler's gem helper.
Bundler::GemHelper.install_tasks

# Define the :spec rake task with its default configuration.
RSpec::Core::RakeTask.new(:spec)
7
+
@@ -0,0 +1,24 @@
1
# -*- encoding: utf-8 -*-
require File.expand_path("../lib/dwarf/version", __FILE__)

# Gem specification for dwarf. The packaged file list and the executables are
# both derived from the files tracked by git.
Gem::Specification.new do |s|
  # Identity
  s.name     = "dwarf"
  s.version  = Dwarf::VERSION
  s.platform = Gem::Platform::RUBY

  # Authorship and description
  s.authors     = ["Alex Redington"]
  s.email       = ["aredington@gmail.com"]
  s.homepage    = "http://github.com/aredington/dwarf"
  s.summary     = "C4.5 for ActiveRecord objects"
  s.description = "Dwarf is an implementation of the C4.5 algorithm targeted for use in the Rails 3 console environment for classifying ActiveRecord objects."

  s.required_rubygems_version = ">= 1.3.6"
  s.rubyforge_project         = "dwarf"

  # Runtime and development dependencies
  s.add_dependency "rubytree", ">= 0.8.1"
  s.add_development_dependency "bundler", ">= 1.0.0"
  s.add_development_dependency "rspec", ">= 2.0.1"

  # Everything tracked by git ships with the gem; anything under bin/ is
  # exposed as an executable (by its path relative to bin/).
  tracked_files = `git ls-files`.split("\n")
  s.files        = tracked_files
  s.executables  = tracked_files.map { |path| path[/^bin\/(.*)/, 1] }.compact
  s.require_path = 'lib'
end
@@ -0,0 +1,4 @@
1
# Entry point for the dwarf gem: loads the version constant, the classifier
# and the tree node it builds its decision tree from.
module Dwarf
  # Loaded here so Dwarf::VERSION is available after a plain `require 'dwarf'`.
  require 'dwarf/version'
  require 'dwarf/classifier'
  require 'dwarf/tree_node'
end
@@ -0,0 +1,176 @@
1
module Dwarf
  # Learns a decision tree from labelled example records and turns it into a
  # generated `classify` method on the instance that learned it.
  #
  # Example records must respond to #attributes (the names of the attributes
  # to consider) and to a reader method for each of those names.
  class Classifier
    # Hash of example record => classification.
    attr_accessor :examples
    # Array of every attribute name seen on the examples added so far.
    attr_accessor :example_attributes
    # Ruby source (String) of the body of the generated classify method.
    attr_accessor :classifier_logic

    # Information gains at or below this value are treated as zero so that
    # floating point noise never triggers a split.
    GAIN_EPSILON = 1e-12

    def initialize
      @examples = {}
      @example_attributes = []
      @decision_tree = TreeNode.new("ROOT")
    end

    # Adds several examples at once.
    #
    # example_hash - Hash of example record => classification.
    def add_examples(example_hash)
      example_hash.each do |example, classification|
        add_example(example, classification)
      end
    end

    # Records one labelled example and remembers its attribute names.
    #
    # example_record - object responding to #attributes; the result may be an
    #                  Array of names or a Hash-like object, in which case its
    #                  keys are used as the names.
    # classification - the label to associate with the record.
    def add_example(example_record, classification)
      @examples[example_record] = classification
      attribute_names = example_record.attributes
      attribute_names = attribute_names.keys if attribute_names.respond_to?(:keys)
      @example_attributes |= attribute_names
    end

    # Placeholder used until learn! has run; learn! replaces it on the
    # learning instance with the generated implementation.
    def classify(example)
      nil
    end

    # Builds the decision tree from the stored examples, stores its generated
    # source in classifier_logic and installs it as this instance's classify.
    #
    # Each pending node becomes either a leaf labelled with its only
    # classification, an inner node split on the attribute with the highest
    # information gain, or (when no attribute separates its examples) a leaf
    # labelled with the most frequent classification.
    def learn!
      # Start from a fresh tree so learn! can be called again after more
      # examples have been added.
      @decision_tree = TreeNode.new("ROOT")
      @decision_tree.examples = @examples.keys
      pending = [@decision_tree]
      until pending.empty?
        node = pending.pop
        label = homogenous_examples(node)
        if !label.nil?
          node.classification = label
        elsif (best_attribute = most_informative_attribute(node.examples))
          # No bookkeeping of already-used attributes is needed: once a branch
          # has been split on an attribute, every example below it shares one
          # value for it, so its gain there is zero and it is never re-picked.
          split(node, best_attribute).each { |child_node| pending.push(child_node) }
        else
          node.classification = expected_value(node.examples)
        end
      end
      self.classifier_logic = codify_tree(@decision_tree)
      implement_classify
    end

    private

    # Defines classify on this instance only, from the generated source.
    #
    # NOTE(security): codify_node interpolates attribute names verbatim into
    # the evaluated source; only learn from records whose attribute names are
    # trusted.
    def implement_classify
      classify_impl = "def classify(example)\n#{classifier_logic}\nend"
      instance_eval classify_impl
    end

    # Returns the Ruby source for the body of classify for the given tree.
    def codify_tree(decision_tree)
      lines = [""]
      codify_node(decision_tree, lines, 1)
      lines.join("\n")
    end

    # Appends the source for one node to lines: a `case` over the node's
    # attribute with one `when` per child, or a `return` of its classification.
    def codify_node(decision_tree, lines, depth)
      indent = " " * depth
      if decision_tree.attribute
        lines << "#{indent}case example.#{decision_tree.attribute}"
        decision_tree.children.each do |child|
          lines << "#{indent}when #{codify_literal(child.name)}"
          codify_node(child, lines, depth + 1)
        end
        lines << "#{indent}end"
      elsif !decision_tree.classification.nil?
        lines << "#{indent}return #{codify_literal(decision_tree.classification)}"
      end
    end

    # Returns Ruby source text for a value used in the generated code.
    # Symbols, Strings and nil go through #inspect so quotes, spaces and
    # escapes survive the round trip; anything else uses #to_s.
    def codify_literal(object)
      case object
      when Symbol, String, NilClass then object.inspect
      else object.to_s
      end
    end

    # Turns node into an inner node testing attribute and gives it one child
    # per distinct attribute value, each holding the matching examples.
    # Returns the children.
    def split(node, attribute)
      node.attribute = attribute
      invert_with_dups(attribute_map(node.examples, attribute)).each do |value, members|
        child_node = TreeNode.new(value)
        child_node.examples = members
        node << child_node
      end
      node.examples = nil
      node.children
    end

    # Returns the most frequent classification among example_subset
    # (nil when the subset is empty).
    def expected_value(example_subset)
      by_label = invert_with_dups(classification_map(example_subset))
      label, _members = by_label.max_by { |_label, members| members.length }
      label
    end

    # Returns the attribute with the highest information gain over
    # example_subset, or nil when no attribute has a gain above GAIN_EPSILON.
    def most_informative_attribute(example_subset)
      best_attribute = nil
      best_gain = GAIN_EPSILON
      @example_attributes.each do |example_attribute|
        gain = information_gain(example_subset, example_attribute)
        if gain > best_gain
          best_attribute = example_attribute
          best_gain = gain
        end
      end
      best_attribute
    end

    # Returns the single classification shared by all of the node's examples,
    # or nil when they carry more than one (or none).
    def homogenous_examples(node)
      labels = classifications(node.examples).uniq
      labels.length == 1 ? labels.first : nil
    end

    # Shannon entropy (base 2) of the classifications in example_subset.
    def entropy(example_subset)
      set_size = example_subset.length.to_f
      by_label = invert_with_dups(classification_map(example_subset))
      weighted = classifications(example_subset).uniq.inject(0.0) do |sum, label|
        proportion = by_label[label].length / set_size
        sum + proportion * Math.log2(proportion)
      end
      0.0 - weighted
    end

    # Entropy of example_subset minus the size-weighted entropy of the groups
    # obtained by partitioning it on attribute.
    def information_gain(example_subset, attribute)
      set_size = example_subset.length.to_f
      partitions = invert_with_dups(attribute_map(example_subset, attribute))
      remainder = partitions.values.inject(0.0) do |sum, members|
        sum + (members.length / set_size) * entropy(members)
      end
      entropy(example_subset) - remainder
    end

    # Non-nil classifications of the given examples, one entry per example.
    def classifications(example_subset)
      example_subset.map { |example| @examples[example] }.compact
    end

    # Hash of example => classification for the given examples.
    def classification_map(example_subset)
      example_subset.each_with_object({}) do |example, map|
        map[example] = @examples[example]
      end
    end

    # Reads one attribute from an example through its public reader, the same
    # way the generated classify code does.
    def attribute_value(example, attribute)
      example.public_send(attribute.to_sym)
    end

    # Hash of example => value of attribute for the given examples.
    def attribute_map(example_subset, attribute)
      example_subset.each_with_object({}) do |example, map|
        map[example] = attribute_value(example, attribute)
      end
    end

    # Inverts a hash while keeping duplicates: returns value => [keys...].
    def invert_with_dups(hash)
      hash.each_with_object({}) do |(key, value), inversion|
        (inversion[value] ||= []) << key
      end
    end
  end
end
@@ -0,0 +1,15 @@
1
+ require 'tree'
2
module Dwarf
  # Decision-tree node: a Tree::TreeNode carrying the extra state the
  # classifier attaches to each node.
  class TreeNode < Tree::TreeNode
    # Examples held by this node.
    attr_accessor :examples
    # Classification assigned to this node, if any.
    attr_accessor :classification
    # Attribute this node was split on, if any.
    attr_accessor :attribute

    # name    - node name, forwarded to Tree::TreeNode.
    # content - optional node content, forwarded to Tree::TreeNode.
    def initialize(name, content = nil)
      super(name, content)
      # Assign instance variables: a bare `examples = []` would only create a
      # method-local variable and leave the accessors returning nil.
      @examples = []
      @classification = nil
      @attribute = nil
    end
  end
end
@@ -0,0 +1,3 @@
1
# Version of the dwarf gem; dwarf.gemspec reads it as Dwarf::VERSION.
module Dwarf
  VERSION = "0.0.3"
end
@@ -0,0 +1,80 @@
1
require File.join(File.dirname(__FILE__), 'spec_helper')

# Behaviour of Dwarf::Classifier: storing examples, learning from them and
# classifying records with the learned tree.
describe Dwarf::Classifier do

  before(:each) { @classifier = Dwarf::Classifier.new }

  it "accepts example classifications" do
    @classifier.should respond_to(:add_example)
  end

  it "stores examples" do
    @example3 = double('example3')
    @example3.stub(:attributes) { [] }

    @classifier.add_example(@example3, :irish)

    @classifier.examples.should include(@example3)
  end

  it "only implements classify on the learning instance" do
    @example = double('example3')
    @example.stub(:attributes) { [] }
    @class2 = Dwarf::Classifier.new

    @classifier.add_example(@example, :round)
    @classifier.learn!

    @classifier.classify(@example).should eq(:round)
    @class2.classify(@example).should eq(nil)
  end

  context "classifying cars" do
    # Builds a double that answers the two car attributes and lists them.
    def car_double(name, body_style, cylinders)
      car = double(name)
      car.stub(:body_style) { body_style }
      car.stub(:cylinders) { cylinders }
      car.stub(:attributes) { ["body_style", "cylinders"] }
      car
    end

    def mock_car_examples
      @example1 = car_double('example1', :boxy, 4)
      @example2 = car_double('example2', :swoopy, 6)
      @example3 = car_double('example3', :angry, 8)
      @example4 = car_double('example4', :swoopy, 8)
    end

    it "enumerate example attributes" do
      mock_car_examples
      @classifier.add_example(@example1, :japanese)
      @classifier.example_attributes.should include("body_style", "cylinders")
    end

    it "classifies in a trivial case" do
      mock_car_examples
      expected = [[@example1, :japanese], [@example2, :german], [@example3, :american]]

      expected.each { |car, origin| @classifier.add_example(car, origin) }
      @classifier.learn!

      expected.each { |car, origin| @classifier.classify(car).should eq(origin) }
    end

    it "classifies when multiple predicates required" do
      mock_car_examples
      expected = [[@example1, :japanese], [@example2, :german],
                  [@example3, :american], [@example4, :german]]

      @classifier.add_examples(@example1 => :japanese, @example2 => :german, @example3 => :american, @example4 => :german)
      @classifier.learn!

      expected.each { |car, origin| @classifier.classify(car).should eq(origin) }
    end

  end

end
@@ -0,0 +1 @@
1
+ require File.join(File.dirname(__FILE__), *%w[.. lib dwarf])
metadata ADDED
@@ -0,0 +1,122 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: dwarf
3
+ version: !ruby/object:Gem::Version
4
+ prerelease: false
5
+ segments:
6
+ - 0
7
+ - 0
8
+ - 3
9
+ version: 0.0.3
10
+ platform: ruby
11
+ authors:
12
+ - Alex Redington
13
+ autorequire:
14
+ bindir: bin
15
+ cert_chain: []
16
+
17
+ date: 2010-10-22 00:00:00 -04:00
18
+ default_executable:
19
+ dependencies:
20
+ - !ruby/object:Gem::Dependency
21
+ name: rubytree
22
+ prerelease: false
23
+ requirement: &id001 !ruby/object:Gem::Requirement
24
+ none: false
25
+ requirements:
26
+ - - ">="
27
+ - !ruby/object:Gem::Version
28
+ segments:
29
+ - 0
30
+ - 8
31
+ - 1
32
+ version: 0.8.1
33
+ type: :runtime
34
+ version_requirements: *id001
35
+ - !ruby/object:Gem::Dependency
36
+ name: bundler
37
+ prerelease: false
38
+ requirement: &id002 !ruby/object:Gem::Requirement
39
+ none: false
40
+ requirements:
41
+ - - ">="
42
+ - !ruby/object:Gem::Version
43
+ segments:
44
+ - 1
45
+ - 0
46
+ - 0
47
+ version: 1.0.0
48
+ type: :development
49
+ version_requirements: *id002
50
+ - !ruby/object:Gem::Dependency
51
+ name: rspec
52
+ prerelease: false
53
+ requirement: &id003 !ruby/object:Gem::Requirement
54
+ none: false
55
+ requirements:
56
+ - - ">="
57
+ - !ruby/object:Gem::Version
58
+ segments:
59
+ - 2
60
+ - 0
61
+ - 1
62
+ version: 2.0.1
63
+ type: :development
64
+ version_requirements: *id003
65
+ description: Dwarf is an implementation of the C4.5 algorithm targeted for use in the Rails 3 console environment for classifying ActiveRecord objects.
66
+ email:
67
+ - aredington@gmail.com
68
+ executables: []
69
+
70
+ extensions: []
71
+
72
+ extra_rdoc_files: []
73
+
74
+ files:
75
+ - .gitignore
76
+ - Gemfile
77
+ - Gemfile.lock
78
+ - README.md
79
+ - Rakefile
80
+ - dwarf.gemspec
81
+ - lib/dwarf.rb
82
+ - lib/dwarf/classifier.rb
83
+ - lib/dwarf/tree_node.rb
84
+ - lib/dwarf/version.rb
85
+ - spec/classifier_spec.rb
86
+ - spec/spec_helper.rb
87
+ has_rdoc: true
88
+ homepage: http://github.com/aredington/dwarf
89
+ licenses: []
90
+
91
+ post_install_message:
92
+ rdoc_options: []
93
+
94
+ require_paths:
95
+ - lib
96
+ required_ruby_version: !ruby/object:Gem::Requirement
97
+ none: false
98
+ requirements:
99
+ - - ">="
100
+ - !ruby/object:Gem::Version
101
+ segments:
102
+ - 0
103
+ version: "0"
104
+ required_rubygems_version: !ruby/object:Gem::Requirement
105
+ none: false
106
+ requirements:
107
+ - - ">="
108
+ - !ruby/object:Gem::Version
109
+ segments:
110
+ - 1
111
+ - 3
112
+ - 6
113
+ version: 1.3.6
114
+ requirements: []
115
+
116
+ rubyforge_project: dwarf
117
+ rubygems_version: 1.3.7
118
+ signing_key:
119
+ specification_version: 3
120
+ summary: C4.5 for ActiveRecord objects
121
+ test_files: []
122
+