dwarf 0.0.3

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,3 @@
1
+ pkg/*
2
+ *.gem
3
+ .bundle
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
# Fetch gems from the canonical RubyGems index over HTTPS. The symbol form
# (:gemcutter) is a deprecated alias that newer Bundler versions warn about
# or reject.
source "https://rubygems.org"

# Specify your gem's dependencies in dwarf.gemspec
gemspec
@@ -0,0 +1,30 @@
1
+ PATH
2
+ remote: .
3
+ specs:
4
+ dwarf (0.0.3)
5
+ rubytree (>= 0.8.1)
6
+
7
+ GEM
8
+ remote: http://rubygems.org/
9
+ specs:
10
+ diff-lcs (1.1.2)
11
+ rspec (2.0.1)
12
+ rspec-core (~> 2.0.1)
13
+ rspec-expectations (~> 2.0.1)
14
+ rspec-mocks (~> 2.0.1)
15
+ rspec-core (2.0.1)
16
+ rspec-expectations (2.0.1)
17
+ diff-lcs (>= 1.1.2)
18
+ rspec-mocks (2.0.1)
19
+ rspec-core (~> 2.0.1)
20
+ rspec-expectations (~> 2.0.1)
21
+ rubytree (0.8.1)
22
+
23
+ PLATFORMS
24
+ ruby
25
+
26
+ DEPENDENCIES
27
+ bundler (>= 1.0.0)
28
+ dwarf!
29
+ rspec (>= 2.0.1)
30
+ rubytree (>= 0.8.1)
File without changes
@@ -0,0 +1,7 @@
1
# Rake entry point: wires up the gem packaging tasks and the spec runner.
require 'rspec/core/rake_task'
require 'bundler'

# Defines the gem helper tasks provided by Bundler for this gemspec.
Bundler::GemHelper.install_tasks

# `rake spec` runs the RSpec suite with the default configuration.
RSpec::Core::RakeTask.new(:spec)
7
+
@@ -0,0 +1,24 @@
1
+ # -*- encoding: utf-8 -*-
2
+ require File.expand_path("../lib/dwarf/version", __FILE__)
3
+
4
# Gem specification for dwarf. The version comes from Dwarf::VERSION, which
# is loaded by the require at the top of this file.
Gem::Specification.new do |s|
  s.name        = "dwarf"
  s.version     = Dwarf::VERSION
  s.platform    = Gem::Platform::RUBY
  s.authors     = ["Alex Redington"]
  s.email       = ["aredington@gmail.com"]
  s.homepage    = "http://github.com/aredington/dwarf"
  s.summary     = "C4.5 for ActiveRecord objects"
  s.description = "Dwarf is an implementation of the C4.5 algorithm targeted for use in the Rails 3 console environment for classifying ActiveRecord objects."

  s.required_rubygems_version = ">= 1.3.6"
  s.rubyforge_project         = "dwarf"

  s.add_dependency "rubytree", ">= 0.8.1"
  s.add_development_dependency "bundler", ">= 1.0.0"
  s.add_development_dependency "rspec", ">= 2.0.1"

  # Ask git for the tracked files once and reuse the list; the packaged files
  # and the executables are both derived from it.
  tracked_files = `git ls-files`.split("\n")

  s.files        = tracked_files
  # Executables are the tracked files under bin/, listed without the prefix.
  s.executables  = tracked_files.map { |path| path[/^bin\/(.*)/, 1] }.compact
  s.require_path = 'lib'
end
@@ -0,0 +1,4 @@
1
# Entry point for the dwarf gem: loads the version constant, the classifier
# and the tree node it builds its decision tree from.
module Dwarf
  # Loaded here so Dwarf::VERSION is available after `require 'dwarf'`, not
  # only while the gemspec is being evaluated.
  require 'dwarf/version'
  require 'dwarf/classifier'
  require 'dwarf/tree_node'
end
@@ -0,0 +1,176 @@
1
module Dwarf
  # Learns a decision tree from labelled examples and compiles it into a
  # +classify+ method defined on the learning instance only.
  #
  # An example is any object that responds to +attributes+ (its attribute
  # names) and to one reader method per attribute name.
  class Classifier
    # Hash mapping each example object to its classification.
    attr_accessor :examples
    # Array of attribute names collected from every example added so far.
    attr_accessor :example_attributes
    # String holding the Ruby source of the generated +classify+ body.
    attr_accessor :classifier_logic

    def initialize
      @examples = {}
      @example_attributes = []
      @decision_tree = TreeNode.new("ROOT")
    end

    # Adds every example => classification pair of +example_hash+.
    def add_examples(example_hash)
      example_hash.each do |example, classification|
        add_example(example, classification)
      end
    end

    # Records one labelled example and merges its attribute names into
    # +example_attributes+.
    def add_example(example_record, classification)
      @examples[example_record] = classification
      names = example_record.attributes
      # NOTE(review): ActiveRecord-style objects are expected to return a Hash
      # of name => value from #attributes; only the names are needed here.
      names = names.keys if names.respond_to?(:keys)
      @example_attributes |= names
    end

    # Placeholder used until +learn!+ replaces it on this instance.
    # Always returns nil.
    def classify(example)
      nil
    end

    # Builds the decision tree from the stored examples, stores the generated
    # source in +classifier_logic+ and defines +classify+ on this instance.
    #
    # Every node becomes one of: a leaf for a single shared classification, an
    # inner node split on the attribute with the highest information gain, or
    # (when no attribute separates the examples) a leaf carrying the most
    # common classification.
    def learn!
      # Start from a fresh root so learning again never reuses old children.
      @decision_tree = TreeNode.new("ROOT")
      @decision_tree.examples = @examples.keys
      pending = [@decision_tree]
      until pending.empty?
        node = pending.pop
        if (classification = homogenous_examples(node))
          node.classification = classification
        elsif (attribute = best_attribute(node.examples))
          pending.concat(split(node, attribute))
        else
          # C4.5 would also allow for previously unseen classifications;
          # dwarf's API dictates all classifications are known before
          # learning starts, so fall back to the majority classification.
          node.classification = expected_value(node.examples)
        end
      end
      self.classifier_logic = codify_tree(@decision_tree)
      implement_classify
    end

    private

    # Defines +classify+ on this instance from the generated source.
    #
    # NOTE(review): this evaluates generated code. Attribute names are
    # interpolated verbatim as method calls, so they must come from a
    # trusted source.
    def implement_classify
      classify_impl = "def classify(example)\n#{classifier_logic}\nend"
      instance_eval classify_impl
    end

    # Returns the Ruby source for the body of +classify+.
    def codify_tree(decision_tree)
      lines = [""]
      codify_node(decision_tree, lines, 1)
      lines.join("\n")
    end

    # Appends the source for +decision_tree+ to +lines+: a +case+ over the
    # split attribute for inner nodes, a +return+ for classified leaves.
    def codify_node(decision_tree, lines, depth)
      indent = " " * depth
      if decision_tree.attribute
        lines << "#{indent}case example.#{decision_tree.attribute}"
        decision_tree.children.each do |child|
          lines << "#{indent}when #{codify_literal(child.name)}"
          codify_node(child, lines, depth + 1)
        end
        lines << "#{indent}end"
      elsif decision_tree.classification
        lines << "#{indent}return #{codify_literal(decision_tree.classification)}"
      end
    end

    # Returns Ruby source that evaluates back to +object+. Symbols, strings
    # and nil go through #inspect so quotes and unusual symbol names are
    # escaped; anything else is emitted via #to_s.
    def codify_literal(object)
      case object
      when Symbol, String, nil then object.inspect
      else object.to_s
      end
    end

    # Turns +node+ into an inner node on +attribute+: one child per distinct
    # attribute value, each holding the examples with that value. Returns the
    # children.
    def split(node, attribute)
      node.attribute = attribute
      invert_with_dups(attribute_map(node.examples, attribute)).each do |value, members|
        child_node = TreeNode.new(value)
        child_node.examples = members
        node << child_node
      end
      node.examples = nil
      node.children
    end

    # Returns the most common classification in +example_subset+ (the first
    # one encountered wins a tie), or nil for an empty subset.
    def expected_value(example_subset)
      by_class = invert_with_dups(classification_map(example_subset))
      winner = by_class.max_by { |_classification, members| members.length }
      winner && winner.first
    end

    # Returns the attribute with the highest positive information gain for
    # +example_subset+, or nil when no attribute separates the examples.
    def best_attribute(example_subset)
      gains = @example_attributes.map do |attribute|
        [attribute, information_gain(example_subset, attribute)]
      end
      attribute, gain = gains.max_by { |_attribute, value| value }
      attribute if gain && gain > 0
    end

    # True when no attribute yields a positive information gain for +node+.
    def no_valuable_attributes?(node)
      best_attribute(node.examples).nil?
    end

    # Returns the single classification shared by all of +node+'s examples,
    # or nil when they carry more than one (or none).
    def homogenous_examples(node)
      distinct = classifications(node.examples)
      distinct.length == 1 ? distinct.first : nil
    end

    # Shannon entropy (base 2) of the classifications in +example_subset+.
    def entropy(example_subset)
      set_size = example_subset.length.to_f
      by_class = invert_with_dups(classification_map(example_subset))
      weighted = classifications(example_subset).inject(0.0) do |sum, classification|
        proportion = by_class[classification].length / set_size
        sum + proportion * Math.log2(proportion)
      end
      0.0 - weighted
    end

    # Entropy of +example_subset+ minus the size-weighted entropy of the
    # groups obtained by splitting it on +attribute+.
    def information_gain(example_subset, attribute)
      set_size = example_subset.length.to_f
      by_value = invert_with_dups(attribute_map(example_subset, attribute))
      remainder = attribute_values(example_subset, attribute).inject(0.0) do |sum, attribute_value|
        members = by_value[attribute_value]
        sum + (members.length / set_size) * entropy(members)
      end
      entropy(example_subset) - remainder
    end

    # Distinct non-nil classifications present in +example_subset+.
    def classifications(example_subset)
      example_subset.map { |example| @examples[example] }.compact.uniq
    end

    # Hash of example => classification for +example_subset+.
    def classification_map(example_subset)
      example_subset.each_with_object({}) do |example, map|
        map[example] = @examples[example]
      end
    end

    # Distinct non-nil values of +attribute+ across +example_subset+.
    def attribute_values(example_subset, attribute)
      example_subset.map { |example| example.method(attribute.to_sym).call }.compact.uniq
    end

    # Hash of example => value of +attribute+ for +example_subset+.
    def attribute_map(example_subset, attribute)
      example_subset.each_with_object({}) do |example, map|
        map[example] = example.method(attribute.to_sym).call
      end
    end

    # Inverts +hash+ into value => [keys...], keeping every key that maps to
    # the same value.
    def invert_with_dups(hash)
      hash.each_with_object({}) do |(key, value), inversion|
        (inversion[value] ||= []) << key
      end
    end
  end
end
@@ -0,0 +1,15 @@
1
+ require 'tree'
2
module Dwarf
  # Decision-tree node: a Tree::TreeNode that additionally carries the
  # training examples that reached it, the attribute it splits on (inner
  # nodes) and the classification it yields (leaves).
  class TreeNode < Tree::TreeNode
    # Examples that reached this node.
    attr_accessor :examples
    # Classification assigned to this node, or nil.
    attr_accessor :classification
    # Name of the attribute this node splits on, or nil.
    attr_accessor :attribute

    # name    - node name, passed through to Tree::TreeNode.
    # content - optional node content, passed through to Tree::TreeNode.
    def initialize(name, content = nil)
      super(name, content)
      # Assign instance variables; bare `examples = []` would only create
      # locals and leave the accessors returning nil.
      @examples = []
      @classification = nil
      @attribute = nil
    end
  end
end
@@ -0,0 +1,3 @@
1
# Namespace of the dwarf gem.
module Dwarf
  # Version string of this release; read by the gemspec.
  VERSION = '0.0.3'
end
@@ -0,0 +1,80 @@
1
+ require File.join(File.dirname(__FILE__), *%w[spec_helper])
2
+
3
# Behavioural specs for Dwarf::Classifier, using RSpec doubles as examples.
describe Dwarf::Classifier do

  before(:each) do
    @classifier = Dwarf::Classifier.new
  end

  it "accepts example classifications" do
    @classifier.should respond_to(:add_example)
  end

  it "stores examples" do
    @example3 = double('example3')
    @example3.stub(:attributes) { [] }
    @classifier.add_example(@example3, :irish)
    @classifier.examples.should include(@example3)
  end

  it "only implements classify on the learning instance" do
    @example = double('example3')
    @example.stub(:attributes) { [] }
    @class2 = Dwarf::Classifier.new
    @classifier.add_example(@example, :round)
    @classifier.learn!
    @classifier.classify(@example).should eq(:round)
    @class2.classify(@example).should eq(nil)
  end

  context "classifying cars" do
    # Builds one car double exposing body_style, cylinders and attributes.
    def mock_car(label, body_style, cylinders)
      car = double(label)
      car.stub(:body_style) { body_style }
      car.stub(:cylinders) { cylinders }
      car.stub(:attributes) { ["body_style", "cylinders"] }
      car
    end

    # Populates @example1..@example4 with the car fixtures used below.
    def mock_car_examples
      @example1 = mock_car('example1', :boxy, 4)
      @example2 = mock_car('example2', :swoopy, 6)
      @example3 = mock_car('example3', :angry, 8)
      @example4 = mock_car('example4', :swoopy, 8)
    end

    it "enumerate example attributes" do
      mock_car_examples
      @classifier.add_example(@example1, :japanese)
      @classifier.example_attributes.should include("body_style", "cylinders")
    end

    it "classifies in a trivial case" do
      mock_car_examples
      labelled = [[@example1, :japanese], [@example2, :german], [@example3, :american]]
      labelled.each { |car, origin| @classifier.add_example(car, origin) }
      @classifier.learn!
      labelled.each { |car, origin| @classifier.classify(car).should eq(origin) }
    end

    it "classifies when multiple predicates required" do
      mock_car_examples
      labelled = [[@example1, :japanese], [@example2, :german], [@example3, :american], [@example4, :german]]
      @classifier.add_examples(@example1 => :japanese, @example2 => :german, @example3 => :american, @example4 => :german)
      @classifier.learn!
      labelled.each { |car, origin| @classifier.classify(car).should eq(origin) }
    end

  end

end
@@ -0,0 +1 @@
1
+ require File.join(File.dirname(__FILE__), *%w[.. lib dwarf])
metadata ADDED
@@ -0,0 +1,122 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: dwarf
3
+ version: !ruby/object:Gem::Version
4
+ prerelease: false
5
+ segments:
6
+ - 0
7
+ - 0
8
+ - 3
9
+ version: 0.0.3
10
+ platform: ruby
11
+ authors:
12
+ - Alex Redington
13
+ autorequire:
14
+ bindir: bin
15
+ cert_chain: []
16
+
17
+ date: 2010-10-22 00:00:00 -04:00
18
+ default_executable:
19
+ dependencies:
20
+ - !ruby/object:Gem::Dependency
21
+ name: rubytree
22
+ prerelease: false
23
+ requirement: &id001 !ruby/object:Gem::Requirement
24
+ none: false
25
+ requirements:
26
+ - - ">="
27
+ - !ruby/object:Gem::Version
28
+ segments:
29
+ - 0
30
+ - 8
31
+ - 1
32
+ version: 0.8.1
33
+ type: :runtime
34
+ version_requirements: *id001
35
+ - !ruby/object:Gem::Dependency
36
+ name: bundler
37
+ prerelease: false
38
+ requirement: &id002 !ruby/object:Gem::Requirement
39
+ none: false
40
+ requirements:
41
+ - - ">="
42
+ - !ruby/object:Gem::Version
43
+ segments:
44
+ - 1
45
+ - 0
46
+ - 0
47
+ version: 1.0.0
48
+ type: :development
49
+ version_requirements: *id002
50
+ - !ruby/object:Gem::Dependency
51
+ name: rspec
52
+ prerelease: false
53
+ requirement: &id003 !ruby/object:Gem::Requirement
54
+ none: false
55
+ requirements:
56
+ - - ">="
57
+ - !ruby/object:Gem::Version
58
+ segments:
59
+ - 2
60
+ - 0
61
+ - 1
62
+ version: 2.0.1
63
+ type: :development
64
+ version_requirements: *id003
65
+ description: Dwarf is an implementation of the C4.5 algorithm targeted for use in the Rails 3 console environment for classifying ActiveRecord objects.
66
+ email:
67
+ - aredington@gmail.com
68
+ executables: []
69
+
70
+ extensions: []
71
+
72
+ extra_rdoc_files: []
73
+
74
+ files:
75
+ - .gitignore
76
+ - Gemfile
77
+ - Gemfile.lock
78
+ - README.md
79
+ - Rakefile
80
+ - dwarf.gemspec
81
+ - lib/dwarf.rb
82
+ - lib/dwarf/classifier.rb
83
+ - lib/dwarf/tree_node.rb
84
+ - lib/dwarf/version.rb
85
+ - spec/classifier_spec.rb
86
+ - spec/spec_helper.rb
87
+ has_rdoc: true
88
+ homepage: http://github.com/aredington/dwarf
89
+ licenses: []
90
+
91
+ post_install_message:
92
+ rdoc_options: []
93
+
94
+ require_paths:
95
+ - lib
96
+ required_ruby_version: !ruby/object:Gem::Requirement
97
+ none: false
98
+ requirements:
99
+ - - ">="
100
+ - !ruby/object:Gem::Version
101
+ segments:
102
+ - 0
103
+ version: "0"
104
+ required_rubygems_version: !ruby/object:Gem::Requirement
105
+ none: false
106
+ requirements:
107
+ - - ">="
108
+ - !ruby/object:Gem::Version
109
+ segments:
110
+ - 1
111
+ - 3
112
+ - 6
113
+ version: 1.3.6
114
+ requirements: []
115
+
116
+ rubyforge_project: dwarf
117
+ rubygems_version: 1.3.7
118
+ signing_key:
119
+ specification_version: 3
120
+ summary: C4.5 for ActiveRecord objects
121
+ test_files: []
122
+