dwarf 0.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +3 -0
- data/Gemfile +4 -0
- data/Gemfile.lock +30 -0
- data/README.md +0 -0
- data/Rakefile +7 -0
- data/dwarf.gemspec +24 -0
- data/lib/dwarf.rb +4 -0
- data/lib/dwarf/classifier.rb +176 -0
- data/lib/dwarf/tree_node.rb +15 -0
- data/lib/dwarf/version.rb +3 -0
- data/spec/classifier_spec.rb +80 -0
- data/spec/spec_helper.rb +1 -0
- metadata +122 -0
data/.gitignore
ADDED
data/Gemfile
ADDED
data/Gemfile.lock
ADDED
@@ -0,0 +1,30 @@
|
|
1
|
+
PATH
|
2
|
+
remote: .
|
3
|
+
specs:
|
4
|
+
dwarf (0.0.3)
|
5
|
+
rubytree (>= 0.8.1)
|
6
|
+
|
7
|
+
GEM
|
8
|
+
remote: http://rubygems.org/
|
9
|
+
specs:
|
10
|
+
diff-lcs (1.1.2)
|
11
|
+
rspec (2.0.1)
|
12
|
+
rspec-core (~> 2.0.1)
|
13
|
+
rspec-expectations (~> 2.0.1)
|
14
|
+
rspec-mocks (~> 2.0.1)
|
15
|
+
rspec-core (2.0.1)
|
16
|
+
rspec-expectations (2.0.1)
|
17
|
+
diff-lcs (>= 1.1.2)
|
18
|
+
rspec-mocks (2.0.1)
|
19
|
+
rspec-core (~> 2.0.1)
|
20
|
+
rspec-expectations (~> 2.0.1)
|
21
|
+
rubytree (0.8.1)
|
22
|
+
|
23
|
+
PLATFORMS
|
24
|
+
ruby
|
25
|
+
|
26
|
+
DEPENDENCIES
|
27
|
+
bundler (>= 1.0.0)
|
28
|
+
dwarf!
|
29
|
+
rspec (>= 2.0.1)
|
30
|
+
rubytree (>= 0.8.1)
|
data/README.md
ADDED
File without changes
|
data/Rakefile
ADDED
data/dwarf.gemspec
ADDED
@@ -0,0 +1,24 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
require File.expand_path("../lib/dwarf/version", __FILE__)
|
3
|
+
|
4
|
+
# Gem specification for dwarf. The version comes from Dwarf::VERSION, which is
# loaded by the require at the top of this file.
Gem::Specification.new do |s|
  s.name        = "dwarf"
  s.version     = Dwarf::VERSION
  s.platform    = Gem::Platform::RUBY
  s.authors     = ["Alex Redington"]
  s.email       = ["aredington@gmail.com"]
  s.homepage    = "http://github.com/aredington/dwarf"
  s.summary     = "C4.5 for ActiveRecord objects"
  s.description = "Dwarf is an implementation of the C4.5 algorithm targeted for use in the Rails 3 console environment for classifying ActiveRecord objects."

  s.required_rubygems_version = ">= 1.3.6"
  s.rubyforge_project         = "dwarf"

  s.add_dependency "rubytree", ">= 0.8.1"
  s.add_development_dependency "bundler", ">= 1.0.0"
  s.add_development_dependency "rspec", ">= 2.0.1"

  # Ask git for the tracked files once; both the file list and the executable
  # list are derived from the same snapshot instead of shelling out twice.
  tracked_files = `git ls-files`.split("\n")

  s.files        = tracked_files
  # Executables are the tracked paths under bin/, with the "bin/" prefix removed.
  s.executables  = tracked_files.map { |path| path[%r{^bin/(.*)}, 1] }.compact
  s.require_path = 'lib'
end
|
data/lib/dwarf.rb
ADDED
@@ -0,0 +1,176 @@
|
|
1
|
+
module Dwarf
  # Decision-tree classifier built from labelled examples.
  #
  # Examples are objects that respond to +attributes+ (the attribute names)
  # and expose each attribute through a public reader of the same name.
  # After #learn! the instance gains a singleton #classify method generated
  # from the learned tree; the generated Ruby source is kept in
  # #classifier_logic.
  class Classifier
    attr_accessor :examples
    attr_accessor :example_attributes
    attr_accessor :classifier_logic

    def initialize
      @examples = {}
      @example_attributes = []
      @decision_tree = TreeNode.new("ROOT")
    end

    # Registers every example => classification pair of +example_hash+.
    def add_examples(example_hash)
      example_hash.each do |example, classification|
        add_example(example, classification)
      end
    end

    # Registers one example with its known classification and records the
    # attribute names it exposes.
    #
    # +example_record.attributes+ may be a list of names or a Hash keyed by
    # name; for a Hash only the keys are used.
    def add_example(example_record, classification)
      @examples[example_record] = classification
      attribute_names = example_record.attributes
      attribute_names = attribute_names.keys if attribute_names.respond_to?(:keys)
      @example_attributes |= attribute_names
    end

    # Placeholder used until #learn! replaces it on this instance.
    # Always returns nil.
    def classify(example)
      nil
    end

    # Builds the decision tree from the stored examples, renders it to Ruby
    # source (#classifier_logic) and installs it as this instance's #classify.
    #
    # The tree is rebuilt from scratch on every call, so learning again after
    # adding examples is safe.
    def learn!
      @decision_tree = TreeNode.new("ROOT")
      @decision_tree.examples = @examples.keys
      pending = [@decision_tree]
      until pending.empty?
        node = pending.pop
        if (classification = homogenous_examples(node))
          # Every example at this node agrees: make it a leaf.
          node.classification = classification
        elsif (best_attribute = best_split_attribute(node))
          split(node, best_attribute).each { |child_node| pending.push(child_node) }
        else
          # No attribute separates the remaining examples: fall back to the
          # most common classification at this node.
          node.classification = expected_value(node.examples)
        end
      end
      self.classifier_logic = codify_tree(@decision_tree)
      implement_classify
    end

    private

    # Defines #classify on this instance only, from the generated source.
    #
    # NOTE(review): this evaluates generated code. Attribute names are
    # interpolated verbatim as method calls, so they must come from a trusted
    # source.
    def implement_classify
      instance_eval("def classify(example)\n#{classifier_logic}\nend")
    end

    # Renders the whole tree as the body of the generated #classify method.
    def codify_tree(decision_tree)
      lines = [""]
      codify_node(decision_tree, lines, 1)
      lines.join("\n")
    end

    # Appends the source for +decision_tree+ to +lines+: a +case+ over the
    # node's attribute with one +when+ per child, or a +return+ for a leaf.
    def codify_node(decision_tree, lines, depth)
      indent = " " * depth
      if decision_tree.attribute
        lines << "#{indent}case example.#{decision_tree.attribute}"
        decision_tree.children.each do |child|
          lines << "#{indent}when #{codify_literal(child.name)}"
          codify_node(child, lines, depth + 1)
        end
        lines << "#{indent}end"
      elsif !decision_tree.classification.nil?
        lines << "#{indent}return #{codify_literal(decision_tree.classification)}"
      end
    end

    # Returns Ruby source for +object+. Symbols, strings and nil go through
    # #inspect so quotes, interpolation markers and odd symbol names are
    # escaped; anything else is rendered with #to_s.
    def codify_literal(object)
      case object
      when Symbol, String, NilClass then object.inspect
      else object.to_s
      end
    end

    # Turns +node+ into a decision on +attribute+: one child per distinct
    # attribute value, each holding the examples with that value.
    # Returns the new children.
    #
    # NOTE(review): the attribute value becomes the child node's name; a nil
    # value may be rejected by the tree implementation - confirm.
    def split(node, attribute)
      node.attribute = attribute
      invert_with_dups(attribute_map(node.examples, attribute)).each do |value, members|
        child_node = TreeNode.new(value)
        child_node.examples = members
        node << child_node
      end
      node.examples = nil
      node.children
    end

    # Most frequent classification among +example_subset+ (the first one seen
    # wins a tie); nil for an empty subset.
    def expected_value(example_subset)
      by_classification = invert_with_dups(classification_map(example_subset))
      winner = by_classification.max_by { |_classification, members| members.length }
      winner && winner.first
    end

    # Attribute with the largest non-zero information gain at +node+, or nil
    # when no attribute has any gain. A single-valued attribute has a gain of
    # exactly zero, so a returned attribute always yields at least two children.
    def best_split_attribute(node)
      gains = @example_attributes.map do |example_attribute|
        [example_attribute, information_gain(node.examples, example_attribute)]
      end
      best = gains.reject { |_attribute, gain| gain == 0 }.max_by { |_attribute, gain| gain }
      best && best.first
    end

    # True when no attribute has a non-zero information gain at +node+.
    def no_valuable_attributes?(node)
      best_split_attribute(node).nil?
    end

    # The single classification shared by every example at +node+, or nil
    # when the examples disagree (or there are none).
    def homogenous_examples(node)
      distinct = classifications(node.examples).uniq
      distinct.length == 1 ? distinct.first : nil
    end

    # Shannon entropy, in bits, of the classifications in +example_subset+.
    def entropy(example_subset)
      set_size = example_subset.length.to_f
      by_classification = invert_with_dups(classification_map(example_subset))
      # One term per distinct classification, not one per example.
      classifications(example_subset).uniq.inject(0.0) do |total, classification|
        probability = by_classification[classification].length / set_size
        total - probability * Math.log2(probability)
      end
    end

    # Entropy reduction obtained by partitioning +example_subset+ on
    # +attribute+: the subset's entropy minus the size-weighted entropy of
    # each group of examples sharing an attribute value.
    def information_gain(example_subset, attribute)
      set_size = example_subset.length.to_f
      by_value = invert_with_dups(attribute_map(example_subset, attribute))
      remainder = by_value.values.inject(0.0) do |total, members|
        total + (members.length / set_size) * entropy(members)
      end
      entropy(example_subset) - remainder
    end

    # Known (non-nil) classifications of +example_subset+, one per example.
    def classifications(example_subset)
      example_subset.map { |example| @examples[example] }.compact
    end

    # Hash of example => classification for +example_subset+.
    def classification_map(example_subset)
      example_subset.each_with_object({}) do |example, map|
        map[example] = @examples[example]
      end
    end

    # Non-nil values of +attribute+ across +example_subset+, one per example.
    def attribute_values(example_subset, attribute)
      example_subset.map { |example| example.public_send(attribute) }.compact
    end

    # Hash of example => value of +attribute+ for +example_subset+.
    # public_send is used so readers served by method_missing also work.
    def attribute_map(example_subset, attribute)
      example_subset.each_with_object({}) do |example, map|
        map[example] = example.public_send(attribute)
      end
    end

    # Inverts +hash+ while keeping duplicates: each value maps to the list of
    # keys that carried it, in the original key order.
    def invert_with_dups(hash)
      hash.each_with_object({}) do |(key, value), inversion|
        (inversion[value] ||= []) << key
      end
    end
  end
end
|
@@ -0,0 +1,15 @@
|
|
1
|
+
require 'tree'
|
2
|
+
module Dwarf
  # Tree::TreeNode extended with the per-node state the classifier reads and
  # writes while building a decision tree.
  class TreeNode < Tree::TreeNode
    # Examples that reached this node.
    attr_accessor :examples
    # Classification assigned to this node, if any.
    attr_accessor :classification
    # Attribute this node was split on, if any.
    attr_accessor :attribute

    # Creates a node with no examples, classification or attribute.
    # +name+ and +content+ are passed through to Tree::TreeNode.
    def initialize(name, content = nil)
      super(name, content)
      # Assign instance variables: bare `examples = []` would only create a
      # local and leave the accessor returning nil.
      @examples = []
      @classification = nil
      @attribute = nil
    end
  end
end
|
@@ -0,0 +1,80 @@
|
|
1
|
+
require File.join(File.dirname(__FILE__), *%w[spec_helper])
|
2
|
+
|
3
|
+
describe Dwarf::Classifier do

  before(:each) do
    @classifier = Dwarf::Classifier.new
  end

  it "accepts example classifications" do
    @classifier.should respond_to(:add_example)
  end

  it "stores examples" do
    @example3 = double('example3')
    @example3.stub(:attributes) { [] }
    @classifier.add_example(@example3, :irish)
    @classifier.examples.should include(@example3)
  end

  it "only implements classify on the learning instance" do
    @example = double('example3')
    @example.stub(:attributes) { [] }
    @class2 = Dwarf::Classifier.new
    @classifier.add_example(@example, :round)
    @classifier.learn!
    @classifier.classify(@example).should eq(:round)
    @class2.classify(@example).should eq(nil)
  end

  context "classifying cars" do
    # Builds a car double exposing body_style, cylinders and the attribute list.
    def car_double(label, body_style, cylinders)
      car = double(label)
      car.stub(:body_style) { body_style }
      car.stub(:cylinders) { cylinders }
      car.stub(:attributes) { ["body_style", "cylinders"] }
      car
    end

    # Populates @example1..@example4 with the car fixtures used below.
    def mock_car_examples
      @example1 = car_double('example1', :boxy, 4)
      @example2 = car_double('example2', :swoopy, 6)
      @example3 = car_double('example3', :angry, 8)
      @example4 = car_double('example4', :swoopy, 8)
    end

    it "enumerate example attributes" do
      mock_car_examples
      @classifier.add_example(@example1, :japanese)
      @classifier.example_attributes.should include("body_style", "cylinders")
    end

    it "classifies in a trivial case" do
      mock_car_examples
      @classifier.add_example(@example1, :japanese)
      @classifier.add_example(@example2, :german)
      @classifier.add_example(@example3, :american)
      @classifier.learn!
      @classifier.classify(@example1).should eq(:japanese)
      @classifier.classify(@example2).should eq(:german)
      @classifier.classify(@example3).should eq(:american)
    end

    it "classifies when multiple predicates required" do
      mock_car_examples
      @classifier.add_examples(
        @example1 => :japanese,
        @example2 => :german,
        @example3 => :american,
        @example4 => :german
      )
      @classifier.learn!
      @classifier.classify(@example1).should eq(:japanese)
      @classifier.classify(@example2).should eq(:german)
      @classifier.classify(@example3).should eq(:american)
      @classifier.classify(@example4).should eq(:german)
    end

  end

end
|
data/spec/spec_helper.rb
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
require File.join(File.dirname(__FILE__), *%w[.. lib dwarf])
|
metadata
ADDED
@@ -0,0 +1,122 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: dwarf
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
prerelease: false
|
5
|
+
segments:
|
6
|
+
- 0
|
7
|
+
- 0
|
8
|
+
- 3
|
9
|
+
version: 0.0.3
|
10
|
+
platform: ruby
|
11
|
+
authors:
|
12
|
+
- Alex Redington
|
13
|
+
autorequire:
|
14
|
+
bindir: bin
|
15
|
+
cert_chain: []
|
16
|
+
|
17
|
+
date: 2010-10-22 00:00:00 -04:00
|
18
|
+
default_executable:
|
19
|
+
dependencies:
|
20
|
+
- !ruby/object:Gem::Dependency
|
21
|
+
name: rubytree
|
22
|
+
prerelease: false
|
23
|
+
requirement: &id001 !ruby/object:Gem::Requirement
|
24
|
+
none: false
|
25
|
+
requirements:
|
26
|
+
- - ">="
|
27
|
+
- !ruby/object:Gem::Version
|
28
|
+
segments:
|
29
|
+
- 0
|
30
|
+
- 8
|
31
|
+
- 1
|
32
|
+
version: 0.8.1
|
33
|
+
type: :runtime
|
34
|
+
version_requirements: *id001
|
35
|
+
- !ruby/object:Gem::Dependency
|
36
|
+
name: bundler
|
37
|
+
prerelease: false
|
38
|
+
requirement: &id002 !ruby/object:Gem::Requirement
|
39
|
+
none: false
|
40
|
+
requirements:
|
41
|
+
- - ">="
|
42
|
+
- !ruby/object:Gem::Version
|
43
|
+
segments:
|
44
|
+
- 1
|
45
|
+
- 0
|
46
|
+
- 0
|
47
|
+
version: 1.0.0
|
48
|
+
type: :development
|
49
|
+
version_requirements: *id002
|
50
|
+
- !ruby/object:Gem::Dependency
|
51
|
+
name: rspec
|
52
|
+
prerelease: false
|
53
|
+
requirement: &id003 !ruby/object:Gem::Requirement
|
54
|
+
none: false
|
55
|
+
requirements:
|
56
|
+
- - ">="
|
57
|
+
- !ruby/object:Gem::Version
|
58
|
+
segments:
|
59
|
+
- 2
|
60
|
+
- 0
|
61
|
+
- 1
|
62
|
+
version: 2.0.1
|
63
|
+
type: :development
|
64
|
+
version_requirements: *id003
|
65
|
+
description: Dwarf is an implementation of the C4.5 algorithm targeted for use in the Rails 3 console environment for classifying ActiveRecord objects.
|
66
|
+
email:
|
67
|
+
- aredington@gmail.com
|
68
|
+
executables: []
|
69
|
+
|
70
|
+
extensions: []
|
71
|
+
|
72
|
+
extra_rdoc_files: []
|
73
|
+
|
74
|
+
files:
|
75
|
+
- .gitignore
|
76
|
+
- Gemfile
|
77
|
+
- Gemfile.lock
|
78
|
+
- README.md
|
79
|
+
- Rakefile
|
80
|
+
- dwarf.gemspec
|
81
|
+
- lib/dwarf.rb
|
82
|
+
- lib/dwarf/classifier.rb
|
83
|
+
- lib/dwarf/tree_node.rb
|
84
|
+
- lib/dwarf/version.rb
|
85
|
+
- spec/classifier_spec.rb
|
86
|
+
- spec/spec_helper.rb
|
87
|
+
has_rdoc: true
|
88
|
+
homepage: http://github.com/aredington/dwarf
|
89
|
+
licenses: []
|
90
|
+
|
91
|
+
post_install_message:
|
92
|
+
rdoc_options: []
|
93
|
+
|
94
|
+
require_paths:
|
95
|
+
- lib
|
96
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
97
|
+
none: false
|
98
|
+
requirements:
|
99
|
+
- - ">="
|
100
|
+
- !ruby/object:Gem::Version
|
101
|
+
segments:
|
102
|
+
- 0
|
103
|
+
version: "0"
|
104
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
105
|
+
none: false
|
106
|
+
requirements:
|
107
|
+
- - ">="
|
108
|
+
- !ruby/object:Gem::Version
|
109
|
+
segments:
|
110
|
+
- 1
|
111
|
+
- 3
|
112
|
+
- 6
|
113
|
+
version: 1.3.6
|
114
|
+
requirements: []
|
115
|
+
|
116
|
+
rubyforge_project: dwarf
|
117
|
+
rubygems_version: 1.3.7
|
118
|
+
signing_key:
|
119
|
+
specification_version: 3
|
120
|
+
summary: C4.5 for ActiveRecord objects
|
121
|
+
test_files: []
|
122
|
+
|