decision-tree 0.0.1 → 0.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/decision-tree.rb +116 -87
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 6f70b8957300f4487b14eec4a2ba6461395899c8
|
4
|
+
data.tar.gz: 62425aa46fa2199efafee60cfcb13f860fa76c54
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: e6c52758675d22c3d91493cdaaef808e791d53ad73097cbc582a869e68c977f4a8f5bef47c218ab581fed4f422c3e1605853597cf2e1c9fdeaceda6e5b65a7a2
|
7
|
+
data.tar.gz: a1bafd5b22b0d54215ef74edfc5fbd82bc2c3c75ce78543e1ada9367ce504e711aa0510487fe08e31e1f3e369ce3eb8b2c7f6b20d8c3a4a3c7eed35873f392a0
|
data/lib/decision-tree.rb
CHANGED
@@ -26,112 +26,141 @@ module Enumerable
|
|
26
26
|
end
|
27
27
|
|
28
28
|
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
else
|
35
|
-
path
|
36
|
-
end
|
37
|
-
|
38
|
-
@threshold = threshold
|
39
|
-
|
40
|
-
@algorithm = if algorithm=='c45' or algorithm=='id3'
|
41
|
-
algorithm
|
42
|
-
else
|
43
|
-
raise "Unknown algorithm"
|
44
|
-
end
|
45
|
-
|
46
|
-
@dimension = if dimension.nil?
|
47
|
-
entries[0][:features].size
|
48
|
-
else
|
49
|
-
dimension
|
50
|
-
end
|
51
|
-
|
52
|
-
@columns = if columns.nil?
|
53
|
-
@dimension.times.map{|i| "feature_#{i}"}
|
54
|
-
elsif columns.size != @dimension
|
55
|
-
raise "The number of columns is incorrect"
|
56
|
-
else
|
57
|
-
columns
|
58
|
-
end
|
59
|
-
|
60
|
-
|
61
|
-
@labels = entries.map{|x| x[:label]}
|
62
|
-
@entropy = @labels.entropy
|
63
|
-
@child_nodes = Hash.new
|
29
|
+
module DecisionTree
|
30
|
+
def self.train(entries, **arg)
|
31
|
+
algorithm = arg[:algorithm] || 'c45'
|
32
|
+
Node.new(entries, arg[:columns], algorithm)
|
33
|
+
end
|
64
34
|
|
65
|
-
return if @path.size == @dimension
|
66
|
-
return if @entropy==0.0
|
67
35
|
|
68
|
-
|
36
|
+
class Node
|
37
|
+
def initialize(entries, columns=nil, algorithm='c45', dimension=nil, parent_node=nil, threshold=nil, path=nil)
|
38
|
+
@parent_node = parent_node
|
39
|
+
@path = if path.nil?
|
40
|
+
Array.new
|
41
|
+
else
|
42
|
+
path
|
43
|
+
end
|
69
44
|
|
70
|
-
|
71
|
-
end
|
45
|
+
@threshold = threshold
|
72
46
|
|
47
|
+
@algorithm = if algorithm=='c45' or algorithm=='id3'
|
48
|
+
algorithm
|
49
|
+
else
|
50
|
+
raise "Unknown algorithm"
|
51
|
+
end
|
73
52
|
|
74
|
-
|
75
|
-
|
76
|
-
|
53
|
+
@dimension = if dimension.nil?
|
54
|
+
entries[0][:features].size
|
55
|
+
else
|
56
|
+
dimension
|
57
|
+
end
|
77
58
|
|
59
|
+
@columns = if columns.nil?
|
60
|
+
@dimension.times.map{|i| "feature_#{i}"}
|
61
|
+
elsif columns.size != @dimension
|
62
|
+
raise "The number of columns is incorrect"
|
63
|
+
else
|
64
|
+
columns
|
65
|
+
end
|
78
66
|
|
79
|
-
def feature_name
|
80
|
-
@columns[ @path[-1] ]
|
81
|
-
end
|
82
67
|
|
83
|
-
|
84
|
-
|
68
|
+
@labels = entries.map{|x| x[:label]}
|
69
|
+
@entropy = @labels.entropy
|
70
|
+
@child_nodes = Hash.new
|
85
71
|
|
86
|
-
|
87
|
-
|
88
|
-
if result.size==1
|
89
|
-
buff << "#{indent}return #{result[0]}"
|
90
|
-
else
|
91
|
-
buff << "#{indent}return #{@labels}"
|
92
|
-
end
|
93
|
-
end
|
72
|
+
return if @path.size == @dimension
|
73
|
+
return if @entropy==0.0
|
94
74
|
|
95
|
-
|
96
|
-
buff << "#{indent}if(#{feature_name} == #{feature_value}){"
|
97
|
-
child_node.to_pseudo_code(buff, indent+" " )
|
98
|
-
buff << "#{indent}}"
|
99
|
-
end
|
100
|
-
return buff
|
101
|
-
end
|
75
|
+
@path << choose_best_feature(entries)
|
102
76
|
|
103
|
-
|
104
|
-
|
77
|
+
build_child_nodes(entries)
|
78
|
+
end
|
105
79
|
|
106
|
-
|
80
|
+
def feature_index
|
81
|
+
@path[-1]
|
82
|
+
end
|
107
83
|
|
108
|
-
max_ig = {index: -1, ig: -1}
|
109
|
-
@dimension.times do |i|
|
110
|
-
next if @path.include?(i)
|
111
|
-
child_entropy = entries.map{|x| x[:features][i]}.concitional_entropy_with(labels)
|
112
84
|
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
(@entropy - child_entropy) / entries.map{|x| x[:features][i]}.entropy
|
117
|
-
end
|
85
|
+
def feature_name
|
86
|
+
@columns[ @path[-1] ]
|
87
|
+
end
|
118
88
|
|
119
|
-
|
89
|
+
def to_pseudo_code(buff=nil,indent="")
|
90
|
+
buff = Array.new if buff.nil?
|
91
|
+
|
92
|
+
if @child_nodes.size==0
|
93
|
+
result = @labels.to_set.to_a
|
94
|
+
if result.size==1
|
95
|
+
buff << "#{indent}return #{result[0]}"
|
96
|
+
else
|
97
|
+
buff << "#{indent}return #{@labels}"
|
98
|
+
end
|
99
|
+
end
|
100
|
+
|
101
|
+
@child_nodes.each do |feature_value,child_node|
|
102
|
+
buff << "#{indent}if(#{feature_name} == #{feature_value}){"
|
103
|
+
# buff << "#{indent}if(#{feature_index} == #{feature_value}){"
|
104
|
+
child_node.to_pseudo_code(buff, indent+" " )
|
105
|
+
buff << "#{indent}}"
|
106
|
+
end
|
107
|
+
return buff
|
108
|
+
end
|
109
|
+
|
110
|
+
|
111
|
+
def predict(vector, default=nil)
|
112
|
+
if @child_nodes.size==0
|
113
|
+
probability = Hash.new(0)
|
114
|
+
@labels.each{|k| probability[k] += 1 }
|
115
|
+
probability.each{|k,v| probability[k] = v / @labels.size.to_f }
|
116
|
+
return probability.to_json
|
117
|
+
else
|
118
|
+
feature_value = vector[feature_index]
|
119
|
+
return default if not @child_nodes.has_key?(feature_value)
|
120
|
+
return @child_nodes[feature_value].predict(vector)
|
121
|
+
end
|
122
|
+
end
|
123
|
+
|
124
|
+
private
|
125
|
+
def choose_best_feature(entries)
|
126
|
+
|
127
|
+
labels = entries.map{|x| x[:label]}
|
128
|
+
|
129
|
+
max_ig = {index: -1, ig: -1.0}
|
130
|
+
@dimension.times do |i|
|
131
|
+
next if @path.include?(i)
|
132
|
+
child_entropy = entries.map{|x| x[:features][i]}.concitional_entropy_with(labels)
|
133
|
+
|
134
|
+
ig = if @algorithm=='id3'
|
135
|
+
@entropy - child_entropy
|
136
|
+
else# c45
|
137
|
+
a = (@entropy - child_entropy)
|
138
|
+
b = entries.map{|x| x[:features][i]}.entropy
|
139
|
+
# puts "@path=#{@path}"
|
140
|
+
# puts "i=#{i} @entropy=#{@entropy} child_entropy=#{child_entropy} a=#{a} b=#{b}"
|
141
|
+
gain = (@entropy - child_entropy) / entries.map{|x| x[:features][i]}.entropy
|
142
|
+
gain = 0 if gain.nan?
|
143
|
+
gain
|
144
|
+
end
|
145
|
+
|
146
|
+
max_ig = {index: i, ig: ig} if ig > max_ig[:ig]
|
147
|
+
# puts "max_ig=#{max_ig} ig=#{ig}"
|
148
|
+
end
|
149
|
+
return max_ig[:index]
|
120
150
|
end
|
121
|
-
return max_ig[:index]
|
122
|
-
end
|
123
151
|
|
124
152
|
|
125
|
-
|
153
|
+
def build_child_nodes(entries)
|
126
154
|
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
155
|
+
buff = Hash.new{|h,feature_value| h[feature_value] = Array.new}
|
156
|
+
entries.each do |e|
|
157
|
+
feature_value = e[:features][feature_index]
|
158
|
+
buff[feature_value] << e
|
159
|
+
end
|
132
160
|
|
133
|
-
|
134
|
-
|
161
|
+
buff.each do |feature_value,child_entries|
|
162
|
+
@child_nodes[feature_value] = Node.new(child_entries, @columns, @algorithm, @dimension, self, feature_value, @path.dup)
|
163
|
+
end
|
135
164
|
end
|
136
165
|
end
|
137
|
-
end
|
166
|
+
end
|