decision-tree 0.0.1 → 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (3)
  1. checksums.yaml +4 -4
  2. data/lib/decision-tree.rb +116 -87
  3. metadata +1 -1
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 37f1ad53408f84cac2ee0dfcc71cfe43708b08ca
4
- data.tar.gz: 5b39986b348b91f4c4d52c10b4c293ce27404461
3
+ metadata.gz: 6f70b8957300f4487b14eec4a2ba6461395899c8
4
+ data.tar.gz: 62425aa46fa2199efafee60cfcb13f860fa76c54
5
5
  SHA512:
6
- metadata.gz: 171879112eb7730b21238e5ba14ebdcba0be35466caba611279a7022788f2a9f801b39bd0fe1827a64793f9ab70bc8bb281ed552d227f832daf39a4036000c70
7
- data.tar.gz: 373ab5d902c1640606f4f660a4a675f51dcae142ac46061364580b53d7c31be4c711052ea854ab8ca9bdff99bed9beac403f3475cd9cdcf9a640e7b967a0051b
6
+ metadata.gz: e6c52758675d22c3d91493cdaaef808e791d53ad73097cbc582a869e68c977f4a8f5bef47c218ab581fed4f422c3e1605853597cf2e1c9fdeaceda6e5b65a7a2
7
+ data.tar.gz: a1bafd5b22b0d54215ef74edfc5fbd82bc2c3c75ce78543e1ada9367ce504e711aa0510487fe08e31e1f3e369ce3eb8b2c7f6b20d8c3a4a3c7eed35873f392a0
data/lib/decision-tree.rb CHANGED
@@ -26,112 +26,141 @@ module Enumerable
26
26
  end
27
27
 
28
28
 
29
- class DecisionTree
30
- def initialize(entries, columns=nil, algorithm='c45', dimension=nil, parent_node=nil, threshold=nil, path=nil)
31
- @parent_node = parent_node
32
- @path = if path.nil?
33
- Array.new
34
- else
35
- path
36
- end
37
-
38
- @threshold = threshold
39
-
40
- @algorithm = if algorithm=='c45' or algorithm=='id3'
41
- algorithm
42
- else
43
- raise "Unknown algorithm"
44
- end
45
-
46
- @dimension = if dimension.nil?
47
- entries[0][:features].size
48
- else
49
- dimension
50
- end
51
-
52
- @columns = if columns.nil?
53
- @dimension.times.map{|i| "feature_#{i}"}
54
- elsif columns.size != @dimension
55
- raise "The number of columns is incorrect"
56
- else
57
- columns
58
- end
59
-
60
-
61
- @labels = entries.map{|x| x[:label]}
62
- @entropy = @labels.entropy
63
- @child_nodes = Hash.new
29
+ module DecisionTree
30
+ def self.train(entries, **arg)
31
+ algorithm = arg[:algorithm] || 'c45'
32
+ Node.new(entries, arg[:columns], algorithm)
33
+ end
64
34
 
65
- return if @path.size == @dimension
66
- return if @entropy==0.0
67
35
 
68
- @path << choose_best_feature(entries)
36
+ class Node
37
+ def initialize(entries, columns=nil, algorithm='c45', dimension=nil, parent_node=nil, threshold=nil, path=nil)
38
+ @parent_node = parent_node
39
+ @path = if path.nil?
40
+ Array.new
41
+ else
42
+ path
43
+ end
69
44
 
70
- build_child_nodes(entries)
71
- end
45
+ @threshold = threshold
72
46
 
47
+ @algorithm = if algorithm=='c45' or algorithm=='id3'
48
+ algorithm
49
+ else
50
+ raise "Unknown algorithm"
51
+ end
73
52
 
74
- def feature_index
75
- @path[-1]
76
- end
53
+ @dimension = if dimension.nil?
54
+ entries[0][:features].size
55
+ else
56
+ dimension
57
+ end
77
58
 
59
+ @columns = if columns.nil?
60
+ @dimension.times.map{|i| "feature_#{i}"}
61
+ elsif columns.size != @dimension
62
+ raise "The number of columns is incorrect"
63
+ else
64
+ columns
65
+ end
78
66
 
79
- def feature_name
80
- @columns[ @path[-1] ]
81
- end
82
67
 
83
- def to_pseudo_code(buff=nil,indent="")
84
- buff = Array.new if buff.nil?
68
+ @labels = entries.map{|x| x[:label]}
69
+ @entropy = @labels.entropy
70
+ @child_nodes = Hash.new
85
71
 
86
- if @child_nodes.size==0
87
- result = @labels.to_set.to_a
88
- if result.size==1
89
- buff << "#{indent}return #{result[0]}"
90
- else
91
- buff << "#{indent}return #{@labels}"
92
- end
93
- end
72
+ return if @path.size == @dimension
73
+ return if @entropy==0.0
94
74
 
95
- @child_nodes.each do |feature_value,child_node|
96
- buff << "#{indent}if(#{feature_name} == #{feature_value}){"
97
- child_node.to_pseudo_code(buff, indent+" " )
98
- buff << "#{indent}}"
99
- end
100
- return buff
101
- end
75
+ @path << choose_best_feature(entries)
102
76
 
103
- private
104
- def choose_best_feature(entries)
77
+ build_child_nodes(entries)
78
+ end
105
79
 
106
- labels = entries.map{|x| x[:label]}
80
+ def feature_index
81
+ @path[-1]
82
+ end
107
83
 
108
- max_ig = {index: -1, ig: -1}
109
- @dimension.times do |i|
110
- next if @path.include?(i)
111
- child_entropy = entries.map{|x| x[:features][i]}.concitional_entropy_with(labels)
112
84
 
113
- ig = if @algorithm=='id3'
114
- @entropy - child_entropy
115
- else# c45
116
- (@entropy - child_entropy) / entries.map{|x| x[:features][i]}.entropy
117
- end
85
+ def feature_name
86
+ @columns[ @path[-1] ]
87
+ end
118
88
 
119
- max_ig = {index: i, ig: ig} if ig > max_ig[:ig]
89
+ def to_pseudo_code(buff=nil,indent="")
90
+ buff = Array.new if buff.nil?
91
+
92
+ if @child_nodes.size==0
93
+ result = @labels.to_set.to_a
94
+ if result.size==1
95
+ buff << "#{indent}return #{result[0]}"
96
+ else
97
+ buff << "#{indent}return #{@labels}"
98
+ end
99
+ end
100
+
101
+ @child_nodes.each do |feature_value,child_node|
102
+ buff << "#{indent}if(#{feature_name} == #{feature_value}){"
103
+ # buff << "#{indent}if(#{feature_index} == #{feature_value}){"
104
+ child_node.to_pseudo_code(buff, indent+" " )
105
+ buff << "#{indent}}"
106
+ end
107
+ return buff
108
+ end
109
+
110
+
111
+ def predict(vector, default=nil)
112
+ if @child_nodes.size==0
113
+ probability = Hash.new(0)
114
+ @labels.each{|k| probability[k] += 1 }
115
+ probability.each{|k,v| probability[k] = v / @labels.size.to_f }
116
+ return probability.to_json
117
+ else
118
+ feature_value = vector[feature_index]
119
+ return default if not @child_nodes.has_key?(feature_value)
120
+ return @child_nodes[feature_value].predict(vector)
121
+ end
122
+ end
123
+
124
+ private
125
+ def choose_best_feature(entries)
126
+
127
+ labels = entries.map{|x| x[:label]}
128
+
129
+ max_ig = {index: -1, ig: -1.0}
130
+ @dimension.times do |i|
131
+ next if @path.include?(i)
132
+ child_entropy = entries.map{|x| x[:features][i]}.concitional_entropy_with(labels)
133
+
134
+ ig = if @algorithm=='id3'
135
+ @entropy - child_entropy
136
+ else# c45
137
+ a = (@entropy - child_entropy)
138
+ b = entries.map{|x| x[:features][i]}.entropy
139
+ # puts "@path=#{@path}"
140
+ # puts "i=#{i} @entropy=#{@entropy} child_entropy=#{child_entropy} a=#{a} b=#{b}"
141
+ gain = (@entropy - child_entropy) / entries.map{|x| x[:features][i]}.entropy
142
+ gain = 0 if gain.nan?
143
+ gain
144
+ end
145
+
146
+ max_ig = {index: i, ig: ig} if ig > max_ig[:ig]
147
+ # puts "max_ig=#{max_ig} ig=#{ig}"
148
+ end
149
+ return max_ig[:index]
120
150
  end
121
- return max_ig[:index]
122
- end
123
151
 
124
152
 
125
- def build_child_nodes(entries)
153
+ def build_child_nodes(entries)
126
154
 
127
- buff = Hash.new{|h,feature_value| h[feature_value] = Array.new}
128
- entries.each do |e|
129
- feature_value = e[:features][feature_index]
130
- buff[feature_value] << e
131
- end
155
+ buff = Hash.new{|h,feature_value| h[feature_value] = Array.new}
156
+ entries.each do |e|
157
+ feature_value = e[:features][feature_index]
158
+ buff[feature_value] << e
159
+ end
132
160
 
133
- buff.each do |feature_value,child_entries|
134
- @child_nodes[feature_value] = DecisionTree.new(child_entries, @columns, @algorithm, @dimension, self, feature_value, @path.dup)
161
+ buff.each do |feature_value,child_entries|
162
+ @child_nodes[feature_value] = Node.new(child_entries, @columns, @algorithm, @dimension, self, feature_value, @path.dup)
163
+ end
135
164
  end
136
165
  end
137
- end
166
+ end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: decision-tree
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.1
4
+ version: 0.0.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - ireullin