decision-tree 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (3) hide show
  1. checksums.yaml +7 -0
  2. data/lib/decision-tree.rb +137 -0
  3. metadata +45 -0
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 37f1ad53408f84cac2ee0dfcc71cfe43708b08ca
4
+ data.tar.gz: 5b39986b348b91f4c4d52c10b4c293ce27404461
5
+ SHA512:
6
+ metadata.gz: 171879112eb7730b21238e5ba14ebdcba0be35466caba611279a7022788f2a9f801b39bd0fe1827a64793f9ab70bc8bb281ed552d227f832daf39a4036000c70
7
+ data.tar.gz: 373ab5d902c1640606f4f660a4a675f51dcae142ac46061364580b53d7c31be4c711052ea854ab8ca9bdff99bed9beac403f3475cd9cdcf9a640e7b967a0051b
@@ -0,0 +1,137 @@
1
+ require 'set'
2
+ require 'json'
3
+
4
# Information-theory helpers mixed into every Enumerable.
#
# NOTE(review): this reopens a core module; the methods become available on
# Array, Hash, Set, Range, etc. Kept as-is because the rest of this file
# calls them directly on arrays.
module Enumerable
  # Shannon entropy (in bits) of the values yielded by #each.
  #
  # Elements are bucketed by Hash key equality. The total is derived from the
  # collected counts rather than from #size, so receivers that only implement
  # #each (custom Enumerables, lazy enumerators) work too.
  #
  # @return [Float] 0.0 for an empty or single-valued collection
  def entropy
    counts = Hash.new(0)
    each { |value| counts[value] += 1 }

    total = counts.values.inject(0) { |sum, c| sum + c }.to_f

    counts.values.inject(0.0) do |bits, c|
      ratio = c / total
      bits - ratio * Math.log2(ratio)
    end
  end

  # Conditional entropy H(label | self): the entropy of +label+ that remains
  # after partitioning it by the values of the receiver.
  #
  # The i-th element of the receiver is paired with +label[i]+, so +label+
  # must support integer indexing and be at least as long as the receiver
  # (shorter inputs contribute nil labels).
  #
  # The method name carries a historical typo; it is kept so existing callers
  # keep working. Prefer the alias #conditional_entropy_with in new code.
  #
  # @param label [#[]] labels aligned by position with the receiver
  # @return [Float] weighted sum of per-group label entropies
  def concitional_entropy_with(label)
    groups = Hash.new { |hash, key| hash[key] = [] }
    each_with_index { |value, i| groups[value] << label[i] }

    total = groups.values.inject(0) { |sum, members| sum + members.size }.to_f

    groups.values.inject(0.0) do |acc, members|
      acc + (members.size / total) * members.entropy
    end
  end

  # Correctly spelled name for #concitional_entropy_with.
  alias_method :conditional_entropy_with, :concitional_entropy_with
end
27
+
28
+
29
# Multi-way decision tree over discrete feature values.
#
# A node is built from +entries+, each a Hash of the form
# <tt>{features: [...], label: ...}</tt>. Unless the node is pure (label
# entropy is 0.0) or every feature has already been used on the path from
# the root, the node picks the best unused feature and creates one child per
# distinct value of that feature.
#
# Depends on Enumerable#entropy and Enumerable#concitional_entropy_with,
# which are defined elsewhere in this file.
class DecisionTree
  # Algorithm names accepted by the constructor.
  ALGORITHMS = %w[c45 id3].freeze

  # @param entries [Array<Hash>] training rows with :features and :label
  # @param columns [Array<String>, nil] display names, one per feature;
  #   defaults to "feature_0", "feature_1", ...
  # @param algorithm [String] 'c45' (gain ratio) or 'id3' (information gain)
  # @param dimension [Integer, nil] number of features; inferred from the
  #   first entry when nil
  # @param parent_node [DecisionTree, nil] node that created this one
  # @param threshold [Object, nil] feature value that led to this node
  #   (only stored; not read anywhere in this class)
  # @param path [Array<Integer>, nil] feature indices already used by
  #   ancestors; copied, never mutated
  # @raise [RuntimeError] on an unknown algorithm or a column-count mismatch
  # @raise [ArgumentError] when entries is empty and dimension is not given
  def initialize(entries, columns=nil, algorithm='c45', dimension=nil, parent_node=nil, threshold=nil, path=nil)
    @parent_node = parent_node
    # Copy so that appending the chosen feature never leaks into the caller's array.
    @path = path.nil? ? [] : path.dup
    @threshold = threshold

    raise "Unknown algorithm" unless ALGORITHMS.include?(algorithm)
    @algorithm = algorithm

    if dimension.nil?
      if entries.empty?
        raise ArgumentError, "entries must not be empty when dimension is not given"
      end
      @dimension = entries[0][:features].size
    else
      @dimension = dimension
    end

    @columns =
      if columns.nil?
        @dimension.times.map { |i| "feature_#{i}" }
      elsif columns.size != @dimension
        raise "The number of columns is incorrect"
      else
        columns
      end

    @labels = entries.map { |entry| entry[:label] }
    @entropy = @labels.entropy
    @child_nodes = {}

    # Leaf: every feature already consumed, or the labels are already pure.
    return if @path.size == @dimension
    return if @entropy == 0.0

    @path << choose_best_feature(entries)
    build_child_nodes(entries)
  end

  # Index of the feature this node splits on (last element of the path).
  def feature_index
    @path[-1]
  end

  # Column name of the feature this node splits on.
  def feature_name
    @columns[@path[-1]]
  end

  # Renders the subtree as lines of C-like pseudo code.
  #
  # A leaf emits "return <label>" when all its labels agree, otherwise
  # "return <all labels>". An inner node emits one "if(...){ ... }" group per
  # child, with the child's lines indented one step deeper.
  #
  # @param buff [Array<String>, nil] accumulator; a new array when nil
  # @param indent [String] prefix for every line emitted by this node
  # @return [Array<String>] the accumulator
  def to_pseudo_code(buff=nil, indent="")
    buff = [] if buff.nil?

    if @child_nodes.empty?
      distinct = @labels.uniq
      buff << if distinct.size == 1
                "#{indent}return #{distinct[0]}"
              else
                "#{indent}return #{@labels}"
              end
    end

    @child_nodes.each do |feature_value, child_node|
      buff << "#{indent}if(#{feature_name} == #{feature_value}){"
      child_node.to_pseudo_code(buff, indent + " ")
      buff << "#{indent}}"
    end

    buff
  end

  private

  # Picks the unused feature with the highest score: information gain for
  # 'id3', gain ratio (gain / split entropy) for 'c45'. Ties go to the lowest
  # index.
  #
  # A feature whose value is identical across all entries has zero split
  # entropy; dividing by it would yield NaN, which can never win a
  # comparison. Such a feature is scored 0.0 instead, so a valid index is
  # always returned while unused features remain.
  #
  # @return [Integer, nil] feature index, or nil if every index is in @path
  def choose_best_feature(entries)
    labels = entries.map { |entry| entry[:label] }

    best_index = nil
    best_score = nil

    @dimension.times do |i|
      next if @path.include?(i)

      values = entries.map { |entry| entry[:features][i] }
      gain = @entropy - values.concitional_entropy_with(labels)

      score =
        if @algorithm == 'id3'
          gain
        else # c45
          split_info = values.entropy
          split_info.zero? ? 0.0 : gain / split_info
        end

      if best_score.nil? || score > best_score
        best_index = i
        best_score = score
      end
    end

    best_index
  end

  # Partitions entries by their value of the chosen feature and builds one
  # child subtree per distinct value, in order of first appearance.
  def build_child_nodes(entries)
    index = feature_index

    entries.group_by { |entry| entry[:features][index] }.each do |feature_value, child_entries|
      @child_nodes[feature_value] =
        DecisionTree.new(child_entries, @columns, @algorithm, @dimension, self, feature_value, @path)
    end
  end
end
metadata ADDED
@@ -0,0 +1,45 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: decision-tree
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - ireullin
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2017-04-17 00:00:00.000000000 Z
12
+ dependencies: []
13
+ description: 'A decision tree library which implemented ID3 & C4.5 of algorithms '
14
+ email:
15
+ - ireullin@gmail.com
16
+ executables: []
17
+ extensions: []
18
+ extra_rdoc_files: []
19
+ files:
20
+ - lib/decision-tree.rb
21
+ homepage: https://github.com/ireullin/decision-tree
22
+ licenses:
23
+ - MIT
24
+ metadata: {}
25
+ post_install_message:
26
+ rdoc_options: []
27
+ require_paths:
28
+ - lib
29
+ required_ruby_version: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ required_rubygems_version: !ruby/object:Gem::Requirement
35
+ requirements:
36
+ - - ">="
37
+ - !ruby/object:Gem::Version
38
+ version: '0'
39
+ requirements: []
40
+ rubyforge_project:
41
+ rubygems_version: 2.2.2
42
+ signing_key:
43
+ specification_version: 4
44
+ summary: A decision tree library which implemented ID3 & C4.5 of algorithms
45
+ test_files: []