ai4r 1.1 → 1.2
Sign up to get free protection for your applications and to get access to all the features.
- data/README.rdoc +21 -20
- data/examples/decision_trees/id3_example.rb +3 -2
- data/examples/genetic_algorithm/genetic_algorithm_example.rb +6 -6
- data/examples/neural_network/backpropagation_example.rb +2 -2
- data/lib/ai4r/classifiers/classifier_helper.rb +54 -0
- data/lib/ai4r/classifiers/id3.rb +356 -0
- data/lib/ai4r/classifiers/one_r.rb +148 -0
- data/lib/ai4r/classifiers/prism.rb +231 -0
- data/lib/ai4r/classifiers/zero_r.rb +104 -0
- data/lib/ai4r/genetic_algorithm/genetic_algorithm.rb +272 -0
- data/lib/ai4r/neural_network/backpropagation.rb +271 -0
- data/site/build/tmp/locationmap.xml +14 -14
- data/site/build/tmp/output.xmap +23 -23
- data/site/build/tmp/pluginlist2fetchbuild.xml +144 -144
- data/site/build/tmp/plugins-1.xml +0 -11
- data/site/build/tmp/plugins-2.xml +54 -0
- data/site/build/tmp/projfilters.properties +41 -41
- data/site/build/webapp/WEB-INF/logs/core.log +681 -788
- data/site/build/webapp/WEB-INF/logs/error.log +281 -248
- data/site/build/webapp/WEB-INF/logs/sitemap.log +1015 -0
- data/site/src/documentation/content/xdocs/forum.html +9 -0
- data/site/src/documentation/content/xdocs/geneticAlgorithms.xml +82 -68
- data/site/src/documentation/content/xdocs/index.xml +47 -18
- data/site/src/documentation/content/xdocs/machineLearning.xml +10 -9
- data/site/src/documentation/content/xdocs/neuralNetworks.xml +60 -36
- data/site/src/documentation/content/xdocs/site.xml +8 -5
- data/site/src/documentation/content/xdocs/svn.xml +11 -1
- data/site/src/documentation/resources/images/Thumbs.db +0 -0
- data/site/src/documentation/resources/images/ai4r-logo.png +0 -0
- data/site/src/documentation/resources/images/genetic_algorithms_example.png +0 -0
- data/site/src/documentation/resources/images/jadeferret.png +0 -0
- data/site/src/documentation/resources/images/neural_network_example.png +0 -0
- data/site/src/documentation/resources/images/sub-dir/Thumbs.db +0 -0
- data/site/src/documentation/skinconf.xml +18 -18
- data/test/classifiers/id3_test.rb +206 -0
- data/test/classifiers/one_r_test.rb +62 -0
- data/test/classifiers/prism_test.rb +83 -0
- data/test/classifiers/zero_r_test.rb +48 -0
- data/test/genetic_algorithm/chromosome_test.rb +41 -38
- data/test/genetic_algorithm/genetic_algorithm_test.rb +64 -61
- data/test/neural_network/backpropagation_test.rb +20 -18
- metadata +109 -199
- data/lib/decision_tree/id3.rb +0 -354
- data/lib/genetic_algorithm/genetic_algorithm.rb +0 -268
- data/lib/neural_network/backpropagation.rb +0 -264
- data/site/build/site/en/broken-links.xml +0 -2
- data/site/build/site/en/downloads.html +0 -187
- data/site/build/site/en/downloads.pdf +0 -151
- data/site/build/site/en/geneticAlgorithms.html +0 -564
- data/site/build/site/en/geneticAlgorithms.pdf +0 -911
- data/site/build/site/en/images/ai4r-logo.png +0 -0
- data/site/build/site/en/images/built-with-forrest-button.png +0 -0
- data/site/build/site/en/images/c.png +0 -0
- data/site/build/site/en/images/c_wbn.png +0 -0
- data/site/build/site/en/images/c_wn.png +0 -0
- data/site/build/site/en/images/ero.gif +0 -0
- data/site/build/site/en/images/europe2.png +0 -0
- data/site/build/site/en/images/europe3.png +0 -0
- data/site/build/site/en/images/fitness.png +0 -0
- data/site/build/site/en/images/instruction_arrow.png +0 -0
- data/site/build/site/en/images/my_email.png +0 -0
- data/site/build/site/en/images/rubyforge.png +0 -0
- data/site/build/site/en/images/s.png +0 -0
- data/site/build/site/en/images/s_wbn.png +0 -0
- data/site/build/site/en/images/s_wn.png +0 -0
- data/site/build/site/en/images/sigmoid.png +0 -0
- data/site/build/site/en/images/t.png +0 -0
- data/site/build/site/en/images/t_wbn.png +0 -0
- data/site/build/site/en/images/t_wn.png +0 -0
- data/site/build/site/en/index.html +0 -258
- data/site/build/site/en/index.pdf +0 -306
- data/site/build/site/en/linkmap.html +0 -231
- data/site/build/site/en/linkmap.pdf +0 -94
- data/site/build/site/en/locationmap.xml +0 -72
- data/site/build/site/en/machineLearning.html +0 -325
- data/site/build/site/en/machineLearning.pdf +0 -337
- data/site/build/site/en/neuralNetworks.html +0 -446
- data/site/build/site/en/neuralNetworks.pdf +0 -604
- data/site/build/site/en/skin/CommonMessages_de.xml +0 -23
- data/site/build/site/en/skin/CommonMessages_en_US.xml +0 -23
- data/site/build/site/en/skin/CommonMessages_es.xml +0 -23
- data/site/build/site/en/skin/CommonMessages_fr.xml +0 -23
- data/site/build/site/en/skin/basic.css +0 -166
- data/site/build/site/en/skin/breadcrumbs-optimized.js +0 -90
- data/site/build/site/en/skin/breadcrumbs.js +0 -237
- data/site/build/site/en/skin/fontsize.js +0 -166
- data/site/build/site/en/skin/getBlank.js +0 -40
- data/site/build/site/en/skin/getMenu.js +0 -45
- data/site/build/site/en/skin/images/README.txt +0 -1
- data/site/build/site/en/skin/images/add.jpg +0 -0
- data/site/build/site/en/skin/images/built-with-forrest-button.png +0 -0
- data/site/build/site/en/skin/images/chapter.gif +0 -0
- data/site/build/site/en/skin/images/chapter_open.gif +0 -0
- data/site/build/site/en/skin/images/current.gif +0 -0
- data/site/build/site/en/skin/images/error.png +0 -0
- data/site/build/site/en/skin/images/external-link.gif +0 -0
- data/site/build/site/en/skin/images/fix.jpg +0 -0
- data/site/build/site/en/skin/images/forrest-credit-logo.png +0 -0
- data/site/build/site/en/skin/images/hack.jpg +0 -0
- data/site/build/site/en/skin/images/header_white_line.gif +0 -0
- data/site/build/site/en/skin/images/info.png +0 -0
- data/site/build/site/en/skin/images/instruction_arrow.png +0 -0
- data/site/build/site/en/skin/images/label.gif +0 -0
- data/site/build/site/en/skin/images/page.gif +0 -0
- data/site/build/site/en/skin/images/pdfdoc.gif +0 -0
- data/site/build/site/en/skin/images/poddoc.png +0 -0
- data/site/build/site/en/skin/images/printer.gif +0 -0
- data/site/build/site/en/skin/images/rc-b-l-15-1body-2menu-3menu.png +0 -0
- data/site/build/site/en/skin/images/rc-b-r-15-1body-2menu-3menu.png +0 -0
- data/site/build/site/en/skin/images/rc-b-r-5-1header-2tab-selected-3tab-selected.png +0 -0
- data/site/build/site/en/skin/images/rc-t-l-5-1header-2searchbox-3searchbox.png +0 -0
- data/site/build/site/en/skin/images/rc-t-l-5-1header-2tab-selected-3tab-selected.png +0 -0
- data/site/build/site/en/skin/images/rc-t-l-5-1header-2tab-unselected-3tab-unselected.png +0 -0
- data/site/build/site/en/skin/images/rc-t-r-15-1body-2menu-3menu.png +0 -0
- data/site/build/site/en/skin/images/rc-t-r-5-1header-2searchbox-3searchbox.png +0 -0
- data/site/build/site/en/skin/images/rc-t-r-5-1header-2tab-selected-3tab-selected.png +0 -0
- data/site/build/site/en/skin/images/rc-t-r-5-1header-2tab-unselected-3tab-unselected.png +0 -0
- data/site/build/site/en/skin/images/remove.jpg +0 -0
- data/site/build/site/en/skin/images/rss.png +0 -0
- data/site/build/site/en/skin/images/spacer.gif +0 -0
- data/site/build/site/en/skin/images/success.png +0 -0
- data/site/build/site/en/skin/images/txtdoc.png +0 -0
- data/site/build/site/en/skin/images/update.jpg +0 -0
- data/site/build/site/en/skin/images/valid-html401.png +0 -0
- data/site/build/site/en/skin/images/vcss.png +0 -0
- data/site/build/site/en/skin/images/warning.png +0 -0
- data/site/build/site/en/skin/images/xmldoc.gif +0 -0
- data/site/build/site/en/skin/menu.js +0 -48
- data/site/build/site/en/skin/note.txt +0 -50
- data/site/build/site/en/skin/print.css +0 -54
- data/site/build/site/en/skin/profile.css +0 -163
- data/site/build/site/en/skin/prototype.js +0 -1257
- data/site/build/site/en/skin/screen.css +0 -587
- data/site/build/site/en/svn.html +0 -223
- data/site/build/site/en/svn.pdf +0 -239
- data/site/build/site/en/wholesite.pdf +0 -1686
- data/site/build/tmp/brokenlinks.xml +0 -2
- data/site/build/tmp/cocoon-work/cache-dir/cocoon-ehcache-1.data +0 -0
- data/site/build/tmp/cocoon-work/cache-dir/cocoon-ehcache-1.index +0 -0
- data/test/decision_tree/id3_test.rb +0 -209
data/README.rdoc
CHANGED
@@ -1,40 +1,41 @@
|
|
1
1
|
= Introduction
|
2
2
|
|
3
|
-
|
4
|
-
|
3
|
+
AI4R is a collection of Ruby algorithm implementations, covering several artificial intelligence fields,
|
4
|
+
and simple practical examples using them. It implements:
|
5
5
|
|
6
|
-
*
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
*
|
11
|
-
|
12
|
-
Implementation of GeneticSearch and Chromosome classes. The GeneticSearch is a generic class, and can be used to solved any kind of problems. The GeneticSearch class performs a stochastic search of the solution of a given problem.
|
13
|
-
|
14
|
-
* Neural network (NeuralNetwork::Backpropagation)
|
15
|
-
|
16
|
-
Implementation of neural networks using the Backpropagation supervised learning technique.
|
6
|
+
* Genetic algorithms (Ai4r::GeneticAlgorithm::GeneticSearch)
|
7
|
+
|
8
|
+
* Neural networks (Ai4r::NeuralNetwork::Backpropagation)
|
9
|
+
|
10
|
+
* ID3 Decision Trees (Ai4r::Classifiers::ID3)
|
17
11
|
|
18
|
-
*
|
12
|
+
* PRISM (J. Cendrowska, 1987) (Ai4r::Classifiers::Prism)
|
13
|
+
|
14
|
+
* OneR (AKA One Attribute Rule, 1R) (Ai4r::Classifiers::OneR)
|
19
15
|
|
20
|
-
|
16
|
+
* ZeroR (Ai4r::Classifiers::ZeroR)
|
21
17
|
|
22
18
|
= Where can I find the latest code and info on this project?
|
23
19
|
|
24
20
|
http://ai4r.rubyforge.org
|
25
21
|
|
22
|
+
http://ai4r.jadeferret.com
|
23
|
+
|
26
24
|
= How to install
|
27
25
|
|
28
26
|
1. Install the gem:
|
29
27
|
|
30
|
-
gem install
|
28
|
+
gem install ai4r
|
31
29
|
|
32
30
|
2. Include require statements in your code:
|
33
31
|
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
32
|
+
require "rubygems"
|
33
|
+
require "ai4r/classifiers/id3"
|
34
|
+
require "ai4r/classifiers/prism"
|
35
|
+
require "ai4r/classifiers/one_r"
|
36
|
+
require "ai4r/classifiers/zero_r"
|
37
|
+
require "ai4r/neural_network/backpropagation"
|
38
|
+
require "ai4r/genetic_algorithm/genetic_algorithm"
|
38
39
|
|
39
40
|
= Feedback
|
40
41
|
|
@@ -7,7 +7,8 @@
|
|
7
7
|
# the Mozilla Public License version 1.1 as published by the
|
8
8
|
# Mozilla Foundation at http://www.mozilla.org/MPL/MPL-1.1.txt
|
9
9
|
|
10
|
-
require File.dirname(__FILE__) + '/../../lib/decision_tree/id3'
|
10
|
+
#require File.dirname(__FILE__) + '/../../lib/decision_tree/id3'
|
11
|
+
require File.dirname(__FILE__) + '/../../lib/ai4r/classifiers/id3'
|
11
12
|
require 'csv'
|
12
13
|
|
13
14
|
# Load data from data_set.csv
|
@@ -18,7 +19,7 @@ end
|
|
18
19
|
data_labels = data_set.shift
|
19
20
|
|
20
21
|
# Build ID3 tree
|
21
|
-
id3 =
|
22
|
+
id3 = Ai4r::Classifiers::ID3.new.build(data_set, data_labels)
|
22
23
|
|
23
24
|
# Show rules
|
24
25
|
puts "Discovered rules are:"
|
@@ -7,7 +7,7 @@
|
|
7
7
|
# the Mozilla Public License version 1.1 as published by the
|
8
8
|
# Mozilla Foundation at http://www.mozilla.org/MPL/MPL-1.1.txt
|
9
9
|
|
10
|
-
require File.dirname(__FILE__) + '/../../lib/genetic_algorithm/genetic_algorithm'
|
10
|
+
require File.dirname(__FILE__) + '/../../lib/ai4r/genetic_algorithm/genetic_algorithm'
|
11
11
|
require 'csv'
|
12
12
|
|
13
13
|
# Load data from data_set.csv
|
@@ -20,18 +20,18 @@ data_set.collect! do |column|
|
|
20
20
|
column.collect { |element| element.to_f}
|
21
21
|
end
|
22
22
|
|
23
|
-
GeneticAlgorithm::Chromosome.set_cost_matrix(data_set)
|
23
|
+
Ai4r::GeneticAlgorithm::Chromosome.set_cost_matrix(data_set)
|
24
24
|
|
25
25
|
puts "Some random selected tours costs: "
|
26
26
|
3.times do
|
27
|
-
c = GeneticAlgorithm::Chromosome.seed
|
28
|
-
puts "COST #{c.fitness} TOUR: #{c.data.collect{ |c| data_labels[c]} * ', '}"
|
27
|
+
c = Ai4r::GeneticAlgorithm::Chromosome.seed
|
28
|
+
puts "COST #{-1 * c.fitness} TOUR: #{c.data.collect{ |c| data_labels[c]} * ', '}"
|
29
29
|
end
|
30
30
|
|
31
31
|
puts "Beginning genetic search, please wait... "
|
32
|
-
search = GeneticAlgorithm::GeneticSearch.new(800, 100)
|
32
|
+
search = Ai4r::GeneticAlgorithm::GeneticSearch.new(800, 100)
|
33
33
|
result = search.run
|
34
|
-
puts "BEST COST FOUND #{result.fitness} TOUR: #{result.data.collect{ |c| data_labels[c]} * ', '}"
|
34
|
+
puts "BEST COST FOUND #{-1 * result.fitness} TOUR: #{result.data.collect{ |c| data_labels[c]} * ', '}"
|
35
35
|
|
36
36
|
# $7611.99 TOUR: Moscow, Kiev, Warsaw, Hamburg, Berlin, Vienna, Munich, Milan, Rome, Barcelona, Madrid, Paris, Brussels, London, Dublin
|
37
37
|
# $7659.81 TOUR: Moscow, Kiev, Warsaw, Vienna, Munich, Berlin, Hamburg, Brussels, Dublin, London, Paris, Milan, Rome, Barcelona, Madrid
|
@@ -10,12 +10,12 @@
|
|
10
10
|
require File.dirname(__FILE__) + '/training_patterns'
|
11
11
|
require File.dirname(__FILE__) + '/patterns_with_noise'
|
12
12
|
require File.dirname(__FILE__) + '/patterns_with_base_noise'
|
13
|
-
require File.dirname(__FILE__) + '/../../lib/neural_network/backpropagation'
|
13
|
+
require File.dirname(__FILE__) + '/../../lib/ai4r/neural_network/backpropagation'
|
14
14
|
require 'benchmark'
|
15
15
|
|
16
16
|
times = Benchmark.measure do
|
17
17
|
|
18
|
-
net = NeuralNetwork::Backpropagation.new([256, 3])
|
18
|
+
net = Ai4r::NeuralNetwork::Backpropagation.new([256, 3])
|
19
19
|
|
20
20
|
tr_input = TRIANGLE.flatten.collect { |input| input.to_f / 10}
|
21
21
|
sq_input = SQUARE.flatten.collect { |input| input.to_f / 10}
|
@@ -0,0 +1,54 @@
|
|
1
|
+
require 'set'
|
2
|
+
|
3
|
+
module Ai4r
|
4
|
+
|
5
|
+
module Classifiers
|
6
|
+
|
7
|
+
NUMERIC_CLASS_TYPE = 1
|
8
|
+
NOMINAL_CLASS_TYPE = 2
|
9
|
+
|
10
|
+
module ClassifierHelper
|
11
|
+
|
12
|
+
def default_data_labels(data_examples)
|
13
|
+
data_labels = []
|
14
|
+
data_examples[0][0..-2].each_index do |i|
|
15
|
+
data_labels[i] = "attribute_#{i+1}"
|
16
|
+
end
|
17
|
+
data_labels[data_labels.length]="class_value"
|
18
|
+
return data_labels
|
19
|
+
end
|
20
|
+
|
21
|
+
def check_data_examples(data_examples)
|
22
|
+
if !data_examples || data_examples.empty?
|
23
|
+
raise ArgumentError,"Examples data set must not be empty."
|
24
|
+
elsif !data_examples.first.is_a?(Array)
|
25
|
+
raise ArgumentError,"Unkown format for example data."
|
26
|
+
end
|
27
|
+
end
|
28
|
+
|
29
|
+
# Returns attributes number, including class attribute
|
30
|
+
def num_attributes(data_examples)
|
31
|
+
return 0 if !data_examples || data_examples.empty? || !data_examples.first.is_a?(Array)
|
32
|
+
return data_examples.first.size
|
33
|
+
end
|
34
|
+
|
35
|
+
# Returns an array with the domain of each attribute (Set instance
|
36
|
+
# containing all possible values)
|
37
|
+
# Return example:
|
38
|
+
# => [#<Set: {"New York", "Chicago"}>,
|
39
|
+
# #<Set: {"<30", "[30-50)", "[50-80]", ">80"}>,
|
40
|
+
# #<Set: {"M", "F"}>,
|
41
|
+
# #<Set: {"Y", "N"}>]
|
42
|
+
def build_domains(data_examples)
|
43
|
+
domains = Array.new(num_attributes(data_examples)) { Set.new }
|
44
|
+
data_examples.each do |data|
|
45
|
+
data.each_index {|attr_index| domains[attr_index] << data[attr_index]}
|
46
|
+
end
|
47
|
+
return domains
|
48
|
+
end
|
49
|
+
|
50
|
+
end
|
51
|
+
|
52
|
+
end
|
53
|
+
|
54
|
+
end
|
@@ -0,0 +1,356 @@
|
|
1
|
+
# Author:: Sergio Fierens (Implementation, Quinlan is
|
2
|
+
# the creator of the algorithm)
|
3
|
+
# License:: MPL 1.1
|
4
|
+
# Project:: ai4r
|
5
|
+
# Url:: http://ai4r.rubyforge.org/
|
6
|
+
#
|
7
|
+
# You can redistribute it and/or modify it under the terms of
|
8
|
+
# the Mozilla Public License version 1.1 as published by the
|
9
|
+
# Mozilla Foundation at http://www.mozilla.org/MPL/MPL-1.1.txt
|
10
|
+
|
11
|
+
require File.dirname(__FILE__) + '/classifier_helper'
|
12
|
+
|
13
|
+
module Ai4r
|
14
|
+
|
15
|
+
module Classifiers
|
16
|
+
|
17
|
+
# = Introduction
|
18
|
+
# This is an implementation of the ID3 algorithm (Quinlan)
|
19
|
+
# Given a set of preclassified examples, it builds a top-down
|
20
|
+
# induction of decision tree, biased by the information gain and
|
21
|
+
# entropy measure.
|
22
|
+
#
|
23
|
+
# * http://en.wikipedia.org/wiki/Decision_tree
|
24
|
+
# * http://en.wikipedia.org/wiki/ID3_algorithm
|
25
|
+
#
|
26
|
+
# = How to use it
|
27
|
+
#
|
28
|
+
# DATA_LABELS = [ 'city', 'age_range', 'gender', 'marketing_target' ]
|
29
|
+
#
|
30
|
+
# DATA_SET = [ ['New York', '<30', 'M', 'Y'],
|
31
|
+
# ['Chicago', '<30', 'M', 'Y'],
|
32
|
+
# ['Chicago', '<30', 'F', 'Y'],
|
33
|
+
# ['New York', '<30', 'M', 'Y'],
|
34
|
+
# ['New York', '<30', 'M', 'Y'],
|
35
|
+
# ['Chicago', '[30-50)', 'M', 'Y'],
|
36
|
+
# ['New York', '[30-50)', 'F', 'N'],
|
37
|
+
# ['Chicago', '[30-50)', 'F', 'Y'],
|
38
|
+
# ['New York', '[30-50)', 'F', 'N'],
|
39
|
+
# ['Chicago', '[50-80]', 'M', 'N'],
|
40
|
+
# ['New York', '[50-80]', 'F', 'N'],
|
41
|
+
# ['New York', '[50-80]', 'M', 'N'],
|
42
|
+
# ['Chicago', '[50-80]', 'M', 'N'],
|
43
|
+
# ['New York', '[50-80]', 'F', 'N'],
|
44
|
+
# ['Chicago', '>80', 'F', 'Y']
|
45
|
+
# ]
|
46
|
+
#
|
47
|
+
# id3 = Ai4r::Classifiers::ID3.new.build(DATA_SET, DATA_LABELS)
|
48
|
+
#
|
49
|
+
# id3.to_s
|
50
|
+
# # => if age_range=='<30' then marketing_target='Y'
|
51
|
+
# elsif age_range=='[30-50)' and city=='Chicago' then marketing_target='Y'
|
52
|
+
# elsif age_range=='[30-50)' and city=='New York' then marketing_target='N'
|
53
|
+
# elsif age_range=='[50-80]' then marketing_target='N'
|
54
|
+
# elsif age_range=='>80' then marketing_target='Y'
|
55
|
+
# else raise 'There was not enough information during training to do a proper induction for this data element' end
|
56
|
+
#
|
57
|
+
# id3.eval(['New York', '<30', 'M'])
|
58
|
+
# # => 'Y'
|
59
|
+
#
|
60
|
+
# = A better way to load the data
|
61
|
+
#
|
62
|
+
# In real life you will use many more training examples, with more
|
63
|
+
# attributes. Consider moving your data to an external CSV (comma separated
|
64
|
+
# values) file.
|
65
|
+
#
|
66
|
+
# data_set = []
|
67
|
+
# CSV::Reader.parse(File.open("#{File.dirname(__FILE__)}/data_set.csv", 'r')) do |row|
|
68
|
+
# data_set << row
|
69
|
+
# end
|
70
|
+
# data_labels = data_set.shift
|
71
|
+
#
|
72
|
+
# id3 = Ai4r::Classifiers::ID3.new.build(data_set, data_labels)
|
73
|
+
#
|
74
|
+
# = A nice tip for data evaluation
|
75
|
+
#
|
76
|
+
# id3 = Ai4r::Classifiers::ID3.new.build(DATA_SET, DATA_LABELS)
|
77
|
+
# age_range = '<30'
|
78
|
+
# marketing_target = nil
|
79
|
+
# eval id3.to_s
|
80
|
+
# puts marketing_target
|
81
|
+
# # => 'Y'
|
82
|
+
# = More about ID3 and decision trees
|
83
|
+
#
|
84
|
+
# * http://en.wikipedia.org/wiki/Decision_tree
|
85
|
+
# * http://en.wikipedia.org/wiki/ID3_algorithm
|
86
|
+
#
|
87
|
+
# = About the project
|
88
|
+
# Author:: Sergio Fierens
|
89
|
+
# License:: MPL 1.1
|
90
|
+
|
91
|
+
class ID3
|
92
|
+
|
93
|
+
attr_reader :data_labels
|
94
|
+
include ClassifierHelper
|
95
|
+
|
96
|
+
# Create a new decision tree. If your data is classified with N attributes
|
97
|
+
# and M examples, then your data examples must have the following format:
|
98
|
+
#
|
99
|
+
# [ [ATT1_VAL1, ATT2_VAL1, ATT3_VAL1, ... , ATTN_VAL1, CATEGORY_VAL1],
|
100
|
+
# [ATT1_VAL2, ATT2_VAL2, ATT3_VAL2, ... , ATTN_VAL2, CATEGORY_VAL2],
|
101
|
+
# ...
|
102
|
+
# [ATT1_VALM, ATT2_VALM, ATT3_VALM, ... , ATTN_VALM, CATEGORY_VALM],
|
103
|
+
# ]
|
104
|
+
#
|
105
|
+
# e.g.
|
106
|
+
# [ ['New York', '<30', 'M', 'Y'],
|
107
|
+
# ['Chicago', '<30', 'M', 'Y'],
|
108
|
+
# ['Chicago', '<30', 'F', 'Y'],
|
109
|
+
# ['New York', '<30', 'M', 'Y'],
|
110
|
+
# ['New York', '<30', 'M', 'Y'],
|
111
|
+
# ['Chicago', '[30-50)', 'M', 'Y'],
|
112
|
+
# ['New York', '[30-50)', 'F', 'N'],
|
113
|
+
# ['Chicago', '[30-50)', 'F', 'Y'],
|
114
|
+
# ['New York', '[30-50)', 'F', 'N'],
|
115
|
+
# ['Chicago', '[50-80]', 'M', 'N'],
|
116
|
+
# ['New York', '[50-80]', 'F', 'N'],
|
117
|
+
# ['New York', '[50-80]', 'M', 'N'],
|
118
|
+
# ['Chicago', '[50-80]', 'M', 'N'],
|
119
|
+
# ['New York', '[50-80]', 'F', 'N'],
|
120
|
+
# ['Chicago', '>80', 'F', 'Y']
|
121
|
+
# ]
|
122
|
+
#
|
123
|
+
# Data labels must have the following format:
|
124
|
+
# [ 'city', 'age_range', 'gender', 'marketing_target' ]
|
125
|
+
#
|
126
|
+
# If you do not provide labels for your data, the following labels will
|
127
|
+
# be created by default:
|
128
|
+
# [ 'attribute_1', 'attribute_2', 'attribute_3', 'class_value' ]
|
129
|
+
#
|
130
|
+
def build(data_examples, data_labels=nil)
|
131
|
+
check_data_examples(data_examples)
|
132
|
+
@data_labels = (data_labels) ? data_labels : default_data_labels(data_examples)
|
133
|
+
preprocess_data(data_examples)
|
134
|
+
return self
|
135
|
+
end
|
136
|
+
|
137
|
+
# You can evaluate new data, predicting its category.
|
138
|
+
# e.g.
|
139
|
+
# id3.eval(['New York', '<30', 'F']) # => 'Y'
|
140
|
+
def eval(data)
|
141
|
+
@tree.value(data) if @tree
|
142
|
+
end
|
143
|
+
|
144
|
+
# This method returns the generated rules in ruby code.
|
145
|
+
# e.g.
|
146
|
+
#
|
147
|
+
# id3.to_s
|
148
|
+
# # => if age_range=='<30' then marketing_target='Y'
|
149
|
+
# elsif age_range=='[30-50)' and city=='Chicago' then marketing_target='Y'
|
150
|
+
# elsif age_range=='[30-50)' and city=='New York' then marketing_target='N'
|
151
|
+
# elsif age_range=='[50-80]' then marketing_target='N'
|
152
|
+
# elsif age_range=='>80' then marketing_target='Y'
|
153
|
+
# else raise 'There was not enough information during training to do a proper induction for this data element' end
|
154
|
+
#
|
155
|
+
# It is a nice way to inspect induction results, and also to execute them:
|
156
|
+
# age_range = '<30'
|
157
|
+
# marketing_target = nil
|
158
|
+
# eval id3.to_s
|
159
|
+
# puts marketing_target
|
160
|
+
# # => 'Y'
|
161
|
+
def to_s
|
162
|
+
rules = @tree.get_rules
|
163
|
+
rules = rules.collect do |rule|
|
164
|
+
"#{rule[0..-2].join(' and ')} then #{rule.last}"
|
165
|
+
end
|
166
|
+
return "if #{rules.join("\nelsif ")}\nelse raise 'There was not enough information during training to do a proper induction for this data element' end"
|
167
|
+
end
|
168
|
+
|
169
|
+
private
|
170
|
+
def preprocess_data(data_examples)
|
171
|
+
@tree = build_node(data_examples)
|
172
|
+
end
|
173
|
+
|
174
|
+
private
|
175
|
+
def build_node(data_examples, flag_att = [])
|
176
|
+
return ErrorNode.new if data_examples.length == 0
|
177
|
+
domain = domain(data_examples)
|
178
|
+
return CategoryNode.new(@data_labels.last, domain.last[0]) if domain.last.length == 1
|
179
|
+
min_entropy_index = min_entropy_index(data_examples, domain, flag_att)
|
180
|
+
flag_att << min_entropy_index
|
181
|
+
split_data_examples = split_data_examples(data_examples, domain, min_entropy_index)
|
182
|
+
return CategoryNode.new(@data_labels.last, most_freq(data_examples, domain)) if split_data_examples.length == 1
|
183
|
+
nodes = split_data_examples.collect do |partial_data_examples|
|
184
|
+
build_node(partial_data_examples, flag_att)
|
185
|
+
end
|
186
|
+
return EvaluationNode.new(@data_labels, min_entropy_index, domain[min_entropy_index], nodes)
|
187
|
+
end
|
188
|
+
|
189
|
+
private
|
190
|
+
def self.sum(values)
|
191
|
+
values.inject( 0 ) { |sum,x| sum+x }
|
192
|
+
end
|
193
|
+
|
194
|
+
private
|
195
|
+
def self.log2(z)
|
196
|
+
return 0.0 if z == 0
|
197
|
+
Math.log(z)/LOG2
|
198
|
+
end
|
199
|
+
|
200
|
+
private
|
201
|
+
def most_freq(examples, domain)
|
202
|
+
freqs = []
|
203
|
+
domain.last.length.times { freqs << 0}
|
204
|
+
examples.each do |example|
|
205
|
+
cat_index = domain.last.index(example.last)
|
206
|
+
freq = freqs[cat_index] + 1
|
207
|
+
freqs[cat_index] = freq
|
208
|
+
end
|
209
|
+
max_freq = freqs.max
|
210
|
+
max_freq_index = freqs.index(max_freq)
|
211
|
+
domain.last[max_freq_index]
|
212
|
+
end
|
213
|
+
|
214
|
+
private
|
215
|
+
def split_data_examples(data_examples, domain, att_index)
|
216
|
+
data_examples_array = []
|
217
|
+
att_value_examples = {}
|
218
|
+
data_examples.each do |example|
|
219
|
+
example_set = att_value_examples[example[att_index]]
|
220
|
+
example_set = [] if !example_set
|
221
|
+
example_set << example
|
222
|
+
att_value_examples.store(example[att_index], example_set)
|
223
|
+
end
|
224
|
+
att_value_examples.each_pair do |att_value, example_set|
|
225
|
+
att_value_index = domain[att_index].index(att_value)
|
226
|
+
data_examples_array[att_value_index] = example_set
|
227
|
+
end
|
228
|
+
return data_examples_array
|
229
|
+
end
|
230
|
+
|
231
|
+
private
|
232
|
+
def min_entropy_index(data_examples, domain, flag_att=[])
|
233
|
+
min_entropy = nil
|
234
|
+
min_index = 0
|
235
|
+
domain[0..-2].each_index do |index|
|
236
|
+
freq_grid = freq_grid(index, data_examples, domain)
|
237
|
+
entropy = entropy(freq_grid, data_examples.length)
|
238
|
+
if (!min_entropy || entropy < min_entropy) && !flag_att.include?(index)
|
239
|
+
min_entropy = entropy
|
240
|
+
min_index = index
|
241
|
+
end
|
242
|
+
end
|
243
|
+
return min_index
|
244
|
+
end
|
245
|
+
|
246
|
+
private
|
247
|
+
def domain(data_examples)
|
248
|
+
#return build_domains(data_examples)
|
249
|
+
domain = []
|
250
|
+
@data_labels.length.times { domain << [] }
|
251
|
+
data_examples.each do |data|
|
252
|
+
data.each_index do |i|
|
253
|
+
domain[i] << data[i] if i<domain.length && !domain[i].include?(data[i])
|
254
|
+
end
|
255
|
+
end
|
256
|
+
return domain
|
257
|
+
end
|
258
|
+
|
259
|
+
private
|
260
|
+
def freq_grid(att_index, data_examples, domain)
|
261
|
+
#Initialize empty grid
|
262
|
+
grid_element = []
|
263
|
+
domain.last.length.times { grid_element << 0}
|
264
|
+
grid = []
|
265
|
+
domain[att_index].length.times { grid << grid_element.clone }
|
266
|
+
#Fill grid with frequencies
|
267
|
+
data_examples.each do |example|
|
268
|
+
att_val = example[att_index]
|
269
|
+
att_val_index = domain[att_index].index(att_val)
|
270
|
+
category = example.last
|
271
|
+
category_index = domain.last.index(category)
|
272
|
+
freq = grid[att_val_index][category_index] + 1
|
273
|
+
grid[att_val_index][category_index] = freq
|
274
|
+
end
|
275
|
+
return grid
|
276
|
+
end
|
277
|
+
|
278
|
+
private
|
279
|
+
def entropy(freq_grid, total_examples)
|
280
|
+
#Calc entropy of each element
|
281
|
+
entropy = 0
|
282
|
+
freq_grid.each do |att_freq|
|
283
|
+
att_total_freq = ID3.sum(att_freq)
|
284
|
+
partial_entropy = 0
|
285
|
+
if att_total_freq != 0
|
286
|
+
att_freq.each do |freq|
|
287
|
+
prop = freq.to_f/att_total_freq
|
288
|
+
partial_entropy += (-1*prop*ID3.log2(prop))
|
289
|
+
end
|
290
|
+
end
|
291
|
+
entropy += (att_total_freq.to_f/total_examples) * partial_entropy
|
292
|
+
end
|
293
|
+
return entropy
|
294
|
+
end
|
295
|
+
|
296
|
+
private
|
297
|
+
LOG2 = Math.log(2)
|
298
|
+
end
|
299
|
+
|
300
|
+
class EvaluationNode
|
301
|
+
|
302
|
+
attr_reader :index, :values, :nodes
|
303
|
+
|
304
|
+
def initialize(data_labels, index, values, nodes)
|
305
|
+
@index = index
|
306
|
+
@values = values
|
307
|
+
@nodes = nodes
|
308
|
+
@data_labels = data_labels
|
309
|
+
end
|
310
|
+
|
311
|
+
def value(data)
|
312
|
+
value = data[@index]
|
313
|
+
return rule_not_found if !@values.include?(value)
|
314
|
+
return nodes[@values.index(value)].value(data)
|
315
|
+
end
|
316
|
+
|
317
|
+
def get_rules
|
318
|
+
rule_set = []
|
319
|
+
@nodes.each_index do |child_node_index|
|
320
|
+
my_rule = "#{@data_labels[@index]}=='#{@values[child_node_index]}'"
|
321
|
+
child_node = @nodes[child_node_index]
|
322
|
+
child_node_rules = child_node.get_rules
|
323
|
+
child_node_rules.each do |child_rule|
|
324
|
+
child_rule.unshift(my_rule)
|
325
|
+
end
|
326
|
+
rule_set += child_node_rules
|
327
|
+
end
|
328
|
+
return rule_set
|
329
|
+
end
|
330
|
+
|
331
|
+
end
|
332
|
+
|
333
|
+
class CategoryNode
|
334
|
+
def initialize(label, value)
|
335
|
+
@label = label
|
336
|
+
@value = value
|
337
|
+
end
|
338
|
+
def value(data)
|
339
|
+
return @value
|
340
|
+
end
|
341
|
+
def get_rules
|
342
|
+
return [["#{@label}='#{@value}'"]]
|
343
|
+
end
|
344
|
+
end
|
345
|
+
|
346
|
+
class ErrorNode
|
347
|
+
def value(data)
|
348
|
+
raise "There was not enough information during training to do a proper induction for this data element."
|
349
|
+
end
|
350
|
+
def get_rules
|
351
|
+
return []
|
352
|
+
end
|
353
|
+
end
|
354
|
+
|
355
|
+
end
|
356
|
+
end
|