ai4r 1.1 → 1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.rdoc +21 -20
- data/examples/decision_trees/id3_example.rb +3 -2
- data/examples/genetic_algorithm/genetic_algorithm_example.rb +6 -6
- data/examples/neural_network/backpropagation_example.rb +2 -2
- data/lib/ai4r/classifiers/classifier_helper.rb +54 -0
- data/lib/ai4r/classifiers/id3.rb +356 -0
- data/lib/ai4r/classifiers/one_r.rb +148 -0
- data/lib/ai4r/classifiers/prism.rb +231 -0
- data/lib/ai4r/classifiers/zero_r.rb +104 -0
- data/lib/ai4r/genetic_algorithm/genetic_algorithm.rb +272 -0
- data/lib/ai4r/neural_network/backpropagation.rb +271 -0
- data/site/build/tmp/locationmap.xml +14 -14
- data/site/build/tmp/output.xmap +23 -23
- data/site/build/tmp/pluginlist2fetchbuild.xml +144 -144
- data/site/build/tmp/plugins-1.xml +0 -11
- data/site/build/tmp/plugins-2.xml +54 -0
- data/site/build/tmp/projfilters.properties +41 -41
- data/site/build/webapp/WEB-INF/logs/core.log +681 -788
- data/site/build/webapp/WEB-INF/logs/error.log +281 -248
- data/site/build/webapp/WEB-INF/logs/sitemap.log +1015 -0
- data/site/src/documentation/content/xdocs/forum.html +9 -0
- data/site/src/documentation/content/xdocs/geneticAlgorithms.xml +82 -68
- data/site/src/documentation/content/xdocs/index.xml +47 -18
- data/site/src/documentation/content/xdocs/machineLearning.xml +10 -9
- data/site/src/documentation/content/xdocs/neuralNetworks.xml +60 -36
- data/site/src/documentation/content/xdocs/site.xml +8 -5
- data/site/src/documentation/content/xdocs/svn.xml +11 -1
- data/site/src/documentation/resources/images/Thumbs.db +0 -0
- data/site/src/documentation/resources/images/ai4r-logo.png +0 -0
- data/site/src/documentation/resources/images/genetic_algorithms_example.png +0 -0
- data/site/src/documentation/resources/images/jadeferret.png +0 -0
- data/site/src/documentation/resources/images/neural_network_example.png +0 -0
- data/site/src/documentation/resources/images/sub-dir/Thumbs.db +0 -0
- data/site/src/documentation/skinconf.xml +18 -18
- data/test/classifiers/id3_test.rb +206 -0
- data/test/classifiers/one_r_test.rb +62 -0
- data/test/classifiers/prism_test.rb +83 -0
- data/test/classifiers/zero_r_test.rb +48 -0
- data/test/genetic_algorithm/chromosome_test.rb +41 -38
- data/test/genetic_algorithm/genetic_algorithm_test.rb +64 -61
- data/test/neural_network/backpropagation_test.rb +20 -18
- metadata +109 -199
- data/lib/decision_tree/id3.rb +0 -354
- data/lib/genetic_algorithm/genetic_algorithm.rb +0 -268
- data/lib/neural_network/backpropagation.rb +0 -264
- data/site/build/site/en/broken-links.xml +0 -2
- data/site/build/site/en/downloads.html +0 -187
- data/site/build/site/en/downloads.pdf +0 -151
- data/site/build/site/en/geneticAlgorithms.html +0 -564
- data/site/build/site/en/geneticAlgorithms.pdf +0 -911
- data/site/build/site/en/images/ai4r-logo.png +0 -0
- data/site/build/site/en/images/built-with-forrest-button.png +0 -0
- data/site/build/site/en/images/c.png +0 -0
- data/site/build/site/en/images/c_wbn.png +0 -0
- data/site/build/site/en/images/c_wn.png +0 -0
- data/site/build/site/en/images/ero.gif +0 -0
- data/site/build/site/en/images/europe2.png +0 -0
- data/site/build/site/en/images/europe3.png +0 -0
- data/site/build/site/en/images/fitness.png +0 -0
- data/site/build/site/en/images/instruction_arrow.png +0 -0
- data/site/build/site/en/images/my_email.png +0 -0
- data/site/build/site/en/images/rubyforge.png +0 -0
- data/site/build/site/en/images/s.png +0 -0
- data/site/build/site/en/images/s_wbn.png +0 -0
- data/site/build/site/en/images/s_wn.png +0 -0
- data/site/build/site/en/images/sigmoid.png +0 -0
- data/site/build/site/en/images/t.png +0 -0
- data/site/build/site/en/images/t_wbn.png +0 -0
- data/site/build/site/en/images/t_wn.png +0 -0
- data/site/build/site/en/index.html +0 -258
- data/site/build/site/en/index.pdf +0 -306
- data/site/build/site/en/linkmap.html +0 -231
- data/site/build/site/en/linkmap.pdf +0 -94
- data/site/build/site/en/locationmap.xml +0 -72
- data/site/build/site/en/machineLearning.html +0 -325
- data/site/build/site/en/machineLearning.pdf +0 -337
- data/site/build/site/en/neuralNetworks.html +0 -446
- data/site/build/site/en/neuralNetworks.pdf +0 -604
- data/site/build/site/en/skin/CommonMessages_de.xml +0 -23
- data/site/build/site/en/skin/CommonMessages_en_US.xml +0 -23
- data/site/build/site/en/skin/CommonMessages_es.xml +0 -23
- data/site/build/site/en/skin/CommonMessages_fr.xml +0 -23
- data/site/build/site/en/skin/basic.css +0 -166
- data/site/build/site/en/skin/breadcrumbs-optimized.js +0 -90
- data/site/build/site/en/skin/breadcrumbs.js +0 -237
- data/site/build/site/en/skin/fontsize.js +0 -166
- data/site/build/site/en/skin/getBlank.js +0 -40
- data/site/build/site/en/skin/getMenu.js +0 -45
- data/site/build/site/en/skin/images/README.txt +0 -1
- data/site/build/site/en/skin/images/add.jpg +0 -0
- data/site/build/site/en/skin/images/built-with-forrest-button.png +0 -0
- data/site/build/site/en/skin/images/chapter.gif +0 -0
- data/site/build/site/en/skin/images/chapter_open.gif +0 -0
- data/site/build/site/en/skin/images/current.gif +0 -0
- data/site/build/site/en/skin/images/error.png +0 -0
- data/site/build/site/en/skin/images/external-link.gif +0 -0
- data/site/build/site/en/skin/images/fix.jpg +0 -0
- data/site/build/site/en/skin/images/forrest-credit-logo.png +0 -0
- data/site/build/site/en/skin/images/hack.jpg +0 -0
- data/site/build/site/en/skin/images/header_white_line.gif +0 -0
- data/site/build/site/en/skin/images/info.png +0 -0
- data/site/build/site/en/skin/images/instruction_arrow.png +0 -0
- data/site/build/site/en/skin/images/label.gif +0 -0
- data/site/build/site/en/skin/images/page.gif +0 -0
- data/site/build/site/en/skin/images/pdfdoc.gif +0 -0
- data/site/build/site/en/skin/images/poddoc.png +0 -0
- data/site/build/site/en/skin/images/printer.gif +0 -0
- data/site/build/site/en/skin/images/rc-b-l-15-1body-2menu-3menu.png +0 -0
- data/site/build/site/en/skin/images/rc-b-r-15-1body-2menu-3menu.png +0 -0
- data/site/build/site/en/skin/images/rc-b-r-5-1header-2tab-selected-3tab-selected.png +0 -0
- data/site/build/site/en/skin/images/rc-t-l-5-1header-2searchbox-3searchbox.png +0 -0
- data/site/build/site/en/skin/images/rc-t-l-5-1header-2tab-selected-3tab-selected.png +0 -0
- data/site/build/site/en/skin/images/rc-t-l-5-1header-2tab-unselected-3tab-unselected.png +0 -0
- data/site/build/site/en/skin/images/rc-t-r-15-1body-2menu-3menu.png +0 -0
- data/site/build/site/en/skin/images/rc-t-r-5-1header-2searchbox-3searchbox.png +0 -0
- data/site/build/site/en/skin/images/rc-t-r-5-1header-2tab-selected-3tab-selected.png +0 -0
- data/site/build/site/en/skin/images/rc-t-r-5-1header-2tab-unselected-3tab-unselected.png +0 -0
- data/site/build/site/en/skin/images/remove.jpg +0 -0
- data/site/build/site/en/skin/images/rss.png +0 -0
- data/site/build/site/en/skin/images/spacer.gif +0 -0
- data/site/build/site/en/skin/images/success.png +0 -0
- data/site/build/site/en/skin/images/txtdoc.png +0 -0
- data/site/build/site/en/skin/images/update.jpg +0 -0
- data/site/build/site/en/skin/images/valid-html401.png +0 -0
- data/site/build/site/en/skin/images/vcss.png +0 -0
- data/site/build/site/en/skin/images/warning.png +0 -0
- data/site/build/site/en/skin/images/xmldoc.gif +0 -0
- data/site/build/site/en/skin/menu.js +0 -48
- data/site/build/site/en/skin/note.txt +0 -50
- data/site/build/site/en/skin/print.css +0 -54
- data/site/build/site/en/skin/profile.css +0 -163
- data/site/build/site/en/skin/prototype.js +0 -1257
- data/site/build/site/en/skin/screen.css +0 -587
- data/site/build/site/en/svn.html +0 -223
- data/site/build/site/en/svn.pdf +0 -239
- data/site/build/site/en/wholesite.pdf +0 -1686
- data/site/build/tmp/brokenlinks.xml +0 -2
- data/site/build/tmp/cocoon-work/cache-dir/cocoon-ehcache-1.data +0 -0
- data/site/build/tmp/cocoon-work/cache-dir/cocoon-ehcache-1.index +0 -0
- data/test/decision_tree/id3_test.rb +0 -209
data/README.rdoc
CHANGED
|
@@ -1,40 +1,41 @@
|
|
|
1
1
|
= Introduction
|
|
2
2
|
|
|
3
|
-
|
|
4
|
-
|
|
3
|
+
AI4R is a collection of ruby algorithms implementations, covering several Artificial intelligence fields,
|
|
4
|
+
and simple practical examples using them. It implements:
|
|
5
5
|
|
|
6
|
-
*
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
*
|
|
11
|
-
|
|
12
|
-
Implementation of GeneticSearch and Chromosome classes. The GeneticSearch is a generic class, and can be used to solved any kind of problems. The GeneticSearch class performs a stochastic search of the solution of a given problem.
|
|
13
|
-
|
|
14
|
-
* Neural network (NeuralNetwork::Backpropagation)
|
|
15
|
-
|
|
16
|
-
Implementation of neural networks using the Backpropagation supervised learning technique.
|
|
6
|
+
* Genetic algorithms (AI4R::GeneticAlgorithm::GeneticSearch)
|
|
7
|
+
|
|
8
|
+
* Neural networks (AI4R::NeuralNetwork::Backpropagation)
|
|
9
|
+
|
|
10
|
+
* ID3 Decision Trees (AI4R::Classifiers::ID3)
|
|
17
11
|
|
|
18
|
-
*
|
|
12
|
+
* PRISM (J. Cendrowska, 1987) (AI4R::Classifiers::Prism)
|
|
13
|
+
|
|
14
|
+
* OneR (AKA One Attribute Rule, 1R) (AI4R::Classifiers::OneR)
|
|
19
15
|
|
|
20
|
-
|
|
16
|
+
* ZeroR (AI4R::Classifiers::ZeroR)
|
|
21
17
|
|
|
22
18
|
= Where can I find the latest code and info on this project?
|
|
23
19
|
|
|
24
20
|
http://ai4r.rubyforge.org
|
|
25
21
|
|
|
22
|
+
http://ai4r.jadeferret.com
|
|
23
|
+
|
|
26
24
|
= How to install
|
|
27
25
|
|
|
28
26
|
1. Install the gem:
|
|
29
27
|
|
|
30
|
-
gem install
|
|
28
|
+
gem install ai4r
|
|
31
29
|
|
|
32
30
|
2. Include require statements in your code:
|
|
33
31
|
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
32
|
+
require "rubygems"
|
|
33
|
+
require "ai4r/classifiers/id3"
|
|
34
|
+
require "ai4r/classifiers/prism"
|
|
35
|
+
require "ai4r/classifiers/one_r"
|
|
36
|
+
require "ai4r/classifiers/zero_r"
|
|
37
|
+
require "ai4r/neural_network/backpropagation"
|
|
38
|
+
require "ai4r/genetic_algorithm/genetic_algorithm"
|
|
38
39
|
|
|
39
40
|
= Feedback
|
|
40
41
|
|
|
@@ -7,7 +7,8 @@
|
|
|
7
7
|
# the Mozilla Public License version 1.1 as published by the
|
|
8
8
|
# Mozilla Foundation at http://www.mozilla.org/MPL/MPL-1.1.txt
|
|
9
9
|
|
|
10
|
-
require File.dirname(__FILE__) + '/../../lib/decision_tree/id3'
|
|
10
|
+
#require File.dirname(__FILE__) + '/../../lib/decision_tree/id3'
|
|
11
|
+
require File.dirname(__FILE__) + '/../../lib/ai4r/classifiers/id3'
|
|
11
12
|
require 'csv'
|
|
12
13
|
|
|
13
14
|
# Load data from data_set.csv
|
|
@@ -18,7 +19,7 @@ end
|
|
|
18
19
|
data_labels = data_set.shift
|
|
19
20
|
|
|
20
21
|
# Build ID3 tree
|
|
21
|
-
id3 =
|
|
22
|
+
id3 = Ai4r::Classifiers::ID3.new.build(data_set, data_labels)
|
|
22
23
|
|
|
23
24
|
# Show rules
|
|
24
25
|
puts "Discovered rules are:"
|
|
@@ -7,7 +7,7 @@
|
|
|
7
7
|
# the Mozilla Public License version 1.1 as published by the
|
|
8
8
|
# Mozilla Foundation at http://www.mozilla.org/MPL/MPL-1.1.txt
|
|
9
9
|
|
|
10
|
-
require File.dirname(__FILE__) + '/../../lib/genetic_algorithm/genetic_algorithm'
|
|
10
|
+
require File.dirname(__FILE__) + '/../../lib/ai4r/genetic_algorithm/genetic_algorithm'
|
|
11
11
|
require 'csv'
|
|
12
12
|
|
|
13
13
|
# Load data from data_set.csv
|
|
@@ -20,18 +20,18 @@ data_set.collect! do |column|
|
|
|
20
20
|
column.collect { |element| element.to_f}
|
|
21
21
|
end
|
|
22
22
|
|
|
23
|
-
GeneticAlgorithm::Chromosome.set_cost_matrix(data_set)
|
|
23
|
+
Ai4r::GeneticAlgorithm::Chromosome.set_cost_matrix(data_set)
|
|
24
24
|
|
|
25
25
|
puts "Some random selected tours costs: "
|
|
26
26
|
3.times do
|
|
27
|
-
c = GeneticAlgorithm::Chromosome.seed
|
|
28
|
-
puts "COST #{c.fitness} TOUR: #{c.data.collect{ |c| data_labels[c]} * ', '}"
|
|
27
|
+
c = Ai4r::GeneticAlgorithm::Chromosome.seed
|
|
28
|
+
puts "COST #{-1 * c.fitness} TOUR: #{c.data.collect{ |c| data_labels[c]} * ', '}"
|
|
29
29
|
end
|
|
30
30
|
|
|
31
31
|
puts "Beginning genetic search, please wait... "
|
|
32
|
-
search = GeneticAlgorithm::GeneticSearch.new(800, 100)
|
|
32
|
+
search = Ai4r::GeneticAlgorithm::GeneticSearch.new(800, 100)
|
|
33
33
|
result = search.run
|
|
34
|
-
puts "BEST COST FOUND #{result.fitness} TOUR: #{result.data.collect{ |c| data_labels[c]} * ', '}"
|
|
34
|
+
puts "BEST COST FOUND #{-1 * result.fitness} TOUR: #{result.data.collect{ |c| data_labels[c]} * ', '}"
|
|
35
35
|
|
|
36
36
|
# $7611.99 TOUR: Moscow, Kiev, Warsaw, Hamburg, Berlin, Vienna, Munich, Milan, Rome, Barcelona, Madrid, Paris, Brussels, London, Dublin
|
|
37
37
|
# $7659.81 TOUR: Moscow, Kiev, Warsaw, Vienna, Munich, Berlin, Hamburg, Brussels, Dublin, London, Paris, Milan, Rome, Barcelona, Madrid
|
|
@@ -10,12 +10,12 @@
|
|
|
10
10
|
require File.dirname(__FILE__) + '/training_patterns'
|
|
11
11
|
require File.dirname(__FILE__) + '/patterns_with_noise'
|
|
12
12
|
require File.dirname(__FILE__) + '/patterns_with_base_noise'
|
|
13
|
-
require File.dirname(__FILE__) + '/../../lib/neural_network/backpropagation'
|
|
13
|
+
require File.dirname(__FILE__) + '/../../lib/ai4r/neural_network/backpropagation'
|
|
14
14
|
require 'benchmark'
|
|
15
15
|
|
|
16
16
|
times = Benchmark.measure do
|
|
17
17
|
|
|
18
|
-
net = NeuralNetwork::Backpropagation.new([256, 3])
|
|
18
|
+
net = Ai4r::NeuralNetwork::Backpropagation.new([256, 3])
|
|
19
19
|
|
|
20
20
|
tr_input = TRIANGLE.flatten.collect { |input| input.to_f / 10}
|
|
21
21
|
sq_input = SQUARE.flatten.collect { |input| input.to_f / 10}
|
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
require 'set'
|
|
2
|
+
|
|
3
|
+
module Ai4r
|
|
4
|
+
|
|
5
|
+
module Classifiers
|
|
6
|
+
|
|
7
|
+
NUMERIC_CLASS_TYPE = 1
|
|
8
|
+
NOMINAL_CLASS_TYPE = 2
|
|
9
|
+
|
|
10
|
+
module ClassifierHelper
|
|
11
|
+
|
|
12
|
+
def default_data_labels(data_examples)
|
|
13
|
+
data_labels = []
|
|
14
|
+
data_examples[0][0..-2].each_index do |i|
|
|
15
|
+
data_labels[i] = "attribute_#{i+1}"
|
|
16
|
+
end
|
|
17
|
+
data_labels[data_labels.length]="class_value"
|
|
18
|
+
return data_labels
|
|
19
|
+
end
|
|
20
|
+
|
|
21
|
+
def check_data_examples(data_examples)
|
|
22
|
+
if !data_examples || data_examples.empty?
|
|
23
|
+
raise ArgumentError,"Examples data set must not be empty."
|
|
24
|
+
elsif !data_examples.first.is_a?(Array)
|
|
25
|
+
raise ArgumentError,"Unkown format for example data."
|
|
26
|
+
end
|
|
27
|
+
end
|
|
28
|
+
|
|
29
|
+
# Returns attributes number, including class attribute
|
|
30
|
+
def num_attributes(data_examples)
|
|
31
|
+
return 0 if !data_examples || data_examples.empty? || !data_examples.first.is_a?(Array)
|
|
32
|
+
return data_examples.first.size
|
|
33
|
+
end
|
|
34
|
+
|
|
35
|
+
# Returns an array with the domain of each attribute (Set instance
|
|
36
|
+
# containing all possible values)
|
|
37
|
+
# Return example:
|
|
38
|
+
# => [#<Set: {"New York", "Chicago"}>,
|
|
39
|
+
# #<Set: {"<30", "[30-50)", "[50-80]", ">80"}>,
|
|
40
|
+
# #<Set: {"M", "F"}>,
|
|
41
|
+
# #<Set: {"Y", "N"}>]
|
|
42
|
+
def build_domains(data_examples)
|
|
43
|
+
domains = Array.new(num_attributes(data_examples)) { Set.new }
|
|
44
|
+
data_examples.each do |data|
|
|
45
|
+
data.each_index {|attr_index| domains[attr_index] << data[attr_index]}
|
|
46
|
+
end
|
|
47
|
+
return domains
|
|
48
|
+
end
|
|
49
|
+
|
|
50
|
+
end
|
|
51
|
+
|
|
52
|
+
end
|
|
53
|
+
|
|
54
|
+
end
|
|
@@ -0,0 +1,356 @@
|
|
|
1
|
+
# Author:: Sergio Fierens (Implementation, Quinlan is
|
|
2
|
+
# the creator of the algorithm)
|
|
3
|
+
# License:: MPL 1.1
|
|
4
|
+
# Project:: ai4r
|
|
5
|
+
# Url:: http://ai4r.rubyforge.org/
|
|
6
|
+
#
|
|
7
|
+
# You can redistribute it and/or modify it under the terms of
|
|
8
|
+
# the Mozilla Public License version 1.1 as published by the
|
|
9
|
+
# Mozilla Foundation at http://www.mozilla.org/MPL/MPL-1.1.txt
|
|
10
|
+
|
|
11
|
+
require File.dirname(__FILE__) + '/classifier_helper'
|
|
12
|
+
|
|
13
|
+
module Ai4r
|
|
14
|
+
|
|
15
|
+
module Classifiers
|
|
16
|
+
|
|
17
|
+
# = Introduction
|
|
18
|
+
# This is an implementation of the ID3 algorithm (Quinlan)
|
|
19
|
+
# Given a set of preclassified examples, it builds a top-down
|
|
20
|
+
# induction of decision tree, biased by the information gain and
|
|
21
|
+
# entropy measure.
|
|
22
|
+
#
|
|
23
|
+
# * http://en.wikipedia.org/wiki/Decision_tree
|
|
24
|
+
# * http://en.wikipedia.org/wiki/ID3_algorithm
|
|
25
|
+
#
|
|
26
|
+
# = How to use it
|
|
27
|
+
#
|
|
28
|
+
# DATA_LABELS = [ 'city', 'age_range', 'gender', 'marketing_target' ]
|
|
29
|
+
#
|
|
30
|
+
# DATA_SET = [ ['New York', '<30', 'M', 'Y'],
|
|
31
|
+
# ['Chicago', '<30', 'M', 'Y'],
|
|
32
|
+
# ['Chicago', '<30', 'F', 'Y'],
|
|
33
|
+
# ['New York', '<30', 'M', 'Y'],
|
|
34
|
+
# ['New York', '<30', 'M', 'Y'],
|
|
35
|
+
# ['Chicago', '[30-50)', 'M', 'Y'],
|
|
36
|
+
# ['New York', '[30-50)', 'F', 'N'],
|
|
37
|
+
# ['Chicago', '[30-50)', 'F', 'Y'],
|
|
38
|
+
# ['New York', '[30-50)', 'F', 'N'],
|
|
39
|
+
# ['Chicago', '[50-80]', 'M', 'N'],
|
|
40
|
+
# ['New York', '[50-80]', 'F', 'N'],
|
|
41
|
+
# ['New York', '[50-80]', 'M', 'N'],
|
|
42
|
+
# ['Chicago', '[50-80]', 'M', 'N'],
|
|
43
|
+
# ['New York', '[50-80]', 'F', 'N'],
|
|
44
|
+
# ['Chicago', '>80', 'F', 'Y']
|
|
45
|
+
# ]
|
|
46
|
+
#
|
|
47
|
+
#   id3 = Ai4r::Classifiers::ID3.new.build(DATA_SET, DATA_LABELS)
|
|
48
|
+
#
|
|
49
|
+
# id3.to_s
|
|
50
|
+
# # => if age_range=='<30' then marketing_target='Y'
|
|
51
|
+
# elsif age_range=='[30-50)' and city=='Chicago' then marketing_target='Y'
|
|
52
|
+
# elsif age_range=='[30-50)' and city=='New York' then marketing_target='N'
|
|
53
|
+
# elsif age_range=='[50-80]' then marketing_target='N'
|
|
54
|
+
# elsif age_range=='>80' then marketing_target='Y'
|
|
55
|
+
# else raise 'There was not enough information during training to do a proper induction for this data element' end
|
|
56
|
+
#
|
|
57
|
+
# id3.eval(['New York', '<30', 'M'])
|
|
58
|
+
# # => 'Y'
|
|
59
|
+
#
|
|
60
|
+
# = A better way to load the data
|
|
61
|
+
#
|
|
62
|
+
# In real life you will use many more training examples, with more
|
|
63
|
+
# attributes. Consider moving your data to an external CSV (comma separated
|
|
64
|
+
# values) file.
|
|
65
|
+
#
|
|
66
|
+
# data_set = []
|
|
67
|
+
# CSV::Reader.parse(File.open("#{File.dirname(__FILE__)}/data_set.csv", 'r')) do |row|
|
|
68
|
+
# data_set << row
|
|
69
|
+
# end
|
|
70
|
+
# data_labels = data_set.shift
|
|
71
|
+
#
|
|
72
|
+
#   id3 = Ai4r::Classifiers::ID3.new.build(data_set, data_labels)
|
|
73
|
+
#
|
|
74
|
+
# = A nice tip for data evaluation
|
|
75
|
+
#
|
|
76
|
+
#   id3 = Ai4r::Classifiers::ID3.new.build(DATA_SET, DATA_LABELS)
|
|
77
|
+
# age_range = '<30'
|
|
78
|
+
# marketing_target = nil
|
|
79
|
+
# eval id3.to_s
|
|
80
|
+
# puts marketing_target
|
|
81
|
+
# # => 'Y'
|
|
82
|
+
# = More about ID3 and decision trees
|
|
83
|
+
#
|
|
84
|
+
# * http://en.wikipedia.org/wiki/Decision_tree
|
|
85
|
+
# * http://en.wikipedia.org/wiki/ID3_algorithm
|
|
86
|
+
#
|
|
87
|
+
# = About the project
|
|
88
|
+
# Author:: Sergio Fierens
|
|
89
|
+
# License:: MPL 1.1
|
|
90
|
+
|
|
91
|
+
class ID3
|
|
92
|
+
|
|
93
|
+
attr_reader :data_labels
|
|
94
|
+
include ClassifierHelper
|
|
95
|
+
|
|
96
|
+
# Create a new decision tree. If your data is classified with N attributes
|
|
97
|
+
# and M examples, then your data examples must have the following format:
|
|
98
|
+
#
|
|
99
|
+
# [ [ATT1_VAL1, ATT2_VAL1, ATT3_VAL1, ... , ATTN_VAL1, CATEGORY_VAL1],
|
|
100
|
+
# [ATT1_VAL2, ATT2_VAL2, ATT3_VAL2, ... , ATTN_VAL2, CATEGORY_VAL2],
|
|
101
|
+
# ...
|
|
102
|
+
#     [ATT1_VALM, ATT2_VALM, ATT3_VALM, ... , ATTN_VALM, CATEGORY_VALM],
|
|
103
|
+
# ]
|
|
104
|
+
#
|
|
105
|
+
# e.g.
|
|
106
|
+
# [ ['New York', '<30', 'M', 'Y'],
|
|
107
|
+
# ['Chicago', '<30', 'M', 'Y'],
|
|
108
|
+
# ['Chicago', '<30', 'F', 'Y'],
|
|
109
|
+
# ['New York', '<30', 'M', 'Y'],
|
|
110
|
+
# ['New York', '<30', 'M', 'Y'],
|
|
111
|
+
# ['Chicago', '[30-50)', 'M', 'Y'],
|
|
112
|
+
# ['New York', '[30-50)', 'F', 'N'],
|
|
113
|
+
# ['Chicago', '[30-50)', 'F', 'Y'],
|
|
114
|
+
# ['New York', '[30-50)', 'F', 'N'],
|
|
115
|
+
# ['Chicago', '[50-80]', 'M', 'N'],
|
|
116
|
+
# ['New York', '[50-80]', 'F', 'N'],
|
|
117
|
+
# ['New York', '[50-80]', 'M', 'N'],
|
|
118
|
+
# ['Chicago', '[50-80]', 'M', 'N'],
|
|
119
|
+
# ['New York', '[50-80]', 'F', 'N'],
|
|
120
|
+
# ['Chicago', '>80', 'F', 'Y']
|
|
121
|
+
# ]
|
|
122
|
+
#
|
|
123
|
+
# Data labels must have the following format:
|
|
124
|
+
# [ 'city', 'age_range', 'gender', 'marketing_target' ]
|
|
125
|
+
#
|
|
126
|
+
# If you do not provide labels for your data, the following labels will
|
|
127
|
+
# be created by default:
|
|
128
|
+
#   [ 'attribute_1', 'attribute_2', 'attribute_3', 'class_value' ]
|
|
129
|
+
#
|
|
130
|
+
def build(data_examples, data_labels=nil)
|
|
131
|
+
check_data_examples(data_examples)
|
|
132
|
+
@data_labels = (data_labels) ? data_labels : default_data_labels(data_examples)
|
|
133
|
+
preprocess_data(data_examples)
|
|
134
|
+
return self
|
|
135
|
+
end
|
|
136
|
+
|
|
137
|
+
# You can evaluate new data, predicting its category.
|
|
138
|
+
# e.g.
|
|
139
|
+
# id3.eval(['New York', '<30', 'F']) # => 'Y'
|
|
140
|
+
def eval(data)
|
|
141
|
+
@tree.value(data) if @tree
|
|
142
|
+
end
|
|
143
|
+
|
|
144
|
+
# This method returns the generated rules in ruby code.
|
|
145
|
+
# e.g.
|
|
146
|
+
#
|
|
147
|
+
# id3.to_s
|
|
148
|
+
# # => if age_range=='<30' then marketing_target='Y'
|
|
149
|
+
# elsif age_range=='[30-50)' and city=='Chicago' then marketing_target='Y'
|
|
150
|
+
# elsif age_range=='[30-50)' and city=='New York' then marketing_target='N'
|
|
151
|
+
# elsif age_range=='[50-80]' then marketing_target='N'
|
|
152
|
+
# elsif age_range=='>80' then marketing_target='Y'
|
|
153
|
+
# else raise 'There was not enough information during training to do a proper induction for this data element' end
|
|
154
|
+
#
|
|
155
|
+
# It is a nice way to inspect induction results, and also to execute them:
|
|
156
|
+
# age_range = '<30'
|
|
157
|
+
# marketing_target = nil
|
|
158
|
+
# eval id3.to_s
|
|
159
|
+
# puts marketing_target
|
|
160
|
+
# # => 'Y'
|
|
161
|
+
def to_s
|
|
162
|
+
rules = @tree.get_rules
|
|
163
|
+
rules = rules.collect do |rule|
|
|
164
|
+
"#{rule[0..-2].join(' and ')} then #{rule.last}"
|
|
165
|
+
end
|
|
166
|
+
return "if #{rules.join("\nelsif ")}\nelse raise 'There was not enough information during training to do a proper induction for this data element' end"
|
|
167
|
+
end
|
|
168
|
+
|
|
169
|
+
private
|
|
170
|
+
def preprocess_data(data_examples)
|
|
171
|
+
@tree = build_node(data_examples)
|
|
172
|
+
end
|
|
173
|
+
|
|
174
|
+
private
|
|
175
|
+
def build_node(data_examples, flag_att = [])
|
|
176
|
+
return ErrorNode.new if data_examples.length == 0
|
|
177
|
+
domain = domain(data_examples)
|
|
178
|
+
return CategoryNode.new(@data_labels.last, domain.last[0]) if domain.last.length == 1
|
|
179
|
+
min_entropy_index = min_entropy_index(data_examples, domain, flag_att)
|
|
180
|
+
flag_att << min_entropy_index
|
|
181
|
+
split_data_examples = split_data_examples(data_examples, domain, min_entropy_index)
|
|
182
|
+
return CategoryNode.new(@data_labels.last, most_freq(data_examples, domain)) if split_data_examples.length == 1
|
|
183
|
+
nodes = split_data_examples.collect do |partial_data_examples|
|
|
184
|
+
build_node(partial_data_examples, flag_att)
|
|
185
|
+
end
|
|
186
|
+
return EvaluationNode.new(@data_labels, min_entropy_index, domain[min_entropy_index], nodes)
|
|
187
|
+
end
|
|
188
|
+
|
|
189
|
+
private
|
|
190
|
+
def self.sum(values)
|
|
191
|
+
values.inject( 0 ) { |sum,x| sum+x }
|
|
192
|
+
end
|
|
193
|
+
|
|
194
|
+
private
|
|
195
|
+
def self.log2(z)
|
|
196
|
+
return 0.0 if z == 0
|
|
197
|
+
Math.log(z)/LOG2
|
|
198
|
+
end
|
|
199
|
+
|
|
200
|
+
private
|
|
201
|
+
def most_freq(examples, domain)
|
|
202
|
+
freqs = []
|
|
203
|
+
domain.last.length.times { freqs << 0}
|
|
204
|
+
examples.each do |example|
|
|
205
|
+
cat_index = domain.last.index(example.last)
|
|
206
|
+
freq = freqs[cat_index] + 1
|
|
207
|
+
freqs[cat_index] = freq
|
|
208
|
+
end
|
|
209
|
+
max_freq = freqs.max
|
|
210
|
+
max_freq_index = freqs.index(max_freq)
|
|
211
|
+
domain.last[max_freq_index]
|
|
212
|
+
end
|
|
213
|
+
|
|
214
|
+
private
|
|
215
|
+
def split_data_examples(data_examples, domain, att_index)
|
|
216
|
+
data_examples_array = []
|
|
217
|
+
att_value_examples = {}
|
|
218
|
+
data_examples.each do |example|
|
|
219
|
+
example_set = att_value_examples[example[att_index]]
|
|
220
|
+
example_set = [] if !example_set
|
|
221
|
+
example_set << example
|
|
222
|
+
att_value_examples.store(example[att_index], example_set)
|
|
223
|
+
end
|
|
224
|
+
att_value_examples.each_pair do |att_value, example_set|
|
|
225
|
+
att_value_index = domain[att_index].index(att_value)
|
|
226
|
+
data_examples_array[att_value_index] = example_set
|
|
227
|
+
end
|
|
228
|
+
return data_examples_array
|
|
229
|
+
end
|
|
230
|
+
|
|
231
|
+
private
|
|
232
|
+
def min_entropy_index(data_examples, domain, flag_att=[])
|
|
233
|
+
min_entropy = nil
|
|
234
|
+
min_index = 0
|
|
235
|
+
domain[0..-2].each_index do |index|
|
|
236
|
+
freq_grid = freq_grid(index, data_examples, domain)
|
|
237
|
+
entropy = entropy(freq_grid, data_examples.length)
|
|
238
|
+
if (!min_entropy || entropy < min_entropy) && !flag_att.include?(index)
|
|
239
|
+
min_entropy = entropy
|
|
240
|
+
min_index = index
|
|
241
|
+
end
|
|
242
|
+
end
|
|
243
|
+
return min_index
|
|
244
|
+
end
|
|
245
|
+
|
|
246
|
+
private
|
|
247
|
+
def domain(data_examples)
|
|
248
|
+
#return build_domains(data_examples)
|
|
249
|
+
domain = []
|
|
250
|
+
@data_labels.length.times { domain << [] }
|
|
251
|
+
data_examples.each do |data|
|
|
252
|
+
data.each_index do |i|
|
|
253
|
+
domain[i] << data[i] if i<domain.length && !domain[i].include?(data[i])
|
|
254
|
+
end
|
|
255
|
+
end
|
|
256
|
+
return domain
|
|
257
|
+
end
|
|
258
|
+
|
|
259
|
+
private
|
|
260
|
+
def freq_grid(att_index, data_examples, domain)
|
|
261
|
+
#Initialize empty grid
|
|
262
|
+
grid_element = []
|
|
263
|
+
domain.last.length.times { grid_element << 0}
|
|
264
|
+
grid = []
|
|
265
|
+
domain[att_index].length.times { grid << grid_element.clone }
|
|
266
|
+
#Fill frecuency with grid
|
|
267
|
+
data_examples.each do |example|
|
|
268
|
+
att_val = example[att_index]
|
|
269
|
+
att_val_index = domain[att_index].index(att_val)
|
|
270
|
+
category = example.last
|
|
271
|
+
category_index = domain.last.index(category)
|
|
272
|
+
freq = grid[att_val_index][category_index] + 1
|
|
273
|
+
grid[att_val_index][category_index] = freq
|
|
274
|
+
end
|
|
275
|
+
return grid
|
|
276
|
+
end
|
|
277
|
+
|
|
278
|
+
private
|
|
279
|
+
def entropy(freq_grid, total_examples)
|
|
280
|
+
#Calc entropy of each element
|
|
281
|
+
entropy = 0
|
|
282
|
+
freq_grid.each do |att_freq|
|
|
283
|
+
att_total_freq = ID3.sum(att_freq)
|
|
284
|
+
partial_entropy = 0
|
|
285
|
+
if att_total_freq != 0
|
|
286
|
+
att_freq.each do |freq|
|
|
287
|
+
prop = freq.to_f/att_total_freq
|
|
288
|
+
partial_entropy += (-1*prop*ID3.log2(prop))
|
|
289
|
+
end
|
|
290
|
+
end
|
|
291
|
+
entropy += (att_total_freq.to_f/total_examples) * partial_entropy
|
|
292
|
+
end
|
|
293
|
+
return entropy
|
|
294
|
+
end
|
|
295
|
+
|
|
296
|
+
private
|
|
297
|
+
LOG2 = Math.log(2)
|
|
298
|
+
end
|
|
299
|
+
|
|
300
|
+
class EvaluationNode
|
|
301
|
+
|
|
302
|
+
attr_reader :index, :values, :nodes
|
|
303
|
+
|
|
304
|
+
def initialize(data_labels, index, values, nodes)
|
|
305
|
+
@index = index
|
|
306
|
+
@values = values
|
|
307
|
+
@nodes = nodes
|
|
308
|
+
@data_labels = data_labels
|
|
309
|
+
end
|
|
310
|
+
|
|
311
|
+
def value(data)
|
|
312
|
+
value = data[@index]
|
|
313
|
+
return rule_not_found if !@values.include?(value)
|
|
314
|
+
return nodes[@values.index(value)].value(data)
|
|
315
|
+
end
|
|
316
|
+
|
|
317
|
+
def get_rules
|
|
318
|
+
rule_set = []
|
|
319
|
+
@nodes.each_index do |child_node_index|
|
|
320
|
+
my_rule = "#{@data_labels[@index]}=='#{@values[child_node_index]}'"
|
|
321
|
+
child_node = @nodes[child_node_index]
|
|
322
|
+
child_node_rules = child_node.get_rules
|
|
323
|
+
child_node_rules.each do |child_rule|
|
|
324
|
+
child_rule.unshift(my_rule)
|
|
325
|
+
end
|
|
326
|
+
rule_set += child_node_rules
|
|
327
|
+
end
|
|
328
|
+
return rule_set
|
|
329
|
+
end
|
|
330
|
+
|
|
331
|
+
end
|
|
332
|
+
|
|
333
|
+
class CategoryNode
|
|
334
|
+
def initialize(label, value)
|
|
335
|
+
@label = label
|
|
336
|
+
@value = value
|
|
337
|
+
end
|
|
338
|
+
def value(data)
|
|
339
|
+
return @value
|
|
340
|
+
end
|
|
341
|
+
def get_rules
|
|
342
|
+
return [["#{@label}='#{@value}'"]]
|
|
343
|
+
end
|
|
344
|
+
end
|
|
345
|
+
|
|
346
|
+
class ErrorNode
|
|
347
|
+
def value(data)
|
|
348
|
+
raise "There was not enough information during training to do a proper induction for this data element."
|
|
349
|
+
end
|
|
350
|
+
def get_rules
|
|
351
|
+
return []
|
|
352
|
+
end
|
|
353
|
+
end
|
|
354
|
+
|
|
355
|
+
end
|
|
356
|
+
end
|