ai4r 1.4 → 1.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (62) hide show
  1. data/README.rdoc +24 -3
  2. data/examples/decision_trees/id3_example.rb +1 -1
  3. data/examples/genetic_algorithm/genetic_algorithm_example.rb +1 -1
  4. data/lib/ai4r.rb +11 -0
  5. data/lib/ai4r/classifiers/classifier.rb +2 -0
  6. data/lib/ai4r/classifiers/id3.rb +3 -2
  7. data/lib/ai4r/classifiers/multilayer_perceptron.rb +135 -0
  8. data/lib/ai4r/classifiers/one_r.rb +2 -1
  9. data/lib/ai4r/classifiers/prism.rb +2 -1
  10. data/lib/ai4r/classifiers/zero_r.rb +2 -1
  11. data/lib/ai4r/clusterers/average_linkage.rb +60 -0
  12. data/lib/ai4r/clusterers/bisecting_k_means.rb +17 -39
  13. data/lib/ai4r/clusterers/clusterer.rb +25 -0
  14. data/lib/ai4r/clusterers/complete_linkage.rb +62 -0
  15. data/lib/ai4r/clusterers/k_means.rb +18 -25
  16. data/lib/ai4r/clusterers/single_linkage.rb +179 -0
  17. data/lib/ai4r/data/data_set.rb +33 -41
  18. data/lib/ai4r/data/proximity.rb +82 -0
  19. data/lib/ai4r/data/statistics.rb +77 -0
  20. data/lib/ai4r/experiment/classifier_evaluator.rb +95 -0
  21. data/lib/ai4r/genetic_algorithm/genetic_algorithm.rb +2 -4
  22. data/site/build/site/en/build/tmp/build-info.xml +5 -0
  23. data/site/build/site/en/build/tmp/plugins-1.xml +212 -0
  24. data/site/build/site/en/build/tmp/plugins-2.xml +252 -0
  25. data/site/build/site/en/build/tmp/projfilters.properties +41 -0
  26. data/site/build/site/en/downloads.html +1 -1
  27. data/site/build/site/en/geneticAlgorithms.html +1 -1
  28. data/site/build/site/en/index.html +44 -7
  29. data/site/build/site/en/index.pdf +278 -155
  30. data/site/build/site/en/linkmap.html +2 -2
  31. data/site/build/site/en/linkmap.pdf +12 -12
  32. data/site/build/site/en/machineLearning.html +1 -1
  33. data/site/build/site/en/neuralNetworks.html +1 -1
  34. data/site/build/site/en/sourceCode.html +244 -0
  35. data/site/build/site/en/sourceCode.pdf +278 -0
  36. data/site/build/site/en/svn.html +34 -42
  37. data/site/build/site/en/svn.pdf +86 -114
  38. data/site/build/tmp/cocoon-work/cache-dir/cocoon-ehcache-1.data +0 -0
  39. data/site/build/tmp/cocoon-work/cache-dir/cocoon-ehcache-1.index +0 -0
  40. data/site/build/tmp/projfilters.properties +1 -1
  41. data/site/build/webapp/WEB-INF/logs/core.log +628 -629
  42. data/site/build/webapp/WEB-INF/logs/error.log +213 -213
  43. data/site/src/documentation/content/xdocs/index.xml +20 -1
  44. data/site/src/documentation/content/xdocs/site.xml +1 -1
  45. data/site/src/documentation/content/xdocs/sourceCode.xml +43 -0
  46. data/site/src/documentation/resources/images/sigmoid.png +0 -0
  47. data/test/classifiers/id3_test.rb +0 -1
  48. data/test/classifiers/multilayer_perceptron_test.rb +79 -0
  49. data/test/classifiers/one_r_test.rb +0 -2
  50. data/test/classifiers/prism_test.rb +0 -2
  51. data/test/classifiers/zero_r_test.rb +0 -2
  52. data/test/clusterers/average_linkage_test.rb +45 -0
  53. data/test/clusterers/bisecting_k_means_test.rb +0 -2
  54. data/test/clusterers/complete_linkage_test.rb +45 -0
  55. data/test/clusterers/k_means_test.rb +0 -2
  56. data/test/clusterers/single_linkage_test.rb +113 -0
  57. data/test/data/data_set_test.rb +3 -15
  58. data/test/data/proximity_test.rb +71 -0
  59. data/test/data/statistics_test.rb +65 -0
  60. data/test/experiment/classifier_evaluator_test.rb +76 -0
  61. metadata +27 -6
  62. data/site/src/documentation/content/xdocs/svn.xml +0 -41
@@ -20,10 +20,31 @@ http://ai4r.rubyforge.org
20
20
 
21
21
  = More Info
22
22
 
23
- * AI4R wiki: http://wiki.jadeferret.com/Category:AI4R
24
- * AI4R Project site: http://ai4r.rubyforge.org
23
+ * AI4R wiki: http://wiki.jadeferret.com/Category:AI4R
24
+ * AI4R Project site: http://ai4r.rubyforge.org
25
25
 
26
- = Warranty
26
+ = Contact
27
+
28
+ If you have questions or constructive comments about this project,
29
+ please post them in the forum (http://forum.jadeferret.com/viewforum.php?f=3).
30
+ I get an email notification when you post, and I do my best to answer as soon as possible.
31
+
32
+ If you do not want to make it public, send it to me: Sergio Fierens, email address: (sergio (at) jadeferret (dot) com). But please, try to post them in the forum. I get tons of emails and it would be great to make them public to help everyone.
33
+
34
+ = Roadmap
35
+
36
+ AI4R is an active project. If you are interested in what we are working on,
37
+ check out the development roadmap: http://wiki.jadeferret.com/AI4R_RoadMap
38
+
39
+ = Disclaimer
40
+
41
+ In plain English:
42
+
43
+ This project was created by Sergio Fierens, but the AI algorithms were created by other
44
+ people who are actually much more clever than Sergio. He does his best implementing
45
+ them, but he cannot guarantee that these implementations are accurate.
46
+
47
+ In legalese:
27
48
 
28
49
  This software is provided "as is" and without any express or implied warranties,
29
50
  including, without limitation, the implied warranties of merchantability and
@@ -11,7 +11,7 @@ require File.dirname(__FILE__) + '/../../lib/ai4r/classifiers/id3'
11
11
 
12
12
  # Load data from data_set.csv
13
13
  data_filename = "#{File.dirname(__FILE__)}/data_set.csv"
14
- data_set = Ai4r::Data::DataSet.new.load_data_and_labels_from_csv data_filename
14
+ data_set = Ai4r::Data::DataSet.new.load_csv_with_labels data_filename
15
15
 
16
16
  # Build ID3 tree
17
17
  id3 = Ai4r::Classifiers::ID3.new.build(data_set)
@@ -13,7 +13,7 @@ require 'csv'
13
13
 
14
14
  # Load data from data_set.csv
15
15
  data_filename = "#{File.dirname(__FILE__)}/travel_cost.csv"
16
- data_set = Ai4r::Data::DataSet.new.load_data_and_labels_from_csv data_filename
16
+ data_set = Ai4r::Data::DataSet.new.load_csv_with_labels data_filename
17
17
  data_set.data_items.collect! {|column| column.collect {|element| element.to_f}}
18
18
 
19
19
  Ai4r::GeneticAlgorithm::Chromosome.set_cost_matrix(data_set)
@@ -1,10 +1,21 @@
1
+ # Data
2
+ require "ai4r/data/data_set"
3
+ require "ai4r/data/statistics"
4
+ require "ai4r/data/parameterizable"
5
+ # Clusterers
1
6
  require "ai4r/clusterers/clusterer"
2
7
  require "ai4r/clusterers/k_means"
3
8
  require "ai4r/clusterers/bisecting_k_means"
9
+ require "ai4r/clusterers/single_linkage"
10
+ require "ai4r/clusterers/complete_linkage"
11
+ require "ai4r/clusterers/average_linkage"
12
+ # Classifiers
4
13
  require "ai4r/classifiers/classifier"
5
14
  require "ai4r/classifiers/id3"
6
15
  require "ai4r/classifiers/prism"
7
16
  require "ai4r/classifiers/one_r"
8
17
  require "ai4r/classifiers/zero_r"
18
+ # Neural networks
9
19
  require "ai4r/neural_network/backpropagation"
20
+ # Genetic Algorithms
10
21
  require "ai4r/genetic_algorithm/genetic_algorithm"
@@ -19,6 +19,8 @@ module Ai4r
19
19
  include Ai4r::Data::Parameterizable
20
20
 
21
21
  # Build a new classifier, using data examples found in data_set.
22
+ # The last attribute of each item is considered as the
23
+ # item class.
22
24
  def build(data_set)
23
25
  raise NotImplementedError
24
26
  end
@@ -67,7 +67,7 @@ module Ai4r
67
67
  # values) file.
68
68
  #
69
69
  # data_file = "#{File.dirname(__FILE__)}/data_set.csv"
70
- # data_set = DataSet.load_data_and_labels_from_csv data_file
70
+ # data_set = DataSet.load_csv_with_labels data_file
71
71
  # id3 = Ai4r::Classifiers::ID3.new.build(data_set)
72
72
  #
73
73
  # = A nice tip for data evaluation
@@ -94,7 +94,8 @@ module Ai4r
94
94
  attr_reader :data_set
95
95
 
96
96
  # Create a new ID3 classifier. You must provide a DataSet instance
97
- # as parameter.
97
+ # as parameter. The last attribute of each item is considered as the
98
+ # item class.
98
99
  def build(data_set)
99
100
  data_set.check_not_empty
100
101
  @data_set = data_set
@@ -0,0 +1,135 @@
1
+ # Author:: Sergio Fierens (Implementation only)
2
+ # License:: MPL 1.1
3
+ # Project:: ai4r
4
+ # Url:: http://ai4r.rubyforge.org/
5
+ #
6
+ # You can redistribute it and/or modify it under the terms of
7
+ # the Mozilla Public License version 1.1 as published by the
8
+ # Mozilla Foundation at http://www.mozilla.org/MPL/MPL-1.1.txt
9
+
10
+ require File.dirname(__FILE__) + '/../data/data_set.rb'
11
+ require File.dirname(__FILE__) + '/../classifiers/classifier'
12
+ require File.dirname(__FILE__) + '/../neural_network/backpropagation'
13
+
14
+ module Ai4r
15
+ module Classifiers
16
+
17
# = Introduction
#
# The idea behind the MultilayerPerceptron classifier is to
# train a Multilayer Perceptron neural network with the provided examples,
# and predict the class for new data items.
#
# Every attribute value (and every class value) gets its own network node:
# the node matching the observed value receives active_node_value and all
# the other nodes receive inactive_node_value.
#
# = Parameters
#
# Use class method get_parameters_info to obtain details on the algorithm
# parameters. Use set_parameters to set values for these parameters.
# See Parameterizable module documentation.
#
# * :network_class => Neural network implementation class.
#   By default: Ai4r::NeuralNetwork::Backpropagation.
# * :network_parameters => Parameters to be forwarded to the back end
#   neural network. By default: {}.
#   NOTE(review): nothing in this class reads @network_parameters yet.
# * :hidden_layers => Hidden layer structure. E.g. [8, 6] will generate
#   2 hidden layers with 8 and 6 neurons each. By default [].
# * :training_iterations => How many times the training should be repeated.
#   By default: 500.
# * :active_node_value => Default: 1
# * :inactive_node_value => Default: 0
class MultilayerPerceptron < Classifier

  attr_reader :data_set, :class_value, :network, :domains

  parameters_info :network_class => "Neural network implementation class."+
      "By default: Ai4r::NeuralNetwork::Backpropagation.",
    :network_parameters => "parameters to be forwarded to the back end " +
      "neural network.",
    :hidden_layers => "Hidden layer structure. E.g. [8, 6] will generate " +
      "2 hidden layers with 8 and 6 neurons each. By default []",
    :training_iterations => "How many times the training should be " +
      "repeated. By default: 500",
    :active_node_value => "Default: 1",
    :inactive_node_value => "Default: 0"

  # Set the default value of every parameter.
  def initialize
    @network_class = Ai4r::NeuralNetwork::Backpropagation
    @hidden_layers = []
    @training_iterations = 500
    @network_parameters = {}
    @active_node_value = 1
    @inactive_node_value = 0
  end

  # Build a new MultilayerPerceptron classifier. You must provide a DataSet
  # instance as parameter. The last attribute of each item is considered as
  # the item class.
  #
  # The network gets one input node per distinct value of every non-class
  # attribute and one output node per distinct class value. Returns self.
  def build(data_set)
    data_set.check_not_empty
    @data_set = data_set
    @domains = @data_set.build_domains.collect { |domain| domain.to_a }
    @outputs = @domains.last.length
    @inputs = @domains[0...-1].inject(0) { |total, domain| total + domain.length }
    @structure = [@inputs] + @hidden_layers + [@outputs]
    @network = @network_class.new @structure
    @training_iterations.times do
      data_set.data_items.each do |data_item|
        input_values = data_to_input(data_item[0...-1])
        output_values = data_to_output(data_item.last)
        @network.train(input_values, output_values)
      end
    end
    return self
  end

  # You can evaluate new data, predicting its class.
  # e.g.
  #   classifier.eval(['New York', '<30', 'F'])  # => 'Y'
  def eval(data)
    input_values = data_to_input(data)
    output_values = @network.eval(input_values)
    return @domains.last[get_max_index(output_values)]
  end

  # Multilayer Perceptron Classifiers cannot generate
  # human-readable rules.
  # The returned value is a string holding a Ruby "raise" statement; it is
  # returned, not raised. Presumably it is meant to fail only when the
  # caller evaluates the rules string - confirm against callers.
  def get_rules
    return "raise 'Neural networks classifiers do not generate human-readable rules.'"
  end

  protected

  # Translate the attribute values of a data item (class excluded) into the
  # list of values fed to the input nodes.
  # Each attribute owns a contiguous slice of input nodes, one node per
  # value of its domain; only the node of the observed value is activated.
  # A value that is absent from the domain makes Array#index return nil,
  # so the addition below raises NoMethodError.
  def data_to_input(data_item)
    input_values = Array.new(@inputs, @inactive_node_value)
    accum_index = 0
    data_item.each_index do |att_index|
      att_value = data_item[att_index]
      domain_index = @domains[att_index].index(att_value)
      input_values[domain_index + accum_index] = @active_node_value
      # Move past the slice of this attribute. The offset must accumulate
      # over all previous attributes, not just the last one.
      accum_index += @domains[att_index].length
    end
    return input_values
  end

  # Translate a class value into the list of expected output node values:
  # the node of the given class is active, all the others are inactive.
  def data_to_output(data_item)
    output_values = Array.new(@outputs, @inactive_node_value)
    output_values[@domains.last.index(data_item)] = @active_node_value
    return output_values
  end

  # Return the index of the first output that is strictly greater than
  # every previous one, starting the comparison at the inactive node value.
  # Returns 0 when no output exceeds the inactive node value.
  def get_max_index(output_values)
    max_value = @inactive_node_value
    max_index = 0
    output_values.each_with_index do |value, output_index|
      next unless max_value < value
      max_value = value
      max_index = output_index
    end
    return max_index
  end

end
133
+
134
+ end
135
+ end
@@ -25,7 +25,8 @@ module Ai4r
25
25
  attr_reader :data_set, :rule
26
26
 
27
27
  # Build a new OneR classifier. You must provide a DataSet instance
28
- # as parameter.
28
+ # as parameter. The last attribute of each item is considered as
29
+ # the item class.
29
30
  def build(data_set)
30
31
  data_set.check_not_empty
31
32
  @data_set = data_set
@@ -29,7 +29,8 @@ module Ai4r
29
29
  attr_reader :data_set, :rules
30
30
 
31
31
  # Build a new Prism classifier. You must provide a DataSet instance
32
- # as parameter.
32
+ # as parameter. The last attribute of each item is considered as
33
+ # the item class.
33
34
  def build(data_set)
34
35
  data_set.check_not_empty
35
36
  @data_set = data_set
@@ -25,7 +25,8 @@ module Ai4r
25
25
  attr_reader :data_set, :class_value
26
26
 
27
27
  # Build a new ZeroR classifier. You must provide a DataSet instance
28
- # as parameter.
28
+ # as parameter. The last attribute of each item is considered as
29
+ # the item class.
29
30
  def build(data_set)
30
31
  data_set.check_not_empty
31
32
  @data_set = data_set
@@ -0,0 +1,60 @@
1
+ # Author:: Sergio Fierens (implementation)
2
+ # License:: MPL 1.1
3
+ # Project:: ai4r
4
+ # Url:: http://ai4r.rubyforge.org/
5
+ #
6
+ # You can redistribute it and/or modify it under the terms of
7
+ # the Mozilla Public License version 1.1 as published by the
8
+ # Mozilla Foundation at http://www.mozilla.org/MPL/MPL-1.1.txt
9
+
10
+ require File.dirname(__FILE__) + '/../data/data_set'
11
+ require File.dirname(__FILE__) + '/../clusterers/single_linkage'
12
+
13
+ module Ai4r
14
+ module Clusterers
15
+
16
# Implementation of a Hierarchical clusterer with average linkage.
# Hierarchical clusterers create one cluster per element, and then
# progressively merge clusters, until the required number of clusters
# is reached.
# With average linkage, the distance between two clusters is computed as
# the average distance between elements of each cluster.
class AverageLinkage < SingleLinkage

  # Build a new clusterer, using data examples found in data_set.
  # Items will be clustered in "number_of_clusters" different
  # clusters. Simply delegates to the inherited implementation.
  def build(data_set, number_of_clusters)
    super
  end

  # Classifies the given data item, returning the cluster index it belongs
  # to (0-based). Simply delegates to the inherited implementation.
  def eval(data_item)
    super
  end

  protected

  # Average linkage distance between two clusters given as collections of
  # item indexes: the sum of read_distance_matrix over every pair of
  # indexes, divided by the number of pairs.
  def calc_index_clusters_distance(cluster_a, cluster_b)
    total = cluster_a.inject(0.0) do |outer_sum, index_a|
      cluster_b.inject(outer_sum) do |inner_sum, index_b|
        inner_sum + read_distance_matrix(index_a, index_b)
      end
    end
    pair_count = cluster_a.length * cluster_b.length
    return total / pair_count
  end

  # Average of the distances between data_item and every item held in
  # cluster.data_items.
  def distance_between_item_and_cluster(data_item, cluster)
    total = 0.0
    cluster.data_items.each { |member| total += distance(data_item, member) }
    return total / cluster.data_items.length
  end

end
59
+ end
60
+ end
@@ -7,7 +7,6 @@
7
7
  # the Mozilla Public License version 1.1 as published by the
8
8
  # Mozilla Foundation at http://www.mozilla.org/MPL/MPL-1.1.txt
9
9
 
10
- require "set"
11
10
  require File.dirname(__FILE__) + '/../data/data_set'
12
11
  require File.dirname(__FILE__) + '/../clusterers/k_means'
13
12
 
@@ -25,6 +24,23 @@ module Ai4r
25
24
  attr_reader :data_set, :number_of_clusters, :clusters, :centroids
26
25
  attr_accessor :max_iterations, :distance_function, :refine
27
26
 
27
+ parameters_info :max_iterations => "Maximum number of iterations to " +
28
+ "build the clusterer. By default it is uncapped.",
29
+ :distance_function => "Custom implementation of distance function. " +
30
+ "It must be a closure receiving two data items and return the " +
31
+ "distance bewteen them. By default, this algorithm uses " +
32
+ "ecuclidean distance of numeric attributes to the power of 2.",
33
+ :centroid_function => "Custom implementation to calculate the " +
34
+ "centroid of a cluster. It must be a closure receiving an array of " +
35
+ "data sets, and return an array of data items, representing the " +
36
+ "centroids of for each data set. " +
37
+ "By default, this algorithm returns a data items using the mode "+
38
+ "or mean of each attribute on each data set.",
39
+ :refine => "Boolean value. True by default. It will run the " +
40
+ "classic K Means algorithm, using as initial centroids the " +
41
+ "result of the bisecting approach."
42
+
43
+
28
44
  def intialize
29
45
  @refine = true
30
46
  end
@@ -54,44 +70,6 @@ module Ai4r
54
70
  return self
55
71
  end
56
72
 
57
- # Get info on what can be parameterized on this clusterer algorithm.
58
- # It returns a hash with the following format:
59
- # { :param_name => "Info on the parameter" }
60
- def get_parameters_info
61
- { :max_iterations => "Maximum number of iterations used to bisect a " +
62
- "cluster. By default it is uncapped.",
63
- :distance_function => "Custom implementation of distance function. " +
64
- "It must be a closure receiving two data items and return the " +
65
- "distance bewteen them. By default, this algorithm uses " +
66
- "ecuclidean distance of numeric attributes to the power of 2.",
67
- :refine => "Boolean value. True by default. It will run the " +
68
- "classic K Means algorithm, using as initial centroids the " +
69
- "result of the bisecting approach."
70
- }
71
- end
72
-
73
- # Set parameters on this clusterer instance.
74
- # You must provide a hash with the folowing format:
75
- # { :param_name => parameter_value }
76
- #
77
- # Use get_parameters_info to know what parameters are accepted.
78
- def set_parameters(parameters)
79
- super
80
- if parameters.has_key?(:refine)
81
- @refine = parameters[:refine]
82
- end
83
- return self
84
- end
85
-
86
- # Get parameter values on this clusterer instance.
87
- # Returns a hash with the folowing format:
88
- # { :param_name => parameter_value }
89
- def get_parameters
90
- params = super
91
- params[:refine] = @refine
92
- return params
93
- end
94
-
95
73
  protected
96
74
  def calc_initial_centroids
97
75
  @centroids # Use existing centroids
@@ -31,6 +31,31 @@ module Ai4r
31
31
  raise NotImplementedError
32
32
  end
33
33
 
34
+ protected
35
# Useful as a default distance function for clustering algorithms.
# Adds up the squared differences of the positions where both a and b
# hold a Numeric; any other position is ignored. No square root is
# taken, so the result is the squared euclidean distance (a Float).
def euclidean_distance(a, b)
  squared_sum = 0.0
  a.each_with_index do |a_value, position|
    b_value = b[position]
    next unless a_value.is_a?(Numeric) && b_value.is_a?(Numeric)
    squared_sum += (a_value - b_value) * (a_value - b_value)
  end
  return squared_sum
end
45
+
46
# Return the index of the smallest element of array, compared with "<".
# When the minimum appears several times the first position wins.
# Returns 0 for an empty array.
def get_min_index(array)
  best_index = 0
  array.each_with_index do |candidate, position|
    best_index = position if candidate < array[best_index]
  end
  return best_index
end
58
+
34
59
  end
35
60
  end
36
61
  end
@@ -0,0 +1,62 @@
1
+ # Author:: Sergio Fierens (implementation)
2
+ # License:: MPL 1.1
3
+ # Project:: ai4r
4
+ # Url:: http://ai4r.rubyforge.org/
5
+ #
6
+ # You can redistribute it and/or modify it under the terms of
7
+ # the Mozilla Public License version 1.1 as published by the
8
+ # Mozilla Foundation at http://www.mozilla.org/MPL/MPL-1.1.txt
9
+
10
+ require File.dirname(__FILE__) + '/../data/data_set'
11
+ require File.dirname(__FILE__) + '/../clusterers/single_linkage'
12
+
13
+ module Ai4r
14
+ module Clusterers
15
+
16
# Implementation of a Hierarchical clusterer with complete linkage.
# Hierarchical clusterers create one cluster per element, and then
# progressively merge clusters, until the required number of clusters
# is reached.
# With complete linkage, the distance between two clusters is computed as
# the maximum distance between elements of each cluster.
class CompleteLinkage < SingleLinkage

  # Build a new clusterer, using data examples found in data_set.
  # Items will be clustered in "number_of_clusters" different
  # clusters. Simply delegates to the inherited implementation.
  def build(data_set, number_of_clusters)
    super
  end

  # Classifies the given data item, returning the cluster index it belongs
  # to (0-based). Simply delegates to the inherited implementation.
  def eval(data_item)
    super
  end

  protected

  # Complete linkage distance between two clusters given as collections of
  # item indexes: the largest read_distance_matrix value over every pair
  # of indexes. The running maximum starts at 0.
  def calc_index_clusters_distance(cluster_a, cluster_b)
    farthest = cluster_a.inject(0) do |outer_max, index_a|
      cluster_b.inject(outer_max) do |inner_max, index_b|
        candidate = read_distance_matrix(index_a, index_b)
        candidate > inner_max ? candidate : inner_max
      end
    end
    return farthest
  end

  # Largest distance between data_item and any item held in
  # cluster.data_items. The running maximum starts at 0.
  def distance_between_item_and_cluster(data_item, cluster)
    farthest = cluster.data_items.inject(0) do |current_max, member|
      candidate = distance(data_item, member)
      candidate > current_max ? candidate : current_max
    end
    return farthest
  end

end
61
+ end
62
+ end