ai4r 1.12 → 1.13

Sign up to get free protection for your applications and to get access to all the features.
Files changed (34) hide show
  1. data/README.rdoc +7 -12
  2. data/examples/classifiers/simple_linear_regression_example.csv +159 -0
  3. data/examples/classifiers/simple_linear_regression_example.rb +15 -0
  4. data/examples/clusterers/clusterer_example.rb +56 -0
  5. data/examples/neural_network/backpropagation_example.rb +2 -1
  6. data/lib/ai4r.rb +3 -1
  7. data/lib/ai4r/classifiers/id3.rb +6 -2
  8. data/lib/ai4r/classifiers/multilayer_perceptron.rb +1 -1
  9. data/lib/ai4r/classifiers/naive_bayes.rb +24 -21
  10. data/lib/ai4r/classifiers/simple_linear_regression.rb +118 -0
  11. data/lib/ai4r/clusterers/average_linkage.rb +3 -3
  12. data/lib/ai4r/clusterers/bisecting_k_means.rb +2 -2
  13. data/lib/ai4r/clusterers/centroid_linkage.rb +3 -3
  14. data/lib/ai4r/clusterers/clusterer.rb +0 -11
  15. data/lib/ai4r/clusterers/complete_linkage.rb +3 -3
  16. data/lib/ai4r/clusterers/diana.rb +2 -2
  17. data/lib/ai4r/clusterers/k_means.rb +123 -21
  18. data/lib/ai4r/clusterers/median_linkage.rb +3 -3
  19. data/lib/ai4r/clusterers/single_linkage.rb +4 -4
  20. data/lib/ai4r/clusterers/ward_linkage.rb +4 -4
  21. data/lib/ai4r/clusterers/ward_linkage_hierarchical.rb +48 -0
  22. data/lib/ai4r/clusterers/weighted_average_linkage.rb +3 -3
  23. data/lib/ai4r/data/data_set.rb +12 -3
  24. data/lib/ai4r/data/proximity.rb +22 -0
  25. data/lib/ai4r/neural_network/backpropagation.rb +26 -15
  26. data/test/classifiers/id3_test.rb +12 -0
  27. data/test/classifiers/multilayer_perceptron_test.rb +1 -1
  28. data/test/classifiers/naive_bayes_test.rb +18 -18
  29. data/test/classifiers/simple_linear_regression_test.rb +37 -0
  30. data/test/clusterers/k_means_test.rb +75 -8
  31. data/test/clusterers/ward_linkage_hierarchical_test.rb +81 -0
  32. data/test/data/data_set_test.rb +8 -0
  33. data/test/data/proximity_test.rb +7 -1
  34. metadata +96 -55
@@ -0,0 +1,118 @@
1
+ # Author:: Malav Bhavsar
2
+ # License:: MPL 1.1
3
+ # Project:: ai4r
4
+ # Url:: http://ai4r.org/
5
+ #
6
+ # You can redistribute it and/or modify it under the terms of
7
+ # the Mozilla Public License version 1.1 as published by the
8
+ # Mozilla Foundation at http://www.mozilla.org/MPL/MPL-1.1.txt
9
+
10
+ require File.dirname(__FILE__) + '/../data/data_set'
11
+ require File.dirname(__FILE__) + '/classifier'
12
+
13
+ module Ai4r
14
+ module Classifiers
15
+
16
+
17
+ # = Introduction
18
+ #
19
+ # This is an implementation of a Simple Linear Regression Classifier.
20
+ #
21
+ # For further details regarding Simple Linear Regression have a look at these links:
22
+ # http://en.wikipedia.org/wiki/Simple_linear_regression
23
+ # http://en.wikipedia.org/wiki/Linear_regression
24
+ #
25
+ #
26
+ # = How to use it
27
+ #
28
+ # data = DataSet.new.parse_csv_with_labels "autoPrice.csv"
29
+ # c = SimpleLinearRegression.new.
30
+ # build data
31
+ # c.eval([1,158,105.8,192.7,71.4,55.7,2844,136,3.19,3.4,8.5,110,5500,19,25])
32
+ #
33
+
34
+ class SimpleLinearRegression < Classifier
35
+
36
+ attr_reader :attribute, :attribute_index, :slope, :intercept
37
+
38
+ def initialize
39
+ @attribute = nil
40
+ @attribute_index = 0
41
+ @slope = 0
42
+ @intercept = 0
43
+ end
44
+
45
+ # You can evaluate new data, predicting its numeric value.
46
+ # e.g.
47
+ # c.eval([1,158,105.8,192.7,71.4,55.7,2844,136,3.19,3.4,8.5,110,5500,19,25])
48
+ # => 11876.96774193548
49
+ def eval(data)
50
+ @intercept + @slope * data[@attribute_index]
51
+ end
52
+
53
+ # Gets the best attribute and does Linear Regression using it to find out the
54
+ # slope and intercept.
55
+ # Parameter data has to be an instance of DataSet
56
+ def build(data)
57
+ raise "Error instance must be passed" unless data.is_a?(DataSet)
58
+ raise "Data should not be empty" if data.data_items.length == 0
59
+ y_mean = data.get_mean_or_mode[data.num_attributes - 1]
60
+
61
+ # Choose best attribute
62
+ min_msq = Float::MAX
63
+ attribute = nil
64
+ chosen = -1
65
+ chosen_slope = 0.0 / 0.0 # Float::NAN
66
+ chosen_intercept = 0.0 / 0.0 # Float::NAN
67
+
68
+ data.data_labels.each do |attr_name|
69
+ attr_index = data.get_index attr_name
70
+ if attr_index != data.num_attributes-1
71
+ # Compute slope and intercept
72
+ x_mean = data.get_mean_or_mode[attr_index]
73
+ sum_x_diff_squared = 0
74
+ sum_y_diff_squared = 0
75
+ slope = 0
76
+ data.data_items.map do |instance|
77
+ x_diff = instance[attr_index] - x_mean
78
+ y_diff = instance[attr_index] - y_mean
79
+ slope += x_diff * y_diff
80
+ sum_x_diff_squared += x_diff * x_diff
81
+ sum_y_diff_squared += y_diff * y_diff
82
+ end
83
+
84
+ if sum_x_diff_squared == 0
85
+ next
86
+ end
87
+
88
+ numerator = slope
89
+ slope /= sum_x_diff_squared
90
+ intercept = y_mean - slope * x_mean
91
+ msq = sum_y_diff_squared - slope * numerator
92
+
93
+ if msq < min_msq
94
+ min_msq = msq
95
+ chosen = attr_index
96
+ chosen_slope = slope
97
+ chosen_intercept = intercept
98
+ end
99
+ end
100
+ end
101
+
102
+ if chosen == -1
103
+ raise "no useful attribute found"
104
+ @attribute = nil
105
+ @attribute_index = 0
106
+ @slope = 0
107
+ @intercept = y_mean
108
+ else
109
+ @attribute = data.data_labels[chosen]
110
+ @attribute_index = chosen
111
+ @slope = chosen_slope
112
+ @intercept = chosen_intercept
113
+ end
114
+ return self
115
+ end
116
+ end
117
+ end
118
+ end
@@ -16,7 +16,7 @@ module Ai4r
16
16
  # Implementation of a Hierarchical clusterer with group average
17
17
  # linkage, AKA unweighted pair group method average or UPGMA (Everitt
18
18
  # et al., 2001 ; Jain and Dubes, 1988 ; Sokal and Michener, 1958).
19
- # Hierarchical clusteres create one cluster per element, and then
19
+ # Hierarchical clusterers create one cluster per element, and then
20
20
  # progressively merge clusters, until the required number of clusters
21
21
  # is reached.
22
22
  # With average linkage, the distance between a cluster cx and
@@ -29,8 +29,8 @@ module Ai4r
29
29
  parameters_info :distance_function =>
30
30
  "Custom implementation of distance function. " +
31
31
  "It must be a closure receiving two data items and return the " +
32
- "distance bewteen them. By default, this algorithm uses " +
33
- "ecuclidean distance of numeric attributes to the power of 2."
32
+ "distance between them. By default, this algorithm uses " +
33
+ "euclidean distance of numeric attributes to the power of 2."
34
34
 
35
35
  # Build a new clusterer, using data examples found in data_set.
36
36
  # Items will be clustered in "number_of_clusters" different
@@ -28,8 +28,8 @@ module Ai4r
28
28
  "build the clusterer. By default it is uncapped.",
29
29
  :distance_function => "Custom implementation of distance function. " +
30
30
  "It must be a closure receiving two data items and return the " +
31
- "distance bewteen them. By default, this algorithm uses " +
32
- "ecuclidean distance of numeric attributes to the power of 2.",
31
+ "distance between them. By default, this algorithm uses " +
32
+ "euclidean distance of numeric attributes to the power of 2.",
33
33
  :centroid_function => "Custom implementation to calculate the " +
34
34
  "centroid of a cluster. It must be a closure receiving an array of " +
35
35
  "data sets, and return an array of data items, representing the " +
@@ -17,7 +17,7 @@ module Ai4r
17
17
  # centroid linkage algorithm, aka unweighted pair group method
18
18
  # centroid (UPGMC) (Everitt et al., 2001 ; Jain and Dubes, 1988 ;
19
19
  # Sokal and Michener, 1958 )
20
- # Hierarchical clusteres create one cluster per element, and then
20
+ # Hierarchical clusterers create one cluster per element, and then
21
21
  # progressively merge clusters, until the required number of clusters
22
22
  # is reached.
23
23
  # The distance between clusters is the squared euclidean distance
@@ -32,8 +32,8 @@ module Ai4r
32
32
  parameters_info :distance_function =>
33
33
  "Custom implementation of distance function. " +
34
34
  "It must be a closure receiving two data items and return the " +
35
- "distance bewteen them. By default, this algorithm uses " +
36
- "ecuclidean distance of numeric attributes to the power of 2."
35
+ "distance between them. By default, this algorithm uses " +
36
+ "euclidean distance of numeric attributes to the power of 2."
37
37
 
38
38
  # Build a new clusterer, using data examples found in data_set.
39
39
  # Items will be clustered in "number_of_clusters" different
@@ -32,17 +32,6 @@ module Ai4r
32
32
  end
33
33
 
34
34
  protected
35
- # Usefull as a defult distance function for clustering algorithms
36
- def euclidean_distance(a, b)
37
- dist = 0.0
38
- a.each_index do |index|
39
- if a[index].is_a?(Numeric) && b[index].is_a?(Numeric)
40
- dist = dist + ((a[index]-b[index])*(a[index]-b[index]))
41
- end
42
- end
43
- return dist
44
- end
45
-
46
35
  def get_min_index(array)
47
36
  min = array.first
48
37
  index = 0
@@ -15,7 +15,7 @@ module Ai4r
15
15
 
16
16
  # Implementation of a Hierarchical clusterer with complete linkage (Everitt
17
17
  # et al., 2001 ; Jain and Dubes, 1988 ; Sorensen, 1948 ).
18
- # Hierarchical clusteres create one cluster per element, and then
18
+ # Hierarchical clusterers create one cluster per element, and then
19
19
  # progressively merge clusters, until the required number of clusters
20
20
  # is reached.
21
21
  # With complete linkage, the distance between two clusters is computed as
@@ -27,8 +27,8 @@ module Ai4r
27
27
  parameters_info :distance_function =>
28
28
  "Custom implementation of distance function. " +
29
29
  "It must be a closure receiving two data items and return the " +
30
- "distance bewteen them. By default, this algorithm uses " +
31
- "ecuclidean distance of numeric attributes to the power of 2."
30
+ "distance between them. By default, this algorithm uses " +
31
+ "euclidean distance of numeric attributes to the power of 2."
32
32
 
33
33
 
34
34
  # Build a new clusterer, using data examples found in data_set.
@@ -25,8 +25,8 @@ module Ai4r
25
25
  parameters_info :distance_function =>
26
26
  "Custom implementation of distance function. " +
27
27
  "It must be a closure receiving two data items and return the " +
28
- "distance bewteen them. By default, this algorithm uses " +
29
- "ecuclidean distance of numeric attributes to the power of 2."
28
+ "distance between them. By default, this algorithm uses " +
29
+ "euclidean distance of numeric attributes to the power of 2."
30
30
 
31
31
  def initialize
32
32
  @distance_function = lambda do |a,b|
@@ -8,6 +8,7 @@
8
8
  # Mozilla Foundation at http://www.mozilla.org/MPL/MPL-1.1.txt
9
9
 
10
10
  require File.dirname(__FILE__) + '/../data/data_set'
11
+ require File.dirname(__FILE__) + '/../data/proximity'
11
12
  require File.dirname(__FILE__) + '/../clusterers/clusterer'
12
13
 
13
14
  module Ai4r
@@ -27,22 +28,31 @@ module Ai4r
27
28
  "build the clusterer. By default it is uncapped.",
28
29
  :distance_function => "Custom implementation of distance function. " +
29
30
  "It must be a closure receiving two data items and return the " +
30
- "distance bewteen them. By default, this algorithm uses " +
31
- "ecuclidean distance of numeric attributes to the power of 2.",
31
+ "distance between them. By default, this algorithm uses " +
32
+ "euclidean distance of numeric attributes to the power of 2.",
32
33
  :centroid_function => "Custom implementation to calculate the " +
33
34
  "centroid of a cluster. It must be a closure receiving an array of " +
34
35
  "data sets, and return an array of data items, representing the " +
35
36
  "centroids of for each data set. " +
36
37
  "By default, this algorithm returns a data items using the mode "+
37
- "or mean of each attribute on each data set."
38
+ "or mean of each attribute on each data set.",
39
+ :centroid_indices => "Indices of data items (indexed from 0) to be " +
40
+ "the initial centroids. Otherwise, the initial centroids will be " +
41
+ "assigned randomly from the data set.",
42
+ :on_empty => "Action to take if a cluster becomes empty, with values " +
43
+ "'eliminate' (the default action, eliminate the empty cluster), " +
44
+ "'terminate' (terminate with error), 'random' (relocate the " +
45
+ "empty cluster to a random point), 'outlier' (relocate the " +
46
+ "empty cluster to the point furthest from its centroid)."
38
47
 
39
48
  def initialize
40
49
  @distance_function = nil
41
50
  @max_iterations = nil
42
- @old_centroids = nil
43
51
  @centroid_function = lambda do |data_sets|
44
52
  data_sets.collect{ |data_set| data_set.get_mean_or_mode}
45
53
  end
54
+ @centroid_indices = []
55
+ @on_empty = 'eliminate' # default if none specified
46
56
  end
47
57
 
48
58
 
@@ -52,6 +62,8 @@ module Ai4r
52
62
  def build(data_set, number_of_clusters)
53
63
  @data_set = data_set
54
64
  @number_of_clusters = number_of_clusters
65
+ raise ArgumentError, 'Length of centroid indices array differs from the specified number of clusters' unless @centroid_indices.empty? || @centroid_indices.length == @number_of_clusters
66
+ raise ArgumentError, 'Invalid value for on_empty' unless @on_empty == 'eliminate' || @on_empty == 'terminate' || @on_empty == 'random' || @on_empty == 'outlier'
55
67
  @iterations = 0
56
68
 
57
69
  calc_initial_centroids
@@ -73,32 +85,27 @@ module Ai4r
73
85
  # This function calculates the distance between 2 different
74
86
  # instances. By default, it returns the euclidean distance to the
75
87
  # power of 2.
76
- # You can provide a more convinient distance implementation:
88
+ # You can provide a more convenient distance implementation:
77
89
  #
78
90
  # 1- Overwriting this method
79
91
  #
80
92
  # 2- Providing a closure to the :distance_function parameter
81
93
  def distance(a, b)
82
94
  return @distance_function.call(a, b) if @distance_function
83
- return euclidean_distance(a, b)
95
+ return Ai4r::Data::Proximity.squared_euclidean_distance(
96
+ a.select {|att_a| att_a.is_a? Numeric} ,
97
+ b.select {|att_b| att_b.is_a? Numeric})
84
98
  end
85
99
 
86
100
  protected
87
101
 
88
102
  def calc_initial_centroids
89
- @centroids = []
90
- tried_indexes = []
91
- while @centroids.length < @number_of_clusters &&
92
- tried_indexes.length < @data_set.data_items.length
93
- random_index = rand(@data_set.data_items.length)
94
- if !tried_indexes.include?(random_index)
95
- tried_indexes << random_index
96
- if !@centroids.include? @data_set.data_items[random_index]
97
- @centroids << @data_set.data_items[random_index]
98
- end
99
- end
103
+ @centroids, @old_centroids = [], nil
104
+ if @centroid_indices.empty?
105
+ populate_centroids('random')
106
+ else
107
+ populate_centroids('indices')
100
108
  end
101
- @number_of_clusters = @centroids.length
102
109
  end
103
110
 
104
111
  def stop_criteria_met
@@ -110,9 +117,14 @@ module Ai4r
110
117
  @clusters = Array.new(@number_of_clusters) do
111
118
  Ai4r::Data::DataSet.new :data_labels => @data_set.data_labels
112
119
  end
113
- @data_set.data_items.each do |data_item|
114
- @clusters[eval(data_item)] << data_item
120
+ @cluster_indices = Array.new(@number_of_clusters) {[]}
121
+
122
+ @data_set.data_items.each_with_index do |data_item, data_index|
123
+ c = eval(data_item)
124
+ @clusters[c] << data_item
125
+ @cluster_indices[c] << data_index if @on_empty == 'outlier'
115
126
  end
127
+ manage_empty_clusters if has_empty_cluster?
116
128
  end
117
129
 
118
130
  def recompute_centroids
@@ -120,7 +132,97 @@ module Ai4r
120
132
  @iterations += 1
121
133
  @centroids = @centroid_function.call(@clusters)
122
134
  end
123
-
135
+
136
+ def populate_centroids(populate_method, number_of_clusters=@number_of_clusters)
137
+ tried_indexes = []
138
+ case populate_method
139
+ when 'random' # for initial assignment (without the :centroid_indices option) and for reassignment of empty cluster centroids (with :on_empty option 'random')
140
+ while @centroids.length < number_of_clusters &&
141
+ tried_indexes.length < @data_set.data_items.length
142
+ random_index = rand(@data_set.data_items.length)
143
+ if !tried_indexes.include?(random_index)
144
+ tried_indexes << random_index
145
+ if !@centroids.include? @data_set.data_items[random_index]
146
+ @centroids << @data_set.data_items[random_index]
147
+ end
148
+ end
149
+ end
150
+ when 'indices' # for initial assignment only (with the :centroid_indices option)
151
+ @centroid_indices.each do |index|
152
+ raise ArgumentError, "Invalid centroid index #{index}" unless (index.is_a? Integer) && index >=0 && index < @data_set.data_items.length
153
+ if !tried_indexes.include?(index)
154
+ tried_indexes << index
155
+ if !@centroids.include? @data_set.data_items[index]
156
+ @centroids << @data_set.data_items[index]
157
+ end
158
+ end
159
+ end
160
+ when 'outlier' # for reassignment of empty cluster centroids only (with :on_empty option 'outlier')
161
+ sorted_data_indices = sort_data_indices_by_dist_to_centroid
162
+ i = sorted_data_indices.length - 1 # the last item is the furthest from its centroid
163
+ while @centroids.length < number_of_clusters &&
164
+ tried_indexes.length < @data_set.data_items.length
165
+ outlier_index = sorted_data_indices[i]
166
+ if !tried_indexes.include?(outlier_index)
167
+ tried_indexes << outlier_index
168
+ if !@centroids.include? @data_set.data_items[outlier_index]
169
+ @centroids << @data_set.data_items[outlier_index]
170
+ end
171
+ end
172
+ i > 0 ? i -= 1 : break
173
+ end
174
+ end
175
+ @number_of_clusters = @centroids.length
176
+ end
177
+
178
+ # Sort cluster points by distance to assigned centroid. Utilizes @cluster_indices.
179
+ # Returns indices, sorted in order from the nearest to furthest.
180
+ def sort_data_indices_by_dist_to_centroid
181
+ sorted_data_indices = []
182
+ h = {}
183
+ @clusters.each_with_index do |cluster, c|
184
+ centroid = @centroids[c]
185
+ cluster.data_items.each_with_index do |data_item, i|
186
+ dist_to_centroid = distance(data_item, centroid)
187
+ data_index = @cluster_indices[c][i]
188
+ h[data_index] = dist_to_centroid
189
+ end
190
+ end
191
+ # sort hash of {index => dist to centroid} by dist to centroid (ascending) and then return an array of only the indices
192
+ sorted_data_indices = h.sort_by{|k,v| v}.collect{|a,b| a}
193
+ end
194
+
195
+ def has_empty_cluster?
196
+ found_empty = false
197
+ @number_of_clusters.times do |c|
198
+ found_empty = true if @clusters[c].data_items.empty?
199
+ end
200
+ found_empty
201
+ end
202
+
203
+ def manage_empty_clusters
204
+ return if self.on_empty == 'terminate' # Do nothing to terminate with error. (The empty cluster will be assigned a nil centroid, and then calculating the distance from this centroid to another point will raise an exception.)
205
+
206
+ initial_number_of_clusters = @number_of_clusters
207
+ eliminate_empty_clusters
208
+ return if self.on_empty == 'eliminate'
209
+ populate_centroids(self.on_empty, initial_number_of_clusters) # Add initial_number_of_clusters - @number_of_clusters
210
+ calculate_membership_clusters
211
+ end
212
+
213
+ def eliminate_empty_clusters
214
+ old_clusters, old_centroids, old_cluster_indices = @clusters, @centroids, @cluster_indices
215
+ @clusters, @centroids, @cluster_indices = [], [], []
216
+ @number_of_clusters.times do |i|
217
+ if !old_clusters[i].data_items.empty?
218
+ @clusters << old_clusters[i]
219
+ @cluster_indices << old_cluster_indices[i]
220
+ @centroids << old_centroids[i]
221
+ end
222
+ end
223
+ @number_of_clusters = @centroids.length
224
+ end
225
+
124
226
  end
125
227
  end
126
228
  end
@@ -16,7 +16,7 @@ module Ai4r
16
16
  # Implementation of an Agglomerative Hierarchical clusterer with
17
17
  # median linkage algorithm, aka weighted pair group method centroid
18
18
  # or WPGMC (Everitt et al., 2001 ; Gower, 1967 ; Jain and Dubes, 1988 ).
19
- # Hierarchical clusteres create one cluster per element, and then
19
+ # Hierarchical clusterers create one cluster per element, and then
20
20
  # progressively merge clusters, until the required number of clusters
21
21
  # is reached.
22
22
  # Similar to centroid linkage, but using fixed weights:
@@ -29,8 +29,8 @@ module Ai4r
29
29
  parameters_info :distance_function =>
30
30
  "Custom implementation of distance function. " +
31
31
  "It must be a closure receiving two data items and return the " +
32
- "distance bewteen them. By default, this algorithm uses " +
33
- "ecuclidean distance of numeric attributes to the power of 2."
32
+ "distance between them. By default, this algorithm uses " +
33
+ "euclidean distance of numeric attributes to the power of 2."
34
34
 
35
35
  # Build a new clusterer, using data examples found in data_set.
36
36
  # Items will be clustered in "number_of_clusters" different