ai4r 1.12 → 1.13

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (34) hide show
  1. data/README.rdoc +7 -12
  2. data/examples/classifiers/simple_linear_regression_example.csv +159 -0
  3. data/examples/classifiers/simple_linear_regression_example.rb +15 -0
  4. data/examples/clusterers/clusterer_example.rb +56 -0
  5. data/examples/neural_network/backpropagation_example.rb +2 -1
  6. data/lib/ai4r.rb +3 -1
  7. data/lib/ai4r/classifiers/id3.rb +6 -2
  8. data/lib/ai4r/classifiers/multilayer_perceptron.rb +1 -1
  9. data/lib/ai4r/classifiers/naive_bayes.rb +24 -21
  10. data/lib/ai4r/classifiers/simple_linear_regression.rb +118 -0
  11. data/lib/ai4r/clusterers/average_linkage.rb +3 -3
  12. data/lib/ai4r/clusterers/bisecting_k_means.rb +2 -2
  13. data/lib/ai4r/clusterers/centroid_linkage.rb +3 -3
  14. data/lib/ai4r/clusterers/clusterer.rb +0 -11
  15. data/lib/ai4r/clusterers/complete_linkage.rb +3 -3
  16. data/lib/ai4r/clusterers/diana.rb +2 -2
  17. data/lib/ai4r/clusterers/k_means.rb +123 -21
  18. data/lib/ai4r/clusterers/median_linkage.rb +3 -3
  19. data/lib/ai4r/clusterers/single_linkage.rb +4 -4
  20. data/lib/ai4r/clusterers/ward_linkage.rb +4 -4
  21. data/lib/ai4r/clusterers/ward_linkage_hierarchical.rb +48 -0
  22. data/lib/ai4r/clusterers/weighted_average_linkage.rb +3 -3
  23. data/lib/ai4r/data/data_set.rb +12 -3
  24. data/lib/ai4r/data/proximity.rb +22 -0
  25. data/lib/ai4r/neural_network/backpropagation.rb +26 -15
  26. data/test/classifiers/id3_test.rb +12 -0
  27. data/test/classifiers/multilayer_perceptron_test.rb +1 -1
  28. data/test/classifiers/naive_bayes_test.rb +18 -18
  29. data/test/classifiers/simple_linear_regression_test.rb +37 -0
  30. data/test/clusterers/k_means_test.rb +75 -8
  31. data/test/clusterers/ward_linkage_hierarchical_test.rb +81 -0
  32. data/test/data/data_set_test.rb +8 -0
  33. data/test/data/proximity_test.rb +7 -1
  34. metadata +96 -55
@@ -0,0 +1,118 @@
1
+ # Author:: Malav Bhavsar
2
+ # License:: MPL 1.1
3
+ # Project:: ai4r
4
+ # Url:: http://ai4r.org/
5
+ #
6
+ # You can redistribute it and/or modify it under the terms of
7
+ # the Mozilla Public License version 1.1 as published by the
8
+ # Mozilla Foundation at http://www.mozilla.org/MPL/MPL-1.1.txt
9
+
10
+ require File.dirname(__FILE__) + '/../data/data_set'
11
+ require File.dirname(__FILE__) + '/classifier'
12
+
13
+ module Ai4r
14
+ module Classifiers
15
+
16
+
17
# = Introduction
#
# This is an implementation of a Simple Linear Regression Classifier.
#
# The last attribute of the data set is treated as the value to predict.
# Every other attribute is tried as the single predictor of a least
# squares line, and the attribute giving the smallest sum of squared
# residuals is kept together with its slope and intercept.
#
# For further details regarding simple linear regression have a look at
# this link:
# http://en.wikipedia.org/wiki/Simple_linear_regression
#
#
# = How to use it
#
#   data = DataSet.new.parse_csv_with_labels "autoPrice.csv"
#   c = SimpleLinearRegression.new.
#     build data
#   c.eval([1,158,105.8,192.7,71.4,55.7,2844,136,3.19,3.4,8.5,110,5500,19,25])
#

class SimpleLinearRegression < Classifier

  attr_reader :attribute, :attribute_index, :slope, :intercept

  # Creates an untrained model: no attribute selected, slope and
  # intercept set to 0.
  def initialize
    @attribute = nil
    @attribute_index = 0
    @slope = 0
    @intercept = 0
  end

  # You can evaluate new data, predicting its value.
  # The prediction is intercept + slope * data[attribute_index].
  # e.g.
  #   c.eval([1,158,105.8,192.7,71.4,55.7,2844,136,3.19,3.4,8.5,110,5500,19,25])
  #   # => the predicted numeric value
  def eval(data)
    @intercept + @slope * data[@attribute_index]
  end

  # Gets the best attribute and does Linear Regression using it to find out the
  # slope and intercept.
  # Parameter data has to be an instance of DataSet; its last attribute is
  # the value being predicted.
  # Raises a RuntimeError when data is not a DataSet, when it has no items,
  # or when every candidate attribute is constant (no line can be fitted).
  # Returns self, so calls can be chained.
  def build(data)
    raise "Error instance must be passed" unless data.is_a?(DataSet)
    raise "Data should not be empty" if data.data_items.length == 0

    target_index = data.num_attributes - 1
    # Computed once; the original recomputed it for every attribute.
    means = data.get_mean_or_mode
    y_mean = means[target_index]

    # Choose best attribute
    min_msq = Float::MAX
    chosen = -1
    chosen_slope = 0.0 / 0.0 # Float::NAN
    chosen_intercept = 0.0 / 0.0 # Float::NAN

    data.data_labels.each do |attr_name|
      attr_index = data.get_index attr_name
      next if attr_index == target_index

      # Accumulate the centered sums needed for the least squares fit.
      x_mean = means[attr_index]
      sum_xy_diff = 0.0
      sum_x_diff_squared = 0.0
      sum_y_diff_squared = 0.0
      data.data_items.each do |instance|
        x_diff = instance[attr_index] - x_mean
        # The y deviation must come from the target column, not from
        # the predictor column.
        y_diff = instance[target_index] - y_mean
        sum_xy_diff += x_diff * y_diff
        sum_x_diff_squared += x_diff * x_diff
        sum_y_diff_squared += y_diff * y_diff
      end

      # A constant attribute cannot explain anything; skip it.
      next if sum_x_diff_squared == 0

      slope = sum_xy_diff / sum_x_diff_squared
      intercept = y_mean - slope * x_mean
      # Sum of squared residuals of the fitted line.
      msq = sum_y_diff_squared - slope * sum_xy_diff

      if msq < min_msq
        min_msq = msq
        chosen = attr_index
        chosen_slope = slope
        chosen_intercept = intercept
      end
    end

    raise "no useful attribute found" if chosen == -1

    @attribute = data.data_labels[chosen]
    @attribute_index = chosen
    @slope = chosen_slope
    @intercept = chosen_intercept
    self
  end
end
117
+ end
118
+ end
@@ -16,7 +16,7 @@ module Ai4r
16
16
  # Implementation of a Hierarchical clusterer with group average
17
17
  # linkage, AKA unweighted pair group method average or UPGMA (Everitt
18
18
  # et al., 2001 ; Jain and Dubes, 1988 ; Sokal and Michener, 1958).
19
- # Hierarchical clusteres create one cluster per element, and then
19
+ # Hierarchical clusterers create one cluster per element, and then
20
20
  # progressively merge clusters, until the required number of clusters
21
21
  # is reached.
22
22
  # With average linkage, the distance between a clusters cx and
@@ -29,8 +29,8 @@ module Ai4r
29
29
  parameters_info :distance_function =>
30
30
  "Custom implementation of distance function. " +
31
31
  "It must be a closure receiving two data items and return the " +
32
- "distance bewteen them. By default, this algorithm uses " +
33
- "ecuclidean distance of numeric attributes to the power of 2."
32
+ "distance between them. By default, this algorithm uses " +
33
+ "euclidean distance of numeric attributes to the power of 2."
34
34
 
35
35
  # Build a new clusterer, using data examples found in data_set.
36
36
  # Items will be clustered in "number_of_clusters" different
@@ -28,8 +28,8 @@ module Ai4r
28
28
  "build the clusterer. By default it is uncapped.",
29
29
  :distance_function => "Custom implementation of distance function. " +
30
30
  "It must be a closure receiving two data items and return the " +
31
- "distance bewteen them. By default, this algorithm uses " +
32
- "ecuclidean distance of numeric attributes to the power of 2.",
31
+ "distance between them. By default, this algorithm uses " +
32
+ "euclidean distance of numeric attributes to the power of 2.",
33
33
  :centroid_function => "Custom implementation to calculate the " +
34
34
  "centroid of a cluster. It must be a closure receiving an array of " +
35
35
  "data sets, and return an array of data items, representing the " +
@@ -17,7 +17,7 @@ module Ai4r
17
17
  # centroid linkage algorithm, aka unweighted pair group method
18
18
  # centroid (UPGMC) (Everitt et al., 2001 ; Jain and Dubes, 1988 ;
19
19
  # Sokal and Michener, 1958 )
20
- # Hierarchical clusteres create one cluster per element, and then
20
+ # Hierarchical clusterers create one cluster per element, and then
21
21
  # progressively merge clusters, until the required number of clusters
22
22
  # is reached.
23
23
  # The distance between clusters is the squared euclidean distance
@@ -32,8 +32,8 @@ module Ai4r
32
32
  parameters_info :distance_function =>
33
33
  "Custom implementation of distance function. " +
34
34
  "It must be a closure receiving two data items and return the " +
35
- "distance bewteen them. By default, this algorithm uses " +
36
- "ecuclidean distance of numeric attributes to the power of 2."
35
+ "distance between them. By default, this algorithm uses " +
36
+ "euclidean distance of numeric attributes to the power of 2."
37
37
 
38
38
  # Build a new clusterer, using data examples found in data_set.
39
39
  # Items will be clustered in "number_of_clusters" different
@@ -32,17 +32,6 @@ module Ai4r
32
32
  end
33
33
 
34
34
  protected
35
- # Usefull as a defult distance function for clustering algorithms
36
- def euclidean_distance(a, b)
37
- dist = 0.0
38
- a.each_index do |index|
39
- if a[index].is_a?(Numeric) && b[index].is_a?(Numeric)
40
- dist = dist + ((a[index]-b[index])*(a[index]-b[index]))
41
- end
42
- end
43
- return dist
44
- end
45
-
46
35
  def get_min_index(array)
47
36
  min = array.first
48
37
  index = 0
@@ -15,7 +15,7 @@ module Ai4r
15
15
 
16
16
  # Implementation of a Hierarchical clusterer with complete linkage (Everitt
17
17
  # et al., 2001 ; Jain and Dubes, 1988 ; Sorensen, 1948 ).
18
- # Hierarchical clusteres create one cluster per element, and then
18
+ # Hierarchical clusterers create one cluster per element, and then
19
19
  # progressively merge clusters, until the required number of clusters
20
20
  # is reached.
21
21
  # With complete linkage, the distance between two clusters is computed as
@@ -27,8 +27,8 @@ module Ai4r
27
27
  parameters_info :distance_function =>
28
28
  "Custom implementation of distance function. " +
29
29
  "It must be a closure receiving two data items and return the " +
30
- "distance bewteen them. By default, this algorithm uses " +
31
- "ecuclidean distance of numeric attributes to the power of 2."
30
+ "distance between them. By default, this algorithm uses " +
31
+ "euclidean distance of numeric attributes to the power of 2."
32
32
 
33
33
 
34
34
  # Build a new clusterer, using data examples found in data_set.
@@ -25,8 +25,8 @@ module Ai4r
25
25
  parameters_info :distance_function =>
26
26
  "Custom implementation of distance function. " +
27
27
  "It must be a closure receiving two data items and return the " +
28
- "distance bewteen them. By default, this algorithm uses " +
29
- "ecuclidean distance of numeric attributes to the power of 2."
28
+ "distance between them. By default, this algorithm uses " +
29
+ "euclidean distance of numeric attributes to the power of 2."
30
30
 
31
31
  def initialize
32
32
  @distance_function = lambda do |a,b|
@@ -8,6 +8,7 @@
8
8
  # Mozilla Foundation at http://www.mozilla.org/MPL/MPL-1.1.txt
9
9
 
10
10
  require File.dirname(__FILE__) + '/../data/data_set'
11
+ require File.dirname(__FILE__) + '/../data/proximity'
11
12
  require File.dirname(__FILE__) + '/../clusterers/clusterer'
12
13
 
13
14
  module Ai4r
@@ -27,22 +28,31 @@ module Ai4r
27
28
  "build the clusterer. By default it is uncapped.",
28
29
  :distance_function => "Custom implementation of distance function. " +
29
30
  "It must be a closure receiving two data items and return the " +
30
- "distance bewteen them. By default, this algorithm uses " +
31
- "ecuclidean distance of numeric attributes to the power of 2.",
31
+ "distance between them. By default, this algorithm uses " +
32
+ "euclidean distance of numeric attributes to the power of 2.",
32
33
  :centroid_function => "Custom implementation to calculate the " +
33
34
  "centroid of a cluster. It must be a closure receiving an array of " +
34
35
  "data sets, and return an array of data items, representing the " +
35
36
  "centroids of for each data set. " +
36
37
  "By default, this algorithm returns a data items using the mode "+
37
- "or mean of each attribute on each data set."
38
+ "or mean of each attribute on each data set.",
39
+ :centroid_indices => "Indices of data items (indexed from 0) to be " +
40
+ "the initial centroids. Otherwise, the initial centroids will be " +
41
+ "assigned randomly from the data set.",
42
+ :on_empty => "Action to take if a cluster becomes empty, with values " +
43
+ "'eliminate' (the default action, eliminate the empty cluster), " +
44
+ "'terminate' (terminate with error), 'random' (relocate the " +
45
+ "empty cluster to a random point), 'outlier' (relocate the " +
46
+ "empty cluster to the point furthest from its centroid)."
38
47
 
39
48
  def initialize
40
49
  @distance_function = nil
41
50
  @max_iterations = nil
42
- @old_centroids = nil
43
51
  @centroid_function = lambda do |data_sets|
44
52
  data_sets.collect{ |data_set| data_set.get_mean_or_mode}
45
53
  end
54
+ @centroid_indices = []
55
+ @on_empty = 'eliminate' # default if none specified
46
56
  end
47
57
 
48
58
 
@@ -52,6 +62,8 @@ module Ai4r
52
62
  def build(data_set, number_of_clusters)
53
63
  @data_set = data_set
54
64
  @number_of_clusters = number_of_clusters
65
+ raise ArgumentError, 'Length of centroid indices array differs from the specified number of clusters' unless @centroid_indices.empty? || @centroid_indices.length == @number_of_clusters
66
+ raise ArgumentError, 'Invalid value for on_empty' unless @on_empty == 'eliminate' || @on_empty == 'terminate' || @on_empty == 'random' || @on_empty == 'outlier'
55
67
  @iterations = 0
56
68
 
57
69
  calc_initial_centroids
@@ -73,32 +85,27 @@ module Ai4r
73
85
  # This function calculates the distance between 2 different
74
86
  # instances. By default, it returns the euclidean distance to the
75
87
  # power of 2.
76
- # You can provide a more convinient distance implementation:
88
+ # You can provide a more convenient distance implementation:
77
89
  #
78
90
  # 1- Overwriting this method
79
91
  #
80
92
  # 2- Providing a closure to the :distance_function parameter
81
93
  def distance(a, b)
82
94
  return @distance_function.call(a, b) if @distance_function
83
- return euclidean_distance(a, b)
95
+ return Ai4r::Data::Proximity.squared_euclidean_distance(
96
+ a.select {|att_a| att_a.is_a? Numeric} ,
97
+ b.select {|att_b| att_b.is_a? Numeric})
84
98
  end
85
99
 
86
100
  protected
87
101
 
88
102
  def calc_initial_centroids
89
- @centroids = []
90
- tried_indexes = []
91
- while @centroids.length < @number_of_clusters &&
92
- tried_indexes.length < @data_set.data_items.length
93
- random_index = rand(@data_set.data_items.length)
94
- if !tried_indexes.include?(random_index)
95
- tried_indexes << random_index
96
- if !@centroids.include? @data_set.data_items[random_index]
97
- @centroids << @data_set.data_items[random_index]
98
- end
99
- end
103
+ @centroids, @old_centroids = [], nil
104
+ if @centroid_indices.empty?
105
+ populate_centroids('random')
106
+ else
107
+ populate_centroids('indices')
100
108
  end
101
- @number_of_clusters = @centroids.length
102
109
  end
103
110
 
104
111
  def stop_criteria_met
@@ -110,9 +117,14 @@ module Ai4r
110
117
  @clusters = Array.new(@number_of_clusters) do
111
118
  Ai4r::Data::DataSet.new :data_labels => @data_set.data_labels
112
119
  end
113
- @data_set.data_items.each do |data_item|
114
- @clusters[eval(data_item)] << data_item
120
+ @cluster_indices = Array.new(@number_of_clusters) {[]}
121
+
122
+ @data_set.data_items.each_with_index do |data_item, data_index|
123
+ c = eval(data_item)
124
+ @clusters[c] << data_item
125
+ @cluster_indices[c] << data_index if @on_empty == 'outlier'
115
126
  end
127
+ manage_empty_clusters if has_empty_cluster?
116
128
  end
117
129
 
118
130
  def recompute_centroids
@@ -120,7 +132,97 @@ module Ai4r
120
132
  @iterations += 1
121
133
  @centroids = @centroid_function.call(@clusters)
122
134
  end
123
-
135
+
136
# Appends data items to @centroids according to populate_method and then
# sets @number_of_clusters to the resulting number of centroids.
#
# populate_method:
# 'random'  - picks random data items; used for the initial assignment
#             (without the :centroid_indices option) and for relocating
#             empty clusters (with :on_empty option 'random').
# 'indices' - uses the items at @centroid_indices; initial assignment
#             only. Raises ArgumentError on an index that is not an
#             Integer inside the data set bounds.
# 'outlier' - picks the items furthest from their current centroid;
#             used for relocating empty clusters only (with :on_empty
#             option 'outlier').
#
# number_of_clusters is the number of centroids to reach; it may not be
# reached when there are not enough distinct data items.
def populate_centroids(populate_method, number_of_clusters=@number_of_clusters)
  visited = []
  # True while more centroids are wanted and untried indices may remain.
  wanted = lambda do
    @centroids.length < number_of_clusters &&
      visited.length < @data_set.data_items.length
  end
  # Registers an index once; adds its item unless an equal item is
  # already a centroid.
  consider = lambda do |index|
    next if visited.include?(index)
    visited << index
    candidate = @data_set.data_items[index]
    @centroids << candidate unless @centroids.include?(candidate)
  end

  case populate_method
  when 'random'
    consider.call(rand(@data_set.data_items.length)) while wanted.call
  when 'indices'
    @centroid_indices.each do |index|
      in_range = (index.is_a? Integer) && index >=0 && index < @data_set.data_items.length
      raise ArgumentError, "Invalid centroid index #{index}" unless in_range
      consider.call(index)
    end
  when 'outlier'
    ranked = sort_data_indices_by_dist_to_centroid
    position = ranked.length - 1 # the last item is the furthest from its centroid
    while wanted.call
      consider.call(ranked[position])
      break unless position > 0
      position -= 1
    end
  end
  @number_of_clusters = @centroids.length
end
177
+
178
# Ranks the clustered points by their distance to the centroid of the
# cluster they belong to. Relies on @cluster_indices to map a position
# inside a cluster back to an index of the original data set.
# Returns those data indices ordered from the nearest to the furthest.
def sort_data_indices_by_dist_to_centroid
  dist_by_index = {}
  @clusters.each_with_index do |cluster, c|
    centroid = @centroids[c]
    cluster.data_items.each_with_index do |data_item, position|
      dist = distance(data_item, centroid)
      dist_by_index[@cluster_indices[c][position]] = dist
    end
  end
  # ascending by distance, keeping only the data indices
  dist_by_index.sort_by { |_, dist| dist }.map { |data_index, _| data_index }
end
194
+
195
# Tells whether any of the first @number_of_clusters clusters holds no
# data items. Every cluster is inspected before answering.
def has_empty_cluster?
  emptiness = @number_of_clusters.times.map do |c|
    @clusters[c].data_items.empty?
  end
  emptiness.any?
end
202
+
203
# Applies the on_empty policy once an empty cluster has been detected.
#
# 'terminate' - nothing is done here. (The empty cluster will be assigned
#               a nil centroid, and then calculating the distance from
#               this centroid to another point will raise an exception.)
# 'eliminate' - empty clusters are dropped.
# otherwise   - empty clusters are dropped, centroids are added with
#               populate_centroids (using the policy name as method)
#               until the previous cluster count is reached again, and
#               memberships are recalculated.
def manage_empty_clusters
  policy = self.on_empty
  unless policy == 'terminate'
    initial_number_of_clusters = @number_of_clusters
    eliminate_empty_clusters
    unless policy == 'eliminate'
      # Add initial_number_of_clusters - @number_of_clusters centroids
      populate_centroids(policy, initial_number_of_clusters)
      calculate_membership_clusters
    end
  end
end
212
+
213
# Drops every cluster without data items, together with its centroid and
# its entry in @cluster_indices, and updates @number_of_clusters to the
# number of clusters left.
def eliminate_empty_clusters
  survivors = @number_of_clusters.times.reject do |i|
    @clusters[i].data_items.empty?
  end
  # The right-hand side is fully evaluated before any variable is replaced.
  @clusters, @centroids, @cluster_indices =
    survivors.map { |i| @clusters[i] },
    survivors.map { |i| @centroids[i] },
    survivors.map { |i| @cluster_indices[i] }
  @number_of_clusters = @centroids.length
end
225
+
124
226
  end
125
227
  end
126
228
  end
@@ -16,7 +16,7 @@ module Ai4r
16
16
  # Implementation of an Agglomerative Hierarchical clusterer with
17
17
  # median linkage algorithm, aka weighted pair group method centroid
18
18
  # or WPGMC (Everitt et al., 2001 ; Gower, 1967 ; Jain and Dubes, 1988 ).
19
- # Hierarchical clusteres create one cluster per element, and then
19
+ # Hierarchical clusterers create one cluster per element, and then
20
20
  # progressively merge clusters, until the required number of clusters
21
21
  # is reached.
22
22
  # Similar to centroid linkages, but using fix weight:
@@ -29,8 +29,8 @@ module Ai4r
29
29
  parameters_info :distance_function =>
30
30
  "Custom implementation of distance function. " +
31
31
  "It must be a closure receiving two data items and return the " +
32
- "distance bewteen them. By default, this algorithm uses " +
33
- "ecuclidean distance of numeric attributes to the power of 2."
32
+ "distance between them. By default, this algorithm uses " +
33
+ "euclidean distance of numeric attributes to the power of 2."
34
34
 
35
35
  # Build a new clusterer, using data examples found in data_set.
36
36
  # Items will be clustered in "number_of_clusters" different