ai4r 1.12 → 1.13

Sign up to get free protection for your applications and to get access to all the features.
Files changed (34) hide show
  1. data/README.rdoc +7 -12
  2. data/examples/classifiers/simple_linear_regression_example.csv +159 -0
  3. data/examples/classifiers/simple_linear_regression_example.rb +15 -0
  4. data/examples/clusterers/clusterer_example.rb +56 -0
  5. data/examples/neural_network/backpropagation_example.rb +2 -1
  6. data/lib/ai4r.rb +3 -1
  7. data/lib/ai4r/classifiers/id3.rb +6 -2
  8. data/lib/ai4r/classifiers/multilayer_perceptron.rb +1 -1
  9. data/lib/ai4r/classifiers/naive_bayes.rb +24 -21
  10. data/lib/ai4r/classifiers/simple_linear_regression.rb +118 -0
  11. data/lib/ai4r/clusterers/average_linkage.rb +3 -3
  12. data/lib/ai4r/clusterers/bisecting_k_means.rb +2 -2
  13. data/lib/ai4r/clusterers/centroid_linkage.rb +3 -3
  14. data/lib/ai4r/clusterers/clusterer.rb +0 -11
  15. data/lib/ai4r/clusterers/complete_linkage.rb +3 -3
  16. data/lib/ai4r/clusterers/diana.rb +2 -2
  17. data/lib/ai4r/clusterers/k_means.rb +123 -21
  18. data/lib/ai4r/clusterers/median_linkage.rb +3 -3
  19. data/lib/ai4r/clusterers/single_linkage.rb +4 -4
  20. data/lib/ai4r/clusterers/ward_linkage.rb +4 -4
  21. data/lib/ai4r/clusterers/ward_linkage_hierarchical.rb +48 -0
  22. data/lib/ai4r/clusterers/weighted_average_linkage.rb +3 -3
  23. data/lib/ai4r/data/data_set.rb +12 -3
  24. data/lib/ai4r/data/proximity.rb +22 -0
  25. data/lib/ai4r/neural_network/backpropagation.rb +26 -15
  26. data/test/classifiers/id3_test.rb +12 -0
  27. data/test/classifiers/multilayer_perceptron_test.rb +1 -1
  28. data/test/classifiers/naive_bayes_test.rb +18 -18
  29. data/test/classifiers/simple_linear_regression_test.rb +37 -0
  30. data/test/clusterers/k_means_test.rb +75 -8
  31. data/test/clusterers/ward_linkage_hierarchical_test.rb +81 -0
  32. data/test/data/data_set_test.rb +8 -0
  33. data/test/data/proximity_test.rb +7 -1
  34. metadata +96 -55
@@ -16,7 +16,7 @@ module Ai4r
16
16
 
17
17
  # Implementation of a Hierarchical clusterer with single linkage (Everitt et
18
18
  # al., 2001 ; Johnson, 1967 ; Jain and Dubes, 1988 ; Sneath, 1957 )
19
- # Hierarchical clusteres create one cluster per element, and then
19
+ # Hierarchical clusterers create one cluster per element, and then
20
20
  # progressively merge clusters, until the required number of clusters
21
21
  # is reached.
22
22
  # With single linkage, the distance between two clusters is computed as the
@@ -30,8 +30,8 @@ module Ai4r
30
30
  parameters_info :distance_function =>
31
31
  "Custom implementation of distance function. " +
32
32
  "It must be a closure receiving two data items and return the " +
33
- "distance bewteen them. By default, this algorithm uses " +
34
- "ecuclidean distance of numeric attributes to the power of 2."
33
+ "distance between them. By default, this algorithm uses " +
34
+ "euclidean distance of numeric attributes to the power of 2."
35
35
 
36
36
  def initialize
37
37
  @distance_function = lambda do |a,b|
@@ -105,7 +105,7 @@ module Ai4r
105
105
  end
106
106
 
107
107
  # ci and cj are the indexes of the clusters that are going to
108
- # be merged. We need to remove distances from/to ci and ci,
108
+ # be merged. We need to remove distances from/to ci and cj,
109
109
  # and add distances from/to new cluster (ci U cj)
110
110
  def update_distance_matrix(ci, cj)
111
111
  ci, cj = cj, ci if cj > ci
@@ -16,10 +16,10 @@ module Ai4r
16
16
  # Implementation of an Agglomerative Hierarchical clusterer with
17
17
  # Ward's method linkage algorithm, aka the minimum variance method (Everitt
18
18
  # et al., 2001 ; Jain and Dubes, 1988 ; Ward, 1963 ).
19
- # Hierarchical clusteres create one cluster per element, and then
19
+ # Hierarchical clusterers create one cluster per element, and then
20
20
  # progressively merge clusters, until the required number of clusters
21
21
  # is reached.
22
- # The objective of this method is to minime the variance.
22
+ # The objective of this method is to minimize the variance.
23
23
  #
24
24
  # D(cx, (ci U cj)) = (ni/(ni+nj+nx))*D(cx, ci) +
25
25
  # (nj/(ni+nj+nx))*D(cx, cj) -
@@ -29,8 +29,8 @@ module Ai4r
29
29
  parameters_info :distance_function =>
30
30
  "Custom implementation of distance function. " +
31
31
  "It must be a closure receiving two data items and return the " +
32
- "distance bewteen them. By default, this algorithm uses " +
33
- "ecuclidean distance of numeric attributes to the power of 2."
32
+ "distance between them. By default, this algorithm uses " +
33
+ "euclidean distance of numeric attributes to the power of 2."
34
34
 
35
35
  # Build a new clusterer, using data examples found in data_set.
36
36
  # Items will be clustered in "number_of_clusters" different
@@ -0,0 +1,48 @@
1
+ # Author:: Peter Lubell-Doughtie
2
+ # License:: BSD 3 Clause
3
+ # Project:: ai4r
4
+ # Url:: http://peet.ldee.org
5
+
6
+ require File.dirname(__FILE__) + '/../clusterers/ward_linkage'
7
+
8
+ module Ai4r
9
+ module Clusterers
10
+
11
+ # Hierarchical version that stores the clusters as merges occur.
12
+ class WardLinkageHierarchical < WardLinkage
13
+
14
+ attr_reader :cluster_tree
15
+
16
+ def initialize(depth = nil)
17
+ @cluster_tree = []
18
+ @depth = depth
19
+ @merges_so_far = 0
20
+ super()
21
+ end
22
+
23
+ def build(data_set, number_of_clusters)
24
+ data_len = data_set.data_items.length
25
+ @total_merges = data_len - number_of_clusters
26
+ super
27
+ @cluster_tree << self.clusters
28
+ @cluster_tree.reverse!
29
+ return self
30
+ end
31
+
32
+ protected
33
+
34
+ def merge_clusters(index_a, index_b, index_clusters)
35
+ # only store if no depth was given, or once more than (@total_merges - @depth) merges have been done
36
+ if @depth.nil? or @merges_so_far > @total_merges - @depth
37
+ # store current clusters
38
+ stored_distance_matrix = @distance_matrix.dup
39
+ @cluster_tree << build_clusters_from_index_clusters(index_clusters)
40
+ @distance_matrix = stored_distance_matrix
41
+ end
42
+ @merges_so_far += 1
43
+ super
44
+ end
45
+ end
46
+ end
47
+ end
48
+
@@ -16,7 +16,7 @@ module Ai4r
16
16
  # Implementation of an Agglomerative Hierarchical clusterer with
17
17
  # weighted average linkage algorithm, aka weighted pair group method
18
18
  # average or WPGMA (Jain and Dubes, 1988 ; McQuitty, 1966 )
19
- # Hierarchical clusteres create one cluster per element, and then
19
+ # Hierarchical clusterers create one cluster per element, and then
20
20
  # progressively merge clusters, until the required number of clusters
21
21
  # is reached.
22
22
  # Similar to AverageLinkage, but the distances between clusters are
@@ -28,8 +28,8 @@ module Ai4r
28
28
  parameters_info :distance_function =>
29
29
  "Custom implementation of distance function. " +
30
30
  "It must be a closure receiving two data items and return the " +
31
- "distance bewteen them. By default, this algorithm uses " +
32
- "ecuclidean distance of numeric attributes to the power of 2."
31
+ "distance between them. By default, this algorithm uses " +
32
+ "euclidean distance of numeric attributes to the power of 2."
33
33
 
34
34
  # Build a new clusterer, using data examples found in data_set.
35
35
  # Items will be clustered in "number_of_clusters" different
@@ -20,8 +20,6 @@ module Ai4r
20
20
  # the data_labels property.
21
21
  class DataSet
22
22
 
23
- @@number_regex = /(((\b[0-9]+)?\.)?\b[0-9]+([eE][-+]?[0-9]+)?\b)/
24
-
25
23
  attr_reader :data_labels, :data_items
26
24
 
27
25
  # Create a new DataSet. By default, empty.
@@ -82,11 +80,18 @@ module Ai4r
82
80
  def parse_csv(filepath)
83
81
  items = []
84
82
  open_csv_file(filepath) do |row|
85
- items << row.collect{|x| (x.match(@@number_regex)) ? x.to_f : x.data }
83
+ items << row.collect{|x| is_number?(x) ? Float(x) : x }
86
84
  end
87
85
  set_data_items(items)
88
86
  end
89
87
 
88
+ # Same as load_csv_with_labels, but it will try to convert cell contents to numbers.
89
+ def parse_csv_with_labels(filepath)
90
+ parse_csv(filepath)
91
+ @data_labels = @data_items.shift
92
+ return self
93
+ end
94
+
90
95
  # Set data labels.
91
96
  # Data labels must have the following format:
92
97
  # [ 'city', 'age_range', 'gender', 'marketing_target' ]
@@ -224,6 +229,10 @@ module Ai4r
224
229
 
225
230
  protected
226
231
 
232
+ def is_number?(x)
233
+ true if Float(x) rescue false
234
+ end
235
+
227
236
  def check_data_items(data_items)
228
237
  if !data_items || data_items.empty?
229
238
  raise ArgumentError, "Examples data set must not be empty."
@@ -92,6 +92,28 @@ module Ai4r
92
92
  return 1.0/similarity - 1
93
93
  end
94
94
 
95
+ # Cosine similarity is a measure of similarity between two vectors
96
+ # of an inner product space that measures the cosine of the
97
+ # angle between them (http://en.wikipedia.org/wiki/Cosine_similarity).
98
+ #
99
+ # Parameters a and b are vectors with continuous attributes.
100
+ #
101
+ # D = 1 - sum(a[i] * b[i]) / (sqrt(sum(a[i]**2)) * sqrt(sum(b[i]**2)))
102
+ def self.cosine_distance(a,b)
103
+ dot_product = 0.0
104
+ norm_a = 0.0
105
+ norm_b = 0.0
106
+ magnitude = 0.0
107
+
108
+ a.each_index do |i|
109
+ dot_product += a[i] * b[i]
110
+ norm_a += a[i] ** 2
111
+ norm_b += b[i] ** 2
112
+ end
113
+
114
+ magnitude = Math.sqrt(norm_a) * Math.sqrt(norm_b)
115
+ return 1 - (dot_product / magnitude)
116
+ end
95
117
  end
96
118
 
97
119
  end
@@ -44,7 +44,7 @@ module Ai4r
44
44
  # Use class method get_parameters_info to obtain details on the algorithm
45
45
  # parameters. Use set_parameters to set values for this parameters.
46
46
  #
47
- # * :disable_bias => If true, the alforithm will not use bias nodes.
47
+ # * :disable_bias => If true, the algorithm will not use bias nodes.
48
48
  # False by default.
49
49
  # * :initial_weight_function => f(n, i, j) must return the initial
50
50
  # weight for the connection between the node i in layer n, and node j in
@@ -86,7 +86,7 @@ module Ai4r
86
86
 
87
87
  include Ai4r::Data::Parameterizable
88
88
 
89
- parameters_info :disable_bias => "If true, the alforithm will not use "+
89
+ parameters_info :disable_bias => "If true, the algorithm will not use "+
90
90
  "bias nodes. False by default.",
91
91
  :initial_weight_function => "f(n, i, j) must return the initial "+
92
92
  "weight for the conection between the node i in layer n, and "+
@@ -136,6 +136,17 @@ module Ai4r
136
136
  return @activation_nodes.last.clone
137
137
  end
138
138
 
139
+ # Evaluates the input and returns the index of the most active output node
140
+ # E.g.
141
+ # net = Backpropagation.new([4, 3, 2])
142
+ # net.eval_result([25, 32.3, 12.8, 1.5])
143
+ # # eval gives [0.83, 0.03]
144
+ # # => 0
145
+ def eval_result(input_values)
146
+ result = eval(input_values)
147
+ result.index(result.max)
148
+ end
149
+
139
150
  # This method trains the network using the backpropagation algorithm.
140
151
  #
141
152
  # input: Networks input
@@ -178,20 +189,20 @@ module Ai4r
178
189
  @last_changes,
179
190
  @activation_nodes
180
191
  ]
181
- end
192
+ end
182
193
 
183
- def marshal_load(ary)
184
- @structure,
185
- @disable_bias,
186
- @learning_rate,
187
- @momentum,
188
- @weights,
189
- @last_changes,
190
- @activation_nodes = ary
191
- @initial_weight_function = lambda { |n, i, j| ((rand 2000)/1000.0) - 1}
192
- @propagation_function = lambda { |x| 1/(1+Math.exp(-1*(x))) } #lambda { |x| Math.tanh(x) }
193
- @derivative_propagation_function = lambda { |y| y*(1-y) } #lambda { |y| 1.0 - y**2 }
194
- end
194
+ def marshal_load(ary)
195
+ @structure,
196
+ @disable_bias,
197
+ @learning_rate,
198
+ @momentum,
199
+ @weights,
200
+ @last_changes,
201
+ @activation_nodes = ary
202
+ @initial_weight_function = lambda { |n, i, j| ((rand 2000)/1000.0) - 1}
203
+ @propagation_function = lambda { |x| 1/(1+Math.exp(-1*(x))) } #lambda { |x| Math.tanh(x) }
204
+ @derivative_propagation_function = lambda { |y| y*(1-y) } #lambda { |y| 1.0 - y**2 }
205
+ end
195
206
 
196
207
 
197
208
  # Propagate error backwards
@@ -203,6 +203,18 @@ class ID3Test < Test::Unit::TestCase
203
203
  eval id3.get_rules
204
204
  assert_equal 'N', marketing_target
205
205
  end
206
+
207
+ def test_model_failure
208
+ bad_data_items = [ ['a', 'Y'],
209
+ ['b', 'N'],
210
+ ]
211
+ bad_data_labels = ['bogus', 'target']
212
+ id3 = ID3.new.build(DataSet.new(:data_items =>bad_data_items, :data_labels => bad_data_labels))
213
+ assert_raise ModelFailureError do
214
+ id3.eval(['c'])
215
+ end
216
+ assert_equal true, true
217
+ end
206
218
  end
207
219
 
208
220
 
@@ -23,7 +23,7 @@ class MultilayerPerceptronTest < Test::Unit::TestCase
23
23
  ['Chicago', '[50-80]', 'M', 'N'],
24
24
  ])
25
25
 
26
- def test_initialize
26
+ def test_initialize
27
27
  classifier = MultilayerPerceptron.new
28
28
  assert_equal 1, classifier.active_node_value
29
29
  assert_equal 0, classifier.inactive_node_value
@@ -7,37 +7,37 @@ include Ai4r::Data
7
7
 
8
8
  class NaiveBayesTest < Test::Unit::TestCase
9
9
 
10
- @@data_labels = [ "Color","Type","Origin","Stolen?" ]
10
+ @@data_labels = %w(Color Type Origin Stolen?)
11
11
 
12
12
  @@data_items = [
13
- ["Red", "Sports", "Domestic", "Yes"],
14
- ["Red", "Sports", "Domestic", "No"],
15
- ["Red", "Sports", "Domestic", "Yes"],
16
- ["Yellow","Sports", "Domestic", "No"],
17
- ["Yellow","Sports", "Imported", "Yes"],
18
- ["Yellow","SUV", "Imported", "No"],
19
- ["Yellow","SUV", "Imported", "Yes"],
20
- ["Yellow","Sports", "Domestic", "No"],
21
- ["Red", "SUV", "Imported", "No"],
22
- ["Red", "Sports", "Imported", "Yes"]
23
- ]
13
+ %w(Red Sports Domestic Yes),
14
+ %w(Red Sports Domestic No),
15
+ %w(Red Sports Domestic Yes),
16
+ %w(Yellow Sports Domestic No),
17
+ %w(Yellow Sports Imported Yes),
18
+ %w(Yellow SUV Imported No),
19
+ %w(Yellow SUV Imported Yes),
20
+ %w(Yellow Sports Domestic No),
21
+ %w(Red SUV Imported No),
22
+ %w(Red Sports Imported Yes)
23
+ ]
24
24
 
25
25
  def setup
26
26
  @data_set = DataSet.new
27
27
  @data_set = DataSet.new(:data_items => @@data_items, :data_labels => @@data_labels)
28
- @b = NaiveBayes.new.set_parameters({:m=>3}).build @data_set
28
+ @b = NaiveBayes.new.set_parameters({:m => 3}).build @data_set
29
29
  end
30
30
 
31
31
  def test_eval
32
- result = @b.eval(["Red", "SUV", "Domestic"])
33
- assert_equal "No", result
32
+ result = @b.eval(%w(Red SUV Domestic))
33
+ assert_equal 'No', result
34
34
  end
35
35
 
36
36
  def test_get_probability_map
37
- map = @b.get_probability_map(["Red", "SUV", "Domestic"])
37
+ map = @b.get_probability_map(%w(Red SUV Domestic))
38
38
  assert_equal 2, map.keys.length
39
- assert_in_delta 0.42, map["Yes"], 0.1
40
- assert_in_delta 0.58, map["No"], 0.1
39
+ assert_in_delta 0.42, map['Yes'], 0.1
40
+ assert_in_delta 0.58, map['No'], 0.1
41
41
  end
42
42
 
43
43
  end
@@ -0,0 +1,37 @@
1
+ require 'ai4r/classifiers/simple_linear_regression'
2
+ require 'ai4r/data/data_set'
3
+ require 'test/unit'
4
+
5
+ include Ai4r::Classifiers
6
+ include Ai4r::Data
7
+
8
+ class SimpleLinearRegressionTest < Test::Unit::TestCase
9
+
10
+ @@data_labels = ["symboling", "normalized-losses", "wheel-base", "length", "width", "height", "curb-weight",
11
+ "engine-size", "bore" , "stroke", "compression-ratio", "horsepower", "peak-rpm", "city-mpg",
12
+ "highway-mpg", "class"]
13
+
14
+ @@data_items = [
15
+ [2,164,99.8,176.6,66.2,54.3,2337,109,3.19,3.4,10,102,5500,24,30,13950],
16
+ [2,164,99.4,176.6,66.4,54.3,2824,136,3.19,3.4,8,115,5500,18,22,17450],
17
+ [1,158,105.8,192.7,71.4,55.7,2844,136,3.19,3.4,8.5,110,5500,19,25,17710],
18
+ [1,158,105.8,192.7,71.4,55.9,3086,131,3.13,3.4,8.3,140,5500,17,20,23875],
19
+ [2,192,101.2,176.8,64.8,54.3,2395,108,3.5,2.8,8.8,101,5800,23,29,16430],
20
+ [0,192,101.2,176.8,64.8,54.3,2395,108,3.5,2.8,8.8,101,5800,23,29,16925],
21
+ [0,188,101.2,176.8,64.8,54.3,2710,164,3.31,3.19,9,121,4250,21,28,20970],
22
+ [0,188,101.2,176.8,64.8,54.3,2765,164,3.31,3.19,9,121,4250,21,28,21105],
23
+ [2,121,88.4,141.1,60.3,53.2,1488,61,2.91,3.03,9.5,48,5100,47,53,5151],
24
+ ]
25
+
26
+ def setup
27
+ @data_set = DataSet.new
28
+ @data_set = DataSet.new(:data_items => @@data_items, :data_labels => @@data_labels)
29
+ @c = SimpleLinearRegression.new.build @data_set
30
+ end
31
+
32
+ def test_eval
33
+ result = @c.eval([-1,95,109.1,188.8,68.9,55.5,3062,141,3.78,3.15,9.5,114,5400,19,25])
34
+ assert_equal 17218.444444444445, result
35
+ end
36
+
37
+ end
@@ -17,7 +17,11 @@ class KMeansTest < Test::Unit::TestCase
17
17
 
18
18
  @@data = [ [10, 3], [3, 10], [2, 8], [2, 5], [3, 8], [10, 3],
19
19
  [1, 3], [8, 1], [2, 9], [2, 5], [3, 3], [9, 4]]
20
-
20
+
21
+ # k-means will generate an empty cluster with this data and initial centroid assignment
22
+ @@empty_cluster_data = [[-0.1, 0], [0, 0], [0.1, 0], [-0.1, 10], [0.1, 10], [0.2, 10]]
23
+ @@empty_centroid_indices = [0,1,2]
24
+
21
25
  def test_build
22
26
  data_set = DataSet.new(:data_items => @@data, :data_labels => ["X", "Y"])
23
27
  clusterer = KMeans.new.build(data_set, 4)
@@ -25,21 +29,45 @@ class KMeansTest < Test::Unit::TestCase
25
29
  # Verify that all 4 clusters are created
26
30
  assert_equal 4, clusterer.clusters.length
27
31
  assert_equal 4, clusterer.centroids.length
28
- # The addition of all instances of every cluster must be equal than
32
+ # The addition of all instances of every cluster must be equal to
29
33
  # the number of data points
30
34
  total_length = 0
31
35
  clusterer.clusters.each do |cluster|
32
36
  total_length += cluster.data_items.length
33
37
  end
34
38
  assert_equal @@data.length, total_length
35
- # Data inside clusters must be the same as orifinal data
39
+ # Data inside clusters must be the same as original data
36
40
  clusterer.clusters.each do |cluster|
37
41
  cluster.data_items.each do |data_item|
38
42
  assert @@data.include?(data_item)
39
43
  end
40
44
  end
41
45
  end
42
-
46
+
47
+ def test_build_and_eliminate_empty_clusters
48
+ data_set = DataSet.new(:data_items => @@empty_cluster_data, :data_labels => ["X", "Y"])
49
+ # 'eliminate' is the on_empty default, so we don't need to pass it as a parameter
50
+ clusterer = KMeans.new.set_parameters({:centroid_indices=>@@empty_centroid_indices}).build(data_set, @@empty_centroid_indices.size)
51
+
52
+ # Verify that one cluster was eliminated
53
+ assert_equal @@empty_centroid_indices.size - 1, clusterer.clusters.length
54
+ assert_equal @@empty_centroid_indices.size - 1, clusterer.centroids.length
55
+
56
+ # The addition of all instances of every cluster must be equal to
57
+ # the number of data points
58
+ total_length = 0
59
+ clusterer.clusters.each do |cluster|
60
+ total_length += cluster.data_items.length
61
+ end
62
+ assert_equal @@empty_cluster_data.length, total_length
63
+ # Data inside clusters must be the same as original data
64
+ clusterer.clusters.each do |cluster|
65
+ cluster.data_items.each do |data_item|
66
+ assert @@empty_cluster_data.include?(data_item)
67
+ end
68
+ end
69
+ end
70
+
43
71
  def test_eval
44
72
  data_set = DataSet.new(:data_items => @@data, :data_labels => ["X", "Y"])
45
73
  clusterer = KMeans.new.build(data_set, 4)
@@ -54,13 +82,18 @@ class KMeansTest < Test::Unit::TestCase
54
82
  assert clusterer.distance(centroid, item) >= min_distance
55
83
  end
56
84
  end
57
-
85
+
58
86
  def test_distance
59
87
  clusterer = KMeans.new
60
- # By default, distance returns the eucledian distance to the power of 2
88
+ # By default, distance returns the euclidean distance to the power of 2
61
89
  assert_equal 2385, clusterer.distance(
62
90
  [1, 10, "Chicago", 2],
63
91
  [10, 10, "London", 50])
92
+
93
+ # Ensure default distance raises error for nil argument
94
+ exception = assert_raise(TypeError) {clusterer.distance([1, 10], [nil, nil])}
95
+ assert_equal("nil can't be coerced into Fixnum", exception.message)
96
+
64
97
  # Test new distance definition
65
98
  manhattan_distance = lambda do |a, b|
66
99
  dist = 0.0
@@ -84,7 +117,42 @@ class KMeansTest < Test::Unit::TestCase
84
117
  build(data_set, 4)
85
118
  assert_equal 1, clusterer.iterations
86
119
  end
87
-
120
+
121
+ def test_centroid_indices
122
+ data_set = DataSet.new(:data_items => @@data, :data_labels => ["X", "Y"])
123
+ # centroid_indices need not be specified:
124
+ KMeans.new.build(data_set, 4)
125
+ # centroid_indices can be specified:
126
+ KMeans.new.set_parameters({:centroid_indices=>[0,1,2,3]}).build(data_set, 4)
127
+ # raises exception if number of clusters differs from length of centroid_indices:
128
+ exception = assert_raise(ArgumentError) {KMeans.new.set_parameters({:centroid_indices=>[0,1,2,3]}).build(data_set, 2)}
129
+ assert_equal('Length of centroid indices array differs from the specified number of clusters', exception.message)
130
+ # raises exception for bad centroid index:
131
+ exception = assert_raise(ArgumentError) {KMeans.new.set_parameters({:centroid_indices=>[0,1,2,@@data.size+10]}).build(data_set, 4)}
132
+ assert_equal("Invalid centroid index #{@@data.size+10}", exception.message)
133
+ end
134
+
135
+ def test_on_empty
136
+ data_set = DataSet.new(:data_items => @@empty_cluster_data, :data_labels => ["X", "Y"])
137
+ clusterer = KMeans.new.set_parameters({:centroid_indices=>@@empty_centroid_indices}).build(data_set, @@empty_centroid_indices.size)
138
+ # Verify that one cluster was eliminated
139
+ assert_equal @@empty_centroid_indices.size - 1, clusterer.clusters.length
140
+ # Verify that eliminate is the on_empty default
141
+ assert_equal 'eliminate', clusterer.on_empty
142
+ # Verify that invalid on_empty option throws an argument error
143
+ exception = assert_raise(ArgumentError) {KMeans.new.set_parameters({:centroid_indices=>@@empty_centroid_indices, :on_empty=>'ldkfje'}).build(data_set, @@empty_centroid_indices.size)}
144
+ assert_equal("Invalid value for on_empty", exception.message)
145
+ # Verify that on_empty option 'terminate' raises an error when an empty cluster arises
146
+ exception = assert_raise(TypeError) {KMeans.new.set_parameters({:centroid_indices=>@@empty_centroid_indices, :on_empty=>'terminate'}).build(data_set, @@empty_centroid_indices.size)}
147
+ assert_equal("nil can't be coerced into Float", exception.message)
148
+ clusterer = KMeans.new.set_parameters({:centroid_indices=>@@empty_centroid_indices, :on_empty=>'random'}).build(data_set, @@empty_centroid_indices.size)
149
+ # Verify that cluster was not eliminated
150
+ assert_equal @@empty_centroid_indices.size, clusterer.clusters.length
151
+ clusterer = KMeans.new.set_parameters({:centroid_indices=>@@empty_centroid_indices, :on_empty=>'outlier'}).build(data_set, @@empty_centroid_indices.size)
152
+ # Verify that cluster was not eliminated
153
+ assert_equal @@empty_centroid_indices.size, clusterer.clusters.length
154
+ end
155
+
88
156
  private
89
157
  def draw_map(clusterer)
90
158
  map = Array.new(11) {Array.new(11, 0)}
@@ -95,6 +163,5 @@ class KMeansTest < Test::Unit::TestCase
95
163
  end
96
164
  map.each { |row| puts row.inspect}
97
165
  end
98
-
99
166
  end
100
167