ai4r 1.12 → 1.13

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (34)
  1. data/README.rdoc +7 -12
  2. data/examples/classifiers/simple_linear_regression_example.csv +159 -0
  3. data/examples/classifiers/simple_linear_regression_example.rb +15 -0
  4. data/examples/clusterers/clusterer_example.rb +56 -0
  5. data/examples/neural_network/backpropagation_example.rb +2 -1
  6. data/lib/ai4r.rb +3 -1
  7. data/lib/ai4r/classifiers/id3.rb +6 -2
  8. data/lib/ai4r/classifiers/multilayer_perceptron.rb +1 -1
  9. data/lib/ai4r/classifiers/naive_bayes.rb +24 -21
  10. data/lib/ai4r/classifiers/simple_linear_regression.rb +118 -0
  11. data/lib/ai4r/clusterers/average_linkage.rb +3 -3
  12. data/lib/ai4r/clusterers/bisecting_k_means.rb +2 -2
  13. data/lib/ai4r/clusterers/centroid_linkage.rb +3 -3
  14. data/lib/ai4r/clusterers/clusterer.rb +0 -11
  15. data/lib/ai4r/clusterers/complete_linkage.rb +3 -3
  16. data/lib/ai4r/clusterers/diana.rb +2 -2
  17. data/lib/ai4r/clusterers/k_means.rb +123 -21
  18. data/lib/ai4r/clusterers/median_linkage.rb +3 -3
  19. data/lib/ai4r/clusterers/single_linkage.rb +4 -4
  20. data/lib/ai4r/clusterers/ward_linkage.rb +4 -4
  21. data/lib/ai4r/clusterers/ward_linkage_hierarchical.rb +48 -0
  22. data/lib/ai4r/clusterers/weighted_average_linkage.rb +3 -3
  23. data/lib/ai4r/data/data_set.rb +12 -3
  24. data/lib/ai4r/data/proximity.rb +22 -0
  25. data/lib/ai4r/neural_network/backpropagation.rb +26 -15
  26. data/test/classifiers/id3_test.rb +12 -0
  27. data/test/classifiers/multilayer_perceptron_test.rb +1 -1
  28. data/test/classifiers/naive_bayes_test.rb +18 -18
  29. data/test/classifiers/simple_linear_regression_test.rb +37 -0
  30. data/test/clusterers/k_means_test.rb +75 -8
  31. data/test/clusterers/ward_linkage_hierarchical_test.rb +81 -0
  32. data/test/data/data_set_test.rb +8 -0
  33. data/test/data/proximity_test.rb +7 -1
  34. metadata +96 -55

data/lib/ai4r/clusterers/single_linkage.rb
@@ -16,7 +16,7 @@ module Ai4r
 
  # Implementation of a Hierarchical clusterer with single linkage (Everitt et
  # al., 2001 ; Johnson, 1967 ; Jain and Dubes, 1988 ; Sneath, 1957 )
- # Hierarchical clusteres create one cluster per element, and then
+ # Hierarchical clusterer create one cluster per element, and then
  # progressively merge clusters, until the required number of clusters
  # is reached.
  # With single linkage, the distance between two clusters is computed as the
@@ -30,8 +30,8 @@ module Ai4r
  parameters_info :distance_function =>
  "Custom implementation of distance function. " +
  "It must be a closure receiving two data items and return the " +
- "distance bewteen them. By default, this algorithm uses " +
- "ecuclidean distance of numeric attributes to the power of 2."
+ "distance between them. By default, this algorithm uses " +
+ "euclidean distance of numeric attributes to the power of 2."
 
  def initialize
  @distance_function = lambda do |a,b|
@@ -105,7 +105,7 @@ module Ai4r
  end
 
  # ci and cj are the indexes of the clusters that are going to
- # be merged. We need to remove distances from/to ci and ci,
+ # be merged. We need to remove distances from/to ci and cj,
  # and add distances from/to new cluster (ci U cj)
  def update_distance_matrix(ci, cj)
  ci, cj = cj, ci if cj > ci

data/lib/ai4r/clusterers/ward_linkage.rb
@@ -16,10 +16,10 @@ module Ai4r
  # Implementation of an Agglomerative Hierarchical clusterer with
  # Ward's method linkage algorithm, aka the minimum variance method (Everitt
  # et al., 2001 ; Jain and Dubes, 1988 ; Ward, 1963 ).
- # Hierarchical clusteres create one cluster per element, and then
+ # Hierarchical clusterer create one cluster per element, and then
  # progressively merge clusters, until the required number of clusters
  # is reached.
- # The objective of this method is to minime the variance.
+ # The objective of this method is to minimize the variance.
  #
  # D(cx, (ci U cj)) = (ni/(ni+nj+nx))*D(cx, ci) +
  # (nj/(ni+nj+nx))*D(cx, cj) -
@@ -29,8 +29,8 @@ module Ai4r
  parameters_info :distance_function =>
  "Custom implementation of distance function. " +
  "It must be a closure receiving two data items and return the " +
- "distance bewteen them. By default, this algorithm uses " +
- "ecuclidean distance of numeric attributes to the power of 2."
+ "distance between them. By default, this algorithm uses " +
+ "euclidean distance of numeric attributes to the power of 2."
 
  # Build a new clusterer, using data examples found in data_set.
  # Items will be clustered in "number_of_clusters" different

data/lib/ai4r/clusterers/ward_linkage_hierarchical.rb (new file)
@@ -0,0 +1,48 @@
+ # Author:: Peter Lubell-Doughtie
+ # License:: BSD 3 Clause
+ # Project:: ai4r
+ # Url:: http://peet.ldee.org
+
+ require File.dirname(__FILE__) + '/../clusterers/ward_linkage'
+
+ module Ai4r
+ module Clusterers
+
+ # Hierarchical version to store classes as merges occur.
+ class WardLinkageHierarchical < WardLinkage
+
+ attr_reader :cluster_tree
+
+ def initialize(depth = nil)
+ @cluster_tree = []
+ @depth = depth
+ @merges_so_far = 0
+ super()
+ end
+
+ def build(data_set, number_of_clusters)
+ data_len = data_set.data_items.length
+ @total_merges = data_len - number_of_clusters
+ super
+ @cluster_tree << self.clusters
+ @cluster_tree.reverse!
+ return self
+ end
+
+ protected
+
+ def merge_clusters(index_a, index_b, index_clusters)
+ # only store if no or above depth
+ if @depth.nil? or @merges_so_far > @total_merges - @depth
+ # store current clusters
+ stored_distance_matrix = @distance_matrix.dup
+ @cluster_tree << build_clusters_from_index_clusters(index_clusters)
+ @distance_matrix = stored_distance_matrix
+ end
+ @merges_so_far += 1
+ super
+ end
+ end
+ end
+ end
+

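A minimal usage sketch for the new WardLinkageHierarchical clusterer, based only on the code added above (the data items, depth and cluster count are invented for illustration). cluster_tree holds one snapshot of the clusters per stored merge; after the final reverse! the first entry is the finished clustering and later entries are progressively less merged stages.

    require 'ai4r/clusterers/ward_linkage_hierarchical'
    require 'ai4r/data/data_set'

    include Ai4r::Clusterers
    include Ai4r::Data

    data_set = DataSet.new(:data_items => [[1, 1], [1, 2], [9, 9], [9, 8], [5, 5]],
                           :data_labels => ['x', 'y'])

    # Passing a depth keeps only the last 3 levels of the merge history.
    clusterer = WardLinkageHierarchical.new(3).build(data_set, 2)

    clusterer.cluster_tree.each_with_index do |clusters, level|
      puts "level #{level}: #{clusters.length} clusters"
    end
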
data/lib/ai4r/clusterers/weighted_average_linkage.rb
@@ -16,7 +16,7 @@ module Ai4r
  # Implementation of an Agglomerative Hierarchical clusterer with
  # weighted average linkage algorithm, aka weighted pair group method
  # average or WPGMA (Jain and Dubes, 1988 ; McQuitty, 1966 )
- # Hierarchical clusteres create one cluster per element, and then
+ # Hierarchical clusterer create one cluster per element, and then
  # progressively merge clusters, until the required number of clusters
  # is reached.
  # Similar to AverageLinkage, but the distances between clusters are
@@ -28,8 +28,8 @@ module Ai4r
  parameters_info :distance_function =>
  "Custom implementation of distance function. " +
  "It must be a closure receiving two data items and return the " +
- "distance bewteen them. By default, this algorithm uses " +
- "ecuclidean distance of numeric attributes to the power of 2."
+ "distance between them. By default, this algorithm uses " +
+ "euclidean distance of numeric attributes to the power of 2."
 
  # Build a new clusterer, using data examples found in data_set.
  # Items will be clustered in "number_of_clusters" different

data/lib/ai4r/data/data_set.rb
@@ -20,8 +20,6 @@ module Ai4r
  # the data_labels property.
  class DataSet
 
- @@number_regex = /(((\b[0-9]+)?\.)?\b[0-9]+([eE][-+]?[0-9]+)?\b)/
-
  attr_reader :data_labels, :data_items
 
  # Create a new DataSet. By default, empty.
@@ -82,11 +80,18 @@ module Ai4r
  def parse_csv(filepath)
  items = []
  open_csv_file(filepath) do |row|
- items << row.collect{|x| (x.match(@@number_regex)) ? x.to_f : x.data }
+ items << row.collect{|x| is_number?(x) ? Float(x) : x }
  end
  set_data_items(items)
  end
 
+ # Same as load_csv_with_labels, but it will try to convert cell contents as numbers.
+ def parse_csv_with_labels(filepath)
+ parse_csv(filepath)
+ @data_labels = @data_items.shift
+ return self
+ end
+
  # Set data labels.
  # Data labels must have the following format:
  # [ 'city', 'age_range', 'gender', 'marketing_target' ]
@@ -224,6 +229,10 @@ module Ai4r
 
  protected
 
+ def is_number?(x)
+ true if Float(x) rescue false
+ end
+
  def check_data_items(data_items)
  if !data_items || data_items.empty?
  raise ArgumentError, "Examples data set must not be empty."

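A brief sketch of the new DataSet#parse_csv_with_labels (the CSV file name below is hypothetical): parse_csv now converts numeric-looking cells with Float via the new is_number? helper, and parse_csv_with_labels additionally promotes the first row to data_labels.

    require 'ai4r/data/data_set'

    # 'cars.csv' is an invented example whose first row holds the column names.
    data_set = Ai4r::Data::DataSet.new.parse_csv_with_labels('cars.csv')

    data_set.data_labels        # labels taken from the header row
    data_set.data_items.first   # remaining rows, numeric cells parsed as Floats
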
data/lib/ai4r/data/proximity.rb
@@ -92,6 +92,28 @@ module Ai4r
  return 1.0/similarity - 1
  end
 
+ # Cosine similarity is a measure of similarity between two vectors
+ # of an inner product space that measures the cosine of the
+ # angle between them (http://en.wikipedia.org/wiki/Cosine_similarity).
+ #
+ # Parameters a and b are vectors with continuous attributes.
+ #
+ # D = sum(a[i] * b[i]) / sqrt(sum(a[i]**2)) * sqrt(sum(b[i]**2))
+ def self.cosine_distance(a,b)
+ dot_product = 0.0
+ norm_a = 0.0
+ norm_b = 0.0
+ magnitude = 0.0
+
+ a.each_index do |i|
+ dot_product += a[i] * b[i]
+ norm_a += a[i] ** 2
+ norm_b += b[i] ** 2
+ end
+
+ magnitude = Math.sqrt(norm_a) * Math.sqrt(norm_b)
+ return 1 - (dot_product / magnitude)
+ end
  end
 
  end

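A few values that follow directly from the cosine_distance formula above (assuming the usual Ai4r::Data::Proximity module as the receiver): vectors pointing the same way give 0, orthogonal vectors give 1.

    require 'ai4r/data/proximity'

    Ai4r::Data::Proximity.cosine_distance([1.0, 0.0], [1.0, 0.0])  # => 0.0
    Ai4r::Data::Proximity.cosine_distance([1.0, 0.0], [0.0, 1.0])  # => 1.0
    Ai4r::Data::Proximity.cosine_distance([1.0, 2.0], [2.0, 4.0])  # => ~0.0 (parallel)

Note that the method is undefined for a zero-length vector, since the magnitude in the denominator becomes 0.
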
data/lib/ai4r/neural_network/backpropagation.rb
@@ -44,7 +44,7 @@ module Ai4r
  # Use class method get_parameters_info to obtain details on the algorithm
  # parameters. Use set_parameters to set values for this parameters.
  #
- # * :disable_bias => If true, the alforithm will not use bias nodes.
+ # * :disable_bias => If true, the algorithm will not use bias nodes.
  # False by default.
  # * :initial_weight_function => f(n, i, j) must return the initial
  # weight for the conection between the node i in layer n, and node j in
@@ -86,7 +86,7 @@ module Ai4r
 
  include Ai4r::Data::Parameterizable
 
- parameters_info :disable_bias => "If true, the alforithm will not use "+
+ parameters_info :disable_bias => "If true, the algorithm will not use "+
  "bias nodes. False by default.",
  :initial_weight_function => "f(n, i, j) must return the initial "+
  "weight for the conection between the node i in layer n, and "+
@@ -136,6 +136,17 @@ module Ai4r
  return @activation_nodes.last.clone
  end
 
+ # Evaluates the input and returns most active node
+ # E.g.
+ # net = Backpropagation.new([4, 3, 2])
+ # net.eval_result([25, 32.3, 12.8, 1.5])
+ # # eval gives [0.83, 0.03]
+ # # => 0
+ def eval_result(input_values)
+ result = eval(input_values)
+ result.index(result.max)
+ end
+
  # This method trains the network using the backpropagation algorithm.
  #
  # input: Networks input
@@ -178,20 +189,20 @@ module Ai4r
  @last_changes,
  @activation_nodes
  ]
- end
+ end
 
- def marshal_load(ary)
- @structure,
- @disable_bias,
- @learning_rate,
- @momentum,
- @weights,
- @last_changes,
- @activation_nodes = ary
- @initial_weight_function = lambda { |n, i, j| ((rand 2000)/1000.0) - 1}
- @propagation_function = lambda { |x| 1/(1+Math.exp(-1*(x))) } #lambda { |x| Math.tanh(x) }
- @derivative_propagation_function = lambda { |y| y*(1-y) } #lambda { |y| 1.0 - y**2 }
- end
+ def marshal_load(ary)
+ @structure,
+ @disable_bias,
+ @learning_rate,
+ @momentum,
+ @weights,
+ @last_changes,
+ @activation_nodes = ary
+ @initial_weight_function = lambda { |n, i, j| ((rand 2000)/1000.0) - 1}
+ @propagation_function = lambda { |x| 1/(1+Math.exp(-1*(x))) } #lambda { |x| Math.tanh(x) }
+ @derivative_propagation_function = lambda { |y| y*(1-y) } #lambda { |y| 1.0 - y**2 }
+ end
 
 
  # Propagate error backwards

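A sketch tying the new eval_result to the existing train and marshal hooks (the network shape, training data and file name are invented; the marshal_dump/marshal_load hunk above appears to be an indentation-only re-rendering, not a behaviour change). eval_result returns the index of the most active output node instead of the raw activation vector, and the marshal hooks let a trained network be persisted with Ruby's Marshal.

    require 'ai4r/neural_network/backpropagation'

    net = Ai4r::NeuralNetwork::Backpropagation.new([2, 2, 2])

    # Toy two-class problem: output node 0 for [0, 0], node 1 for [1, 1].
    2000.times do
      net.train([0, 0], [1, 0])
      net.train([1, 1], [0, 1])
    end

    net.eval([1, 1])         # raw activations, e.g. [0.07, 0.93]
    net.eval_result([1, 1])  # index of the most active node, e.g. 1

    # Persist and restore the trained network.
    File.open('net.dump', 'wb') { |f| f.write(Marshal.dump(net)) }
    restored = Marshal.load(File.binread('net.dump'))
    restored.eval_result([0, 0])  # e.g. 0
    # marshal_load reinstates the default weight/propagation lambdas, so any
    # custom functions have to be reassigned after loading.
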
data/test/classifiers/id3_test.rb
@@ -203,6 +203,18 @@ class ID3Test < Test::Unit::TestCase
  eval id3.get_rules
  assert_equal 'N', marketing_target
  end
+
+ def test_model_failure
+ bad_data_items = [ ['a', 'Y'],
+ ['b', 'N'],
+ ]
+ bad_data_labels = ['bogus', 'target']
+ id3 = ID3.new.build(DataSet.new(:data_items =>bad_data_items, :data_labels => bad_data_labels))
+ assert_raise ModelFailureError do
+ id3.eval(['c'])
+ end
+ assert_equal true, true
+ end
  end
 
 

data/test/classifiers/multilayer_perceptron_test.rb
@@ -23,7 +23,7 @@ class MultilayerPerceptronTest < Test::Unit::TestCase
  ['Chicago', '[50-80]', 'M', 'N'],
  ])
 
- def test_initialize
+ def test_initialize
  classifier = MultilayerPerceptron.new
  assert_equal 1, classifier.active_node_value
  assert_equal 0, classifier.inactive_node_value

data/test/classifiers/naive_bayes_test.rb
@@ -7,37 +7,37 @@ include Ai4r::Data
 
  class NaiveBayesTest < Test::Unit::TestCase
 
- @@data_labels = [ "Color","Type","Origin","Stolen?" ]
+ @@data_labels = %w(Color Type Origin Stolen?)
 
  @@data_items = [
- ["Red", "Sports", "Domestic", "Yes"],
- ["Red", "Sports", "Domestic", "No"],
- ["Red", "Sports", "Domestic", "Yes"],
- ["Yellow","Sports", "Domestic", "No"],
- ["Yellow","Sports", "Imported", "Yes"],
- ["Yellow","SUV", "Imported", "No"],
- ["Yellow","SUV", "Imported", "Yes"],
- ["Yellow","Sports", "Domestic", "No"],
- ["Red", "SUV", "Imported", "No"],
- ["Red", "Sports", "Imported", "Yes"]
- ]
+ %w(Red Sports Domestic Yes),
+ %w(Red Sports Domestic No),
+ %w(Red Sports Domestic Yes),
+ %w(Yellow Sports Domestic No),
+ %w(Yellow Sports Imported Yes),
+ %w(Yellow SUV Imported No),
+ %w(Yellow SUV Imported Yes),
+ %w(Yellow Sports Domestic No),
+ %w(Red SUV Imported No),
+ %w(Red Sports Imported Yes)
+ ]
 
  def setup
  @data_set = DataSet.new
  @data_set = DataSet.new(:data_items => @@data_items, :data_labels => @@data_labels)
- @b = NaiveBayes.new.set_parameters({:m=>3}).build @data_set
+ @b = NaiveBayes.new.set_parameters({:m => 3}).build @data_set
  end
 
  def test_eval
- result = @b.eval(["Red", "SUV", "Domestic"])
- assert_equal "No", result
+ result = @b.eval(%w(Red SUV Domestic))
+ assert_equal 'No', result
  end
 
  def test_get_probability_map
- map = @b.get_probability_map(["Red", "SUV", "Domestic"])
+ map = @b.get_probability_map(%w(Red SUV Domestic))
  assert_equal 2, map.keys.length
- assert_in_delta 0.42, map["Yes"], 0.1
- assert_in_delta 0.58, map["No"], 0.1
+ assert_in_delta 0.42, map['Yes'], 0.1
+ assert_in_delta 0.58, map['No'], 0.1
 
  end
 
  end

data/test/classifiers/simple_linear_regression_test.rb (new file)
@@ -0,0 +1,37 @@
+ require 'ai4r/classifiers/simple_linear_regression'
+ require 'ai4r/data/data_set'
+ require 'test/unit'
+
+ include Ai4r::Classifiers
+ include Ai4r::Data
+
+ class SimpleLinearRegressionTest < Test::Unit::TestCase
+
+ @@data_labels = ["symboling", "normalized-losses", "wheel-base", "length", "width", "height", "curb-weight",
+ "engine-size", "bore" , "stroke", "compression-ratio", "horsepower", "peak-rpm", "city-mpg",
+ "highway-mpg", "class"]
+
+ @@data_items = [
+ [2,164,99.8,176.6,66.2,54.3,2337,109,3.19,3.4,10,102,5500,24,30,13950],
+ [2,164,99.4,176.6,66.4,54.3,2824,136,3.19,3.4,8,115,5500,18,22,17450],
+ [1,158,105.8,192.7,71.4,55.7,2844,136,3.19,3.4,8.5,110,5500,19,25,17710],
+ [1,158,105.8,192.7,71.4,55.9,3086,131,3.13,3.4,8.3,140,5500,17,20,23875],
+ [2,192,101.2,176.8,64.8,54.3,2395,108,3.5,2.8,8.8,101,5800,23,29,16430],
+ [0,192,101.2,176.8,64.8,54.3,2395,108,3.5,2.8,8.8,101,5800,23,29,16925],
+ [0,188,101.2,176.8,64.8,54.3,2710,164,3.31,3.19,9,121,4250,21,28,20970],
+ [0,188,101.2,176.8,64.8,54.3,2765,164,3.31,3.19,9,121,4250,21,28,21105],
+ [2,121,88.4,141.1,60.3,53.2,1488,61,2.91,3.03,9.5,48,5100,47,53,5151],
+ ]
+
+ def setup
+ @data_set = DataSet.new
+ @data_set = DataSet.new(:data_items => @@data_items, :data_labels => @@data_labels)
+ @c = SimpleLinearRegression.new.build @data_set
+ end
+
+ def test_eval
+ result = @c.eval([-1,95,109.1,188.8,68.9,55.5,3062,141,3.78,3.15,9.5,114,5400,19,25])
+ assert_equal 17218.444444444445, result
+ end
+
+ end

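The new test gives the shape of the SimpleLinearRegression API: build on a DataSet whose last attribute is the numeric value to predict, then eval a row of the remaining attributes. A stripped-down sketch with invented toy data, assuming (as the test does) that the last column is the regression target:

    require 'ai4r/classifiers/simple_linear_regression'
    require 'ai4r/data/data_set'

    items  = [[1.0, 10.0], [2.0, 20.0], [3.0, 30.0], [4.0, 40.0]]
    data_set = Ai4r::Data::DataSet.new(:data_items => items,
                                       :data_labels => ['x', 'y'])

    regression = Ai4r::Classifiers::SimpleLinearRegression.new.build(data_set)
    regression.eval([5.0])  # expected to land near 50.0 for this toy data
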
data/test/clusterers/k_means_test.rb
@@ -17,7 +17,11 @@ class KMeansTest < Test::Unit::TestCase
 
  @@data = [ [10, 3], [3, 10], [2, 8], [2, 5], [3, 8], [10, 3],
  [1, 3], [8, 1], [2, 9], [2, 5], [3, 3], [9, 4]]
-
+
+ # k-means will generate an empty cluster with this data and initial centroid assignment
+ @@empty_cluster_data = [[-0.1, 0], [0, 0], [0.1, 0], [-0.1, 10], [0.1, 10], [0.2, 10]]
+ @@empty_centroid_indices = [0,1,2]
+
  def test_build
  data_set = DataSet.new(:data_items => @@data, :data_labels => ["X", "Y"])
  clusterer = KMeans.new.build(data_set, 4)
@@ -25,21 +29,45 @@ class KMeansTest < Test::Unit::TestCase
  # Verify that all 4 clusters are created
  assert_equal 4, clusterer.clusters.length
  assert_equal 4, clusterer.centroids.length
- # The addition of all instances of every cluster must be equal than
+ # The addition of all instances of every cluster must be equal to
  # the number of data points
  total_length = 0
  clusterer.clusters.each do |cluster|
  total_length += cluster.data_items.length
  end
  assert_equal @@data.length, total_length
- # Data inside clusters must be the same as orifinal data
+ # Data inside clusters must be the same as original data
  clusterer.clusters.each do |cluster|
  cluster.data_items.each do |data_item|
  assert @@data.include?(data_item)
  end
  end
  end
-
+
+ def test_build_and_eliminate_empty_clusters
+ data_set = DataSet.new(:data_items => @@empty_cluster_data, :data_labels => ["X", "Y"])
+ # :eliminate is the :on_empty default, so we don't need to pass it as a parameter for it
+ clusterer = KMeans.new.set_parameters({:centroid_indices=>@@empty_centroid_indices}).build(data_set, @@empty_centroid_indices.size)
+
+ # Verify that one cluster was eliminated
+ assert_equal @@empty_centroid_indices.size - 1, clusterer.clusters.length
+ assert_equal @@empty_centroid_indices.size - 1, clusterer.centroids.length
+
+ # The addition of all instances of every cluster must be equal to
+ # the number of data points
+ total_length = 0
+ clusterer.clusters.each do |cluster|
+ total_length += cluster.data_items.length
+ end
+ assert_equal @@empty_cluster_data.length, total_length
+ # Data inside clusters must be the same as original data
+ clusterer.clusters.each do |cluster|
+ cluster.data_items.each do |data_item|
+ assert @@empty_cluster_data.include?(data_item)
+ end
+ end
+ end
+
  def test_eval
  data_set = DataSet.new(:data_items => @@data, :data_labels => ["X", "Y"])
  clusterer = KMeans.new.build(data_set, 4)
@@ -54,13 +82,18 @@ class KMeansTest < Test::Unit::TestCase
  assert clusterer.distance(centroid, item) >= min_distance
  end
  end
-
+
  def test_distance
  clusterer = KMeans.new
- # By default, distance returns the eucledian distance to the power of 2
+ # By default, distance returns the euclidean distance to the power of 2
  assert_equal 2385, clusterer.distance(
  [1, 10, "Chicago", 2],
  [10, 10, "London", 50])
+
+ # Ensure default distance raises error for nil argument
+ exception = assert_raise(TypeError) {clusterer.distance([1, 10], [nil, nil])}
+ assert_equal("nil can't be coerced into Fixnum", exception.message)
+
  # Test new distance definition
  manhattan_distance = lambda do |a, b|
  dist = 0.0
@@ -84,7 +117,42 @@ class KMeansTest < Test::Unit::TestCase
  build(data_set, 4)
  assert_equal 1, clusterer.iterations
  end
-
+
+ def test_centroid_indices
+ data_set = DataSet.new(:data_items => @@data, :data_labels => ["X", "Y"])
+ # centroid_indices need not be specified:
+ KMeans.new.build(data_set, 4)
+ # centroid_indices can be specified:
+ KMeans.new.set_parameters({:centroid_indices=>[0,1,2,3]}).build(data_set, 4)
+ # raises exception if number of clusters differs from length of centroid_indices:
+ exception = assert_raise(ArgumentError) {KMeans.new.set_parameters({:centroid_indices=>[0,1,2,3]}).build(data_set, 2)}
+ assert_equal('Length of centroid indices array differs from the specified number of clusters', exception.message)
+ # raises exception for bad centroid index:
+ exception = assert_raise(ArgumentError) {KMeans.new.set_parameters({:centroid_indices=>[0,1,2,@@data.size+10]}).build(data_set, 4)}
+ assert_equal("Invalid centroid index #{@@data.size+10}", exception.message)
+ end
+
+ def test_on_empty
+ data_set = DataSet.new(:data_items => @@empty_cluster_data, :data_labels => ["X", "Y"])
+ clusterer = KMeans.new.set_parameters({:centroid_indices=>@@empty_centroid_indices}).build(data_set, @@empty_centroid_indices.size)
+ # Verify that one cluster was eliminated
+ assert_equal @@empty_centroid_indices.size - 1, clusterer.clusters.length
+ # Verify that eliminate is the on_empty default
+ assert_equal 'eliminate', clusterer.on_empty
+ # Verify that invalid on_empty option throws an argument error
+ exception = assert_raise(ArgumentError) {KMeans.new.set_parameters({:centroid_indices=>@@empty_centroid_indices, :on_empty=>'ldkfje'}).build(data_set, @@empty_centroid_indices.size)}
+ assert_equal("Invalid value for on_empty", exception.message)
+ # Verify that on_empty option 'terminate' raises an error when an empty cluster arises
+ exception = assert_raise(TypeError) {KMeans.new.set_parameters({:centroid_indices=>@@empty_centroid_indices, :on_empty=>'terminate'}).build(data_set, @@empty_centroid_indices.size)}
+ assert_equal("nil can't be coerced into Float", exception.message)
+ clusterer = KMeans.new.set_parameters({:centroid_indices=>@@empty_centroid_indices, :on_empty=>'random'}).build(data_set, @@empty_centroid_indices.size)
+ # Verify that cluster was not eliminated
+ assert_equal @@empty_centroid_indices.size, clusterer.clusters.length
+ clusterer = KMeans.new.set_parameters({:centroid_indices=>@@empty_centroid_indices, :on_empty=>'outlier'}).build(data_set, @@empty_centroid_indices.size)
+ # Verify that cluster was not eliminated
+ assert_equal @@empty_centroid_indices.size, clusterer.clusters.length
+ end
+
  private
  def draw_map(clusterer)
  map = Array.new(11) {Array.new(11, 0)}
@@ -95,6 +163,5 @@ class KMeansTest < Test::Unit::TestCase
  end
  map.each { |row| puts row.inspect}
  end
-
  end
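The k_means_test changes above exercise the two new KMeans parameters, :centroid_indices and :on_empty ('eliminate' by default, with 'terminate', 'random' and 'outlier' as alternatives). A short sketch combining them, reusing the shape of the test fixture; behaviour beyond what the tests assert is not implied.

    require 'ai4r/clusterers/k_means'
    require 'ai4r/data/data_set'

    include Ai4r::Clusterers
    include Ai4r::Data

    items = [[-0.1, 0], [0, 0], [0.1, 0], [-0.1, 10], [0.1, 10], [0.2, 10]]
    data_set = DataSet.new(:data_items => items, :data_labels => ['X', 'Y'])

    # Seed the three initial centroids from items 0, 1 and 2, and keep all
    # three clusters when one turns up empty instead of eliminating it.
    clusterer = KMeans.new.
      set_parameters({:centroid_indices => [0, 1, 2], :on_empty => 'random'}).
      build(data_set, 3)

    clusterer.on_empty          # => 'random'
    clusterer.clusters.length   # => 3 (no cluster dropped, per the tests above)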