ai4r 1.12 → 1.13
- data/README.rdoc +7 -12
- data/examples/classifiers/simple_linear_regression_example.csv +159 -0
- data/examples/classifiers/simple_linear_regression_example.rb +15 -0
- data/examples/clusterers/clusterer_example.rb +56 -0
- data/examples/neural_network/backpropagation_example.rb +2 -1
- data/lib/ai4r.rb +3 -1
- data/lib/ai4r/classifiers/id3.rb +6 -2
- data/lib/ai4r/classifiers/multilayer_perceptron.rb +1 -1
- data/lib/ai4r/classifiers/naive_bayes.rb +24 -21
- data/lib/ai4r/classifiers/simple_linear_regression.rb +118 -0
- data/lib/ai4r/clusterers/average_linkage.rb +3 -3
- data/lib/ai4r/clusterers/bisecting_k_means.rb +2 -2
- data/lib/ai4r/clusterers/centroid_linkage.rb +3 -3
- data/lib/ai4r/clusterers/clusterer.rb +0 -11
- data/lib/ai4r/clusterers/complete_linkage.rb +3 -3
- data/lib/ai4r/clusterers/diana.rb +2 -2
- data/lib/ai4r/clusterers/k_means.rb +123 -21
- data/lib/ai4r/clusterers/median_linkage.rb +3 -3
- data/lib/ai4r/clusterers/single_linkage.rb +4 -4
- data/lib/ai4r/clusterers/ward_linkage.rb +4 -4
- data/lib/ai4r/clusterers/ward_linkage_hierarchical.rb +48 -0
- data/lib/ai4r/clusterers/weighted_average_linkage.rb +3 -3
- data/lib/ai4r/data/data_set.rb +12 -3
- data/lib/ai4r/data/proximity.rb +22 -0
- data/lib/ai4r/neural_network/backpropagation.rb +26 -15
- data/test/classifiers/id3_test.rb +12 -0
- data/test/classifiers/multilayer_perceptron_test.rb +1 -1
- data/test/classifiers/naive_bayes_test.rb +18 -18
- data/test/classifiers/simple_linear_regression_test.rb +37 -0
- data/test/clusterers/k_means_test.rb +75 -8
- data/test/clusterers/ward_linkage_hierarchical_test.rb +81 -0
- data/test/data/data_set_test.rb +8 -0
- data/test/data/proximity_test.rb +7 -1
- metadata +96 -55
data/lib/ai4r/clusterers/single_linkage.rb
CHANGED
@@ -16,7 +16,7 @@ module Ai4r
 
   # Implementation of a Hierarchical clusterer with single linkage (Everitt et
   # al., 2001 ; Johnson, 1967 ; Jain and Dubes, 1988 ; Sneath, 1957 )
-  # Hierarchical
+  # Hierarchical clusterer create one cluster per element, and then
   # progressively merge clusters, until the required number of clusters
   # is reached.
   # With single linkage, the distance between two clusters is computed as the
@@ -30,8 +30,8 @@ module Ai4r
   parameters_info :distance_function =>
       "Custom implementation of distance function. " +
      "It must be a closure receiving two data items and return the " +
-      "distance
-      "
+      "distance between them. By default, this algorithm uses " +
+      "euclidean distance of numeric attributes to the power of 2."
 
   def initialize
     @distance_function = lambda do |a,b|
@@ -105,7 +105,7 @@ module Ai4r
   end
 
   # ci and cj are the indexes of the clusters that are going to
-  # be merged. We need to remove distances from/to ci and
+  # be merged. We need to remove distances from/to ci and cj,
   # and add distances from/to new cluster (ci U cj)
   def update_distance_matrix(ci, cj)
     ci, cj = cj, ci if cj > ci
data/lib/ai4r/clusterers/ward_linkage.rb
CHANGED
@@ -16,10 +16,10 @@ module Ai4r
   # Implementation of an Agglomerative Hierarchical clusterer with
   # Ward's method linkage algorithm, aka the minimum variance method (Everitt
   # et al., 2001 ; Jain and Dubes, 1988 ; Ward, 1963 ).
-  # Hierarchical
+  # Hierarchical clusterer create one cluster per element, and then
   # progressively merge clusters, until the required number of clusters
   # is reached.
-  # The objective of this method is to
+  # The objective of this method is to minimize the variance.
   #
   # D(cx, (ci U cj)) = (ni/(ni+nj+nx))*D(cx, ci) +
   #                    (nj/(ni+nj+nx))*D(cx, cj) -
@@ -29,8 +29,8 @@ module Ai4r
   parameters_info :distance_function =>
      "Custom implementation of distance function. " +
      "It must be a closure receiving two data items and return the " +
-      "distance
-      "
+      "distance between them. By default, this algorithm uses " +
+      "euclidean distance of numeric attributes to the power of 2."
 
   # Build a new clusterer, using data examples found in data_set.
   # Items will be clustered in "number_of_clusters" different
data/lib/ai4r/clusterers/ward_linkage_hierarchical.rb
ADDED
@@ -0,0 +1,48 @@
+# Author::    Peter Lubell-Doughtie
+# License::   BSD 3 Clause
+# Project::   ai4r
+# Url::       http://peet.ldee.org
+
+require File.dirname(__FILE__) + '/../clusterers/ward_linkage'
+
+module Ai4r
+  module Clusterers
+
+    # Hierarchical version to store classes as merges occur.
+    class WardLinkageHierarchical < WardLinkage
+
+      attr_reader :cluster_tree
+
+      def initialize(depth = nil)
+        @cluster_tree = []
+        @depth = depth
+        @merges_so_far = 0
+        super()
+      end
+
+      def build(data_set, number_of_clusters)
+        data_len = data_set.data_items.length
+        @total_merges = data_len - number_of_clusters
+        super
+        @cluster_tree << self.clusters
+        @cluster_tree.reverse!
+        return self
+      end
+
+      protected
+
+      def merge_clusters(index_a, index_b, index_clusters)
+        # only store if no or above depth
+        if @depth.nil? or @merges_so_far > @total_merges - @depth
+          # store current clusters
+          stored_distance_matrix = @distance_matrix.dup
+          @cluster_tree << build_clusters_from_index_clusters(index_clusters)
+          @distance_matrix = stored_distance_matrix
+        end
+        @merges_so_far += 1
+        super
+      end
+    end
+  end
+end
+
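The stored cluster_tree makes it possible to inspect intermediate merge levels. A minimal usage sketch (not from the gem's shipped examples; the data items and depth value below are made up, and the gem's lib directory is assumed to be on the load path):

  require 'ai4r/clusterers/ward_linkage_hierarchical'
  require 'ai4r/data/data_set'

  items = [[0, 0], [0, 1], [9, 9], [10, 10], [10, 11]]
  data_set = Ai4r::Data::DataSet.new(:data_items => items, :data_labels => %w(X Y))

  # Keep only the last 3 merge levels by passing depth = 3
  clusterer = Ai4r::Clusterers::WardLinkageHierarchical.new(3).build(data_set, 1)

  # cluster_tree[0] holds the final clusters; later entries hold earlier merge steps
  clusterer.cluster_tree.each do |level|
    puts level.map { |cluster| cluster.data_items.inspect }.join('  ')
  end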
data/lib/ai4r/clusterers/weighted_average_linkage.rb
CHANGED
@@ -16,7 +16,7 @@ module Ai4r
   # Implementation of an Agglomerative Hierarchical clusterer with
   # weighted average linkage algorithm, aka weighted pair group method
   # average or WPGMA (Jain and Dubes, 1988 ; McQuitty, 1966 )
-  # Hierarchical
+  # Hierarchical clusterer create one cluster per element, and then
   # progressively merge clusters, until the required number of clusters
   # is reached.
   # Similar to AverageLinkage, but the distances between clusters are
@@ -28,8 +28,8 @@ module Ai4r
   parameters_info :distance_function =>
      "Custom implementation of distance function. " +
      "It must be a closure receiving two data items and return the " +
-      "distance
-      "
+      "distance between them. By default, this algorithm uses " +
+      "euclidean distance of numeric attributes to the power of 2."
 
   # Build a new clusterer, using data examples found in data_set.
   # Items will be clustered in "number_of_clusters" different
data/lib/ai4r/data/data_set.rb
CHANGED
@@ -20,8 +20,6 @@ module Ai4r
   # the data_labels property.
   class DataSet
 
-    @@number_regex = /(((\b[0-9]+)?\.)?\b[0-9]+([eE][-+]?[0-9]+)?\b)/
-
     attr_reader :data_labels, :data_items
 
     # Create a new DataSet. By default, empty.
@@ -82,11 +80,18 @@ module Ai4r
     def parse_csv(filepath)
       items = []
       open_csv_file(filepath) do |row|
-        items << row.collect{|x| (x
+        items << row.collect{|x| is_number?(x) ? Float(x) : x }
       end
       set_data_items(items)
     end
 
+    # Same as load_csv_with_labels, but it will try to convert cell contents as numbers.
+    def parse_csv_with_labels(filepath)
+      parse_csv(filepath)
+      @data_labels = @data_items.shift
+      return self
+    end
+
     # Set data labels.
     # Data labels must have the following format:
     #   [ 'city', 'age_range', 'gender', 'marketing_target' ]
@@ -224,6 +229,10 @@ module Ai4r
 
     protected
 
+    def is_number?(x)
+      true if Float(x) rescue false
+    end
+
     def check_data_items(data_items)
       if !data_items || data_items.empty?
         raise ArgumentError, "Examples data set must not be empty."
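A short sketch of the new loading path (the CSV filename below is hypothetical): parse_csv_with_labels takes the first row as labels and, like parse_csv, converts numeric-looking cells to Float via is_number?.

  require 'ai4r/data/data_set'

  # First CSV row becomes data_labels; numeric cells in the remaining rows become Floats
  set = Ai4r::Data::DataSet.new.parse_csv_with_labels('my_data.csv')
  puts set.data_labels.inspect
  puts set.data_items.first.inspect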
data/lib/ai4r/data/proximity.rb
CHANGED
@@ -92,6 +92,28 @@ module Ai4r
       return 1.0/similarity - 1
     end
 
+    # Cosine similarity is a measure of similarity between two vectors
+    # of an inner product space that measures the cosine of the
+    # angle between them (http://en.wikipedia.org/wiki/Cosine_similarity).
+    #
+    # Parameters a and b are vectors with continuous attributes.
+    #
+    # D = sum(a[i] * b[i]) / sqrt(sum(a[i]**2)) * sqrt(sum(b[i]**2))
+    def self.cosine_distance(a,b)
+      dot_product = 0.0
+      norm_a = 0.0
+      norm_b = 0.0
+      magnitude = 0.0
+
+      a.each_index do |i|
+        dot_product += a[i] * b[i]
+        norm_a += a[i] ** 2
+        norm_b += b[i] ** 2
+      end
+
+      magnitude = Math.sqrt(norm_a) * Math.sqrt(norm_b)
+      return 1 - (dot_product / magnitude)
+    end
   end
 
 end
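For reference, two hand-checked calls against the formula above (a sketch, assuming the module is loaded from the installed gem): vectors pointing in the same direction give a distance near 0, orthogonal vectors give 1.

  require 'ai4r/data/proximity'

  Ai4r::Data::Proximity.cosine_distance([1.0, 2.0, 3.0], [2.0, 4.0, 6.0])  # => ~0.0 (same direction)
  Ai4r::Data::Proximity.cosine_distance([1.0, 0.0], [0.0, 1.0])            # => 1.0 (orthogonal)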
data/lib/ai4r/neural_network/backpropagation.rb
CHANGED
@@ -44,7 +44,7 @@ module Ai4r
   # Use class method get_parameters_info to obtain details on the algorithm
   # parameters. Use set_parameters to set values for this parameters.
   #
-  # * :disable_bias => If true, the
+  # * :disable_bias => If true, the algorithm will not use bias nodes.
   #   False by default.
   # * :initial_weight_function => f(n, i, j) must return the initial
   #   weight for the conection between the node i in layer n, and node j in
@@ -86,7 +86,7 @@ module Ai4r
 
   include Ai4r::Data::Parameterizable
 
-  parameters_info :disable_bias => "If true, the
+  parameters_info :disable_bias => "If true, the algorithm will not use "+
     "bias nodes. False by default.",
     :initial_weight_function => "f(n, i, j) must return the initial "+
       "weight for the conection between the node i in layer n, and "+
@@ -136,6 +136,17 @@ module Ai4r
     return @activation_nodes.last.clone
   end
 
+  # Evaluates the input and returns most active node
+  # E.g.
+  #   net = Backpropagation.new([4, 3, 2])
+  #   net.eval_result([25, 32.3, 12.8, 1.5])
+  #     # eval gives [0.83, 0.03]
+  #     # => 0
+  def eval_result(input_values)
+    result = eval(input_values)
+    result.index(result.max)
+  end
+
   # This method trains the network using the backpropagation algorithm.
   #
   # input: Networks input
@@ -178,20 +189,20 @@ module Ai4r
     @last_changes,
     @activation_nodes
     ]
-
+  end
 
-
-
-
-
-
-
-
-
-
-
-
+  def marshal_load(ary)
+    @structure,
+    @disable_bias,
+    @learning_rate,
+    @momentum,
+    @weights,
+    @last_changes,
+    @activation_nodes = ary
+    @initial_weight_function = lambda { |n, i, j| ((rand 2000)/1000.0) - 1}
+    @propagation_function = lambda { |x| 1/(1+Math.exp(-1*(x))) } #lambda { |x| Math.tanh(x) }
+    @derivative_propagation_function = lambda { |y| y*(1-y) } #lambda { |y| 1.0 - y**2 }
+  end
 
 
   # Propagate error backwards
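With marshal_dump and marshal_load defined, a trained network can be serialized and reloaded with Ruby's Marshal. Proc objects cannot be marshaled, which is presumably why marshal_load rebuilds the default weight and propagation lambdas rather than restoring them. A minimal sketch (layer sizes and training data are made up; results depend on the random initial weights):

  require 'ai4r/neural_network/backpropagation'

  net = Ai4r::NeuralNetwork::Backpropagation.new([2, 3, 2])
  200.times do
    net.train([0, 0], [1, 0])   # class 0
    net.train([1, 1], [1, 0])
    net.train([0, 1], [0, 1])   # class 1
    net.train([1, 0], [0, 1])
  end

  # Round-trip through Marshal, then pick the most active output node
  File.open('net.dump', 'wb') { |f| f.write(Marshal.dump(net)) }
  restored = Marshal.load(File.binread('net.dump'))
  puts restored.eval_result([0, 1])   # index of the most active output node, ideally 1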
data/test/classifiers/id3_test.rb
CHANGED
@@ -203,6 +203,18 @@ class ID3Test < Test::Unit::TestCase
     eval id3.get_rules
     assert_equal 'N', marketing_target
   end
+
+  def test_model_failure
+    bad_data_items = [ ['a', 'Y'],
+                       ['b', 'N'],
+                     ]
+    bad_data_labels = ['bogus', 'target']
+    id3 = ID3.new.build(DataSet.new(:data_items =>bad_data_items, :data_labels => bad_data_labels))
+    assert_raise ModelFailureError do
+      id3.eval(['c'])
+    end
+    assert_equal true, true
+  end
 end
 
 
data/test/classifiers/multilayer_perceptron_test.rb
CHANGED
@@ -23,7 +23,7 @@ class MultilayerPerceptronTest < Test::Unit::TestCase
     ['Chicago', '[50-80]', 'M', 'N'],
     ])
 
-
+  def test_initialize
     classifier = MultilayerPerceptron.new
     assert_equal 1, classifier.active_node_value
     assert_equal 0, classifier.inactive_node_value
data/test/classifiers/naive_bayes_test.rb
CHANGED
@@ -7,37 +7,37 @@ include Ai4r::Data
 
 class NaiveBayesTest < Test::Unit::TestCase
 
-  @@data_labels =
+  @@data_labels = %w(Color Type Origin Stolen?)
 
   @@data_items = [
-
-
-
-
-
-
-
-
-
-
-
+    %w(Red Sports Domestic Yes),
+    %w(Red Sports Domestic No),
+    %w(Red Sports Domestic Yes),
+    %w(Yellow Sports Domestic No),
+    %w(Yellow Sports Imported Yes),
+    %w(Yellow SUV Imported No),
+    %w(Yellow SUV Imported Yes),
+    %w(Yellow Sports Domestic No),
+    %w(Red SUV Imported No),
+    %w(Red Sports Imported Yes)
+  ]
 
   def setup
     @data_set = DataSet.new
     @data_set = DataSet.new(:data_items => @@data_items, :data_labels => @@data_labels)
-    @b = NaiveBayes.new.set_parameters({:m=>3}).build @data_set
+    @b = NaiveBayes.new.set_parameters({:m => 3}).build @data_set
   end
 
   def test_eval
-    result = @b.eval(
-    assert_equal
+    result = @b.eval(%w(Red SUV Domestic))
+    assert_equal 'No', result
  end
 
   def test_get_probability_map
-    map = @b.get_probability_map(
+    map = @b.get_probability_map(%w(Red SUV Domestic))
     assert_equal 2, map.keys.length
-    assert_in_delta 0.42, map[
-    assert_in_delta 0.58, map[
+    assert_in_delta 0.42, map['Yes'], 0.1
+    assert_in_delta 0.58, map['No'], 0.1
   end
 
 end
data/test/classifiers/simple_linear_regression_test.rb
ADDED
@@ -0,0 +1,37 @@
+require 'ai4r/classifiers/simple_linear_regression'
+require 'ai4r/data/data_set'
+require 'test/unit'
+
+include Ai4r::Classifiers
+include Ai4r::Data
+
+class SimpleLinearRegressionTest < Test::Unit::TestCase
+
+  @@data_labels = ["symboling", "normalized-losses", "wheel-base", "length", "width", "height", "curb-weight",
+                   "engine-size", "bore" , "stroke", "compression-ratio", "horsepower", "peak-rpm", "city-mpg",
+                   "highway-mpg", "class"]
+
+  @@data_items = [
+    [2,164,99.8,176.6,66.2,54.3,2337,109,3.19,3.4,10,102,5500,24,30,13950],
+    [2,164,99.4,176.6,66.4,54.3,2824,136,3.19,3.4,8,115,5500,18,22,17450],
+    [1,158,105.8,192.7,71.4,55.7,2844,136,3.19,3.4,8.5,110,5500,19,25,17710],
+    [1,158,105.8,192.7,71.4,55.9,3086,131,3.13,3.4,8.3,140,5500,17,20,23875],
+    [2,192,101.2,176.8,64.8,54.3,2395,108,3.5,2.8,8.8,101,5800,23,29,16430],
+    [0,192,101.2,176.8,64.8,54.3,2395,108,3.5,2.8,8.8,101,5800,23,29,16925],
+    [0,188,101.2,176.8,64.8,54.3,2710,164,3.31,3.19,9,121,4250,21,28,20970],
+    [0,188,101.2,176.8,64.8,54.3,2765,164,3.31,3.19,9,121,4250,21,28,21105],
+    [2,121,88.4,141.1,60.3,53.2,1488,61,2.91,3.03,9.5,48,5100,47,53,5151],
+  ]
+
+  def setup
+    @data_set = DataSet.new
+    @data_set = DataSet.new(:data_items => @@data_items, :data_labels => @@data_labels)
+    @c = SimpleLinearRegression.new.build @data_set
+  end
+
+  def test_eval
+    result = @c.eval([-1,95,109.1,188.8,68.9,55.5,3062,141,3.78,3.15,9.5,114,5400,19,25])
+    assert_equal 17218.444444444445, result
+  end
+
+end
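Outside the test harness, the new classifier is used the same way: build on a data set whose last attribute ("class") is the regression target, then eval an unlabeled instance. A minimal sketch with made-up toy data, assuming it behaves like Weka's SimpleLinearRegression (a least-squares line on the single attribute with the lowest squared error):

  require 'ai4r/classifiers/simple_linear_regression'
  require 'ai4r/data/data_set'

  # Toy data (made up): predict price; the last column is the target
  items  = [[1200, 3, 150_000.0], [900, 2, 110_000.0], [1500, 4, 200_000.0], [700, 2, 95_000.0]]
  labels = %w(area rooms price)
  data_set  = Ai4r::Data::DataSet.new(:data_items => items, :data_labels => labels)
  predictor = Ai4r::Classifiers::SimpleLinearRegression.new.build(data_set)
  puts predictor.eval([1100, 3])   # price estimate for an unseen instance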
data/test/clusterers/k_means_test.rb
CHANGED
@@ -17,7 +17,11 @@ class KMeansTest < Test::Unit::TestCase
 
   @@data = [ [10, 3], [3, 10], [2, 8], [2, 5], [3, 8], [10, 3],
              [1, 3], [8, 1], [2, 9], [2, 5], [3, 3], [9, 4]]
-
+
+  # k-means will generate an empty cluster with this data and initial centroid assignment
+  @@empty_cluster_data = [[-0.1, 0], [0, 0], [0.1, 0], [-0.1, 10], [0.1, 10], [0.2, 10]]
+  @@empty_centroid_indices = [0,1,2]
+
   def test_build
     data_set = DataSet.new(:data_items => @@data, :data_labels => ["X", "Y"])
     clusterer = KMeans.new.build(data_set, 4)
@@ -25,21 +29,45 @@ class KMeansTest < Test::Unit::TestCase
     # Verify that all 4 clusters are created
     assert_equal 4, clusterer.clusters.length
     assert_equal 4, clusterer.centroids.length
-    # The addition of all instances of every cluster must be equal
+    # The addition of all instances of every cluster must be equal to
     # the number of data points
     total_length = 0
     clusterer.clusters.each do |cluster|
       total_length += cluster.data_items.length
     end
     assert_equal @@data.length, total_length
-    # Data inside clusters must be the same as
+    # Data inside clusters must be the same as original data
     clusterer.clusters.each do |cluster|
       cluster.data_items.each do |data_item|
         assert @@data.include?(data_item)
       end
     end
   end
-
+
+  def test_build_and_eliminate_empty_clusters
+    data_set = DataSet.new(:data_items => @@empty_cluster_data, :data_labels => ["X", "Y"])
+    # :eliminate is the :on_empty default, so we don't need to pass it as a parameter for it
+    clusterer = KMeans.new.set_parameters({:centroid_indices=>@@empty_centroid_indices}).build(data_set, @@empty_centroid_indices.size)
+
+    # Verify that one cluster was eliminated
+    assert_equal @@empty_centroid_indices.size - 1, clusterer.clusters.length
+    assert_equal @@empty_centroid_indices.size - 1, clusterer.centroids.length
+
+    # The addition of all instances of every cluster must be equal to
+    # the number of data points
+    total_length = 0
+    clusterer.clusters.each do |cluster|
+      total_length += cluster.data_items.length
+    end
+    assert_equal @@empty_cluster_data.length, total_length
+    # Data inside clusters must be the same as original data
+    clusterer.clusters.each do |cluster|
+      cluster.data_items.each do |data_item|
+        assert @@empty_cluster_data.include?(data_item)
+      end
+    end
+  end
+
   def test_eval
     data_set = DataSet.new(:data_items => @@data, :data_labels => ["X", "Y"])
     clusterer = KMeans.new.build(data_set, 4)
@@ -54,13 +82,18 @@ class KMeansTest < Test::Unit::TestCase
       assert clusterer.distance(centroid, item) >= min_distance
     end
   end
-
+
   def test_distance
     clusterer = KMeans.new
-    # By default, distance returns the
+    # By default, distance returns the euclidean distance to the power of 2
     assert_equal 2385, clusterer.distance(
       [1, 10, "Chicago", 2],
       [10, 10, "London", 50])
+
+    # Ensure default distance raises error for nil argument
+    exception = assert_raise(TypeError) {clusterer.distance([1, 10], [nil, nil])}
+    assert_equal("nil can't be coerced into Fixnum", exception.message)
+
     # Test new distance definition
     manhattan_distance = lambda do |a, b|
       dist = 0.0
@@ -84,7 +117,42 @@ class KMeansTest < Test::Unit::TestCase
       build(data_set, 4)
     assert_equal 1, clusterer.iterations
   end
-
+
+  def test_centroid_indices
+    data_set = DataSet.new(:data_items => @@data, :data_labels => ["X", "Y"])
+    # centroid_indices need not be specified:
+    KMeans.new.build(data_set, 4)
+    # centroid_indices can be specified:
+    KMeans.new.set_parameters({:centroid_indices=>[0,1,2,3]}).build(data_set, 4)
+    # raises exception if number of clusters differs from length of centroid_indices:
+    exception = assert_raise(ArgumentError) {KMeans.new.set_parameters({:centroid_indices=>[0,1,2,3]}).build(data_set, 2)}
+    assert_equal('Length of centroid indices array differs from the specified number of clusters', exception.message)
+    # raises exception for bad centroid index:
+    exception = assert_raise(ArgumentError) {KMeans.new.set_parameters({:centroid_indices=>[0,1,2,@@data.size+10]}).build(data_set, 4)}
+    assert_equal("Invalid centroid index #{@@data.size+10}", exception.message)
+  end
+
+  def test_on_empty
+    data_set = DataSet.new(:data_items => @@empty_cluster_data, :data_labels => ["X", "Y"])
+    clusterer = KMeans.new.set_parameters({:centroid_indices=>@@empty_centroid_indices}).build(data_set, @@empty_centroid_indices.size)
+    # Verify that one cluster was eliminated
+    assert_equal @@empty_centroid_indices.size - 1, clusterer.clusters.length
+    # Verify that eliminate is the on_empty default
+    assert_equal 'eliminate', clusterer.on_empty
+    # Verify that invalid on_empty option throws an argument error
+    exception = assert_raise(ArgumentError) {KMeans.new.set_parameters({:centroid_indices=>@@empty_centroid_indices, :on_empty=>'ldkfje'}).build(data_set, @@empty_centroid_indices.size)}
+    assert_equal("Invalid value for on_empty", exception.message)
+    # Verify that on_empty option 'terminate' raises an error when an empty cluster arises
+    exception = assert_raise(TypeError) {KMeans.new.set_parameters({:centroid_indices=>@@empty_centroid_indices, :on_empty=>'terminate'}).build(data_set, @@empty_centroid_indices.size)}
+    assert_equal("nil can't be coerced into Float", exception.message)
+    clusterer = KMeans.new.set_parameters({:centroid_indices=>@@empty_centroid_indices, :on_empty=>'random'}).build(data_set, @@empty_centroid_indices.size)
+    # Verify that cluster was not eliminated
+    assert_equal @@empty_centroid_indices.size, clusterer.clusters.length
+    clusterer = KMeans.new.set_parameters({:centroid_indices=>@@empty_centroid_indices, :on_empty=>'outlier'}).build(data_set, @@empty_centroid_indices.size)
+    # Verify that cluster was not eliminated
+    assert_equal @@empty_centroid_indices.size, clusterer.clusters.length
+  end
+
   private
   def draw_map(clusterer)
     map = Array.new(11) {Array.new(11, 0)}
@@ -95,6 +163,5 @@ class KMeansTest < Test::Unit::TestCase
     end
     map.each { |row| puts row.inspect}
   end
-
 end
 
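For readers of the new tests, a short sketch of the two k-means parameters they exercise, reusing the empty-cluster fixture above (the behavior noted in the comment is what the assertions check):

  require 'ai4r/clusterers/k_means'
  require 'ai4r/data/data_set'

  items = [[-0.1, 0], [0, 0], [0.1, 0], [-0.1, 10], [0.1, 10], [0.2, 10]]
  data_set = Ai4r::Data::DataSet.new(:data_items => items, :data_labels => %w(X Y))

  # Seed the initial centroids explicitly and reseed empty clusters with outliers
  clusterer = Ai4r::Clusterers::KMeans.new.
    set_parameters({:centroid_indices => [0, 1, 2], :on_empty => 'outlier'}).
    build(data_set, 3)
  puts clusterer.clusters.length   # => 3; 'eliminate', the default, would drop the empty cluster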