ai4r 1.12 → 1.13
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.rdoc +7 -12
- data/examples/classifiers/simple_linear_regression_example.csv +159 -0
- data/examples/classifiers/simple_linear_regression_example.rb +15 -0
- data/examples/clusterers/clusterer_example.rb +56 -0
- data/examples/neural_network/backpropagation_example.rb +2 -1
- data/lib/ai4r.rb +3 -1
- data/lib/ai4r/classifiers/id3.rb +6 -2
- data/lib/ai4r/classifiers/multilayer_perceptron.rb +1 -1
- data/lib/ai4r/classifiers/naive_bayes.rb +24 -21
- data/lib/ai4r/classifiers/simple_linear_regression.rb +118 -0
- data/lib/ai4r/clusterers/average_linkage.rb +3 -3
- data/lib/ai4r/clusterers/bisecting_k_means.rb +2 -2
- data/lib/ai4r/clusterers/centroid_linkage.rb +3 -3
- data/lib/ai4r/clusterers/clusterer.rb +0 -11
- data/lib/ai4r/clusterers/complete_linkage.rb +3 -3
- data/lib/ai4r/clusterers/diana.rb +2 -2
- data/lib/ai4r/clusterers/k_means.rb +123 -21
- data/lib/ai4r/clusterers/median_linkage.rb +3 -3
- data/lib/ai4r/clusterers/single_linkage.rb +4 -4
- data/lib/ai4r/clusterers/ward_linkage.rb +4 -4
- data/lib/ai4r/clusterers/ward_linkage_hierarchical.rb +48 -0
- data/lib/ai4r/clusterers/weighted_average_linkage.rb +3 -3
- data/lib/ai4r/data/data_set.rb +12 -3
- data/lib/ai4r/data/proximity.rb +22 -0
- data/lib/ai4r/neural_network/backpropagation.rb +26 -15
- data/test/classifiers/id3_test.rb +12 -0
- data/test/classifiers/multilayer_perceptron_test.rb +1 -1
- data/test/classifiers/naive_bayes_test.rb +18 -18
- data/test/classifiers/simple_linear_regression_test.rb +37 -0
- data/test/clusterers/k_means_test.rb +75 -8
- data/test/clusterers/ward_linkage_hierarchical_test.rb +81 -0
- data/test/data/data_set_test.rb +8 -0
- data/test/data/proximity_test.rb +7 -1
- metadata +96 -55
@@ -16,7 +16,7 @@ module Ai4r
|
|
16
16
|
|
17
17
|
# Implementation of a Hierarchical clusterer with single linkage (Everitt et
|
18
18
|
# al., 2001 ; Johnson, 1967 ; Jain and Dubes, 1988 ; Sneath, 1957 )
|
19
|
-
# Hierarchical
|
19
|
+
# Hierarchical clusterers create one cluster per element, and then
|
20
20
|
# progressively merge clusters, until the required number of clusters
|
21
21
|
# is reached.
|
22
22
|
# With single linkage, the distance between two clusters is computed as the
|
@@ -30,8 +30,8 @@ module Ai4r
|
|
30
30
|
parameters_info :distance_function =>
|
31
31
|
"Custom implementation of distance function. " +
|
32
32
|
"It must be a closure receiving two data items and return the " +
|
33
|
-
"distance
|
34
|
-
"
|
33
|
+
"distance between them. By default, this algorithm uses " +
|
34
|
+
"euclidean distance of numeric attributes to the power of 2."
|
35
35
|
|
36
36
|
def initialize
|
37
37
|
@distance_function = lambda do |a,b|
|
@@ -105,7 +105,7 @@ module Ai4r
|
|
105
105
|
end
|
106
106
|
|
107
107
|
# ci and cj are the indexes of the clusters that are going to
|
108
|
-
# be merged. We need to remove distances from/to ci and
|
108
|
+
# be merged. We need to remove distances from/to ci and cj,
|
109
109
|
# and add distances from/to new cluster (ci U cj)
|
110
110
|
def update_distance_matrix(ci, cj)
|
111
111
|
ci, cj = cj, ci if cj > ci
|
@@ -16,10 +16,10 @@ module Ai4r
|
|
16
16
|
# Implementation of an Agglomerative Hierarchical clusterer with
|
17
17
|
# Ward's method linkage algorithm, aka the minimum variance method (Everitt
|
18
18
|
# et al., 2001 ; Jain and Dubes, 1988 ; Ward, 1963 ).
|
19
|
-
# Hierarchical
|
19
|
+
# Hierarchical clusterers create one cluster per element, and then
|
20
20
|
# progressively merge clusters, until the required number of clusters
|
21
21
|
# is reached.
|
22
|
-
# The objective of this method is to
|
22
|
+
# The objective of this method is to minimize the variance.
|
23
23
|
#
|
24
24
|
# D(cx, (ci U cj)) = (ni/(ni+nj+nx))*D(cx, ci) +
|
25
25
|
# (nj/(ni+nj+nx))*D(cx, cj) -
|
@@ -29,8 +29,8 @@ module Ai4r
|
|
29
29
|
parameters_info :distance_function =>
|
30
30
|
"Custom implementation of distance function. " +
|
31
31
|
"It must be a closure receiving two data items and return the " +
|
32
|
-
"distance
|
33
|
-
"
|
32
|
+
"distance between them. By default, this algorithm uses " +
|
33
|
+
"euclidean distance of numeric attributes to the power of 2."
|
34
34
|
|
35
35
|
# Build a new clusterer, using data examples found in data_set.
|
36
36
|
# Items will be clustered in "number_of_clusters" different
|
@@ -0,0 +1,48 @@
|
|
1
|
+
# Author:: Peter Lubell-Doughtie
|
2
|
+
# License:: BSD 3 Clause
|
3
|
+
# Project:: ai4r
|
4
|
+
# Url:: http://peet.ldee.org
|
5
|
+
|
6
|
+
require File.dirname(__FILE__) + '/../clusterers/ward_linkage'
|
7
|
+
|
8
|
+
module Ai4r
|
9
|
+
module Clusterers
|
10
|
+
|
11
|
+
# Hierarchical version that stores the clusters as merges occur.
|
12
|
+
class WardLinkageHierarchical < WardLinkage
|
13
|
+
|
14
|
+
attr_reader :cluster_tree
|
15
|
+
|
16
|
+
def initialize(depth = nil)
|
17
|
+
@cluster_tree = []
|
18
|
+
@depth = depth
|
19
|
+
@merges_so_far = 0
|
20
|
+
super()
|
21
|
+
end
|
22
|
+
|
23
|
+
def build(data_set, number_of_clusters)
|
24
|
+
data_len = data_set.data_items.length
|
25
|
+
@total_merges = data_len - number_of_clusters
|
26
|
+
super
|
27
|
+
@cluster_tree << self.clusters
|
28
|
+
@cluster_tree.reverse!
|
29
|
+
return self
|
30
|
+
end
|
31
|
+
|
32
|
+
protected
|
33
|
+
|
34
|
+
def merge_clusters(index_a, index_b, index_clusters)
|
35
|
+
# only store when no depth is set, or once the merge count exceeds (@total_merges - @depth)
|
36
|
+
if @depth.nil? or @merges_so_far > @total_merges - @depth
|
37
|
+
# store current clusters
|
38
|
+
stored_distance_matrix = @distance_matrix.dup
|
39
|
+
@cluster_tree << build_clusters_from_index_clusters(index_clusters)
|
40
|
+
@distance_matrix = stored_distance_matrix
|
41
|
+
end
|
42
|
+
@merges_so_far += 1
|
43
|
+
super
|
44
|
+
end
|
45
|
+
end
|
46
|
+
end
|
47
|
+
end
|
48
|
+
|
@@ -16,7 +16,7 @@ module Ai4r
|
|
16
16
|
# Implementation of an Agglomerative Hierarchical clusterer with
|
17
17
|
# weighted average linkage algorithm, aka weighted pair group method
|
18
18
|
# average or WPGMA (Jain and Dubes, 1988 ; McQuitty, 1966 )
|
19
|
-
# Hierarchical
|
19
|
+
# Hierarchical clusterers create one cluster per element, and then
|
20
20
|
# progressively merge clusters, until the required number of clusters
|
21
21
|
# is reached.
|
22
22
|
# Similar to AverageLinkage, but the distances between clusters are
|
@@ -28,8 +28,8 @@ module Ai4r
|
|
28
28
|
parameters_info :distance_function =>
|
29
29
|
"Custom implementation of distance function. " +
|
30
30
|
"It must be a closure receiving two data items and return the " +
|
31
|
-
"distance
|
32
|
-
"
|
31
|
+
"distance between them. By default, this algorithm uses " +
|
32
|
+
"euclidean distance of numeric attributes to the power of 2."
|
33
33
|
|
34
34
|
# Build a new clusterer, using data examples found in data_set.
|
35
35
|
# Items will be clustered in "number_of_clusters" different
|
data/lib/ai4r/data/data_set.rb
CHANGED
@@ -20,8 +20,6 @@ module Ai4r
|
|
20
20
|
# the data_labels property.
|
21
21
|
class DataSet
|
22
22
|
|
23
|
-
@@number_regex = /(((\b[0-9]+)?\.)?\b[0-9]+([eE][-+]?[0-9]+)?\b)/
|
24
|
-
|
25
23
|
attr_reader :data_labels, :data_items
|
26
24
|
|
27
25
|
# Create a new DataSet. By default, empty.
|
@@ -82,11 +80,18 @@ module Ai4r
|
|
82
80
|
def parse_csv(filepath)
|
83
81
|
items = []
|
84
82
|
open_csv_file(filepath) do |row|
|
85
|
-
items << row.collect{|x| (x
|
83
|
+
items << row.collect{|x| is_number?(x) ? Float(x) : x }
|
86
84
|
end
|
87
85
|
set_data_items(items)
|
88
86
|
end
|
89
87
|
|
88
|
+
# Same as load_csv_with_labels, but it will try to convert cell contents to numbers.
|
89
|
+
def parse_csv_with_labels(filepath)
|
90
|
+
parse_csv(filepath)
|
91
|
+
@data_labels = @data_items.shift
|
92
|
+
return self
|
93
|
+
end
|
94
|
+
|
90
95
|
# Set data labels.
|
91
96
|
# Data labels must have the following format:
|
92
97
|
# [ 'city', 'age_range', 'gender', 'marketing_target' ]
|
@@ -224,6 +229,10 @@ module Ai4r
|
|
224
229
|
|
225
230
|
protected
|
226
231
|
|
232
|
+
def is_number?(x)
|
233
|
+
true if Float(x) rescue false
|
234
|
+
end
|
235
|
+
|
227
236
|
def check_data_items(data_items)
|
228
237
|
if !data_items || data_items.empty?
|
229
238
|
raise ArgumentError, "Examples data set must not be empty."
|
data/lib/ai4r/data/proximity.rb
CHANGED
@@ -92,6 +92,28 @@ module Ai4r
|
|
92
92
|
return 1.0/similarity - 1
|
93
93
|
end
|
94
94
|
|
95
|
+
# Cosine similarity is a measure of similarity between two vectors
|
96
|
+
# of an inner product space that measures the cosine of the
|
97
|
+
# angle between them (http://en.wikipedia.org/wiki/Cosine_similarity).
|
98
|
+
#
|
99
|
+
# Parameters a and b are vectors with continuous attributes.
|
100
|
+
#
|
101
|
+
# D = 1 - sum(a[i] * b[i]) / (sqrt(sum(a[i]**2)) * sqrt(sum(b[i]**2)))
|
102
|
+
def self.cosine_distance(a,b)
|
103
|
+
dot_product = 0.0
|
104
|
+
norm_a = 0.0
|
105
|
+
norm_b = 0.0
|
106
|
+
magnitude = 0.0
|
107
|
+
|
108
|
+
a.each_index do |i|
|
109
|
+
dot_product += a[i] * b[i]
|
110
|
+
norm_a += a[i] ** 2
|
111
|
+
norm_b += b[i] ** 2
|
112
|
+
end
|
113
|
+
|
114
|
+
magnitude = Math.sqrt(norm_a) * Math.sqrt(norm_b)
|
115
|
+
return 1 - (dot_product / magnitude)
|
116
|
+
end
|
95
117
|
end
|
96
118
|
|
97
119
|
end
|
@@ -44,7 +44,7 @@ module Ai4r
|
|
44
44
|
# Use class method get_parameters_info to obtain details on the algorithm
|
45
45
|
# parameters. Use set_parameters to set values for these parameters.
|
46
46
|
#
|
47
|
-
# * :disable_bias => If true, the
|
47
|
+
# * :disable_bias => If true, the algorithm will not use bias nodes.
|
48
48
|
# False by default.
|
49
49
|
# * :initial_weight_function => f(n, i, j) must return the initial
|
50
50
|
# weight for the connection between the node i in layer n, and node j in
|
@@ -86,7 +86,7 @@ module Ai4r
|
|
86
86
|
|
87
87
|
include Ai4r::Data::Parameterizable
|
88
88
|
|
89
|
-
parameters_info :disable_bias => "If true, the
|
89
|
+
parameters_info :disable_bias => "If true, the algorithm will not use "+
|
90
90
|
"bias nodes. False by default.",
|
91
91
|
:initial_weight_function => "f(n, i, j) must return the initial "+
|
92
92
|
"weight for the conection between the node i in layer n, and "+
|
@@ -136,6 +136,17 @@ module Ai4r
|
|
136
136
|
return @activation_nodes.last.clone
|
137
137
|
end
|
138
138
|
|
139
|
+
# Evaluates the input and returns most active node
|
140
|
+
# E.g.
|
141
|
+
# net = Backpropagation.new([4, 3, 2])
|
142
|
+
# net.eval_result([25, 32.3, 12.8, 1.5])
|
143
|
+
# # eval gives [0.83, 0.03]
|
144
|
+
# # => 0
|
145
|
+
def eval_result(input_values)
|
146
|
+
result = eval(input_values)
|
147
|
+
result.index(result.max)
|
148
|
+
end
|
149
|
+
|
139
150
|
# This method trains the network using the backpropagation algorithm.
|
140
151
|
#
|
141
152
|
# input: Networks input
|
@@ -178,20 +189,20 @@ module Ai4r
|
|
178
189
|
@last_changes,
|
179
190
|
@activation_nodes
|
180
191
|
]
|
181
|
-
|
192
|
+
end
|
182
193
|
|
183
|
-
|
184
|
-
|
185
|
-
|
186
|
-
|
187
|
-
|
188
|
-
|
189
|
-
|
190
|
-
|
191
|
-
|
192
|
-
|
193
|
-
|
194
|
-
|
194
|
+
def marshal_load(ary)
|
195
|
+
@structure,
|
196
|
+
@disable_bias,
|
197
|
+
@learning_rate,
|
198
|
+
@momentum,
|
199
|
+
@weights,
|
200
|
+
@last_changes,
|
201
|
+
@activation_nodes = ary
|
202
|
+
@initial_weight_function = lambda { |n, i, j| ((rand 2000)/1000.0) - 1}
|
203
|
+
@propagation_function = lambda { |x| 1/(1+Math.exp(-1*(x))) } #lambda { |x| Math.tanh(x) }
|
204
|
+
@derivative_propagation_function = lambda { |y| y*(1-y) } #lambda { |y| 1.0 - y**2 }
|
205
|
+
end
|
195
206
|
|
196
207
|
|
197
208
|
# Propagate error backwards
|
@@ -203,6 +203,18 @@ class ID3Test < Test::Unit::TestCase
|
|
203
203
|
eval id3.get_rules
|
204
204
|
assert_equal 'N', marketing_target
|
205
205
|
end
|
206
|
+
|
207
|
+
def test_model_failure
|
208
|
+
bad_data_items = [ ['a', 'Y'],
|
209
|
+
['b', 'N'],
|
210
|
+
]
|
211
|
+
bad_data_labels = ['bogus', 'target']
|
212
|
+
id3 = ID3.new.build(DataSet.new(:data_items =>bad_data_items, :data_labels => bad_data_labels))
|
213
|
+
assert_raise ModelFailureError do
|
214
|
+
id3.eval(['c'])
|
215
|
+
end
|
216
|
+
assert_equal true, true
|
217
|
+
end
|
206
218
|
end
|
207
219
|
|
208
220
|
|
@@ -23,7 +23,7 @@ class MultilayerPerceptronTest < Test::Unit::TestCase
|
|
23
23
|
['Chicago', '[50-80]', 'M', 'N'],
|
24
24
|
])
|
25
25
|
|
26
|
-
|
26
|
+
def test_initialize
|
27
27
|
classifier = MultilayerPerceptron.new
|
28
28
|
assert_equal 1, classifier.active_node_value
|
29
29
|
assert_equal 0, classifier.inactive_node_value
|
@@ -7,37 +7,37 @@ include Ai4r::Data
|
|
7
7
|
|
8
8
|
class NaiveBayesTest < Test::Unit::TestCase
|
9
9
|
|
10
|
-
@@data_labels =
|
10
|
+
@@data_labels = %w(Color Type Origin Stolen?)
|
11
11
|
|
12
12
|
@@data_items = [
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
13
|
+
%w(Red Sports Domestic Yes),
|
14
|
+
%w(Red Sports Domestic No),
|
15
|
+
%w(Red Sports Domestic Yes),
|
16
|
+
%w(Yellow Sports Domestic No),
|
17
|
+
%w(Yellow Sports Imported Yes),
|
18
|
+
%w(Yellow SUV Imported No),
|
19
|
+
%w(Yellow SUV Imported Yes),
|
20
|
+
%w(Yellow Sports Domestic No),
|
21
|
+
%w(Red SUV Imported No),
|
22
|
+
%w(Red Sports Imported Yes)
|
23
|
+
]
|
24
24
|
|
25
25
|
def setup
|
26
26
|
@data_set = DataSet.new
|
27
27
|
@data_set = DataSet.new(:data_items => @@data_items, :data_labels => @@data_labels)
|
28
|
-
@b = NaiveBayes.new.set_parameters({:m=>3}).build @data_set
|
28
|
+
@b = NaiveBayes.new.set_parameters({:m => 3}).build @data_set
|
29
29
|
end
|
30
30
|
|
31
31
|
def test_eval
|
32
|
-
result = @b.eval(
|
33
|
-
assert_equal
|
32
|
+
result = @b.eval(%w(Red SUV Domestic))
|
33
|
+
assert_equal 'No', result
|
34
34
|
end
|
35
35
|
|
36
36
|
def test_get_probability_map
|
37
|
-
map = @b.get_probability_map(
|
37
|
+
map = @b.get_probability_map(%w(Red SUV Domestic))
|
38
38
|
assert_equal 2, map.keys.length
|
39
|
-
assert_in_delta 0.42, map[
|
40
|
-
assert_in_delta 0.58, map[
|
39
|
+
assert_in_delta 0.42, map['Yes'], 0.1
|
40
|
+
assert_in_delta 0.58, map['No'], 0.1
|
41
41
|
end
|
42
42
|
|
43
43
|
end
|
@@ -0,0 +1,37 @@
|
|
1
|
+
require 'ai4r/classifiers/simple_linear_regression'
|
2
|
+
require 'ai4r/data/data_set'
|
3
|
+
require 'test/unit'
|
4
|
+
|
5
|
+
include Ai4r::Classifiers
|
6
|
+
include Ai4r::Data
|
7
|
+
|
8
|
+
class SimpleLinearRegressionTest < Test::Unit::TestCase
|
9
|
+
|
10
|
+
@@data_labels = ["symboling", "normalized-losses", "wheel-base", "length", "width", "height", "curb-weight",
|
11
|
+
"engine-size", "bore" , "stroke", "compression-ratio", "horsepower", "peak-rpm", "city-mpg",
|
12
|
+
"highway-mpg", "class"]
|
13
|
+
|
14
|
+
@@data_items = [
|
15
|
+
[2,164,99.8,176.6,66.2,54.3,2337,109,3.19,3.4,10,102,5500,24,30,13950],
|
16
|
+
[2,164,99.4,176.6,66.4,54.3,2824,136,3.19,3.4,8,115,5500,18,22,17450],
|
17
|
+
[1,158,105.8,192.7,71.4,55.7,2844,136,3.19,3.4,8.5,110,5500,19,25,17710],
|
18
|
+
[1,158,105.8,192.7,71.4,55.9,3086,131,3.13,3.4,8.3,140,5500,17,20,23875],
|
19
|
+
[2,192,101.2,176.8,64.8,54.3,2395,108,3.5,2.8,8.8,101,5800,23,29,16430],
|
20
|
+
[0,192,101.2,176.8,64.8,54.3,2395,108,3.5,2.8,8.8,101,5800,23,29,16925],
|
21
|
+
[0,188,101.2,176.8,64.8,54.3,2710,164,3.31,3.19,9,121,4250,21,28,20970],
|
22
|
+
[0,188,101.2,176.8,64.8,54.3,2765,164,3.31,3.19,9,121,4250,21,28,21105],
|
23
|
+
[2,121,88.4,141.1,60.3,53.2,1488,61,2.91,3.03,9.5,48,5100,47,53,5151],
|
24
|
+
]
|
25
|
+
|
26
|
+
def setup
|
27
|
+
@data_set = DataSet.new
|
28
|
+
@data_set = DataSet.new(:data_items => @@data_items, :data_labels => @@data_labels)
|
29
|
+
@c = SimpleLinearRegression.new.build @data_set
|
30
|
+
end
|
31
|
+
|
32
|
+
def test_eval
|
33
|
+
result = @c.eval([-1,95,109.1,188.8,68.9,55.5,3062,141,3.78,3.15,9.5,114,5400,19,25])
|
34
|
+
assert_equal 17218.444444444445, result
|
35
|
+
end
|
36
|
+
|
37
|
+
end
|
@@ -17,7 +17,11 @@ class KMeansTest < Test::Unit::TestCase
|
|
17
17
|
|
18
18
|
@@data = [ [10, 3], [3, 10], [2, 8], [2, 5], [3, 8], [10, 3],
|
19
19
|
[1, 3], [8, 1], [2, 9], [2, 5], [3, 3], [9, 4]]
|
20
|
-
|
20
|
+
|
21
|
+
# k-means will generate an empty cluster with this data and initial centroid assignment
|
22
|
+
@@empty_cluster_data = [[-0.1, 0], [0, 0], [0.1, 0], [-0.1, 10], [0.1, 10], [0.2, 10]]
|
23
|
+
@@empty_centroid_indices = [0,1,2]
|
24
|
+
|
21
25
|
def test_build
|
22
26
|
data_set = DataSet.new(:data_items => @@data, :data_labels => ["X", "Y"])
|
23
27
|
clusterer = KMeans.new.build(data_set, 4)
|
@@ -25,21 +29,45 @@ class KMeansTest < Test::Unit::TestCase
|
|
25
29
|
# Verify that all 4 clusters are created
|
26
30
|
assert_equal 4, clusterer.clusters.length
|
27
31
|
assert_equal 4, clusterer.centroids.length
|
28
|
-
# The addition of all instances of every cluster must be equal
|
32
|
+
# The addition of all instances of every cluster must be equal to
|
29
33
|
# the number of data points
|
30
34
|
total_length = 0
|
31
35
|
clusterer.clusters.each do |cluster|
|
32
36
|
total_length += cluster.data_items.length
|
33
37
|
end
|
34
38
|
assert_equal @@data.length, total_length
|
35
|
-
# Data inside clusters must be the same as
|
39
|
+
# Data inside clusters must be the same as original data
|
36
40
|
clusterer.clusters.each do |cluster|
|
37
41
|
cluster.data_items.each do |data_item|
|
38
42
|
assert @@data.include?(data_item)
|
39
43
|
end
|
40
44
|
end
|
41
45
|
end
|
42
|
-
|
46
|
+
|
47
|
+
def test_build_and_eliminate_empty_clusters
|
48
|
+
data_set = DataSet.new(:data_items => @@empty_cluster_data, :data_labels => ["X", "Y"])
|
49
|
+
# 'eliminate' is the :on_empty default, so we don't need to pass it as a parameter
|
50
|
+
clusterer = KMeans.new.set_parameters({:centroid_indices=>@@empty_centroid_indices}).build(data_set, @@empty_centroid_indices.size)
|
51
|
+
|
52
|
+
# Verify that one cluster was eliminated
|
53
|
+
assert_equal @@empty_centroid_indices.size - 1, clusterer.clusters.length
|
54
|
+
assert_equal @@empty_centroid_indices.size - 1, clusterer.centroids.length
|
55
|
+
|
56
|
+
# The addition of all instances of every cluster must be equal to
|
57
|
+
# the number of data points
|
58
|
+
total_length = 0
|
59
|
+
clusterer.clusters.each do |cluster|
|
60
|
+
total_length += cluster.data_items.length
|
61
|
+
end
|
62
|
+
assert_equal @@empty_cluster_data.length, total_length
|
63
|
+
# Data inside clusters must be the same as original data
|
64
|
+
clusterer.clusters.each do |cluster|
|
65
|
+
cluster.data_items.each do |data_item|
|
66
|
+
assert @@empty_cluster_data.include?(data_item)
|
67
|
+
end
|
68
|
+
end
|
69
|
+
end
|
70
|
+
|
43
71
|
def test_eval
|
44
72
|
data_set = DataSet.new(:data_items => @@data, :data_labels => ["X", "Y"])
|
45
73
|
clusterer = KMeans.new.build(data_set, 4)
|
@@ -54,13 +82,18 @@ class KMeansTest < Test::Unit::TestCase
|
|
54
82
|
assert clusterer.distance(centroid, item) >= min_distance
|
55
83
|
end
|
56
84
|
end
|
57
|
-
|
85
|
+
|
58
86
|
def test_distance
|
59
87
|
clusterer = KMeans.new
|
60
|
-
# By default, distance returns the
|
88
|
+
# By default, distance returns the euclidean distance to the power of 2
|
61
89
|
assert_equal 2385, clusterer.distance(
|
62
90
|
[1, 10, "Chicago", 2],
|
63
91
|
[10, 10, "London", 50])
|
92
|
+
|
93
|
+
# Ensure default distance raises error for nil argument
|
94
|
+
exception = assert_raise(TypeError) {clusterer.distance([1, 10], [nil, nil])}
|
95
|
+
assert_equal("nil can't be coerced into Fixnum", exception.message)
|
96
|
+
|
64
97
|
# Test new distance definition
|
65
98
|
manhattan_distance = lambda do |a, b|
|
66
99
|
dist = 0.0
|
@@ -84,7 +117,42 @@ class KMeansTest < Test::Unit::TestCase
|
|
84
117
|
build(data_set, 4)
|
85
118
|
assert_equal 1, clusterer.iterations
|
86
119
|
end
|
87
|
-
|
120
|
+
|
121
|
+
def test_centroid_indices
|
122
|
+
data_set = DataSet.new(:data_items => @@data, :data_labels => ["X", "Y"])
|
123
|
+
# centroid_indices need not be specified:
|
124
|
+
KMeans.new.build(data_set, 4)
|
125
|
+
# centroid_indices can be specified:
|
126
|
+
KMeans.new.set_parameters({:centroid_indices=>[0,1,2,3]}).build(data_set, 4)
|
127
|
+
# raises exception if number of clusters differs from length of centroid_indices:
|
128
|
+
exception = assert_raise(ArgumentError) {KMeans.new.set_parameters({:centroid_indices=>[0,1,2,3]}).build(data_set, 2)}
|
129
|
+
assert_equal('Length of centroid indices array differs from the specified number of clusters', exception.message)
|
130
|
+
# raises exception for bad centroid index:
|
131
|
+
exception = assert_raise(ArgumentError) {KMeans.new.set_parameters({:centroid_indices=>[0,1,2,@@data.size+10]}).build(data_set, 4)}
|
132
|
+
assert_equal("Invalid centroid index #{@@data.size+10}", exception.message)
|
133
|
+
end
|
134
|
+
|
135
|
+
def test_on_empty
|
136
|
+
data_set = DataSet.new(:data_items => @@empty_cluster_data, :data_labels => ["X", "Y"])
|
137
|
+
clusterer = KMeans.new.set_parameters({:centroid_indices=>@@empty_centroid_indices}).build(data_set, @@empty_centroid_indices.size)
|
138
|
+
# Verify that one cluster was eliminated
|
139
|
+
assert_equal @@empty_centroid_indices.size - 1, clusterer.clusters.length
|
140
|
+
# Verify that eliminate is the on_empty default
|
141
|
+
assert_equal 'eliminate', clusterer.on_empty
|
142
|
+
# Verify that invalid on_empty option throws an argument error
|
143
|
+
exception = assert_raise(ArgumentError) {KMeans.new.set_parameters({:centroid_indices=>@@empty_centroid_indices, :on_empty=>'ldkfje'}).build(data_set, @@empty_centroid_indices.size)}
|
144
|
+
assert_equal("Invalid value for on_empty", exception.message)
|
145
|
+
# Verify that on_empty option 'terminate' raises an error when an empty cluster arises
|
146
|
+
exception = assert_raise(TypeError) {KMeans.new.set_parameters({:centroid_indices=>@@empty_centroid_indices, :on_empty=>'terminate'}).build(data_set, @@empty_centroid_indices.size)}
|
147
|
+
assert_equal("nil can't be coerced into Float", exception.message)
|
148
|
+
clusterer = KMeans.new.set_parameters({:centroid_indices=>@@empty_centroid_indices, :on_empty=>'random'}).build(data_set, @@empty_centroid_indices.size)
|
149
|
+
# Verify that cluster was not eliminated
|
150
|
+
assert_equal @@empty_centroid_indices.size, clusterer.clusters.length
|
151
|
+
clusterer = KMeans.new.set_parameters({:centroid_indices=>@@empty_centroid_indices, :on_empty=>'outlier'}).build(data_set, @@empty_centroid_indices.size)
|
152
|
+
# Verify that cluster was not eliminated
|
153
|
+
assert_equal @@empty_centroid_indices.size, clusterer.clusters.length
|
154
|
+
end
|
155
|
+
|
88
156
|
private
|
89
157
|
def draw_map(clusterer)
|
90
158
|
map = Array.new(11) {Array.new(11, 0)}
|
@@ -95,6 +163,5 @@ class KMeansTest < Test::Unit::TestCase
|
|
95
163
|
end
|
96
164
|
map.each { |row| puts row.inspect}
|
97
165
|
end
|
98
|
-
|
99
166
|
end
|
100
167
|
|