ai4r 1.13 → 2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/README.md +174 -0
- data/examples/classifiers/hyperpipes_data.csv +14 -0
- data/examples/classifiers/hyperpipes_example.rb +22 -0
- data/examples/classifiers/ib1_example.rb +12 -0
- data/examples/classifiers/id3_example.rb +15 -10
- data/examples/classifiers/id3_graphviz_example.rb +17 -0
- data/examples/classifiers/logistic_regression_example.rb +11 -0
- data/examples/classifiers/naive_bayes_attributes_example.rb +13 -0
- data/examples/classifiers/naive_bayes_example.rb +12 -13
- data/examples/classifiers/one_r_example.rb +27 -0
- data/examples/classifiers/parameter_tutorial.rb +29 -0
- data/examples/classifiers/prism_nominal_example.rb +15 -0
- data/examples/classifiers/prism_numeric_example.rb +21 -0
- data/examples/classifiers/simple_linear_regression_example.rb +14 -11
- data/examples/classifiers/zero_and_one_r_example.rb +34 -0
- data/examples/classifiers/zero_one_r_data.csv +8 -0
- data/examples/clusterers/clusterer_example.rb +40 -34
- data/examples/clusterers/dbscan_example.rb +17 -0
- data/examples/clusterers/dendrogram_example.rb +17 -0
- data/examples/clusterers/hierarchical_dendrogram_example.rb +20 -0
- data/examples/clusterers/kmeans_custom_example.rb +26 -0
- data/examples/genetic_algorithm/bitstring_example.rb +41 -0
- data/examples/genetic_algorithm/genetic_algorithm_example.rb +26 -18
- data/examples/genetic_algorithm/kmeans_seed_tuning.rb +45 -0
- data/examples/neural_network/backpropagation_example.rb +48 -48
- data/examples/neural_network/hopfield_example.rb +45 -0
- data/examples/neural_network/patterns_with_base_noise.rb +39 -39
- data/examples/neural_network/patterns_with_noise.rb +41 -39
- data/examples/neural_network/train_epochs_callback.rb +25 -0
- data/examples/neural_network/training_patterns.rb +39 -39
- data/examples/neural_network/transformer_text_classification.rb +78 -0
- data/examples/neural_network/xor_example.rb +23 -22
- data/examples/reinforcement/q_learning_example.rb +10 -0
- data/examples/som/som_data.rb +155 -152
- data/examples/som/som_multi_node_example.rb +12 -13
- data/examples/som/som_single_example.rb +12 -15
- data/examples/transformer/decode_classifier_example.rb +68 -0
- data/examples/transformer/deterministic_example.rb +10 -0
- data/examples/transformer/seq2seq_example.rb +16 -0
- data/lib/ai4r/classifiers/classifier.rb +24 -16
- data/lib/ai4r/classifiers/gradient_boosting.rb +64 -0
- data/lib/ai4r/classifiers/hyperpipes.rb +119 -43
- data/lib/ai4r/classifiers/ib1.rb +122 -32
- data/lib/ai4r/classifiers/id3.rb +524 -145
- data/lib/ai4r/classifiers/logistic_regression.rb +96 -0
- data/lib/ai4r/classifiers/multilayer_perceptron.rb +75 -59
- data/lib/ai4r/classifiers/naive_bayes.rb +95 -34
- data/lib/ai4r/classifiers/one_r.rb +112 -44
- data/lib/ai4r/classifiers/prism.rb +167 -76
- data/lib/ai4r/classifiers/random_forest.rb +72 -0
- data/lib/ai4r/classifiers/simple_linear_regression.rb +83 -58
- data/lib/ai4r/classifiers/support_vector_machine.rb +91 -0
- data/lib/ai4r/classifiers/votes.rb +57 -0
- data/lib/ai4r/classifiers/zero_r.rb +71 -30
- data/lib/ai4r/clusterers/average_linkage.rb +46 -27
- data/lib/ai4r/clusterers/bisecting_k_means.rb +50 -44
- data/lib/ai4r/clusterers/centroid_linkage.rb +52 -36
- data/lib/ai4r/clusterers/cluster_tree.rb +50 -0
- data/lib/ai4r/clusterers/clusterer.rb +29 -14
- data/lib/ai4r/clusterers/complete_linkage.rb +42 -31
- data/lib/ai4r/clusterers/dbscan.rb +134 -0
- data/lib/ai4r/clusterers/diana.rb +75 -49
- data/lib/ai4r/clusterers/k_means.rb +270 -135
- data/lib/ai4r/clusterers/median_linkage.rb +49 -33
- data/lib/ai4r/clusterers/single_linkage.rb +196 -88
- data/lib/ai4r/clusterers/ward_linkage.rb +51 -35
- data/lib/ai4r/clusterers/ward_linkage_hierarchical.rb +25 -10
- data/lib/ai4r/clusterers/weighted_average_linkage.rb +48 -32
- data/lib/ai4r/data/data_set.rb +223 -103
- data/lib/ai4r/data/parameterizable.rb +31 -25
- data/lib/ai4r/data/proximity.rb +62 -62
- data/lib/ai4r/data/statistics.rb +46 -35
- data/lib/ai4r/experiment/classifier_evaluator.rb +84 -32
- data/lib/ai4r/experiment/split.rb +39 -0
- data/lib/ai4r/genetic_algorithm/chromosome_base.rb +43 -0
- data/lib/ai4r/genetic_algorithm/genetic_algorithm.rb +92 -170
- data/lib/ai4r/genetic_algorithm/tsp_chromosome.rb +83 -0
- data/lib/ai4r/hmm/hidden_markov_model.rb +134 -0
- data/lib/ai4r/neural_network/activation_functions.rb +37 -0
- data/lib/ai4r/neural_network/backpropagation.rb +399 -134
- data/lib/ai4r/neural_network/hopfield.rb +175 -58
- data/lib/ai4r/neural_network/transformer.rb +194 -0
- data/lib/ai4r/neural_network/weight_initializations.rb +40 -0
- data/lib/ai4r/reinforcement/policy_iteration.rb +66 -0
- data/lib/ai4r/reinforcement/q_learning.rb +51 -0
- data/lib/ai4r/search/a_star.rb +76 -0
- data/lib/ai4r/search/bfs.rb +50 -0
- data/lib/ai4r/search/dfs.rb +50 -0
- data/lib/ai4r/search/mcts.rb +118 -0
- data/lib/ai4r/search.rb +12 -0
- data/lib/ai4r/som/distance_metrics.rb +29 -0
- data/lib/ai4r/som/layer.rb +28 -17
- data/lib/ai4r/som/node.rb +61 -32
- data/lib/ai4r/som/som.rb +158 -41
- data/lib/ai4r/som/two_phase_layer.rb +21 -25
- data/lib/ai4r/version.rb +3 -0
- data/lib/ai4r.rb +57 -28
- metadata +79 -109
- data/README.rdoc +0 -39
- data/test/classifiers/hyperpipes_test.rb +0 -84
- data/test/classifiers/ib1_test.rb +0 -78
- data/test/classifiers/id3_test.rb +0 -220
- data/test/classifiers/multilayer_perceptron_test.rb +0 -79
- data/test/classifiers/naive_bayes_test.rb +0 -43
- data/test/classifiers/one_r_test.rb +0 -62
- data/test/classifiers/prism_test.rb +0 -85
- data/test/classifiers/simple_linear_regression_test.rb +0 -37
- data/test/classifiers/zero_r_test.rb +0 -50
- data/test/clusterers/average_linkage_test.rb +0 -51
- data/test/clusterers/bisecting_k_means_test.rb +0 -66
- data/test/clusterers/centroid_linkage_test.rb +0 -53
- data/test/clusterers/complete_linkage_test.rb +0 -57
- data/test/clusterers/diana_test.rb +0 -69
- data/test/clusterers/k_means_test.rb +0 -167
- data/test/clusterers/median_linkage_test.rb +0 -53
- data/test/clusterers/single_linkage_test.rb +0 -122
- data/test/clusterers/ward_linkage_hierarchical_test.rb +0 -81
- data/test/clusterers/ward_linkage_test.rb +0 -53
- data/test/clusterers/weighted_average_linkage_test.rb +0 -53
- data/test/data/data_set_test.rb +0 -104
- data/test/data/proximity_test.rb +0 -87
- data/test/data/statistics_test.rb +0 -65
- data/test/experiment/classifier_evaluator_test.rb +0 -76
- data/test/genetic_algorithm/chromosome_test.rb +0 -57
- data/test/genetic_algorithm/genetic_algorithm_test.rb +0 -81
- data/test/neural_network/backpropagation_test.rb +0 -82
- data/test/neural_network/hopfield_test.rb +0 -72
- data/test/som/som_test.rb +0 -97
@@ -1,228 +1,363 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
# Author:: Sergio Fierens (implementation)
|
2
4
|
# License:: MPL 1.1
|
3
5
|
# Project:: ai4r
|
4
|
-
# Url::
|
6
|
+
# Url:: https://github.com/SergioFierens/ai4r
|
5
7
|
#
|
6
|
-
# You can redistribute it and/or modify it under the terms of
|
7
|
-
# the Mozilla Public License version 1.1 as published by the
|
8
|
+
# You can redistribute it and/or modify it under the terms of
|
9
|
+
# the Mozilla Public License version 1.1 as published by the
|
8
10
|
# Mozilla Foundation at http://www.mozilla.org/MPL/MPL-1.1.txt
|
9
11
|
|
10
|
-
|
11
|
-
|
12
|
-
|
12
|
+
require_relative '../data/data_set'
|
13
|
+
require_relative '../data/proximity'
|
14
|
+
require_relative '../clusterers/clusterer'
|
13
15
|
|
14
16
|
module Ai4r
|
15
17
|
module Clusterers
|
16
|
-
|
17
|
-
# The k-means algorithm is an algorithm to cluster n objects
|
18
|
+
# The k-means algorithm is an algorithm to cluster n objects
|
18
19
|
# based on attributes into k partitions, with k < n.
|
19
|
-
#
|
20
|
+
#
|
20
21
|
# More about K Means algorithm:
|
21
|
-
# http://en.wikipedia.org/wiki/K-means_algorithm
|
22
|
+
# http://en.wikipedia.org/wiki/K-means_algorithm
|
22
23
|
class KMeans < Clusterer
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
24
|
+
attr_reader :data_set, :number_of_clusters, :clusters, :centroids, :iterations, :history
|
25
|
+
|
26
|
+
parameters_info(
|
27
|
+
max_iterations: 'Maximum number of iterations to build the clusterer. By default it is uncapped.',
|
28
|
+
distance_function: 'Custom implementation of distance function. ' \
|
29
|
+
'It must be a closure receiving two data items and return the ' \
|
30
|
+
'distance between them. By default, this algorithm uses ' \
|
31
|
+
'euclidean distance of numeric attributes to the power of 2.',
|
32
|
+
centroid_function: 'Custom implementation to calculate the ' \
|
33
|
+
'centroid of a cluster. It must be a closure receiving an array of ' \
|
34
|
+
'data sets, and return an array of data items, representing the ' \
|
35
|
+
'centroids of for each data set. ' \
|
36
|
+
'By default, this algorithm returns a data items using the mode ' \
|
37
|
+
'or mean of each attribute on each data set.',
|
38
|
+
centroid_indices: 'Indices of data items (indexed from 0) to be ' \
|
39
|
+
'the initial centroids. Otherwise, the initial centroids will be ' \
|
40
|
+
'assigned randomly from the data set.',
|
41
|
+
on_empty: 'Action to take if a cluster becomes empty, with values ' \
|
42
|
+
"'eliminate' (the default action, eliminate the empty cluster), " \
|
43
|
+
"'terminate' (terminate with error), 'random' (relocate the " \
|
44
|
+
"empty cluster to a random point), 'outlier' (relocate the " \
|
45
|
+
'empty cluster to the point furthest from its centroid).',
|
46
|
+
random_seed: "Seed value used to initialize Ruby's random number " \
|
47
|
+
'generator when selecting random centroids.',
|
48
|
+
init_method: 'Strategy to initialize centroids. Available values: ' \
|
49
|
+
':random (default) and :kmeans_plus_plus.',
|
50
|
+
restarts: 'Number of random initializations to perform. ' \
|
51
|
+
'The best run (lowest SSE) will be kept.',
|
52
|
+
track_history: 'Keep centroids and assignments for each iteration ' \
|
53
|
+
'when building the clusterer.'
|
54
|
+
)
|
55
|
+
|
56
|
+
# @return [Object]
|
48
57
|
def initialize
|
58
|
+
super()
|
49
59
|
@distance_function = nil
|
50
60
|
@max_iterations = nil
|
51
|
-
@centroid_function = lambda do |data_sets|
|
52
|
-
data_sets.collect
|
61
|
+
@centroid_function = lambda do |data_sets|
|
62
|
+
data_sets.collect(&:get_mean_or_mode)
|
53
63
|
end
|
54
64
|
@centroid_indices = []
|
55
65
|
@on_empty = 'eliminate' # default if none specified
|
66
|
+
@random_seed = nil
|
67
|
+
@rng = nil
|
68
|
+
@init_method = :random
|
69
|
+
@restarts = 1
|
70
|
+
@track_history = false
|
56
71
|
end
|
57
|
-
|
58
|
-
|
72
|
+
|
59
73
|
# Build a new clusterer, using data examples found in data_set.
|
60
74
|
# Items will be clustered in "number_of_clusters" different
|
61
75
|
# clusters.
|
76
|
+
# @param data_set [Object]
|
77
|
+
# @param number_of_clusters [Object]
|
78
|
+
# @return [Object]
|
62
79
|
def build(data_set, number_of_clusters)
|
63
80
|
@data_set = data_set
|
64
81
|
@number_of_clusters = number_of_clusters
|
65
|
-
raise ArgumentError, '
|
66
|
-
|
67
|
-
@
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
82
|
+
raise ArgumentError, 'Number of clusters larger than data items' if @number_of_clusters > @data_set.data_items.length
|
83
|
+
|
84
|
+
unless @centroid_indices.empty? || @centroid_indices.length == @number_of_clusters
|
85
|
+
raise ArgumentError,
|
86
|
+
'Length of centroid indices array differs from the specified number of clusters'
|
87
|
+
end
|
88
|
+
unless @on_empty == 'eliminate' || @on_empty == 'terminate' || @on_empty == 'random' || @on_empty == 'outlier'
|
89
|
+
raise ArgumentError,
|
90
|
+
'Invalid value for on_empty'
|
91
|
+
end
|
92
|
+
|
93
|
+
seed_base = @random_seed
|
94
|
+
best_sse = nil
|
95
|
+
best_centroids = nil
|
96
|
+
best_clusters = nil
|
97
|
+
best_iterations = nil
|
98
|
+
|
99
|
+
(@restarts || 1).times do |i|
|
100
|
+
@random_seed = seed_base.nil? ? nil : seed_base + i
|
101
|
+
@rng = @random_seed.nil? ? Random.new : Random.new(@random_seed)
|
102
|
+
@iterations = 0
|
103
|
+
@history = [] if @track_history
|
104
|
+
calc_initial_centroids
|
105
|
+
until stop_criteria_met
|
106
|
+
calculate_membership_clusters
|
107
|
+
if @track_history
|
108
|
+
@history << {
|
109
|
+
centroids: @centroids.collect(&:dup),
|
110
|
+
assignments: @assignments.dup
|
111
|
+
}
|
112
|
+
end
|
113
|
+
recompute_centroids
|
114
|
+
end
|
115
|
+
current_sse = sse
|
116
|
+
next unless best_sse.nil? || current_sse < best_sse
|
117
|
+
|
118
|
+
best_sse = current_sse
|
119
|
+
best_centroids = Marshal.load(Marshal.dump(@centroids))
|
120
|
+
best_clusters = Marshal.load(Marshal.dump(@clusters))
|
121
|
+
best_iterations = @iterations
|
73
122
|
end
|
74
|
-
|
75
|
-
|
123
|
+
|
124
|
+
@random_seed = seed_base
|
125
|
+
@rng = @random_seed.nil? ? Random.new : Random.new(@random_seed)
|
126
|
+
@centroids = best_centroids
|
127
|
+
@clusters = best_clusters
|
128
|
+
@iterations = best_iterations
|
129
|
+
self
|
76
130
|
end
|
77
|
-
|
78
|
-
# Classifies the given data item, returning the cluster index it belongs
|
131
|
+
|
132
|
+
# Classifies the given data item, returning the cluster index it belongs
|
79
133
|
# to (0-based).
|
134
|
+
# @param data_item [Object]
|
135
|
+
# @return [Object]
|
80
136
|
def eval(data_item)
|
81
|
-
get_min_index(@centroids.collect
|
82
|
-
|
137
|
+
get_min_index(@centroids.collect do |centroid|
|
138
|
+
distance(data_item, centroid)
|
139
|
+
end)
|
83
140
|
end
|
84
|
-
|
141
|
+
|
142
|
+
# Sum of squared distances of all points to their respective centroids.
|
143
|
+
# It can be used as a measure of cluster compactness (SSE).
|
144
|
+
# @return [Object]
|
145
|
+
def sse
|
146
|
+
sum = 0.0
|
147
|
+
@clusters.each_with_index do |cluster, i|
|
148
|
+
centroid = @centroids[i]
|
149
|
+
cluster.data_items.each do |item|
|
150
|
+
sum += distance(item, centroid)
|
151
|
+
end
|
152
|
+
end
|
153
|
+
sum
|
154
|
+
end
|
155
|
+
|
85
156
|
# This function calculates the distance between 2 different
|
86
|
-
# instances. By default, it returns the euclidean distance to the
|
157
|
+
# instances. By default, it returns the euclidean distance to the
|
87
158
|
# power of 2.
|
88
159
|
# You can provide a more convenient distance implementation:
|
89
|
-
#
|
160
|
+
#
|
90
161
|
# 1- Overwriting this method
|
91
|
-
#
|
162
|
+
#
|
92
163
|
# 2- Providing a closure to the :distance_function parameter
|
164
|
+
# @param a [Object]
|
165
|
+
# @param b [Object]
|
166
|
+
# @return [Object]
|
93
167
|
def distance(a, b)
|
94
168
|
return @distance_function.call(a, b) if @distance_function
|
95
|
-
|
96
|
-
|
97
|
-
|
169
|
+
|
170
|
+
Ai4r::Data::Proximity.squared_euclidean_distance(
|
171
|
+
a.select { |att_a| att_a.is_a? Numeric },
|
172
|
+
b.select { |att_b| att_b.is_a? Numeric }
|
173
|
+
)
|
98
174
|
end
|
99
|
-
|
100
|
-
protected
|
101
|
-
|
175
|
+
|
176
|
+
protected
|
177
|
+
|
178
|
+
# @return [Object]
|
102
179
|
def calc_initial_centroids
|
103
|
-
@centroids
|
180
|
+
@centroids = []
|
181
|
+
@old_centroids = nil
|
104
182
|
if @centroid_indices.empty?
|
105
|
-
|
183
|
+
if @init_method == :kmeans_plus_plus
|
184
|
+
kmeans_plus_plus_init
|
185
|
+
else
|
186
|
+
populate_centroids('random')
|
187
|
+
end
|
106
188
|
else
|
107
189
|
populate_centroids('indices')
|
108
190
|
end
|
109
191
|
end
|
110
|
-
|
192
|
+
|
193
|
+
# @return [Object]
|
111
194
|
def stop_criteria_met
|
112
|
-
@old_centroids == @centroids ||
|
195
|
+
@old_centroids == @centroids ||
|
113
196
|
(@max_iterations && (@max_iterations <= @iterations))
|
114
197
|
end
|
115
|
-
|
198
|
+
|
199
|
+
# @return [Object]
|
116
200
|
def calculate_membership_clusters
|
117
|
-
@clusters = Array.new(@number_of_clusters) do
|
118
|
-
Ai4r::Data::DataSet.new :
|
201
|
+
@clusters = Array.new(@number_of_clusters) do
|
202
|
+
Ai4r::Data::DataSet.new data_labels: @data_set.data_labels
|
119
203
|
end
|
120
|
-
@cluster_indices = Array.new(@number_of_clusters) {[]}
|
121
|
-
|
204
|
+
@cluster_indices = Array.new(@number_of_clusters) { [] }
|
205
|
+
@assignments = Array.new(@data_set.data_items.length)
|
206
|
+
|
122
207
|
@data_set.data_items.each_with_index do |data_item, data_index|
|
123
208
|
c = eval(data_item)
|
124
209
|
@clusters[c] << data_item
|
125
210
|
@cluster_indices[c] << data_index if @on_empty == 'outlier'
|
211
|
+
@assignments[data_index] = c
|
126
212
|
end
|
127
|
-
manage_empty_clusters if
|
213
|
+
manage_empty_clusters if empty_cluster?
|
128
214
|
end
|
129
|
-
|
215
|
+
|
216
|
+
# @return [Object]
|
130
217
|
def recompute_centroids
|
131
218
|
@old_centroids = @centroids
|
132
219
|
@iterations += 1
|
133
|
-
@centroids = @centroid_function.call(@clusters)
|
220
|
+
@centroids = @centroid_function.call(@clusters)
|
134
221
|
end
|
135
222
|
|
136
|
-
|
223
|
+
# @return [Object]
|
224
|
+
def kmeans_plus_plus_init
|
225
|
+
chosen_indices = []
|
226
|
+
first_index = (0...@data_set.data_items.length).to_a.sample(random: @rng)
|
227
|
+
return if first_index.nil?
|
228
|
+
|
229
|
+
@centroids << @data_set.data_items[first_index]
|
230
|
+
chosen_indices << first_index
|
231
|
+
while @centroids.length < @number_of_clusters &&
|
232
|
+
chosen_indices.length < @data_set.data_items.length
|
233
|
+
distances = []
|
234
|
+
total = 0.0
|
235
|
+
@data_set.data_items.each_with_index do |item, index|
|
236
|
+
next if chosen_indices.include?(index)
|
237
|
+
|
238
|
+
min_dist = @centroids.map { |c| distance(item, c) }.min
|
239
|
+
distances << [index, min_dist]
|
240
|
+
total += min_dist
|
241
|
+
end
|
242
|
+
break if distances.empty?
|
243
|
+
|
244
|
+
r = @rng.rand * total
|
245
|
+
cumulative = 0.0
|
246
|
+
chosen = distances.find do |_idx, dist|
|
247
|
+
cumulative += dist
|
248
|
+
cumulative >= r
|
249
|
+
end
|
250
|
+
chosen_indices << chosen[0]
|
251
|
+
@centroids << @data_set.data_items[chosen[0]]
|
252
|
+
end
|
253
|
+
@number_of_clusters = @centroids.length
|
254
|
+
end
|
255
|
+
|
256
|
+
# @param populate_method [Object]
|
257
|
+
# @param number_of_clusters [Object]
|
258
|
+
# @return [Object]
|
259
|
+
def populate_centroids(populate_method, number_of_clusters = @number_of_clusters)
|
137
260
|
tried_indexes = []
|
138
261
|
case populate_method
|
139
262
|
when 'random' # for initial assignment (without the :centroid_indices option) and for reassignment of empty cluster centroids (with :on_empty option 'random')
|
140
|
-
while @centroids.length < number_of_clusters &&
|
141
|
-
|
142
|
-
random_index =
|
143
|
-
if
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
end
|
148
|
-
end
|
263
|
+
while @centroids.length < number_of_clusters &&
|
264
|
+
tried_indexes.length < @data_set.data_items.length
|
265
|
+
random_index = (0...@data_set.data_items.length).to_a.sample(random: @rng)
|
266
|
+
next if tried_indexes.include?(random_index)
|
267
|
+
|
268
|
+
tried_indexes << random_index
|
269
|
+
@centroids << @data_set.data_items[random_index] unless @centroids.include? @data_set.data_items[random_index]
|
149
270
|
end
|
150
271
|
when 'indices' # for initial assignment only (with the :centroid_indices option)
|
151
272
|
@centroid_indices.each do |index|
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
if !@centroids.include? @data_set.data_items[index]
|
156
|
-
@centroids << @data_set.data_items[index]
|
157
|
-
end
|
273
|
+
unless (index.is_a? Integer) && index >= 0 && index < @data_set.data_items.length
|
274
|
+
raise ArgumentError,
|
275
|
+
"Invalid centroid index #{index}"
|
158
276
|
end
|
277
|
+
|
278
|
+
next if tried_indexes.include?(index)
|
279
|
+
|
280
|
+
tried_indexes << index
|
281
|
+
@centroids << @data_set.data_items[index] unless @centroids.include? @data_set.data_items[index]
|
159
282
|
end
|
160
283
|
when 'outlier' # for reassignment of empty cluster centroids only (with :on_empty option 'outlier')
|
161
284
|
sorted_data_indices = sort_data_indices_by_dist_to_centroid
|
162
285
|
i = sorted_data_indices.length - 1 # the last item is the furthest from its centroid
|
163
|
-
while @centroids.length < number_of_clusters &&
|
164
|
-
|
165
|
-
outlier_index = sorted_data_indices[i]
|
166
|
-
|
286
|
+
while @centroids.length < number_of_clusters &&
|
287
|
+
tried_indexes.length < @data_set.data_items.length
|
288
|
+
outlier_index = sorted_data_indices[i]
|
289
|
+
unless tried_indexes.include?(outlier_index)
|
167
290
|
tried_indexes << outlier_index
|
168
|
-
|
169
|
-
@centroids << @data_set.data_items[outlier_index]
|
170
|
-
end
|
291
|
+
@centroids << @data_set.data_items[outlier_index] unless @centroids.include? @data_set.data_items[outlier_index]
|
171
292
|
end
|
172
|
-
i
|
293
|
+
i.positive? ? i -= 1 : break
|
173
294
|
end
|
174
|
-
end
|
295
|
+
end
|
175
296
|
@number_of_clusters = @centroids.length
|
176
|
-
end
|
177
|
-
|
178
|
-
|
179
|
-
|
180
|
-
|
181
|
-
|
182
|
-
|
183
|
-
|
184
|
-
|
185
|
-
|
186
|
-
|
187
|
-
|
188
|
-
|
189
|
-
|
190
|
-
|
191
|
-
|
192
|
-
|
193
|
-
|
194
|
-
|
195
|
-
|
297
|
+
end
|
298
|
+
|
299
|
+
# Sort cluster points by distance to assigned centroid. Utilizes @cluster_indices.
|
300
|
+
# Returns indices, sorted in order from the nearest to furthest.
|
301
|
+
# @return [Object]
|
302
|
+
def sort_data_indices_by_dist_to_centroid
|
303
|
+
h = {}
|
304
|
+
@clusters.each_with_index do |cluster, c|
|
305
|
+
centroid = @centroids[c]
|
306
|
+
cluster.data_items.each_with_index do |data_item, i|
|
307
|
+
dist_to_centroid = distance(data_item, centroid)
|
308
|
+
data_index = @cluster_indices[c][i]
|
309
|
+
h[data_index] = dist_to_centroid
|
310
|
+
end
|
311
|
+
end
|
312
|
+
# sort hash of {index => dist to centroid} by dist to centroid (ascending) and then return an array of only the indices
|
313
|
+
h.sort_by { |_k, v| v }.collect { |a, _b| a }
|
314
|
+
end
|
315
|
+
|
316
|
+
# @return [Object]
|
317
|
+
def empty_cluster?
|
196
318
|
found_empty = false
|
197
319
|
@number_of_clusters.times do |c|
|
198
320
|
found_empty = true if @clusters[c].data_items.empty?
|
199
321
|
end
|
200
322
|
found_empty
|
201
323
|
end
|
202
|
-
|
324
|
+
|
325
|
+
# @return [Object]
|
203
326
|
def manage_empty_clusters
|
204
|
-
|
205
|
-
|
327
|
+
# Do nothing to terminate with error. (The empty cluster will be assigned a nil centroid, and then calculating the distance from this centroid to another point will raise an exception.)
|
328
|
+
return if on_empty == 'terminate'
|
329
|
+
|
206
330
|
initial_number_of_clusters = @number_of_clusters
|
207
331
|
eliminate_empty_clusters
|
208
|
-
return if
|
209
|
-
|
210
|
-
|
332
|
+
return if on_empty == 'eliminate'
|
333
|
+
|
334
|
+
populate_centroids(on_empty, initial_number_of_clusters) # Add initial_number_of_clusters - @number_of_clusters
|
335
|
+
calculate_membership_clusters
|
211
336
|
end
|
212
|
-
|
337
|
+
|
338
|
+
# @return [Object]
|
213
339
|
def eliminate_empty_clusters
|
214
|
-
old_clusters
|
215
|
-
|
340
|
+
old_clusters = @clusters
|
341
|
+
old_centroids = @centroids
|
342
|
+
old_cluster_indices = @cluster_indices
|
343
|
+
old_assignments = @assignments
|
344
|
+
@clusters = []
|
345
|
+
@centroids = []
|
346
|
+
@cluster_indices = []
|
347
|
+
remap = {}
|
348
|
+
new_index = 0
|
216
349
|
@number_of_clusters.times do |i|
|
217
|
-
if
|
218
|
-
|
219
|
-
|
220
|
-
|
221
|
-
|
350
|
+
next if old_clusters[i].data_items.empty?
|
351
|
+
|
352
|
+
remap[i] = new_index
|
353
|
+
@clusters << old_clusters[i]
|
354
|
+
@cluster_indices << old_cluster_indices[i]
|
355
|
+
@centroids << old_centroids[i]
|
356
|
+
new_index += 1
|
222
357
|
end
|
223
358
|
@number_of_clusters = @centroids.length
|
359
|
+
@assignments = old_assignments.map { |c| remap[c] }
|
224
360
|
end
|
225
|
-
|
226
361
|
end
|
227
362
|
end
|
228
363
|
end
|
@@ -1,61 +1,77 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
# Author:: Sergio Fierens (implementation)
|
2
4
|
# License:: MPL 1.1
|
3
5
|
# Project:: ai4r
|
4
|
-
# Url::
|
6
|
+
# Url:: https://github.com/SergioFierens/ai4r
|
5
7
|
#
|
6
|
-
# You can redistribute it and/or modify it under the terms of
|
7
|
-
# the Mozilla Public License version 1.1 as published by the
|
8
|
+
# You can redistribute it and/or modify it under the terms of
|
9
|
+
# the Mozilla Public License version 1.1 as published by the
|
8
10
|
# Mozilla Foundation at http://www.mozilla.org/MPL/MPL-1.1.txt
|
9
11
|
|
10
|
-
|
11
|
-
|
12
|
+
require_relative '../data/data_set'
|
13
|
+
require_relative '../clusterers/single_linkage'
|
14
|
+
require_relative '../clusterers/cluster_tree'
|
12
15
|
|
13
16
|
module Ai4r
|
14
17
|
module Clusterers
|
15
|
-
|
16
|
-
#
|
17
|
-
# median linkage algorithm, aka weighted pair group method centroid
|
18
|
+
# Implementation of an Agglomerative Hierarchical clusterer with
|
19
|
+
# median linkage algorithm, aka weighted pair group method centroid
|
18
20
|
# or WPGMC (Everitt et al., 2001 ; Gower, 1967 ; Jain and Dubes, 1988 ).
|
19
|
-
# Hierarchical clusterer create one cluster per element, and then
|
21
|
+
# Hierarchical clusterer create one cluster per element, and then
|
20
22
|
# progressively merge clusters, until the required number of clusters
|
21
23
|
# is reached.
|
22
|
-
# Similar to centroid linkages, but using fix weight:
|
23
|
-
#
|
24
|
-
# D(cx, (ci U cj)) = (1/2)*D(cx, ci) +
|
25
|
-
# (1/2)*D(cx, cj) -
|
24
|
+
# Similar to centroid linkages, but using fix weight:
|
25
|
+
#
|
26
|
+
# D(cx, (ci U cj)) = (1/2)*D(cx, ci) +
|
27
|
+
# (1/2)*D(cx, cj) -
|
26
28
|
# (1/4)*D(ci, cj)
|
27
29
|
class MedianLinkage < SingleLinkage
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
30
|
+
include ClusterTree
|
31
|
+
|
32
|
+
parameters_info distance_function:
|
33
|
+
'Custom implementation of distance function. ' \
|
34
|
+
'It must be a closure receiving two data items and return the ' \
|
35
|
+
'distance between them. By default, this algorithm uses ' \
|
36
|
+
'euclidean distance of numeric attributes to the power of 2.'
|
37
|
+
|
35
38
|
# Build a new clusterer, using data examples found in data_set.
|
36
39
|
# Items will be clustered in "number_of_clusters" different
|
37
40
|
# clusters.
|
38
|
-
|
41
|
+
# @param data_set [Object]
|
42
|
+
# @param number_of_clusters [Object]
|
43
|
+
# @param *options [Object]
|
44
|
+
# @return [Object]
|
45
|
+
def build(data_set, number_of_clusters = 1, **options)
|
39
46
|
super
|
40
47
|
end
|
41
|
-
|
42
|
-
# This algorithms does not allow classification of new data items
|
48
|
+
|
49
|
+
# This algorithms does not allow classification of new data items
|
43
50
|
# once it has been built. Rebuild the cluster including you data element.
|
44
|
-
|
45
|
-
|
51
|
+
# @param _data_item [Object]
|
52
|
+
# @return [Object]
|
53
|
+
def eval(_data_item)
|
54
|
+
raise NotImplementedError, 'Eval of new data is not supported by this algorithm.'
|
46
55
|
end
|
47
|
-
|
56
|
+
|
57
|
+
# @return [Object]
|
58
|
+
def supports_eval?
|
59
|
+
false
|
60
|
+
end
|
61
|
+
|
48
62
|
protected
|
49
|
-
|
63
|
+
|
50
64
|
# return distance between cluster cx and cluster (ci U cj),
|
51
65
|
# using median linkage
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
66
|
+
# @param cx [Object]
|
67
|
+
# @param ci [Object]
|
68
|
+
# @param cj [Object]
|
69
|
+
# @return [Object]
|
70
|
+
def linkage_distance(cluster_x, cluster_i, cluster_j)
|
71
|
+
((0.5 * read_distance_matrix(cluster_x, cluster_i)) +
|
72
|
+
(0.5 * read_distance_matrix(cluster_x, cluster_j)) -
|
73
|
+
(0.25 * read_distance_matrix(cluster_i, cluster_j)))
|
56
74
|
end
|
57
|
-
|
58
75
|
end
|
59
76
|
end
|
60
77
|
end
|
61
|
-
|