ai4r 1.12 → 2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/README.md +174 -0
- data/examples/classifiers/hyperpipes_data.csv +14 -0
- data/examples/classifiers/hyperpipes_example.rb +22 -0
- data/examples/classifiers/ib1_example.rb +12 -0
- data/examples/classifiers/id3_example.rb +15 -10
- data/examples/classifiers/id3_graphviz_example.rb +17 -0
- data/examples/classifiers/logistic_regression_example.rb +11 -0
- data/examples/classifiers/naive_bayes_attributes_example.rb +13 -0
- data/examples/classifiers/naive_bayes_example.rb +12 -13
- data/examples/classifiers/one_r_example.rb +27 -0
- data/examples/classifiers/parameter_tutorial.rb +29 -0
- data/examples/classifiers/prism_nominal_example.rb +15 -0
- data/examples/classifiers/prism_numeric_example.rb +21 -0
- data/examples/classifiers/simple_linear_regression_example.csv +159 -0
- data/examples/classifiers/simple_linear_regression_example.rb +18 -0
- data/examples/classifiers/zero_and_one_r_example.rb +34 -0
- data/examples/classifiers/zero_one_r_data.csv +8 -0
- data/examples/clusterers/clusterer_example.rb +62 -0
- data/examples/clusterers/dbscan_example.rb +17 -0
- data/examples/clusterers/dendrogram_example.rb +17 -0
- data/examples/clusterers/hierarchical_dendrogram_example.rb +20 -0
- data/examples/clusterers/kmeans_custom_example.rb +26 -0
- data/examples/genetic_algorithm/bitstring_example.rb +41 -0
- data/examples/genetic_algorithm/genetic_algorithm_example.rb +26 -18
- data/examples/genetic_algorithm/kmeans_seed_tuning.rb +45 -0
- data/examples/neural_network/backpropagation_example.rb +49 -48
- data/examples/neural_network/hopfield_example.rb +45 -0
- data/examples/neural_network/patterns_with_base_noise.rb +39 -39
- data/examples/neural_network/patterns_with_noise.rb +41 -39
- data/examples/neural_network/train_epochs_callback.rb +25 -0
- data/examples/neural_network/training_patterns.rb +39 -39
- data/examples/neural_network/transformer_text_classification.rb +78 -0
- data/examples/neural_network/xor_example.rb +23 -22
- data/examples/reinforcement/q_learning_example.rb +10 -0
- data/examples/som/som_data.rb +155 -152
- data/examples/som/som_multi_node_example.rb +12 -13
- data/examples/som/som_single_example.rb +12 -15
- data/examples/transformer/decode_classifier_example.rb +68 -0
- data/examples/transformer/deterministic_example.rb +10 -0
- data/examples/transformer/seq2seq_example.rb +16 -0
- data/lib/ai4r/classifiers/classifier.rb +24 -16
- data/lib/ai4r/classifiers/gradient_boosting.rb +64 -0
- data/lib/ai4r/classifiers/hyperpipes.rb +119 -43
- data/lib/ai4r/classifiers/ib1.rb +122 -32
- data/lib/ai4r/classifiers/id3.rb +527 -144
- data/lib/ai4r/classifiers/logistic_regression.rb +96 -0
- data/lib/ai4r/classifiers/multilayer_perceptron.rb +75 -59
- data/lib/ai4r/classifiers/naive_bayes.rb +112 -48
- data/lib/ai4r/classifiers/one_r.rb +112 -44
- data/lib/ai4r/classifiers/prism.rb +167 -76
- data/lib/ai4r/classifiers/random_forest.rb +72 -0
- data/lib/ai4r/classifiers/simple_linear_regression.rb +143 -0
- data/lib/ai4r/classifiers/support_vector_machine.rb +91 -0
- data/lib/ai4r/classifiers/votes.rb +57 -0
- data/lib/ai4r/classifiers/zero_r.rb +71 -30
- data/lib/ai4r/clusterers/average_linkage.rb +46 -27
- data/lib/ai4r/clusterers/bisecting_k_means.rb +50 -44
- data/lib/ai4r/clusterers/centroid_linkage.rb +52 -36
- data/lib/ai4r/clusterers/cluster_tree.rb +50 -0
- data/lib/ai4r/clusterers/clusterer.rb +28 -24
- data/lib/ai4r/clusterers/complete_linkage.rb +42 -31
- data/lib/ai4r/clusterers/dbscan.rb +134 -0
- data/lib/ai4r/clusterers/diana.rb +75 -49
- data/lib/ai4r/clusterers/k_means.rb +309 -72
- data/lib/ai4r/clusterers/median_linkage.rb +49 -33
- data/lib/ai4r/clusterers/single_linkage.rb +196 -88
- data/lib/ai4r/clusterers/ward_linkage.rb +51 -35
- data/lib/ai4r/clusterers/ward_linkage_hierarchical.rb +63 -0
- data/lib/ai4r/clusterers/weighted_average_linkage.rb +48 -32
- data/lib/ai4r/data/data_set.rb +229 -100
- data/lib/ai4r/data/parameterizable.rb +31 -25
- data/lib/ai4r/data/proximity.rb +72 -50
- data/lib/ai4r/data/statistics.rb +46 -35
- data/lib/ai4r/experiment/classifier_evaluator.rb +84 -32
- data/lib/ai4r/experiment/split.rb +39 -0
- data/lib/ai4r/genetic_algorithm/chromosome_base.rb +43 -0
- data/lib/ai4r/genetic_algorithm/genetic_algorithm.rb +92 -170
- data/lib/ai4r/genetic_algorithm/tsp_chromosome.rb +83 -0
- data/lib/ai4r/hmm/hidden_markov_model.rb +134 -0
- data/lib/ai4r/neural_network/activation_functions.rb +37 -0
- data/lib/ai4r/neural_network/backpropagation.rb +419 -143
- data/lib/ai4r/neural_network/hopfield.rb +175 -58
- data/lib/ai4r/neural_network/transformer.rb +194 -0
- data/lib/ai4r/neural_network/weight_initializations.rb +40 -0
- data/lib/ai4r/reinforcement/policy_iteration.rb +66 -0
- data/lib/ai4r/reinforcement/q_learning.rb +51 -0
- data/lib/ai4r/search/a_star.rb +76 -0
- data/lib/ai4r/search/bfs.rb +50 -0
- data/lib/ai4r/search/dfs.rb +50 -0
- data/lib/ai4r/search/mcts.rb +118 -0
- data/lib/ai4r/search.rb +12 -0
- data/lib/ai4r/som/distance_metrics.rb +29 -0
- data/lib/ai4r/som/layer.rb +28 -17
- data/lib/ai4r/som/node.rb +61 -32
- data/lib/ai4r/som/som.rb +158 -41
- data/lib/ai4r/som/two_phase_layer.rb +21 -25
- data/lib/ai4r/version.rb +3 -0
- data/lib/ai4r.rb +58 -27
- metadata +117 -106
- data/README.rdoc +0 -44
- data/test/classifiers/hyperpipes_test.rb +0 -84
- data/test/classifiers/ib1_test.rb +0 -78
- data/test/classifiers/id3_test.rb +0 -208
- data/test/classifiers/multilayer_perceptron_test.rb +0 -79
- data/test/classifiers/naive_bayes_test.rb +0 -43
- data/test/classifiers/one_r_test.rb +0 -62
- data/test/classifiers/prism_test.rb +0 -85
- data/test/classifiers/zero_r_test.rb +0 -50
- data/test/clusterers/average_linkage_test.rb +0 -51
- data/test/clusterers/bisecting_k_means_test.rb +0 -66
- data/test/clusterers/centroid_linkage_test.rb +0 -53
- data/test/clusterers/complete_linkage_test.rb +0 -57
- data/test/clusterers/diana_test.rb +0 -69
- data/test/clusterers/k_means_test.rb +0 -100
- data/test/clusterers/median_linkage_test.rb +0 -53
- data/test/clusterers/single_linkage_test.rb +0 -122
- data/test/clusterers/ward_linkage_test.rb +0 -53
- data/test/clusterers/weighted_average_linkage_test.rb +0 -53
- data/test/data/data_set_test.rb +0 -96
- data/test/data/proximity_test.rb +0 -81
- data/test/data/statistics_test.rb +0 -65
- data/test/experiment/classifier_evaluator_test.rb +0 -76
- data/test/genetic_algorithm/chromosome_test.rb +0 -57
- data/test/genetic_algorithm/genetic_algorithm_test.rb +0 -81
- data/test/neural_network/backpropagation_test.rb +0 -82
- data/test/neural_network/hopfield_test.rb +0 -72
- data/test/som/som_test.rb +0 -97
@@ -1,74 +1,85 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
# Author:: Sergio Fierens (implementation)
|
2
4
|
# License:: MPL 1.1
|
3
5
|
# Project:: ai4r
|
4
|
-
# Url::
|
6
|
+
# Url:: https://github.com/SergioFierens/ai4r
|
5
7
|
#
|
6
|
-
# You can redistribute it and/or modify it under the terms of
|
7
|
-
# the Mozilla Public License version 1.1 as published by the
|
8
|
+
# You can redistribute it and/or modify it under the terms of
|
9
|
+
# the Mozilla Public License version 1.1 as published by the
|
8
10
|
# Mozilla Foundation at http://www.mozilla.org/MPL/MPL-1.1.txt
|
9
11
|
|
10
|
-
|
11
|
-
|
12
|
-
|
12
|
+
require_relative '../data/data_set'
|
13
|
+
require_relative '../data/proximity'
|
14
|
+
require_relative '../clusterers/clusterer'
|
13
15
|
|
14
16
|
module Ai4r
|
15
17
|
module Clusterers
|
16
|
-
|
17
|
-
# DIANA (Divisive ANAlysis) (Kaufman and Rousseeuw, 1990;
|
18
|
+
# DIANA (Divisive ANAlysis) (Kaufman and Rousseeuw, 1990;
|
18
19
|
# Macnaughton - Smith et al. 1964) is a Divisive Hierarchical
|
19
20
|
# Clusterer. It begins with only one cluster with all data items,
|
20
21
|
# and divides the clusters until the desired clusters number is reached.
|
21
22
|
class Diana < Clusterer
|
22
|
-
|
23
23
|
attr_reader :data_set, :number_of_clusters, :clusters
|
24
|
-
|
25
|
-
parameters_info :
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
24
|
+
|
25
|
+
parameters_info distance_function:
|
26
|
+
'Custom implementation of distance function. ' \
|
27
|
+
'It must be a closure receiving two data items and return the ' \
|
28
|
+
'distance between them. By default, this algorithm uses ' \
|
29
|
+
'euclidean distance of numeric attributes to the power of 2.'
|
30
|
+
|
31
|
+
# @return [Object]
|
31
32
|
def initialize
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
33
|
+
super()
|
34
|
+
@distance_function = lambda do |a, b|
|
35
|
+
Ai4r::Data::Proximity.squared_euclidean_distance(
|
36
|
+
a.select { |att_a| att_a.is_a? Numeric },
|
37
|
+
b.select { |att_b| att_b.is_a? Numeric }
|
38
|
+
)
|
39
|
+
end
|
37
40
|
end
|
38
|
-
|
41
|
+
|
39
42
|
# Build a new clusterer, using divisive analysis (DIANA algorithm)
|
43
|
+
# @param data_set [Object]
|
44
|
+
# @param number_of_clusters [Object]
|
45
|
+
# @return [Object]
|
40
46
|
def build(data_set, number_of_clusters)
|
41
47
|
@data_set = data_set
|
42
48
|
@number_of_clusters = number_of_clusters
|
43
|
-
@clusters = [@data_set
|
44
|
-
|
45
|
-
while
|
49
|
+
@clusters = [@data_set]
|
50
|
+
|
51
|
+
while @clusters.length < @number_of_clusters
|
46
52
|
cluster_index_to_split = max_diameter_cluster(@clusters)
|
47
53
|
cluster_to_split = @clusters[cluster_index_to_split]
|
48
54
|
splinter_cluster = init_splinter_cluster(cluster_to_split)
|
49
|
-
|
55
|
+
loop do
|
50
56
|
dist_diff, index = max_distance_difference(cluster_to_split, splinter_cluster)
|
51
|
-
break if dist_diff
|
57
|
+
break if dist_diff.negative?
|
58
|
+
|
52
59
|
splinter_cluster << cluster_to_split.data_items[index]
|
53
60
|
cluster_to_split.data_items.delete_at(index)
|
54
61
|
end
|
55
62
|
@clusters << splinter_cluster
|
56
63
|
end
|
57
|
-
|
58
|
-
|
64
|
+
|
65
|
+
self
|
59
66
|
end
|
60
|
-
|
61
|
-
# Classifies the given data item, returning the cluster index it belongs
|
67
|
+
|
68
|
+
# Classifies the given data item, returning the cluster index it belongs
|
62
69
|
# to (0-based).
|
70
|
+
# @param data_item [Object]
|
71
|
+
# @return [Object]
|
63
72
|
def eval(data_item)
|
64
73
|
get_min_index(@clusters.collect do |cluster|
|
65
74
|
distance_sum(data_item, cluster) / cluster.data_items.length
|
66
|
-
|
75
|
+
end)
|
67
76
|
end
|
68
|
-
|
77
|
+
|
69
78
|
protected
|
70
|
-
|
79
|
+
|
71
80
|
# return the cluster with max diameter
|
81
|
+
# @param clusters [Object]
|
82
|
+
# @return [Object]
|
72
83
|
def max_diameter_cluster(clusters)
|
73
84
|
max_index = 0
|
74
85
|
max_diameter = 0
|
@@ -79,10 +90,12 @@ module Ai4r
|
|
79
90
|
max_diameter = diameter
|
80
91
|
end
|
81
92
|
end
|
82
|
-
|
93
|
+
max_index
|
83
94
|
end
|
84
|
-
|
95
|
+
|
85
96
|
# Max distance between 2 items in a cluster
|
97
|
+
# @param cluster [Object]
|
98
|
+
# @return [Object]
|
86
99
|
def cluster_diameter(cluster)
|
87
100
|
diameter = 0
|
88
101
|
cluster.data_items.each_with_index do |item_a, item_a_pos|
|
@@ -91,49 +104,62 @@ module Ai4r
|
|
91
104
|
diameter = d if d > diameter
|
92
105
|
end
|
93
106
|
end
|
94
|
-
|
107
|
+
diameter
|
95
108
|
end
|
96
|
-
|
109
|
+
|
97
110
|
# Create a cluster with the item with mx distance
|
98
111
|
# to the rest of the cluster's items.
|
99
112
|
# That item is removed from the initial cluster.
|
113
|
+
# @param cluster_to_split [Object]
|
114
|
+
# @return [Object]
|
100
115
|
def init_splinter_cluster(cluster_to_split)
|
101
116
|
max = 0.0
|
102
117
|
max_index = 0
|
103
118
|
cluster_to_split.data_items.each_with_index do |item, index|
|
104
119
|
sum = distance_sum(item, cluster_to_split)
|
105
|
-
|
120
|
+
if sum > max
|
121
|
+
max = sum
|
122
|
+
max_index = index
|
123
|
+
end
|
106
124
|
end
|
107
125
|
splinter_cluster = cluster_to_split[max_index]
|
108
126
|
cluster_to_split.data_items.delete_at(max_index)
|
109
|
-
|
127
|
+
splinter_cluster
|
110
128
|
end
|
111
|
-
|
112
|
-
# Return the max average distance between any item of
|
129
|
+
|
130
|
+
# Return the max average distance between any item of
|
113
131
|
# cluster_to_split and the rest of items in that cluster,
|
114
132
|
# minus the average distance with the items of splinter_cluster,
|
115
133
|
# and the index of the item.
|
116
134
|
# A positive value means that the items is closer to the
|
117
135
|
# splinter group than to its current cluster.
|
136
|
+
# @param cluster_to_split [Object]
|
137
|
+
# @param splinter_cluster [Object]
|
138
|
+
# @return [Object]
|
118
139
|
def max_distance_difference(cluster_to_split, splinter_cluster)
|
119
|
-
max_diff = -
|
140
|
+
max_diff = -Float::INFINITY
|
120
141
|
max_diff_index = 0
|
121
142
|
cluster_to_split.data_items.each_with_index do |item, index|
|
122
|
-
dist_a = distance_sum(item, cluster_to_split) / (cluster_to_split.data_items.length-1)
|
123
|
-
dist_b = distance_sum(item, splinter_cluster) /
|
143
|
+
dist_a = distance_sum(item, cluster_to_split) / (cluster_to_split.data_items.length - 1)
|
144
|
+
dist_b = distance_sum(item, splinter_cluster) / splinter_cluster.data_items.length
|
124
145
|
dist_diff = dist_a - dist_b
|
125
|
-
|
146
|
+
if dist_diff > max_diff
|
147
|
+
max_diff = dist_diff
|
148
|
+
max_diff_index = index
|
149
|
+
end
|
126
150
|
end
|
127
|
-
|
151
|
+
[max_diff, max_diff_index]
|
128
152
|
end
|
129
|
-
|
153
|
+
|
130
154
|
# Sum up the distance between an item and all the items in a cluster
|
155
|
+
# @param item_a [Object]
|
156
|
+
# @param cluster [Object]
|
157
|
+
# @return [Object]
|
131
158
|
def distance_sum(item_a, cluster)
|
132
159
|
cluster.data_items.inject(0.0) do |sum, item_b|
|
133
160
|
sum + @distance_function.call(item_a, item_b)
|
134
161
|
end
|
135
162
|
end
|
136
|
-
|
137
163
|
end
|
138
164
|
end
|
139
165
|
end
|
@@ -1,126 +1,363 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
# Author:: Sergio Fierens (implementation)
|
2
4
|
# License:: MPL 1.1
|
3
5
|
# Project:: ai4r
|
4
|
-
# Url::
|
6
|
+
# Url:: https://github.com/SergioFierens/ai4r
|
5
7
|
#
|
6
|
-
# You can redistribute it and/or modify it under the terms of
|
7
|
-
# the Mozilla Public License version 1.1 as published by the
|
8
|
+
# You can redistribute it and/or modify it under the terms of
|
9
|
+
# the Mozilla Public License version 1.1 as published by the
|
8
10
|
# Mozilla Foundation at http://www.mozilla.org/MPL/MPL-1.1.txt
|
9
11
|
|
10
|
-
|
11
|
-
|
12
|
+
require_relative '../data/data_set'
|
13
|
+
require_relative '../data/proximity'
|
14
|
+
require_relative '../clusterers/clusterer'
|
12
15
|
|
13
16
|
module Ai4r
|
14
17
|
module Clusterers
|
15
|
-
|
16
|
-
# The k-means algorithm is an algorithm to cluster n objects
|
18
|
+
# The k-means algorithm is an algorithm to cluster n objects
|
17
19
|
# based on attributes into k partitions, with k < n.
|
18
|
-
#
|
20
|
+
#
|
19
21
|
# More about K Means algorithm:
|
20
|
-
# http://en.wikipedia.org/wiki/K-means_algorithm
|
22
|
+
# http://en.wikipedia.org/wiki/K-means_algorithm
|
21
23
|
class KMeans < Clusterer
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
24
|
+
attr_reader :data_set, :number_of_clusters, :clusters, :centroids, :iterations, :history
|
25
|
+
|
26
|
+
parameters_info(
|
27
|
+
max_iterations: 'Maximum number of iterations to build the clusterer. By default it is uncapped.',
|
28
|
+
distance_function: 'Custom implementation of distance function. ' \
|
29
|
+
'It must be a closure receiving two data items and return the ' \
|
30
|
+
'distance between them. By default, this algorithm uses ' \
|
31
|
+
'euclidean distance of numeric attributes to the power of 2.',
|
32
|
+
centroid_function: 'Custom implementation to calculate the ' \
|
33
|
+
'centroid of a cluster. It must be a closure receiving an array of ' \
|
34
|
+
'data sets, and return an array of data items, representing the ' \
|
35
|
+
'centroids of for each data set. ' \
|
36
|
+
'By default, this algorithm returns a data items using the mode ' \
|
37
|
+
'or mean of each attribute on each data set.',
|
38
|
+
centroid_indices: 'Indices of data items (indexed from 0) to be ' \
|
39
|
+
'the initial centroids. Otherwise, the initial centroids will be ' \
|
40
|
+
'assigned randomly from the data set.',
|
41
|
+
on_empty: 'Action to take if a cluster becomes empty, with values ' \
|
42
|
+
"'eliminate' (the default action, eliminate the empty cluster), " \
|
43
|
+
"'terminate' (terminate with error), 'random' (relocate the " \
|
44
|
+
"empty cluster to a random point), 'outlier' (relocate the " \
|
45
|
+
'empty cluster to the point furthest from its centroid).',
|
46
|
+
random_seed: "Seed value used to initialize Ruby's random number " \
|
47
|
+
'generator when selecting random centroids.',
|
48
|
+
init_method: 'Strategy to initialize centroids. Available values: ' \
|
49
|
+
':random (default) and :kmeans_plus_plus.',
|
50
|
+
restarts: 'Number of random initializations to perform. ' \
|
51
|
+
'The best run (lowest SSE) will be kept.',
|
52
|
+
track_history: 'Keep centroids and assignments for each iteration ' \
|
53
|
+
'when building the clusterer.'
|
54
|
+
)
|
55
|
+
|
56
|
+
# @return [Object]
|
39
57
|
def initialize
|
58
|
+
super()
|
40
59
|
@distance_function = nil
|
41
60
|
@max_iterations = nil
|
42
|
-
@
|
43
|
-
|
44
|
-
data_sets.collect{ |data_set| data_set.get_mean_or_mode}
|
61
|
+
@centroid_function = lambda do |data_sets|
|
62
|
+
data_sets.collect(&:get_mean_or_mode)
|
45
63
|
end
|
64
|
+
@centroid_indices = []
|
65
|
+
@on_empty = 'eliminate' # default if none specified
|
66
|
+
@random_seed = nil
|
67
|
+
@rng = nil
|
68
|
+
@init_method = :random
|
69
|
+
@restarts = 1
|
70
|
+
@track_history = false
|
46
71
|
end
|
47
|
-
|
48
|
-
|
72
|
+
|
49
73
|
# Build a new clusterer, using data examples found in data_set.
|
50
74
|
# Items will be clustered in "number_of_clusters" different
|
51
75
|
# clusters.
|
76
|
+
# @param data_set [Object]
|
77
|
+
# @param number_of_clusters [Object]
|
78
|
+
# @return [Object]
|
52
79
|
def build(data_set, number_of_clusters)
|
53
80
|
@data_set = data_set
|
54
81
|
@number_of_clusters = number_of_clusters
|
55
|
-
@
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
82
|
+
raise ArgumentError, 'Number of clusters larger than data items' if @number_of_clusters > @data_set.data_items.length
|
83
|
+
|
84
|
+
unless @centroid_indices.empty? || @centroid_indices.length == @number_of_clusters
|
85
|
+
raise ArgumentError,
|
86
|
+
'Length of centroid indices array differs from the specified number of clusters'
|
87
|
+
end
|
88
|
+
unless @on_empty == 'eliminate' || @on_empty == 'terminate' || @on_empty == 'random' || @on_empty == 'outlier'
|
89
|
+
raise ArgumentError,
|
90
|
+
'Invalid value for on_empty'
|
91
|
+
end
|
92
|
+
|
93
|
+
seed_base = @random_seed
|
94
|
+
best_sse = nil
|
95
|
+
best_centroids = nil
|
96
|
+
best_clusters = nil
|
97
|
+
best_iterations = nil
|
98
|
+
|
99
|
+
(@restarts || 1).times do |i|
|
100
|
+
@random_seed = seed_base.nil? ? nil : seed_base + i
|
101
|
+
@rng = @random_seed.nil? ? Random.new : Random.new(@random_seed)
|
102
|
+
@iterations = 0
|
103
|
+
@history = [] if @track_history
|
104
|
+
calc_initial_centroids
|
105
|
+
until stop_criteria_met
|
106
|
+
calculate_membership_clusters
|
107
|
+
if @track_history
|
108
|
+
@history << {
|
109
|
+
centroids: @centroids.collect(&:dup),
|
110
|
+
assignments: @assignments.dup
|
111
|
+
}
|
112
|
+
end
|
113
|
+
recompute_centroids
|
114
|
+
end
|
115
|
+
current_sse = sse
|
116
|
+
next unless best_sse.nil? || current_sse < best_sse
|
117
|
+
|
118
|
+
best_sse = current_sse
|
119
|
+
best_centroids = Marshal.load(Marshal.dump(@centroids))
|
120
|
+
best_clusters = Marshal.load(Marshal.dump(@clusters))
|
121
|
+
best_iterations = @iterations
|
61
122
|
end
|
62
|
-
|
63
|
-
|
123
|
+
|
124
|
+
@random_seed = seed_base
|
125
|
+
@rng = @random_seed.nil? ? Random.new : Random.new(@random_seed)
|
126
|
+
@centroids = best_centroids
|
127
|
+
@clusters = best_clusters
|
128
|
+
@iterations = best_iterations
|
129
|
+
self
|
64
130
|
end
|
65
|
-
|
66
|
-
# Classifies the given data item, returning the cluster index it belongs
|
131
|
+
|
132
|
+
# Classifies the given data item, returning the cluster index it belongs
|
67
133
|
# to (0-based).
|
134
|
+
# @param data_item [Object]
|
135
|
+
# @return [Object]
|
68
136
|
def eval(data_item)
|
69
|
-
get_min_index(@centroids.collect
|
70
|
-
|
137
|
+
get_min_index(@centroids.collect do |centroid|
|
138
|
+
distance(data_item, centroid)
|
139
|
+
end)
|
140
|
+
end
|
141
|
+
|
142
|
+
# Sum of squared distances of all points to their respective centroids.
|
143
|
+
# It can be used as a measure of cluster compactness (SSE).
|
144
|
+
# @return [Object]
|
145
|
+
def sse
|
146
|
+
sum = 0.0
|
147
|
+
@clusters.each_with_index do |cluster, i|
|
148
|
+
centroid = @centroids[i]
|
149
|
+
cluster.data_items.each do |item|
|
150
|
+
sum += distance(item, centroid)
|
151
|
+
end
|
152
|
+
end
|
153
|
+
sum
|
71
154
|
end
|
72
|
-
|
155
|
+
|
73
156
|
# This function calculates the distance between 2 different
|
74
|
-
# instances. By default, it returns the euclidean distance to the
|
157
|
+
# instances. By default, it returns the euclidean distance to the
|
75
158
|
# power of 2.
|
76
|
-
# You can provide a more
|
77
|
-
#
|
159
|
+
# You can provide a more convenient distance implementation:
|
160
|
+
#
|
78
161
|
# 1- Overwriting this method
|
79
|
-
#
|
162
|
+
#
|
80
163
|
# 2- Providing a closure to the :distance_function parameter
|
164
|
+
# @param a [Object]
|
165
|
+
# @param b [Object]
|
166
|
+
# @return [Object]
|
81
167
|
def distance(a, b)
|
82
168
|
return @distance_function.call(a, b) if @distance_function
|
83
|
-
|
169
|
+
|
170
|
+
Ai4r::Data::Proximity.squared_euclidean_distance(
|
171
|
+
a.select { |att_a| att_a.is_a? Numeric },
|
172
|
+
b.select { |att_b| att_b.is_a? Numeric }
|
173
|
+
)
|
84
174
|
end
|
85
|
-
|
86
|
-
protected
|
87
|
-
|
175
|
+
|
176
|
+
protected
|
177
|
+
|
178
|
+
# @return [Object]
|
88
179
|
def calc_initial_centroids
|
89
180
|
@centroids = []
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
if !@centroids.include? @data_set.data_items[random_index]
|
97
|
-
@centroids << @data_set.data_items[random_index]
|
98
|
-
end
|
181
|
+
@old_centroids = nil
|
182
|
+
if @centroid_indices.empty?
|
183
|
+
if @init_method == :kmeans_plus_plus
|
184
|
+
kmeans_plus_plus_init
|
185
|
+
else
|
186
|
+
populate_centroids('random')
|
99
187
|
end
|
188
|
+
else
|
189
|
+
populate_centroids('indices')
|
100
190
|
end
|
101
|
-
@number_of_clusters = @centroids.length
|
102
191
|
end
|
103
|
-
|
192
|
+
|
193
|
+
# @return [Object]
|
104
194
|
def stop_criteria_met
|
105
|
-
@old_centroids == @centroids ||
|
195
|
+
@old_centroids == @centroids ||
|
106
196
|
(@max_iterations && (@max_iterations <= @iterations))
|
107
197
|
end
|
108
|
-
|
198
|
+
|
199
|
+
# @return [Object]
|
109
200
|
def calculate_membership_clusters
|
110
|
-
@clusters = Array.new(@number_of_clusters) do
|
111
|
-
Ai4r::Data::DataSet.new :
|
201
|
+
@clusters = Array.new(@number_of_clusters) do
|
202
|
+
Ai4r::Data::DataSet.new data_labels: @data_set.data_labels
|
112
203
|
end
|
113
|
-
@
|
114
|
-
|
204
|
+
@cluster_indices = Array.new(@number_of_clusters) { [] }
|
205
|
+
@assignments = Array.new(@data_set.data_items.length)
|
206
|
+
|
207
|
+
@data_set.data_items.each_with_index do |data_item, data_index|
|
208
|
+
c = eval(data_item)
|
209
|
+
@clusters[c] << data_item
|
210
|
+
@cluster_indices[c] << data_index if @on_empty == 'outlier'
|
211
|
+
@assignments[data_index] = c
|
115
212
|
end
|
213
|
+
manage_empty_clusters if empty_cluster?
|
116
214
|
end
|
117
|
-
|
215
|
+
|
216
|
+
# @return [Object]
|
118
217
|
def recompute_centroids
|
119
218
|
@old_centroids = @centroids
|
120
219
|
@iterations += 1
|
121
|
-
@centroids = @centroid_function.call(@clusters)
|
220
|
+
@centroids = @centroid_function.call(@clusters)
|
221
|
+
end
|
222
|
+
|
223
|
+
# @return [Object]
|
224
|
+
def kmeans_plus_plus_init
|
225
|
+
chosen_indices = []
|
226
|
+
first_index = (0...@data_set.data_items.length).to_a.sample(random: @rng)
|
227
|
+
return if first_index.nil?
|
228
|
+
|
229
|
+
@centroids << @data_set.data_items[first_index]
|
230
|
+
chosen_indices << first_index
|
231
|
+
while @centroids.length < @number_of_clusters &&
|
232
|
+
chosen_indices.length < @data_set.data_items.length
|
233
|
+
distances = []
|
234
|
+
total = 0.0
|
235
|
+
@data_set.data_items.each_with_index do |item, index|
|
236
|
+
next if chosen_indices.include?(index)
|
237
|
+
|
238
|
+
min_dist = @centroids.map { |c| distance(item, c) }.min
|
239
|
+
distances << [index, min_dist]
|
240
|
+
total += min_dist
|
241
|
+
end
|
242
|
+
break if distances.empty?
|
243
|
+
|
244
|
+
r = @rng.rand * total
|
245
|
+
cumulative = 0.0
|
246
|
+
chosen = distances.find do |_idx, dist|
|
247
|
+
cumulative += dist
|
248
|
+
cumulative >= r
|
249
|
+
end
|
250
|
+
chosen_indices << chosen[0]
|
251
|
+
@centroids << @data_set.data_items[chosen[0]]
|
252
|
+
end
|
253
|
+
@number_of_clusters = @centroids.length
|
254
|
+
end
|
255
|
+
|
256
|
+
# @param populate_method [Object]
|
257
|
+
# @param number_of_clusters [Object]
|
258
|
+
# @return [Object]
|
259
|
+
def populate_centroids(populate_method, number_of_clusters = @number_of_clusters)
|
260
|
+
tried_indexes = []
|
261
|
+
case populate_method
|
262
|
+
when 'random' # for initial assignment (without the :centroid_indices option) and for reassignment of empty cluster centroids (with :on_empty option 'random')
|
263
|
+
while @centroids.length < number_of_clusters &&
|
264
|
+
tried_indexes.length < @data_set.data_items.length
|
265
|
+
random_index = (0...@data_set.data_items.length).to_a.sample(random: @rng)
|
266
|
+
next if tried_indexes.include?(random_index)
|
267
|
+
|
268
|
+
tried_indexes << random_index
|
269
|
+
@centroids << @data_set.data_items[random_index] unless @centroids.include? @data_set.data_items[random_index]
|
270
|
+
end
|
271
|
+
when 'indices' # for initial assignment only (with the :centroid_indices option)
|
272
|
+
@centroid_indices.each do |index|
|
273
|
+
unless (index.is_a? Integer) && index >= 0 && index < @data_set.data_items.length
|
274
|
+
raise ArgumentError,
|
275
|
+
"Invalid centroid index #{index}"
|
276
|
+
end
|
277
|
+
|
278
|
+
next if tried_indexes.include?(index)
|
279
|
+
|
280
|
+
tried_indexes << index
|
281
|
+
@centroids << @data_set.data_items[index] unless @centroids.include? @data_set.data_items[index]
|
282
|
+
end
|
283
|
+
when 'outlier' # for reassignment of empty cluster centroids only (with :on_empty option 'outlier')
|
284
|
+
sorted_data_indices = sort_data_indices_by_dist_to_centroid
|
285
|
+
i = sorted_data_indices.length - 1 # the last item is the furthest from its centroid
|
286
|
+
while @centroids.length < number_of_clusters &&
|
287
|
+
tried_indexes.length < @data_set.data_items.length
|
288
|
+
outlier_index = sorted_data_indices[i]
|
289
|
+
unless tried_indexes.include?(outlier_index)
|
290
|
+
tried_indexes << outlier_index
|
291
|
+
@centroids << @data_set.data_items[outlier_index] unless @centroids.include? @data_set.data_items[outlier_index]
|
292
|
+
end
|
293
|
+
i.positive? ? i -= 1 : break
|
294
|
+
end
|
295
|
+
end
|
296
|
+
@number_of_clusters = @centroids.length
|
297
|
+
end
|
298
|
+
|
299
|
+
# Sort cluster points by distance to assigned centroid. Utilizes @cluster_indices.
|
300
|
+
# Returns indices, sorted in order from the nearest to furthest.
|
301
|
+
# @return [Object]
|
302
|
+
def sort_data_indices_by_dist_to_centroid
|
303
|
+
h = {}
|
304
|
+
@clusters.each_with_index do |cluster, c|
|
305
|
+
centroid = @centroids[c]
|
306
|
+
cluster.data_items.each_with_index do |data_item, i|
|
307
|
+
dist_to_centroid = distance(data_item, centroid)
|
308
|
+
data_index = @cluster_indices[c][i]
|
309
|
+
h[data_index] = dist_to_centroid
|
310
|
+
end
|
311
|
+
end
|
312
|
+
# sort hash of {index => dist to centroid} by dist to centroid (ascending) and then return an array of only the indices
|
313
|
+
h.sort_by { |_k, v| v }.collect { |a, _b| a }
|
314
|
+
end
|
315
|
+
|
316
|
+
# @return [Object]
|
317
|
+
def empty_cluster?
|
318
|
+
found_empty = false
|
319
|
+
@number_of_clusters.times do |c|
|
320
|
+
found_empty = true if @clusters[c].data_items.empty?
|
321
|
+
end
|
322
|
+
found_empty
|
323
|
+
end
|
324
|
+
|
325
|
+
# @return [Object]
|
326
|
+
def manage_empty_clusters
|
327
|
+
# Do nothing to terminate with error. (The empty cluster will be assigned a nil centroid, and then calculating the distance from this centroid to another point will raise an exception.)
|
328
|
+
return if on_empty == 'terminate'
|
329
|
+
|
330
|
+
initial_number_of_clusters = @number_of_clusters
|
331
|
+
eliminate_empty_clusters
|
332
|
+
return if on_empty == 'eliminate'
|
333
|
+
|
334
|
+
populate_centroids(on_empty, initial_number_of_clusters) # Add initial_number_of_clusters - @number_of_clusters
|
335
|
+
calculate_membership_clusters
|
336
|
+
end
|
337
|
+
|
338
|
+
# @return [Object]
|
339
|
+
def eliminate_empty_clusters
|
340
|
+
old_clusters = @clusters
|
341
|
+
old_centroids = @centroids
|
342
|
+
old_cluster_indices = @cluster_indices
|
343
|
+
old_assignments = @assignments
|
344
|
+
@clusters = []
|
345
|
+
@centroids = []
|
346
|
+
@cluster_indices = []
|
347
|
+
remap = {}
|
348
|
+
new_index = 0
|
349
|
+
@number_of_clusters.times do |i|
|
350
|
+
next if old_clusters[i].data_items.empty?
|
351
|
+
|
352
|
+
remap[i] = new_index
|
353
|
+
@clusters << old_clusters[i]
|
354
|
+
@cluster_indices << old_cluster_indices[i]
|
355
|
+
@centroids << old_centroids[i]
|
356
|
+
new_index += 1
|
357
|
+
end
|
358
|
+
@number_of_clusters = @centroids.length
|
359
|
+
@assignments = old_assignments.map { |c| remap[c] }
|
122
360
|
end
|
123
|
-
|
124
361
|
end
|
125
362
|
end
|
126
363
|
end
|