ai4r 1.13 → 2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/README.md +174 -0
- data/examples/classifiers/hyperpipes_data.csv +14 -0
- data/examples/classifiers/hyperpipes_example.rb +22 -0
- data/examples/classifiers/ib1_example.rb +12 -0
- data/examples/classifiers/id3_example.rb +15 -10
- data/examples/classifiers/id3_graphviz_example.rb +17 -0
- data/examples/classifiers/logistic_regression_example.rb +11 -0
- data/examples/classifiers/naive_bayes_attributes_example.rb +13 -0
- data/examples/classifiers/naive_bayes_example.rb +12 -13
- data/examples/classifiers/one_r_example.rb +27 -0
- data/examples/classifiers/parameter_tutorial.rb +29 -0
- data/examples/classifiers/prism_nominal_example.rb +15 -0
- data/examples/classifiers/prism_numeric_example.rb +21 -0
- data/examples/classifiers/simple_linear_regression_example.rb +14 -11
- data/examples/classifiers/zero_and_one_r_example.rb +34 -0
- data/examples/classifiers/zero_one_r_data.csv +8 -0
- data/examples/clusterers/clusterer_example.rb +40 -34
- data/examples/clusterers/dbscan_example.rb +17 -0
- data/examples/clusterers/dendrogram_example.rb +17 -0
- data/examples/clusterers/hierarchical_dendrogram_example.rb +20 -0
- data/examples/clusterers/kmeans_custom_example.rb +26 -0
- data/examples/genetic_algorithm/bitstring_example.rb +41 -0
- data/examples/genetic_algorithm/genetic_algorithm_example.rb +26 -18
- data/examples/genetic_algorithm/kmeans_seed_tuning.rb +45 -0
- data/examples/neural_network/backpropagation_example.rb +48 -48
- data/examples/neural_network/hopfield_example.rb +45 -0
- data/examples/neural_network/patterns_with_base_noise.rb +39 -39
- data/examples/neural_network/patterns_with_noise.rb +41 -39
- data/examples/neural_network/train_epochs_callback.rb +25 -0
- data/examples/neural_network/training_patterns.rb +39 -39
- data/examples/neural_network/transformer_text_classification.rb +78 -0
- data/examples/neural_network/xor_example.rb +23 -22
- data/examples/reinforcement/q_learning_example.rb +10 -0
- data/examples/som/som_data.rb +155 -152
- data/examples/som/som_multi_node_example.rb +12 -13
- data/examples/som/som_single_example.rb +12 -15
- data/examples/transformer/decode_classifier_example.rb +68 -0
- data/examples/transformer/deterministic_example.rb +10 -0
- data/examples/transformer/seq2seq_example.rb +16 -0
- data/lib/ai4r/classifiers/classifier.rb +24 -16
- data/lib/ai4r/classifiers/gradient_boosting.rb +64 -0
- data/lib/ai4r/classifiers/hyperpipes.rb +119 -43
- data/lib/ai4r/classifiers/ib1.rb +122 -32
- data/lib/ai4r/classifiers/id3.rb +524 -145
- data/lib/ai4r/classifiers/logistic_regression.rb +96 -0
- data/lib/ai4r/classifiers/multilayer_perceptron.rb +75 -59
- data/lib/ai4r/classifiers/naive_bayes.rb +95 -34
- data/lib/ai4r/classifiers/one_r.rb +112 -44
- data/lib/ai4r/classifiers/prism.rb +167 -76
- data/lib/ai4r/classifiers/random_forest.rb +72 -0
- data/lib/ai4r/classifiers/simple_linear_regression.rb +83 -58
- data/lib/ai4r/classifiers/support_vector_machine.rb +91 -0
- data/lib/ai4r/classifiers/votes.rb +57 -0
- data/lib/ai4r/classifiers/zero_r.rb +71 -30
- data/lib/ai4r/clusterers/average_linkage.rb +46 -27
- data/lib/ai4r/clusterers/bisecting_k_means.rb +50 -44
- data/lib/ai4r/clusterers/centroid_linkage.rb +52 -36
- data/lib/ai4r/clusterers/cluster_tree.rb +50 -0
- data/lib/ai4r/clusterers/clusterer.rb +29 -14
- data/lib/ai4r/clusterers/complete_linkage.rb +42 -31
- data/lib/ai4r/clusterers/dbscan.rb +134 -0
- data/lib/ai4r/clusterers/diana.rb +75 -49
- data/lib/ai4r/clusterers/k_means.rb +270 -135
- data/lib/ai4r/clusterers/median_linkage.rb +49 -33
- data/lib/ai4r/clusterers/single_linkage.rb +196 -88
- data/lib/ai4r/clusterers/ward_linkage.rb +51 -35
- data/lib/ai4r/clusterers/ward_linkage_hierarchical.rb +25 -10
- data/lib/ai4r/clusterers/weighted_average_linkage.rb +48 -32
- data/lib/ai4r/data/data_set.rb +223 -103
- data/lib/ai4r/data/parameterizable.rb +31 -25
- data/lib/ai4r/data/proximity.rb +62 -62
- data/lib/ai4r/data/statistics.rb +46 -35
- data/lib/ai4r/experiment/classifier_evaluator.rb +84 -32
- data/lib/ai4r/experiment/split.rb +39 -0
- data/lib/ai4r/genetic_algorithm/chromosome_base.rb +43 -0
- data/lib/ai4r/genetic_algorithm/genetic_algorithm.rb +92 -170
- data/lib/ai4r/genetic_algorithm/tsp_chromosome.rb +83 -0
- data/lib/ai4r/hmm/hidden_markov_model.rb +134 -0
- data/lib/ai4r/neural_network/activation_functions.rb +37 -0
- data/lib/ai4r/neural_network/backpropagation.rb +399 -134
- data/lib/ai4r/neural_network/hopfield.rb +175 -58
- data/lib/ai4r/neural_network/transformer.rb +194 -0
- data/lib/ai4r/neural_network/weight_initializations.rb +40 -0
- data/lib/ai4r/reinforcement/policy_iteration.rb +66 -0
- data/lib/ai4r/reinforcement/q_learning.rb +51 -0
- data/lib/ai4r/search/a_star.rb +76 -0
- data/lib/ai4r/search/bfs.rb +50 -0
- data/lib/ai4r/search/dfs.rb +50 -0
- data/lib/ai4r/search/mcts.rb +118 -0
- data/lib/ai4r/search.rb +12 -0
- data/lib/ai4r/som/distance_metrics.rb +29 -0
- data/lib/ai4r/som/layer.rb +28 -17
- data/lib/ai4r/som/node.rb +61 -32
- data/lib/ai4r/som/som.rb +158 -41
- data/lib/ai4r/som/two_phase_layer.rb +21 -25
- data/lib/ai4r/version.rb +3 -0
- data/lib/ai4r.rb +57 -28
- metadata +79 -109
- data/README.rdoc +0 -39
- data/test/classifiers/hyperpipes_test.rb +0 -84
- data/test/classifiers/ib1_test.rb +0 -78
- data/test/classifiers/id3_test.rb +0 -220
- data/test/classifiers/multilayer_perceptron_test.rb +0 -79
- data/test/classifiers/naive_bayes_test.rb +0 -43
- data/test/classifiers/one_r_test.rb +0 -62
- data/test/classifiers/prism_test.rb +0 -85
- data/test/classifiers/simple_linear_regression_test.rb +0 -37
- data/test/classifiers/zero_r_test.rb +0 -50
- data/test/clusterers/average_linkage_test.rb +0 -51
- data/test/clusterers/bisecting_k_means_test.rb +0 -66
- data/test/clusterers/centroid_linkage_test.rb +0 -53
- data/test/clusterers/complete_linkage_test.rb +0 -57
- data/test/clusterers/diana_test.rb +0 -69
- data/test/clusterers/k_means_test.rb +0 -167
- data/test/clusterers/median_linkage_test.rb +0 -53
- data/test/clusterers/single_linkage_test.rb +0 -122
- data/test/clusterers/ward_linkage_hierarchical_test.rb +0 -81
- data/test/clusterers/ward_linkage_test.rb +0 -53
- data/test/clusterers/weighted_average_linkage_test.rb +0 -53
- data/test/data/data_set_test.rb +0 -104
- data/test/data/proximity_test.rb +0 -87
- data/test/data/statistics_test.rb +0 -65
- data/test/experiment/classifier_evaluator_test.rb +0 -76
- data/test/genetic_algorithm/chromosome_test.rb +0 -57
- data/test/genetic_algorithm/genetic_algorithm_test.rb +0 -81
- data/test/neural_network/backpropagation_test.rb +0 -82
- data/test/neural_network/hopfield_test.rb +0 -72
- data/test/som/som_test.rb +0 -97
@@ -1,172 +1,278 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
# Author:: Sergio Fierens (implementation)
|
2
4
|
# License:: MPL 1.1
|
3
5
|
# Project:: ai4r
|
4
|
-
# Url::
|
6
|
+
# Url:: https://github.com/SergioFierens/ai4r
|
5
7
|
#
|
6
|
-
# You can redistribute it and/or modify it under the terms of
|
7
|
-
# the Mozilla Public License version 1.1 as published by the
|
8
|
+
# You can redistribute it and/or modify it under the terms of
|
9
|
+
# the Mozilla Public License version 1.1 as published by the
|
8
10
|
# Mozilla Foundation at http://www.mozilla.org/MPL/MPL-1.1.txt
|
9
11
|
|
10
|
-
|
11
|
-
|
12
|
-
|
12
|
+
require_relative '../data/data_set'
|
13
|
+
require_relative '../data/proximity'
|
14
|
+
require_relative '../clusterers/clusterer'
|
15
|
+
require_relative '../clusterers/cluster_tree'
|
13
16
|
|
14
17
|
module Ai4r
|
15
18
|
module Clusterers
|
16
|
-
|
17
|
-
# Implementation of a Hierarchical clusterer with single linkage (Everitt et
|
19
|
+
# Implementation of a Hierarchical clusterer with single linkage (Everitt et
|
18
20
|
# al., 2001 ; Johnson, 1967 ; Jain and Dubes, 1988 ; Sneath, 1957 )
|
19
|
-
# Hierarchical clusterer create one cluster per element, and then
|
21
|
+
# A hierarchical clusterer creates one cluster per element, and then
|
20
22
|
# progressively merges clusters, until the required number of clusters
|
21
23
|
# is reached.
|
22
|
-
# With single linkage, the distance between two clusters is computed as the
|
24
|
+
# With single linkage, the distance between two clusters is computed as the
|
23
25
|
# distance between the two closest elements in the two clusters.
|
24
26
|
#
|
25
27
|
# D(cx, (ci U cj)) = min(D(cx, ci), D(cx, cj))
|
26
28
|
class SingleLinkage < Clusterer
|
27
|
-
|
29
|
+
include ClusterTree
|
30
|
+
|
28
31
|
attr_reader :data_set, :number_of_clusters, :clusters
|
29
|
-
|
30
|
-
parameters_info :
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
32
|
+
|
33
|
+
parameters_info distance_function:
|
34
|
+
'Custom implementation of distance function. ' \
|
35
|
+
'It must be a closure receiving two data items and return the ' \
|
36
|
+
'distance between them. By default, this algorithm uses ' \
|
37
|
+
'euclidean distance of numeric attributes to the power of 2.'
|
38
|
+
|
39
|
+
# @return [Object]
|
40
|
+
def initialize(*args)
|
41
|
+
super(*args)
|
42
|
+
@distance_function = lambda do |a, b|
|
43
|
+
Ai4r::Data::Proximity.squared_euclidean_distance(
|
44
|
+
a.select { |att_a| att_a.is_a? Numeric },
|
45
|
+
b.select { |att_b| att_b.is_a? Numeric }
|
46
|
+
)
|
47
|
+
end
|
42
48
|
end
|
43
|
-
|
49
|
+
|
44
50
|
# Build a new clusterer, using data examples found in data_set.
|
45
51
|
# Items will be clustered in "number_of_clusters" different
|
46
52
|
# clusters.
|
47
|
-
|
53
|
+
#
|
54
|
+
# If you specify the :distance option, merging stops as soon as either
|
55
|
+
# number_of_clusters is reached or no two clusters are within :distance of each other
|
56
|
+
# @param data_set [Object]
|
57
|
+
# @param number_of_clusters [Object]
|
58
|
+
# @param options [Hash] optional keyword arguments; :distance sets the merge-distance cutoff
|
59
|
+
# @return [Object]
|
60
|
+
def build(data_set, number_of_clusters = 1, **options)
|
48
61
|
@data_set = data_set
|
49
|
-
|
50
|
-
|
62
|
+
distance = options[:distance] || Float::INFINITY
|
63
|
+
|
51
64
|
@index_clusters = create_initial_index_clusters
|
52
65
|
create_distance_matrix(data_set)
|
53
|
-
while @index_clusters.length >
|
66
|
+
while @index_clusters.length > number_of_clusters
|
54
67
|
ci, cj = get_closest_clusters(@index_clusters)
|
68
|
+
break if read_distance_matrix(ci, cj) > distance
|
69
|
+
|
55
70
|
update_distance_matrix(ci, cj)
|
56
71
|
merge_clusters(ci, cj, @index_clusters)
|
57
72
|
end
|
73
|
+
|
74
|
+
@number_of_clusters = @index_clusters.length
|
75
|
+
@distance_matrix = nil
|
58
76
|
@clusters = build_clusters_from_index_clusters @index_clusters
|
59
|
-
|
60
|
-
return self
|
77
|
+
self
|
61
78
|
end
|
62
|
-
|
63
|
-
#
|
79
|
+
|
80
|
+
# @param clusters [Object]
|
81
|
+
# @return [Object]
|
82
|
+
def draw_map(clusters)
|
83
|
+
map = Array.new(11) { Array.new(11, 0) }
|
84
|
+
clusters.each_index do |i|
|
85
|
+
clusters[i].data_items.each do |point|
|
86
|
+
map[point.first][point.last] = (i + 1)
|
87
|
+
end
|
88
|
+
end
|
89
|
+
map
|
90
|
+
end
|
91
|
+
|
92
|
+
# Classifies the given data item, returning the cluster index it belongs
|
64
93
|
# to (0-based).
|
94
|
+
# @param data_item [Object]
|
95
|
+
# @return [Object]
|
65
96
|
def eval(data_item)
|
66
|
-
get_min_index(@clusters.collect
|
67
|
-
|
97
|
+
get_min_index(@clusters.collect do |cluster|
|
98
|
+
distance_between_item_and_cluster(data_item, cluster)
|
99
|
+
end)
|
100
|
+
end
|
101
|
+
|
102
|
+
protected
|
103
|
+
|
104
|
+
# @param i [Object]
|
105
|
+
# @param j [Object]
|
106
|
+
# @return [Object]
|
107
|
+
def distance_between_indexes(i, j)
|
108
|
+
@distance_function.call(@data_set.data_items[i], @data_set.data_items[j])
|
109
|
+
end
|
110
|
+
|
111
|
+
public
|
112
|
+
|
113
|
+
# Compute mean silhouette coefficient of the clustering result.
|
114
|
+
# Returns a float between -1 and 1. Only valid after build.
|
115
|
+
# @return [Object]
|
116
|
+
def silhouette
|
117
|
+
return nil unless @index_clusters && @data_set
|
118
|
+
|
119
|
+
total = 0.0
|
120
|
+
count = @data_set.data_items.length
|
121
|
+
|
122
|
+
@index_clusters.each_with_index do |cluster, ci|
|
123
|
+
cluster.each do |index|
|
124
|
+
a = 0.0
|
125
|
+
if cluster.length > 1
|
126
|
+
cluster.each do |j|
|
127
|
+
next if j == index
|
128
|
+
|
129
|
+
a += distance_between_indexes(index, j)
|
130
|
+
end
|
131
|
+
a /= (cluster.length - 1)
|
132
|
+
end
|
133
|
+
|
134
|
+
b = nil
|
135
|
+
@index_clusters.each_with_index do |other_cluster, cj|
|
136
|
+
next if ci == cj
|
137
|
+
|
138
|
+
dist = 0.0
|
139
|
+
other_cluster.each do |j|
|
140
|
+
dist += distance_between_indexes(index, j)
|
141
|
+
end
|
142
|
+
dist /= other_cluster.length
|
143
|
+
b = dist if b.nil? || dist < b
|
144
|
+
end
|
145
|
+
s = b&.positive? ? (b - a) / [a, b].max : 0.0
|
146
|
+
total += s
|
147
|
+
end
|
148
|
+
end
|
149
|
+
|
150
|
+
total / count
|
68
151
|
end
|
69
|
-
|
152
|
+
|
70
153
|
protected
|
71
|
-
|
154
|
+
|
72
155
|
# returns [ [0], [1], [2], ... , [n-1] ]
|
73
156
|
# where n is the number of data items in the data set
|
157
|
+
# @return [Object]
|
74
158
|
def create_initial_index_clusters
|
75
159
|
index_clusters = []
|
76
|
-
@data_set.data_items.length.times {|i| index_clusters << [i]}
|
77
|
-
|
160
|
+
@data_set.data_items.length.times { |i| index_clusters << [i] }
|
161
|
+
index_clusters
|
78
162
|
end
|
79
|
-
|
163
|
+
|
80
164
|
# Create a partial distance matrix:
|
81
|
-
# [
|
82
|
-
# [d(1,0)],
|
83
|
-
# [d(2,0)
|
84
|
-
# [d(3,0)
|
85
|
-
# ...
|
86
|
-
# [d(n-1,0)
|
165
|
+
# [
|
166
|
+
# [d(1,0)],
|
167
|
+
# [d(2,0), d(2,1)],
|
168
|
+
# [d(3,0), d(3,1), d(3,2)],
|
169
|
+
# ...
|
170
|
+
# [d(n-1,0), d(n-1,1), d(n-1,2), ... , d(n-1,n-2)]
|
87
171
|
# ]
|
88
172
|
# where n is the number of data items in the data set
|
173
|
+
# @param data_set [Object]
|
174
|
+
# @return [Object]
|
89
175
|
def create_distance_matrix(data_set)
|
90
|
-
@distance_matrix = Array.new(data_set.data_items.length-1)
|
176
|
+
@distance_matrix = Array.new(data_set.data_items.length - 1) do |index|
|
177
|
+
Array.new(index + 1)
|
178
|
+
end
|
91
179
|
data_set.data_items.each_with_index do |a, i|
|
92
180
|
i.times do |j|
|
93
181
|
b = data_set.data_items[j]
|
94
|
-
@distance_matrix[i-1][j] = @distance_function.call(a, b)
|
182
|
+
@distance_matrix[i - 1][j] = @distance_function.call(a, b)
|
95
183
|
end
|
96
184
|
end
|
97
185
|
end
|
98
|
-
|
186
|
+
|
99
187
|
# Returns the distance between element data_item[index_a] and
|
100
188
|
# data_item[index_b] using the distance matrix
|
189
|
+
# @param index_a [Object]
|
190
|
+
# @param index_b [Object]
|
191
|
+
# @return [Object]
|
101
192
|
def read_distance_matrix(index_a, index_b)
|
102
193
|
return 0 if index_a == index_b
|
194
|
+
|
103
195
|
index_a, index_b = index_b, index_a if index_b > index_a
|
104
|
-
|
196
|
+
@distance_matrix[index_a - 1][index_b]
|
105
197
|
end
|
106
198
|
|
107
199
|
# ci and cj are the indexes of the clusters that are going to
|
108
|
-
# be merged. We need to remove distances from/to ci and cj,
|
200
|
+
# be merged. We need to remove distances from/to ci and cj,
|
109
201
|
# and add distances from/to new cluster (ci U cj)
|
202
|
+
# @param ci [Object]
|
203
|
+
# @param cj [Object]
|
204
|
+
# @return [Object]
|
110
205
|
def update_distance_matrix(ci, cj)
|
111
206
|
ci, cj = cj, ci if cj > ci
|
112
|
-
distances_to_new_cluster =
|
113
|
-
(@distance_matrix.length+1).times do |cx|
|
114
|
-
if cx!= ci && cx!=cj
|
115
|
-
distances_to_new_cluster << linkage_distance(cx, ci, cj)
|
116
|
-
end
|
207
|
+
distances_to_new_cluster = []
|
208
|
+
(@distance_matrix.length + 1).times do |cx|
|
209
|
+
distances_to_new_cluster << linkage_distance(cx, ci, cj) if cx != ci && cx != cj
|
117
210
|
end
|
118
|
-
if cj
|
119
|
-
@distance_matrix.delete_at(1)
|
120
|
-
@distance_matrix.delete_at(0)
|
121
|
-
elsif cj
|
122
|
-
@distance_matrix.delete_at(ci-1)
|
123
|
-
@distance_matrix.delete_at(0)
|
211
|
+
if cj.zero? && ci == 1
|
212
|
+
@distance_matrix.delete_at(1)
|
213
|
+
@distance_matrix.delete_at(0)
|
214
|
+
elsif cj.zero?
|
215
|
+
@distance_matrix.delete_at(ci - 1)
|
216
|
+
@distance_matrix.delete_at(0)
|
124
217
|
else
|
125
|
-
@distance_matrix.delete_at(ci-1)
|
126
|
-
@distance_matrix.delete_at(cj-1)
|
218
|
+
@distance_matrix.delete_at(ci - 1)
|
219
|
+
@distance_matrix.delete_at(cj - 1)
|
127
220
|
end
|
128
|
-
@distance_matrix.each do |d|
|
221
|
+
@distance_matrix.each do |d|
|
129
222
|
d.delete_at(ci)
|
130
223
|
d.delete_at(cj)
|
131
224
|
end
|
132
225
|
@distance_matrix << distances_to_new_cluster
|
133
226
|
end
|
134
|
-
|
227
|
+
|
135
228
|
# return distance between cluster cx and new cluster (ci U cj),
|
136
229
|
# using single linkage
|
137
|
-
|
138
|
-
|
139
|
-
|
230
|
+
# @param cluster_x [Object]
|
231
|
+
# @param cluster_i [Object]
|
232
|
+
# @param cluster_j [Object]
|
233
|
+
# @return [Object]
|
234
|
+
def linkage_distance(cluster_x, cluster_i, cluster_j)
|
235
|
+
[read_distance_matrix(cluster_x, cluster_i),
|
236
|
+
read_distance_matrix(cluster_x, cluster_j)].min
|
140
237
|
end
|
141
|
-
|
142
|
-
# cluster_a and cluster_b are removed from index_cluster,
|
238
|
+
|
239
|
+
# cluster_a and cluster_b are removed from index_cluster,
|
143
240
|
# and a new cluster with all members of cluster_a and cluster_b
|
144
|
-
# is added.
|
241
|
+
# is added.
|
145
242
|
# It modifies index clusters array.
|
243
|
+
# @param index_a [Object]
|
244
|
+
# @param index_b [Object]
|
245
|
+
# @param index_clusters [Object]
|
246
|
+
# @return [Object]
|
146
247
|
def merge_clusters(index_a, index_b, index_clusters)
|
147
248
|
index_a, index_b = index_b, index_a if index_b > index_a
|
148
249
|
new_index_cluster = index_clusters[index_a] +
|
149
|
-
|
250
|
+
index_clusters[index_b]
|
150
251
|
index_clusters.delete_at index_a
|
151
252
|
index_clusters.delete_at index_b
|
152
253
|
index_clusters << new_index_cluster
|
153
|
-
|
254
|
+
index_clusters
|
154
255
|
end
|
155
|
-
|
156
|
-
# Given an array with clusters of data_items indexes,
|
157
|
-
# it returns an array of data_items clusters
|
256
|
+
|
257
|
+
# Given an array with clusters of data_items indexes,
|
258
|
+
# it returns an array of data_items clusters
|
259
|
+
# @param index_clusters [Object]
|
260
|
+
# @return [Object]
|
158
261
|
def build_clusters_from_index_clusters(index_clusters)
|
159
|
-
|
160
|
-
|
161
|
-
|
162
|
-
|
262
|
+
index_clusters.collect do |index_cluster|
|
263
|
+
Ai4r::Data::DataSet.new(data_labels: @data_set.data_labels,
|
264
|
+
data_items: index_cluster.collect do |i|
|
265
|
+
@data_set.data_items[i]
|
266
|
+
end)
|
163
267
|
end
|
164
268
|
end
|
165
|
-
|
269
|
+
|
166
270
|
# Returns an array with the indexes of the two closest
|
167
271
|
# clusters => [index_cluster_a, index_cluster_b]
|
272
|
+
# @param index_clusters [Object]
|
273
|
+
# @return [Object]
|
168
274
|
def get_closest_clusters(index_clusters)
|
169
|
-
min_distance =
|
275
|
+
min_distance = Float::INFINITY
|
170
276
|
closest_clusters = [1, 0]
|
171
277
|
index_clusters.each_index do |index_a|
|
172
278
|
index_a.times do |index_b|
|
@@ -177,18 +283,20 @@ module Ai4r
|
|
177
283
|
end
|
178
284
|
end
|
179
285
|
end
|
180
|
-
|
286
|
+
closest_clusters
|
181
287
|
end
|
182
|
-
|
288
|
+
|
289
|
+
# @param data_item [Object]
|
290
|
+
# @param cluster [Object]
|
291
|
+
# @return [Object]
|
183
292
|
def distance_between_item_and_cluster(data_item, cluster)
|
184
|
-
min_dist =
|
293
|
+
min_dist = Float::INFINITY
|
185
294
|
cluster.data_items.each do |another_item|
|
186
295
|
dist = @distance_function.call(data_item, another_item)
|
187
296
|
min_dist = dist if dist < min_dist
|
188
297
|
end
|
189
|
-
|
298
|
+
min_dist
|
190
299
|
end
|
191
|
-
|
192
300
|
end
|
193
301
|
end
|
194
302
|
end
|
@@ -1,64 +1,80 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
# Author:: Sergio Fierens (implementation)
|
2
4
|
# License:: MPL 1.1
|
3
5
|
# Project:: ai4r
|
4
|
-
# Url::
|
6
|
+
# Url:: https://github.com/SergioFierens/ai4r
|
5
7
|
#
|
6
|
-
# You can redistribute it and/or modify it under the terms of
|
7
|
-
# the Mozilla Public License version 1.1 as published by the
|
8
|
+
# You can redistribute it and/or modify it under the terms of
|
9
|
+
# the Mozilla Public License version 1.1 as published by the
|
8
10
|
# Mozilla Foundation at http://www.mozilla.org/MPL/MPL-1.1.txt
|
9
11
|
|
10
|
-
|
11
|
-
|
12
|
+
require_relative '../data/data_set'
|
13
|
+
require_relative '../clusterers/single_linkage'
|
14
|
+
require_relative '../clusterers/cluster_tree'
|
12
15
|
|
13
16
|
module Ai4r
|
14
17
|
module Clusterers
|
15
|
-
|
16
|
-
# Implementation of an Agglomerative Hierarchical clusterer with
|
18
|
+
# Implementation of an Agglomerative Hierarchical clusterer with
|
17
19
|
# Ward's method linkage algorithm, aka the minimum variance method (Everitt
|
18
20
|
# et al., 2001 ; Jain and Dubes, 1988 ; Ward, 1963 ).
|
19
|
-
# Hierarchical clusterer create one cluster per element, and then
|
21
|
+
# A hierarchical clusterer creates one cluster per element, and then
|
20
22
|
# progressively merges clusters, until the required number of clusters
|
21
23
|
# is reached.
|
22
|
-
# The objective of this method is to minimize the variance.
|
23
|
-
#
|
24
|
-
# D(cx, (ci U cj)) = (ni/(ni+nj+nx))*D(cx, ci) +
|
25
|
-
# (nj/(ni+nj+nx))*D(cx, cj) -
|
24
|
+
# The objective of this method is to minimize the variance.
|
25
|
+
#
|
26
|
+
# D(cx, (ci U cj)) = ((ni+nx)/(ni+nj+nx))*D(cx, ci) +
|
27
|
+
# ((nj+nx)/(ni+nj+nx))*D(cx, cj) -
|
26
28
|
# (nx/(ni+nj)^2)*D(ci, cj)
|
27
29
|
class WardLinkage < SingleLinkage
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
30
|
+
include ClusterTree
|
31
|
+
|
32
|
+
parameters_info distance_function:
|
33
|
+
'Custom implementation of distance function. ' \
|
34
|
+
'It must be a closure receiving two data items and return the ' \
|
35
|
+
'distance between them. By default, this algorithm uses ' \
|
36
|
+
'euclidean distance of numeric attributes to the power of 2.'
|
37
|
+
|
35
38
|
# Build a new clusterer, using data examples found in data_set.
|
36
39
|
# Items will be clustered in "number_of_clusters" different
|
37
40
|
# clusters.
|
38
|
-
|
41
|
+
# @param data_set [Object]
|
42
|
+
# @param number_of_clusters [Object]
|
43
|
+
# @param options [Hash] optional keyword arguments, passed through to the parent build
|
44
|
+
# @return [Object]
|
45
|
+
def build(data_set, number_of_clusters = 1, **options)
|
39
46
|
super
|
40
47
|
end
|
41
|
-
|
42
|
-
# This algorithms does not allow classification of new data items
|
48
|
+
|
49
|
+
# This algorithm does not allow classification of new data items
|
43
50
|
# once it has been built. Rebuild the cluster including your data element.
|
44
|
-
|
45
|
-
|
51
|
+
# @param _data_item [Object]
|
52
|
+
# @return [Object]
|
53
|
+
def eval(_data_item)
|
54
|
+
raise NotImplementedError, 'Eval of new data is not supported by this algorithm.'
|
46
55
|
end
|
47
|
-
|
56
|
+
|
57
|
+
# @return [Object]
|
58
|
+
def supports_eval?
|
59
|
+
false
|
60
|
+
end
|
61
|
+
|
48
62
|
protected
|
49
|
-
|
63
|
+
|
50
64
|
# return distance between cluster cx and cluster (ci U cj),
|
51
65
|
# using ward's method linkage
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
66
|
+
# @param cluster_x [Object]
|
67
|
+
# @param cluster_i [Object]
|
68
|
+
# @param cluster_j [Object]
|
69
|
+
# @return [Object]
|
70
|
+
def linkage_distance(cluster_x, cluster_i, cluster_j)
|
71
|
+
ni = @index_clusters[cluster_i].length
|
72
|
+
nj = @index_clusters[cluster_j].length
|
73
|
+
nx = @index_clusters[cluster_x].length
|
74
|
+
((((1.0 * (ni + nx) * read_distance_matrix(cluster_x, cluster_i)) +
|
75
|
+
(1.0 * (nj + nx) * read_distance_matrix(cluster_x, cluster_j))) / (ni + nj + nx)) -
|
76
|
+
(1.0 * nx * read_distance_matrix(cluster_i, cluster_j) / ((ni + nj)**2)))
|
59
77
|
end
|
60
|
-
|
61
78
|
end
|
62
79
|
end
|
63
80
|
end
|
64
|
-
|
@@ -1,39 +1,55 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
# Author:: Peter Lubell-Doughtie
|
2
4
|
# License:: BSD 3 Clause
|
3
5
|
# Project:: ai4r
|
4
6
|
# Url:: http://peet.ldee.org
|
5
7
|
|
6
|
-
|
8
|
+
require_relative '../clusterers/ward_linkage'
|
9
|
+
require_relative '../clusterers/cluster_tree'
|
7
10
|
|
8
11
|
module Ai4r
|
9
12
|
module Clusterers
|
10
|
-
|
11
13
|
# Hierarchical version to store classes as merges occur.
|
12
14
|
class WardLinkageHierarchical < WardLinkage
|
15
|
+
include ClusterTree
|
13
16
|
|
14
|
-
|
15
|
-
|
17
|
+
# @param depth [Object]
|
18
|
+
# @return [Object]
|
16
19
|
def initialize(depth = nil)
|
17
20
|
@cluster_tree = []
|
18
21
|
@depth = depth
|
19
22
|
@merges_so_far = 0
|
20
|
-
super()
|
23
|
+
super(depth)
|
21
24
|
end
|
22
25
|
|
23
|
-
|
26
|
+
# @param data_set [Object]
|
27
|
+
# @param number_of_clusters [Object]
|
28
|
+
# @param options [Hash] optional keyword arguments, passed through to the parent build
|
29
|
+
# @return [Object]
|
30
|
+
def build(data_set, number_of_clusters = 1, **options)
|
24
31
|
data_len = data_set.data_items.length
|
25
32
|
@total_merges = data_len - number_of_clusters
|
26
33
|
super
|
27
|
-
@cluster_tree <<
|
34
|
+
@cluster_tree << clusters
|
28
35
|
@cluster_tree.reverse!
|
29
|
-
|
36
|
+
self
|
37
|
+
end
|
38
|
+
|
39
|
+
# @return [Object]
|
40
|
+
def supports_eval?
|
41
|
+
false
|
30
42
|
end
|
31
43
|
|
32
44
|
protected
|
33
45
|
|
46
|
+
# @param index_a [Object]
|
47
|
+
# @param index_b [Object]
|
48
|
+
# @param index_clusters [Object]
|
49
|
+
# @return [Object]
|
34
50
|
def merge_clusters(index_a, index_b, index_clusters)
|
35
51
|
# only store if no or above depth
|
36
|
-
if @depth.nil?
|
52
|
+
if @depth.nil? || (@merges_so_far > @total_merges - @depth)
|
37
53
|
# store current clusters
|
38
54
|
stored_distance_matrix = @distance_matrix.dup
|
39
55
|
@cluster_tree << build_clusters_from_index_clusters(index_clusters)
|
@@ -45,4 +61,3 @@ module Ai4r
|
|
45
61
|
end
|
46
62
|
end
|
47
63
|
end
|
48
|
-
|