ai4r 1.13 → 2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (129)
  1. checksums.yaml +7 -0
  2. data/README.md +174 -0
  3. data/examples/classifiers/hyperpipes_data.csv +14 -0
  4. data/examples/classifiers/hyperpipes_example.rb +22 -0
  5. data/examples/classifiers/ib1_example.rb +12 -0
  6. data/examples/classifiers/id3_example.rb +15 -10
  7. data/examples/classifiers/id3_graphviz_example.rb +17 -0
  8. data/examples/classifiers/logistic_regression_example.rb +11 -0
  9. data/examples/classifiers/naive_bayes_attributes_example.rb +13 -0
  10. data/examples/classifiers/naive_bayes_example.rb +12 -13
  11. data/examples/classifiers/one_r_example.rb +27 -0
  12. data/examples/classifiers/parameter_tutorial.rb +29 -0
  13. data/examples/classifiers/prism_nominal_example.rb +15 -0
  14. data/examples/classifiers/prism_numeric_example.rb +21 -0
  15. data/examples/classifiers/simple_linear_regression_example.rb +14 -11
  16. data/examples/classifiers/zero_and_one_r_example.rb +34 -0
  17. data/examples/classifiers/zero_one_r_data.csv +8 -0
  18. data/examples/clusterers/clusterer_example.rb +40 -34
  19. data/examples/clusterers/dbscan_example.rb +17 -0
  20. data/examples/clusterers/dendrogram_example.rb +17 -0
  21. data/examples/clusterers/hierarchical_dendrogram_example.rb +20 -0
  22. data/examples/clusterers/kmeans_custom_example.rb +26 -0
  23. data/examples/genetic_algorithm/bitstring_example.rb +41 -0
  24. data/examples/genetic_algorithm/genetic_algorithm_example.rb +26 -18
  25. data/examples/genetic_algorithm/kmeans_seed_tuning.rb +45 -0
  26. data/examples/neural_network/backpropagation_example.rb +48 -48
  27. data/examples/neural_network/hopfield_example.rb +45 -0
  28. data/examples/neural_network/patterns_with_base_noise.rb +39 -39
  29. data/examples/neural_network/patterns_with_noise.rb +41 -39
  30. data/examples/neural_network/train_epochs_callback.rb +25 -0
  31. data/examples/neural_network/training_patterns.rb +39 -39
  32. data/examples/neural_network/transformer_text_classification.rb +78 -0
  33. data/examples/neural_network/xor_example.rb +23 -22
  34. data/examples/reinforcement/q_learning_example.rb +10 -0
  35. data/examples/som/som_data.rb +155 -152
  36. data/examples/som/som_multi_node_example.rb +12 -13
  37. data/examples/som/som_single_example.rb +12 -15
  38. data/examples/transformer/decode_classifier_example.rb +68 -0
  39. data/examples/transformer/deterministic_example.rb +10 -0
  40. data/examples/transformer/seq2seq_example.rb +16 -0
  41. data/lib/ai4r/classifiers/classifier.rb +24 -16
  42. data/lib/ai4r/classifiers/gradient_boosting.rb +64 -0
  43. data/lib/ai4r/classifiers/hyperpipes.rb +119 -43
  44. data/lib/ai4r/classifiers/ib1.rb +122 -32
  45. data/lib/ai4r/classifiers/id3.rb +524 -145
  46. data/lib/ai4r/classifiers/logistic_regression.rb +96 -0
  47. data/lib/ai4r/classifiers/multilayer_perceptron.rb +75 -59
  48. data/lib/ai4r/classifiers/naive_bayes.rb +95 -34
  49. data/lib/ai4r/classifiers/one_r.rb +112 -44
  50. data/lib/ai4r/classifiers/prism.rb +167 -76
  51. data/lib/ai4r/classifiers/random_forest.rb +72 -0
  52. data/lib/ai4r/classifiers/simple_linear_regression.rb +83 -58
  53. data/lib/ai4r/classifiers/support_vector_machine.rb +91 -0
  54. data/lib/ai4r/classifiers/votes.rb +57 -0
  55. data/lib/ai4r/classifiers/zero_r.rb +71 -30
  56. data/lib/ai4r/clusterers/average_linkage.rb +46 -27
  57. data/lib/ai4r/clusterers/bisecting_k_means.rb +50 -44
  58. data/lib/ai4r/clusterers/centroid_linkage.rb +52 -36
  59. data/lib/ai4r/clusterers/cluster_tree.rb +50 -0
  60. data/lib/ai4r/clusterers/clusterer.rb +29 -14
  61. data/lib/ai4r/clusterers/complete_linkage.rb +42 -31
  62. data/lib/ai4r/clusterers/dbscan.rb +134 -0
  63. data/lib/ai4r/clusterers/diana.rb +75 -49
  64. data/lib/ai4r/clusterers/k_means.rb +270 -135
  65. data/lib/ai4r/clusterers/median_linkage.rb +49 -33
  66. data/lib/ai4r/clusterers/single_linkage.rb +196 -88
  67. data/lib/ai4r/clusterers/ward_linkage.rb +51 -35
  68. data/lib/ai4r/clusterers/ward_linkage_hierarchical.rb +25 -10
  69. data/lib/ai4r/clusterers/weighted_average_linkage.rb +48 -32
  70. data/lib/ai4r/data/data_set.rb +223 -103
  71. data/lib/ai4r/data/parameterizable.rb +31 -25
  72. data/lib/ai4r/data/proximity.rb +62 -62
  73. data/lib/ai4r/data/statistics.rb +46 -35
  74. data/lib/ai4r/experiment/classifier_evaluator.rb +84 -32
  75. data/lib/ai4r/experiment/split.rb +39 -0
  76. data/lib/ai4r/genetic_algorithm/chromosome_base.rb +43 -0
  77. data/lib/ai4r/genetic_algorithm/genetic_algorithm.rb +92 -170
  78. data/lib/ai4r/genetic_algorithm/tsp_chromosome.rb +83 -0
  79. data/lib/ai4r/hmm/hidden_markov_model.rb +134 -0
  80. data/lib/ai4r/neural_network/activation_functions.rb +37 -0
  81. data/lib/ai4r/neural_network/backpropagation.rb +399 -134
  82. data/lib/ai4r/neural_network/hopfield.rb +175 -58
  83. data/lib/ai4r/neural_network/transformer.rb +194 -0
  84. data/lib/ai4r/neural_network/weight_initializations.rb +40 -0
  85. data/lib/ai4r/reinforcement/policy_iteration.rb +66 -0
  86. data/lib/ai4r/reinforcement/q_learning.rb +51 -0
  87. data/lib/ai4r/search/a_star.rb +76 -0
  88. data/lib/ai4r/search/bfs.rb +50 -0
  89. data/lib/ai4r/search/dfs.rb +50 -0
  90. data/lib/ai4r/search/mcts.rb +118 -0
  91. data/lib/ai4r/search.rb +12 -0
  92. data/lib/ai4r/som/distance_metrics.rb +29 -0
  93. data/lib/ai4r/som/layer.rb +28 -17
  94. data/lib/ai4r/som/node.rb +61 -32
  95. data/lib/ai4r/som/som.rb +158 -41
  96. data/lib/ai4r/som/two_phase_layer.rb +21 -25
  97. data/lib/ai4r/version.rb +3 -0
  98. data/lib/ai4r.rb +57 -28
  99. metadata +79 -109
  100. data/README.rdoc +0 -39
  101. data/test/classifiers/hyperpipes_test.rb +0 -84
  102. data/test/classifiers/ib1_test.rb +0 -78
  103. data/test/classifiers/id3_test.rb +0 -220
  104. data/test/classifiers/multilayer_perceptron_test.rb +0 -79
  105. data/test/classifiers/naive_bayes_test.rb +0 -43
  106. data/test/classifiers/one_r_test.rb +0 -62
  107. data/test/classifiers/prism_test.rb +0 -85
  108. data/test/classifiers/simple_linear_regression_test.rb +0 -37
  109. data/test/classifiers/zero_r_test.rb +0 -50
  110. data/test/clusterers/average_linkage_test.rb +0 -51
  111. data/test/clusterers/bisecting_k_means_test.rb +0 -66
  112. data/test/clusterers/centroid_linkage_test.rb +0 -53
  113. data/test/clusterers/complete_linkage_test.rb +0 -57
  114. data/test/clusterers/diana_test.rb +0 -69
  115. data/test/clusterers/k_means_test.rb +0 -167
  116. data/test/clusterers/median_linkage_test.rb +0 -53
  117. data/test/clusterers/single_linkage_test.rb +0 -122
  118. data/test/clusterers/ward_linkage_hierarchical_test.rb +0 -81
  119. data/test/clusterers/ward_linkage_test.rb +0 -53
  120. data/test/clusterers/weighted_average_linkage_test.rb +0 -53
  121. data/test/data/data_set_test.rb +0 -104
  122. data/test/data/proximity_test.rb +0 -87
  123. data/test/data/statistics_test.rb +0 -65
  124. data/test/experiment/classifier_evaluator_test.rb +0 -76
  125. data/test/genetic_algorithm/chromosome_test.rb +0 -57
  126. data/test/genetic_algorithm/genetic_algorithm_test.rb +0 -81
  127. data/test/neural_network/backpropagation_test.rb +0 -82
  128. data/test/neural_network/hopfield_test.rb +0 -72
  129. data/test/som/som_test.rb +0 -97
data/lib/ai4r/clusterers/k_means.rb
@@ -1,228 +1,363 @@
+ # frozen_string_literal: true
+
  # Author:: Sergio Fierens (implementation)
  # License:: MPL 1.1
  # Project:: ai4r
- # Url:: http://ai4r.org/
+ # Url:: https://github.com/SergioFierens/ai4r
  #
- # You can redistribute it and/or modify it under the terms of
- # the Mozilla Public License version 1.1 as published by the
+ # You can redistribute it and/or modify it under the terms of
+ # the Mozilla Public License version 1.1 as published by the
  # Mozilla Foundation at http://www.mozilla.org/MPL/MPL-1.1.txt

- require File.dirname(__FILE__) + '/../data/data_set'
- require File.dirname(__FILE__) + '/../data/proximity'
- require File.dirname(__FILE__) + '/../clusterers/clusterer'
+ require_relative '../data/data_set'
+ require_relative '../data/proximity'
+ require_relative '../clusterers/clusterer'

  module Ai4r
    module Clusterers
-
-     # The k-means algorithm is an algorithm to cluster n objects
+     # The k-means algorithm is an algorithm to cluster n objects
      # based on attributes into k partitions, with k < n.
-     #
+     #
      # More about K Means algorithm:
-     # http://en.wikipedia.org/wiki/K-means_algorithm
+     # http://en.wikipedia.org/wiki/K-means_algorithm
      class KMeans < Clusterer
-
-       attr_reader :data_set, :number_of_clusters
-       attr_reader :clusters, :centroids, :iterations
-
-       parameters_info :max_iterations => "Maximum number of iterations to " +
-         "build the clusterer. By default it is uncapped.",
-         :distance_function => "Custom implementation of distance function. " +
-         "It must be a closure receiving two data items and return the " +
-         "distance between them. By default, this algorithm uses " +
-         "euclidean distance of numeric attributes to the power of 2.",
-         :centroid_function => "Custom implementation to calculate the " +
-         "centroid of a cluster. It must be a closure receiving an array of " +
-         "data sets, and return an array of data items, representing the " +
-         "centroids of for each data set. " +
-         "By default, this algorithm returns a data items using the mode "+
-         "or mean of each attribute on each data set.",
-         :centroid_indices => "Indices of data items (indexed from 0) to be " +
-         "the initial centroids. Otherwise, the initial centroids will be " +
-         "assigned randomly from the data set.",
-         :on_empty => "Action to take if a cluster becomes empty, with values " +
-         "'eliminate' (the default action, eliminate the empty cluster), " +
-         "'terminate' (terminate with error), 'random' (relocate the " +
-         "empty cluster to a random point), 'outlier' (relocate the " +
-         "empty cluster to the point furthest from its centroid)."
-
+       attr_reader :data_set, :number_of_clusters, :clusters, :centroids, :iterations, :history
+
+       parameters_info(
+         max_iterations: 'Maximum number of iterations to build the clusterer. By default it is uncapped.',
+         distance_function: 'Custom implementation of distance function. ' \
+           'It must be a closure receiving two data items and return the ' \
+           'distance between them. By default, this algorithm uses ' \
+           'euclidean distance of numeric attributes to the power of 2.',
+         centroid_function: 'Custom implementation to calculate the ' \
+           'centroid of a cluster. It must be a closure receiving an array of ' \
+           'data sets, and return an array of data items, representing the ' \
+           'centroids of for each data set. ' \
+           'By default, this algorithm returns a data items using the mode ' \
+           'or mean of each attribute on each data set.',
+         centroid_indices: 'Indices of data items (indexed from 0) to be ' \
+           'the initial centroids. Otherwise, the initial centroids will be ' \
+           'assigned randomly from the data set.',
+         on_empty: 'Action to take if a cluster becomes empty, with values ' \
+           "'eliminate' (the default action, eliminate the empty cluster), " \
+           "'terminate' (terminate with error), 'random' (relocate the " \
+           "empty cluster to a random point), 'outlier' (relocate the " \
+           'empty cluster to the point furthest from its centroid).',
+         random_seed: "Seed value used to initialize Ruby's random number " \
+           'generator when selecting random centroids.',
+         init_method: 'Strategy to initialize centroids. Available values: ' \
+           ':random (default) and :kmeans_plus_plus.',
+         restarts: 'Number of random initializations to perform. ' \
+           'The best run (lowest SSE) will be kept.',
+         track_history: 'Keep centroids and assignments for each iteration ' \
+           'when building the clusterer.'
+       )
+
+       # @return [Object]
        def initialize
+         super()
          @distance_function = nil
          @max_iterations = nil
-         @centroid_function = lambda do |data_sets|
-           data_sets.collect{ |data_set| data_set.get_mean_or_mode}
+         @centroid_function = lambda do |data_sets|
+           data_sets.collect(&:get_mean_or_mode)
          end
          @centroid_indices = []
          @on_empty = 'eliminate' # default if none specified
+         @random_seed = nil
+         @rng = nil
+         @init_method = :random
+         @restarts = 1
+         @track_history = false
        end
-
-
+
        # Build a new clusterer, using data examples found in data_set.
        # Items will be clustered in "number_of_clusters" different
        # clusters.
+       # @param data_set [Object]
+       # @param number_of_clusters [Object]
+       # @return [Object]
        def build(data_set, number_of_clusters)
          @data_set = data_set
          @number_of_clusters = number_of_clusters
-         raise ArgumentError, 'Length of centroid indices array differs from the specified number of clusters' unless @centroid_indices.empty? || @centroid_indices.length == @number_of_clusters
-         raise ArgumentError, 'Invalid value for on_empty' unless @on_empty == 'eliminate' || @on_empty == 'terminate' || @on_empty == 'random' || @on_empty == 'outlier'
-         @iterations = 0
-
-         calc_initial_centroids
-         while(not stop_criteria_met)
-           calculate_membership_clusters
-           recompute_centroids
+         raise ArgumentError, 'Number of clusters larger than data items' if @number_of_clusters > @data_set.data_items.length
+
+         unless @centroid_indices.empty? || @centroid_indices.length == @number_of_clusters
+           raise ArgumentError,
+                 'Length of centroid indices array differs from the specified number of clusters'
+         end
+         unless @on_empty == 'eliminate' || @on_empty == 'terminate' || @on_empty == 'random' || @on_empty == 'outlier'
+           raise ArgumentError,
+                 'Invalid value for on_empty'
+         end
+
+         seed_base = @random_seed
+         best_sse = nil
+         best_centroids = nil
+         best_clusters = nil
+         best_iterations = nil
+
+         (@restarts || 1).times do |i|
+           @random_seed = seed_base.nil? ? nil : seed_base + i
+           @rng = @random_seed.nil? ? Random.new : Random.new(@random_seed)
+           @iterations = 0
+           @history = [] if @track_history
+           calc_initial_centroids
+           until stop_criteria_met
+             calculate_membership_clusters
+             if @track_history
+               @history << {
+                 centroids: @centroids.collect(&:dup),
+                 assignments: @assignments.dup
+               }
+             end
+             recompute_centroids
+           end
+           current_sse = sse
+           next unless best_sse.nil? || current_sse < best_sse
+
+           best_sse = current_sse
+           best_centroids = Marshal.load(Marshal.dump(@centroids))
+           best_clusters = Marshal.load(Marshal.dump(@clusters))
+           best_iterations = @iterations
          end
-
-         return self
+
+         @random_seed = seed_base
+         @rng = @random_seed.nil? ? Random.new : Random.new(@random_seed)
+         @centroids = best_centroids
+         @clusters = best_clusters
+         @iterations = best_iterations
+         self
        end
-
-       # Classifies the given data item, returning the cluster index it belongs
+
+       # Classifies the given data item, returning the cluster index it belongs
        # to (0-based).
+       # @param data_item [Object]
+       # @return [Object]
        def eval(data_item)
-         get_min_index(@centroids.collect {|centroid|
-           distance(data_item, centroid)})
+         get_min_index(@centroids.collect do |centroid|
+           distance(data_item, centroid)
+         end)
        end
-
+
+       # Sum of squared distances of all points to their respective centroids.
+       # It can be used as a measure of cluster compactness (SSE).
+       # @return [Object]
+       def sse
+         sum = 0.0
+         @clusters.each_with_index do |cluster, i|
+           centroid = @centroids[i]
+           cluster.data_items.each do |item|
+             sum += distance(item, centroid)
+           end
+         end
+         sum
+       end
+
        # This function calculates the distance between 2 different
-       # instances. By default, it returns the euclidean distance to the
+       # instances. By default, it returns the euclidean distance to the
        # power of 2.
        # You can provide a more convenient distance implementation:
-       #
+       #
       # 1- Overwriting this method
-       #
+       #
       # 2- Providing a closure to the :distance_function parameter
+       # @param a [Object]
+       # @param b [Object]
+       # @return [Object]
        def distance(a, b)
          return @distance_function.call(a, b) if @distance_function
-         return Ai4r::Data::Proximity.squared_euclidean_distance(
-           a.select {|att_a| att_a.is_a? Numeric} ,
-           b.select {|att_b| att_b.is_a? Numeric})
+
+         Ai4r::Data::Proximity.squared_euclidean_distance(
+           a.select { |att_a| att_a.is_a? Numeric },
+           b.select { |att_b| att_b.is_a? Numeric }
+         )
        end
-
-       protected
-
+
+       protected
+
+       # @return [Object]
        def calc_initial_centroids
-         @centroids, @old_centroids = [], nil
+         @centroids = []
+         @old_centroids = nil
          if @centroid_indices.empty?
-           populate_centroids('random')
+           if @init_method == :kmeans_plus_plus
+             kmeans_plus_plus_init
+           else
+             populate_centroids('random')
+           end
          else
           populate_centroids('indices')
          end
        end
-
+
+       # @return [Object]
        def stop_criteria_met
-         @old_centroids == @centroids ||
+         @old_centroids == @centroids ||
           (@max_iterations && (@max_iterations <= @iterations))
        end
-
+
+       # @return [Object]
        def calculate_membership_clusters
-         @clusters = Array.new(@number_of_clusters) do
-           Ai4r::Data::DataSet.new :data_labels => @data_set.data_labels
+         @clusters = Array.new(@number_of_clusters) do
+           Ai4r::Data::DataSet.new data_labels: @data_set.data_labels
          end
-         @cluster_indices = Array.new(@number_of_clusters) {[]}
-
+         @cluster_indices = Array.new(@number_of_clusters) { [] }
+         @assignments = Array.new(@data_set.data_items.length)
+
         @data_set.data_items.each_with_index do |data_item, data_index|
           c = eval(data_item)
           @clusters[c] << data_item
           @cluster_indices[c] << data_index if @on_empty == 'outlier'
+           @assignments[data_index] = c
         end
-         manage_empty_clusters if has_empty_cluster?
+         manage_empty_clusters if empty_cluster?
        end
-
+
+       # @return [Object]
        def recompute_centroids
         @old_centroids = @centroids
         @iterations += 1
-         @centroids = @centroid_function.call(@clusters)
+         @centroids = @centroid_function.call(@clusters)
        end

-       def populate_centroids(populate_method, number_of_clusters=@number_of_clusters)
+       # @return [Object]
+       def kmeans_plus_plus_init
+         chosen_indices = []
+         first_index = (0...@data_set.data_items.length).to_a.sample(random: @rng)
+         return if first_index.nil?
+
+         @centroids << @data_set.data_items[first_index]
+         chosen_indices << first_index
+         while @centroids.length < @number_of_clusters &&
+               chosen_indices.length < @data_set.data_items.length
+           distances = []
+           total = 0.0
+           @data_set.data_items.each_with_index do |item, index|
+             next if chosen_indices.include?(index)
+
+             min_dist = @centroids.map { |c| distance(item, c) }.min
+             distances << [index, min_dist]
+             total += min_dist
+           end
+           break if distances.empty?
+
+           r = @rng.rand * total
+           cumulative = 0.0
+           chosen = distances.find do |_idx, dist|
+             cumulative += dist
+             cumulative >= r
+           end
+           chosen_indices << chosen[0]
+           @centroids << @data_set.data_items[chosen[0]]
+         end
+         @number_of_clusters = @centroids.length
+       end
+
+       # @param populate_method [Object]
+       # @param number_of_clusters [Object]
+       # @return [Object]
+       def populate_centroids(populate_method, number_of_clusters = @number_of_clusters)
         tried_indexes = []
         case populate_method
         when 'random' # for initial assignment (without the :centroid_indices option) and for reassignment of empty cluster centroids (with :on_empty option 'random')
-           while @centroids.length < number_of_clusters &&
-             tried_indexes.length < @data_set.data_items.length
-             random_index = rand(@data_set.data_items.length)
-             if !tried_indexes.include?(random_index)
-               tried_indexes << random_index
-               if !@centroids.include? @data_set.data_items[random_index]
-                 @centroids << @data_set.data_items[random_index]
-               end
-             end
+           while @centroids.length < number_of_clusters &&
+                 tried_indexes.length < @data_set.data_items.length
+             random_index = (0...@data_set.data_items.length).to_a.sample(random: @rng)
+             next if tried_indexes.include?(random_index)
+
+             tried_indexes << random_index
+             @centroids << @data_set.data_items[random_index] unless @centroids.include? @data_set.data_items[random_index]
           end
         when 'indices' # for initial assignment only (with the :centroid_indices option)
           @centroid_indices.each do |index|
-             raise ArgumentError, "Invalid centroid index #{index}" unless (index.is_a? Integer) && index >=0 && index < @data_set.data_items.length
-             if !tried_indexes.include?(index)
-               tried_indexes << index
-               if !@centroids.include? @data_set.data_items[index]
-                 @centroids << @data_set.data_items[index]
-               end
+             unless (index.is_a? Integer) && index >= 0 && index < @data_set.data_items.length
+               raise ArgumentError,
+                     "Invalid centroid index #{index}"
             end
+
+             next if tried_indexes.include?(index)
+
+             tried_indexes << index
+             @centroids << @data_set.data_items[index] unless @centroids.include? @data_set.data_items[index]
           end
         when 'outlier' # for reassignment of empty cluster centroids only (with :on_empty option 'outlier')
           sorted_data_indices = sort_data_indices_by_dist_to_centroid
           i = sorted_data_indices.length - 1 # the last item is the furthest from its centroid
-           while @centroids.length < number_of_clusters &&
-             tried_indexes.length < @data_set.data_items.length
-             outlier_index = sorted_data_indices[i]
-             if !tried_indexes.include?(outlier_index)
+           while @centroids.length < number_of_clusters &&
+                 tried_indexes.length < @data_set.data_items.length
+             outlier_index = sorted_data_indices[i]
+             unless tried_indexes.include?(outlier_index)
               tried_indexes << outlier_index
-               if !@centroids.include? @data_set.data_items[outlier_index]
-                 @centroids << @data_set.data_items[outlier_index]
-               end
+               @centroids << @data_set.data_items[outlier_index] unless @centroids.include? @data_set.data_items[outlier_index]
             end
-             i > 0 ? i -= 1 : break
+             i.positive? ? i -= 1 : break
           end
-         end
+         end
         @number_of_clusters = @centroids.length
-       end
-
-       # Sort cluster points by distance to assigned centroid. Utilizes @cluster_indices.
-       # Returns indices, sorted in order from the nearest to furthest.
-       def sort_data_indices_by_dist_to_centroid
-         sorted_data_indices = []
-         h = {}
-         @clusters.each_with_index do |cluster, c|
-           centroid = @centroids[c]
-           cluster.data_items.each_with_index do |data_item, i|
-             dist_to_centroid = distance(data_item, centroid)
-             data_index = @cluster_indices[c][i]
-             h[data_index] = dist_to_centroid
-           end
-         end
-         # sort hash of {index => dist to centroid} by dist to centroid (ascending) and then return an array of only the indices
-         sorted_data_indices = h.sort_by{|k,v| v}.collect{|a,b| a}
-       end
-
-       def has_empty_cluster?
+       end
+
+       # Sort cluster points by distance to assigned centroid. Utilizes @cluster_indices.
+       # Returns indices, sorted in order from the nearest to furthest.
+       # @return [Object]
+       def sort_data_indices_by_dist_to_centroid
+         h = {}
+         @clusters.each_with_index do |cluster, c|
+           centroid = @centroids[c]
+           cluster.data_items.each_with_index do |data_item, i|
+             dist_to_centroid = distance(data_item, centroid)
+             data_index = @cluster_indices[c][i]
+             h[data_index] = dist_to_centroid
+           end
+         end
+         # sort hash of {index => dist to centroid} by dist to centroid (ascending) and then return an array of only the indices
+         h.sort_by { |_k, v| v }.collect { |a, _b| a }
+       end
+
+       # @return [Object]
+       def empty_cluster?
         found_empty = false
         @number_of_clusters.times do |c|
           found_empty = true if @clusters[c].data_items.empty?
         end
         found_empty
        end
-
+
+       # @return [Object]
        def manage_empty_clusters
-         return if self.on_empty == 'terminate' # Do nothing to terminate with error. (The empty cluster will be assigned a nil centroid, and then calculating the distance from this centroid to another point will raise an exception.)
-
+         # Do nothing to terminate with error. (The empty cluster will be assigned a nil centroid, and then calculating the distance from this centroid to another point will raise an exception.)
+         return if on_empty == 'terminate'
+
         initial_number_of_clusters = @number_of_clusters
         eliminate_empty_clusters
-         return if self.on_empty == 'eliminate'
-         populate_centroids(self.on_empty, initial_number_of_clusters) # Add initial_number_of_clusters - @number_of_clusters
-         calculate_membership_clusters
+         return if on_empty == 'eliminate'
+
+         populate_centroids(on_empty, initial_number_of_clusters) # Add initial_number_of_clusters - @number_of_clusters
+         calculate_membership_clusters
        end
-
+
+       # @return [Object]
        def eliminate_empty_clusters
-         old_clusters, old_centroids, old_cluster_indices = @clusters, @centroids, @cluster_indices
-         @clusters, @centroids, @cluster_indices = [], [], []
+         old_clusters = @clusters
+         old_centroids = @centroids
+         old_cluster_indices = @cluster_indices
+         old_assignments = @assignments
+         @clusters = []
+         @centroids = []
+         @cluster_indices = []
+         remap = {}
+         new_index = 0
         @number_of_clusters.times do |i|
-           if !old_clusters[i].data_items.empty?
-             @clusters << old_clusters[i]
-             @cluster_indices << old_cluster_indices[i]
-             @centroids << old_centroids[i]
-           end
+           next if old_clusters[i].data_items.empty?
+
+           remap[i] = new_index
+           @clusters << old_clusters[i]
+           @cluster_indices << old_cluster_indices[i]
+           @centroids << old_centroids[i]
+           new_index += 1
         end
         @number_of_clusters = @centroids.length
+         @assignments = old_assignments.map { |c| remap[c] }
        end
-
      end
    end
  end
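
The k_means.rb hunk above adds seeded runs (random_seed), optional k-means++ seeding (init_method), multiple restarts scored by SSE (restarts, sse), and per-iteration snapshots (track_history, history). A minimal usage sketch of that 2.0 API follows; the data set and parameter values are invented for illustration, and the set_parameters call assumes the helper provided by Ai4r::Data::Parameterizable (also updated in this release).

require 'ai4r'

# Toy two-dimensional data set, invented for illustration.
data = Ai4r::Data::DataSet.new(
  data_labels: %w[x y],
  data_items: [[1, 1], [1, 2], [8, 8], [9, 8], [20, 21], [21, 20]]
)

clusterer = Ai4r::Clusterers::KMeans.new
clusterer.set_parameters(
  random_seed: 42,                 # restart i reuses seed 42 + i, so builds are reproducible
  init_method: :kmeans_plus_plus,  # k-means++ seeding instead of plain random picks
  restarts: 5,                     # run 5 initializations, keep the one with the lowest SSE
  track_history: true              # record centroids and assignments on every iteration
)
clusterer.build(data, 3)

puts clusterer.sse          # compactness of the clustering that was kept
puts clusterer.iterations   # iterations of the kept run
puts clusterer.history.size # number of recorded iteration snapshots
puts clusterer.eval([2, 2]) # 0-based index of the nearest centroid

Because the seed of restart i is derived as seed_base + i, fixing random_seed makes the whole multi-restart build deterministic.
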
data/lib/ai4r/clusterers/median_linkage.rb
@@ -1,61 +1,77 @@
+ # frozen_string_literal: true
+
  # Author:: Sergio Fierens (implementation)
  # License:: MPL 1.1
  # Project:: ai4r
- # Url:: http://www.ai4r.org/
+ # Url:: https://github.com/SergioFierens/ai4r
  #
- # You can redistribute it and/or modify it under the terms of
- # the Mozilla Public License version 1.1 as published by the
+ # You can redistribute it and/or modify it under the terms of
+ # the Mozilla Public License version 1.1 as published by the
  # Mozilla Foundation at http://www.mozilla.org/MPL/MPL-1.1.txt

- require File.dirname(__FILE__) + '/../data/data_set'
- require File.dirname(__FILE__) + '/../clusterers/single_linkage'
+ require_relative '../data/data_set'
+ require_relative '../clusterers/single_linkage'
+ require_relative '../clusterers/cluster_tree'

  module Ai4r
    module Clusterers
-
-     # Implementation of an Agglomerative Hierarchical clusterer with
-     # median linkage algorithm, aka weighted pair group method centroid
+     # Implementation of an Agglomerative Hierarchical clusterer with
+     # median linkage algorithm, aka weighted pair group method centroid
      # or WPGMC (Everitt et al., 2001 ; Gower, 1967 ; Jain and Dubes, 1988 ).
-     # Hierarchical clusterer create one cluster per element, and then
+     # Hierarchical clusterer create one cluster per element, and then
      # progressively merge clusters, until the required number of clusters
      # is reached.
-     # Similar to centroid linkages, but using fix weight:
-     #
-     #   D(cx, (ci U cj)) = (1/2)*D(cx, ci) +
-     #                      (1/2)*D(cx, cj) -
+     # Similar to centroid linkages, but using fix weight:
+     #
+     #   D(cx, (ci U cj)) = (1/2)*D(cx, ci) +
+     #                      (1/2)*D(cx, cj) -
      #                      (1/4)*D(ci, cj)
      class MedianLinkage < SingleLinkage
-
-       parameters_info :distance_function =>
-         "Custom implementation of distance function. " +
-         "It must be a closure receiving two data items and return the " +
-         "distance between them. By default, this algorithm uses " +
-         "euclidean distance of numeric attributes to the power of 2."
-
+       include ClusterTree
+
+       parameters_info distance_function:
+         'Custom implementation of distance function. ' \
+         'It must be a closure receiving two data items and return the ' \
+         'distance between them. By default, this algorithm uses ' \
+         'euclidean distance of numeric attributes to the power of 2.'
+
        # Build a new clusterer, using data examples found in data_set.
        # Items will be clustered in "number_of_clusters" different
        # clusters.
-       def build(data_set, number_of_clusters)
+       # @param data_set [Object]
+       # @param number_of_clusters [Object]
+       # @param *options [Object]
+       # @return [Object]
+       def build(data_set, number_of_clusters = 1, **options)
          super
        end
-
-       # This algorithms does not allow classification of new data items
+
+       # This algorithms does not allow classification of new data items
        # once it has been built. Rebuild the cluster including you data element.
-       def eval(data_item)
-         Raise "Eval of new data is not supported by this algorithm."
+       # @param _data_item [Object]
+       # @return [Object]
+       def eval(_data_item)
+         raise NotImplementedError, 'Eval of new data is not supported by this algorithm.'
        end
-
+
+       # @return [Object]
+       def supports_eval?
+         false
+       end
+
        protected
-
+
        # return distance between cluster cx and cluster (ci U cj),
        # using median linkage
-       def linkage_distance(cx, ci, cj)
-         ( 0.5 * read_distance_matrix(cx, ci) +
-           0.5 * read_distance_matrix(cx, cj) -
-           0.25 * read_distance_matrix(ci, cj))
+       # @param cx [Object]
+       # @param ci [Object]
+       # @param cj [Object]
+       # @return [Object]
+       def linkage_distance(cluster_x, cluster_i, cluster_j)
+         ((0.5 * read_distance_matrix(cluster_x, cluster_i)) +
+          (0.5 * read_distance_matrix(cluster_x, cluster_j)) -
+          (0.25 * read_distance_matrix(cluster_i, cluster_j)))
        end
-
      end
    end
  end
-
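
The WPGMC update rule shown in the class comment keeps half of the distance to each of the two merged clusters and subtracts a quarter of the distance between them: with D(cx, ci) = 4, D(cx, cj) = 8 and D(ci, cj) = 2, the merged distance is 0.5*4 + 0.5*8 - 0.25*2 = 5.5. A rough usage sketch of the updated MedianLinkage class follows; the data items are invented, and the clusters reader is assumed to be inherited from SingleLinkage.

require 'ai4r'

# Small invented data set with three visually separated groups.
data = Ai4r::Data::DataSet.new(
  data_labels: %w[x y],
  data_items: [[0, 0], [0, 1], [5, 5], [5, 6], [9, 9]]
)

clusterer = Ai4r::Clusterers::MedianLinkage.new
clusterer.build(data, 2)

clusterer.clusters.each_with_index do |cluster, i|
  puts "Cluster #{i}: #{cluster.data_items.inspect}"
end

puts clusterer.supports_eval? # => false; check before calling eval
begin
  clusterer.eval([1, 1])
rescue NotImplementedError => e
  puts e.message # eval is not supported; rebuild with the new item included instead
end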