ai4r 1.13 → 2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (129) hide show
  1. checksums.yaml +7 -0
  2. data/README.md +174 -0
  3. data/examples/classifiers/hyperpipes_data.csv +14 -0
  4. data/examples/classifiers/hyperpipes_example.rb +22 -0
  5. data/examples/classifiers/ib1_example.rb +12 -0
  6. data/examples/classifiers/id3_example.rb +15 -10
  7. data/examples/classifiers/id3_graphviz_example.rb +17 -0
  8. data/examples/classifiers/logistic_regression_example.rb +11 -0
  9. data/examples/classifiers/naive_bayes_attributes_example.rb +13 -0
  10. data/examples/classifiers/naive_bayes_example.rb +12 -13
  11. data/examples/classifiers/one_r_example.rb +27 -0
  12. data/examples/classifiers/parameter_tutorial.rb +29 -0
  13. data/examples/classifiers/prism_nominal_example.rb +15 -0
  14. data/examples/classifiers/prism_numeric_example.rb +21 -0
  15. data/examples/classifiers/simple_linear_regression_example.rb +14 -11
  16. data/examples/classifiers/zero_and_one_r_example.rb +34 -0
  17. data/examples/classifiers/zero_one_r_data.csv +8 -0
  18. data/examples/clusterers/clusterer_example.rb +40 -34
  19. data/examples/clusterers/dbscan_example.rb +17 -0
  20. data/examples/clusterers/dendrogram_example.rb +17 -0
  21. data/examples/clusterers/hierarchical_dendrogram_example.rb +20 -0
  22. data/examples/clusterers/kmeans_custom_example.rb +26 -0
  23. data/examples/genetic_algorithm/bitstring_example.rb +41 -0
  24. data/examples/genetic_algorithm/genetic_algorithm_example.rb +26 -18
  25. data/examples/genetic_algorithm/kmeans_seed_tuning.rb +45 -0
  26. data/examples/neural_network/backpropagation_example.rb +48 -48
  27. data/examples/neural_network/hopfield_example.rb +45 -0
  28. data/examples/neural_network/patterns_with_base_noise.rb +39 -39
  29. data/examples/neural_network/patterns_with_noise.rb +41 -39
  30. data/examples/neural_network/train_epochs_callback.rb +25 -0
  31. data/examples/neural_network/training_patterns.rb +39 -39
  32. data/examples/neural_network/transformer_text_classification.rb +78 -0
  33. data/examples/neural_network/xor_example.rb +23 -22
  34. data/examples/reinforcement/q_learning_example.rb +10 -0
  35. data/examples/som/som_data.rb +155 -152
  36. data/examples/som/som_multi_node_example.rb +12 -13
  37. data/examples/som/som_single_example.rb +12 -15
  38. data/examples/transformer/decode_classifier_example.rb +68 -0
  39. data/examples/transformer/deterministic_example.rb +10 -0
  40. data/examples/transformer/seq2seq_example.rb +16 -0
  41. data/lib/ai4r/classifiers/classifier.rb +24 -16
  42. data/lib/ai4r/classifiers/gradient_boosting.rb +64 -0
  43. data/lib/ai4r/classifiers/hyperpipes.rb +119 -43
  44. data/lib/ai4r/classifiers/ib1.rb +122 -32
  45. data/lib/ai4r/classifiers/id3.rb +524 -145
  46. data/lib/ai4r/classifiers/logistic_regression.rb +96 -0
  47. data/lib/ai4r/classifiers/multilayer_perceptron.rb +75 -59
  48. data/lib/ai4r/classifiers/naive_bayes.rb +95 -34
  49. data/lib/ai4r/classifiers/one_r.rb +112 -44
  50. data/lib/ai4r/classifiers/prism.rb +167 -76
  51. data/lib/ai4r/classifiers/random_forest.rb +72 -0
  52. data/lib/ai4r/classifiers/simple_linear_regression.rb +83 -58
  53. data/lib/ai4r/classifiers/support_vector_machine.rb +91 -0
  54. data/lib/ai4r/classifiers/votes.rb +57 -0
  55. data/lib/ai4r/classifiers/zero_r.rb +71 -30
  56. data/lib/ai4r/clusterers/average_linkage.rb +46 -27
  57. data/lib/ai4r/clusterers/bisecting_k_means.rb +50 -44
  58. data/lib/ai4r/clusterers/centroid_linkage.rb +52 -36
  59. data/lib/ai4r/clusterers/cluster_tree.rb +50 -0
  60. data/lib/ai4r/clusterers/clusterer.rb +29 -14
  61. data/lib/ai4r/clusterers/complete_linkage.rb +42 -31
  62. data/lib/ai4r/clusterers/dbscan.rb +134 -0
  63. data/lib/ai4r/clusterers/diana.rb +75 -49
  64. data/lib/ai4r/clusterers/k_means.rb +270 -135
  65. data/lib/ai4r/clusterers/median_linkage.rb +49 -33
  66. data/lib/ai4r/clusterers/single_linkage.rb +196 -88
  67. data/lib/ai4r/clusterers/ward_linkage.rb +51 -35
  68. data/lib/ai4r/clusterers/ward_linkage_hierarchical.rb +25 -10
  69. data/lib/ai4r/clusterers/weighted_average_linkage.rb +48 -32
  70. data/lib/ai4r/data/data_set.rb +223 -103
  71. data/lib/ai4r/data/parameterizable.rb +31 -25
  72. data/lib/ai4r/data/proximity.rb +62 -62
  73. data/lib/ai4r/data/statistics.rb +46 -35
  74. data/lib/ai4r/experiment/classifier_evaluator.rb +84 -32
  75. data/lib/ai4r/experiment/split.rb +39 -0
  76. data/lib/ai4r/genetic_algorithm/chromosome_base.rb +43 -0
  77. data/lib/ai4r/genetic_algorithm/genetic_algorithm.rb +92 -170
  78. data/lib/ai4r/genetic_algorithm/tsp_chromosome.rb +83 -0
  79. data/lib/ai4r/hmm/hidden_markov_model.rb +134 -0
  80. data/lib/ai4r/neural_network/activation_functions.rb +37 -0
  81. data/lib/ai4r/neural_network/backpropagation.rb +399 -134
  82. data/lib/ai4r/neural_network/hopfield.rb +175 -58
  83. data/lib/ai4r/neural_network/transformer.rb +194 -0
  84. data/lib/ai4r/neural_network/weight_initializations.rb +40 -0
  85. data/lib/ai4r/reinforcement/policy_iteration.rb +66 -0
  86. data/lib/ai4r/reinforcement/q_learning.rb +51 -0
  87. data/lib/ai4r/search/a_star.rb +76 -0
  88. data/lib/ai4r/search/bfs.rb +50 -0
  89. data/lib/ai4r/search/dfs.rb +50 -0
  90. data/lib/ai4r/search/mcts.rb +118 -0
  91. data/lib/ai4r/search.rb +12 -0
  92. data/lib/ai4r/som/distance_metrics.rb +29 -0
  93. data/lib/ai4r/som/layer.rb +28 -17
  94. data/lib/ai4r/som/node.rb +61 -32
  95. data/lib/ai4r/som/som.rb +158 -41
  96. data/lib/ai4r/som/two_phase_layer.rb +21 -25
  97. data/lib/ai4r/version.rb +3 -0
  98. data/lib/ai4r.rb +57 -28
  99. metadata +79 -109
  100. data/README.rdoc +0 -39
  101. data/test/classifiers/hyperpipes_test.rb +0 -84
  102. data/test/classifiers/ib1_test.rb +0 -78
  103. data/test/classifiers/id3_test.rb +0 -220
  104. data/test/classifiers/multilayer_perceptron_test.rb +0 -79
  105. data/test/classifiers/naive_bayes_test.rb +0 -43
  106. data/test/classifiers/one_r_test.rb +0 -62
  107. data/test/classifiers/prism_test.rb +0 -85
  108. data/test/classifiers/simple_linear_regression_test.rb +0 -37
  109. data/test/classifiers/zero_r_test.rb +0 -50
  110. data/test/clusterers/average_linkage_test.rb +0 -51
  111. data/test/clusterers/bisecting_k_means_test.rb +0 -66
  112. data/test/clusterers/centroid_linkage_test.rb +0 -53
  113. data/test/clusterers/complete_linkage_test.rb +0 -57
  114. data/test/clusterers/diana_test.rb +0 -69
  115. data/test/clusterers/k_means_test.rb +0 -167
  116. data/test/clusterers/median_linkage_test.rb +0 -53
  117. data/test/clusterers/single_linkage_test.rb +0 -122
  118. data/test/clusterers/ward_linkage_hierarchical_test.rb +0 -81
  119. data/test/clusterers/ward_linkage_test.rb +0 -53
  120. data/test/clusterers/weighted_average_linkage_test.rb +0 -53
  121. data/test/data/data_set_test.rb +0 -104
  122. data/test/data/proximity_test.rb +0 -87
  123. data/test/data/statistics_test.rb +0 -65
  124. data/test/experiment/classifier_evaluator_test.rb +0 -76
  125. data/test/genetic_algorithm/chromosome_test.rb +0 -57
  126. data/test/genetic_algorithm/genetic_algorithm_test.rb +0 -81
  127. data/test/neural_network/backpropagation_test.rb +0 -82
  128. data/test/neural_network/hopfield_test.rb +0 -72
  129. data/test/som/som_test.rb +0 -97
@@ -1,172 +1,278 @@
1
+ # frozen_string_literal: true
2
+
1
3
  # Author:: Sergio Fierens (implementation)
2
4
  # License:: MPL 1.1
3
5
  # Project:: ai4r
4
- # Url:: http://ai4r.org/
6
+ # Url:: https://github.com/SergioFierens/ai4r
5
7
  #
6
- # You can redistribute it and/or modify it under the terms of
7
- # the Mozilla Public License version 1.1 as published by the
8
+ # You can redistribute it and/or modify it under the terms of
9
+ # the Mozilla Public License version 1.1 as published by the
8
10
  # Mozilla Foundation at http://www.mozilla.org/MPL/MPL-1.1.txt
9
11
 
10
- require File.dirname(__FILE__) + '/../data/data_set'
11
- require File.dirname(__FILE__) + '/../data/proximity'
12
- require File.dirname(__FILE__) + '/../clusterers/clusterer'
12
+ require_relative '../data/data_set'
13
+ require_relative '../data/proximity'
14
+ require_relative '../clusterers/clusterer'
15
+ require_relative '../clusterers/cluster_tree'
13
16
 
14
17
  module Ai4r
15
18
  module Clusterers
16
-
17
- # Implementation of a Hierarchical clusterer with single linkage (Everitt et
19
+ # Implementation of a Hierarchical clusterer with single linkage (Everitt et
18
20
  # al., 2001 ; Johnson, 1967 ; Jain and Dubes, 1988 ; Sneath, 1957 )
19
- # Hierarchical clusterer create one cluster per element, and then
21
+ # A hierarchical clusterer creates one cluster per element, and then
20
22
  # progressively merge clusters, until the required number of clusters
21
23
  # is reached.
22
- # With single linkage, the distance between two clusters is computed as the
24
+ # With single linkage, the distance between two clusters is computed as the
23
25
  # distance between the two closest elements in the two clusters.
24
26
  #
25
27
  # D(cx, (ci U cj)) = min(D(cx, ci), D(cx, cj))
26
28
  class SingleLinkage < Clusterer
27
-
29
+ include ClusterTree
30
+
28
31
  attr_reader :data_set, :number_of_clusters, :clusters
29
-
30
- parameters_info :distance_function =>
31
- "Custom implementation of distance function. " +
32
- "It must be a closure receiving two data items and return the " +
33
- "distance between them. By default, this algorithm uses " +
34
- "euclidean distance of numeric attributes to the power of 2."
35
-
36
- def initialize
37
- @distance_function = lambda do |a,b|
38
- Ai4r::Data::Proximity.squared_euclidean_distance(
39
- a.select {|att_a| att_a.is_a? Numeric} ,
40
- b.select {|att_b| att_b.is_a? Numeric})
41
- end
32
+
33
+ parameters_info distance_function:
34
+ 'Custom implementation of distance function. ' \
35
+ 'It must be a closure receiving two data items and return the ' \
36
+ 'distance between them. By default, this algorithm uses ' \
37
+ 'euclidean distance of numeric attributes to the power of 2.'
38
+
39
+ # @return [Object]
40
+ def initialize(*args)
41
+ super(*args)
42
+ @distance_function = lambda do |a, b|
43
+ Ai4r::Data::Proximity.squared_euclidean_distance(
44
+ a.select { |att_a| att_a.is_a? Numeric },
45
+ b.select { |att_b| att_b.is_a? Numeric }
46
+ )
47
+ end
42
48
  end
43
-
49
+
44
50
  # Build a new clusterer, using data examples found in data_set.
45
51
  # Items will be clustered in "number_of_clusters" different
46
52
  # clusters.
47
- def build(data_set, number_of_clusters)
53
+ #
54
+ # If you specify the :distance option, merging stops as soon as either
55
+ # number_of_clusters is reached or the two closest clusters are farther apart than :distance
56
+ # @param data_set [Object]
57
+ # @param number_of_clusters [Object]
58
+ # @param options [Hash] keyword options; :distance sets the merge cut-off
59
+ # @return [Object]
60
+ def build(data_set, number_of_clusters = 1, **options)
48
61
  @data_set = data_set
49
- @number_of_clusters = number_of_clusters
50
-
62
+ distance = options[:distance] || Float::INFINITY
63
+
51
64
  @index_clusters = create_initial_index_clusters
52
65
  create_distance_matrix(data_set)
53
- while @index_clusters.length > @number_of_clusters
66
+ while @index_clusters.length > number_of_clusters
54
67
  ci, cj = get_closest_clusters(@index_clusters)
68
+ break if read_distance_matrix(ci, cj) > distance
69
+
55
70
  update_distance_matrix(ci, cj)
56
71
  merge_clusters(ci, cj, @index_clusters)
57
72
  end
73
+
74
+ @number_of_clusters = @index_clusters.length
75
+ @distance_matrix = nil
58
76
  @clusters = build_clusters_from_index_clusters @index_clusters
59
-
60
- return self
77
+ self
61
78
  end
62
-
63
- # Classifies the given data item, returning the cluster index it belongs
79
+
80
+ # @param clusters [Object]
81
+ # @return [Object]
82
+ def draw_map(clusters)
83
+ map = Array.new(11) { Array.new(11, 0) }
84
+ clusters.each_index do |i|
85
+ clusters[i].data_items.each do |point|
86
+ map[point.first][point.last] = (i + 1)
87
+ end
88
+ end
89
+ map
90
+ end
91
+
92
+ # Classifies the given data item, returning the cluster index it belongs
64
93
  # to (0-based).
94
+ # @param data_item [Object]
95
+ # @return [Object]
65
96
  def eval(data_item)
66
- get_min_index(@clusters.collect {|cluster|
67
- distance_between_item_and_cluster(data_item, cluster)})
97
+ get_min_index(@clusters.collect do |cluster|
98
+ distance_between_item_and_cluster(data_item, cluster)
99
+ end)
100
+ end
101
+
102
+ protected
103
+
104
+ # @param i [Object]
105
+ # @param j [Object]
106
+ # @return [Object]
107
+ def distance_between_indexes(i, j)
108
+ @distance_function.call(@data_set.data_items[i], @data_set.data_items[j])
109
+ end
110
+
111
+ public
112
+
113
+ # Compute mean silhouette coefficient of the clustering result.
114
+ # Returns a float between -1 and 1. Only valid after build.
115
+ # @return [Object]
116
+ def silhouette
117
+ return nil unless @index_clusters && @data_set
118
+
119
+ total = 0.0
120
+ count = @data_set.data_items.length
121
+
122
+ @index_clusters.each_with_index do |cluster, ci|
123
+ cluster.each do |index|
124
+ a = 0.0
125
+ if cluster.length > 1
126
+ cluster.each do |j|
127
+ next if j == index
128
+
129
+ a += distance_between_indexes(index, j)
130
+ end
131
+ a /= (cluster.length - 1)
132
+ end
133
+
134
+ b = nil
135
+ @index_clusters.each_with_index do |other_cluster, cj|
136
+ next if ci == cj
137
+
138
+ dist = 0.0
139
+ other_cluster.each do |j|
140
+ dist += distance_between_indexes(index, j)
141
+ end
142
+ dist /= other_cluster.length
143
+ b = dist if b.nil? || dist < b
144
+ end
145
+ s = b&.positive? ? (b - a) / [a, b].max : 0.0
146
+ total += s
147
+ end
148
+ end
149
+
150
+ total / count
68
151
  end
69
-
152
+
70
153
  protected
71
-
154
+
72
155
  # returns [ [0], [1], [2], ... , [n-1] ]
73
156
  # where n is the number of data items in the data set
157
+ # @return [Object]
74
158
  def create_initial_index_clusters
75
159
  index_clusters = []
76
- @data_set.data_items.length.times {|i| index_clusters << [i]}
77
- return index_clusters
160
+ @data_set.data_items.length.times { |i| index_clusters << [i] }
161
+ index_clusters
78
162
  end
79
-
163
+
80
164
  # Create a partial distance matrix:
81
- # [
82
- # [d(1,0)],
83
- # [d(2,0)], [d(2,1)],
84
- # [d(3,0)], [d(3,1)], [d(3,2)],
85
- # ...
86
- # [d(n-1,0)], [d(n-1,1)], [d(n-1,2)], ... , [d(n-1,n-2)]
165
+ # [
166
+ # [d(1,0)],
167
+ # [d(2,0), d(2,1)],
168
+ # [d(3,0), d(3,1), d(3,2)],
169
+ # ...
170
+ # [d(n-1,0), d(n-1,1), d(n-1,2), ... , d(n-1,n-2)]
87
171
  # ]
88
172
  # where n is the number of data items in the data set
173
+ # @param data_set [Object]
174
+ # @return [Object]
89
175
  def create_distance_matrix(data_set)
90
- @distance_matrix = Array.new(data_set.data_items.length-1) {|index| Array.new(index+1)}
176
+ @distance_matrix = Array.new(data_set.data_items.length - 1) do |index|
177
+ Array.new(index + 1)
178
+ end
91
179
  data_set.data_items.each_with_index do |a, i|
92
180
  i.times do |j|
93
181
  b = data_set.data_items[j]
94
- @distance_matrix[i-1][j] = @distance_function.call(a, b)
182
+ @distance_matrix[i - 1][j] = @distance_function.call(a, b)
95
183
  end
96
184
  end
97
185
  end
98
-
186
+
99
187
  # Returns the distance between element data_item[index_a] and
100
188
  # data_item[index_b] using the distance matrix
189
+ # @param index_a [Object]
190
+ # @param index_b [Object]
191
+ # @return [Object]
101
192
  def read_distance_matrix(index_a, index_b)
102
193
  return 0 if index_a == index_b
194
+
103
195
  index_a, index_b = index_b, index_a if index_b > index_a
104
- return @distance_matrix[index_a-1][index_b]
196
+ @distance_matrix[index_a - 1][index_b]
105
197
  end
106
198
 
107
199
  # ci and cj are the indexes of the clusters that are going to
108
- # be merged. We need to remove distances from/to ci and cj,
200
+ # be merged. We need to remove distances from/to ci and cj,
109
201
  # and add distances from/to new cluster (ci U cj)
202
+ # @param ci [Object]
203
+ # @param cj [Object]
204
+ # @return [Object]
110
205
  def update_distance_matrix(ci, cj)
111
206
  ci, cj = cj, ci if cj > ci
112
- distances_to_new_cluster = Array.new
113
- (@distance_matrix.length+1).times do |cx|
114
- if cx!= ci && cx!=cj
115
- distances_to_new_cluster << linkage_distance(cx, ci, cj)
116
- end
207
+ distances_to_new_cluster = []
208
+ (@distance_matrix.length + 1).times do |cx|
209
+ distances_to_new_cluster << linkage_distance(cx, ci, cj) if cx != ci && cx != cj
117
210
  end
118
- if cj==0 && ci==1
119
- @distance_matrix.delete_at(1)
120
- @distance_matrix.delete_at(0)
121
- elsif cj==0
122
- @distance_matrix.delete_at(ci-1)
123
- @distance_matrix.delete_at(0)
211
+ if cj.zero? && ci == 1
212
+ @distance_matrix.delete_at(1)
213
+ @distance_matrix.delete_at(0)
214
+ elsif cj.zero?
215
+ @distance_matrix.delete_at(ci - 1)
216
+ @distance_matrix.delete_at(0)
124
217
  else
125
- @distance_matrix.delete_at(ci-1)
126
- @distance_matrix.delete_at(cj-1)
218
+ @distance_matrix.delete_at(ci - 1)
219
+ @distance_matrix.delete_at(cj - 1)
127
220
  end
128
- @distance_matrix.each do |d|
221
+ @distance_matrix.each do |d|
129
222
  d.delete_at(ci)
130
223
  d.delete_at(cj)
131
224
  end
132
225
  @distance_matrix << distances_to_new_cluster
133
226
  end
134
-
227
+
135
228
  # return distance between cluster cx and new cluster (ci U cj),
136
229
  # using single linkage
137
- def linkage_distance(cx, ci, cj)
138
- [read_distance_matrix(cx, ci),
139
- read_distance_matrix(cx, cj)].min
230
+ # @param cluster_x [Object]
231
+ # @param cluster_i [Object]
232
+ # @param cluster_j [Object]
233
+ # @return [Object]
234
+ def linkage_distance(cluster_x, cluster_i, cluster_j)
235
+ [read_distance_matrix(cluster_x, cluster_i),
236
+ read_distance_matrix(cluster_x, cluster_j)].min
140
237
  end
141
-
142
- # cluster_a and cluster_b are removed from index_cluster,
238
+
239
+ # cluster_a and cluster_b are removed from index_cluster,
143
240
  # and a new cluster with all members of cluster_a and cluster_b
144
- # is added.
241
+ # is added.
145
242
  # It modifies index clusters array.
243
+ # @param index_a [Object]
244
+ # @param index_b [Object]
245
+ # @param index_clusters [Object]
246
+ # @return [Object]
146
247
  def merge_clusters(index_a, index_b, index_clusters)
147
248
  index_a, index_b = index_b, index_a if index_b > index_a
148
249
  new_index_cluster = index_clusters[index_a] +
149
- index_clusters[index_b]
250
+ index_clusters[index_b]
150
251
  index_clusters.delete_at index_a
151
252
  index_clusters.delete_at index_b
152
253
  index_clusters << new_index_cluster
153
- return index_clusters
254
+ index_clusters
154
255
  end
155
-
156
- # Given an array with clusters of data_items indexes,
157
- # it returns an array of data_items clusters
256
+
257
+ # Given an array with clusters of data_items indexes,
258
+ # it returns an array of data_items clusters
259
+ # @param index_clusters [Object]
260
+ # @return [Object]
158
261
  def build_clusters_from_index_clusters(index_clusters)
159
- @distance_matrix = nil
160
- return index_clusters.collect do |index_cluster|
161
- Ai4r::Data::DataSet.new(:data_labels => @data_set.data_labels,
162
- :data_items => index_cluster.collect {|i| @data_set.data_items[i]})
262
+ index_clusters.collect do |index_cluster|
263
+ Ai4r::Data::DataSet.new(data_labels: @data_set.data_labels,
264
+ data_items: index_cluster.collect do |i|
265
+ @data_set.data_items[i]
266
+ end)
163
267
  end
164
268
  end
165
-
269
+
166
270
  # Returns an array with the indexes of the two closest
167
271
  # clusters => [index_cluster_a, index_cluster_b]
272
+ # @param index_clusters [Object]
273
+ # @return [Object]
168
274
  def get_closest_clusters(index_clusters)
169
- min_distance = 1.0/0
275
+ min_distance = Float::INFINITY
170
276
  closest_clusters = [1, 0]
171
277
  index_clusters.each_index do |index_a|
172
278
  index_a.times do |index_b|
@@ -177,18 +283,20 @@ module Ai4r
177
283
  end
178
284
  end
179
285
  end
180
- return closest_clusters
286
+ closest_clusters
181
287
  end
182
-
288
+
289
+ # @param data_item [Object]
290
+ # @param cluster [Object]
291
+ # @return [Object]
183
292
  def distance_between_item_and_cluster(data_item, cluster)
184
- min_dist = 1.0/0
293
+ min_dist = Float::INFINITY
185
294
  cluster.data_items.each do |another_item|
186
295
  dist = @distance_function.call(data_item, another_item)
187
296
  min_dist = dist if dist < min_dist
188
297
  end
189
- return min_dist
298
+ min_dist
190
299
  end
191
-
192
300
  end
193
301
  end
194
302
  end
@@ -1,64 +1,80 @@
1
+ # frozen_string_literal: true
2
+
1
3
  # Author:: Sergio Fierens (implementation)
2
4
  # License:: MPL 1.1
3
5
  # Project:: ai4r
4
- # Url:: http://www.ai4r.org/
6
+ # Url:: https://github.com/SergioFierens/ai4r
5
7
  #
6
- # You can redistribute it and/or modify it under the terms of
7
- # the Mozilla Public License version 1.1 as published by the
8
+ # You can redistribute it and/or modify it under the terms of
9
+ # the Mozilla Public License version 1.1 as published by the
8
10
  # Mozilla Foundation at http://www.mozilla.org/MPL/MPL-1.1.txt
9
11
 
10
- require File.dirname(__FILE__) + '/../data/data_set'
11
- require File.dirname(__FILE__) + '/../clusterers/single_linkage'
12
+ require_relative '../data/data_set'
13
+ require_relative '../clusterers/single_linkage'
14
+ require_relative '../clusterers/cluster_tree'
12
15
 
13
16
  module Ai4r
14
17
  module Clusterers
15
-
16
- # Implementation of an Agglomerative Hierarchical clusterer with
18
+ # Implementation of an Agglomerative Hierarchical clusterer with
17
19
  # Ward's method linkage algorithm, aka the minimum variance method (Everitt
18
20
  # et al., 2001 ; Jain and Dubes, 1988 ; Ward, 1963 ).
19
- # Hierarchical clusterer create one cluster per element, and then
21
+ # A hierarchical clusterer creates one cluster per element, and then
20
22
  # progressively merge clusters, until the required number of clusters
21
23
  # is reached.
22
- # The objective of this method is to minimize the variance.
23
- #
24
- # D(cx, (ci U cj)) = (ni/(ni+nj+nx))*D(cx, ci) +
25
- # (nj/(ni+nj+nx))*D(cx, cj) -
24
+ # The objective of this method is to minimize the variance.
25
+ #
26
+ # D(cx, (ci U cj)) = ((ni+nx)/(ni+nj+nx))*D(cx, ci) +
27
+ # ((nj+nx)/(ni+nj+nx))*D(cx, cj) -
26
28
  # (nx/(ni+nj)^2)*D(ci, cj)
27
29
  class WardLinkage < SingleLinkage
28
-
29
- parameters_info :distance_function =>
30
- "Custom implementation of distance function. " +
31
- "It must be a closure receiving two data items and return the " +
32
- "distance between them. By default, this algorithm uses " +
33
- "euclidean distance of numeric attributes to the power of 2."
34
-
30
+ include ClusterTree
31
+
32
+ parameters_info distance_function:
33
+ 'Custom implementation of distance function. ' \
34
+ 'It must be a closure receiving two data items and return the ' \
35
+ 'distance between them. By default, this algorithm uses ' \
36
+ 'euclidean distance of numeric attributes to the power of 2.'
37
+
35
38
  # Build a new clusterer, using data examples found in data_set.
36
39
  # Items will be clustered in "number_of_clusters" different
37
40
  # clusters.
38
- def build(data_set, number_of_clusters)
41
+ # @param data_set [Object]
42
+ # @param number_of_clusters [Object]
43
+ # @param options [Hash] keyword options forwarded to super
44
+ # @return [Object]
45
+ def build(data_set, number_of_clusters = 1, **options)
39
46
  super
40
47
  end
41
-
42
- # This algorithms does not allow classification of new data items
48
+
49
+ # This algorithm does not allow classification of new data items
43
50
  # once it has been built. Rebuild the cluster including you data element.
44
- def eval(data_item)
45
- Raise "Eval of new data is not supported by this algorithm."
51
+ # @param _data_item [Object]
52
+ # @return [Object]
53
+ def eval(_data_item)
54
+ raise NotImplementedError, 'Eval of new data is not supported by this algorithm.'
46
55
  end
47
-
56
+
57
+ # @return [Object]
58
+ def supports_eval?
59
+ false
60
+ end
61
+
48
62
  protected
49
-
63
+
50
64
  # return distance between cluster cx and cluster (ci U cj),
51
65
  # using ward's method linkage
52
- def linkage_distance(cx, ci, cj)
53
- ni = @index_clusters[ci].length
54
- nj = @index_clusters[cj].length
55
- nx = @index_clusters[cx].length
56
- ( ( ( 1.0* (ni+nx) * read_distance_matrix(cx, ci) ) +
57
- ( 1.0* (nj+nx) * read_distance_matrix(cx, cj) ) ) / (ni + nj + nx) -
58
- ( 1.0 * nx * read_distance_matrix(ci, cj) / (ni+nj)**2 ) )
66
+ # @param cluster_x [Object]
67
+ # @param cluster_i [Object]
68
+ # @param cluster_j [Object]
69
+ # @return [Object]
70
+ def linkage_distance(cluster_x, cluster_i, cluster_j)
71
+ ni = @index_clusters[cluster_i].length
72
+ nj = @index_clusters[cluster_j].length
73
+ nx = @index_clusters[cluster_x].length
74
+ ((((1.0 * (ni + nx) * read_distance_matrix(cluster_x, cluster_i)) +
75
+ (1.0 * (nj + nx) * read_distance_matrix(cluster_x, cluster_j))) / (ni + nj + nx)) -
76
+ (1.0 * nx * read_distance_matrix(cluster_i, cluster_j) / ((ni + nj)**2)))
59
77
  end
60
-
61
78
  end
62
79
  end
63
80
  end
64
-
@@ -1,39 +1,55 @@
1
+ # frozen_string_literal: true
2
+
1
3
  # Author:: Peter Lubell-Doughtie
2
4
  # License:: BSD 3 Clause
3
5
  # Project:: ai4r
4
6
  # Url:: http://peet.ldee.org
5
7
 
6
- require File.dirname(__FILE__) + '/../clusterers/ward_linkage'
8
+ require_relative '../clusterers/ward_linkage'
9
+ require_relative '../clusterers/cluster_tree'
7
10
 
8
11
  module Ai4r
9
12
  module Clusterers
10
-
11
13
  # Hierarchical version to store classes as merges occur.
12
14
  class WardLinkageHierarchical < WardLinkage
15
+ include ClusterTree
13
16
 
14
- attr_reader :cluster_tree
15
-
17
+ # @param depth [Object]
18
+ # @return [Object]
16
19
  def initialize(depth = nil)
17
20
  @cluster_tree = []
18
21
  @depth = depth
19
22
  @merges_so_far = 0
20
- super()
23
+ super(depth)
21
24
  end
22
25
 
23
- def build(data_set, number_of_clusters)
26
+ # @param data_set [Object]
27
+ # @param number_of_clusters [Object]
28
+ # @param options [Hash] keyword options forwarded to super
29
+ # @return [Object]
30
+ def build(data_set, number_of_clusters = 1, **options)
24
31
  data_len = data_set.data_items.length
25
32
  @total_merges = data_len - number_of_clusters
26
33
  super
27
- @cluster_tree << self.clusters
34
+ @cluster_tree << clusters
28
35
  @cluster_tree.reverse!
29
- return self
36
+ self
37
+ end
38
+
39
+ # @return [Object]
40
+ def supports_eval?
41
+ false
30
42
  end
31
43
 
32
44
  protected
33
45
 
46
+ # @param index_a [Object]
47
+ # @param index_b [Object]
48
+ # @param index_clusters [Object]
49
+ # @return [Object]
34
50
  def merge_clusters(index_a, index_b, index_clusters)
35
51
  # only store if no or above depth
36
- if @depth.nil? or @merges_so_far > @total_merges - @depth
52
+ if @depth.nil? || (@merges_so_far > @total_merges - @depth)
37
53
  # store current clusters
38
54
  stored_distance_matrix = @distance_matrix.dup
39
55
  @cluster_tree << build_clusters_from_index_clusters(index_clusters)
@@ -45,4 +61,3 @@ module Ai4r
45
61
  end
46
62
  end
47
63
  end
48
-