ai4r 1.12 → 2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (128)
  1. checksums.yaml +7 -0
  2. data/README.md +174 -0
  3. data/examples/classifiers/hyperpipes_data.csv +14 -0
  4. data/examples/classifiers/hyperpipes_example.rb +22 -0
  5. data/examples/classifiers/ib1_example.rb +12 -0
  6. data/examples/classifiers/id3_example.rb +15 -10
  7. data/examples/classifiers/id3_graphviz_example.rb +17 -0
  8. data/examples/classifiers/logistic_regression_example.rb +11 -0
  9. data/examples/classifiers/naive_bayes_attributes_example.rb +13 -0
  10. data/examples/classifiers/naive_bayes_example.rb +12 -13
  11. data/examples/classifiers/one_r_example.rb +27 -0
  12. data/examples/classifiers/parameter_tutorial.rb +29 -0
  13. data/examples/classifiers/prism_nominal_example.rb +15 -0
  14. data/examples/classifiers/prism_numeric_example.rb +21 -0
  15. data/examples/classifiers/simple_linear_regression_example.csv +159 -0
  16. data/examples/classifiers/simple_linear_regression_example.rb +18 -0
  17. data/examples/classifiers/zero_and_one_r_example.rb +34 -0
  18. data/examples/classifiers/zero_one_r_data.csv +8 -0
  19. data/examples/clusterers/clusterer_example.rb +62 -0
  20. data/examples/clusterers/dbscan_example.rb +17 -0
  21. data/examples/clusterers/dendrogram_example.rb +17 -0
  22. data/examples/clusterers/hierarchical_dendrogram_example.rb +20 -0
  23. data/examples/clusterers/kmeans_custom_example.rb +26 -0
  24. data/examples/genetic_algorithm/bitstring_example.rb +41 -0
  25. data/examples/genetic_algorithm/genetic_algorithm_example.rb +26 -18
  26. data/examples/genetic_algorithm/kmeans_seed_tuning.rb +45 -0
  27. data/examples/neural_network/backpropagation_example.rb +49 -48
  28. data/examples/neural_network/hopfield_example.rb +45 -0
  29. data/examples/neural_network/patterns_with_base_noise.rb +39 -39
  30. data/examples/neural_network/patterns_with_noise.rb +41 -39
  31. data/examples/neural_network/train_epochs_callback.rb +25 -0
  32. data/examples/neural_network/training_patterns.rb +39 -39
  33. data/examples/neural_network/transformer_text_classification.rb +78 -0
  34. data/examples/neural_network/xor_example.rb +23 -22
  35. data/examples/reinforcement/q_learning_example.rb +10 -0
  36. data/examples/som/som_data.rb +155 -152
  37. data/examples/som/som_multi_node_example.rb +12 -13
  38. data/examples/som/som_single_example.rb +12 -15
  39. data/examples/transformer/decode_classifier_example.rb +68 -0
  40. data/examples/transformer/deterministic_example.rb +10 -0
  41. data/examples/transformer/seq2seq_example.rb +16 -0
  42. data/lib/ai4r/classifiers/classifier.rb +24 -16
  43. data/lib/ai4r/classifiers/gradient_boosting.rb +64 -0
  44. data/lib/ai4r/classifiers/hyperpipes.rb +119 -43
  45. data/lib/ai4r/classifiers/ib1.rb +122 -32
  46. data/lib/ai4r/classifiers/id3.rb +527 -144
  47. data/lib/ai4r/classifiers/logistic_regression.rb +96 -0
  48. data/lib/ai4r/classifiers/multilayer_perceptron.rb +75 -59
  49. data/lib/ai4r/classifiers/naive_bayes.rb +112 -48
  50. data/lib/ai4r/classifiers/one_r.rb +112 -44
  51. data/lib/ai4r/classifiers/prism.rb +167 -76
  52. data/lib/ai4r/classifiers/random_forest.rb +72 -0
  53. data/lib/ai4r/classifiers/simple_linear_regression.rb +143 -0
  54. data/lib/ai4r/classifiers/support_vector_machine.rb +91 -0
  55. data/lib/ai4r/classifiers/votes.rb +57 -0
  56. data/lib/ai4r/classifiers/zero_r.rb +71 -30
  57. data/lib/ai4r/clusterers/average_linkage.rb +46 -27
  58. data/lib/ai4r/clusterers/bisecting_k_means.rb +50 -44
  59. data/lib/ai4r/clusterers/centroid_linkage.rb +52 -36
  60. data/lib/ai4r/clusterers/cluster_tree.rb +50 -0
  61. data/lib/ai4r/clusterers/clusterer.rb +28 -24
  62. data/lib/ai4r/clusterers/complete_linkage.rb +42 -31
  63. data/lib/ai4r/clusterers/dbscan.rb +134 -0
  64. data/lib/ai4r/clusterers/diana.rb +75 -49
  65. data/lib/ai4r/clusterers/k_means.rb +309 -72
  66. data/lib/ai4r/clusterers/median_linkage.rb +49 -33
  67. data/lib/ai4r/clusterers/single_linkage.rb +196 -88
  68. data/lib/ai4r/clusterers/ward_linkage.rb +51 -35
  69. data/lib/ai4r/clusterers/ward_linkage_hierarchical.rb +63 -0
  70. data/lib/ai4r/clusterers/weighted_average_linkage.rb +48 -32
  71. data/lib/ai4r/data/data_set.rb +229 -100
  72. data/lib/ai4r/data/parameterizable.rb +31 -25
  73. data/lib/ai4r/data/proximity.rb +72 -50
  74. data/lib/ai4r/data/statistics.rb +46 -35
  75. data/lib/ai4r/experiment/classifier_evaluator.rb +84 -32
  76. data/lib/ai4r/experiment/split.rb +39 -0
  77. data/lib/ai4r/genetic_algorithm/chromosome_base.rb +43 -0
  78. data/lib/ai4r/genetic_algorithm/genetic_algorithm.rb +92 -170
  79. data/lib/ai4r/genetic_algorithm/tsp_chromosome.rb +83 -0
  80. data/lib/ai4r/hmm/hidden_markov_model.rb +134 -0
  81. data/lib/ai4r/neural_network/activation_functions.rb +37 -0
  82. data/lib/ai4r/neural_network/backpropagation.rb +419 -143
  83. data/lib/ai4r/neural_network/hopfield.rb +175 -58
  84. data/lib/ai4r/neural_network/transformer.rb +194 -0
  85. data/lib/ai4r/neural_network/weight_initializations.rb +40 -0
  86. data/lib/ai4r/reinforcement/policy_iteration.rb +66 -0
  87. data/lib/ai4r/reinforcement/q_learning.rb +51 -0
  88. data/lib/ai4r/search/a_star.rb +76 -0
  89. data/lib/ai4r/search/bfs.rb +50 -0
  90. data/lib/ai4r/search/dfs.rb +50 -0
  91. data/lib/ai4r/search/mcts.rb +118 -0
  92. data/lib/ai4r/search.rb +12 -0
  93. data/lib/ai4r/som/distance_metrics.rb +29 -0
  94. data/lib/ai4r/som/layer.rb +28 -17
  95. data/lib/ai4r/som/node.rb +61 -32
  96. data/lib/ai4r/som/som.rb +158 -41
  97. data/lib/ai4r/som/two_phase_layer.rb +21 -25
  98. data/lib/ai4r/version.rb +3 -0
  99. data/lib/ai4r.rb +58 -27
  100. metadata +117 -106
  101. data/README.rdoc +0 -44
  102. data/test/classifiers/hyperpipes_test.rb +0 -84
  103. data/test/classifiers/ib1_test.rb +0 -78
  104. data/test/classifiers/id3_test.rb +0 -208
  105. data/test/classifiers/multilayer_perceptron_test.rb +0 -79
  106. data/test/classifiers/naive_bayes_test.rb +0 -43
  107. data/test/classifiers/one_r_test.rb +0 -62
  108. data/test/classifiers/prism_test.rb +0 -85
  109. data/test/classifiers/zero_r_test.rb +0 -50
  110. data/test/clusterers/average_linkage_test.rb +0 -51
  111. data/test/clusterers/bisecting_k_means_test.rb +0 -66
  112. data/test/clusterers/centroid_linkage_test.rb +0 -53
  113. data/test/clusterers/complete_linkage_test.rb +0 -57
  114. data/test/clusterers/diana_test.rb +0 -69
  115. data/test/clusterers/k_means_test.rb +0 -100
  116. data/test/clusterers/median_linkage_test.rb +0 -53
  117. data/test/clusterers/single_linkage_test.rb +0 -122
  118. data/test/clusterers/ward_linkage_test.rb +0 -53
  119. data/test/clusterers/weighted_average_linkage_test.rb +0 -53
  120. data/test/data/data_set_test.rb +0 -96
  121. data/test/data/proximity_test.rb +0 -81
  122. data/test/data/statistics_test.rb +0 -65
  123. data/test/experiment/classifier_evaluator_test.rb +0 -76
  124. data/test/genetic_algorithm/chromosome_test.rb +0 -57
  125. data/test/genetic_algorithm/genetic_algorithm_test.rb +0 -81
  126. data/test/neural_network/backpropagation_test.rb +0 -82
  127. data/test/neural_network/hopfield_test.rb +0 -72
  128. data/test/som/som_test.rb +0 -97
@@ -1,74 +1,85 @@
1
+ # frozen_string_literal: true
2
+
1
3
  # Author:: Sergio Fierens (implementation)
2
4
  # License:: MPL 1.1
3
5
  # Project:: ai4r
4
- # Url:: http://www.ai4r.org/
6
+ # Url:: https://github.com/SergioFierens/ai4r
5
7
  #
6
- # You can redistribute it and/or modify it under the terms of
7
- # the Mozilla Public License version 1.1 as published by the
8
+ # You can redistribute it and/or modify it under the terms of
9
+ # the Mozilla Public License version 1.1 as published by the
8
10
  # Mozilla Foundation at http://www.mozilla.org/MPL/MPL-1.1.txt
9
11
 
10
- require File.dirname(__FILE__) + '/../data/data_set'
11
- require File.dirname(__FILE__) + '/../data/proximity'
12
- require File.dirname(__FILE__) + '/../clusterers/clusterer'
12
+ require_relative '../data/data_set'
13
+ require_relative '../data/proximity'
14
+ require_relative '../clusterers/clusterer'
13
15
 
14
16
  module Ai4r
15
17
  module Clusterers
16
-
17
- # DIANA (Divisive ANAlysis) (Kaufman and Rousseeuw, 1990;
18
+ # DIANA (Divisive ANAlysis) (Kaufman and Rousseeuw, 1990;
18
19
  # Macnaughton-Smith et al. 1964) is a Divisive Hierarchical
19
20
  # Clusterer. It begins with only one cluster with all data items,
20
21
  # and divides the clusters until the desired clusters number is reached.
21
22
  class Diana < Clusterer
22
-
23
23
  attr_reader :data_set, :number_of_clusters, :clusters
24
-
25
- parameters_info :distance_function =>
26
- "Custom implementation of distance function. " +
27
- "It must be a closure receiving two data items and return the " +
28
- "distance bewteen them. By default, this algorithm uses " +
29
- "ecuclidean distance of numeric attributes to the power of 2."
30
-
24
+
25
+ parameters_info distance_function:
26
+ 'Custom implementation of distance function. ' \
27
+ 'It must be a closure receiving two data items and return the ' \
28
+ 'distance between them. By default, this algorithm uses ' \
29
+ 'euclidean distance of numeric attributes to the power of 2.'
30
+
31
+ # @return [Object]
31
32
  def initialize
32
- @distance_function = lambda do |a,b|
33
- Ai4r::Data::Proximity.squared_euclidean_distance(
34
- a.select {|att_a| att_a.is_a? Numeric} ,
35
- b.select {|att_b| att_b.is_a? Numeric})
36
- end
33
+ super()
34
+ @distance_function = lambda do |a, b|
35
+ Ai4r::Data::Proximity.squared_euclidean_distance(
36
+ a.select { |att_a| att_a.is_a? Numeric },
37
+ b.select { |att_b| att_b.is_a? Numeric }
38
+ )
39
+ end
37
40
  end
38
-
41
+
39
42
  # Build a new clusterer, using divisive analysis (DIANA algorithm)
43
+ # @param data_set [Object]
44
+ # @param number_of_clusters [Object]
45
+ # @return [Object]
40
46
  def build(data_set, number_of_clusters)
41
47
  @data_set = data_set
42
48
  @number_of_clusters = number_of_clusters
43
- @clusters = [@data_set[0..-1]]
44
-
45
- while(@clusters.length < @number_of_clusters)
49
+ @clusters = [@data_set]
50
+
51
+ while @clusters.length < @number_of_clusters
46
52
  cluster_index_to_split = max_diameter_cluster(@clusters)
47
53
  cluster_to_split = @clusters[cluster_index_to_split]
48
54
  splinter_cluster = init_splinter_cluster(cluster_to_split)
49
- while true
55
+ loop do
50
56
  dist_diff, index = max_distance_difference(cluster_to_split, splinter_cluster)
51
- break if dist_diff < 0
57
+ break if dist_diff.negative?
58
+
52
59
  splinter_cluster << cluster_to_split.data_items[index]
53
60
  cluster_to_split.data_items.delete_at(index)
54
61
  end
55
62
  @clusters << splinter_cluster
56
63
  end
57
-
58
- return self
64
+
65
+ self
59
66
  end
60
-
61
- # Classifies the given data item, returning the cluster index it belongs
67
+
68
+ # Classifies the given data item, returning the cluster index it belongs
62
69
  # to (0-based).
70
+ # @param data_item [Object]
71
+ # @return [Object]
63
72
  def eval(data_item)
64
73
  get_min_index(@clusters.collect do |cluster|
65
74
  distance_sum(data_item, cluster) / cluster.data_items.length
66
- end)
75
+ end)
67
76
  end
68
-
77
+
69
78
  protected
70
-
79
+
71
80
  # Return the index of the cluster with max diameter
81
+ # @param clusters [Object]
82
+ # @return [Object]
72
83
  def max_diameter_cluster(clusters)
73
84
  max_index = 0
74
85
  max_diameter = 0
@@ -79,10 +90,12 @@ module Ai4r
79
90
  max_diameter = diameter
80
91
  end
81
92
  end
82
- return max_index
93
+ max_index
83
94
  end
84
-
95
+
85
96
  # Max distance between 2 items in a cluster
97
+ # @param cluster [Object]
98
+ # @return [Object]
86
99
  def cluster_diameter(cluster)
87
100
  diameter = 0
88
101
  cluster.data_items.each_with_index do |item_a, item_a_pos|
@@ -91,49 +104,62 @@ module Ai4r
91
104
  diameter = d if d > diameter
92
105
  end
93
106
  end
94
- return diameter
107
+ diameter
95
108
  end
96
-
109
+
97
110
  # Create a cluster with the item with max distance
98
111
  # to the rest of the cluster's items.
99
112
  # That item is removed from the initial cluster.
113
+ # @param cluster_to_split [Object]
114
+ # @return [Object]
100
115
  def init_splinter_cluster(cluster_to_split)
101
116
  max = 0.0
102
117
  max_index = 0
103
118
  cluster_to_split.data_items.each_with_index do |item, index|
104
119
  sum = distance_sum(item, cluster_to_split)
105
- max, max_index = sum, index if sum > max
120
+ if sum > max
121
+ max = sum
122
+ max_index = index
123
+ end
106
124
  end
107
125
  splinter_cluster = cluster_to_split[max_index]
108
126
  cluster_to_split.data_items.delete_at(max_index)
109
- return splinter_cluster
127
+ splinter_cluster
110
128
  end
111
-
112
- # Return the max average distance between any item of
129
+
130
+ # Return the max average distance between any item of
113
131
  # cluster_to_split and the rest of items in that cluster,
114
132
  # minus the average distance with the items of splinter_cluster,
115
133
  # and the index of the item.
116
134
  # A positive value means that the item is closer to the
117
135
  # splinter group than to its current cluster.
136
+ # @param cluster_to_split [Object]
137
+ # @param splinter_cluster [Object]
138
+ # @return [Object]
118
139
  def max_distance_difference(cluster_to_split, splinter_cluster)
119
- max_diff = -1.0/0
140
+ max_diff = -Float::INFINITY
120
141
  max_diff_index = 0
121
142
  cluster_to_split.data_items.each_with_index do |item, index|
122
- dist_a = distance_sum(item, cluster_to_split) / (cluster_to_split.data_items.length-1)
123
- dist_b = distance_sum(item, splinter_cluster) / (splinter_cluster.data_items.length)
143
+ dist_a = distance_sum(item, cluster_to_split) / (cluster_to_split.data_items.length - 1)
144
+ dist_b = distance_sum(item, splinter_cluster) / splinter_cluster.data_items.length
124
145
  dist_diff = dist_a - dist_b
125
- max_diff, max_diff_index = dist_diff, index if dist_diff > max_diff
146
+ if dist_diff > max_diff
147
+ max_diff = dist_diff
148
+ max_diff_index = index
149
+ end
126
150
  end
127
- return max_diff, max_diff_index
151
+ [max_diff, max_diff_index]
128
152
  end
129
-
153
+
130
154
  # Sum up the distance between an item and all the items in a cluster
155
+ # @param item_a [Object]
156
+ # @param cluster [Object]
157
+ # @return [Object]
131
158
  def distance_sum(item_a, cluster)
132
159
  cluster.data_items.inject(0.0) do |sum, item_b|
133
160
  sum + @distance_function.call(item_a, item_b)
134
161
  end
135
162
  end
136
-
137
163
  end
138
164
  end
139
165
  end
@@ -1,126 +1,363 @@
1
+ # frozen_string_literal: true
2
+
1
3
  # Author:: Sergio Fierens (implementation)
2
4
  # License:: MPL 1.1
3
5
  # Project:: ai4r
4
- # Url:: http://ai4r.org/
6
+ # Url:: https://github.com/SergioFierens/ai4r
5
7
  #
6
- # You can redistribute it and/or modify it under the terms of
7
- # the Mozilla Public License version 1.1 as published by the
8
+ # You can redistribute it and/or modify it under the terms of
9
+ # the Mozilla Public License version 1.1 as published by the
8
10
  # Mozilla Foundation at http://www.mozilla.org/MPL/MPL-1.1.txt
9
11
 
10
- require File.dirname(__FILE__) + '/../data/data_set'
11
- require File.dirname(__FILE__) + '/../clusterers/clusterer'
12
+ require_relative '../data/data_set'
13
+ require_relative '../data/proximity'
14
+ require_relative '../clusterers/clusterer'
12
15
 
13
16
  module Ai4r
14
17
  module Clusterers
15
-
16
- # The k-means algorithm is an algorithm to cluster n objects
18
+ # The k-means algorithm is an algorithm to cluster n objects
17
19
  # based on attributes into k partitions, with k < n.
18
- #
20
+ #
19
21
  # More about K Means algorithm:
20
- # http://en.wikipedia.org/wiki/K-means_algorithm
22
+ # http://en.wikipedia.org/wiki/K-means_algorithm
21
23
  class KMeans < Clusterer
22
-
23
- attr_reader :data_set, :number_of_clusters
24
- attr_reader :clusters, :centroids, :iterations
25
-
26
- parameters_info :max_iterations => "Maximum number of iterations to " +
27
- "build the clusterer. By default it is uncapped.",
28
- :distance_function => "Custom implementation of distance function. " +
29
- "It must be a closure receiving two data items and return the " +
30
- "distance bewteen them. By default, this algorithm uses " +
31
- "ecuclidean distance of numeric attributes to the power of 2.",
32
- :centroid_function => "Custom implementation to calculate the " +
33
- "centroid of a cluster. It must be a closure receiving an array of " +
34
- "data sets, and return an array of data items, representing the " +
35
- "centroids of for each data set. " +
36
- "By default, this algorithm returns a data items using the mode "+
37
- "or mean of each attribute on each data set."
38
-
24
+ attr_reader :data_set, :number_of_clusters, :clusters, :centroids, :iterations, :history
25
+
26
+ parameters_info(
27
+ max_iterations: 'Maximum number of iterations to build the clusterer. By default it is uncapped.',
28
+ distance_function: 'Custom implementation of distance function. ' \
29
+ 'It must be a closure receiving two data items and return the ' \
30
+ 'distance between them. By default, this algorithm uses ' \
31
+ 'euclidean distance of numeric attributes to the power of 2.',
32
+ centroid_function: 'Custom implementation to calculate the ' \
33
+ 'centroid of a cluster. It must be a closure receiving an array of ' \
34
+ 'data sets, and return an array of data items, representing the ' \
35
+ 'centroids of for each data set. ' \
36
+ 'By default, this algorithm returns a data items using the mode ' \
37
+ 'or mean of each attribute on each data set.',
38
+ centroid_indices: 'Indices of data items (indexed from 0) to be ' \
39
+ 'the initial centroids. Otherwise, the initial centroids will be ' \
40
+ 'assigned randomly from the data set.',
41
+ on_empty: 'Action to take if a cluster becomes empty, with values ' \
42
+ "'eliminate' (the default action, eliminate the empty cluster), " \
43
+ "'terminate' (terminate with error), 'random' (relocate the " \
44
+ "empty cluster to a random point), 'outlier' (relocate the " \
45
+ 'empty cluster to the point furthest from its centroid).',
46
+ random_seed: "Seed value used to initialize Ruby's random number " \
47
+ 'generator when selecting random centroids.',
48
+ init_method: 'Strategy to initialize centroids. Available values: ' \
49
+ ':random (default) and :kmeans_plus_plus.',
50
+ restarts: 'Number of random initializations to perform. ' \
51
+ 'The best run (lowest SSE) will be kept.',
52
+ track_history: 'Keep centroids and assignments for each iteration ' \
53
+ 'when building the clusterer.'
54
+ )
55
+
56
+ # @return [Object]
39
57
  def initialize
58
+ super()
40
59
  @distance_function = nil
41
60
  @max_iterations = nil
42
- @old_centroids = nil
43
- @centroid_function = lambda do |data_sets|
44
- data_sets.collect{ |data_set| data_set.get_mean_or_mode}
61
+ @centroid_function = lambda do |data_sets|
62
+ data_sets.collect(&:get_mean_or_mode)
45
63
  end
64
+ @centroid_indices = []
65
+ @on_empty = 'eliminate' # default if none specified
66
+ @random_seed = nil
67
+ @rng = nil
68
+ @init_method = :random
69
+ @restarts = 1
70
+ @track_history = false
46
71
  end
47
-
48
-
72
+
49
73
  # Build a new clusterer, using data examples found in data_set.
50
74
  # Items will be clustered in "number_of_clusters" different
51
75
  # clusters.
76
+ # @param data_set [Object]
77
+ # @param number_of_clusters [Object]
78
+ # @return [Object]
52
79
  def build(data_set, number_of_clusters)
53
80
  @data_set = data_set
54
81
  @number_of_clusters = number_of_clusters
55
- @iterations = 0
56
-
57
- calc_initial_centroids
58
- while(not stop_criteria_met)
59
- calculate_membership_clusters
60
- recompute_centroids
82
+ raise ArgumentError, 'Number of clusters larger than data items' if @number_of_clusters > @data_set.data_items.length
83
+
84
+ unless @centroid_indices.empty? || @centroid_indices.length == @number_of_clusters
85
+ raise ArgumentError,
86
+ 'Length of centroid indices array differs from the specified number of clusters'
87
+ end
88
+ unless @on_empty == 'eliminate' || @on_empty == 'terminate' || @on_empty == 'random' || @on_empty == 'outlier'
89
+ raise ArgumentError,
90
+ 'Invalid value for on_empty'
91
+ end
92
+
93
+ seed_base = @random_seed
94
+ best_sse = nil
95
+ best_centroids = nil
96
+ best_clusters = nil
97
+ best_iterations = nil
98
+
99
+ (@restarts || 1).times do |i|
100
+ @random_seed = seed_base.nil? ? nil : seed_base + i
101
+ @rng = @random_seed.nil? ? Random.new : Random.new(@random_seed)
102
+ @iterations = 0
103
+ @history = [] if @track_history
104
+ calc_initial_centroids
105
+ until stop_criteria_met
106
+ calculate_membership_clusters
107
+ if @track_history
108
+ @history << {
109
+ centroids: @centroids.collect(&:dup),
110
+ assignments: @assignments.dup
111
+ }
112
+ end
113
+ recompute_centroids
114
+ end
115
+ current_sse = sse
116
+ next unless best_sse.nil? || current_sse < best_sse
117
+
118
+ best_sse = current_sse
119
+ best_centroids = Marshal.load(Marshal.dump(@centroids))
120
+ best_clusters = Marshal.load(Marshal.dump(@clusters))
121
+ best_iterations = @iterations
61
122
  end
62
-
63
- return self
123
+
124
+ @random_seed = seed_base
125
+ @rng = @random_seed.nil? ? Random.new : Random.new(@random_seed)
126
+ @centroids = best_centroids
127
+ @clusters = best_clusters
128
+ @iterations = best_iterations
129
+ self
64
130
  end
65
-
66
- # Classifies the given data item, returning the cluster index it belongs
131
+
132
+ # Classifies the given data item, returning the cluster index it belongs
67
133
  # to (0-based).
134
+ # @param data_item [Object]
135
+ # @return [Object]
68
136
  def eval(data_item)
69
- get_min_index(@centroids.collect {|centroid|
70
- distance(data_item, centroid)})
137
+ get_min_index(@centroids.collect do |centroid|
138
+ distance(data_item, centroid)
139
+ end)
140
+ end
141
+
142
+ # Sum of squared distances of all points to their respective centroids.
143
+ # It can be used as a measure of cluster compactness (SSE).
144
+ # @return [Object]
145
+ def sse
146
+ sum = 0.0
147
+ @clusters.each_with_index do |cluster, i|
148
+ centroid = @centroids[i]
149
+ cluster.data_items.each do |item|
150
+ sum += distance(item, centroid)
151
+ end
152
+ end
153
+ sum
71
154
  end
72
-
155
+
73
156
  # This function calculates the distance between 2 different
74
- # instances. By default, it returns the euclidean distance to the
157
+ # instances. By default, it returns the euclidean distance to the
75
158
  # power of 2.
76
- # You can provide a more convinient distance implementation:
77
- #
159
+ # You can provide a more convenient distance implementation:
160
+ #
78
161
  # 1- Overwriting this method
79
- #
162
+ #
80
163
  # 2- Providing a closure to the :distance_function parameter
164
+ # @param a [Object]
165
+ # @param b [Object]
166
+ # @return [Object]
81
167
  def distance(a, b)
82
168
  return @distance_function.call(a, b) if @distance_function
83
- return euclidean_distance(a, b)
169
+
170
+ Ai4r::Data::Proximity.squared_euclidean_distance(
171
+ a.select { |att_a| att_a.is_a? Numeric },
172
+ b.select { |att_b| att_b.is_a? Numeric }
173
+ )
84
174
  end
85
-
86
- protected
87
-
175
+
176
+ protected
177
+
178
+ # @return [Object]
88
179
  def calc_initial_centroids
89
180
  @centroids = []
90
- tried_indexes = []
91
- while @centroids.length < @number_of_clusters &&
92
- tried_indexes.length < @data_set.data_items.length
93
- random_index = rand(@data_set.data_items.length)
94
- if !tried_indexes.include?(random_index)
95
- tried_indexes << random_index
96
- if !@centroids.include? @data_set.data_items[random_index]
97
- @centroids << @data_set.data_items[random_index]
98
- end
181
+ @old_centroids = nil
182
+ if @centroid_indices.empty?
183
+ if @init_method == :kmeans_plus_plus
184
+ kmeans_plus_plus_init
185
+ else
186
+ populate_centroids('random')
99
187
  end
188
+ else
189
+ populate_centroids('indices')
100
190
  end
101
- @number_of_clusters = @centroids.length
102
191
  end
103
-
192
+
193
+ # @return [Object]
104
194
  def stop_criteria_met
105
- @old_centroids == @centroids ||
195
+ @old_centroids == @centroids ||
106
196
  (@max_iterations && (@max_iterations <= @iterations))
107
197
  end
108
-
198
+
199
+ # @return [Object]
109
200
  def calculate_membership_clusters
110
- @clusters = Array.new(@number_of_clusters) do
111
- Ai4r::Data::DataSet.new :data_labels => @data_set.data_labels
201
+ @clusters = Array.new(@number_of_clusters) do
202
+ Ai4r::Data::DataSet.new data_labels: @data_set.data_labels
112
203
  end
113
- @data_set.data_items.each do |data_item|
114
- @clusters[eval(data_item)] << data_item
204
+ @cluster_indices = Array.new(@number_of_clusters) { [] }
205
+ @assignments = Array.new(@data_set.data_items.length)
206
+
207
+ @data_set.data_items.each_with_index do |data_item, data_index|
208
+ c = eval(data_item)
209
+ @clusters[c] << data_item
210
+ @cluster_indices[c] << data_index if @on_empty == 'outlier'
211
+ @assignments[data_index] = c
115
212
  end
213
+ manage_empty_clusters if empty_cluster?
116
214
  end
117
-
215
+
216
+ # @return [Object]
118
217
  def recompute_centroids
119
218
  @old_centroids = @centroids
120
219
  @iterations += 1
121
- @centroids = @centroid_function.call(@clusters)
220
+ @centroids = @centroid_function.call(@clusters)
221
+ end
222
+
223
+ # @return [Object]
224
+ def kmeans_plus_plus_init
225
+ chosen_indices = []
226
+ first_index = (0...@data_set.data_items.length).to_a.sample(random: @rng)
227
+ return if first_index.nil?
228
+
229
+ @centroids << @data_set.data_items[first_index]
230
+ chosen_indices << first_index
231
+ while @centroids.length < @number_of_clusters &&
232
+ chosen_indices.length < @data_set.data_items.length
233
+ distances = []
234
+ total = 0.0
235
+ @data_set.data_items.each_with_index do |item, index|
236
+ next if chosen_indices.include?(index)
237
+
238
+ min_dist = @centroids.map { |c| distance(item, c) }.min
239
+ distances << [index, min_dist]
240
+ total += min_dist
241
+ end
242
+ break if distances.empty?
243
+
244
+ r = @rng.rand * total
245
+ cumulative = 0.0
246
+ chosen = distances.find do |_idx, dist|
247
+ cumulative += dist
248
+ cumulative >= r
249
+ end
250
+ chosen_indices << chosen[0]
251
+ @centroids << @data_set.data_items[chosen[0]]
252
+ end
253
+ @number_of_clusters = @centroids.length
254
+ end
255
+
256
+ # @param populate_method [Object]
257
+ # @param number_of_clusters [Object]
258
+ # @return [Object]
259
+ def populate_centroids(populate_method, number_of_clusters = @number_of_clusters)
260
+ tried_indexes = []
261
+ case populate_method
262
+ when 'random' # for initial assignment (without the :centroid_indices option) and for reassignment of empty cluster centroids (with :on_empty option 'random')
263
+ while @centroids.length < number_of_clusters &&
264
+ tried_indexes.length < @data_set.data_items.length
265
+ random_index = (0...@data_set.data_items.length).to_a.sample(random: @rng)
266
+ next if tried_indexes.include?(random_index)
267
+
268
+ tried_indexes << random_index
269
+ @centroids << @data_set.data_items[random_index] unless @centroids.include? @data_set.data_items[random_index]
270
+ end
271
+ when 'indices' # for initial assignment only (with the :centroid_indices option)
272
+ @centroid_indices.each do |index|
273
+ unless (index.is_a? Integer) && index >= 0 && index < @data_set.data_items.length
274
+ raise ArgumentError,
275
+ "Invalid centroid index #{index}"
276
+ end
277
+
278
+ next if tried_indexes.include?(index)
279
+
280
+ tried_indexes << index
281
+ @centroids << @data_set.data_items[index] unless @centroids.include? @data_set.data_items[index]
282
+ end
283
+ when 'outlier' # for reassignment of empty cluster centroids only (with :on_empty option 'outlier')
284
+ sorted_data_indices = sort_data_indices_by_dist_to_centroid
285
+ i = sorted_data_indices.length - 1 # the last item is the furthest from its centroid
286
+ while @centroids.length < number_of_clusters &&
287
+ tried_indexes.length < @data_set.data_items.length
288
+ outlier_index = sorted_data_indices[i]
289
+ unless tried_indexes.include?(outlier_index)
290
+ tried_indexes << outlier_index
291
+ @centroids << @data_set.data_items[outlier_index] unless @centroids.include? @data_set.data_items[outlier_index]
292
+ end
293
+ i.positive? ? i -= 1 : break
294
+ end
295
+ end
296
+ @number_of_clusters = @centroids.length
297
+ end
298
+
299
+ # Sort cluster points by distance to assigned centroid. Utilizes @cluster_indices.
300
+ # Returns indices, sorted in order from the nearest to furthest.
301
+ # @return [Object]
302
+ def sort_data_indices_by_dist_to_centroid
303
+ h = {}
304
+ @clusters.each_with_index do |cluster, c|
305
+ centroid = @centroids[c]
306
+ cluster.data_items.each_with_index do |data_item, i|
307
+ dist_to_centroid = distance(data_item, centroid)
308
+ data_index = @cluster_indices[c][i]
309
+ h[data_index] = dist_to_centroid
310
+ end
311
+ end
312
+ # sort hash of {index => dist to centroid} by dist to centroid (ascending) and then return an array of only the indices
313
+ h.sort_by { |_k, v| v }.collect { |a, _b| a }
314
+ end
315
+
316
+ # @return [Object]
317
+ def empty_cluster?
318
+ found_empty = false
319
+ @number_of_clusters.times do |c|
320
+ found_empty = true if @clusters[c].data_items.empty?
321
+ end
322
+ found_empty
323
+ end
324
+
325
+ # @return [Object]
326
+ def manage_empty_clusters
327
+ # Do nothing to terminate with error. (The empty cluster will be assigned a nil centroid, and then calculating the distance from this centroid to another point will raise an exception.)
328
+ return if on_empty == 'terminate'
329
+
330
+ initial_number_of_clusters = @number_of_clusters
331
+ eliminate_empty_clusters
332
+ return if on_empty == 'eliminate'
333
+
334
+ populate_centroids(on_empty, initial_number_of_clusters) # Add initial_number_of_clusters - @number_of_clusters
335
+ calculate_membership_clusters
336
+ end
337
+
338
+ # @return [Object]
339
+ def eliminate_empty_clusters
340
+ old_clusters = @clusters
341
+ old_centroids = @centroids
342
+ old_cluster_indices = @cluster_indices
343
+ old_assignments = @assignments
344
+ @clusters = []
345
+ @centroids = []
346
+ @cluster_indices = []
347
+ remap = {}
348
+ new_index = 0
349
+ @number_of_clusters.times do |i|
350
+ next if old_clusters[i].data_items.empty?
351
+
352
+ remap[i] = new_index
353
+ @clusters << old_clusters[i]
354
+ @cluster_indices << old_cluster_indices[i]
355
+ @centroids << old_centroids[i]
356
+ new_index += 1
357
+ end
358
+ @number_of_clusters = @centroids.length
359
+ @assignments = old_assignments.map { |c| remap[c] }
122
360
  end
123
-
124
361
  end
125
362
  end
126
363
  end