ai4r 1.13 → 2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (129)
  1. checksums.yaml +7 -0
  2. data/README.md +174 -0
  3. data/examples/classifiers/hyperpipes_data.csv +14 -0
  4. data/examples/classifiers/hyperpipes_example.rb +22 -0
  5. data/examples/classifiers/ib1_example.rb +12 -0
  6. data/examples/classifiers/id3_example.rb +15 -10
  7. data/examples/classifiers/id3_graphviz_example.rb +17 -0
  8. data/examples/classifiers/logistic_regression_example.rb +11 -0
  9. data/examples/classifiers/naive_bayes_attributes_example.rb +13 -0
  10. data/examples/classifiers/naive_bayes_example.rb +12 -13
  11. data/examples/classifiers/one_r_example.rb +27 -0
  12. data/examples/classifiers/parameter_tutorial.rb +29 -0
  13. data/examples/classifiers/prism_nominal_example.rb +15 -0
  14. data/examples/classifiers/prism_numeric_example.rb +21 -0
  15. data/examples/classifiers/simple_linear_regression_example.rb +14 -11
  16. data/examples/classifiers/zero_and_one_r_example.rb +34 -0
  17. data/examples/classifiers/zero_one_r_data.csv +8 -0
  18. data/examples/clusterers/clusterer_example.rb +40 -34
  19. data/examples/clusterers/dbscan_example.rb +17 -0
  20. data/examples/clusterers/dendrogram_example.rb +17 -0
  21. data/examples/clusterers/hierarchical_dendrogram_example.rb +20 -0
  22. data/examples/clusterers/kmeans_custom_example.rb +26 -0
  23. data/examples/genetic_algorithm/bitstring_example.rb +41 -0
  24. data/examples/genetic_algorithm/genetic_algorithm_example.rb +26 -18
  25. data/examples/genetic_algorithm/kmeans_seed_tuning.rb +45 -0
  26. data/examples/neural_network/backpropagation_example.rb +48 -48
  27. data/examples/neural_network/hopfield_example.rb +45 -0
  28. data/examples/neural_network/patterns_with_base_noise.rb +39 -39
  29. data/examples/neural_network/patterns_with_noise.rb +41 -39
  30. data/examples/neural_network/train_epochs_callback.rb +25 -0
  31. data/examples/neural_network/training_patterns.rb +39 -39
  32. data/examples/neural_network/transformer_text_classification.rb +78 -0
  33. data/examples/neural_network/xor_example.rb +23 -22
  34. data/examples/reinforcement/q_learning_example.rb +10 -0
  35. data/examples/som/som_data.rb +155 -152
  36. data/examples/som/som_multi_node_example.rb +12 -13
  37. data/examples/som/som_single_example.rb +12 -15
  38. data/examples/transformer/decode_classifier_example.rb +68 -0
  39. data/examples/transformer/deterministic_example.rb +10 -0
  40. data/examples/transformer/seq2seq_example.rb +16 -0
  41. data/lib/ai4r/classifiers/classifier.rb +24 -16
  42. data/lib/ai4r/classifiers/gradient_boosting.rb +64 -0
  43. data/lib/ai4r/classifiers/hyperpipes.rb +119 -43
  44. data/lib/ai4r/classifiers/ib1.rb +122 -32
  45. data/lib/ai4r/classifiers/id3.rb +524 -145
  46. data/lib/ai4r/classifiers/logistic_regression.rb +96 -0
  47. data/lib/ai4r/classifiers/multilayer_perceptron.rb +75 -59
  48. data/lib/ai4r/classifiers/naive_bayes.rb +95 -34
  49. data/lib/ai4r/classifiers/one_r.rb +112 -44
  50. data/lib/ai4r/classifiers/prism.rb +167 -76
  51. data/lib/ai4r/classifiers/random_forest.rb +72 -0
  52. data/lib/ai4r/classifiers/simple_linear_regression.rb +83 -58
  53. data/lib/ai4r/classifiers/support_vector_machine.rb +91 -0
  54. data/lib/ai4r/classifiers/votes.rb +57 -0
  55. data/lib/ai4r/classifiers/zero_r.rb +71 -30
  56. data/lib/ai4r/clusterers/average_linkage.rb +46 -27
  57. data/lib/ai4r/clusterers/bisecting_k_means.rb +50 -44
  58. data/lib/ai4r/clusterers/centroid_linkage.rb +52 -36
  59. data/lib/ai4r/clusterers/cluster_tree.rb +50 -0
  60. data/lib/ai4r/clusterers/clusterer.rb +29 -14
  61. data/lib/ai4r/clusterers/complete_linkage.rb +42 -31
  62. data/lib/ai4r/clusterers/dbscan.rb +134 -0
  63. data/lib/ai4r/clusterers/diana.rb +75 -49
  64. data/lib/ai4r/clusterers/k_means.rb +270 -135
  65. data/lib/ai4r/clusterers/median_linkage.rb +49 -33
  66. data/lib/ai4r/clusterers/single_linkage.rb +196 -88
  67. data/lib/ai4r/clusterers/ward_linkage.rb +51 -35
  68. data/lib/ai4r/clusterers/ward_linkage_hierarchical.rb +25 -10
  69. data/lib/ai4r/clusterers/weighted_average_linkage.rb +48 -32
  70. data/lib/ai4r/data/data_set.rb +223 -103
  71. data/lib/ai4r/data/parameterizable.rb +31 -25
  72. data/lib/ai4r/data/proximity.rb +62 -62
  73. data/lib/ai4r/data/statistics.rb +46 -35
  74. data/lib/ai4r/experiment/classifier_evaluator.rb +84 -32
  75. data/lib/ai4r/experiment/split.rb +39 -0
  76. data/lib/ai4r/genetic_algorithm/chromosome_base.rb +43 -0
  77. data/lib/ai4r/genetic_algorithm/genetic_algorithm.rb +92 -170
  78. data/lib/ai4r/genetic_algorithm/tsp_chromosome.rb +83 -0
  79. data/lib/ai4r/hmm/hidden_markov_model.rb +134 -0
  80. data/lib/ai4r/neural_network/activation_functions.rb +37 -0
  81. data/lib/ai4r/neural_network/backpropagation.rb +399 -134
  82. data/lib/ai4r/neural_network/hopfield.rb +175 -58
  83. data/lib/ai4r/neural_network/transformer.rb +194 -0
  84. data/lib/ai4r/neural_network/weight_initializations.rb +40 -0
  85. data/lib/ai4r/reinforcement/policy_iteration.rb +66 -0
  86. data/lib/ai4r/reinforcement/q_learning.rb +51 -0
  87. data/lib/ai4r/search/a_star.rb +76 -0
  88. data/lib/ai4r/search/bfs.rb +50 -0
  89. data/lib/ai4r/search/dfs.rb +50 -0
  90. data/lib/ai4r/search/mcts.rb +118 -0
  91. data/lib/ai4r/search.rb +12 -0
  92. data/lib/ai4r/som/distance_metrics.rb +29 -0
  93. data/lib/ai4r/som/layer.rb +28 -17
  94. data/lib/ai4r/som/node.rb +61 -32
  95. data/lib/ai4r/som/som.rb +158 -41
  96. data/lib/ai4r/som/two_phase_layer.rb +21 -25
  97. data/lib/ai4r/version.rb +3 -0
  98. data/lib/ai4r.rb +57 -28
  99. metadata +79 -109
  100. data/README.rdoc +0 -39
  101. data/test/classifiers/hyperpipes_test.rb +0 -84
  102. data/test/classifiers/ib1_test.rb +0 -78
  103. data/test/classifiers/id3_test.rb +0 -220
  104. data/test/classifiers/multilayer_perceptron_test.rb +0 -79
  105. data/test/classifiers/naive_bayes_test.rb +0 -43
  106. data/test/classifiers/one_r_test.rb +0 -62
  107. data/test/classifiers/prism_test.rb +0 -85
  108. data/test/classifiers/simple_linear_regression_test.rb +0 -37
  109. data/test/classifiers/zero_r_test.rb +0 -50
  110. data/test/clusterers/average_linkage_test.rb +0 -51
  111. data/test/clusterers/bisecting_k_means_test.rb +0 -66
  112. data/test/clusterers/centroid_linkage_test.rb +0 -53
  113. data/test/clusterers/complete_linkage_test.rb +0 -57
  114. data/test/clusterers/diana_test.rb +0 -69
  115. data/test/clusterers/k_means_test.rb +0 -167
  116. data/test/clusterers/median_linkage_test.rb +0 -53
  117. data/test/clusterers/single_linkage_test.rb +0 -122
  118. data/test/clusterers/ward_linkage_hierarchical_test.rb +0 -81
  119. data/test/clusterers/ward_linkage_test.rb +0 -53
  120. data/test/clusterers/weighted_average_linkage_test.rb +0 -53
  121. data/test/data/data_set_test.rb +0 -104
  122. data/test/data/proximity_test.rb +0 -87
  123. data/test/data/statistics_test.rb +0 -65
  124. data/test/experiment/classifier_evaluator_test.rb +0 -76
  125. data/test/genetic_algorithm/chromosome_test.rb +0 -57
  126. data/test/genetic_algorithm/genetic_algorithm_test.rb +0 -81
  127. data/test/neural_network/backpropagation_test.rb +0 -82
  128. data/test/neural_network/hopfield_test.rb +0 -72
  129. data/test/som/som_test.rb +0 -97
data/lib/ai4r/classifiers/simple_linear_regression.rb

@@ -1,19 +1,19 @@
+# frozen_string_literal: true
+
 # Author:: Malav Bhavsar
 # License:: MPL 1.1
 # Project:: ai4r
-# Url:: http://ai4r.org/
+# Url:: https://github.com/SergioFierens/ai4r
 #
 # You can redistribute it and/or modify it under the terms of
 # the Mozilla Public License version 1.1 as published by the
 # Mozilla Foundation at http://www.mozilla.org/MPL/MPL-1.1.txt

-require File.dirname(__FILE__) + '/../data/data_set'
-require File.dirname(__FILE__) + '/classifier'
+require_relative '../data/data_set'
+require_relative 'classifier'

 module Ai4r
   module Classifiers
-
-
     # = Introduction
     #
     # This is an implementation of a Simple Linear Regression Classifier.
@@ -30,88 +30,113 @@ module Ai4r
     #   build data
     #   c.eval([1,158,105.8,192.7,71.4,55.7,2844,136,3.19,3.4,8.5,110,5500,19,25])
     #
-
-    class SimpleLinearRegression < Classifier

+    # SimpleLinearRegression performs linear regression on one attribute.
+    class SimpleLinearRegression < Classifier
       attr_reader :attribute, :attribute_index, :slope, :intercept

+      parameters_info selected_attribute: 'Index of attribute to use for regression.'
+
+      # @return [Object]
       def initialize
+        super()
         @attribute = nil
         @attribute_index = 0
         @slope = 0
         @intercept = 0
+        @selected_attribute = nil
       end

       # You can evaluate new data, predicting its category.
       # e.g.
       #   c.eval([1,158,105.8,192.7,71.4,55.7,2844,136,3.19,3.4,8.5,110,5500,19,25])
       #     => 11876.96774193548
+      # @param data [Object]
+      # @return [Object]
       def eval(data)
-        @intercept + @slope * data[@attribute_index]
+        @intercept + (@slope * data[@attribute_index])
       end

       # Gets the best attribute and does Linear Regression using it to find out the
       # slope and intercept.
       # Parameter data has to be an instance of DataSet
+      # @param data [Object]
+      # @return [Object]
       def build(data)
-        raise "Error instance must be passed" unless data.is_a?(DataSet)
-        raise "Data should not be empty" if data.data_items.length == 0
+        validate_data(data)
+
         y_mean = data.get_mean_or_mode[data.num_attributes - 1]
+        result = if @selected_attribute
+                   evaluate_attribute(data, @selected_attribute, y_mean)
+                 else
+                   evaluate_all_attributes(data, y_mean)
+                 end
+        assign_result(data, result)
+      end

-        # Choose best attribute
-        min_msq = Float::MAX
-        attribute = nil
-        chosen = -1
-        chosen_slope = 0.0 / 0.0 # Float::NAN
-        chosen_intercept = 0.0 / 0.0 # Float::NAN
+      def validate_data(data)
+        raise 'Error instance must be passed' unless data.is_a?(Ai4r::Data::DataSet)
+        raise 'Data should not be empty' if data.data_items.empty?
+      end

+      def evaluate_attribute(data, attr_index, y_mean)
+        x_mean = data.get_mean_or_mode[attr_index]
+        slope, x_diff_sq, y_diff_sq = attribute_sums(data, attr_index, x_mean, y_mean)
+        if x_diff_sq.zero?
+          { chosen: attr_index, slope: 0, intercept: y_mean, msq: Float::MAX }
+        else
+          chosen_slope = slope / x_diff_sq
+          intercept = y_mean - (chosen_slope * x_mean)
+          { chosen: attr_index, slope: chosen_slope, intercept: intercept, msq: y_diff_sq - (chosen_slope * slope) }
+        end
+      end
+
+      def evaluate_all_attributes(data, y_mean)
+        result = { chosen: -1, msq: Float::MAX }
         data.data_labels.each do |attr_name|
           attr_index = data.get_index attr_name
-          if attr_index != data.num_attributes-1
-            # Compute slope and intercept
-            x_mean = data.get_mean_or_mode[attr_index]
-            sum_x_diff_squared = 0
-            sum_y_diff_squared = 0
-            slope = 0
-            data.data_items.map do |instance|
-              x_diff = instance[attr_index] - x_mean
-              y_diff = instance[attr_index] - y_mean
-              slope += x_diff * y_diff
-              sum_x_diff_squared += x_diff * x_diff
-              sum_y_diff_squared += y_diff * y_diff
-            end
-
-            if sum_x_diff_squared == 0
-              next
-            end
-
-            numerator = slope
-            slope /= sum_x_diff_squared
-            intercept = y_mean - slope * x_mean
-            msq = sum_y_diff_squared - slope * numerator
-
-            if msq < min_msq
-              min_msq = msq
-              chosen = attr_index
-              chosen_slope = slope
-              chosen_intercept = intercept
-            end
-          end
+          next if attr_index == data.num_attributes - 1
+
+          candidate = evaluate_attribute(data, attr_index, y_mean)
+          next unless candidate[:msq] < result[:msq]
+
+          result = candidate
         end
+        result
+      end

-        if chosen == -1
-          raise "no useful attribute found"
-          @attribute = nil
-          @attribute_index = 0
-          @slope = 0
-          @intercept = y_mean
-        else
-          @attribute = data.data_labels[chosen]
-          @attribute_index = chosen
-          @slope = chosen_slope
-          @intercept = chosen_intercept
+      def assign_result(data, result)
+        raise 'no useful attribute found' if result[:chosen] == -1
+
+        @attribute = data.data_labels[result[:chosen]]
+        @attribute_index = result[:chosen]
+        @slope = result[:slope]
+        @intercept = result[:intercept]
+        self
+      end
+
+      # Simple Linear Regression classifiers cannot generate human readable
+      # rules. This method returns a descriptive string indicating that rule
+      # extraction is not supported.
+      def get_rules
+        'SimpleLinearRegression does not support rule extraction.'
+      end
+
+      private
+
+      # Calculate regression sums for the given attribute.
+      def attribute_sums(data, attr_index, x_mean, y_mean)
+        slope = 0
+        sum_x_diff_squared = 0
+        sum_y_diff_squared = 0
+        data.data_items.each do |instance|
+          x_diff = instance[attr_index] - x_mean
+          y_diff = instance[data.num_attributes - 1] - y_mean
+          slope += x_diff * y_diff
+          sum_x_diff_squared += x_diff * x_diff
+          sum_y_diff_squared += y_diff * y_diff
         end
-        return self
+        [slope, sum_x_diff_squared, sum_y_diff_squared]
       end
     end
   end
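To see the refactored classifier in use, here is a minimal sketch based only on the API visible in the hunks above; the numeric data, labels and expected outputs are invented for illustration, and set_parameters comes from the gem's Parameterizable module:

  require 'ai4r'

  # Toy numeric data; the last column is the regression target.
  items = [
    [2.0, 10.0, 24.0],
    [3.0, 12.0, 30.0],
    [5.0, 15.0, 44.0],
    [7.0, 18.0, 58.0]
  ]
  data = Ai4r::Data::DataSet.new(data_labels: %w[x1 x2 target], data_items: items)

  regression = Ai4r::Classifiers::SimpleLinearRegression.new
  # selected_attribute is new in 2.0; leave it unset to keep the 1.x behaviour
  # of scanning every attribute and keeping the lowest mean squared error.
  regression.set_parameters(selected_attribute: 0)
  regression.build(data)

  puts regression.attribute          # attribute actually used ("x1" here)
  puts regression.eval([4.0, 13.0])  # predicted target for a new item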
data/lib/ai4r/classifiers/support_vector_machine.rb (new file)

@@ -0,0 +1,91 @@
+# frozen_string_literal: true
+
+# Author:: OpenAI Assistant
+# License:: MPL 1.1
+# Project:: ai4r
+# Url:: https://github.com/SergioFierens/ai4r
+#
+# A minimal linear Support Vector Machine implementation using
+# stochastic gradient descent. This implementation is intentionally
+# simple and only supports binary classification with numeric
+# attributes.
+
+require_relative '../data/data_set'
+require_relative 'classifier'
+
+module Ai4r
+  module Classifiers
+    # A lightweight linear SVM classifier trained via gradient descent.
+    # Only two classes are supported. Predictions return the same class
+    # labels used in the training data.
+    class SupportVectorMachine < Classifier
+      attr_reader :weights, :bias, :classes
+
+      parameters_info learning_rate: 'Learning rate for gradient descent.',
+                      iterations: 'Training iterations.',
+                      c: 'Regularization strength.'
+
+      def initialize
+        super()
+        @learning_rate = 0.01
+        @iterations = 1000
+        @c = 1.0
+        @weights = []
+        @bias = 0.0
+        @classes = []
+      end
+
+      # Train the SVM using the provided DataSet. Only numeric attributes and
+      # exactly two classes are supported.
+      def build(data_set)
+        data_set.check_not_empty
+        @classes = data_set.build_domains.last.to_a
+        raise ArgumentError, 'SVM only supports two classes' unless @classes.size == 2
+
+        num_features = data_set.data_labels.length - 1
+        @weights = Array.new(num_features, 0.0)
+        @bias = 0.0
+
+        samples = data_set.data_items.map do |row|
+          [row[0...-1].map(&:to_f), row.last]
+        end
+
+        @iterations.times do
+          samples.each do |features, label|
+            y = label == @classes[0] ? 1.0 : -1.0
+            prediction = dot(@weights, features) + @bias
+            if y * prediction < 1
+              @weights.map!.with_index do |w, i|
+                w + (@learning_rate * ((@c * y * features[i]) - (2 * w)))
+              end
+              @bias += @learning_rate * @c * y
+            else
+              @weights.map!.with_index { |w, _i| w - (@learning_rate * 2 * w) }
+            end
+          end
+        end
+        self
+      end
+
+      # Predict the class for the given numeric feature vector.
+      def eval(data)
+        score = dot(@weights, data.map(&:to_f)) + @bias
+        score >= 0 ? @classes[0] : @classes[1]
+      end
+
+      # Support Vector Machine classifiers cannot generate human readable rules.
+      # This method returns a string indicating rule extraction is unsupported.
+      def get_rules
+        'SupportVectorMachine does not support rule extraction.'
+      end
+
+      private
+
+      def dot(a, b)
+        sum = 0.0
+        a.each_index { |i| sum += a[i] * b[i] }
+        sum
+      end
+    end
+  end
+end
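A rough training sketch for the new classifier, assuming the two-class, numeric-only constraints stated in its comments; the tiny dataset and class labels below are invented:

  require 'ai4r'

  # Two numeric features; the last column is the class label (exactly two classes).
  items = [
    [1.0, 1.2, 'spam'],
    [0.9, 1.0, 'spam'],
    [-1.1, -0.8, 'ham'],
    [-0.9, -1.2, 'ham']
  ]
  data = Ai4r::Data::DataSet.new(data_labels: %w[f1 f2 class], data_items: items)

  svm = Ai4r::Classifiers::SupportVectorMachine.new
  svm.set_parameters(learning_rate: 0.05, iterations: 500, c: 1.0)
  svm.build(data)

  puts svm.eval([1.0, 1.1])   # expected to lean towards 'spam'
  puts svm.eval([-1.0, -1.0]) # expected to lean towards 'ham'

Because training is plain stochastic gradient descent on a hinge loss, results depend on learning_rate and iterations; if no parameters are set, the defaults assigned in initialize apply.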
data/lib/ai4r/classifiers/votes.rb (new file)

@@ -0,0 +1,57 @@
+# frozen_string_literal: true
+
+# Author:: Will Warner
+# License:: MPL 1.1
+# Project:: ai4r
+# Url:: https://github.com/SergioFierens/ai4r
+#
+# You can redistribute it and/or modify it under the terms of
+# the Mozilla Public License version 1.1 as published by the
+# Mozilla Foundation at http://www.mozilla.org/MPL/MPL-1.1.txt
+
+module Ai4r
+  module Classifiers
+    # Simple vote counter used by ensemble methods.
+    class Votes
+      # @return [Object]
+      def initialize
+        self.tally_sheet = Hash.new(0)
+      end
+
+      # @param category [Object]
+      # @return [Object]
+      def increment_category(category)
+        tally_sheet[category] += 1
+      end
+
+      # @param category [Object]
+      # @return [Object]
+      def tally_for(category)
+        tally_sheet[category]
+      end
+
+      # @param tie_break [Object]
+      # @return [Object]
+      def get_winner(tie_break = :last, rng: Random.new)
+        n = 0 # used to create a stable sort of the tallys
+        sorted_sheet = tally_sheet.sort_by do |_, score|
+          n += 1
+          [score, n]
+        end
+        return nil if sorted_sheet.empty?
+
+        if tie_break == :random
+          max_score = sorted_sheet.last[1]
+          tied = sorted_sheet.select { |_, score| score == max_score }.map(&:first)
+          tied.sample(random: rng)
+        else
+          sorted_sheet.last.first
+        end
+      end
+
+      private
+
+      attr_accessor :tally_sheet
+    end
+  end
+end
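Votes is an internal helper for the ensemble classifiers, but it can be exercised directly; a short sketch follows (category names are arbitrary, and the require path simply mirrors the file added above):

  require 'ai4r/classifiers/votes'

  votes = Ai4r::Classifiers::Votes.new
  votes.increment_category('Y')
  votes.increment_category('Y')
  votes.increment_category('N')

  puts votes.tally_for('Y')                            # => 2
  puts votes.get_winner                                # 'Y': highest tally wins
  puts votes.get_winner(:random, rng: Random.new(42))  # reproducible tie-breaking

With the default tie_break, the stable sort means a tie goes to the tied category that entered the tally sheet last; passing :random with an explicit rng keeps tie-breaking deterministic across runs.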
data/lib/ai4r/classifiers/zero_r.rb

@@ -1,73 +1,114 @@
+# frozen_string_literal: true
+
 # Author:: Sergio Fierens (Implementation only)
 # License:: MPL 1.1
 # Project:: ai4r
-# Url:: http://ai4r.org/
+# Url:: https://github.com/SergioFierens/ai4r
 #
-# You can redistribute it and/or modify it under the terms of
-# the Mozilla Public License version 1.1 as published by the
+# You can redistribute it and/or modify it under the terms of
+# the Mozilla Public License version 1.1 as published by the
 # Mozilla Foundation at http://www.mozilla.org/MPL/MPL-1.1.txt

-require File.dirname(__FILE__) + '/../data/data_set.rb'
-require File.dirname(__FILE__) + '/../classifiers/classifier'
+require_relative '../data/data_set'
+require_relative '../classifiers/classifier'

 module Ai4r
   module Classifiers
-
     # = Introduction
-    #
-    # The idea behind the ZeroR classifier is to identify the
-    # the most common class value in the training set.
-    # It always returns that value when evaluating an instance.
-    # It is frequently used as a baseline for evaluating other machine learning
+    #
+    # The idea behind the ZeroR classifier is to identify the
+    # the most common class value in the training set.
+    # It always returns that value when evaluating an instance.
+    # It is frequently used as a baseline for evaluating other machine learning
     # algorithms.
     class ZeroR < Classifier
-
       attr_reader :data_set, :class_value
-
+
+      parameters_info default_class: 'Return this value when the provided ' \
+                                     'dataset is empty.',
+                      tie_break: 'Strategy used when more than one class has the ' \
+                                 'same maximal frequency. Valid values are :first (default) ' \
+                                 'and :random.',
+                      random_seed: 'Seed for tie resolution when using :random strategy.'
+
+      # @return [Object]
+      def initialize
+        super()
+        @default_class = nil
+        @tie_break = :first
+        @random_seed = nil
+        @rng = nil
+      end
+
       # Build a new ZeroR classifier. You must provide a DataSet instance
-      # as parameter. The last attribute of each item is considered as
+      # as parameter. The last attribute of each item is considered as
      # the item class.
+      # @param data_set [Object]
+      # @return [Object]
      def build(data_set)
-        data_set.check_not_empty
        @data_set = data_set
-        frequencies = {}
+
+        if @data_set.data_items.empty?
+          @class_value = @default_class
+          return self
+        end
+
+        frequencies = Hash.new(0)
        max_freq = 0
-        @class_value = nil
+        tied_classes = []
+
        @data_set.data_items.each do |example|
          class_value = example.last
-          frequencies[class_value] = frequencies[class_value].nil? ? 1 : frequencies[class_value] + 1
+          frequencies[class_value] += 1
          class_frequency = frequencies[class_value]
-          if max_freq < class_frequency
+          if class_frequency > max_freq
            max_freq = class_frequency
-            @class_value = class_value
+            tied_classes = [class_value]
+          elsif class_frequency == max_freq && !tied_classes.include?(class_value)
+            tied_classes << class_value
          end
        end
-        return self
+
+        rng = @rng || (@random_seed.nil? ? Random.new : Random.new(@random_seed))
+
+        @class_value = if tied_classes.length == 1
+                         tied_classes.first
+                       else
+                         case @tie_break
+                         when :random
+                           tied_classes.sample(random: rng)
+                         else
+                           tied_classes.first
+                         end
+                       end
+
+        self
      end
-
+
      # You can evaluate new data, predicting its class.
      # e.g.
      #   classifier.eval(['New York', '<30', 'F']) # => 'Y'
-      def eval(data)
+      # @param data [Object]
+      # @return [Object]
+      def eval(_data)
        @class_value
      end
-
+
      # This method returns the generated rules in ruby code.
      # e.g.
-      #
+      #
      #   classifier.get_rules
      #     # => marketing_target='Y'
      #
-      # It is a nice way to inspect induction results, and also to execute them:
+      # It is a nice way to inspect induction results, and also to execute them:
      #   marketing_target = nil
-      #   eval classifier.get_rules
+      #   eval classifier.get_rules
      #   puts marketing_target
      #     # => 'Y'
+      # @return [Object]
      def get_rules
-        return "#{@data_set.data_labels.last} = '#{@class_value}'"
+        "#{@data_set.category_label} = '#{@class_value}'"
      end
-
    end
-
  end
 end
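A short sketch of the new tie handling, using a toy dataset in which 'Y' and 'N' are equally frequent (all values invented, and set_parameters again comes from Parameterizable):

  require 'ai4r'

  items = [
    ['New York', '<30',     'M', 'Y'],
    ['Chicago',  '<30',     'F', 'Y'],
    ['New York', '[30-50)', 'M', 'N'],
    ['Chicago',  '[30-50)', 'F', 'N']
  ]
  data = Ai4r::Data::DataSet.new(
    data_labels: %w[city age gender marketing_target],
    data_items: items
  )

  zero_r = Ai4r::Classifiers::ZeroR.new
  zero_r.set_parameters(tie_break: :random, random_seed: 1)
  zero_r.build(data)

  puts zero_r.eval(['Boston', '<30', 'F'])  # same class for every input
  puts zero_r.get_rules                     # => marketing_target = '...'

default_class only matters when build receives an empty dataset; in that case the classifier simply returns it from eval instead of raising.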
data/lib/ai4r/clusterers/average_linkage.rb

@@ -1,59 +1,78 @@
+# frozen_string_literal: true
+
 # Author:: Sergio Fierens (implementation)
 # License:: MPL 1.1
 # Project:: ai4r
-# Url:: http://ai4r.org/
+# Url:: https://github.com/SergioFierens/ai4r
 #
-# You can redistribute it and/or modify it under the terms of
-# the Mozilla Public License version 1.1 as published by the
+# You can redistribute it and/or modify it under the terms of
+# the Mozilla Public License version 1.1 as published by the
 # Mozilla Foundation at http://www.mozilla.org/MPL/MPL-1.1.txt

-require File.dirname(__FILE__) + '/../data/data_set'
-require File.dirname(__FILE__) + '/../clusterers/single_linkage'
+require_relative '../data/data_set'
+require_relative '../clusterers/single_linkage'
+require_relative '../clusterers/cluster_tree'

 module Ai4r
   module Clusterers
-
     # Implementation of a Hierarchical clusterer with group average
-    # linkage, AKA unweighted pair group method average or UPGMA (Everitt
+    # linkage, AKA unweighted pair group method average or UPGMA (Everitt
     # et al., 2001 ; Jain and Dubes, 1988 ; Sokal and Michener, 1958).
-    # Hierarchical clusterer create one cluster per element, and then
+    # Hierarchical clusterer create one cluster per element, and then
     # progressively merge clusters, until the required number of clusters
     # is reached.
-    # With average linkage, the distance between a clusters cx and
+    # With average linkage, the distance between a clusters cx and
     # cluster (ci U cj) the the average distance between cx and ci, and
     # cx and cj.
     #
     #   D(cx, (ci U cj) = (D(cx, ci) + D(cx, cj)) / 2
     class AverageLinkage < SingleLinkage
-
-      parameters_info :distance_function =>
-          "Custom implementation of distance function. " +
-          "It must be a closure receiving two data items and return the " +
-          "distance between them. By default, this algorithm uses " +
-          "euclidean distance of numeric attributes to the power of 2."
-
+      include ClusterTree
+
+      parameters_info distance_function:
+          'Custom implementation of distance function. ' \
+          'It must be a closure receiving two data items and return the ' \
+          'distance between them. By default, this algorithm uses ' \
+          'euclidean distance of numeric attributes to the power of 2.'
+
      # Build a new clusterer, using data examples found in data_set.
      # Items will be clustered in "number_of_clusters" different
      # clusters.
-      def build(data_set, number_of_clusters)
+      # @param data_set [Object]
+      # @param number_of_clusters [Object]
+      # @param *options [Object]
+      # @return [Object]
+      def build(data_set, number_of_clusters = 1, **options)
        super
      end
-
-      # This algorithms does not allow classification of new data items
+
+      # This algorithms does not allow classification of new data items
      # once it has been built. Rebuild the cluster including you data element.
-      def eval(data_item)
-        Raise "Eval of new data is not supported by this algorithm."
+      # @param _data_item [Object]
+      # @return [Object]
+      def eval(_data_item)
+        raise NotImplementedError, 'Eval of new data is not supported by this algorithm.'
      end
-
+
+      # Average linkage builds a dendrogram and cannot classify new data
+      # once built.
+      # @return [Object]
+      def supports_eval?
+        false
+      end
+
      protected
-
+
      # return distance between cluster cx and cluster (ci U cj),
      # using average linkage
-      def linkage_distance(cx, ci, cj)
-        (read_distance_matrix(cx, ci)+
-          read_distance_matrix(cx, cj))/2
+      # @param cx [Object]
+      # @param ci [Object]
+      # @param cj [Object]
+      # @return [Object]
+      def linkage_distance(cluster_x, cluster_i, cluster_j)
+        (read_distance_matrix(cluster_x, cluster_i) +
+         read_distance_matrix(cluster_x, cluster_j)) / 2
      end
-
    end
  end
 end
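Finally, a brief sketch of the updated build signature; the 2-D points are invented, and the clusters reader is assumed to be the one inherited from SingleLinkage:

  require 'ai4r'

  points = [[1, 1], [1, 2], [2, 1], [8, 8], [8, 9], [9, 8]]
  data = Ai4r::Data::DataSet.new(data_items: points)

  clusterer = Ai4r::Clusterers::AverageLinkage.new
  clusterer.build(data, 2)                  # number_of_clusters now defaults to 1

  clusterer.clusters.each { |cluster| p cluster.data_items }

  puts clusterer.supports_eval?             # => false
  begin
    clusterer.eval([5, 5])
  rescue NotImplementedError => e
    puts e.message                          # eval is still unsupported, but now raises a standard error class
  end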