ai4r 1.13 → 2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (129) hide show
  1. checksums.yaml +7 -0
  2. data/README.md +174 -0
  3. data/examples/classifiers/hyperpipes_data.csv +14 -0
  4. data/examples/classifiers/hyperpipes_example.rb +22 -0
  5. data/examples/classifiers/ib1_example.rb +12 -0
  6. data/examples/classifiers/id3_example.rb +15 -10
  7. data/examples/classifiers/id3_graphviz_example.rb +17 -0
  8. data/examples/classifiers/logistic_regression_example.rb +11 -0
  9. data/examples/classifiers/naive_bayes_attributes_example.rb +13 -0
  10. data/examples/classifiers/naive_bayes_example.rb +12 -13
  11. data/examples/classifiers/one_r_example.rb +27 -0
  12. data/examples/classifiers/parameter_tutorial.rb +29 -0
  13. data/examples/classifiers/prism_nominal_example.rb +15 -0
  14. data/examples/classifiers/prism_numeric_example.rb +21 -0
  15. data/examples/classifiers/simple_linear_regression_example.rb +14 -11
  16. data/examples/classifiers/zero_and_one_r_example.rb +34 -0
  17. data/examples/classifiers/zero_one_r_data.csv +8 -0
  18. data/examples/clusterers/clusterer_example.rb +40 -34
  19. data/examples/clusterers/dbscan_example.rb +17 -0
  20. data/examples/clusterers/dendrogram_example.rb +17 -0
  21. data/examples/clusterers/hierarchical_dendrogram_example.rb +20 -0
  22. data/examples/clusterers/kmeans_custom_example.rb +26 -0
  23. data/examples/genetic_algorithm/bitstring_example.rb +41 -0
  24. data/examples/genetic_algorithm/genetic_algorithm_example.rb +26 -18
  25. data/examples/genetic_algorithm/kmeans_seed_tuning.rb +45 -0
  26. data/examples/neural_network/backpropagation_example.rb +48 -48
  27. data/examples/neural_network/hopfield_example.rb +45 -0
  28. data/examples/neural_network/patterns_with_base_noise.rb +39 -39
  29. data/examples/neural_network/patterns_with_noise.rb +41 -39
  30. data/examples/neural_network/train_epochs_callback.rb +25 -0
  31. data/examples/neural_network/training_patterns.rb +39 -39
  32. data/examples/neural_network/transformer_text_classification.rb +78 -0
  33. data/examples/neural_network/xor_example.rb +23 -22
  34. data/examples/reinforcement/q_learning_example.rb +10 -0
  35. data/examples/som/som_data.rb +155 -152
  36. data/examples/som/som_multi_node_example.rb +12 -13
  37. data/examples/som/som_single_example.rb +12 -15
  38. data/examples/transformer/decode_classifier_example.rb +68 -0
  39. data/examples/transformer/deterministic_example.rb +10 -0
  40. data/examples/transformer/seq2seq_example.rb +16 -0
  41. data/lib/ai4r/classifiers/classifier.rb +24 -16
  42. data/lib/ai4r/classifiers/gradient_boosting.rb +64 -0
  43. data/lib/ai4r/classifiers/hyperpipes.rb +119 -43
  44. data/lib/ai4r/classifiers/ib1.rb +122 -32
  45. data/lib/ai4r/classifiers/id3.rb +524 -145
  46. data/lib/ai4r/classifiers/logistic_regression.rb +96 -0
  47. data/lib/ai4r/classifiers/multilayer_perceptron.rb +75 -59
  48. data/lib/ai4r/classifiers/naive_bayes.rb +95 -34
  49. data/lib/ai4r/classifiers/one_r.rb +112 -44
  50. data/lib/ai4r/classifiers/prism.rb +167 -76
  51. data/lib/ai4r/classifiers/random_forest.rb +72 -0
  52. data/lib/ai4r/classifiers/simple_linear_regression.rb +83 -58
  53. data/lib/ai4r/classifiers/support_vector_machine.rb +91 -0
  54. data/lib/ai4r/classifiers/votes.rb +57 -0
  55. data/lib/ai4r/classifiers/zero_r.rb +71 -30
  56. data/lib/ai4r/clusterers/average_linkage.rb +46 -27
  57. data/lib/ai4r/clusterers/bisecting_k_means.rb +50 -44
  58. data/lib/ai4r/clusterers/centroid_linkage.rb +52 -36
  59. data/lib/ai4r/clusterers/cluster_tree.rb +50 -0
  60. data/lib/ai4r/clusterers/clusterer.rb +29 -14
  61. data/lib/ai4r/clusterers/complete_linkage.rb +42 -31
  62. data/lib/ai4r/clusterers/dbscan.rb +134 -0
  63. data/lib/ai4r/clusterers/diana.rb +75 -49
  64. data/lib/ai4r/clusterers/k_means.rb +270 -135
  65. data/lib/ai4r/clusterers/median_linkage.rb +49 -33
  66. data/lib/ai4r/clusterers/single_linkage.rb +196 -88
  67. data/lib/ai4r/clusterers/ward_linkage.rb +51 -35
  68. data/lib/ai4r/clusterers/ward_linkage_hierarchical.rb +25 -10
  69. data/lib/ai4r/clusterers/weighted_average_linkage.rb +48 -32
  70. data/lib/ai4r/data/data_set.rb +223 -103
  71. data/lib/ai4r/data/parameterizable.rb +31 -25
  72. data/lib/ai4r/data/proximity.rb +62 -62
  73. data/lib/ai4r/data/statistics.rb +46 -35
  74. data/lib/ai4r/experiment/classifier_evaluator.rb +84 -32
  75. data/lib/ai4r/experiment/split.rb +39 -0
  76. data/lib/ai4r/genetic_algorithm/chromosome_base.rb +43 -0
  77. data/lib/ai4r/genetic_algorithm/genetic_algorithm.rb +92 -170
  78. data/lib/ai4r/genetic_algorithm/tsp_chromosome.rb +83 -0
  79. data/lib/ai4r/hmm/hidden_markov_model.rb +134 -0
  80. data/lib/ai4r/neural_network/activation_functions.rb +37 -0
  81. data/lib/ai4r/neural_network/backpropagation.rb +399 -134
  82. data/lib/ai4r/neural_network/hopfield.rb +175 -58
  83. data/lib/ai4r/neural_network/transformer.rb +194 -0
  84. data/lib/ai4r/neural_network/weight_initializations.rb +40 -0
  85. data/lib/ai4r/reinforcement/policy_iteration.rb +66 -0
  86. data/lib/ai4r/reinforcement/q_learning.rb +51 -0
  87. data/lib/ai4r/search/a_star.rb +76 -0
  88. data/lib/ai4r/search/bfs.rb +50 -0
  89. data/lib/ai4r/search/dfs.rb +50 -0
  90. data/lib/ai4r/search/mcts.rb +118 -0
  91. data/lib/ai4r/search.rb +12 -0
  92. data/lib/ai4r/som/distance_metrics.rb +29 -0
  93. data/lib/ai4r/som/layer.rb +28 -17
  94. data/lib/ai4r/som/node.rb +61 -32
  95. data/lib/ai4r/som/som.rb +158 -41
  96. data/lib/ai4r/som/two_phase_layer.rb +21 -25
  97. data/lib/ai4r/version.rb +3 -0
  98. data/lib/ai4r.rb +57 -28
  99. metadata +79 -109
  100. data/README.rdoc +0 -39
  101. data/test/classifiers/hyperpipes_test.rb +0 -84
  102. data/test/classifiers/ib1_test.rb +0 -78
  103. data/test/classifiers/id3_test.rb +0 -220
  104. data/test/classifiers/multilayer_perceptron_test.rb +0 -79
  105. data/test/classifiers/naive_bayes_test.rb +0 -43
  106. data/test/classifiers/one_r_test.rb +0 -62
  107. data/test/classifiers/prism_test.rb +0 -85
  108. data/test/classifiers/simple_linear_regression_test.rb +0 -37
  109. data/test/classifiers/zero_r_test.rb +0 -50
  110. data/test/clusterers/average_linkage_test.rb +0 -51
  111. data/test/clusterers/bisecting_k_means_test.rb +0 -66
  112. data/test/clusterers/centroid_linkage_test.rb +0 -53
  113. data/test/clusterers/complete_linkage_test.rb +0 -57
  114. data/test/clusterers/diana_test.rb +0 -69
  115. data/test/clusterers/k_means_test.rb +0 -167
  116. data/test/clusterers/median_linkage_test.rb +0 -53
  117. data/test/clusterers/single_linkage_test.rb +0 -122
  118. data/test/clusterers/ward_linkage_hierarchical_test.rb +0 -81
  119. data/test/clusterers/ward_linkage_test.rb +0 -53
  120. data/test/clusterers/weighted_average_linkage_test.rb +0 -53
  121. data/test/data/data_set_test.rb +0 -104
  122. data/test/data/proximity_test.rb +0 -87
  123. data/test/data/statistics_test.rb +0 -65
  124. data/test/experiment/classifier_evaluator_test.rb +0 -76
  125. data/test/genetic_algorithm/chromosome_test.rb +0 -57
  126. data/test/genetic_algorithm/genetic_algorithm_test.rb +0 -81
  127. data/test/neural_network/backpropagation_test.rb +0 -82
  128. data/test/neural_network/hopfield_test.rb +0 -72
  129. data/test/som/som_test.rb +0 -97
@@ -1,64 +1,70 @@
# frozen_string_literal: true

# Author:: Sergio Fierens
# License:: MPL 1.1
# Project:: ai4r
# Url:: https://github.com/SergioFierens/ai4r
#
# You can redistribute it and/or modify it under the terms of
# the Mozilla Public License version 1.1 as published by the
# Mozilla Foundation at http://www.mozilla.org/MPL/MPL-1.1.txt

module Ai4r
  module Data
    # Mix-in that lets an algorithm declare which of its attributes are
    # user-configurable parameters, and read or write them in bulk.
    module Parameterizable
      # Class-level helpers added to the host class on inclusion.
      module ClassMethods
        # Describe the configurable parameters of this algorithm.
        # @return [Hash] { :param_name => "Info on the parameter" }
        def get_parameters_info
          @_params_info_ || {}
        end

        # Declare configurable parameters, merging with any declared earlier.
        # An attr_accessor is generated for each new parameter name unless the
        # class already defines a reader or a writer for it.
        # @param params_info [Hash] { :param_name => "Info on the parameter" }
        # @return [Object]
        def parameters_info(params_info)
          @_params_info_ = get_parameters_info.merge(params_info)
          params_info.each_key do |name|
            next if method_defined?(name) || method_defined?("#{name}=")

            attr_accessor name
          end
        end
      end

      # Assign several parameter values at once.
      # Keys without a matching public writer are silently ignored.
      # @param params [Hash] { :param_name => parameter_value }
      # @return [self]
      def set_parameters(params)
        params.each do |name, value|
          writer = "#{name}="
          public_send(writer, value) if respond_to?(writer)
        end
        self
      end

      # Snapshot the current values of all declared parameters.
      # @return [Hash] { :param_name => parameter_value }
      def get_parameters
        self.class.get_parameters_info.each_key.with_object({}) do |name, snapshot|
          snapshot[name] = send(name) if respond_to?(name)
        end
      end

      # Inclusion hook: extend the including class with ClassMethods.
      # @param base [Class]
      # @return [Object]
      def self.included(base)
        base.extend(ClassMethods)
      end
    end
  end
end
@@ -1,122 +1,122 @@
# frozen_string_literal: true

# Author:: Sergio Fierens
# License:: MPL 1.1
# Project:: ai4r
# Url:: https://github.com/SergioFierens/ai4r
#
# You can redistribute it and/or modify it under the terms of
# the Mozilla Public License version 1.1 as published by the
# Mozilla Foundation at http://www.mozilla.org/MPL/MPL-1.1.txt

module Ai4r
  module Data
    # Classical distance (and dissimilarity) functions between two vectors.
    # Every helper is a module function: call them as Proximity.xxx(a, b).
    module Proximity
      module_function

      # Squared Euclidean distance: a cheaper stand-in for the Euclidean
      # distance when only the relative ordering of distances matters.
      # Both arguments are equal-length vectors of continuous attributes.
      # @return [Float]
      def squared_euclidean_distance(vec_a, vec_b)
        vec_a.zip(vec_b).sum(0.0) { |x, y| (x - y)**2 }
      end

      # Euclidean distance, or L2 norm.
      # Euclidean distance tends to form hyperspherical clusters
      # (Clustering, Xu and Wunsch, 2009). Translations and rotations do not
      # distort distance relations (Duda et al, 2001). If attributes are
      # measured with different units, attributes with larger values and
      # variance will dominate the metric.
      # @return [Float]
      def euclidean_distance(vec_a, vec_b)
        Math.sqrt(squared_euclidean_distance(vec_a, vec_b))
      end

      # City block, Manhattan distance, or L1 norm.
      # @return [Float]
      def manhattan_distance(vec_a, vec_b)
        vec_a.zip(vec_b).sum(0.0) { |x, y| (x - y).abs }
      end

      # Sup distance, or L-infinity norm: the largest absolute
      # per-component difference.
      # @return [Float]
      def sup_distance(vec_a, vec_b)
        vec_a.each_with_index.reduce(0.0) do |largest, (component, i)|
          gap = (component - vec_b[i]).abs
          gap > largest ? gap : largest
        end
      end

      # The Hamming distance between two equal-length attribute vectors is
      # the number of positions at which they differ. Frequently used with
      # binary attributes, though any discrete attributes work.
      # @return [Integer]
      def hamming_distance(vec_a, vec_b)
        vec_a.each_index.count { |i| vec_a[i] != vec_b[i] }
      end

      # The "Simple matching" distance between two attribute sets, based on
      # how many values appear in both vectors. With set lengths da and db:
      #
      #   S = 2/(da + db) * Number of values present on both sets
      #   D = 1.0/S - 1
      #
      # Considerations:
      # * vec_a and vec_b must not include repeated items
      # * all attributes are treated equally
      # @return [Float]
      def simple_matching_distance(vec_a, vec_b)
        shared = vec_a.count { |item| vec_b.include?(item) }
        similarity = (2.0 * shared) / (vec_a.length + vec_b.length)
        (1.0 / similarity) - 1
      end

      # Cosine distance, derived from the cosine of the angle between two
      # vectors of an inner product space
      # (http://en.wikipedia.org/wiki/Cosine_similarity).
      #
      #   D = 1 - sum(a[i] * b[i]) / (sqrt(sum(a[i]**2)) * sqrt(sum(b[i]**2)))
      # @return [Float]
      def cosine_distance(vec_a, vec_b)
        dot = 0.0
        norm_a = 0.0
        norm_b = 0.0

        vec_a.zip(vec_b).each do |x, y|
          dot += x * y
          norm_a += x**2
          norm_b += y**2
        end

        1 - (dot / (Math.sqrt(norm_a) * Math.sqrt(norm_b)))
      end
    end
  end
end
@@ -1,77 +1,88 @@
# frozen_string_literal: true

# Author:: Sergio Fierens
# License:: MPL 1.1
# Project:: ai4r
# Url:: https://github.com/SergioFierens/ai4r
#
# You can redistribute it and/or modify it under the terms of
# the Mozilla Public License version 1.1 as published by the
# Mozilla Foundation at http://www.mozilla.org/MPL/MPL-1.1.txt

module Ai4r
  module Data
    # Basic statistics functions operating on one attribute (column)
    # of a data set. The data set only needs to respond to
    # #get_index and #data_items.
    module Statistics
      class << self
        # Sample mean of the given attribute.
        # @param data_set [Object]
        # @param attribute [Object]
        # @return [Float]
        def mean(data_set, attribute)
          column = data_set.get_index(attribute)
          data_set.data_items.sum(0.0) { |row| row[column] } /
            data_set.data_items.length
        end

        # Sample variance (n - 1 denominator).
        # You can provide the mean if you have it already, to speed up things.
        # @param data_set [Object]
        # @param attribute [Object]
        # @param mean [Object]
        # @return [Float]
        def variance(data_set, attribute, mean = nil)
          column = data_set.get_index(attribute)
          mean ||= mean(data_set, attribute)
          squared_error = data_set.data_items.sum(0.0) do |row|
            (row[column] - mean)**2
          end
          squared_error / (data_set.data_items.length - 1)
        end

        # Sample standard deviation.
        # You can provide the variance if you have it already, to speed up
        # things.
        # @param data_set [Object]
        # @param attribute [Object]
        # @param variance [Object]
        # @return [Float]
        def standard_deviation(data_set, attribute, variance = nil)
          variance ||= variance(data_set, attribute)
          Math.sqrt(variance)
        end

        # Sample mode: the most frequent value of the attribute
        # (first-seen value wins ties; nil for an empty data set).
        # @param data_set [Object]
        # @param attribute [Object]
        # @return [Object]
        def mode(data_set, attribute)
          column = data_set.get_index(attribute)
          counts = Hash.new(0)
          data_set.data_items.each { |row| counts[row[column]] += 1 }
          counts.max_by { |_value, count| count }&.first
        end

        # Maximum value of the attribute (-Infinity for an empty data set).
        # @param data_set [Object]
        # @param attribute [Object]
        # @return [Object]
        def max(data_set, attribute)
          column = data_set.get_index(attribute)
          best = data_set.data_items.max_by { |row| row[column] }
          best.nil? ? -Float::INFINITY : best[column]
        end

        # Minimum value of the attribute (Infinity for an empty data set).
        # @param data_set [Object]
        # @param attribute [Object]
        # @return [Object]
        def min(data_set, attribute)
          column = data_set.get_index(attribute)
          best = data_set.data_items.min_by { |row| row[column] }
          best.nil? ? Float::INFINITY : best[column]
        end
      end
    end
  end
end
@@ -1,82 +1,137 @@
1
- require 'benchmark'
2
- require File.dirname(__FILE__) + '/../data/data_set'
1
+ # frozen_string_literal: true
3
2
 
3
+ require 'benchmark'
4
+ require_relative '../data/data_set'
5
+ require_relative 'split'
4
6
 
5
7
module Ai4r
  module Experiment
    # Compares several classification algorithms under identical conditions.
    # Every registered classifier is built from the same data examples, and
    # the evaluator reports build times, evaluation times, error counts and
    # success rates so that different algorithms (or the same algorithm with
    # different parameters, or your own new algorithm) can be ranked side
    # by side.
    class ClassifierEvaluator
      attr_reader :build_times, :eval_times, :classifiers

      # @return [Object]
      def initialize
        @classifiers = []
      end

      # Register a classifier instance in the test batch.
      # @param classifier [Object] anything responding to #build and #eval
      # @return [self]
      def add_classifier(classifier)
        @classifiers << classifier
        self
      end

      alias << add_classifier

      # Build every registered classifier from data examples found in
      # +data_set+. The last attribute of each item is considered as the
      # item class. Per-classifier build times are captured in #build_times.
      # @param data_set [Object]
      # @return [self]
      def build(data_set)
        @build_times = @classifiers.map do |classifier|
          Benchmark.measure { classifier.build data_set }
        end
        self
      end

      # Predict the class of new data with every classifier.
      # e.g.
      #   classifier.eval(['New York', '<30', 'F'])
      #   => ['Y', 'Y', 'Y', 'N', 'Y', 'Y', 'N']
      # Per-classifier evaluation times are captured in #eval_times.
      # @param data [Object]
      # @return [Array] one prediction per registered classifier
      def eval(data)
        @eval_times = []
        predictions = []
        @classifiers.each do |classifier|
          @eval_times << Benchmark.measure { predictions << classifier.eval(data) }
        end
        predictions
      end

      # Test all classifiers against +data_set+; the last attribute of each
      # item is the expected class. Returns a data set with one row per
      # classifier and the attributes:
      #   ["Classifier", "Testing Time", "Errors", "Success rate"]
      # @param data_set [Object]
      # @return [Object]
      def test(data_set)
        rows = @classifiers.map { |classifier| test_classifier(classifier, data_set) }

        Ai4r::Data::DataSet.new(
          data_items: rows,
          data_labels: ['Classifier', 'Testing Time', 'Errors', 'Success rate']
        )
      end

      # Perform k-fold cross validation on all classifiers.
      # The dataset is split into +k+ folds using the Split utility. For each
      # fold, classifiers are trained on the remaining folds and then tested
      # on the held-out fold. Returns a DataSet with the average time (build
      # and test) and accuracy for each classifier.
      # @param data_set [Ai4r::Data::DataSet] data to evaluate
      # @param k [Integer] number of folds
      # @return [Ai4r::Data::DataSet]
      def cross_validate(data_set, k:)
        folds = Split.split(data_set, k: k)
        total_time = Array.new(@classifiers.length, 0.0)
        total_accuracy = Array.new(@classifiers.length, 0.0)

        folds.each_with_index do |held_out, fold_idx|
          # Train on every fold except the held-out one.
          training_rows = folds.each_with_index
                               .reject { |_fold, j| j == fold_idx }
                               .flat_map { |fold, _j| fold.data_items }
          training_set = Ai4r::Data::DataSet.new(
            data_items: training_rows,
            data_labels: data_set.data_labels
          )

          @classifiers.each_with_index do |classifier, idx|
            build_seconds = Benchmark.measure { classifier.build(training_set) }.real
            _, test_seconds, _, success_rate = test_classifier(classifier, held_out)
            total_time[idx] += build_seconds + test_seconds
            total_accuracy[idx] += success_rate
          end
        end

        averaged = @classifiers.each_with_index.map do |classifier, idx|
          [classifier, total_time[idx] / k, total_accuracy[idx] / k]
        end
        Ai4r::Data::DataSet.new(
          data_items: averaged,
          data_labels: ['Classifier', 'Avg. Time', 'Avg. Success rate']
        )
      end

      private

      # Evaluate one classifier against a data set, timing the whole pass.
      # @param classifier [Object]
      # @param data_set [Object]
      # @return [Array] [classifier, seconds, errors, success_rate]
      def test_classifier(classifier, data_set)
        total = data_set.data_items.length
        errors = 0
        elapsed = Benchmark.measure do
          data_set.data_items.each do |row|
            predicted = classifier.eval(row[0...-1])
            errors += 1 if predicted != row.last
          end
        end
        [classifier, elapsed.real, errors,
         (total - (errors * 1.0)) / total]
      end
    end
  end
end
@@ -0,0 +1,39 @@
1
+ # frozen_string_literal: true
2
+
3
+ # Author:: Sergio Fierens
4
+ # License:: MPL 1.1
5
+ # Project:: ai4r
6
+ # Url:: https://github.com/SergioFierens/ai4r
7
+ #
8
+ # You can redistribute it and/or modify it under the terms of
9
+ # the Mozilla Public License version 1.1 as published by the
10
+ # Mozilla Foundation at http://www.mozilla.org/MPL/MPL-1.1.txt
11
+
12
+ require_relative '../data/data_set'
13
+
14
+ module Ai4r
15
+ module Experiment
16
+ # Utility methods for experiment workflows.
17
+ module Split
18
+ module_function
19
+
20
+ # Split a dataset into +k+ folds.
21
+ # @param data_set [Ai4r::Data::DataSet] dataset to split
22
+ # @param k [Integer] number of folds
23
+ # @return [Array<Ai4r::Data::DataSet>] list of folds
24
+ def split(data_set, k:)
25
+ raise ArgumentError, 'k must be greater than 0' unless k.positive?
26
+
27
+ items = data_set.data_items.dup
28
+ labels = data_set.data_labels
29
+ fold_size = (items.length.to_f / k).ceil
30
+ folds = []
31
+ k.times do |i|
32
+ part = items.slice(i * fold_size, fold_size) || []
33
+ folds << Ai4r::Data::DataSet.new(data_items: part, data_labels: labels)
34
+ end
35
+ folds
36
+ end
37
+ end
38
+ end
39
+ end
@@ -0,0 +1,43 @@
1
# frozen_string_literal: true

module Ai4r
  module GeneticAlgorithm
    # Abstract parent for chromosomes consumed by GeneticSearch.
    # Concrete subclasses must provide the class methods +seed+, +mutate+
    # and +reproduce+, plus the instance method +fitness+.
    class ChromosomeBase
      attr_accessor :data, :normalized_fitness

      # @param data [Object] genetic payload carried by this chromosome
      # @return [Object]
      def initialize(data = nil)
        @data = data
      end

      # Fitness score of this chromosome; higher is better.
      # @raise [NotImplementedError] unless overridden by a subclass
      # @return [Object]
      def fitness
        raise NotImplementedError, 'Subclasses must implement #fitness'
      end

      class << self
        # Build a chromosome to seed the initial population.
        # @raise [NotImplementedError] unless overridden by a subclass
        # @return [Object]
        def seed
          raise NotImplementedError, 'Implement .seed in subclass'
        end

        # Combine two parent chromosomes into an offspring.
        # @param _a [Object] first parent
        # @param _b [Object] second parent
        # @param _crossover_rate [Object] probability of crossover
        # @raise [NotImplementedError] unless overridden by a subclass
        # @return [Object]
        def reproduce(_a, _b, _crossover_rate = 0.4)
          raise NotImplementedError, 'Implement .reproduce in subclass'
        end

        # Randomly perturb a chromosome in place.
        # @param _chromosome [Object] chromosome to mutate
        # @param _mutation_rate [Object] probability of mutation
        # @raise [NotImplementedError] unless overridden by a subclass
        # @return [Object]
        def mutate(_chromosome, _mutation_rate = 0.3)
          raise NotImplementedError, 'Implement .mutate in subclass'
        end
      end
    end
  end
end