ai4r 1.13 → 2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/README.md +174 -0
- data/examples/classifiers/hyperpipes_data.csv +14 -0
- data/examples/classifiers/hyperpipes_example.rb +22 -0
- data/examples/classifiers/ib1_example.rb +12 -0
- data/examples/classifiers/id3_example.rb +15 -10
- data/examples/classifiers/id3_graphviz_example.rb +17 -0
- data/examples/classifiers/logistic_regression_example.rb +11 -0
- data/examples/classifiers/naive_bayes_attributes_example.rb +13 -0
- data/examples/classifiers/naive_bayes_example.rb +12 -13
- data/examples/classifiers/one_r_example.rb +27 -0
- data/examples/classifiers/parameter_tutorial.rb +29 -0
- data/examples/classifiers/prism_nominal_example.rb +15 -0
- data/examples/classifiers/prism_numeric_example.rb +21 -0
- data/examples/classifiers/simple_linear_regression_example.rb +14 -11
- data/examples/classifiers/zero_and_one_r_example.rb +34 -0
- data/examples/classifiers/zero_one_r_data.csv +8 -0
- data/examples/clusterers/clusterer_example.rb +40 -34
- data/examples/clusterers/dbscan_example.rb +17 -0
- data/examples/clusterers/dendrogram_example.rb +17 -0
- data/examples/clusterers/hierarchical_dendrogram_example.rb +20 -0
- data/examples/clusterers/kmeans_custom_example.rb +26 -0
- data/examples/genetic_algorithm/bitstring_example.rb +41 -0
- data/examples/genetic_algorithm/genetic_algorithm_example.rb +26 -18
- data/examples/genetic_algorithm/kmeans_seed_tuning.rb +45 -0
- data/examples/neural_network/backpropagation_example.rb +48 -48
- data/examples/neural_network/hopfield_example.rb +45 -0
- data/examples/neural_network/patterns_with_base_noise.rb +39 -39
- data/examples/neural_network/patterns_with_noise.rb +41 -39
- data/examples/neural_network/train_epochs_callback.rb +25 -0
- data/examples/neural_network/training_patterns.rb +39 -39
- data/examples/neural_network/transformer_text_classification.rb +78 -0
- data/examples/neural_network/xor_example.rb +23 -22
- data/examples/reinforcement/q_learning_example.rb +10 -0
- data/examples/som/som_data.rb +155 -152
- data/examples/som/som_multi_node_example.rb +12 -13
- data/examples/som/som_single_example.rb +12 -15
- data/examples/transformer/decode_classifier_example.rb +68 -0
- data/examples/transformer/deterministic_example.rb +10 -0
- data/examples/transformer/seq2seq_example.rb +16 -0
- data/lib/ai4r/classifiers/classifier.rb +24 -16
- data/lib/ai4r/classifiers/gradient_boosting.rb +64 -0
- data/lib/ai4r/classifiers/hyperpipes.rb +119 -43
- data/lib/ai4r/classifiers/ib1.rb +122 -32
- data/lib/ai4r/classifiers/id3.rb +524 -145
- data/lib/ai4r/classifiers/logistic_regression.rb +96 -0
- data/lib/ai4r/classifiers/multilayer_perceptron.rb +75 -59
- data/lib/ai4r/classifiers/naive_bayes.rb +95 -34
- data/lib/ai4r/classifiers/one_r.rb +112 -44
- data/lib/ai4r/classifiers/prism.rb +167 -76
- data/lib/ai4r/classifiers/random_forest.rb +72 -0
- data/lib/ai4r/classifiers/simple_linear_regression.rb +83 -58
- data/lib/ai4r/classifiers/support_vector_machine.rb +91 -0
- data/lib/ai4r/classifiers/votes.rb +57 -0
- data/lib/ai4r/classifiers/zero_r.rb +71 -30
- data/lib/ai4r/clusterers/average_linkage.rb +46 -27
- data/lib/ai4r/clusterers/bisecting_k_means.rb +50 -44
- data/lib/ai4r/clusterers/centroid_linkage.rb +52 -36
- data/lib/ai4r/clusterers/cluster_tree.rb +50 -0
- data/lib/ai4r/clusterers/clusterer.rb +29 -14
- data/lib/ai4r/clusterers/complete_linkage.rb +42 -31
- data/lib/ai4r/clusterers/dbscan.rb +134 -0
- data/lib/ai4r/clusterers/diana.rb +75 -49
- data/lib/ai4r/clusterers/k_means.rb +270 -135
- data/lib/ai4r/clusterers/median_linkage.rb +49 -33
- data/lib/ai4r/clusterers/single_linkage.rb +196 -88
- data/lib/ai4r/clusterers/ward_linkage.rb +51 -35
- data/lib/ai4r/clusterers/ward_linkage_hierarchical.rb +25 -10
- data/lib/ai4r/clusterers/weighted_average_linkage.rb +48 -32
- data/lib/ai4r/data/data_set.rb +223 -103
- data/lib/ai4r/data/parameterizable.rb +31 -25
- data/lib/ai4r/data/proximity.rb +62 -62
- data/lib/ai4r/data/statistics.rb +46 -35
- data/lib/ai4r/experiment/classifier_evaluator.rb +84 -32
- data/lib/ai4r/experiment/split.rb +39 -0
- data/lib/ai4r/genetic_algorithm/chromosome_base.rb +43 -0
- data/lib/ai4r/genetic_algorithm/genetic_algorithm.rb +92 -170
- data/lib/ai4r/genetic_algorithm/tsp_chromosome.rb +83 -0
- data/lib/ai4r/hmm/hidden_markov_model.rb +134 -0
- data/lib/ai4r/neural_network/activation_functions.rb +37 -0
- data/lib/ai4r/neural_network/backpropagation.rb +399 -134
- data/lib/ai4r/neural_network/hopfield.rb +175 -58
- data/lib/ai4r/neural_network/transformer.rb +194 -0
- data/lib/ai4r/neural_network/weight_initializations.rb +40 -0
- data/lib/ai4r/reinforcement/policy_iteration.rb +66 -0
- data/lib/ai4r/reinforcement/q_learning.rb +51 -0
- data/lib/ai4r/search/a_star.rb +76 -0
- data/lib/ai4r/search/bfs.rb +50 -0
- data/lib/ai4r/search/dfs.rb +50 -0
- data/lib/ai4r/search/mcts.rb +118 -0
- data/lib/ai4r/search.rb +12 -0
- data/lib/ai4r/som/distance_metrics.rb +29 -0
- data/lib/ai4r/som/layer.rb +28 -17
- data/lib/ai4r/som/node.rb +61 -32
- data/lib/ai4r/som/som.rb +158 -41
- data/lib/ai4r/som/two_phase_layer.rb +21 -25
- data/lib/ai4r/version.rb +3 -0
- data/lib/ai4r.rb +57 -28
- metadata +79 -109
- data/README.rdoc +0 -39
- data/test/classifiers/hyperpipes_test.rb +0 -84
- data/test/classifiers/ib1_test.rb +0 -78
- data/test/classifiers/id3_test.rb +0 -220
- data/test/classifiers/multilayer_perceptron_test.rb +0 -79
- data/test/classifiers/naive_bayes_test.rb +0 -43
- data/test/classifiers/one_r_test.rb +0 -62
- data/test/classifiers/prism_test.rb +0 -85
- data/test/classifiers/simple_linear_regression_test.rb +0 -37
- data/test/classifiers/zero_r_test.rb +0 -50
- data/test/clusterers/average_linkage_test.rb +0 -51
- data/test/clusterers/bisecting_k_means_test.rb +0 -66
- data/test/clusterers/centroid_linkage_test.rb +0 -53
- data/test/clusterers/complete_linkage_test.rb +0 -57
- data/test/clusterers/diana_test.rb +0 -69
- data/test/clusterers/k_means_test.rb +0 -167
- data/test/clusterers/median_linkage_test.rb +0 -53
- data/test/clusterers/single_linkage_test.rb +0 -122
- data/test/clusterers/ward_linkage_hierarchical_test.rb +0 -81
- data/test/clusterers/ward_linkage_test.rb +0 -53
- data/test/clusterers/weighted_average_linkage_test.rb +0 -53
- data/test/data/data_set_test.rb +0 -104
- data/test/data/proximity_test.rb +0 -87
- data/test/data/statistics_test.rb +0 -65
- data/test/experiment/classifier_evaluator_test.rb +0 -76
- data/test/genetic_algorithm/chromosome_test.rb +0 -57
- data/test/genetic_algorithm/genetic_algorithm_test.rb +0 -81
- data/test/neural_network/backpropagation_test.rb +0 -82
- data/test/neural_network/hopfield_test.rb +0 -72
- data/test/som/som_test.rb +0 -97
data/lib/ai4r/data/parameterizable.rb CHANGED

@@ -1,64 +1,70 @@
+# frozen_string_literal: true
+
 # Author:: Sergio Fierens
 # License:: MPL 1.1
 # Project:: ai4r
-# Url::
+# Url:: https://github.com/SergioFierens/ai4r
 #
-# You can redistribute it and/or modify it under the terms of
-# the Mozilla Public License version 1.1 as published by the
+# You can redistribute it and/or modify it under the terms of
+# the Mozilla Public License version 1.1 as published by the
 # Mozilla Foundation at http://www.mozilla.org/MPL/MPL-1.1.txt
 
 module Ai4r
   module Data
+    # Mix-in to declare configurable parameters for algorithms.
     module Parameterizable
-
+      # Class-level helpers for Parameterizable.
       module ClassMethods
-
         # Get info on what can be parameterized on this algorithm.
         # It returns a hash with the following format:
         # { :param_name => "Info on the parameter" }
+        # @return [Object]
         def get_parameters_info
-
+          @_params_info_ || {}
         end
-
+
         # Set info on what can be parameterized on this algorithm.
         # You must provide a hash with the following format:
-        # { :param_name => "Info on the parameter" }
+        # { :param_name => "Info on the parameter" }
+        # @param params_info [Object]
+        # @return [Object]
         def parameters_info(params_info)
-          @_params_info_ = params_info
-          params_info.
-          attr_accessor param
+          @_params_info_ = get_parameters_info.merge(params_info)
+          params_info.each_key do |param|
+            attr_accessor param unless method_defined?(param) || method_defined?("#{param}=")
          end
        end
      end
-
+
      # Set parameter values on this algorithm instance.
-      # You must provide a hash with the
+      # You must provide a hash with the following format:
      # { :param_name => parameter_value }
+      # @param params [Object]
+      # @return [Object]
      def set_parameters(params)
-
-        if
-        send("#{key}=".to_sym, params[key]) if params.has_key? key
-        end
+        params.each do |key, val|
+          public_send("#{key}=", val) if respond_to?("#{key}=")
        end
-
+        self
      end
-
+
      # Get parameter values on this algorithm instance.
-      # Returns a hash with the
+      # Returns a hash with the following format:
      # { :param_name => parameter_value }
+      # @return [Object]
      def get_parameters
        params = {}
-        self.class.get_parameters_info.
-          params[key] = send(key) if
+        self.class.get_parameters_info.each_key do |key|
+          params[key] = send(key) if respond_to?(key)
        end
-
+        params
      end
 
+      # @param base [Object]
+      # @return [Object]
      def self.included(base)
        base.extend(ClassMethods)
      end
-
    end
  end
 end
-
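
For context, a minimal usage sketch (not part of the gem) of the reworked Parameterizable mix-in: `parameters_info` now merges with any inherited parameter info and defines accessors, `set_parameters` skips unknown keys and returns `self`, and `get_parameters` reads back current values. The `Greeter` class below is hypothetical.

    require 'ai4r'

    # Hypothetical class, for illustration only.
    class Greeter
      include Ai4r::Data::Parameterizable

      # Declares :greeting and :repeat as configurable parameters and
      # generates attr_accessors for them.
      parameters_info greeting: 'Word used to greet people.',
                      repeat: 'How many times the greeting is printed.'

      def initialize
        @greeting = 'Hello'
        @repeat = 1
      end
    end

    g = Greeter.new
    g.set_parameters(greeting: 'Hi', repeat: 2, unknown: 'ignored') # unknown keys are skipped
    p g.get_parameters # => {:greeting=>"Hi", :repeat=>2}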
data/lib/ai4r/data/proximity.rb CHANGED

@@ -1,122 +1,122 @@
+# frozen_string_literal: true
+
 # Author:: Sergio Fierens
 # License:: MPL 1.1
 # Project:: ai4r
-# Url::
+# Url:: https://github.com/SergioFierens/ai4r
 #
-# You can redistribute it and/or modify it under the terms of
-# the Mozilla Public License version 1.1 as published by the
+# You can redistribute it and/or modify it under the terms of
+# the Mozilla Public License version 1.1 as published by the
 # Mozilla Foundation at http://www.mozilla.org/MPL/MPL-1.1.txt
 
 module Ai4r
   module Data
-
    # This module provides classical distance functions
    module Proximity
-
      # This is a faster computational replacement for eclidean distance.
      # Parameters a and b are vectors with continuous attributes.
-      def
+      def squared_euclidean_distance(vec_a, vec_b)
        sum = 0.0
-
-        item_b =
+        vec_a.each_with_index do |item_a, i|
+          item_b = vec_b[i]
          sum += (item_a - item_b)**2
        end
-
+        sum
      end
-
+
      # Euclidean distance, or L2 norm.
      # Parameters a and b are vectors with continuous attributes.
-      # Euclidean distance tends to form hyperspherical
-      # clusters(Clustering, Xu and Wunsch, 2009).
-      # Translations and rotations do not cause a
+      # Euclidean distance tends to form hyperspherical
+      # clusters(Clustering, Xu and Wunsch, 2009).
+      # Translations and rotations do not cause a
      # distortion in distance relation (Duda et al, 2001)
-      # If attributes are measured with different units,
-      # attributes with larger values and variance will
+      # If attributes are measured with different units,
+      # attributes with larger values and variance will
      # dominate the metric.
-      def
-        Math.sqrt(squared_euclidean_distance(
+      def euclidean_distance(vec_a, vec_b)
+        Math.sqrt(squared_euclidean_distance(vec_a, vec_b))
      end
-
-
+
      # city block, Manhattan distance, or L1 norm.
      # Parameters a and b are vectors with continuous attributes.
-      def
+      def manhattan_distance(vec_a, vec_b)
        sum = 0.0
-
-        item_b =
+        vec_a.each_with_index do |item_a, i|
+          item_b = vec_b[i]
          sum += (item_a - item_b).abs
        end
-
+        sum
      end
-
+
      # Sup distance, or L-intinity norm
-      # Parameters a and b are vectors with continuous attributes.
-      def
+      # Parameters a and b are vectors with continuous attributes.
+      def sup_distance(vec_a, vec_b)
        distance = 0.0
-
-        item_b =
+        vec_a.each_with_index do |item_a, i|
+          item_b = vec_b[i]
          diff = (item_a - item_b).abs
          distance = diff if diff > distance
        end
-
+        distance
      end
-
-      # The Hamming distance between two attributes vectors of equal
-      # length is the number of attributes for which the corresponding
+
+      # The Hamming distance between two attributes vectors of equal
+      # length is the number of attributes for which the corresponding
      # vectors are different
      # This distance function is frequently used with binary attributes,
      # though it can be used with other discrete attributes.
-      def
+      def hamming_distance(vec_a, vec_b)
        count = 0
-
-        count += 1 if
+        vec_a.each_index do |i|
+          count += 1 if vec_a[i] != vec_b[i]
        end
-
+        count
      end
-
-      # The "Simple matching" distance between two attribute sets is given
+
+      # The "Simple matching" distance between two attribute sets is given
      # by the number of values present on both vectors.
      # If sets a and b have lengths da and db then:
-      #
+      #
      # S = 2/(da + db) * Number of values present on both sets
      # D = 1.0/S - 1
-      #
-      # Some considerations:
+      #
+      # Some considerations:
      # * a and b must not include repeated items
      # * all attributes are treated equally
      # * all attributes are treated equally
-      def
+      def simple_matching_distance(vec_a, vec_b)
        similarity = 0.0
-
-        similarity /= (
-
-      end
-
-      # Cosine similarity is a measure of similarity between two vectors
-      # of an inner product space that measures the cosine of the
+        vec_a.each { |item| similarity += 2 if vec_b.include?(item) }
+        similarity /= (vec_a.length + vec_b.length)
+        (1.0 / similarity) - 1
+      end
+
+      # Cosine similarity is a measure of similarity between two vectors
+      # of an inner product space that measures the cosine of the
      # angle between them (http://en.wikipedia.org/wiki/Cosine_similarity).
-      #
+      #
      # Parameters a and b are vectors with continuous attributes.
      #
      # D = sum(a[i] * b[i]) / sqrt(sum(a[i]**2)) * sqrt(sum(b[i]**2))
-      def
+      def cosine_distance(vec_a, vec_b)
        dot_product = 0.0
        norm_a = 0.0
        norm_b = 0.0
-
-
-
-
-
-        norm_b += b[i] ** 2
+
+        vec_a.each_index do |i|
+          dot_product += vec_a[i] * vec_b[i]
+          norm_a += vec_a[i]**2
+          norm_b += vec_b[i]**2
        end
-
+
        magnitude = Math.sqrt(norm_a) * Math.sqrt(norm_b)
-
+        1 - (dot_product / magnitude)
      end
+
+      module_function :squared_euclidean_distance, :euclidean_distance,
+                      :manhattan_distance, :sup_distance,
+                      :hamming_distance, :simple_matching_distance,
+                      :cosine_distance
    end
-
  end
-
 end
-
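
Because the helpers are now exposed via `module_function`, they can be called directly on `Ai4r::Data::Proximity`. A quick sketch with arbitrary vectors:

    require 'ai4r'

    a = [1.0, 2.0, 3.0]
    b = [2.0, 4.0, 6.0]

    # All distance helpers are module functions after this change.
    Ai4r::Data::Proximity.squared_euclidean_distance(a, b) # => 14.0
    Ai4r::Data::Proximity.euclidean_distance(a, b)         # => ~3.742
    Ai4r::Data::Proximity.manhattan_distance(a, b)         # => 6.0
    Ai4r::Data::Proximity.sup_distance(a, b)               # => 3.0
    Ai4r::Data::Proximity.hamming_distance([0, 1, 1], [1, 1, 0]) # => 2
    Ai4r::Data::Proximity.cosine_distance(a, b)            # => ~0.0 (parallel vectors)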
data/lib/ai4r/data/statistics.rb CHANGED

@@ -1,77 +1,88 @@
+# frozen_string_literal: true
+
 # Author:: Sergio Fierens
 # License:: MPL 1.1
 # Project:: ai4r
-# Url::
+# Url:: https://github.com/SergioFierens/ai4r
 #
-# You can redistribute it and/or modify it under the terms of
-# the Mozilla Public License version 1.1 as published by the
+# You can redistribute it and/or modify it under the terms of
+# the Mozilla Public License version 1.1 as published by the
 # Mozilla Foundation at http://www.mozilla.org/MPL/MPL-1.1.txt
 
-require File.dirname(__FILE__) + '/data_set'
-
 module Ai4r
   module Data
-
    # This module provides some basic statistics functions to operate on
    # data set attributes.
    module Statistics
-
      # Get the sample mean
+      # @param data_set [Object]
+      # @param attribute [Object]
+      # @return [Object]
      def self.mean(data_set, attribute)
        index = data_set.get_index(attribute)
        sum = 0.0
        data_set.data_items.each { |item| sum += item[index] }
-
+        sum / data_set.data_items.length
      end
-
+
      # Get the variance.
      # You can provide the mean if you have it already, to speed up things.
+      # @param data_set [Object]
+      # @param attribute [Object]
+      # @param mean [Object]
+      # @return [Object]
      def self.variance(data_set, attribute, mean = nil)
        index = data_set.get_index(attribute)
-        mean
+        mean ||= mean(data_set, attribute)
        sum = 0.0
-        data_set.data_items.each { |item| sum += (item[index]-mean)**2 }
-
+        data_set.data_items.each { |item| sum += (item[index] - mean)**2 }
+        sum / (data_set.data_items.length - 1)
      end
-
+
      # Get the standard deviation.
-      # You can provide the variance if you have it already, to speed up things.
+      # You can provide the variance if you have it already, to speed up things.
+      # @param data_set [Object]
+      # @param attribute [Object]
+      # @param variance [Object]
+      # @return [Object]
      def self.standard_deviation(data_set, attribute, variance = nil)
        variance ||= variance(data_set, attribute)
        Math.sqrt(variance)
      end
-
-      # Get the sample mode.
+
+      # Get the sample mode.
+      # @param data_set [Object]
+      # @param attribute [Object]
+      # @return [Object]
      def self.mode(data_set, attribute)
        index = data_set.get_index(attribute)
-
-
-
-
-
-
-        if attr_count > max_count
-          mode = attr_value
-          max_count = attr_count
-        end
-        end
-        return mode
+        data_set
+          .data_items
+          .map { |item| item[index] }
+          .tally
+          .max_by { _2 }
+          &.first
      end
-
+
      # Get the maximum value of an attribute in the data set
+      # @param data_set [Object]
+      # @param attribute [Object]
+      # @return [Object]
      def self.max(data_set, attribute)
        index = data_set.get_index(attribute)
-        item = data_set.data_items.
-
+        item = data_set.data_items.max_by { |item| item[index] }
+        item ? item[index] : -Float::INFINITY
      end
-
+
      # Get the minimum value of an attribute in the data set
+      # @param data_set [Object]
+      # @param attribute [Object]
+      # @return [Object]
      def self.min(data_set, attribute)
        index = data_set.get_index(attribute)
-        item = data_set.data_items.
-
+        item = data_set.data_items.min_by { |item| item[index] }
+        item ? item[index] : Float::INFINITY
      end
-
    end
  end
 end
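
A short sketch of the Statistics helpers against a tiny DataSet (values invented; assumes `DataSet#get_index` resolves string labels as in earlier releases):

    require 'ai4r'

    ds = Ai4r::Data::DataSet.new(
      data_labels: %w[age city],
      data_items: [[25, 'NY'], [30, 'NY'], [35, 'SF']]
    )

    Ai4r::Data::Statistics.mean(ds, 'age')               # => 30.0
    Ai4r::Data::Statistics.variance(ds, 'age')           # => 25.0
    Ai4r::Data::Statistics.standard_deviation(ds, 'age') # => 5.0
    Ai4r::Data::Statistics.mode(ds, 'city')              # => "NY"
    Ai4r::Data::Statistics.max(ds, 'age')                # => 35
    Ai4r::Data::Statistics.min(ds, 'age')                # => 25

Note that `mode` now relies on `Enumerable#tally`, so Ruby 2.7 or newer is required.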
data/lib/ai4r/experiment/classifier_evaluator.rb CHANGED

@@ -1,82 +1,137 @@
-
-require File.dirname(__FILE__) + '/../data/data_set'
+# frozen_string_literal: true
 
+require 'benchmark'
+require_relative '../data/data_set'
+require_relative 'split'
 
 module Ai4r
-
  module Experiment
-
-    # The
-    # algorithms. The evaluator builds the Classifiers using the same data
+    # The ClassifierEvaluator is useful to compare different classifiers
+    # algorithms. The evaluator builds the Classifiers using the same data
    # examples, and provides methods to evalute their performance in parallel.
-    # It is a nice tool to compare and evaluate the performance of different
-    # algorithms, the same algorithm with different parameters, or your own new
+    # It is a nice tool to compare and evaluate the performance of different
+    # algorithms, the same algorithm with different parameters, or your own new
    # algorithm against the classic classifiers.
    class ClassifierEvaluator
-
      attr_reader :build_times, :eval_times, :classifiers
-
+
+      # @return [Object]
      def initialize
        @classifiers = []
      end
 
      # Add a classifier instance to the test batch
+      # @param classifier [Object]
+      # @return [Object]
      def add_classifier(classifier)
        @classifiers << classifier
-
+        self
      end
-
-      alias
-
+
+      alias << add_classifier
+
      # Build all classifiers, using data examples found in data_set.
      # The last attribute of each item is considered as the
      # item class.
      # Building times are measured by separate, and can be accessed
      # through build_times attribute reader.
+      # @param data_set [Object]
+      # @return [Object]
      def build(data_set)
        @build_times = []
        @classifiers.each do |classifier|
          @build_times << Benchmark.measure { classifier.build data_set }
        end
-
+        self
      end
 
      # You can evaluate new data, predicting its class.
      # e.g.
-      # classifier.eval(['New York', '<30', 'F'])
+      # classifier.eval(['New York', '<30', 'F'])
      # => ['Y', 'Y', 'Y', 'N', 'Y', 'Y', 'N']
      # Evaluation times are measured by separate, and can be accessed
      # through eval_times attribute reader.
+      # @param data [Object]
+      # @return [Object]
      def eval(data)
        @eval_times = []
        results = []
        @classifiers.each do |classifier|
          @eval_times << Benchmark.measure { results << classifier.eval(data) }
        end
-
+        results
      end
-
-      # Test classifiers using a data set. The last attribute of each item
+
+      # Test classifiers using a data set. The last attribute of each item
      # is considered as the expected class. Data items are evaluated
      # using all classifiers: evalution times, sucess rate, and quantity of
      # classification errors are returned in a data set.
-      # The return data set has a row for every classifier tested, and the
+      # The return data set has a row for every classifier tested, and the
      # following attributes:
      # ["Classifier", "Testing Time", "Errors", "Success rate"]
+      # @param data_set [Object]
+      # @return [Object]
      def test(data_set)
-        result_data_items =
-
-          result_data_items << test_classifier(classifier, data_set)
+        result_data_items = @classifiers.map do |classifier|
+          test_classifier(classifier, data_set)
        end
-
-
+
+        Ai4r::Data::DataSet.new(data_items: result_data_items,
+                                data_labels: ['Classifier',
+                                              'Testing Time', 'Errors', 'Success rate'])
      end
-
+
+      # Perform k-fold cross validation on all classifiers.
+      # The dataset is split into +k+ folds using the Split utility. For each
+      # fold, classifiers are trained on the remaining folds and then tested on
+      # the held-out fold. The method returns a DataSet with the average time
+      # (build and test) and accuracy for each classifier.
+      # @param data_set [Ai4r::Data::DataSet] data to evaluate
+      # @param k [Integer] number of folds
+      # @return [Ai4r::Data::DataSet]
+      def cross_validate(data_set, k:)
+        folds = Split.split(data_set, k: k)
+        times = Array.new(@classifiers.length, 0.0)
+        accuracies = Array.new(@classifiers.length, 0.0)
+
+        folds.each_with_index do |test_set, i|
+          train_items = []
+          folds.each_with_index do |fold, j|
+            next if i == j
+
+            train_items.concat(fold.data_items)
+          end
+          train_set = Ai4r::Data::DataSet.new(
+            data_items: train_items,
+            data_labels: data_set.data_labels
+          )
+
+          @classifiers.each_with_index do |classifier, idx|
+            build_time = Benchmark.measure { classifier.build(train_set) }.real
+            result = test_classifier(classifier, test_set)
+            times[idx] += build_time + result[1]
+            accuracies[idx] += result[3]
+          end
+        end
+
+        result_items = @classifiers.each_index.map do |idx|
+          [@classifiers[idx], times[idx] / k, accuracies[idx] / k]
+        end
+        Ai4r::Data::DataSet.new(
+          data_items: result_items,
+          data_labels: ['Classifier', 'Avg. Time', 'Avg. Success rate']
+        )
+      end
+
      private
+
+      # @param classifier [Object]
+      # @param data_set [Object]
+      # @return [Object]
      def test_classifier(classifier, data_set)
        data_set_size = data_set.data_items.length
        errors = 0
-        testing_times = Benchmark.measure do
+        testing_times = Benchmark.measure do
          data_set.data_items.each do |data_item|
            data = data_item[0...-1]
            expected_result = data_item.last
@@ -84,12 +139,9 @@ module Ai4r
            errors += 1 if result != expected_result
          end
        end
-
-
+        [classifier, testing_times.real, errors,
+         ((data_set_size - (errors * 1.0)) / data_set_size)]
      end
-
    end
-
  end
-
 end
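
A hedged sketch of the new `cross_validate` workflow, pairing the evaluator with two classifiers from the gem (the dataset is invented for illustration):

    require 'ai4r'

    data = Ai4r::Data::DataSet.new(
      data_labels: %w[color play],
      data_items: [
        ['red', 'y'], ['blue', 'n'], ['red', 'y'],
        ['blue', 'n'], ['red', 'y'], ['blue', 'n']
      ]
    )

    evaluator = Ai4r::Experiment::ClassifierEvaluator.new
    evaluator << Ai4r::Classifiers::ZeroR.new
    evaluator << Ai4r::Classifiers::OneR.new

    # Each classifier is trained on k-1 folds and tested on the held-out fold;
    # the result is a DataSet with average time and success rate per classifier.
    report = evaluator.cross_validate(data, k: 3)
    report.data_items.each { |row| p row }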
data/lib/ai4r/experiment/split.rb ADDED

@@ -0,0 +1,39 @@
+# frozen_string_literal: true
+
+# Author:: Sergio Fierens
+# License:: MPL 1.1
+# Project:: ai4r
+# Url:: https://github.com/SergioFierens/ai4r
+#
+# You can redistribute it and/or modify it under the terms of
+# the Mozilla Public License version 1.1 as published by the
+# Mozilla Foundation at http://www.mozilla.org/MPL/MPL-1.1.txt
+
+require_relative '../data/data_set'
+
+module Ai4r
+  module Experiment
+    # Utility methods for experiment workflows.
+    module Split
+      module_function
+
+      # Split a dataset into +k+ folds.
+      # @param data_set [Ai4r::Data::DataSet] dataset to split
+      # @param k [Integer] number of folds
+      # @return [Array<Ai4r::Data::DataSet>] list of folds
+      def split(data_set, k:)
+        raise ArgumentError, 'k must be greater than 0' unless k.positive?
+
+        items = data_set.data_items.dup
+        labels = data_set.data_labels
+        fold_size = (items.length.to_f / k).ceil
+        folds = []
+        k.times do |i|
+          part = items.slice(i * fold_size, fold_size) || []
+          folds << Ai4r::Data::DataSet.new(data_items: part, data_labels: labels)
+        end
+        folds
+      end
+    end
+  end
+end
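
The new Split utility can also be used on its own; a minimal sketch:

    require 'ai4r'

    ds = Ai4r::Data::DataSet.new(
      data_labels: %w[x y label],
      data_items: Array.new(10) { |i| [i, i * 2, i.even? ? 'a' : 'b'] }
    )

    folds = Ai4r::Experiment::Split.split(ds, k: 3)
    folds.map { |fold| fold.data_items.length } # => [4, 4, 2]

Folds are taken in order (no shuffling), so callers who need randomized folds should shuffle the data items beforehand.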
data/lib/ai4r/genetic_algorithm/chromosome_base.rb ADDED

@@ -0,0 +1,43 @@
+# frozen_string_literal: true
+
+module Ai4r
+  module GeneticAlgorithm
+    # Base interface for chromosomes used by GeneticSearch.
+    # Implementations must define class methods `seed`, `mutate`,
+    # `reproduce` and the instance method `fitness`.
+    class ChromosomeBase
+      attr_accessor :data, :normalized_fitness
+
+      # @param data [Object]
+      # @return [Object]
+      def initialize(data = nil)
+        @data = data
+      end
+
+      # @return [Object]
+      def fitness
+        raise NotImplementedError, 'Subclasses must implement #fitness'
+      end
+
+      # @return [Object]
+      def self.seed
+        raise NotImplementedError, 'Implement .seed in subclass'
+      end
+
+      # @param _a [Object]
+      # @param _b [Object]
+      # @param _crossover_rate [Object]
+      # @return [Object]
+      def self.reproduce(_a, _b, _crossover_rate = 0.4)
+        raise NotImplementedError, 'Implement .reproduce in subclass'
+      end
+
+      # @param _chromosome [Object]
+      # @param _mutation_rate [Object]
+      # @return [Object]
+      def self.mutate(_chromosome, _mutation_rate = 0.3)
+        raise NotImplementedError, 'Implement .mutate in subclass'
+      end
+    end
+  end
+end
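
A minimal sketch of implementing the ChromosomeBase contract (the `MaxOnes` chromosome is hypothetical and not part of the gem; how GeneticSearch consumes it is defined elsewhere in the 2.0 codebase and not shown here):

    require 'ai4r'

    # Hypothetical chromosome: maximize the number of 1s in a fixed-length bit string.
    class MaxOnes < Ai4r::GeneticAlgorithm::ChromosomeBase
      LENGTH = 16

      def fitness
        data.count(1)
      end

      def self.seed
        new(Array.new(LENGTH) { rand(2) })
      end

      def self.reproduce(a, b, _crossover_rate = 0.4)
        cut = rand(LENGTH)
        new(a.data[0...cut] + b.data[cut..])
      end

      def self.mutate(chromosome, mutation_rate = 0.3)
        return unless rand < mutation_rate

        i = rand(LENGTH)
        chromosome.data[i] = 1 - chromosome.data[i]
      end
    end

    c = MaxOnes.seed
    puts c.fitness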