ai4r 1.13 → 2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/README.md +174 -0
- data/examples/classifiers/hyperpipes_data.csv +14 -0
- data/examples/classifiers/hyperpipes_example.rb +22 -0
- data/examples/classifiers/ib1_example.rb +12 -0
- data/examples/classifiers/id3_example.rb +15 -10
- data/examples/classifiers/id3_graphviz_example.rb +17 -0
- data/examples/classifiers/logistic_regression_example.rb +11 -0
- data/examples/classifiers/naive_bayes_attributes_example.rb +13 -0
- data/examples/classifiers/naive_bayes_example.rb +12 -13
- data/examples/classifiers/one_r_example.rb +27 -0
- data/examples/classifiers/parameter_tutorial.rb +29 -0
- data/examples/classifiers/prism_nominal_example.rb +15 -0
- data/examples/classifiers/prism_numeric_example.rb +21 -0
- data/examples/classifiers/simple_linear_regression_example.rb +14 -11
- data/examples/classifiers/zero_and_one_r_example.rb +34 -0
- data/examples/classifiers/zero_one_r_data.csv +8 -0
- data/examples/clusterers/clusterer_example.rb +40 -34
- data/examples/clusterers/dbscan_example.rb +17 -0
- data/examples/clusterers/dendrogram_example.rb +17 -0
- data/examples/clusterers/hierarchical_dendrogram_example.rb +20 -0
- data/examples/clusterers/kmeans_custom_example.rb +26 -0
- data/examples/genetic_algorithm/bitstring_example.rb +41 -0
- data/examples/genetic_algorithm/genetic_algorithm_example.rb +26 -18
- data/examples/genetic_algorithm/kmeans_seed_tuning.rb +45 -0
- data/examples/neural_network/backpropagation_example.rb +48 -48
- data/examples/neural_network/hopfield_example.rb +45 -0
- data/examples/neural_network/patterns_with_base_noise.rb +39 -39
- data/examples/neural_network/patterns_with_noise.rb +41 -39
- data/examples/neural_network/train_epochs_callback.rb +25 -0
- data/examples/neural_network/training_patterns.rb +39 -39
- data/examples/neural_network/transformer_text_classification.rb +78 -0
- data/examples/neural_network/xor_example.rb +23 -22
- data/examples/reinforcement/q_learning_example.rb +10 -0
- data/examples/som/som_data.rb +155 -152
- data/examples/som/som_multi_node_example.rb +12 -13
- data/examples/som/som_single_example.rb +12 -15
- data/examples/transformer/decode_classifier_example.rb +68 -0
- data/examples/transformer/deterministic_example.rb +10 -0
- data/examples/transformer/seq2seq_example.rb +16 -0
- data/lib/ai4r/classifiers/classifier.rb +24 -16
- data/lib/ai4r/classifiers/gradient_boosting.rb +64 -0
- data/lib/ai4r/classifiers/hyperpipes.rb +119 -43
- data/lib/ai4r/classifiers/ib1.rb +122 -32
- data/lib/ai4r/classifiers/id3.rb +524 -145
- data/lib/ai4r/classifiers/logistic_regression.rb +96 -0
- data/lib/ai4r/classifiers/multilayer_perceptron.rb +75 -59
- data/lib/ai4r/classifiers/naive_bayes.rb +95 -34
- data/lib/ai4r/classifiers/one_r.rb +112 -44
- data/lib/ai4r/classifiers/prism.rb +167 -76
- data/lib/ai4r/classifiers/random_forest.rb +72 -0
- data/lib/ai4r/classifiers/simple_linear_regression.rb +83 -58
- data/lib/ai4r/classifiers/support_vector_machine.rb +91 -0
- data/lib/ai4r/classifiers/votes.rb +57 -0
- data/lib/ai4r/classifiers/zero_r.rb +71 -30
- data/lib/ai4r/clusterers/average_linkage.rb +46 -27
- data/lib/ai4r/clusterers/bisecting_k_means.rb +50 -44
- data/lib/ai4r/clusterers/centroid_linkage.rb +52 -36
- data/lib/ai4r/clusterers/cluster_tree.rb +50 -0
- data/lib/ai4r/clusterers/clusterer.rb +29 -14
- data/lib/ai4r/clusterers/complete_linkage.rb +42 -31
- data/lib/ai4r/clusterers/dbscan.rb +134 -0
- data/lib/ai4r/clusterers/diana.rb +75 -49
- data/lib/ai4r/clusterers/k_means.rb +270 -135
- data/lib/ai4r/clusterers/median_linkage.rb +49 -33
- data/lib/ai4r/clusterers/single_linkage.rb +196 -88
- data/lib/ai4r/clusterers/ward_linkage.rb +51 -35
- data/lib/ai4r/clusterers/ward_linkage_hierarchical.rb +25 -10
- data/lib/ai4r/clusterers/weighted_average_linkage.rb +48 -32
- data/lib/ai4r/data/data_set.rb +223 -103
- data/lib/ai4r/data/parameterizable.rb +31 -25
- data/lib/ai4r/data/proximity.rb +62 -62
- data/lib/ai4r/data/statistics.rb +46 -35
- data/lib/ai4r/experiment/classifier_evaluator.rb +84 -32
- data/lib/ai4r/experiment/split.rb +39 -0
- data/lib/ai4r/genetic_algorithm/chromosome_base.rb +43 -0
- data/lib/ai4r/genetic_algorithm/genetic_algorithm.rb +92 -170
- data/lib/ai4r/genetic_algorithm/tsp_chromosome.rb +83 -0
- data/lib/ai4r/hmm/hidden_markov_model.rb +134 -0
- data/lib/ai4r/neural_network/activation_functions.rb +37 -0
- data/lib/ai4r/neural_network/backpropagation.rb +399 -134
- data/lib/ai4r/neural_network/hopfield.rb +175 -58
- data/lib/ai4r/neural_network/transformer.rb +194 -0
- data/lib/ai4r/neural_network/weight_initializations.rb +40 -0
- data/lib/ai4r/reinforcement/policy_iteration.rb +66 -0
- data/lib/ai4r/reinforcement/q_learning.rb +51 -0
- data/lib/ai4r/search/a_star.rb +76 -0
- data/lib/ai4r/search/bfs.rb +50 -0
- data/lib/ai4r/search/dfs.rb +50 -0
- data/lib/ai4r/search/mcts.rb +118 -0
- data/lib/ai4r/search.rb +12 -0
- data/lib/ai4r/som/distance_metrics.rb +29 -0
- data/lib/ai4r/som/layer.rb +28 -17
- data/lib/ai4r/som/node.rb +61 -32
- data/lib/ai4r/som/som.rb +158 -41
- data/lib/ai4r/som/two_phase_layer.rb +21 -25
- data/lib/ai4r/version.rb +3 -0
- data/lib/ai4r.rb +57 -28
- metadata +79 -109
- data/README.rdoc +0 -39
- data/test/classifiers/hyperpipes_test.rb +0 -84
- data/test/classifiers/ib1_test.rb +0 -78
- data/test/classifiers/id3_test.rb +0 -220
- data/test/classifiers/multilayer_perceptron_test.rb +0 -79
- data/test/classifiers/naive_bayes_test.rb +0 -43
- data/test/classifiers/one_r_test.rb +0 -62
- data/test/classifiers/prism_test.rb +0 -85
- data/test/classifiers/simple_linear_regression_test.rb +0 -37
- data/test/classifiers/zero_r_test.rb +0 -50
- data/test/clusterers/average_linkage_test.rb +0 -51
- data/test/clusterers/bisecting_k_means_test.rb +0 -66
- data/test/clusterers/centroid_linkage_test.rb +0 -53
- data/test/clusterers/complete_linkage_test.rb +0 -57
- data/test/clusterers/diana_test.rb +0 -69
- data/test/clusterers/k_means_test.rb +0 -167
- data/test/clusterers/median_linkage_test.rb +0 -53
- data/test/clusterers/single_linkage_test.rb +0 -122
- data/test/clusterers/ward_linkage_hierarchical_test.rb +0 -81
- data/test/clusterers/ward_linkage_test.rb +0 -53
- data/test/clusterers/weighted_average_linkage_test.rb +0 -53
- data/test/data/data_set_test.rb +0 -104
- data/test/data/proximity_test.rb +0 -87
- data/test/data/statistics_test.rb +0 -65
- data/test/experiment/classifier_evaluator_test.rb +0 -76
- data/test/genetic_algorithm/chromosome_test.rb +0 -57
- data/test/genetic_algorithm/genetic_algorithm_test.rb +0 -81
- data/test/neural_network/backpropagation_test.rb +0 -82
- data/test/neural_network/hopfield_test.rb +0 -72
- data/test/som/som_test.rb +0 -97
data/lib/ai4r/classifiers/simple_linear_regression.rb

@@ -1,19 +1,19 @@
+# frozen_string_literal: true
+
 # Author:: Malav Bhavsar
 # License:: MPL 1.1
 # Project:: ai4r
-# Url::
+# Url:: https://github.com/SergioFierens/ai4r
 #
 # You can redistribute it and/or modify it under the terms of
 # the Mozilla Public License version 1.1 as published by the
 # Mozilla Foundation at http://www.mozilla.org/MPL/MPL-1.1.txt
 
-
-
+require_relative '../data/data_set'
+require_relative 'classifier'
 
 module Ai4r
   module Classifiers
-
-
     # = Introduction
     #
     # This is an implementation of a Simple Linear Regression Classifier.
@@ -30,88 +30,113 @@ module Ai4r
     # build data
     # c.eval([1,158,105.8,192.7,71.4,55.7,2844,136,3.19,3.4,8.5,110,5500,19,25])
     #
-
-    class SimpleLinearRegression < Classifier
 
+    # SimpleLinearRegression performs linear regression on one attribute.
+    class SimpleLinearRegression < Classifier
       attr_reader :attribute, :attribute_index, :slope, :intercept
 
+      parameters_info selected_attribute: 'Index of attribute to use for regression.'
+
+      # @return [Object]
       def initialize
+        super()
         @attribute = nil
         @attribute_index = 0
         @slope = 0
         @intercept = 0
+        @selected_attribute = nil
       end
 
       # You can evaluate new data, predicting its category.
       # e.g.
       # c.eval([1,158,105.8,192.7,71.4,55.7,2844,136,3.19,3.4,8.5,110,5500,19,25])
       # => 11876.96774193548
+      # @param data [Object]
+      # @return [Object]
       def eval(data)
-        @intercept + @slope * data[@attribute_index]
+        @intercept + (@slope * data[@attribute_index])
       end
 
       # Gets the best attribute and does Linear Regression using it to find out the
       # slope and intercept.
       # Parameter data has to be an instance of DataSet
+      # @param data [Object]
+      # @return [Object]
       def build(data)
-
-
+        validate_data(data)
+
         y_mean = data.get_mean_or_mode[data.num_attributes - 1]
+        result = if @selected_attribute
+                   evaluate_attribute(data, @selected_attribute, y_mean)
+                 else
+                   evaluate_all_attributes(data, y_mean)
+                 end
+        assign_result(data, result)
+      end
 
-
-
-
-
-        chosen_slope = 0.0 / 0.0 # Float::NAN
-        chosen_intercept = 0.0 / 0.0 # Float::NAN
+      def validate_data(data)
+        raise 'Error instance must be passed' unless data.is_a?(Ai4r::Data::DataSet)
+        raise 'Data should not be empty' if data.data_items.empty?
+      end
 
+      def evaluate_attribute(data, attr_index, y_mean)
+        x_mean = data.get_mean_or_mode[attr_index]
+        slope, x_diff_sq, y_diff_sq = attribute_sums(data, attr_index, x_mean, y_mean)
+        if x_diff_sq.zero?
+          { chosen: attr_index, slope: 0, intercept: y_mean, msq: Float::MAX }
+        else
+          chosen_slope = slope / x_diff_sq
+          intercept = y_mean - (chosen_slope * x_mean)
+          { chosen: attr_index, slope: chosen_slope, intercept: intercept, msq: y_diff_sq - (chosen_slope * slope) }
+        end
+      end
+
+      def evaluate_all_attributes(data, y_mean)
+        result = { chosen: -1, msq: Float::MAX }
         data.data_labels.each do |attr_name|
           attr_index = data.get_index attr_name
-          if attr_index
-
-
-
-
-
-            data.data_items.map do |instance|
-              x_diff = instance[attr_index] - x_mean
-              y_diff = instance[attr_index] - y_mean
-              slope += x_diff * y_diff
-              sum_x_diff_squared += x_diff * x_diff
-              sum_y_diff_squared += y_diff * y_diff
-            end
-
-            if sum_x_diff_squared == 0
-              next
-            end
-
-            numerator = slope
-            slope /= sum_x_diff_squared
-            intercept = y_mean - slope * x_mean
-            msq = sum_y_diff_squared - slope * numerator
-
-            if msq < min_msq
-              min_msq = msq
-              chosen = attr_index
-              chosen_slope = slope
-              chosen_intercept = intercept
-            end
-          end
+          next if attr_index == data.num_attributes - 1
+
+          candidate = evaluate_attribute(data, attr_index, y_mean)
+          next unless candidate[:msq] < result[:msq]
+
+          result = candidate
         end
+        result
+      end
 
-
-
-
-
-
-
-
-
-
-
-
+      def assign_result(data, result)
+        raise 'no useful attribute found' if result[:chosen] == -1
+
+        @attribute = data.data_labels[result[:chosen]]
+        @attribute_index = result[:chosen]
+        @slope = result[:slope]
+        @intercept = result[:intercept]
+        self
+      end
+
+      # Simple Linear Regression classifiers cannot generate human readable
+      # rules. This method returns a descriptive string indicating that rule
+      # extraction is not supported.
+      def get_rules
+        'SimpleLinearRegression does not support rule extraction.'
+      end
+
+      private
+
+      # Calculate regression sums for the given attribute.
+      def attribute_sums(data, attr_index, x_mean, y_mean)
+        slope = 0
+        sum_x_diff_squared = 0
+        sum_y_diff_squared = 0
+        data.data_items.each do |instance|
+          x_diff = instance[attr_index] - x_mean
+          y_diff = instance[data.num_attributes - 1] - y_mean
+          slope += x_diff * y_diff
+          sum_x_diff_squared += x_diff * x_diff
+          sum_y_diff_squared += y_diff * y_diff
         end
-
+        [slope, sum_x_diff_squared, sum_y_diff_squared]
       end
     end
   end
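
The rewritten build method above now validates its input, honours an optional selected_attribute parameter, and delegates the summations to attribute_sums. A minimal usage sketch follows; it assumes the DataSet constructor and the accessors generated by parameters_info behave as in earlier ai4r releases, and the numeric values are invented for illustration:

    require 'ai4r'

    # Toy data: the last column is the numeric value to predict (invented values).
    items = [[2.0, 4.1], [3.0, 6.2], [4.0, 7.9], [5.0, 10.1]]
    data  = Ai4r::Data::DataSet.new(data_labels: %w[x y], data_items: items)

    regression = Ai4r::Classifiers::SimpleLinearRegression.new
    regression.selected_attribute = 0 # optional: force regression on attribute 0
    regression.build(data)
    puts regression.eval([6.0])       # predicted y for x = 6.0
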
data/lib/ai4r/classifiers/support_vector_machine.rb

@@ -0,0 +1,91 @@
+# frozen_string_literal: true
+
+# Author:: OpenAI Assistant
+# License:: MPL 1.1
+# Project:: ai4r
+# Url:: https://github.com/SergioFierens/ai4r
+#
+# A minimal linear Support Vector Machine implementation using
+# stochastic gradient descent. This implementation is intentionally
+# simple and only supports binary classification with numeric
+# attributes.
+
+require_relative '../data/data_set'
+require_relative 'classifier'
+
+module Ai4r
+  module Classifiers
+    # A lightweight linear SVM classifier trained via gradient descent.
+    # Only two classes are supported. Predictions return the same class
+    # labels used in the training data.
+    class SupportVectorMachine < Classifier
+      attr_reader :weights, :bias, :classes
+
+      parameters_info learning_rate: 'Learning rate for gradient descent.',
+                      iterations: 'Training iterations.',
+                      c: 'Regularization strength.'
+
+      def initialize
+        super()
+        @learning_rate = 0.01
+        @iterations = 1000
+        @c = 1.0
+        @weights = []
+        @bias = 0.0
+        @classes = []
+      end
+
+      # Train the SVM using the provided DataSet. Only numeric attributes and
+      # exactly two classes are supported.
+      def build(data_set)
+        data_set.check_not_empty
+        @classes = data_set.build_domains.last.to_a
+        raise ArgumentError, 'SVM only supports two classes' unless @classes.size == 2
+
+        num_features = data_set.data_labels.length - 1
+        @weights = Array.new(num_features, 0.0)
+        @bias = 0.0
+
+        samples = data_set.data_items.map do |row|
+          [row[0...-1].map(&:to_f), row.last]
+        end
+
+        @iterations.times do
+          samples.each do |features, label|
+            y = label == @classes[0] ? 1.0 : -1.0
+            prediction = dot(@weights, features) + @bias
+            if y * prediction < 1
+              @weights.map!.with_index do |w, i|
+                w + (@learning_rate * ((@c * y * features[i]) - (2 * w)))
+              end
+              @bias += @learning_rate * @c * y
+            else
+              @weights.map!.with_index { |w, _i| w - (@learning_rate * 2 * w) }
+            end
+          end
+        end
+        self
+      end
+
+      # Predict the class for the given numeric feature vector.
+      def eval(data)
+        score = dot(@weights, data.map(&:to_f)) + @bias
+        score >= 0 ? @classes[0] : @classes[1]
+      end
+
+      # Support Vector Machine classifiers cannot generate human readable rules.
+      # This method returns a string indicating rule extraction is unsupported.
+      def get_rules
+        'SupportVectorMachine does not support rule extraction.'
+      end
+
+      private
+
+      def dot(a, b)
+        sum = 0.0
+        a.each_index { |i| sum += a[i] * b[i] }
+        sum
+      end
+    end
+  end
+end
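
Since support_vector_machine.rb is entirely new in 2.0, a short usage sketch may help. It relies only on the API visible in the diff above (the parameters_info accessors, build, and eval); the data set and values are invented:

    require 'ai4r'

    # Two linearly separable classes; the last column is the class label.
    items = [[1.0, 1.0, 'a'], [1.5, 1.8, 'a'], [5.0, 8.0, 'b'], [6.0, 9.0, 'b']]
    data  = Ai4r::Data::DataSet.new(data_labels: %w[x1 x2 class], data_items: items)

    svm = Ai4r::Classifiers::SupportVectorMachine.new
    svm.iterations = 500      # accessor generated by parameters_info
    svm.learning_rate = 0.01
    svm.build(data)
    puts svm.eval([5.5, 8.5]) # expected to land on the 'b' side
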
data/lib/ai4r/classifiers/votes.rb

@@ -0,0 +1,57 @@
+# frozen_string_literal: true
+
+# Author:: Will Warner
+# License:: MPL 1.1
+# Project:: ai4r
+# Url:: https://github.com/SergioFierens/ai4r
+#
+# You can redistribute it and/or modify it under the terms of
+# the Mozilla Public License version 1.1 as published by the
+# Mozilla Foundation at http://www.mozilla.org/MPL/MPL-1.1.txt
+
+module Ai4r
+  module Classifiers
+    # Simple vote counter used by ensemble methods.
+    class Votes
+      # @return [Object]
+      def initialize
+        self.tally_sheet = Hash.new(0)
+      end
+
+      # @param category [Object]
+      # @return [Object]
+      def increment_category(category)
+        tally_sheet[category] += 1
+      end
+
+      # @param category [Object]
+      # @return [Object]
+      def tally_for(category)
+        tally_sheet[category]
+      end
+
+      # @param tie_break [Object]
+      # @return [Object]
+      def get_winner(tie_break = :last, rng: Random.new)
+        n = 0 # used to create a stable sort of the tallys
+        sorted_sheet = tally_sheet.sort_by do |_, score|
+          n += 1
+          [score, n]
+        end
+        return nil if sorted_sheet.empty?
+
+        if tie_break == :random
+          max_score = sorted_sheet.last[1]
+          tied = sorted_sheet.select { |_, score| score == max_score }.map(&:first)
+          tied.sample(random: rng)
+        else
+          sorted_sheet.last.first
+        end
+      end
+
+      private
+
+      attr_accessor :tally_sheet
+    end
+  end
+end
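
Votes is a helper rather than a classifier, so the sketch below only exercises the methods shown above; the categories are invented:

    require 'ai4r'

    votes = Ai4r::Classifiers::Votes.new
    %w[spam ham spam].each { |category| votes.increment_category(category) }

    votes.tally_for('spam')   # => 2
    votes.get_winner          # => "spam" (highest tally; later entries win ties by default)
    votes.get_winner(:random, rng: Random.new(42)) # reproducible random tie-breaking
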
data/lib/ai4r/classifiers/zero_r.rb

@@ -1,73 +1,114 @@
+# frozen_string_literal: true
+
 # Author:: Sergio Fierens (Implementation only)
 # License:: MPL 1.1
 # Project:: ai4r
-# Url::
+# Url:: https://github.com/SergioFierens/ai4r
 #
-# You can redistribute it and/or modify it under the terms of
-# the Mozilla Public License version 1.1 as published by the
+# You can redistribute it and/or modify it under the terms of
+# the Mozilla Public License version 1.1 as published by the
 # Mozilla Foundation at http://www.mozilla.org/MPL/MPL-1.1.txt
 
-
-
+require_relative '../data/data_set'
+require_relative '../classifiers/classifier'
 
 module Ai4r
   module Classifiers
-
     # = Introduction
-    #
-    # The idea behind the ZeroR classifier is to identify the
-    # the most common class value in the training set.
-    # It always returns that value when evaluating an instance.
-    # It is frequently used as a baseline for evaluating other machine learning
+    #
+    # The idea behind the ZeroR classifier is to identify the
+    # the most common class value in the training set.
+    # It always returns that value when evaluating an instance.
+    # It is frequently used as a baseline for evaluating other machine learning
     # algorithms.
     class ZeroR < Classifier
-
       attr_reader :data_set, :class_value
-
+
+      parameters_info default_class: 'Return this value when the provided ' \
+                                     'dataset is empty.',
+                      tie_break: 'Strategy used when more than one class has the ' \
+                                 'same maximal frequency. Valid values are :first (default) ' \
+                                 'and :random.',
+                      random_seed: 'Seed for tie resolution when using :random strategy.'
+
+      # @return [Object]
+      def initialize
+        super()
+        @default_class = nil
+        @tie_break = :first
+        @random_seed = nil
+        @rng = nil
+      end
+
       # Build a new ZeroR classifier. You must provide a DataSet instance
-      # as parameter. The last attribute of each item is considered as
+      # as parameter. The last attribute of each item is considered as
       # the item class.
+      # @param data_set [Object]
+      # @return [Object]
       def build(data_set)
-        data_set.check_not_empty
         @data_set = data_set
-
+
+        if @data_set.data_items.empty?
+          @class_value = @default_class
+          return self
+        end
+
+        frequencies = Hash.new(0)
         max_freq = 0
-
+        tied_classes = []
+
         @data_set.data_items.each do |example|
           class_value = example.last
-          frequencies[class_value]
+          frequencies[class_value] += 1
           class_frequency = frequencies[class_value]
-          if
+          if class_frequency > max_freq
             max_freq = class_frequency
-
+            tied_classes = [class_value]
+          elsif class_frequency == max_freq && !tied_classes.include?(class_value)
+            tied_classes << class_value
           end
         end
-
+
+        rng = @rng || (@random_seed.nil? ? Random.new : Random.new(@random_seed))
+
+        @class_value = if tied_classes.length == 1
+                         tied_classes.first
+                       else
+                         case @tie_break
+                         when :random
+                           tied_classes.sample(random: rng)
+                         else
+                           tied_classes.first
+                         end
+                       end
+
+        self
       end
-
+
       # You can evaluate new data, predicting its class.
       # e.g.
       # classifier.eval(['New York', '<30', 'F']) # => 'Y'
-
+      # @param data [Object]
+      # @return [Object]
+      def eval(_data)
        @class_value
      end
-
+
      # This method returns the generated rules in ruby code.
      # e.g.
-      #
+      #
      # classifier.get_rules
      # # => marketing_target='Y'
      #
-      # It is a nice way to inspect induction results, and also to execute them:
+      # It is a nice way to inspect induction results, and also to execute them:
      # marketing_target = nil
-      # eval classifier.get_rules
+      # eval classifier.get_rules
      # puts marketing_target
      # # => 'Y'
+      # @return [Object]
      def get_rules
-
+        "#{@data_set.category_label} = '#{@class_value}'"
      end
-
    end
-
  end
 end
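
The ZeroR rewrite adds empty-dataset handling and configurable tie-breaking. A minimal sketch, assuming the parameters_info accessors and a DataSet built from invented toy data:

    require 'ai4r'

    items = [['New York', '<30', 'M', 'Y'],
             ['Chicago',  '<30', 'F', 'Y'],
             ['New York', '[30-50)', 'M', 'N']]
    data = Ai4r::Data::DataSet.new(data_labels: %w[city age gender marketing_target],
                                   data_items: items)

    baseline = Ai4r::Classifiers::ZeroR.new
    baseline.tie_break = :random # pick among tied classes at random
    baseline.random_seed = 1     # make the :random strategy reproducible
    baseline.build(data)

    baseline.eval(['Chicago', '<30', 'M']) # => "Y" (the majority class)
    puts baseline.get_rules                # => marketing_target = 'Y'
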
data/lib/ai4r/clusterers/average_linkage.rb

@@ -1,59 +1,78 @@
+# frozen_string_literal: true
+
 # Author:: Sergio Fierens (implementation)
 # License:: MPL 1.1
 # Project:: ai4r
-# Url::
+# Url:: https://github.com/SergioFierens/ai4r
 #
-# You can redistribute it and/or modify it under the terms of
-# the Mozilla Public License version 1.1 as published by the
+# You can redistribute it and/or modify it under the terms of
+# the Mozilla Public License version 1.1 as published by the
 # Mozilla Foundation at http://www.mozilla.org/MPL/MPL-1.1.txt
 
-
-
+require_relative '../data/data_set'
+require_relative '../clusterers/single_linkage'
+require_relative '../clusterers/cluster_tree'
 
 module Ai4r
   module Clusterers
-
     # Implementation of a Hierarchical clusterer with group average
-    # linkage, AKA unweighted pair group method average or UPGMA (Everitt
+    # linkage, AKA unweighted pair group method average or UPGMA (Everitt
     # et al., 2001 ; Jain and Dubes, 1988 ; Sokal and Michener, 1958).
-    # Hierarchical clusterer create one cluster per element, and then
+    # Hierarchical clusterer create one cluster per element, and then
     # progressively merge clusters, until the required number of clusters
     # is reached.
-    # With average linkage, the distance between a clusters cx and
+    # With average linkage, the distance between a clusters cx and
     # cluster (ci U cj) the the average distance between cx and ci, and
     # cx and cj.
     #
     # D(cx, (ci U cj) = (D(cx, ci) + D(cx, cj)) / 2
     class AverageLinkage < SingleLinkage
-
-
-
-
-
-
-
+      include ClusterTree
+
+      parameters_info distance_function:
+        'Custom implementation of distance function. ' \
+        'It must be a closure receiving two data items and return the ' \
+        'distance between them. By default, this algorithm uses ' \
+        'euclidean distance of numeric attributes to the power of 2.'
+
       # Build a new clusterer, using data examples found in data_set.
       # Items will be clustered in "number_of_clusters" different
       # clusters.
-
+      # @param data_set [Object]
+      # @param number_of_clusters [Object]
+      # @param *options [Object]
+      # @return [Object]
+      def build(data_set, number_of_clusters = 1, **options)
        super
      end
-
-      # This algorithms does not allow classification of new data items
+
+      # This algorithms does not allow classification of new data items
      # once it has been built. Rebuild the cluster including you data element.
-
-
+      # @param _data_item [Object]
+      # @return [Object]
+      def eval(_data_item)
+        raise NotImplementedError, 'Eval of new data is not supported by this algorithm.'
      end
-
+
+      # Average linkage builds a dendrogram and cannot classify new data
+      # once built.
+      # @return [Object]
+      def supports_eval?
+        false
+      end
+
      protected
-
+
      # return distance between cluster cx and cluster (ci U cj),
      # using average linkage
-
-
-
+      # @param cx [Object]
+      # @param ci [Object]
+      # @param cj [Object]
+      # @return [Object]
+      def linkage_distance(cluster_x, cluster_i, cluster_j)
+        (read_distance_matrix(cluster_x, cluster_i) +
+         read_distance_matrix(cluster_x, cluster_j)) / 2
      end
-
    end
  end
 end
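
AverageLinkage gains ClusterTree support, a documented distance_function parameter, and an explicit supports_eval? returning false. A clustering sketch, assuming the build signature above and the clusters reader inherited from SingleLinkage; the points are invented:

    require 'ai4r'

    points = [[1, 1], [1, 2], [2, 1], [8, 8], [8, 9], [9, 8]]
    data = Ai4r::Data::DataSet.new(data_labels: %w[x y], data_items: points)

    clusterer = Ai4r::Clusterers::AverageLinkage.new
    clusterer.build(data, 2)                       # merge until two clusters remain
    clusterer.clusters.each { |c| p c.data_items } # inspect the resulting groups
    clusterer.supports_eval?                       # => false; rebuild to cluster new points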