ai4r 1.12 → 2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/README.md +174 -0
- data/examples/classifiers/hyperpipes_data.csv +14 -0
- data/examples/classifiers/hyperpipes_example.rb +22 -0
- data/examples/classifiers/ib1_example.rb +12 -0
- data/examples/classifiers/id3_example.rb +15 -10
- data/examples/classifiers/id3_graphviz_example.rb +17 -0
- data/examples/classifiers/logistic_regression_example.rb +11 -0
- data/examples/classifiers/naive_bayes_attributes_example.rb +13 -0
- data/examples/classifiers/naive_bayes_example.rb +12 -13
- data/examples/classifiers/one_r_example.rb +27 -0
- data/examples/classifiers/parameter_tutorial.rb +29 -0
- data/examples/classifiers/prism_nominal_example.rb +15 -0
- data/examples/classifiers/prism_numeric_example.rb +21 -0
- data/examples/classifiers/simple_linear_regression_example.csv +159 -0
- data/examples/classifiers/simple_linear_regression_example.rb +18 -0
- data/examples/classifiers/zero_and_one_r_example.rb +34 -0
- data/examples/classifiers/zero_one_r_data.csv +8 -0
- data/examples/clusterers/clusterer_example.rb +62 -0
- data/examples/clusterers/dbscan_example.rb +17 -0
- data/examples/clusterers/dendrogram_example.rb +17 -0
- data/examples/clusterers/hierarchical_dendrogram_example.rb +20 -0
- data/examples/clusterers/kmeans_custom_example.rb +26 -0
- data/examples/genetic_algorithm/bitstring_example.rb +41 -0
- data/examples/genetic_algorithm/genetic_algorithm_example.rb +26 -18
- data/examples/genetic_algorithm/kmeans_seed_tuning.rb +45 -0
- data/examples/neural_network/backpropagation_example.rb +49 -48
- data/examples/neural_network/hopfield_example.rb +45 -0
- data/examples/neural_network/patterns_with_base_noise.rb +39 -39
- data/examples/neural_network/patterns_with_noise.rb +41 -39
- data/examples/neural_network/train_epochs_callback.rb +25 -0
- data/examples/neural_network/training_patterns.rb +39 -39
- data/examples/neural_network/transformer_text_classification.rb +78 -0
- data/examples/neural_network/xor_example.rb +23 -22
- data/examples/reinforcement/q_learning_example.rb +10 -0
- data/examples/som/som_data.rb +155 -152
- data/examples/som/som_multi_node_example.rb +12 -13
- data/examples/som/som_single_example.rb +12 -15
- data/examples/transformer/decode_classifier_example.rb +68 -0
- data/examples/transformer/deterministic_example.rb +10 -0
- data/examples/transformer/seq2seq_example.rb +16 -0
- data/lib/ai4r/classifiers/classifier.rb +24 -16
- data/lib/ai4r/classifiers/gradient_boosting.rb +64 -0
- data/lib/ai4r/classifiers/hyperpipes.rb +119 -43
- data/lib/ai4r/classifiers/ib1.rb +122 -32
- data/lib/ai4r/classifiers/id3.rb +527 -144
- data/lib/ai4r/classifiers/logistic_regression.rb +96 -0
- data/lib/ai4r/classifiers/multilayer_perceptron.rb +75 -59
- data/lib/ai4r/classifiers/naive_bayes.rb +112 -48
- data/lib/ai4r/classifiers/one_r.rb +112 -44
- data/lib/ai4r/classifiers/prism.rb +167 -76
- data/lib/ai4r/classifiers/random_forest.rb +72 -0
- data/lib/ai4r/classifiers/simple_linear_regression.rb +143 -0
- data/lib/ai4r/classifiers/support_vector_machine.rb +91 -0
- data/lib/ai4r/classifiers/votes.rb +57 -0
- data/lib/ai4r/classifiers/zero_r.rb +71 -30
- data/lib/ai4r/clusterers/average_linkage.rb +46 -27
- data/lib/ai4r/clusterers/bisecting_k_means.rb +50 -44
- data/lib/ai4r/clusterers/centroid_linkage.rb +52 -36
- data/lib/ai4r/clusterers/cluster_tree.rb +50 -0
- data/lib/ai4r/clusterers/clusterer.rb +28 -24
- data/lib/ai4r/clusterers/complete_linkage.rb +42 -31
- data/lib/ai4r/clusterers/dbscan.rb +134 -0
- data/lib/ai4r/clusterers/diana.rb +75 -49
- data/lib/ai4r/clusterers/k_means.rb +309 -72
- data/lib/ai4r/clusterers/median_linkage.rb +49 -33
- data/lib/ai4r/clusterers/single_linkage.rb +196 -88
- data/lib/ai4r/clusterers/ward_linkage.rb +51 -35
- data/lib/ai4r/clusterers/ward_linkage_hierarchical.rb +63 -0
- data/lib/ai4r/clusterers/weighted_average_linkage.rb +48 -32
- data/lib/ai4r/data/data_set.rb +229 -100
- data/lib/ai4r/data/parameterizable.rb +31 -25
- data/lib/ai4r/data/proximity.rb +72 -50
- data/lib/ai4r/data/statistics.rb +46 -35
- data/lib/ai4r/experiment/classifier_evaluator.rb +84 -32
- data/lib/ai4r/experiment/split.rb +39 -0
- data/lib/ai4r/genetic_algorithm/chromosome_base.rb +43 -0
- data/lib/ai4r/genetic_algorithm/genetic_algorithm.rb +92 -170
- data/lib/ai4r/genetic_algorithm/tsp_chromosome.rb +83 -0
- data/lib/ai4r/hmm/hidden_markov_model.rb +134 -0
- data/lib/ai4r/neural_network/activation_functions.rb +37 -0
- data/lib/ai4r/neural_network/backpropagation.rb +419 -143
- data/lib/ai4r/neural_network/hopfield.rb +175 -58
- data/lib/ai4r/neural_network/transformer.rb +194 -0
- data/lib/ai4r/neural_network/weight_initializations.rb +40 -0
- data/lib/ai4r/reinforcement/policy_iteration.rb +66 -0
- data/lib/ai4r/reinforcement/q_learning.rb +51 -0
- data/lib/ai4r/search/a_star.rb +76 -0
- data/lib/ai4r/search/bfs.rb +50 -0
- data/lib/ai4r/search/dfs.rb +50 -0
- data/lib/ai4r/search/mcts.rb +118 -0
- data/lib/ai4r/search.rb +12 -0
- data/lib/ai4r/som/distance_metrics.rb +29 -0
- data/lib/ai4r/som/layer.rb +28 -17
- data/lib/ai4r/som/node.rb +61 -32
- data/lib/ai4r/som/som.rb +158 -41
- data/lib/ai4r/som/two_phase_layer.rb +21 -25
- data/lib/ai4r/version.rb +3 -0
- data/lib/ai4r.rb +58 -27
- metadata +117 -106
- data/README.rdoc +0 -44
- data/test/classifiers/hyperpipes_test.rb +0 -84
- data/test/classifiers/ib1_test.rb +0 -78
- data/test/classifiers/id3_test.rb +0 -208
- data/test/classifiers/multilayer_perceptron_test.rb +0 -79
- data/test/classifiers/naive_bayes_test.rb +0 -43
- data/test/classifiers/one_r_test.rb +0 -62
- data/test/classifiers/prism_test.rb +0 -85
- data/test/classifiers/zero_r_test.rb +0 -50
- data/test/clusterers/average_linkage_test.rb +0 -51
- data/test/clusterers/bisecting_k_means_test.rb +0 -66
- data/test/clusterers/centroid_linkage_test.rb +0 -53
- data/test/clusterers/complete_linkage_test.rb +0 -57
- data/test/clusterers/diana_test.rb +0 -69
- data/test/clusterers/k_means_test.rb +0 -100
- data/test/clusterers/median_linkage_test.rb +0 -53
- data/test/clusterers/single_linkage_test.rb +0 -122
- data/test/clusterers/ward_linkage_test.rb +0 -53
- data/test/clusterers/weighted_average_linkage_test.rb +0 -53
- data/test/data/data_set_test.rb +0 -96
- data/test/data/proximity_test.rb +0 -81
- data/test/data/statistics_test.rb +0 -65
- data/test/experiment/classifier_evaluator_test.rb +0 -76
- data/test/genetic_algorithm/chromosome_test.rb +0 -57
- data/test/genetic_algorithm/genetic_algorithm_test.rb +0 -81
- data/test/neural_network/backpropagation_test.rb +0 -82
- data/test/neural_network/hopfield_test.rb +0 -72
- data/test/som/som_test.rb +0 -97
@@ -0,0 +1,96 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
# Author:: OpenAI Assistant
|
4
|
+
# License:: MPL 1.1
|
5
|
+
# Project:: ai4r
|
6
|
+
# Url:: https://github.com/SergioFierens/ai4r
|
7
|
+
#
|
8
|
+
# You can redistribute it and/or modify it under the terms of
|
9
|
+
# the Mozilla Public License version 1.1 as published by the
|
10
|
+
# Mozilla Foundation at http://www.mozilla.org/MPL/MPL-1.1.txt
|
11
|
+
|
12
|
+
require_relative '../data/data_set'
|
13
|
+
require_relative 'classifier'
|
14
|
+
|
15
|
+
module Ai4r
|
16
|
+
module Classifiers
|
17
|
+
# Implementation of binary Logistic Regression using gradient descent.
|
18
|
+
#
|
19
|
+
# Training data must have numeric attributes with the last attribute being
|
20
|
+
# the class label (0 or 1). Parameters can be adjusted with
|
21
|
+
# {Parameterizable#set_parameters}.
|
22
|
+
#
|
23
|
+
# Example:
|
24
|
+
# data = Ai4r::Data::DataSet.new(:data_items => [[0.2, 1], [0.4, 0]])
|
25
|
+
# classifier = LogisticRegression.new.build(data)
|
26
|
+
# classifier.eval([0.3])
|
27
|
+
class LogisticRegression < Classifier
  attr_reader :weights

  parameters_info learning_rate: 'Learning rate for gradient descent.',
                  iterations: 'Number of iterations to train.'

  # Sets the default hyper-parameters (learning rate 0.1, 1000 iterations)
  # and leaves the model untrained (+weights+ is nil until #build runs).
  def initialize
    super()
    @learning_rate = 0.1
    @iterations = 1000
    @weights = nil
  end

  # Fits the model with batch gradient descent.
  #
  # Each data item is read as attribute values followed by the label in
  # the last position; both are converted with +to_f+. The weights start
  # at zero, one per attribute plus a trailing bias entry.
  #
  # @param data_set [Ai4r::Data::DataSet] training data
  # @return [LogisticRegression] self, so calls can be chained
  # @raise [RuntimeError] when +data_set+ is not a DataSet
  def build(data_set)
    raise 'Error instance must be passed' unless data_set.is_a?(Ai4r::Data::DataSet)

    data_set.check_not_empty

    features = data_set.data_items.map { |item| item[0...-1].map(&:to_f) }
    labels = data_set.data_items.map { |item| item.last.to_f }
    sample_count = features.length
    feature_count = features.first.length
    @weights = Array.new(feature_count + 1, 0.0) # last value is bias

    @iterations.times do
      # All errors are computed from the weights as they stand at the start
      # of the iteration, before any of them is updated.
      predictions = features.map { |row| sigmoid(weighted_sum(row)) }
      errors = predictions.each_with_index.map { |prediction, i| prediction - labels[i] }

      feature_count.times do |j|
        total = 0.0
        sample_count.times { |i| total += errors[i] * features[i][j] }
        @weights[j] -= @learning_rate * (total / sample_count)
      end
      @weights[feature_count] -= @learning_rate * (errors.sum / sample_count)
    end
    self
  end

  # Predicts the class for one item: 1 when the estimated probability is
  # at least 0.5, otherwise 0.
  #
  # @param data [Array] attribute values, one per trained weight (no label)
  # @return [Integer] 0 or 1
  # @raise [RuntimeError] when the model has not been built yet
  # @raise [ArgumentError] when +data+ has the wrong number of values
  def eval(data)
    raise 'Model not trained' unless @weights

    expected_size = @weights.length - 1
    unless data.length == expected_size
      raise ArgumentError,
            "Wrong number of inputs. Expected: #{expected_size}, " \
            "received: #{data.length}."
    end

    sigmoid(weighted_sum(data)) >= 0.5 ? 1 : 0
  end

  # Rule extraction is not available for this algorithm; a fixed
  # explanatory string is returned instead.
  #
  # @return [String]
  def get_rules
    'LogisticRegression does not support rule extraction.'
  end

  private

  # Bias plus the sum of value * weight, accumulated left to right
  # starting from the bias term.
  def weighted_sum(values)
    total = @weights.last
    values.each_with_index { |value, j| total += (value.to_f * @weights[j]) }
    total
  end

  # Logistic function applied to +z+.
  def sigmoid(z)
    1.0 / (1.0 + Math.exp(-z))
  end
end
|
95
|
+
end
|
96
|
+
end
|
@@ -1,104 +1,118 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
# Author:: Sergio Fierens (Implementation only)
|
2
4
|
# License:: MPL 1.1
|
3
5
|
# Project:: ai4r
|
4
|
-
# Url::
|
6
|
+
# Url:: https://github.com/SergioFierens/ai4r
|
5
7
|
#
|
6
|
-
# You can redistribute it and/or modify it under the terms of
|
7
|
-
# the Mozilla Public License version 1.1 as published by the
|
8
|
+
# You can redistribute it and/or modify it under the terms of
|
9
|
+
# the Mozilla Public License version 1.1 as published by the
|
8
10
|
# Mozilla Foundation at http://www.mozilla.org/MPL/MPL-1.1.txt
|
9
11
|
|
10
|
-
|
11
|
-
|
12
|
-
|
12
|
+
require_relative '../data/data_set'
|
13
|
+
require_relative '../classifiers/classifier'
|
14
|
+
require_relative '../neural_network/backpropagation'
|
13
15
|
|
14
16
|
module Ai4r
|
15
17
|
module Classifiers
|
16
|
-
|
17
18
|
# = Introduction
|
18
|
-
#
|
19
|
-
# The idea behind the MultilayerPerceptron classifier is to
|
20
|
-
# train a Multilayer Perceptron neural network with the provided examples,
|
19
|
+
#
|
20
|
+
# The idea behind the MultilayerPerceptron classifier is to
|
21
|
+
# train a Multilayer Perceptron neural network with the provided examples,
|
21
22
|
# and predict the class for new data items.
|
22
|
-
#
|
23
|
+
#
|
23
24
|
# = Parameters
|
24
|
-
#
|
25
|
+
#
|
25
26
|
# Use class method get_parameters_info to obtain details on the algorithm
|
26
27
|
# parameters. Use set_parameters to set values for these parameters.
|
27
28
|
# See Parameterizable module documentation.
|
28
|
-
#
|
29
|
-
# * :network_class => Neural network implementation class.
|
29
|
+
#
|
30
|
+
# * :network_class => Neural network implementation class.
|
30
31
|
# By default: Ai4r::NeuralNetwork::Backpropagation.
|
31
32
|
# * :network_parameters => Parameters to be forwarded to the back end
|
32
|
-
# neural ntework.
|
33
|
-
# * :hidden_layers => Hidden layer structure. E.g. [8, 6] will generate
|
33
|
+
# neural network.
|
34
|
+
# * :hidden_layers => Hidden layer structure. E.g. [8, 6] will generate
|
34
35
|
# 2 hidden layers with 8 and 6 neurons each. By default []
|
35
|
-
# * :training_iterations => How many times the training should be repeated.
|
36
|
-
# By default:
|
37
|
-
# :active_node_value => Default: 1
|
36
|
+
# * :training_iterations => How many times the training should be repeated.
|
37
|
+
# By default: 500.
|
38
|
+
# :active_node_value => Default: 1
|
38
39
|
# :inactive_node_value => Default: 0
|
39
40
|
class MultilayerPerceptron < Classifier
|
40
|
-
|
41
41
|
attr_reader :data_set, :class_value, :network, :domains
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
42
|
+
|
43
|
+
TRAINING_ITERATIONS = 500
|
44
|
+
|
45
|
+
parameters_info network_class: 'Neural network implementation class.' \
|
46
|
+
'By default: Ai4r::NeuralNetwork::Backpropagation.',
|
47
|
+
network_parameters: 'parameters to be forwarded to the back end ' \
|
48
|
+
'neural network.',
|
49
|
+
hidden_layers: 'Hidden layer structure. E.g. [8, 6] will generate ' \
|
50
|
+
'2 hidden layers with 8 and 6 neurons each. By default []',
|
51
|
+
training_iterations: 'How many times the training should be ' \
|
52
|
+
"repeated. By default: #{TRAINING_ITERATIONS}",
|
53
|
+
active_node_value: 'Default: 1',
|
54
|
+
inactive_node_value: 'Default: 0'
|
55
|
+
|
56
|
+
# @return [Object]
|
54
57
|
def initialize
|
58
|
+
super()
|
55
59
|
@network_class = Ai4r::NeuralNetwork::Backpropagation
|
56
60
|
@hidden_layers = []
|
57
|
-
@training_iterations =
|
61
|
+
@training_iterations = TRAINING_ITERATIONS
|
58
62
|
@network_parameters = {}
|
59
63
|
@active_node_value = 1
|
60
64
|
@inactive_node_value = 0
|
61
65
|
end
|
62
|
-
|
63
|
-
# Build a new MultilayerPerceptron classifier. You must provide a DataSet
|
64
|
-
# instance as parameter. The last attribute of each item is considered as
|
66
|
+
|
67
|
+
# Build a new MultilayerPerceptron classifier. You must provide a DataSet
|
68
|
+
# instance as parameter. The last attribute of each item is considered as
|
65
69
|
# the item class.
|
70
|
+
# @param data_set [Object]
|
71
|
+
# @return [Object]
|
66
72
|
def build(data_set)
|
67
73
|
data_set.check_not_empty
|
68
74
|
@data_set = data_set
|
69
|
-
@domains = @data_set.build_domains.collect
|
75
|
+
@domains = @data_set.build_domains.collect(&:to_a)
|
70
76
|
@outputs = @domains.last.length
|
71
77
|
@inputs = 0
|
72
|
-
@domains[0...-1].each {|domain| @inputs += domain.length}
|
78
|
+
@domains[0...-1].each { |domain| @inputs += domain.length }
|
73
79
|
@structure = [@inputs] + @hidden_layers + [@outputs]
|
74
80
|
@network = @network_class.new @structure
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
end
|
81
|
+
inputs = []
|
82
|
+
outputs = []
|
83
|
+
data_set.data_items.each do |data_item|
|
84
|
+
inputs << data_to_input(data_item[0...-1])
|
85
|
+
outputs << data_to_output(data_item.last)
|
81
86
|
end
|
82
|
-
|
87
|
+
@network.train_epochs(inputs, outputs,
|
88
|
+
epochs: @training_iterations, batch_size: 1)
|
89
|
+
self
|
83
90
|
end
|
84
|
-
|
91
|
+
# rubocop:enable Metrics/AbcSize
|
92
|
+
|
85
93
|
# You can evaluate new data, predicting its class.
|
86
94
|
# e.g.
|
87
95
|
# classifier.eval(['New York', '<30', 'F']) # => 'Y'
|
96
|
+
# @param data [Object]
|
97
|
+
# @return [Object]
|
88
98
|
def eval(data)
|
89
99
|
input_values = data_to_input(data)
|
90
100
|
output_values = @network.eval(input_values)
|
91
|
-
|
101
|
+
@domains.last[get_max_index(output_values)]
|
92
102
|
end
|
93
|
-
|
94
|
-
# Multilayer Perceptron Classifiers cannot generate
|
103
|
+
|
104
|
+
# Multilayer Perceptron Classifiers cannot generate
|
95
105
|
# human-readable rules.
|
106
|
+
# @return [Object]
|
96
107
|
def get_rules
|
97
|
-
|
108
|
+
"raise 'Neural networks classifiers do not generate human-readable rules.'"
|
98
109
|
end
|
110
|
+
# rubocop:enable Naming/AccessorMethodName
|
99
111
|
|
100
112
|
protected
|
101
|
-
|
113
|
+
|
114
|
+
# @param data_item [Object]
|
115
|
+
# @return [Object]
|
102
116
|
def data_to_input(data_item)
|
103
117
|
input_values = Array.new(@inputs, @inactive_node_value)
|
104
118
|
accum_index = 0
|
@@ -106,17 +120,21 @@ module Ai4r
|
|
106
120
|
att_value = data_item[att_index]
|
107
121
|
domain_index = @domains[att_index].index(att_value)
|
108
122
|
input_values[domain_index + accum_index] = @active_node_value
|
109
|
-
accum_index
|
123
|
+
accum_index += @domains[att_index].length
|
110
124
|
end
|
111
|
-
|
125
|
+
input_values
|
112
126
|
end
|
113
|
-
|
127
|
+
|
128
|
+
# @param data_item [Object]
|
129
|
+
# @return [Object]
|
114
130
|
def data_to_output(data_item)
|
115
131
|
output_values = Array.new(@outputs, @inactive_node_value)
|
116
132
|
output_values[@domains.last.index(data_item)] = @active_node_value
|
117
|
-
|
133
|
+
output_values
|
118
134
|
end
|
119
|
-
|
135
|
+
|
136
|
+
# @param output_values [Object]
|
137
|
+
# @return [Object]
|
120
138
|
def get_max_index(output_values)
|
121
139
|
max_value = @inactive_node_value
|
122
140
|
max_index = 0
|
@@ -126,10 +144,8 @@ module Ai4r
|
|
126
144
|
max_index = output_index
|
127
145
|
end
|
128
146
|
end
|
129
|
-
|
147
|
+
max_index
|
130
148
|
end
|
131
|
-
|
132
149
|
end
|
133
|
-
|
134
150
|
end
|
135
151
|
end
|
@@ -1,19 +1,19 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
# Author:: Thomas Kern
|
2
4
|
# License:: MPL 1.1
|
3
5
|
# Project:: ai4r
|
4
|
-
# Url::
|
6
|
+
# Url:: https://github.com/SergioFierens/ai4r
|
5
7
|
#
|
6
8
|
# You can redistribute it and/or modify it under the terms of
|
7
9
|
# the Mozilla Public License version 1.1 as published by the
|
8
10
|
# Mozilla Foundation at http://www.mozilla.org/MPL/MPL-1.1.txt
|
9
11
|
|
10
|
-
|
11
|
-
|
12
|
+
require_relative '../data/data_set'
|
13
|
+
require_relative 'classifier'
|
12
14
|
|
13
15
|
module Ai4r
|
14
16
|
module Classifiers
|
15
|
-
|
16
|
-
|
17
17
|
# = Introduction
|
18
18
|
#
|
19
19
|
# This is an implementation of a Naive Bayesian Classifier without any
|
@@ -21,7 +21,7 @@ module Ai4r
|
|
21
21
|
# Probabilities P(a_i | v_j) are estimated using m-estimates, hence the
|
22
22
|
# m parameter as second parameter when instantiating the class.
|
23
23
|
# The estimation looks like this:
|
24
|
-
#(n_c + mp) / (n + m)
|
24
|
+
# (n_c + mp) / (n + m)
|
25
25
|
#
|
26
26
|
# the variables are:
|
27
27
|
# n = the number of training examples for which v = v_j
|
@@ -54,14 +54,21 @@ module Ai4r
|
|
54
54
|
# build data
|
55
55
|
# b.eval(["Red", "SUV", "Domestic"])
|
56
56
|
#
|
57
|
-
|
57
|
+
|
58
|
+
# Probabilistic classifier based on Bayes' theorem.
|
58
59
|
class NaiveBayes < Classifier
|
60
|
+
attr_reader :class_prob, :pcc, :pcp
|
59
61
|
|
60
|
-
parameters_info :
|
61
|
-
|
62
|
-
|
62
|
+
parameters_info m: 'Default value is set to 0. It may be set to a value greater than ' \
|
63
|
+
'0 when the size of the dataset is relatively small',
|
64
|
+
unknown_value_strategy: 'Behaviour when evaluating unseen attribute values: ' \
|
65
|
+
':ignore (default), :uniform or :error.'
|
66
|
+
|
67
|
+
# @return [Object]
|
63
68
|
def initialize
|
69
|
+
super()
|
64
70
|
@m = 0
|
71
|
+
@unknown_value_strategy = :ignore
|
65
72
|
@class_counts = []
|
66
73
|
@class_prob = [] # stores the probability of the classes
|
67
74
|
@pcc = [] # stores the number of instances divided into attribute/value/class
|
@@ -69,144 +76,199 @@ module Ai4r
|
|
69
76
|
@klass_index = {} # hashmap for quick lookup of all the used klasses and their indice
|
70
77
|
@values = {} # hashmap for quick lookup of all the values
|
71
78
|
end
|
72
|
-
|
79
|
+
|
73
80
|
# You can evaluate new data, predicting its category.
|
74
81
|
# e.g.
|
75
82
|
# b.eval(["Red", "SUV", "Domestic"])
|
76
83
|
# => 'No'
|
84
|
+
# @param data [Object]
|
85
|
+
# @return [Object]
|
77
86
|
def eval(data)
|
78
|
-
prob = @class_prob.
|
87
|
+
prob = @class_prob.dup
|
79
88
|
prob = calculate_class_probabilities_for_entry(data, prob)
|
80
89
|
index_to_klass(prob.index(prob.max))
|
81
90
|
end
|
82
91
|
|
83
92
|
# Calculates the probabilities for the data entry Data.
|
84
93
|
# data has to be an array of the same dimension as the training data minus the
|
85
|
-
# class column.
|
94
|
+
# class column.
|
86
95
|
# Returns a map containing all classes as keys:
|
87
96
|
# {Class_1 => probability, Class_2 => probability2 ... }
|
88
97
|
# Probability is <= 1 and of type Float.
|
89
98
|
# e.g.
|
90
99
|
# b.get_probability_map(["Red", "SUV", "Domestic"])
|
91
100
|
# => {"Yes"=>0.4166666666666667, "No"=>0.5833333333333334}
|
101
|
+
# @param data [Object]
|
102
|
+
# @return [Object]
|
92
103
|
def get_probability_map(data)
|
93
|
-
prob = @class_prob.
|
104
|
+
prob = @class_prob.dup
|
94
105
|
prob = calculate_class_probabilities_for_entry(data, prob)
|
95
106
|
prob = normalize_class_probability prob
|
96
107
|
probability_map = {}
|
97
108
|
prob.each_with_index { |p, i| probability_map[index_to_klass(i)] = p }
|
98
|
-
|
109
|
+
|
110
|
+
probability_map
|
99
111
|
end
|
100
112
|
|
101
113
|
# counts values of the attribute instances and calculates the probability of the classes
|
102
114
|
# and the conditional probabilities
|
103
115
|
# Parameter data has to be an instance of Ai4r::Data::DataSet
|
116
|
+
# @param data [Object]
|
117
|
+
# @return [Object]
|
104
118
|
def build(data)
|
105
|
-
raise
|
106
|
-
raise
|
119
|
+
raise 'Error instance must be passed' unless data.is_a?(Ai4r::Data::DataSet)
|
120
|
+
raise 'Data should not be empty' if data.data_items.empty?
|
107
121
|
|
108
122
|
initialize_domain_data(data)
|
109
123
|
initialize_klass_index
|
110
124
|
initialize_pc
|
111
125
|
calculate_probabilities
|
112
126
|
|
113
|
-
|
127
|
+
self
|
128
|
+
end
|
129
|
+
|
130
|
+
# Naive Bayes classifiers cannot generate human readable rules.
|
131
|
+
# This method returns a descriptive string explaining that rule
|
132
|
+
# extraction is not supported for this algorithm.
|
133
|
+
def get_rules
|
134
|
+
'NaiveBayes does not support rule extraction.'
|
114
135
|
end
|
115
136
|
|
116
137
|
private
|
117
138
|
|
139
|
+
# @param data [Object]
|
140
|
+
# @return [Object]
|
118
141
|
def initialize_domain_data(data)
|
119
142
|
@domains = data.build_domains
|
120
143
|
@data_items = data.data_items.map { |item| DataEntry.new(item[0...-1], item.last) }
|
121
144
|
@data_labels = data.data_labels[0...-1]
|
122
|
-
@klasses = @domains.last.to_a
|
145
|
+
@klasses = @domains.last.to_a.sort
|
123
146
|
end
|
124
147
|
|
125
|
-
|
126
148
|
# calculates the klass probability of a data entry
|
127
149
|
# as usual, the probability of the value is multiplied with every conditional
|
128
150
|
# probability of every attribute in condition to a specific class
|
129
151
|
# this is repeated for every class
|
152
|
+
# @param data [Object]
|
153
|
+
# @param prob [Object]
|
154
|
+
# @return [Object]
|
130
155
|
def calculate_class_probabilities_for_entry(data, prob)
|
131
|
-
prob.
|
156
|
+
0.upto(prob.length - 1) do |prob_index|
|
132
157
|
data.each_with_index do |att, index|
|
133
|
-
|
134
|
-
|
158
|
+
val_index = value_index(att, index)
|
159
|
+
if val_index.nil?
|
160
|
+
case @unknown_value_strategy
|
161
|
+
when :ignore
|
162
|
+
next
|
163
|
+
when :uniform
|
164
|
+
value_count = @pcc[index].count { |arr| arr[prob_index].positive? }
|
165
|
+
value_count = 1 if value_count.zero?
|
166
|
+
prob[prob_index] *= 1.0 / value_count
|
167
|
+
when :error
|
168
|
+
raise "Unknown value '#{att}' for attribute #{@data_labels[index]}"
|
169
|
+
else
|
170
|
+
next
|
171
|
+
end
|
172
|
+
else
|
173
|
+
prob[prob_index] *= @pcp[index][val_index][prob_index]
|
174
|
+
end
|
135
175
|
end
|
176
|
+
# rubocop:enable Metrics/ClassLength
|
136
177
|
end
|
178
|
+
|
179
|
+
prob
|
137
180
|
end
|
138
181
|
|
139
182
|
# normalises the array of probabilities so the sum of the array equals 1
|
183
|
+
# @param prob [Object]
|
184
|
+
# @return [Object]
|
140
185
|
def normalize_class_probability(prob)
|
141
186
|
prob_sum = sum(prob)
|
142
|
-
prob_sum
|
143
|
-
prob.map {|prob_entry| prob_entry / prob_sum }
|
187
|
+
if prob_sum.positive?
|
188
|
+
prob.map { |prob_entry| prob_entry / prob_sum }
|
189
|
+
else
|
144
190
|
prob
|
191
|
+
end
|
145
192
|
end
|
146
193
|
|
147
194
|
# sums an array up; returns a number of type Float
|
195
|
+
# @param array [Object]
|
196
|
+
# @return [Object]
|
148
197
|
def sum(array)
|
149
|
-
array.
|
198
|
+
array.sum(0.0)
|
150
199
|
end
|
151
200
|
|
152
201
|
# returns the name of the class when the index is found
|
202
|
+
# @param index [Object]
|
203
|
+
# @return [Object]
|
153
204
|
def index_to_klass(index)
|
154
|
-
@klass_index.
|
205
|
+
@klass_index.value?(index) ? @klass_index.key(index) : nil
|
155
206
|
end
|
156
207
|
|
157
208
|
# initializes @values and @klass_index; maps a certain value to a uniq index
|
209
|
+
# @return [Object]
|
158
210
|
def initialize_klass_index
|
159
211
|
@klasses.each_with_index do |dl, index|
|
160
212
|
@klass_index[dl] = index
|
161
213
|
end
|
162
214
|
|
163
|
-
@data_labels.
|
215
|
+
0.upto(@data_labels.length - 1) do |index|
|
164
216
|
@values[index] = {}
|
165
|
-
@domains[index].each_with_index do |d, d_index|
|
217
|
+
@domains[index].to_a.sort.each_with_index do |d, d_index|
|
166
218
|
@values[index][d] = d_index
|
167
219
|
end
|
168
220
|
end
|
169
221
|
end
|
170
222
|
|
171
223
|
# returns the index of a class
|
224
|
+
# @param klass [Object]
|
225
|
+
# @return [Object]
|
172
226
|
def klass_index(klass)
|
173
227
|
@klass_index[klass]
|
174
228
|
end
|
175
229
|
|
176
230
|
# returns the index of a value, depending on the attribute index
|
231
|
+
# @param value [Object]
|
232
|
+
# @param dl_index [Object]
|
233
|
+
# @return [Object]
|
177
234
|
def value_index(value, dl_index)
|
178
235
|
@values[dl_index][value]
|
179
236
|
end
|
180
237
|
|
181
238
|
# builds an array of the form:
|
182
239
|
# array[attributes][values][classes]
|
183
|
-
|
240
|
+
# @param index [Object]
|
241
|
+
# @return [Object]
|
242
|
+
def build_array(index)
|
184
243
|
domains = Array.new(@domains[index].length)
|
185
|
-
domains.map do
|
186
|
-
|
244
|
+
domains.map do
|
245
|
+
Array.new @klasses.length, 0
|
187
246
|
end
|
188
247
|
end
|
189
248
|
|
190
249
|
# initializes the two array for storing the count and conditional probabilities of
|
191
250
|
# the attributes
|
251
|
+
# @return [Object]
|
192
252
|
def initialize_pc
|
193
|
-
@data_labels.
|
194
|
-
@pcc << build_array(
|
195
|
-
@pcp << build_array(
|
253
|
+
0.upto(@data_labels.length - 1) do |index|
|
254
|
+
@pcc << build_array(index)
|
255
|
+
@pcp << build_array(index)
|
196
256
|
end
|
197
257
|
end
|
198
258
|
|
199
259
|
# calculates the occurrences of a class and the instances of a certain value of a
|
200
260
|
# certain attribute and the assigned class.
|
201
261
|
# In addition to that, it also calculates the conditional probabilities and values
|
262
|
+
# @return [Object]
|
202
263
|
def calculate_probabilities
|
203
|
-
@klasses.each {|dl| @class_counts[klass_index(dl)] = 0}
|
264
|
+
@klasses.each { |dl| @class_counts[klass_index(dl)] = 0 }
|
204
265
|
|
205
266
|
calculate_class_probabilities
|
206
267
|
count_instances
|
207
268
|
calculate_conditional_probabilities
|
208
269
|
end
|
209
270
|
|
271
|
+
# @return [Object]
|
210
272
|
def calculate_class_probabilities
|
211
273
|
@data_items.each do |entry|
|
212
274
|
@class_counts[klass_index(entry.klass)] += 1
|
@@ -218,48 +280,50 @@ module Ai4r
|
|
218
280
|
end
|
219
281
|
|
220
282
|
# counts the instances of a certain value of a certain attribute and the assigned class
|
283
|
+
# @return [Object]
|
221
284
|
def count_instances
|
222
285
|
@data_items.each do |item|
|
223
|
-
@data_labels.
|
286
|
+
0.upto(@data_labels.length - 1) do |dl_index|
|
224
287
|
@pcc[dl_index][value_index(item[dl_index], dl_index)][klass_index(item.klass)] += 1
|
225
288
|
end
|
226
289
|
end
|
227
290
|
end
|
228
291
|
|
229
292
|
# calculates the conditional probability and stores it in the @pcp-array
|
293
|
+
# @return [Object]
|
230
294
|
def calculate_conditional_probabilities
|
231
295
|
@pcc.each_with_index do |attributes, a_index|
|
232
296
|
attributes.each_with_index do |values, v_index|
|
233
297
|
values.each_with_index do |klass, k_index|
|
234
|
-
@pcp[a_index][v_index][k_index] =
|
298
|
+
@pcp[a_index][v_index][k_index] =
|
299
|
+
(klass.to_f + (@m * @class_prob[k_index])) / (@class_counts[k_index] + @m)
|
235
300
|
end
|
236
301
|
end
|
237
302
|
end
|
238
303
|
end
|
239
304
|
|
240
|
-
#DataEntry stores the instance of the data entry
|
241
|
-
#the data is accessible via entries
|
242
|
-
#stores the class-column in the attribute klass and
|
243
|
-
#removes the column for the class-entry
|
305
|
+
# DataEntry stores the instance of the data entry
|
306
|
+
# the data is accessible via entries
|
307
|
+
# stores the class-column in the attribute klass and
|
308
|
+
# removes the column for the class-entry
|
244
309
|
class DataEntry
|
245
310
|
attr_accessor :klass, :entries
|
246
311
|
|
312
|
+
# @param attributes [Object]
|
313
|
+
# @param klass [Object]
|
314
|
+
# @return [Object]
|
247
315
|
def initialize(attributes, klass)
|
248
316
|
@klass = klass
|
249
317
|
@entries = attributes
|
250
318
|
end
|
251
319
|
|
252
320
|
# wrapper method for the access to @entries
|
321
|
+
# @param index [Object]
|
322
|
+
# @return [Object]
|
253
323
|
def [](index)
|
254
324
|
@entries[index]
|
255
325
|
end
|
256
326
|
end
|
257
|
-
|
258
327
|
end
|
259
328
|
end
|
260
329
|
end
|
261
|
-
|
262
|
-
# Monkeypatch to support both ruby 1.8 and 1.9 (key vs index method)
|
263
|
-
class Hash
|
264
|
-
alias_method(:key, :index) unless method_defined?(:key)
|
265
|
-
end
|