ai4r 1.13 → 2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/README.md +174 -0
- data/examples/classifiers/hyperpipes_data.csv +14 -0
- data/examples/classifiers/hyperpipes_example.rb +22 -0
- data/examples/classifiers/ib1_example.rb +12 -0
- data/examples/classifiers/id3_example.rb +15 -10
- data/examples/classifiers/id3_graphviz_example.rb +17 -0
- data/examples/classifiers/logistic_regression_example.rb +11 -0
- data/examples/classifiers/naive_bayes_attributes_example.rb +13 -0
- data/examples/classifiers/naive_bayes_example.rb +12 -13
- data/examples/classifiers/one_r_example.rb +27 -0
- data/examples/classifiers/parameter_tutorial.rb +29 -0
- data/examples/classifiers/prism_nominal_example.rb +15 -0
- data/examples/classifiers/prism_numeric_example.rb +21 -0
- data/examples/classifiers/simple_linear_regression_example.rb +14 -11
- data/examples/classifiers/zero_and_one_r_example.rb +34 -0
- data/examples/classifiers/zero_one_r_data.csv +8 -0
- data/examples/clusterers/clusterer_example.rb +40 -34
- data/examples/clusterers/dbscan_example.rb +17 -0
- data/examples/clusterers/dendrogram_example.rb +17 -0
- data/examples/clusterers/hierarchical_dendrogram_example.rb +20 -0
- data/examples/clusterers/kmeans_custom_example.rb +26 -0
- data/examples/genetic_algorithm/bitstring_example.rb +41 -0
- data/examples/genetic_algorithm/genetic_algorithm_example.rb +26 -18
- data/examples/genetic_algorithm/kmeans_seed_tuning.rb +45 -0
- data/examples/neural_network/backpropagation_example.rb +48 -48
- data/examples/neural_network/hopfield_example.rb +45 -0
- data/examples/neural_network/patterns_with_base_noise.rb +39 -39
- data/examples/neural_network/patterns_with_noise.rb +41 -39
- data/examples/neural_network/train_epochs_callback.rb +25 -0
- data/examples/neural_network/training_patterns.rb +39 -39
- data/examples/neural_network/transformer_text_classification.rb +78 -0
- data/examples/neural_network/xor_example.rb +23 -22
- data/examples/reinforcement/q_learning_example.rb +10 -0
- data/examples/som/som_data.rb +155 -152
- data/examples/som/som_multi_node_example.rb +12 -13
- data/examples/som/som_single_example.rb +12 -15
- data/examples/transformer/decode_classifier_example.rb +68 -0
- data/examples/transformer/deterministic_example.rb +10 -0
- data/examples/transformer/seq2seq_example.rb +16 -0
- data/lib/ai4r/classifiers/classifier.rb +24 -16
- data/lib/ai4r/classifiers/gradient_boosting.rb +64 -0
- data/lib/ai4r/classifiers/hyperpipes.rb +119 -43
- data/lib/ai4r/classifiers/ib1.rb +122 -32
- data/lib/ai4r/classifiers/id3.rb +524 -145
- data/lib/ai4r/classifiers/logistic_regression.rb +96 -0
- data/lib/ai4r/classifiers/multilayer_perceptron.rb +75 -59
- data/lib/ai4r/classifiers/naive_bayes.rb +95 -34
- data/lib/ai4r/classifiers/one_r.rb +112 -44
- data/lib/ai4r/classifiers/prism.rb +167 -76
- data/lib/ai4r/classifiers/random_forest.rb +72 -0
- data/lib/ai4r/classifiers/simple_linear_regression.rb +83 -58
- data/lib/ai4r/classifiers/support_vector_machine.rb +91 -0
- data/lib/ai4r/classifiers/votes.rb +57 -0
- data/lib/ai4r/classifiers/zero_r.rb +71 -30
- data/lib/ai4r/clusterers/average_linkage.rb +46 -27
- data/lib/ai4r/clusterers/bisecting_k_means.rb +50 -44
- data/lib/ai4r/clusterers/centroid_linkage.rb +52 -36
- data/lib/ai4r/clusterers/cluster_tree.rb +50 -0
- data/lib/ai4r/clusterers/clusterer.rb +29 -14
- data/lib/ai4r/clusterers/complete_linkage.rb +42 -31
- data/lib/ai4r/clusterers/dbscan.rb +134 -0
- data/lib/ai4r/clusterers/diana.rb +75 -49
- data/lib/ai4r/clusterers/k_means.rb +270 -135
- data/lib/ai4r/clusterers/median_linkage.rb +49 -33
- data/lib/ai4r/clusterers/single_linkage.rb +196 -88
- data/lib/ai4r/clusterers/ward_linkage.rb +51 -35
- data/lib/ai4r/clusterers/ward_linkage_hierarchical.rb +25 -10
- data/lib/ai4r/clusterers/weighted_average_linkage.rb +48 -32
- data/lib/ai4r/data/data_set.rb +223 -103
- data/lib/ai4r/data/parameterizable.rb +31 -25
- data/lib/ai4r/data/proximity.rb +62 -62
- data/lib/ai4r/data/statistics.rb +46 -35
- data/lib/ai4r/experiment/classifier_evaluator.rb +84 -32
- data/lib/ai4r/experiment/split.rb +39 -0
- data/lib/ai4r/genetic_algorithm/chromosome_base.rb +43 -0
- data/lib/ai4r/genetic_algorithm/genetic_algorithm.rb +92 -170
- data/lib/ai4r/genetic_algorithm/tsp_chromosome.rb +83 -0
- data/lib/ai4r/hmm/hidden_markov_model.rb +134 -0
- data/lib/ai4r/neural_network/activation_functions.rb +37 -0
- data/lib/ai4r/neural_network/backpropagation.rb +399 -134
- data/lib/ai4r/neural_network/hopfield.rb +175 -58
- data/lib/ai4r/neural_network/transformer.rb +194 -0
- data/lib/ai4r/neural_network/weight_initializations.rb +40 -0
- data/lib/ai4r/reinforcement/policy_iteration.rb +66 -0
- data/lib/ai4r/reinforcement/q_learning.rb +51 -0
- data/lib/ai4r/search/a_star.rb +76 -0
- data/lib/ai4r/search/bfs.rb +50 -0
- data/lib/ai4r/search/dfs.rb +50 -0
- data/lib/ai4r/search/mcts.rb +118 -0
- data/lib/ai4r/search.rb +12 -0
- data/lib/ai4r/som/distance_metrics.rb +29 -0
- data/lib/ai4r/som/layer.rb +28 -17
- data/lib/ai4r/som/node.rb +61 -32
- data/lib/ai4r/som/som.rb +158 -41
- data/lib/ai4r/som/two_phase_layer.rb +21 -25
- data/lib/ai4r/version.rb +3 -0
- data/lib/ai4r.rb +57 -28
- metadata +79 -109
- data/README.rdoc +0 -39
- data/test/classifiers/hyperpipes_test.rb +0 -84
- data/test/classifiers/ib1_test.rb +0 -78
- data/test/classifiers/id3_test.rb +0 -220
- data/test/classifiers/multilayer_perceptron_test.rb +0 -79
- data/test/classifiers/naive_bayes_test.rb +0 -43
- data/test/classifiers/one_r_test.rb +0 -62
- data/test/classifiers/prism_test.rb +0 -85
- data/test/classifiers/simple_linear_regression_test.rb +0 -37
- data/test/classifiers/zero_r_test.rb +0 -50
- data/test/clusterers/average_linkage_test.rb +0 -51
- data/test/clusterers/bisecting_k_means_test.rb +0 -66
- data/test/clusterers/centroid_linkage_test.rb +0 -53
- data/test/clusterers/complete_linkage_test.rb +0 -57
- data/test/clusterers/diana_test.rb +0 -69
- data/test/clusterers/k_means_test.rb +0 -167
- data/test/clusterers/median_linkage_test.rb +0 -53
- data/test/clusterers/single_linkage_test.rb +0 -122
- data/test/clusterers/ward_linkage_hierarchical_test.rb +0 -81
- data/test/clusterers/ward_linkage_test.rb +0 -53
- data/test/clusterers/weighted_average_linkage_test.rb +0 -53
- data/test/data/data_set_test.rb +0 -104
- data/test/data/proximity_test.rb +0 -87
- data/test/data/statistics_test.rb +0 -65
- data/test/experiment/classifier_evaluator_test.rb +0 -76
- data/test/genetic_algorithm/chromosome_test.rb +0 -57
- data/test/genetic_algorithm/genetic_algorithm_test.rb +0 -81
- data/test/neural_network/backpropagation_test.rb +0 -82
- data/test/neural_network/hopfield_test.rb +0 -72
- data/test/som/som_test.rb +0 -97
@@ -0,0 +1,96 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
# Author:: OpenAI Assistant
|
4
|
+
# License:: MPL 1.1
|
5
|
+
# Project:: ai4r
|
6
|
+
# Url:: https://github.com/SergioFierens/ai4r
|
7
|
+
#
|
8
|
+
# You can redistribute it and/or modify it under the terms of
|
9
|
+
# the Mozilla Public License version 1.1 as published by the
|
10
|
+
# Mozilla Foundation at http://www.mozilla.org/MPL/MPL-1.1.txt
|
11
|
+
|
12
|
+
require_relative '../data/data_set'
|
13
|
+
require_relative 'classifier'
|
14
|
+
|
15
|
+
module Ai4r
|
16
|
+
module Classifiers
|
17
|
+
# Implementation of binary Logistic Regression using gradient descent.
|
18
|
+
#
|
19
|
+
# Training data must have numeric attributes with the last attribute being
|
20
|
+
# the class label (0 or 1). Parameters can be adjusted with
|
21
|
+
# {Parameterizable#set_parameters}.
|
22
|
+
#
|
23
|
+
# Example:
|
24
|
+
# data = Ai4r::Data::DataSet.new(:data_items => [[0.2, 1], [0.4, 0]])
|
25
|
+
# classifier = LogisticRegression.new.build(data)
|
26
|
+
# classifier.eval([0.3])
|
27
|
+
class LogisticRegression < Classifier
|
28
|
+
attr_reader :weights
|
29
|
+
|
30
|
+
parameters_info learning_rate: 'Learning rate for gradient descent.',
|
31
|
+
iterations: 'Number of iterations to train.'
|
32
|
+
|
33
|
+
def initialize
|
34
|
+
super()
|
35
|
+
@learning_rate = 0.1
|
36
|
+
@iterations = 1000
|
37
|
+
@weights = nil
|
38
|
+
end
|
39
|
+
|
40
|
+
# Train the logistic regression classifier using the provided dataset.
|
41
|
+
def build(data_set)
|
42
|
+
raise 'Error instance must be passed' unless data_set.is_a?(Ai4r::Data::DataSet)
|
43
|
+
|
44
|
+
data_set.check_not_empty
|
45
|
+
|
46
|
+
x = data_set.data_items.map { |item| item[0...-1].map(&:to_f) }
|
47
|
+
y = data_set.data_items.map { |item| item.last.to_f }
|
48
|
+
m = x.length
|
49
|
+
n = x.first.length
|
50
|
+
@weights = Array.new(n + 1, 0.0) # last value is bias
|
51
|
+
|
52
|
+
@iterations.times do
|
53
|
+
predictions = x.map do |row|
|
54
|
+
z = row.each_with_index.inject(@weights.last) { |s, (v, j)| s + (v * @weights[j]) }
|
55
|
+
1.0 / (1.0 + Math.exp(-z))
|
56
|
+
end
|
57
|
+
errors = predictions.zip(y).map { |p, label| p - label }
|
58
|
+
|
59
|
+
n.times do |j|
|
60
|
+
grad = (0...m).inject(0.0) { |sum, i| sum + (errors[i] * x[i][j]) } / m
|
61
|
+
@weights[j] -= @learning_rate * grad
|
62
|
+
end
|
63
|
+
bias_grad = errors.sum / m
|
64
|
+
@weights[n] -= @learning_rate * bias_grad
|
65
|
+
end
|
66
|
+
self
|
67
|
+
end
|
68
|
+
|
69
|
+
# Predict the class (0 or 1) for the given data array.
|
70
|
+
def eval(data)
|
71
|
+
raise 'Model not trained' unless @weights
|
72
|
+
|
73
|
+
expected_size = @weights.length - 1
|
74
|
+
if data.length != expected_size
|
75
|
+
raise ArgumentError,
|
76
|
+
"Wrong number of inputs. Expected: #{expected_size}, " \
|
77
|
+
"received: #{data.length}."
|
78
|
+
end
|
79
|
+
|
80
|
+
z = data.each_with_index.inject(@weights.last) do |s, (v, j)|
|
81
|
+
s + (v.to_f * @weights[j])
|
82
|
+
end
|
83
|
+
prob = 1.0 / (1.0 + Math.exp(-z))
|
84
|
+
prob >= 0.5 ? 1 : 0
|
85
|
+
end
|
86
|
+
|
87
|
+
# Logistic Regression classifiers cannot generate human readable rules.
|
88
|
+
#
|
89
|
+
# This method returns a string explaining that rule extraction is not
|
90
|
+
# supported for this algorithm.
|
91
|
+
def get_rules
|
92
|
+
'LogisticRegression does not support rule extraction.'
|
93
|
+
end
|
94
|
+
end
|
95
|
+
end
|
96
|
+
end
|
@@ -1,104 +1,118 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
# Author:: Sergio Fierens (Implementation only)
|
2
4
|
# License:: MPL 1.1
|
3
5
|
# Project:: ai4r
|
4
|
-
# Url::
|
6
|
+
# Url:: https://github.com/SergioFierens/ai4r
|
5
7
|
#
|
6
|
-
# You can redistribute it and/or modify it under the terms of
|
7
|
-
# the Mozilla Public License version 1.1 as published by the
|
8
|
+
# You can redistribute it and/or modify it under the terms of
|
9
|
+
# the Mozilla Public License version 1.1 as published by the
|
8
10
|
# Mozilla Foundation at http://www.mozilla.org/MPL/MPL-1.1.txt
|
9
11
|
|
10
|
-
|
11
|
-
|
12
|
-
|
12
|
+
require_relative '../data/data_set'
|
13
|
+
require_relative '../classifiers/classifier'
|
14
|
+
require_relative '../neural_network/backpropagation'
|
13
15
|
|
14
16
|
module Ai4r
|
15
17
|
module Classifiers
|
16
|
-
|
17
18
|
# = Introduction
|
18
|
-
#
|
19
|
-
# The idea behind the MultilayerPerceptron classifier is to
|
20
|
-
# train a Multilayer Perceptron neural network with the provided examples,
|
19
|
+
#
|
20
|
+
# The idea behind the MultilayerPerceptron classifier is to
|
21
|
+
# train a Multilayer Perceptron neural network with the provided examples,
|
21
22
|
# and predict the class for new data items.
|
22
|
-
#
|
23
|
+
#
|
23
24
|
# = Parameters
|
24
|
-
#
|
25
|
+
#
|
25
26
|
# Use class method get_parameters_info to obtain details on the algorithm
|
26
27
|
# parameters. Use set_parameters to set values for these parameters.
|
27
28
|
# See Parameterizable module documentation.
|
28
|
-
#
|
29
|
-
# * :network_class => Neural network implementation class.
|
29
|
+
#
|
30
|
+
# * :network_class => Neural network implementation class.
|
30
31
|
# By default: Ai4r::NeuralNetwork::Backpropagation.
|
31
32
|
# * :network_parameters => Parameters to be forwarded to the back end
|
32
|
-
# neural ntework.
|
33
|
-
# * :hidden_layers => Hidden layer structure. E.g. [8, 6] will generate
|
33
|
+
#     neural network.
|
34
|
+
# * :hidden_layers => Hidden layer structure. E.g. [8, 6] will generate
|
34
35
|
# 2 hidden layers with 8 and 6 neurons each. By default []
|
35
|
-
# * :training_iterations => How many times the training should be repeated.
|
36
|
-
# By default:
|
37
|
-
# :active_node_value => Default: 1
|
36
|
+
# * :training_iterations => How many times the training should be repeated.
|
37
|
+
# By default: 500.
|
38
|
+
# :active_node_value => Default: 1
|
38
39
|
#   :inactive_node_value => Default: 0
|
39
40
|
class MultilayerPerceptron < Classifier
|
40
|
-
|
41
41
|
attr_reader :data_set, :class_value, :network, :domains
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
42
|
+
|
43
|
+
TRAINING_ITERATIONS = 500
|
44
|
+
|
45
|
+
parameters_info network_class: 'Neural network implementation class.' \
|
46
|
+
'By default: Ai4r::NeuralNetwork::Backpropagation.',
|
47
|
+
network_parameters: 'parameters to be forwarded to the back end ' \
|
48
|
+
'neural network.',
|
49
|
+
hidden_layers: 'Hidden layer structure. E.g. [8, 6] will generate ' \
|
50
|
+
'2 hidden layers with 8 and 6 neurons each. By default []',
|
51
|
+
training_iterations: 'How many times the training should be ' \
|
52
|
+
"repeated. By default: #{TRAINING_ITERATIONS}",
|
53
|
+
active_node_value: 'Default: 1',
|
54
|
+
inactive_node_value: 'Default: 0'
|
55
|
+
|
56
|
+
# @return [Object]
|
54
57
|
def initialize
|
58
|
+
super()
|
55
59
|
@network_class = Ai4r::NeuralNetwork::Backpropagation
|
56
60
|
@hidden_layers = []
|
57
|
-
@training_iterations =
|
61
|
+
@training_iterations = TRAINING_ITERATIONS
|
58
62
|
@network_parameters = {}
|
59
63
|
@active_node_value = 1
|
60
64
|
@inactive_node_value = 0
|
61
65
|
end
|
62
|
-
|
63
|
-
# Build a new MultilayerPerceptron classifier. You must provide a DataSet
|
64
|
-
# instance as parameter. The last attribute of each item is considered as
|
66
|
+
|
67
|
+
# Build a new MultilayerPerceptron classifier. You must provide a DataSet
|
68
|
+
# instance as parameter. The last attribute of each item is considered as
|
65
69
|
# the item class.
|
70
|
+
# @param data_set [Object]
|
71
|
+
# @return [Object]
|
66
72
|
def build(data_set)
|
67
73
|
data_set.check_not_empty
|
68
74
|
@data_set = data_set
|
69
|
-
@domains = @data_set.build_domains.collect
|
75
|
+
@domains = @data_set.build_domains.collect(&:to_a)
|
70
76
|
@outputs = @domains.last.length
|
71
77
|
@inputs = 0
|
72
|
-
@domains[0...-1].each {|domain| @inputs += domain.length}
|
78
|
+
@domains[0...-1].each { |domain| @inputs += domain.length }
|
73
79
|
@structure = [@inputs] + @hidden_layers + [@outputs]
|
74
80
|
@network = @network_class.new @structure
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
end
|
81
|
+
inputs = []
|
82
|
+
outputs = []
|
83
|
+
data_set.data_items.each do |data_item|
|
84
|
+
inputs << data_to_input(data_item[0...-1])
|
85
|
+
outputs << data_to_output(data_item.last)
|
81
86
|
end
|
82
|
-
|
87
|
+
@network.train_epochs(inputs, outputs,
|
88
|
+
epochs: @training_iterations, batch_size: 1)
|
89
|
+
self
|
83
90
|
end
|
84
|
-
|
91
|
+
# rubocop:enable Metrics/AbcSize
|
92
|
+
|
85
93
|
# You can evaluate new data, predicting its class.
|
86
94
|
# e.g.
|
87
95
|
# classifier.eval(['New York', '<30', 'F']) # => 'Y'
|
96
|
+
# @param data [Object]
|
97
|
+
# @return [Object]
|
88
98
|
def eval(data)
|
89
99
|
input_values = data_to_input(data)
|
90
100
|
output_values = @network.eval(input_values)
|
91
|
-
|
101
|
+
@domains.last[get_max_index(output_values)]
|
92
102
|
end
|
93
|
-
|
94
|
-
# Multilayer Perceptron Classifiers cannot generate
|
103
|
+
|
104
|
+
# Multilayer Perceptron Classifiers cannot generate
|
95
105
|
# human-readable rules.
|
106
|
+
# @return [Object]
|
96
107
|
def get_rules
|
97
|
-
|
108
|
+
"raise 'Neural networks classifiers do not generate human-readable rules.'"
|
98
109
|
end
|
110
|
+
# rubocop:enable Naming/AccessorMethodName
|
99
111
|
|
100
112
|
protected
|
101
|
-
|
113
|
+
|
114
|
+
# @param data_item [Object]
|
115
|
+
# @return [Object]
|
102
116
|
def data_to_input(data_item)
|
103
117
|
input_values = Array.new(@inputs, @inactive_node_value)
|
104
118
|
accum_index = 0
|
@@ -106,17 +120,21 @@ module Ai4r
|
|
106
120
|
att_value = data_item[att_index]
|
107
121
|
domain_index = @domains[att_index].index(att_value)
|
108
122
|
input_values[domain_index + accum_index] = @active_node_value
|
109
|
-
accum_index
|
123
|
+
accum_index += @domains[att_index].length
|
110
124
|
end
|
111
|
-
|
125
|
+
input_values
|
112
126
|
end
|
113
|
-
|
127
|
+
|
128
|
+
# @param data_item [Object]
|
129
|
+
# @return [Object]
|
114
130
|
def data_to_output(data_item)
|
115
131
|
output_values = Array.new(@outputs, @inactive_node_value)
|
116
132
|
output_values[@domains.last.index(data_item)] = @active_node_value
|
117
|
-
|
133
|
+
output_values
|
118
134
|
end
|
119
|
-
|
135
|
+
|
136
|
+
# @param output_values [Object]
|
137
|
+
# @return [Object]
|
120
138
|
def get_max_index(output_values)
|
121
139
|
max_value = @inactive_node_value
|
122
140
|
max_index = 0
|
@@ -126,10 +144,8 @@ module Ai4r
|
|
126
144
|
max_index = output_index
|
127
145
|
end
|
128
146
|
end
|
129
|
-
|
147
|
+
max_index
|
130
148
|
end
|
131
|
-
|
132
149
|
end
|
133
|
-
|
134
150
|
end
|
135
151
|
end
|
@@ -1,19 +1,19 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
# Author:: Thomas Kern
|
2
4
|
# License:: MPL 1.1
|
3
5
|
# Project:: ai4r
|
4
|
-
# Url::
|
6
|
+
# Url:: https://github.com/SergioFierens/ai4r
|
5
7
|
#
|
6
8
|
# You can redistribute it and/or modify it under the terms of
|
7
9
|
# the Mozilla Public License version 1.1 as published by the
|
8
10
|
# Mozilla Foundation at http://www.mozilla.org/MPL/MPL-1.1.txt
|
9
11
|
|
10
|
-
|
11
|
-
|
12
|
+
require_relative '../data/data_set'
|
13
|
+
require_relative 'classifier'
|
12
14
|
|
13
15
|
module Ai4r
|
14
16
|
module Classifiers
|
15
|
-
|
16
|
-
|
17
17
|
# = Introduction
|
18
18
|
#
|
19
19
|
# This is an implementation of a Naive Bayesian Classifier without any
|
@@ -21,7 +21,7 @@ module Ai4r
|
|
21
21
|
# Probabilities P(a_i | v_j) are estimated using m-estimates, hence the
|
22
22
|
# m parameter as second parameter when instantiating the class.
|
23
23
|
# The estimation looks like this:
|
24
|
-
#(n_c + mp) / (n + m)
|
24
|
+
# (n_c + mp) / (n + m)
|
25
25
|
#
|
26
26
|
# the variables are:
|
27
27
|
# n = the number of training examples for which v = v_j
|
@@ -54,14 +54,21 @@ module Ai4r
|
|
54
54
|
# build data
|
55
55
|
# b.eval(["Red", "SUV", "Domestic"])
|
56
56
|
#
|
57
|
-
|
57
|
+
|
58
|
+
# Probabilistic classifier based on Bayes' theorem.
|
58
59
|
class NaiveBayes < Classifier
|
60
|
+
attr_reader :class_prob, :pcc, :pcp
|
59
61
|
|
60
|
-
parameters_info :
|
61
|
-
|
62
|
-
|
62
|
+
parameters_info m: 'Default value is set to 0. It may be set to a value greater than ' \
|
63
|
+
'0 when the size of the dataset is relatively small',
|
64
|
+
unknown_value_strategy: 'Behaviour when evaluating unseen attribute values: ' \
|
65
|
+
':ignore (default), :uniform or :error.'
|
66
|
+
|
67
|
+
# @return [Object]
|
63
68
|
def initialize
|
69
|
+
super()
|
64
70
|
@m = 0
|
71
|
+
@unknown_value_strategy = :ignore
|
65
72
|
@class_counts = []
|
66
73
|
@class_prob = [] # stores the probability of the classes
|
67
74
|
@pcc = [] # stores the number of instances divided into attribute/value/class
|
@@ -69,11 +76,13 @@ module Ai4r
|
|
69
76
|
@klass_index = {} # hashmap for quick lookup of all the used klasses and their indice
|
70
77
|
@values = {} # hashmap for quick lookup of all the values
|
71
78
|
end
|
72
|
-
|
79
|
+
|
73
80
|
# You can evaluate new data, predicting its category.
|
74
81
|
# e.g.
|
75
82
|
# b.eval(["Red", "SUV", "Domestic"])
|
76
83
|
# => 'No'
|
84
|
+
# @param data [Object]
|
85
|
+
# @return [Object]
|
77
86
|
def eval(data)
|
78
87
|
prob = @class_prob.dup
|
79
88
|
prob = calculate_class_probabilities_for_entry(data, prob)
|
@@ -82,13 +91,15 @@ module Ai4r
|
|
82
91
|
|
83
92
|
# Calculates the probabilities for the data entry Data.
|
84
93
|
# data has to be an array of the same dimension as the training data minus the
|
85
|
-
# class column.
|
94
|
+
# class column.
|
86
95
|
# Returns a map containing all classes as keys:
|
87
96
|
# {Class_1 => probability, Class_2 => probability2 ... }
|
88
97
|
# Probability is <= 1 and of type Float.
|
89
98
|
# e.g.
|
90
99
|
# b.get_probability_map(["Red", "SUV", "Domestic"])
|
91
100
|
# => {"Yes"=>0.4166666666666667, "No"=>0.5833333333333334}
|
101
|
+
# @param data [Object]
|
102
|
+
# @return [Object]
|
92
103
|
def get_probability_map(data)
|
93
104
|
prob = @class_prob.dup
|
94
105
|
prob = calculate_class_probabilities_for_entry(data, prob)
|
@@ -102,9 +113,11 @@ module Ai4r
|
|
102
113
|
# counts values of the attribute instances and calculates the probability of the classes
|
103
114
|
# and the conditional probabilities
|
104
115
|
# Parameter data has to be an instance of Ai4r::Data::DataSet
|
116
|
+
# @param data [Object]
|
117
|
+
# @return [Object]
|
105
118
|
def build(data)
|
106
119
|
raise 'Error instance must be passed' unless data.is_a?(Ai4r::Data::DataSet)
|
107
|
-
raise 'Data should not be empty' if data.data_items.
|
120
|
+
raise 'Data should not be empty' if data.data_items.empty?
|
108
121
|
|
109
122
|
initialize_domain_data(data)
|
110
123
|
initialize_klass_index
|
@@ -114,50 +127,86 @@ module Ai4r
|
|
114
127
|
self
|
115
128
|
end
|
116
129
|
|
130
|
+
# Naive Bayes classifiers cannot generate human readable rules.
|
131
|
+
# This method returns a descriptive string explaining that rule
|
132
|
+
# extraction is not supported for this algorithm.
|
133
|
+
def get_rules
|
134
|
+
'NaiveBayes does not support rule extraction.'
|
135
|
+
end
|
136
|
+
|
117
137
|
private
|
118
138
|
|
139
|
+
# @param data [Object]
|
140
|
+
# @return [Object]
|
119
141
|
def initialize_domain_data(data)
|
120
142
|
@domains = data.build_domains
|
121
143
|
@data_items = data.data_items.map { |item| DataEntry.new(item[0...-1], item.last) }
|
122
144
|
@data_labels = data.data_labels[0...-1]
|
123
|
-
@klasses = @domains.last.to_a
|
145
|
+
@klasses = @domains.last.to_a.sort
|
124
146
|
end
|
125
147
|
|
126
|
-
|
127
148
|
# calculates the klass probability of a data entry
|
128
149
|
# as usual, the probability of the value is multiplied with every conditional
|
129
150
|
# probability of every attribute in condition to a specific class
|
130
151
|
# this is repeated for every class
|
152
|
+
# @param data [Object]
|
153
|
+
# @param prob [Object]
|
154
|
+
# @return [Object]
|
131
155
|
def calculate_class_probabilities_for_entry(data, prob)
|
132
156
|
0.upto(prob.length - 1) do |prob_index|
|
133
157
|
data.each_with_index do |att, index|
|
134
|
-
|
135
|
-
|
158
|
+
val_index = value_index(att, index)
|
159
|
+
if val_index.nil?
|
160
|
+
case @unknown_value_strategy
|
161
|
+
when :ignore
|
162
|
+
next
|
163
|
+
when :uniform
|
164
|
+
value_count = @pcc[index].count { |arr| arr[prob_index].positive? }
|
165
|
+
value_count = 1 if value_count.zero?
|
166
|
+
prob[prob_index] *= 1.0 / value_count
|
167
|
+
when :error
|
168
|
+
raise "Unknown value '#{att}' for attribute #{@data_labels[index]}"
|
169
|
+
else
|
170
|
+
next
|
171
|
+
end
|
172
|
+
else
|
173
|
+
prob[prob_index] *= @pcp[index][val_index][prob_index]
|
174
|
+
end
|
136
175
|
end
|
176
|
+
# rubocop:enable Metrics/ClassLength
|
137
177
|
end
|
138
|
-
|
178
|
+
|
139
179
|
prob
|
140
180
|
end
|
141
181
|
|
142
182
|
# normalises the array of probabilities so the sum of the array equals 1
|
183
|
+
# @param prob [Object]
|
184
|
+
# @return [Object]
|
143
185
|
def normalize_class_probability(prob)
|
144
186
|
prob_sum = sum(prob)
|
145
|
-
prob_sum
|
146
|
-
prob.map { |prob_entry| prob_entry / prob_sum }
|
187
|
+
if prob_sum.positive?
|
188
|
+
prob.map { |prob_entry| prob_entry / prob_sum }
|
189
|
+
else
|
147
190
|
prob
|
191
|
+
end
|
148
192
|
end
|
149
193
|
|
150
194
|
# sums an array up; returns a number of type Float
|
195
|
+
# @param array [Object]
|
196
|
+
# @return [Object]
|
151
197
|
def sum(array)
|
152
|
-
array.
|
198
|
+
array.sum(0.0)
|
153
199
|
end
|
154
200
|
|
155
201
|
# returns the name of the class when the index is found
|
202
|
+
# @param index [Object]
|
203
|
+
# @return [Object]
|
156
204
|
def index_to_klass(index)
|
157
|
-
@klass_index.
|
205
|
+
@klass_index.value?(index) ? @klass_index.key(index) : nil
|
158
206
|
end
|
159
207
|
|
160
208
|
# initializes @values and @klass_index; maps a certain value to a uniq index
|
209
|
+
# @return [Object]
|
161
210
|
def initialize_klass_index
|
162
211
|
@klasses.each_with_index do |dl, index|
|
163
212
|
@klass_index[dl] = index
|
@@ -165,24 +214,31 @@ module Ai4r
|
|
165
214
|
|
166
215
|
0.upto(@data_labels.length - 1) do |index|
|
167
216
|
@values[index] = {}
|
168
|
-
@domains[index].each_with_index do |d, d_index|
|
217
|
+
@domains[index].to_a.sort.each_with_index do |d, d_index|
|
169
218
|
@values[index][d] = d_index
|
170
219
|
end
|
171
220
|
end
|
172
221
|
end
|
173
222
|
|
174
223
|
# returns the index of a class
|
224
|
+
# @param klass [Object]
|
225
|
+
# @return [Object]
|
175
226
|
def klass_index(klass)
|
176
227
|
@klass_index[klass]
|
177
228
|
end
|
178
229
|
|
179
230
|
# returns the index of a value, depending on the attribute index
|
231
|
+
# @param value [Object]
|
232
|
+
# @param dl_index [Object]
|
233
|
+
# @return [Object]
|
180
234
|
def value_index(value, dl_index)
|
181
235
|
@values[dl_index][value]
|
182
236
|
end
|
183
237
|
|
184
238
|
# builds an array of the form:
|
185
239
|
# array[attributes][values][classes]
|
240
|
+
# @param index [Object]
|
241
|
+
# @return [Object]
|
186
242
|
def build_array(index)
|
187
243
|
domains = Array.new(@domains[index].length)
|
188
244
|
domains.map do
|
@@ -192,6 +248,7 @@ module Ai4r
|
|
192
248
|
|
193
249
|
# initializes the two arrays for storing the count and conditional probabilities of
|
194
250
|
# the attributes
|
251
|
+
# @return [Object]
|
195
252
|
def initialize_pc
|
196
253
|
0.upto(@data_labels.length - 1) do |index|
|
197
254
|
@pcc << build_array(index)
|
@@ -202,6 +259,7 @@ module Ai4r
|
|
202
259
|
# calculates the occurrences of a class and the instances of a certain value of a
|
203
260
|
# certain attribute and the assigned class.
|
204
261
|
# In addition to that, it also calculates the conditional probabilities and values
|
262
|
+
# @return [Object]
|
205
263
|
def calculate_probabilities
|
206
264
|
@klasses.each { |dl| @class_counts[klass_index(dl)] = 0 }
|
207
265
|
|
@@ -210,6 +268,7 @@ module Ai4r
|
|
210
268
|
calculate_conditional_probabilities
|
211
269
|
end
|
212
270
|
|
271
|
+
# @return [Object]
|
213
272
|
def calculate_class_probabilities
|
214
273
|
@data_items.each do |entry|
|
215
274
|
@class_counts[klass_index(entry.klass)] += 1
|
@@ -221,6 +280,7 @@ module Ai4r
|
|
221
280
|
end
|
222
281
|
|
223
282
|
# counts the instances of a certain value of a certain attribute and the assigned class
|
283
|
+
# @return [Object]
|
224
284
|
def count_instances
|
225
285
|
@data_items.each do |item|
|
226
286
|
0.upto(@data_labels.length - 1) do |dl_index|
|
@@ -230,39 +290,40 @@ module Ai4r
|
|
230
290
|
end
|
231
291
|
|
232
292
|
# calculates the conditional probability and stores it in the @pcp-array
|
293
|
+
# @return [Object]
|
233
294
|
def calculate_conditional_probabilities
|
234
295
|
@pcc.each_with_index do |attributes, a_index|
|
235
296
|
attributes.each_with_index do |values, v_index|
|
236
297
|
values.each_with_index do |klass, k_index|
|
237
|
-
@pcp[a_index][v_index][k_index] =
|
298
|
+
@pcp[a_index][v_index][k_index] =
|
299
|
+
(klass.to_f + (@m * @class_prob[k_index])) / (@class_counts[k_index] + @m)
|
238
300
|
end
|
239
301
|
end
|
240
302
|
end
|
241
303
|
end
|
242
304
|
|
243
|
-
#DataEntry stores the instance of the data entry
|
244
|
-
#the data is accessible via entries
|
245
|
-
#stores the class-column in the attribute klass and
|
246
|
-
#removes the column for the class-entry
|
305
|
+
# DataEntry stores the instance of the data entry
|
306
|
+
# the data is accessible via entries
|
307
|
+
# stores the class-column in the attribute klass and
|
308
|
+
# removes the column for the class-entry
|
247
309
|
class DataEntry
|
248
310
|
attr_accessor :klass, :entries
|
249
311
|
|
312
|
+
# @param attributes [Object]
|
313
|
+
# @param klass [Object]
|
314
|
+
# @return [Object]
|
250
315
|
def initialize(attributes, klass)
|
251
316
|
@klass = klass
|
252
317
|
@entries = attributes
|
253
318
|
end
|
254
319
|
|
255
320
|
# wrapper method for the access to @entries
|
321
|
+
# @param index [Object]
|
322
|
+
# @return [Object]
|
256
323
|
def [](index)
|
257
324
|
@entries[index]
|
258
325
|
end
|
259
326
|
end
|
260
|
-
|
261
327
|
end
|
262
328
|
end
|
263
329
|
end
|
264
|
-
|
265
|
-
# Monkeypatch to support both ruby 1.8 and 1.9 (key vs index method)
|
266
|
-
class Hash
|
267
|
-
alias_method(:key, :index) unless method_defined?(:key)
|
268
|
-
end
|