ai4r 1.13 → 2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/README.md +174 -0
- data/examples/classifiers/hyperpipes_data.csv +14 -0
- data/examples/classifiers/hyperpipes_example.rb +22 -0
- data/examples/classifiers/ib1_example.rb +12 -0
- data/examples/classifiers/id3_example.rb +15 -10
- data/examples/classifiers/id3_graphviz_example.rb +17 -0
- data/examples/classifiers/logistic_regression_example.rb +11 -0
- data/examples/classifiers/naive_bayes_attributes_example.rb +13 -0
- data/examples/classifiers/naive_bayes_example.rb +12 -13
- data/examples/classifiers/one_r_example.rb +27 -0
- data/examples/classifiers/parameter_tutorial.rb +29 -0
- data/examples/classifiers/prism_nominal_example.rb +15 -0
- data/examples/classifiers/prism_numeric_example.rb +21 -0
- data/examples/classifiers/simple_linear_regression_example.rb +14 -11
- data/examples/classifiers/zero_and_one_r_example.rb +34 -0
- data/examples/classifiers/zero_one_r_data.csv +8 -0
- data/examples/clusterers/clusterer_example.rb +40 -34
- data/examples/clusterers/dbscan_example.rb +17 -0
- data/examples/clusterers/dendrogram_example.rb +17 -0
- data/examples/clusterers/hierarchical_dendrogram_example.rb +20 -0
- data/examples/clusterers/kmeans_custom_example.rb +26 -0
- data/examples/genetic_algorithm/bitstring_example.rb +41 -0
- data/examples/genetic_algorithm/genetic_algorithm_example.rb +26 -18
- data/examples/genetic_algorithm/kmeans_seed_tuning.rb +45 -0
- data/examples/neural_network/backpropagation_example.rb +48 -48
- data/examples/neural_network/hopfield_example.rb +45 -0
- data/examples/neural_network/patterns_with_base_noise.rb +39 -39
- data/examples/neural_network/patterns_with_noise.rb +41 -39
- data/examples/neural_network/train_epochs_callback.rb +25 -0
- data/examples/neural_network/training_patterns.rb +39 -39
- data/examples/neural_network/transformer_text_classification.rb +78 -0
- data/examples/neural_network/xor_example.rb +23 -22
- data/examples/reinforcement/q_learning_example.rb +10 -0
- data/examples/som/som_data.rb +155 -152
- data/examples/som/som_multi_node_example.rb +12 -13
- data/examples/som/som_single_example.rb +12 -15
- data/examples/transformer/decode_classifier_example.rb +68 -0
- data/examples/transformer/deterministic_example.rb +10 -0
- data/examples/transformer/seq2seq_example.rb +16 -0
- data/lib/ai4r/classifiers/classifier.rb +24 -16
- data/lib/ai4r/classifiers/gradient_boosting.rb +64 -0
- data/lib/ai4r/classifiers/hyperpipes.rb +119 -43
- data/lib/ai4r/classifiers/ib1.rb +122 -32
- data/lib/ai4r/classifiers/id3.rb +524 -145
- data/lib/ai4r/classifiers/logistic_regression.rb +96 -0
- data/lib/ai4r/classifiers/multilayer_perceptron.rb +75 -59
- data/lib/ai4r/classifiers/naive_bayes.rb +95 -34
- data/lib/ai4r/classifiers/one_r.rb +112 -44
- data/lib/ai4r/classifiers/prism.rb +167 -76
- data/lib/ai4r/classifiers/random_forest.rb +72 -0
- data/lib/ai4r/classifiers/simple_linear_regression.rb +83 -58
- data/lib/ai4r/classifiers/support_vector_machine.rb +91 -0
- data/lib/ai4r/classifiers/votes.rb +57 -0
- data/lib/ai4r/classifiers/zero_r.rb +71 -30
- data/lib/ai4r/clusterers/average_linkage.rb +46 -27
- data/lib/ai4r/clusterers/bisecting_k_means.rb +50 -44
- data/lib/ai4r/clusterers/centroid_linkage.rb +52 -36
- data/lib/ai4r/clusterers/cluster_tree.rb +50 -0
- data/lib/ai4r/clusterers/clusterer.rb +29 -14
- data/lib/ai4r/clusterers/complete_linkage.rb +42 -31
- data/lib/ai4r/clusterers/dbscan.rb +134 -0
- data/lib/ai4r/clusterers/diana.rb +75 -49
- data/lib/ai4r/clusterers/k_means.rb +270 -135
- data/lib/ai4r/clusterers/median_linkage.rb +49 -33
- data/lib/ai4r/clusterers/single_linkage.rb +196 -88
- data/lib/ai4r/clusterers/ward_linkage.rb +51 -35
- data/lib/ai4r/clusterers/ward_linkage_hierarchical.rb +25 -10
- data/lib/ai4r/clusterers/weighted_average_linkage.rb +48 -32
- data/lib/ai4r/data/data_set.rb +223 -103
- data/lib/ai4r/data/parameterizable.rb +31 -25
- data/lib/ai4r/data/proximity.rb +62 -62
- data/lib/ai4r/data/statistics.rb +46 -35
- data/lib/ai4r/experiment/classifier_evaluator.rb +84 -32
- data/lib/ai4r/experiment/split.rb +39 -0
- data/lib/ai4r/genetic_algorithm/chromosome_base.rb +43 -0
- data/lib/ai4r/genetic_algorithm/genetic_algorithm.rb +92 -170
- data/lib/ai4r/genetic_algorithm/tsp_chromosome.rb +83 -0
- data/lib/ai4r/hmm/hidden_markov_model.rb +134 -0
- data/lib/ai4r/neural_network/activation_functions.rb +37 -0
- data/lib/ai4r/neural_network/backpropagation.rb +399 -134
- data/lib/ai4r/neural_network/hopfield.rb +175 -58
- data/lib/ai4r/neural_network/transformer.rb +194 -0
- data/lib/ai4r/neural_network/weight_initializations.rb +40 -0
- data/lib/ai4r/reinforcement/policy_iteration.rb +66 -0
- data/lib/ai4r/reinforcement/q_learning.rb +51 -0
- data/lib/ai4r/search/a_star.rb +76 -0
- data/lib/ai4r/search/bfs.rb +50 -0
- data/lib/ai4r/search/dfs.rb +50 -0
- data/lib/ai4r/search/mcts.rb +118 -0
- data/lib/ai4r/search.rb +12 -0
- data/lib/ai4r/som/distance_metrics.rb +29 -0
- data/lib/ai4r/som/layer.rb +28 -17
- data/lib/ai4r/som/node.rb +61 -32
- data/lib/ai4r/som/som.rb +158 -41
- data/lib/ai4r/som/two_phase_layer.rb +21 -25
- data/lib/ai4r/version.rb +3 -0
- data/lib/ai4r.rb +57 -28
- metadata +79 -109
- data/README.rdoc +0 -39
- data/test/classifiers/hyperpipes_test.rb +0 -84
- data/test/classifiers/ib1_test.rb +0 -78
- data/test/classifiers/id3_test.rb +0 -220
- data/test/classifiers/multilayer_perceptron_test.rb +0 -79
- data/test/classifiers/naive_bayes_test.rb +0 -43
- data/test/classifiers/one_r_test.rb +0 -62
- data/test/classifiers/prism_test.rb +0 -85
- data/test/classifiers/simple_linear_regression_test.rb +0 -37
- data/test/classifiers/zero_r_test.rb +0 -50
- data/test/clusterers/average_linkage_test.rb +0 -51
- data/test/clusterers/bisecting_k_means_test.rb +0 -66
- data/test/clusterers/centroid_linkage_test.rb +0 -53
- data/test/clusterers/complete_linkage_test.rb +0 -57
- data/test/clusterers/diana_test.rb +0 -69
- data/test/clusterers/k_means_test.rb +0 -167
- data/test/clusterers/median_linkage_test.rb +0 -53
- data/test/clusterers/single_linkage_test.rb +0 -122
- data/test/clusterers/ward_linkage_hierarchical_test.rb +0 -81
- data/test/clusterers/ward_linkage_test.rb +0 -53
- data/test/clusterers/weighted_average_linkage_test.rb +0 -53
- data/test/data/data_set_test.rb +0 -104
- data/test/data/proximity_test.rb +0 -87
- data/test/data/statistics_test.rb +0 -65
- data/test/experiment/classifier_evaluator_test.rb +0 -76
- data/test/genetic_algorithm/chromosome_test.rb +0 -57
- data/test/genetic_algorithm/genetic_algorithm_test.rb +0 -81
- data/test/neural_network/backpropagation_test.rb +0 -82
- data/test/neural_network/hopfield_test.rb +0 -72
- data/test/som/som_test.rb +0 -97
@@ -1,80 +1,87 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
# Author:: Sergio Fierens (implementation)
|
2
4
|
# License:: MPL 1.1
|
3
5
|
# Project:: ai4r
|
4
|
-
# Url::
|
6
|
+
# Url:: https://github.com/SergioFierens/ai4r
|
5
7
|
#
|
6
|
-
# You can redistribute it and/or modify it under the terms of
|
7
|
-
# the Mozilla Public License version 1.1 as published by the
|
8
|
+
# You can redistribute it and/or modify it under the terms of
|
9
|
+
# the Mozilla Public License version 1.1 as published by the
|
8
10
|
# Mozilla Foundation at http://www.mozilla.org/MPL/MPL-1.1.txt
|
9
11
|
|
10
|
-
|
11
|
-
|
12
|
+
require_relative '../data/data_set'
|
13
|
+
require_relative '../clusterers/k_means'
|
12
14
|
|
13
15
|
module Ai4r
|
14
16
|
module Clusterers
|
15
|
-
|
16
17
|
# The Bisecting k-means algorithm is a variation of the "k-means" algorithm,
|
17
18
|
# somewhat less sensitive to the initial election of centroids than the
|
18
|
-
# original.
|
19
|
-
#
|
19
|
+
# original.
|
20
|
+
#
|
20
21
|
# More about K Means algorithm:
|
21
|
-
# http://en.wikipedia.org/wiki/K-means_algorithm
|
22
|
+
# http://en.wikipedia.org/wiki/K-means_algorithm
|
22
23
|
class BisectingKMeans < KMeans
|
23
|
-
|
24
24
|
attr_reader :data_set, :number_of_clusters, :clusters, :centroids
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
25
|
+
|
26
|
+
parameters_info max_iterations: 'Maximum number of iterations to ' \
|
27
|
+
'build the clusterer. By default it is uncapped.',
|
28
|
+
distance_function: 'Custom implementation of distance function. ' \
|
29
|
+
'It must be a closure receiving two data items and return the ' \
|
30
|
+
'distance between them. By default, this algorithm uses ' \
|
31
|
+
'euclidean distance of numeric attributes to the power of 2.',
|
32
|
+
centroid_function: 'Custom implementation to calculate the ' \
|
33
|
+
'centroid of a cluster. It must be a closure receiving an array of ' \
|
34
|
+
'data sets, and return an array of data items, representing the ' \
|
35
|
+
'centroids of for each data set. ' \
|
36
|
+
'By default, this algorithm returns a data items using the mode ' \
|
37
|
+
'or mean of each attribute on each data set.',
|
38
|
+
refine: 'Boolean value. True by default. It will run the ' \
|
39
|
+
'classic K Means algorithm, using as initial centroids the ' \
|
40
|
+
'result of the bisecting approach.'
|
41
|
+
|
42
|
+
# @return [Object]
|
43
|
+
def initialize
|
44
|
+
super
|
45
45
|
@refine = true
|
46
46
|
end
|
47
|
-
|
47
|
+
|
48
48
|
# Build a new clusterer, using data examples found in data_set.
|
49
49
|
# Items will be clustered in "number_of_clusters" different
|
50
50
|
# clusters.
|
51
|
+
# @param data_set [Object]
|
52
|
+
# @param number_of_clusters [Object]
|
53
|
+
# @return [Object]
|
51
54
|
def build(data_set, number_of_clusters)
|
52
55
|
@data_set = data_set
|
53
56
|
@number_of_clusters = number_of_clusters
|
54
|
-
|
57
|
+
|
55
58
|
@clusters = [@data_set]
|
56
59
|
@centroids = [@data_set.get_mean_or_mode]
|
57
60
|
while @clusters.length < @number_of_clusters
|
58
61
|
biggest_cluster_index = find_biggest_cluster_index(@clusters)
|
59
|
-
clusterer = KMeans.new
|
60
|
-
|
61
|
-
|
62
|
+
clusterer = KMeans.new
|
63
|
+
.set_parameters(get_parameters)
|
64
|
+
.build(@clusters[biggest_cluster_index], 2)
|
62
65
|
@clusters.delete_at(biggest_cluster_index)
|
63
66
|
@centroids.delete_at(biggest_cluster_index)
|
64
67
|
@clusters.concat(clusterer.clusters)
|
65
68
|
@centroids.concat(clusterer.centroids)
|
66
69
|
end
|
67
|
-
|
70
|
+
|
68
71
|
super if @refine
|
69
|
-
|
70
|
-
|
71
|
-
end
|
72
|
-
|
73
|
-
protected
|
72
|
+
|
73
|
+
self
|
74
|
+
end
|
75
|
+
|
76
|
+
protected
|
77
|
+
|
78
|
+
# @return [Object]
|
74
79
|
def calc_initial_centroids
|
75
80
|
@centroids # Use existing centroids
|
76
81
|
end
|
77
|
-
|
82
|
+
|
83
|
+
# @param clusters [Object]
|
84
|
+
# @return [Object]
|
78
85
|
def find_biggest_cluster_index(clusters)
|
79
86
|
max_index = 0
|
80
87
|
max_length = 0
|
@@ -85,9 +92,8 @@ module Ai4r
|
|
85
92
|
max_index = cluster_index
|
86
93
|
end
|
87
94
|
end
|
88
|
-
|
95
|
+
max_index
|
89
96
|
end
|
90
|
-
|
91
97
|
end
|
92
98
|
end
|
93
99
|
end
|
@@ -1,66 +1,82 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
# Author:: Sergio Fierens (implementation)
|
2
4
|
# License:: MPL 1.1
|
3
5
|
# Project:: ai4r
|
4
|
-
# Url::
|
6
|
+
# Url:: https://github.com/SergioFierens/ai4r
|
5
7
|
#
|
6
|
-
# You can redistribute it and/or modify it under the terms of
|
7
|
-
# the Mozilla Public License version 1.1 as published by the
|
8
|
+
# You can redistribute it and/or modify it under the terms of
|
9
|
+
# the Mozilla Public License version 1.1 as published by the
|
8
10
|
# Mozilla Foundation at http://www.mozilla.org/MPL/MPL-1.1.txt
|
9
11
|
|
10
|
-
|
11
|
-
|
12
|
+
require_relative '../data/data_set'
|
13
|
+
require_relative '../clusterers/single_linkage'
|
14
|
+
require_relative '../clusterers/cluster_tree'
|
12
15
|
|
13
16
|
module Ai4r
|
14
17
|
module Clusterers
|
15
|
-
|
16
|
-
#
|
17
|
-
# centroid linkage algorithm, aka unweighted pair group method
|
18
|
+
# Implementation of an Agglomerative Hierarchical clusterer with
|
19
|
+
# centroid linkage algorithm, aka unweighted pair group method
|
18
20
|
# centroid (UPGMC) (Everitt et al., 2001 ; Jain and Dubes, 1988 ;
|
19
21
|
# Sokal and Michener, 1958 )
|
20
|
-
# Hierarchical clusterer create one cluster per element, and then
|
22
|
+
# Hierarchical clusterer create one cluster per element, and then
|
21
23
|
# progressively merge clusters, until the required number of clusters
|
22
24
|
# is reached.
|
23
|
-
# The distance between clusters is the squared euclidean distance
|
24
|
-
# between their centroids.
|
25
|
-
#
|
25
|
+
# The distance between clusters is the squared euclidean distance
|
26
|
+
# between their centroids.
|
27
|
+
#
|
26
28
|
# D(cx, (ci U cj)) = | mx - mij |^2
|
27
|
-
# D(cx, (ci U cj)) = (ni/(ni+nj))*D(cx, ci) +
|
28
|
-
# (nj/(ni+nj))*D(cx, cj) -
|
29
|
+
# D(cx, (ci U cj)) = (ni/(ni+nj))*D(cx, ci) +
|
30
|
+
# (nj/(ni+nj))*D(cx, cj) -
|
29
31
|
# (ni*nj/(ni+nj)^2)*D(ci, cj)
|
30
32
|
class CentroidLinkage < SingleLinkage
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
33
|
+
include ClusterTree
|
34
|
+
|
35
|
+
parameters_info distance_function:
|
36
|
+
'Custom implementation of distance function. ' \
|
37
|
+
'It must be a closure receiving two data items and return the ' \
|
38
|
+
'distance between them. By default, this algorithm uses ' \
|
39
|
+
'euclidean distance of numeric attributes to the power of 2.'
|
40
|
+
|
38
41
|
# Build a new clusterer, using data examples found in data_set.
|
39
42
|
# Items will be clustered in "number_of_clusters" different
|
40
43
|
# clusters.
|
41
|
-
|
44
|
+
# @param data_set [Object]
|
45
|
+
# @param number_of_clusters [Object]
|
46
|
+
# @param *options [Object]
|
47
|
+
# @return [Object]
|
48
|
+
def build(data_set, number_of_clusters = 1, **options)
|
42
49
|
super
|
43
50
|
end
|
44
|
-
|
45
|
-
# This algorithms does not allow classification of new data items
|
51
|
+
|
52
|
+
# This algorithms does not allow classification of new data items
|
46
53
|
# once it has been built. Rebuild the cluster including you data element.
|
47
|
-
|
48
|
-
|
54
|
+
# @param _data_item [Object]
|
55
|
+
# @return [Object]
|
56
|
+
def eval(_data_item)
|
57
|
+
raise NotImplementedError, 'Eval of new data is not supported by this algorithm.'
|
49
58
|
end
|
50
|
-
|
59
|
+
|
60
|
+
# @return [Object]
|
61
|
+
def supports_eval?
|
62
|
+
false
|
63
|
+
end
|
64
|
+
|
51
65
|
protected
|
52
|
-
|
66
|
+
|
53
67
|
# return distance between cluster cx and cluster (ci U cj),
|
54
68
|
# using centroid linkage
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
69
|
+
# @param cx [Object]
|
70
|
+
# @param ci [Object]
|
71
|
+
# @param cj [Object]
|
72
|
+
# @return [Object]
|
73
|
+
def linkage_distance(cluster_x, cluster_i, cluster_j)
|
74
|
+
ni = @index_clusters[cluster_i].length
|
75
|
+
nj = @index_clusters[cluster_j].length
|
76
|
+
((ni * read_distance_matrix(cluster_x, cluster_i)) +
|
77
|
+
(nj * read_distance_matrix(cluster_x, cluster_j)) -
|
78
|
+
(1.0 * ni * nj * read_distance_matrix(cluster_i, cluster_j) / (ni + nj))) / (ni + nj)
|
61
79
|
end
|
62
|
-
|
63
80
|
end
|
64
81
|
end
|
65
82
|
end
|
66
|
-
|
@@ -0,0 +1,50 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Ai4r
|
4
|
+
module Clusterers
|
5
|
+
# Mixin to capture merge steps during agglomerative clustering.
|
6
|
+
# Stores intermediate clusters in +cluster_tree+. Optional +depth+
|
7
|
+
# limits how many last merges are recorded.
|
8
|
+
module ClusterTree
|
9
|
+
attr_reader :cluster_tree
|
10
|
+
|
11
|
+
# @param depth [Object]
|
12
|
+
# @param args [Object]
|
13
|
+
# @return [Object]
|
14
|
+
def initialize(depth = nil, *args)
|
15
|
+
@cluster_tree = []
|
16
|
+
@depth = depth
|
17
|
+
@merges_so_far = 0
|
18
|
+
super(*args)
|
19
|
+
end
|
20
|
+
|
21
|
+
# @param data_set [Object]
|
22
|
+
# @param number_of_clusters [Object]
|
23
|
+
# @param *options [Object]
|
24
|
+
# @return [Object]
|
25
|
+
def build(data_set, number_of_clusters = 1, **options)
|
26
|
+
@total_merges = data_set.data_items.length - number_of_clusters
|
27
|
+
super
|
28
|
+
@cluster_tree << clusters
|
29
|
+
@cluster_tree.reverse!
|
30
|
+
self
|
31
|
+
end
|
32
|
+
|
33
|
+
protected
|
34
|
+
|
35
|
+
# @param index_a [Object]
|
36
|
+
# @param index_b [Object]
|
37
|
+
# @param index_clusters [Object]
|
38
|
+
# @return [Object]
|
39
|
+
def merge_clusters(index_a, index_b, index_clusters)
|
40
|
+
if @depth.nil? || @merges_so_far > @total_merges - @depth
|
41
|
+
stored_distance_matrix = @distance_matrix.dup
|
42
|
+
@cluster_tree << build_clusters_from_index_clusters(index_clusters)
|
43
|
+
@distance_matrix = stored_distance_matrix
|
44
|
+
end
|
45
|
+
@merges_so_far += 1
|
46
|
+
super
|
47
|
+
end
|
48
|
+
end
|
49
|
+
end
|
50
|
+
end
|
@@ -1,37 +1,53 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
# Author:: Sergio Fierens
|
2
4
|
# License:: MPL 1.1
|
3
5
|
# Project:: ai4r
|
4
|
-
# Url::
|
6
|
+
# Url:: https://github.com/SergioFierens/ai4r
|
5
7
|
#
|
6
|
-
# You can redistribute it and/or modify it under the terms of
|
7
|
-
# the Mozilla Public License version 1.1 as published by the
|
8
|
+
# You can redistribute it and/or modify it under the terms of
|
9
|
+
# the Mozilla Public License version 1.1 as published by the
|
8
10
|
# Mozilla Foundation at http://www.mozilla.org/MPL/MPL-1.1.txt
|
9
11
|
|
10
|
-
|
12
|
+
require_relative '../data/parameterizable'
|
11
13
|
|
12
14
|
module Ai4r
|
13
15
|
module Clusterers
|
14
|
-
|
15
16
|
# The purpose of this class is to define a common API for Clusterers.
|
16
|
-
# All methods in this class (other than eval) must be implemented in
|
17
|
-
# subclasses.
|
17
|
+
# All methods in this class (other than eval) must be implemented in
|
18
|
+
# subclasses.
|
18
19
|
class Clusterer
|
19
|
-
|
20
20
|
include Ai4r::Data::Parameterizable
|
21
|
-
|
21
|
+
|
22
22
|
# Build a new clusterer, using data examples found in data_set.
|
23
23
|
# Data items will be clustered in "number_of_clusters" different
|
24
24
|
# clusters.
|
25
|
+
# @param data_set [Object]
|
26
|
+
# @param number_of_clusters [Object]
|
27
|
+
# @return [Object]
|
25
28
|
def build(data_set, number_of_clusters)
|
26
29
|
raise NotImplementedError
|
27
30
|
end
|
28
|
-
|
31
|
+
|
29
32
|
# Classifies the given data item, returning the cluster it belongs to.
|
33
|
+
# @param data_item [Object]
|
34
|
+
# @return [Object]
|
30
35
|
def eval(data_item)
|
31
36
|
raise NotImplementedError
|
32
37
|
end
|
33
|
-
|
34
|
-
|
38
|
+
|
39
|
+
# Returns +true+ if this clusterer supports evaluating new data items
|
40
|
+
# with {#eval}. Hierarchical algorithms that only build a dendrogram
|
41
|
+
# will override this method to return +false+.
|
42
|
+
# @return [Object]
|
43
|
+
def supports_eval?
|
44
|
+
true
|
45
|
+
end
|
46
|
+
|
47
|
+
protected
|
48
|
+
|
49
|
+
# @param array [Object]
|
50
|
+
# @return [Object]
|
35
51
|
def get_min_index(array)
|
36
52
|
min = array.first
|
37
53
|
index = 0
|
@@ -42,9 +58,8 @@ module Ai4r
|
|
42
58
|
index = i
|
43
59
|
end
|
44
60
|
end
|
45
|
-
|
61
|
+
index
|
46
62
|
end
|
47
|
-
|
48
63
|
end
|
49
64
|
end
|
50
65
|
end
|
@@ -1,67 +1,78 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
# Author:: Sergio Fierens (implementation)
|
2
4
|
# License:: MPL 1.1
|
3
5
|
# Project:: ai4r
|
4
|
-
# Url::
|
6
|
+
# Url:: https://github.com/SergioFierens/ai4r
|
5
7
|
#
|
6
|
-
# You can redistribute it and/or modify it under the terms of
|
7
|
-
# the Mozilla Public License version 1.1 as published by the
|
8
|
+
# You can redistribute it and/or modify it under the terms of
|
9
|
+
# the Mozilla Public License version 1.1 as published by the
|
8
10
|
# Mozilla Foundation at http://www.mozilla.org/MPL/MPL-1.1.txt
|
9
11
|
|
10
|
-
|
11
|
-
|
12
|
+
require_relative '../data/data_set'
|
13
|
+
require_relative '../clusterers/single_linkage'
|
14
|
+
require_relative '../clusterers/cluster_tree'
|
12
15
|
|
13
16
|
module Ai4r
|
14
17
|
module Clusterers
|
15
|
-
|
16
|
-
# Implementation of a Hierarchical clusterer with complete linkage (Everitt
|
18
|
+
# Implementation of a Hierarchical clusterer with complete linkage (Everitt
|
17
19
|
# et al., 2001 ; Jain and Dubes, 1988 ; Sorensen, 1948 ).
|
18
|
-
# Hierarchical clusterer create one cluster per element, and then
|
20
|
+
# Hierarchical clusterer create one cluster per element, and then
|
19
21
|
# progressively merge clusters, until the required number of clusters
|
20
22
|
# is reached.
|
21
|
-
# With complete linkage, the distance between two clusters is computed as
|
23
|
+
# With complete linkage, the distance between two clusters is computed as
|
22
24
|
# the maximum distance between elements of each cluster.
|
23
25
|
#
|
24
26
|
# D(cx, (ci U cj) = max(D(cx, ci), D(cx, cj))
|
25
27
|
class CompleteLinkage < SingleLinkage
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
28
|
+
include ClusterTree
|
29
|
+
|
30
|
+
parameters_info distance_function:
|
31
|
+
'Custom implementation of distance function. ' \
|
32
|
+
'It must be a closure receiving two data items and return the ' \
|
33
|
+
'distance between them. By default, this algorithm uses ' \
|
34
|
+
'euclidean distance of numeric attributes to the power of 2.'
|
35
|
+
|
34
36
|
# Build a new clusterer, using data examples found in data_set.
|
35
37
|
# Items will be clustered in "number_of_clusters" different
|
36
38
|
# clusters.
|
37
|
-
|
39
|
+
# @param data_set [Object]
|
40
|
+
# @param number_of_clusters [Object]
|
41
|
+
# @param *options [Object]
|
42
|
+
# @return [Object]
|
43
|
+
def build(data_set, number_of_clusters = 1, **options)
|
38
44
|
super
|
39
45
|
end
|
40
|
-
|
41
|
-
# Classifies the given data item, returning the cluster index it belongs
|
46
|
+
|
47
|
+
# Classifies the given data item, returning the cluster index it belongs
|
42
48
|
# to (0-based).
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
49
|
+
# @param data_item [Object]
|
50
|
+
# @return [Object]
|
51
|
+
|
47
52
|
protected
|
48
|
-
|
53
|
+
|
49
54
|
# return distance between cluster cx and new cluster (ci U cj),
|
50
55
|
# using complete linkage
|
51
|
-
|
52
|
-
|
53
|
-
|
56
|
+
# @param cx [Object]
|
57
|
+
# @param ci [Object]
|
58
|
+
# @param cj [Object]
|
59
|
+
# @return [Object]
|
60
|
+
def linkage_distance(cluster_x, cluster_i, cluster_j)
|
61
|
+
[read_distance_matrix(cluster_x, cluster_i),
|
62
|
+
read_distance_matrix(cluster_x, cluster_j)].max
|
54
63
|
end
|
55
|
-
|
64
|
+
|
65
|
+
# @param data_item [Object]
|
66
|
+
# @param cluster [Object]
|
67
|
+
# @return [Object]
|
56
68
|
def distance_between_item_and_cluster(data_item, cluster)
|
57
69
|
max_dist = 0
|
58
70
|
cluster.data_items.each do |another_item|
|
59
71
|
dist = @distance_function.call(data_item, another_item)
|
60
72
|
max_dist = dist if dist > max_dist
|
61
73
|
end
|
62
|
-
|
74
|
+
max_dist
|
63
75
|
end
|
64
|
-
|
65
76
|
end
|
66
77
|
end
|
67
78
|
end
|
@@ -0,0 +1,134 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
# Author:: Gwénaël Rault (implementation)
|
4
|
+
# License:: AGPL-3.0
|
5
|
+
# Project:: ai4r
|
6
|
+
# Url:: https://github.com/SergioFierens/ai4r
|
7
|
+
|
8
|
+
require_relative '../data/data_set'
|
9
|
+
require_relative '../data/proximity'
|
10
|
+
require_relative '../clusterers/clusterer'
|
11
|
+
|
12
|
+
module Ai4r
|
13
|
+
module Clusterers
|
14
|
+
# More about DBSCAN algorithm:
|
15
|
+
# https://en.wikipedia.org/wiki/DBSCAN
|
16
|
+
class DBSCAN < Clusterer
|
17
|
+
attr_reader :data_set, :number_of_clusters, :clusters, :cluster_indices, :labels
|
18
|
+
|
19
|
+
parameters_info epsilon: 'Squared radius used with squared Euclidean distance.',
|
20
|
+
min_points: 'Minimum neighbours excluding the point itself required to form a cluster.',
|
21
|
+
distance_function: 'Optional closure computing distance; defaults to squared Euclidean.'
|
22
|
+
|
23
|
+
def initialize
|
24
|
+
super()
|
25
|
+
@distance_function = nil
|
26
|
+
@epsilon = nil
|
27
|
+
@min_points = 5
|
28
|
+
@clusters = []
|
29
|
+
@cluster_indices = []
|
30
|
+
end
|
31
|
+
|
32
|
+
# Build a new clusterer using data from +data_set+.
|
33
|
+
# An optional +number_of_clusters+ argument is ignored and present only to
|
34
|
+
# keep a consistent interface with other clusterers.
|
35
|
+
#
|
36
|
+
# @param data_set [Ai4r::Data::DataSet]
|
37
|
+
# @param number_of_clusters [Integer, nil]
|
38
|
+
# @return [DBSCAN]
|
39
|
+
def build(data_set, _number_of_clusters = nil)
|
40
|
+
@data_set = data_set
|
41
|
+
@clusters = []
|
42
|
+
@cluster_indices = []
|
43
|
+
@labels = Array.new(data_set.data_items.size)
|
44
|
+
@number_of_clusters = 0
|
45
|
+
|
46
|
+
raise ArgumentError, 'epsilon must be defined' if @epsilon.nil?
|
47
|
+
|
48
|
+
# Detect if the neighborhood of the current item
|
49
|
+
# is dense enough
|
50
|
+
data_set.data_items.each_with_index do |data_item, data_index|
|
51
|
+
next unless @labels[data_index].nil?
|
52
|
+
|
53
|
+
neighbors = range_query(data_item) - [data_index]
|
54
|
+
if neighbors.size < @min_points
|
55
|
+
@labels[data_index] = :noise
|
56
|
+
else
|
57
|
+
@number_of_clusters += 1
|
58
|
+
@labels[data_index] = @number_of_clusters
|
59
|
+
ds = Ai4r::Data::DataSet.new(data_labels: @data_set.data_labels)
|
60
|
+
ds << data_item
|
61
|
+
@clusters.push(ds)
|
62
|
+
@cluster_indices.push([data_index])
|
63
|
+
extend_cluster(neighbors, @number_of_clusters)
|
64
|
+
end
|
65
|
+
end
|
66
|
+
|
67
|
+
raise 'number_of_clusters must be positive' if !@clusters.empty? && @number_of_clusters <= 0
|
68
|
+
|
69
|
+
valid_labels = (1..@number_of_clusters).to_a << :noise
|
70
|
+
raise 'labels must be cluster ids or :noise' unless @labels.all? { |l| valid_labels.include?(l) }
|
71
|
+
|
72
|
+
self
|
73
|
+
end
|
74
|
+
|
75
|
+
# This algorithm cannot classify new data items once it has been built.
|
76
|
+
# Rebuild the cluster with your new data item instead.
|
77
|
+
# @param _data_item [Object]
|
78
|
+
# @return [Object]
|
79
|
+
def eval(_data_item)
|
80
|
+
raise NotImplementedError, 'Eval of new data is not supported by this algorithm.'
|
81
|
+
end
|
82
|
+
|
83
|
+
# @return [Object]
|
84
|
+
def supports_eval?
|
85
|
+
false
|
86
|
+
end
|
87
|
+
|
88
|
+
def distance(a, b)
|
89
|
+
return @distance_function.call(a, b) if @distance_function
|
90
|
+
|
91
|
+
Ai4r::Data::Proximity.squared_euclidean_distance(
|
92
|
+
a.select { |att_a| att_a.is_a? Numeric },
|
93
|
+
b.select { |att_b| att_b.is_a? Numeric }
|
94
|
+
)
|
95
|
+
end
|
96
|
+
|
97
|
+
protected
|
98
|
+
|
99
|
+
# Scan the data set and return the indices of all points
|
100
|
+
# belonging to the neighborhood of the current item
|
101
|
+
def range_query(evaluated_data_item)
|
102
|
+
neighbors = []
|
103
|
+
@data_set.data_items.each_with_index do |data_item, data_index|
|
104
|
+
neighbors << data_index if distance(evaluated_data_item, data_item) <= @epsilon
|
105
|
+
end
|
106
|
+
neighbors
|
107
|
+
end
|
108
|
+
|
109
|
+
# Expand the cluster by visiting neighbours of the current point.
|
110
|
+
# Skip neighbours already assigned to another cluster.
|
111
|
+
# If a neighbour was previously labeled as noise, assign it to the current
|
112
|
+
# cluster.
|
113
|
+
def extend_cluster(neighbors, current_cluster)
|
114
|
+
while neighbors.any?
|
115
|
+
data_index = neighbors.shift
|
116
|
+
if @labels[data_index] == :noise
|
117
|
+
@labels[data_index] = current_cluster
|
118
|
+
@clusters.last << @data_set.data_items[data_index]
|
119
|
+
@cluster_indices.last << data_index
|
120
|
+
elsif @labels[data_index].nil?
|
121
|
+
@labels[data_index] = current_cluster
|
122
|
+
@clusters.last << @data_set.data_items[data_index]
|
123
|
+
@cluster_indices.last << data_index
|
124
|
+
new_neighbors = range_query(@data_set.data_items[data_index]) - [data_index]
|
125
|
+
if new_neighbors.size >= @min_points
|
126
|
+
neighbors.concat(new_neighbors)
|
127
|
+
neighbors.uniq!
|
128
|
+
end
|
129
|
+
end
|
130
|
+
end
|
131
|
+
end
|
132
|
+
end
|
133
|
+
end
|
134
|
+
end
|