ai4r 1.13 → 2.0

This diff shows the changes between two publicly released versions of the package, as published to a supported public registry. It is provided for informational purposes only.
Files changed (129)
  1. checksums.yaml +7 -0
  2. data/README.md +174 -0
  3. data/examples/classifiers/hyperpipes_data.csv +14 -0
  4. data/examples/classifiers/hyperpipes_example.rb +22 -0
  5. data/examples/classifiers/ib1_example.rb +12 -0
  6. data/examples/classifiers/id3_example.rb +15 -10
  7. data/examples/classifiers/id3_graphviz_example.rb +17 -0
  8. data/examples/classifiers/logistic_regression_example.rb +11 -0
  9. data/examples/classifiers/naive_bayes_attributes_example.rb +13 -0
  10. data/examples/classifiers/naive_bayes_example.rb +12 -13
  11. data/examples/classifiers/one_r_example.rb +27 -0
  12. data/examples/classifiers/parameter_tutorial.rb +29 -0
  13. data/examples/classifiers/prism_nominal_example.rb +15 -0
  14. data/examples/classifiers/prism_numeric_example.rb +21 -0
  15. data/examples/classifiers/simple_linear_regression_example.rb +14 -11
  16. data/examples/classifiers/zero_and_one_r_example.rb +34 -0
  17. data/examples/classifiers/zero_one_r_data.csv +8 -0
  18. data/examples/clusterers/clusterer_example.rb +40 -34
  19. data/examples/clusterers/dbscan_example.rb +17 -0
  20. data/examples/clusterers/dendrogram_example.rb +17 -0
  21. data/examples/clusterers/hierarchical_dendrogram_example.rb +20 -0
  22. data/examples/clusterers/kmeans_custom_example.rb +26 -0
  23. data/examples/genetic_algorithm/bitstring_example.rb +41 -0
  24. data/examples/genetic_algorithm/genetic_algorithm_example.rb +26 -18
  25. data/examples/genetic_algorithm/kmeans_seed_tuning.rb +45 -0
  26. data/examples/neural_network/backpropagation_example.rb +48 -48
  27. data/examples/neural_network/hopfield_example.rb +45 -0
  28. data/examples/neural_network/patterns_with_base_noise.rb +39 -39
  29. data/examples/neural_network/patterns_with_noise.rb +41 -39
  30. data/examples/neural_network/train_epochs_callback.rb +25 -0
  31. data/examples/neural_network/training_patterns.rb +39 -39
  32. data/examples/neural_network/transformer_text_classification.rb +78 -0
  33. data/examples/neural_network/xor_example.rb +23 -22
  34. data/examples/reinforcement/q_learning_example.rb +10 -0
  35. data/examples/som/som_data.rb +155 -152
  36. data/examples/som/som_multi_node_example.rb +12 -13
  37. data/examples/som/som_single_example.rb +12 -15
  38. data/examples/transformer/decode_classifier_example.rb +68 -0
  39. data/examples/transformer/deterministic_example.rb +10 -0
  40. data/examples/transformer/seq2seq_example.rb +16 -0
  41. data/lib/ai4r/classifiers/classifier.rb +24 -16
  42. data/lib/ai4r/classifiers/gradient_boosting.rb +64 -0
  43. data/lib/ai4r/classifiers/hyperpipes.rb +119 -43
  44. data/lib/ai4r/classifiers/ib1.rb +122 -32
  45. data/lib/ai4r/classifiers/id3.rb +524 -145
  46. data/lib/ai4r/classifiers/logistic_regression.rb +96 -0
  47. data/lib/ai4r/classifiers/multilayer_perceptron.rb +75 -59
  48. data/lib/ai4r/classifiers/naive_bayes.rb +95 -34
  49. data/lib/ai4r/classifiers/one_r.rb +112 -44
  50. data/lib/ai4r/classifiers/prism.rb +167 -76
  51. data/lib/ai4r/classifiers/random_forest.rb +72 -0
  52. data/lib/ai4r/classifiers/simple_linear_regression.rb +83 -58
  53. data/lib/ai4r/classifiers/support_vector_machine.rb +91 -0
  54. data/lib/ai4r/classifiers/votes.rb +57 -0
  55. data/lib/ai4r/classifiers/zero_r.rb +71 -30
  56. data/lib/ai4r/clusterers/average_linkage.rb +46 -27
  57. data/lib/ai4r/clusterers/bisecting_k_means.rb +50 -44
  58. data/lib/ai4r/clusterers/centroid_linkage.rb +52 -36
  59. data/lib/ai4r/clusterers/cluster_tree.rb +50 -0
  60. data/lib/ai4r/clusterers/clusterer.rb +29 -14
  61. data/lib/ai4r/clusterers/complete_linkage.rb +42 -31
  62. data/lib/ai4r/clusterers/dbscan.rb +134 -0
  63. data/lib/ai4r/clusterers/diana.rb +75 -49
  64. data/lib/ai4r/clusterers/k_means.rb +270 -135
  65. data/lib/ai4r/clusterers/median_linkage.rb +49 -33
  66. data/lib/ai4r/clusterers/single_linkage.rb +196 -88
  67. data/lib/ai4r/clusterers/ward_linkage.rb +51 -35
  68. data/lib/ai4r/clusterers/ward_linkage_hierarchical.rb +25 -10
  69. data/lib/ai4r/clusterers/weighted_average_linkage.rb +48 -32
  70. data/lib/ai4r/data/data_set.rb +223 -103
  71. data/lib/ai4r/data/parameterizable.rb +31 -25
  72. data/lib/ai4r/data/proximity.rb +62 -62
  73. data/lib/ai4r/data/statistics.rb +46 -35
  74. data/lib/ai4r/experiment/classifier_evaluator.rb +84 -32
  75. data/lib/ai4r/experiment/split.rb +39 -0
  76. data/lib/ai4r/genetic_algorithm/chromosome_base.rb +43 -0
  77. data/lib/ai4r/genetic_algorithm/genetic_algorithm.rb +92 -170
  78. data/lib/ai4r/genetic_algorithm/tsp_chromosome.rb +83 -0
  79. data/lib/ai4r/hmm/hidden_markov_model.rb +134 -0
  80. data/lib/ai4r/neural_network/activation_functions.rb +37 -0
  81. data/lib/ai4r/neural_network/backpropagation.rb +399 -134
  82. data/lib/ai4r/neural_network/hopfield.rb +175 -58
  83. data/lib/ai4r/neural_network/transformer.rb +194 -0
  84. data/lib/ai4r/neural_network/weight_initializations.rb +40 -0
  85. data/lib/ai4r/reinforcement/policy_iteration.rb +66 -0
  86. data/lib/ai4r/reinforcement/q_learning.rb +51 -0
  87. data/lib/ai4r/search/a_star.rb +76 -0
  88. data/lib/ai4r/search/bfs.rb +50 -0
  89. data/lib/ai4r/search/dfs.rb +50 -0
  90. data/lib/ai4r/search/mcts.rb +118 -0
  91. data/lib/ai4r/search.rb +12 -0
  92. data/lib/ai4r/som/distance_metrics.rb +29 -0
  93. data/lib/ai4r/som/layer.rb +28 -17
  94. data/lib/ai4r/som/node.rb +61 -32
  95. data/lib/ai4r/som/som.rb +158 -41
  96. data/lib/ai4r/som/two_phase_layer.rb +21 -25
  97. data/lib/ai4r/version.rb +3 -0
  98. data/lib/ai4r.rb +57 -28
  99. metadata +79 -109
  100. data/README.rdoc +0 -39
  101. data/test/classifiers/hyperpipes_test.rb +0 -84
  102. data/test/classifiers/ib1_test.rb +0 -78
  103. data/test/classifiers/id3_test.rb +0 -220
  104. data/test/classifiers/multilayer_perceptron_test.rb +0 -79
  105. data/test/classifiers/naive_bayes_test.rb +0 -43
  106. data/test/classifiers/one_r_test.rb +0 -62
  107. data/test/classifiers/prism_test.rb +0 -85
  108. data/test/classifiers/simple_linear_regression_test.rb +0 -37
  109. data/test/classifiers/zero_r_test.rb +0 -50
  110. data/test/clusterers/average_linkage_test.rb +0 -51
  111. data/test/clusterers/bisecting_k_means_test.rb +0 -66
  112. data/test/clusterers/centroid_linkage_test.rb +0 -53
  113. data/test/clusterers/complete_linkage_test.rb +0 -57
  114. data/test/clusterers/diana_test.rb +0 -69
  115. data/test/clusterers/k_means_test.rb +0 -167
  116. data/test/clusterers/median_linkage_test.rb +0 -53
  117. data/test/clusterers/single_linkage_test.rb +0 -122
  118. data/test/clusterers/ward_linkage_hierarchical_test.rb +0 -81
  119. data/test/clusterers/ward_linkage_test.rb +0 -53
  120. data/test/clusterers/weighted_average_linkage_test.rb +0 -53
  121. data/test/data/data_set_test.rb +0 -104
  122. data/test/data/proximity_test.rb +0 -87
  123. data/test/data/statistics_test.rb +0 -65
  124. data/test/experiment/classifier_evaluator_test.rb +0 -76
  125. data/test/genetic_algorithm/chromosome_test.rb +0 -57
  126. data/test/genetic_algorithm/genetic_algorithm_test.rb +0 -81
  127. data/test/neural_network/backpropagation_test.rb +0 -82
  128. data/test/neural_network/hopfield_test.rb +0 -72
  129. data/test/som/som_test.rb +0 -97
data/lib/ai4r/clusterers/bisecting_k_means.rb
@@ -1,80 +1,87 @@
+# frozen_string_literal: true
+
 # Author:: Sergio Fierens (implementation)
 # License:: MPL 1.1
 # Project:: ai4r
-# Url:: http://www.ai4r.org/
+# Url:: https://github.com/SergioFierens/ai4r
 #
-# You can redistribute it and/or modify it under the terms of
-# the Mozilla Public License version 1.1 as published by the
+# You can redistribute it and/or modify it under the terms of
+# the Mozilla Public License version 1.1 as published by the
 # Mozilla Foundation at http://www.mozilla.org/MPL/MPL-1.1.txt
 
-require File.dirname(__FILE__) + '/../data/data_set'
-require File.dirname(__FILE__) + '/../clusterers/k_means'
+require_relative '../data/data_set'
+require_relative '../clusterers/k_means'
 
 module Ai4r
   module Clusterers
-
     # The Bisecting k-means algorithm is a variation of the "k-means" algorithm,
     # somewhat less sensitive to the initial election of centroids than the
-    # original.
-    #
+    # original.
+    #
     # More about K Means algorithm:
-    # http://en.wikipedia.org/wiki/K-means_algorithm
+    # http://en.wikipedia.org/wiki/K-means_algorithm
     class BisectingKMeans < KMeans
-
       attr_reader :data_set, :number_of_clusters, :clusters, :centroids
-      attr_accessor :max_iterations, :distance_function, :refine
-
-      parameters_info :max_iterations => "Maximum number of iterations to " +
-        "build the clusterer. By default it is uncapped.",
-        :distance_function => "Custom implementation of distance function. " +
-        "It must be a closure receiving two data items and return the " +
-        "distance between them. By default, this algorithm uses " +
-        "euclidean distance of numeric attributes to the power of 2.",
-        :centroid_function => "Custom implementation to calculate the " +
-        "centroid of a cluster. It must be a closure receiving an array of " +
-        "data sets, and return an array of data items, representing the " +
-        "centroids of for each data set. " +
-        "By default, this algorithm returns a data items using the mode "+
-        "or mean of each attribute on each data set.",
-        :refine => "Boolean value. True by default. It will run the " +
-        "classic K Means algorithm, using as initial centroids the " +
-        "result of the bisecting approach."
-
-
-      def intialize
+
+      parameters_info max_iterations: 'Maximum number of iterations to ' \
+                                      'build the clusterer. By default it is uncapped.',
+                      distance_function: 'Custom implementation of distance function. ' \
+                                         'It must be a closure receiving two data items and return the ' \
+                                         'distance between them. By default, this algorithm uses ' \
+                                         'euclidean distance of numeric attributes to the power of 2.',
+                      centroid_function: 'Custom implementation to calculate the ' \
+                                         'centroid of a cluster. It must be a closure receiving an array of ' \
+                                         'data sets, and return an array of data items, representing the ' \
+                                         'centroids of for each data set. ' \
+                                         'By default, this algorithm returns a data items using the mode ' \
+                                         'or mean of each attribute on each data set.',
+                      refine: 'Boolean value. True by default. It will run the ' \
+                              'classic K Means algorithm, using as initial centroids the ' \
+                              'result of the bisecting approach.'
+
+      # @return [Object]
+      def initialize
+        super
         @refine = true
       end
-
+
       # Build a new clusterer, using data examples found in data_set.
       # Items will be clustered in "number_of_clusters" different
       # clusters.
+      # @param data_set [Object]
+      # @param number_of_clusters [Object]
+      # @return [Object]
       def build(data_set, number_of_clusters)
         @data_set = data_set
         @number_of_clusters = number_of_clusters
-
+
         @clusters = [@data_set]
         @centroids = [@data_set.get_mean_or_mode]
         while @clusters.length < @number_of_clusters
           biggest_cluster_index = find_biggest_cluster_index(@clusters)
-          clusterer = KMeans.new.
-            set_parameters(get_parameters).
-            build(@clusters[biggest_cluster_index], 2)
+          clusterer = KMeans.new
+                      .set_parameters(get_parameters)
+                      .build(@clusters[biggest_cluster_index], 2)
           @clusters.delete_at(biggest_cluster_index)
           @centroids.delete_at(biggest_cluster_index)
           @clusters.concat(clusterer.clusters)
           @centroids.concat(clusterer.centroids)
         end
-
+
         super if @refine
-
-        return self
-      end
-
-      protected
+
+        self
+      end
+
+      protected
+
+      # @return [Object]
       def calc_initial_centroids
         @centroids # Use existing centroids
       end
-
+
+      # @param clusters [Object]
+      # @return [Object]
       def find_biggest_cluster_index(clusters)
         max_index = 0
         max_length = 0
@@ -85,9 +92,8 @@ module Ai4r
             max_index = cluster_index
           end
         end
-        return max_index
+        max_index
       end
-
     end
   end
 end
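
For orientation, a minimal usage sketch of the refactored BisectingKMeans API above; the data values and parameter choices are illustrative, not taken from the package.

  require 'ai4r'

  # Six 2-D points forming three obvious groups (made-up sample data).
  data = Ai4r::Data::DataSet.new(
    data_labels: %w[x y],
    data_items: [[1, 1], [1, 2], [9, 8], [8, 9], [20, 21], [21, 20]]
  )

  clusterer = Ai4r::Clusterers::BisectingKMeans.new
  # Keyword-style parameters, as declared by parameters_info in 2.0.
  clusterer.set_parameters(refine: true, max_iterations: 50)
  clusterer.build(data, 3)

  puts clusterer.centroids.inspect # one centroid per cluster
  puts clusterer.eval([19, 22])    # index of the closest cluster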
data/lib/ai4r/clusterers/centroid_linkage.rb
@@ -1,66 +1,82 @@
+# frozen_string_literal: true
+
 # Author:: Sergio Fierens (implementation)
 # License:: MPL 1.1
 # Project:: ai4r
-# Url:: http://ai4r.org/
+# Url:: https://github.com/SergioFierens/ai4r
 #
-# You can redistribute it and/or modify it under the terms of
-# the Mozilla Public License version 1.1 as published by the
+# You can redistribute it and/or modify it under the terms of
+# the Mozilla Public License version 1.1 as published by the
 # Mozilla Foundation at http://www.mozilla.org/MPL/MPL-1.1.txt
 
-require File.dirname(__FILE__) + '/../data/data_set'
-require File.dirname(__FILE__) + '/../clusterers/single_linkage'
+require_relative '../data/data_set'
+require_relative '../clusterers/single_linkage'
+require_relative '../clusterers/cluster_tree'
 
 module Ai4r
   module Clusterers
-
-    # Implementation of an Agglomerative Hierarchical clusterer with
-    # centroid linkage algorithm, aka unweighted pair group method
+    # Implementation of an Agglomerative Hierarchical clusterer with
+    # centroid linkage algorithm, aka unweighted pair group method
     # centroid (UPGMC) (Everitt et al., 2001 ; Jain and Dubes, 1988 ;
     # Sokal and Michener, 1958 )
-    # Hierarchical clusterer create one cluster per element, and then
+    # Hierarchical clusterer create one cluster per element, and then
     # progressively merge clusters, until the required number of clusters
     # is reached.
-    # The distance between clusters is the squared euclidean distance
-    # between their centroids.
-    #
+    # The distance between clusters is the squared euclidean distance
+    # between their centroids.
+    #
     # D(cx, (ci U cj)) = | mx - mij |^2
-    # D(cx, (ci U cj)) = (ni/(ni+nj))*D(cx, ci) +
-    #                    (nj/(ni+nj))*D(cx, cj) -
+    # D(cx, (ci U cj)) = (ni/(ni+nj))*D(cx, ci) +
+    #                    (nj/(ni+nj))*D(cx, cj) -
     #                    (ni*nj/(ni+nj)^2)*D(ci, cj)
     class CentroidLinkage < SingleLinkage
-
-      parameters_info :distance_function =>
-        "Custom implementation of distance function. " +
-        "It must be a closure receiving two data items and return the " +
-        "distance between them. By default, this algorithm uses " +
-        "euclidean distance of numeric attributes to the power of 2."
-
+      include ClusterTree
+
+      parameters_info distance_function:
+                        'Custom implementation of distance function. ' \
+                        'It must be a closure receiving two data items and return the ' \
+                        'distance between them. By default, this algorithm uses ' \
+                        'euclidean distance of numeric attributes to the power of 2.'
+
       # Build a new clusterer, using data examples found in data_set.
       # Items will be clustered in "number_of_clusters" different
       # clusters.
-      def build(data_set, number_of_clusters)
+      # @param data_set [Object]
+      # @param number_of_clusters [Object]
+      # @param *options [Object]
+      # @return [Object]
+      def build(data_set, number_of_clusters = 1, **options)
        super
       end
-
-      # This algorithms does not allow classification of new data items
+
+      # This algorithms does not allow classification of new data items
       # once it has been built. Rebuild the cluster including you data element.
-      def eval(data_item)
-        Raise "Eval of new data is not supported by this algorithm."
+      # @param _data_item [Object]
+      # @return [Object]
+      def eval(_data_item)
+        raise NotImplementedError, 'Eval of new data is not supported by this algorithm.'
       end
-
+
+      # @return [Object]
+      def supports_eval?
+        false
+      end
+
       protected
-
+
       # return distance between cluster cx and cluster (ci U cj),
       # using centroid linkage
-      def linkage_distance(cx, ci, cj)
-        ni = @index_clusters[ci].length
-        nj = @index_clusters[cj].length
-        ( ni * read_distance_matrix(cx, ci) +
-          nj * read_distance_matrix(cx, cj) -
-          1.0 * ni * nj * read_distance_matrix(ci, cj) / (ni+nj)) / (ni+nj)
+      # @param cx [Object]
+      # @param ci [Object]
+      # @param cj [Object]
+      # @return [Object]
+      def linkage_distance(cluster_x, cluster_i, cluster_j)
+        ni = @index_clusters[cluster_i].length
+        nj = @index_clusters[cluster_j].length
+        ((ni * read_distance_matrix(cluster_x, cluster_i)) +
+         (nj * read_distance_matrix(cluster_x, cluster_j)) -
+         (1.0 * ni * nj * read_distance_matrix(cluster_i, cluster_j) / (ni + nj))) / (ni + nj)
       end
-
     end
   end
 end
-
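
A rough sketch of driving the updated CentroidLinkage (sample data is made up): build now defaults number_of_clusters to 1, and eval raises NotImplementedError instead of the old broken Raise call.

  require 'ai4r'

  data = Ai4r::Data::DataSet.new(
    data_labels: %w[x y],
    data_items: [[0, 0], [0, 1], [5, 5], [5, 6], [9, 9]]
  )

  clusterer = Ai4r::Clusterers::CentroidLinkage.new.build(data, 2)
  clusterer.clusters.each_with_index do |cluster, i|
    puts "cluster #{i}: #{cluster.data_items.inspect}"
  end

  # Dendrogram-style clusterers cannot classify unseen items.
  puts clusterer.supports_eval? # => false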
data/lib/ai4r/clusterers/cluster_tree.rb
@@ -0,0 +1,50 @@
+# frozen_string_literal: true
+
+module Ai4r
+  module Clusterers
+    # Mixin to capture merge steps during agglomerative clustering.
+    # Stores intermediate clusters in +cluster_tree+. Optional +depth+
+    # limits how many last merges are recorded.
+    module ClusterTree
+      attr_reader :cluster_tree
+
+      # @param depth [Object]
+      # @param args [Object]
+      # @return [Object]
+      def initialize(depth = nil, *args)
+        @cluster_tree = []
+        @depth = depth
+        @merges_so_far = 0
+        super(*args)
+      end
+
+      # @param data_set [Object]
+      # @param number_of_clusters [Object]
+      # @param *options [Object]
+      # @return [Object]
+      def build(data_set, number_of_clusters = 1, **options)
+        @total_merges = data_set.data_items.length - number_of_clusters
+        super
+        @cluster_tree << clusters
+        @cluster_tree.reverse!
+        self
+      end
+
+      protected
+
+      # @param index_a [Object]
+      # @param index_b [Object]
+      # @param index_clusters [Object]
+      # @return [Object]
+      def merge_clusters(index_a, index_b, index_clusters)
+        if @depth.nil? || @merges_so_far > @total_merges - @depth
+          stored_distance_matrix = @distance_matrix.dup
+          @cluster_tree << build_clusters_from_index_clusters(index_clusters)
+          @distance_matrix = stored_distance_matrix
+        end
+        @merges_so_far += 1
+        super
+      end
+    end
+  end
+end
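
A sketch of what the ClusterTree mixin records, assuming made-up data and using CentroidLinkage (shown above, which includes the mixin): after build, cluster_tree[0] should hold the final clustering and later entries walk back toward the early, smaller merges.

  require 'ai4r'

  data = Ai4r::Data::DataSet.new(
    data_labels: %w[x y],
    data_items: [[0, 0], [0, 1], [4, 4], [4, 5], [9, 9]]
  )

  # Every merge step is captured; pass a depth to the constructor to keep
  # only the last few merges (depth handling as defined by the mixin above).
  clusterer = Ai4r::Clusterers::CentroidLinkage.new.build(data, 2)
  clusterer.cluster_tree.each_with_index do |step, i|
    sizes = step.map { |cluster| cluster.data_items.length }
    puts "step #{i}: cluster sizes #{sizes.inspect}"
  end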
data/lib/ai4r/clusterers/clusterer.rb
@@ -1,37 +1,53 @@
+# frozen_string_literal: true
+
 # Author:: Sergio Fierens
 # License:: MPL 1.1
 # Project:: ai4r
-# Url:: http://ai4r.org/
+# Url:: https://github.com/SergioFierens/ai4r
 #
-# You can redistribute it and/or modify it under the terms of
-# the Mozilla Public License version 1.1 as published by the
+# You can redistribute it and/or modify it under the terms of
+# the Mozilla Public License version 1.1 as published by the
 # Mozilla Foundation at http://www.mozilla.org/MPL/MPL-1.1.txt
 
-require File.dirname(__FILE__) + '/../data/parameterizable'
+require_relative '../data/parameterizable'
 
 module Ai4r
   module Clusterers
-
     # The purpose of this class is to define a common API for Clusterers.
-    # All methods in this class (other than eval) must be implemented in
-    # subclasses.
+    # All methods in this class (other than eval) must be implemented in
+    # subclasses.
     class Clusterer
-
       include Ai4r::Data::Parameterizable
-
+
       # Build a new clusterer, using data examples found in data_set.
       # Data items will be clustered in "number_of_clusters" different
       # clusters.
+      # @param data_set [Object]
+      # @param number_of_clusters [Object]
+      # @return [Object]
       def build(data_set, number_of_clusters)
         raise NotImplementedError
       end
-
+
       # Classifies the given data item, returning the cluster it belongs to.
+      # @param data_item [Object]
+      # @return [Object]
      def eval(data_item)
        raise NotImplementedError
      end
-
-      protected
+
+      # Returns +true+ if this clusterer supports evaluating new data items
+      # with {#eval}. Hierarchical algorithms that only build a dendrogram
+      # will override this method to return +false+.
+      # @return [Object]
+      def supports_eval?
+        true
+      end
+
+      protected
+
+      # @param array [Object]
+      # @return [Object]
       def get_min_index(array)
         min = array.first
         index = 0
@@ -42,9 +58,8 @@ module Ai4r
            index = i
          end
        end
-        return index
+        index
      end
-
    end
  end
 end
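
The new supports_eval? hook makes the base-class contract explicit. A small sketch (made-up data) of how calling code can branch on it instead of rescuing NotImplementedError:

  require 'ai4r'

  data = Ai4r::Data::DataSet.new(
    data_labels: %w[x y],
    data_items: [[0, 0], [1, 1], [8, 8], [9, 9]]
  )

  [Ai4r::Clusterers::KMeans, Ai4r::Clusterers::CentroidLinkage].each do |klass|
    clusterer = klass.new.build(data, 2)
    if clusterer.supports_eval?
      puts "#{klass.name}: [1, 2] falls in cluster #{clusterer.eval([1, 2])}"
    else
      puts "#{klass.name}: builds a dendrogram only, eval unsupported"
    end
  end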
data/lib/ai4r/clusterers/complete_linkage.rb
@@ -1,67 +1,78 @@
+# frozen_string_literal: true
+
 # Author:: Sergio Fierens (implementation)
 # License:: MPL 1.1
 # Project:: ai4r
-# Url:: http://ai4r.org/
+# Url:: https://github.com/SergioFierens/ai4r
 #
-# You can redistribute it and/or modify it under the terms of
-# the Mozilla Public License version 1.1 as published by the
+# You can redistribute it and/or modify it under the terms of
+# the Mozilla Public License version 1.1 as published by the
 # Mozilla Foundation at http://www.mozilla.org/MPL/MPL-1.1.txt
 
-require File.dirname(__FILE__) + '/../data/data_set'
-require File.dirname(__FILE__) + '/../clusterers/single_linkage'
+require_relative '../data/data_set'
+require_relative '../clusterers/single_linkage'
+require_relative '../clusterers/cluster_tree'
 
 module Ai4r
   module Clusterers
-
-    # Implementation of a Hierarchical clusterer with complete linkage (Everitt
+    # Implementation of a Hierarchical clusterer with complete linkage (Everitt
     # et al., 2001 ; Jain and Dubes, 1988 ; Sorensen, 1948 ).
-    # Hierarchical clusterer create one cluster per element, and then
+    # Hierarchical clusterer create one cluster per element, and then
     # progressively merge clusters, until the required number of clusters
     # is reached.
-    # With complete linkage, the distance between two clusters is computed as
+    # With complete linkage, the distance between two clusters is computed as
     # the maximum distance between elements of each cluster.
     #
     # D(cx, (ci U cj) = max(D(cx, ci), D(cx, cj))
     class CompleteLinkage < SingleLinkage
-
-      parameters_info :distance_function =>
-        "Custom implementation of distance function. " +
-        "It must be a closure receiving two data items and return the " +
-        "distance between them. By default, this algorithm uses " +
-        "euclidean distance of numeric attributes to the power of 2."
-
-
+      include ClusterTree
+
+      parameters_info distance_function:
+                        'Custom implementation of distance function. ' \
+                        'It must be a closure receiving two data items and return the ' \
+                        'distance between them. By default, this algorithm uses ' \
+                        'euclidean distance of numeric attributes to the power of 2.'
+
       # Build a new clusterer, using data examples found in data_set.
       # Items will be clustered in "number_of_clusters" different
       # clusters.
-      def build(data_set, number_of_clusters)
+      # @param data_set [Object]
+      # @param number_of_clusters [Object]
+      # @param *options [Object]
+      # @return [Object]
+      def build(data_set, number_of_clusters = 1, **options)
         super
       end
-
-      # Classifies the given data item, returning the cluster index it belongs
+
+      # Classifies the given data item, returning the cluster index it belongs
       # to (0-based).
-      def eval(data_item)
-        super
-      end
-
+      # @param data_item [Object]
+      # @return [Object]
+
       protected
-
+
       # return distance between cluster cx and new cluster (ci U cj),
       # using complete linkage
-      def linkage_distance(cx, ci, cj)
-        [read_distance_matrix(cx, ci),
-         read_distance_matrix(cx, cj)].max
+      # @param cx [Object]
+      # @param ci [Object]
+      # @param cj [Object]
+      # @return [Object]
+      def linkage_distance(cluster_x, cluster_i, cluster_j)
+        [read_distance_matrix(cluster_x, cluster_i),
+         read_distance_matrix(cluster_x, cluster_j)].max
       end
-
+
+      # @param data_item [Object]
+      # @param cluster [Object]
+      # @return [Object]
       def distance_between_item_and_cluster(data_item, cluster)
         max_dist = 0
         cluster.data_items.each do |another_item|
           dist = @distance_function.call(data_item, another_item)
           max_dist = dist if dist > max_dist
         end
-        return max_dist
+        max_dist
       end
-
     end
   end
 end
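
A sketch of plugging a custom closure into the distance_function parameter documented above, here Manhattan distance purely as an illustration (data values are made up):

  require 'ai4r'

  manhattan = ->(a, b) { a.zip(b).sum { |x, y| (x - y).abs } }

  data = Ai4r::Data::DataSet.new(
    data_labels: %w[x y],
    data_items: [[0, 0], [0, 2], [6, 6], [7, 6]]
  )

  clusterer = Ai4r::Clusterers::CompleteLinkage.new
  clusterer.set_parameters(distance_function: manhattan)
  clusterer.build(data, 2)
  puts clusterer.clusters.map(&:data_items).inspect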
data/lib/ai4r/clusterers/dbscan.rb
@@ -0,0 +1,134 @@
+# frozen_string_literal: true
+
+# Author:: Gwénaël Rault (implementation)
+# License:: AGPL-3.0
+# Project:: ai4r
+# Url:: https://github.com/SergioFierens/ai4r
+
+require_relative '../data/data_set'
+require_relative '../data/proximity'
+require_relative '../clusterers/clusterer'
+
+module Ai4r
+  module Clusterers
+    # More about DBSCAN algorithm:
+    # https://en.wikipedia.org/wiki/DBSCAN
+    class DBSCAN < Clusterer
+      attr_reader :data_set, :number_of_clusters, :clusters, :cluster_indices, :labels
+
+      parameters_info epsilon: 'Squared radius used with squared Euclidean distance.',
+                      min_points: 'Minimum neighbours excluding the point itself required to form a cluster.',
+                      distance_function: 'Optional closure computing distance; defaults to squared Euclidean.'
+
+      def initialize
+        super()
+        @distance_function = nil
+        @epsilon = nil
+        @min_points = 5
+        @clusters = []
+        @cluster_indices = []
+      end
+
+      # Build a new clusterer using data from +data_set+.
+      # An optional +number_of_clusters+ argument is ignored and present only to
+      # keep a consistent interface with other clusterers.
+      #
+      # @param data_set [Ai4r::Data::DataSet]
+      # @param number_of_clusters [Integer, nil]
+      # @return [DBSCAN]
+      def build(data_set, _number_of_clusters = nil)
+        @data_set = data_set
+        @clusters = []
+        @cluster_indices = []
+        @labels = Array.new(data_set.data_items.size)
+        @number_of_clusters = 0
+
+        raise ArgumentError, 'epsilon must be defined' if @epsilon.nil?
+
+        # Detect if the neighborhood of the current item
+        # is dense enough
+        data_set.data_items.each_with_index do |data_item, data_index|
+          next unless @labels[data_index].nil?
+
+          neighbors = range_query(data_item) - [data_index]
+          if neighbors.size < @min_points
+            @labels[data_index] = :noise
+          else
+            @number_of_clusters += 1
+            @labels[data_index] = @number_of_clusters
+            ds = Ai4r::Data::DataSet.new(data_labels: @data_set.data_labels)
+            ds << data_item
+            @clusters.push(ds)
+            @cluster_indices.push([data_index])
+            extend_cluster(neighbors, @number_of_clusters)
+          end
+        end
+
+        raise 'number_of_clusters must be positive' if !@clusters.empty? && @number_of_clusters <= 0
+
+        valid_labels = (1..@number_of_clusters).to_a << :noise
+        raise 'labels must be cluster ids or :noise' unless @labels.all? { |l| valid_labels.include?(l) }
+
+        self
+      end
+
+      # This algorithm cannot classify new data items once it has been built.
+      # Rebuild the cluster with your new data item instead.
+      # @param _data_item [Object]
+      # @return [Object]
+      def eval(_data_item)
+        raise NotImplementedError, 'Eval of new data is not supported by this algorithm.'
+      end
+
+      # @return [Object]
+      def supports_eval?
+        false
+      end
+
+      def distance(a, b)
+        return @distance_function.call(a, b) if @distance_function
+
+        Ai4r::Data::Proximity.squared_euclidean_distance(
+          a.select { |att_a| att_a.is_a? Numeric },
+          b.select { |att_b| att_b.is_a? Numeric }
+        )
+      end
+
+      protected
+
+      # Scan the data set and return the indices of all points
+      # belonging to the neighborhood of the current item
+      def range_query(evaluated_data_item)
+        neighbors = []
+        @data_set.data_items.each_with_index do |data_item, data_index|
+          neighbors << data_index if distance(evaluated_data_item, data_item) <= @epsilon
+        end
+        neighbors
+      end
+
+      # Expand the cluster by visiting neighbours of the current point.
+      # Skip neighbours already assigned to another cluster.
+      # If a neighbour was previously labeled as noise, assign it to the current
+      # cluster.
+      def extend_cluster(neighbors, current_cluster)
+        while neighbors.any?
+          data_index = neighbors.shift
+          if @labels[data_index] == :noise
+            @labels[data_index] = current_cluster
+            @clusters.last << @data_set.data_items[data_index]
+            @cluster_indices.last << data_index
+          elsif @labels[data_index].nil?
+            @labels[data_index] = current_cluster
+            @clusters.last << @data_set.data_items[data_index]
+            @cluster_indices.last << data_index
+            new_neighbors = range_query(@data_set.data_items[data_index]) - [data_index]
+            if new_neighbors.size >= @min_points
+              neighbors.concat(new_neighbors)
+              neighbors.uniq!
+            end
+          end
+        end
+      end
+    end
+  end
+end
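
Finally, a usage sketch for the new DBSCAN clusterer (sample data and parameter values are illustrative): epsilon is a squared radius because the default metric is squared Euclidean, and labels ends up holding 1-based cluster ids or :noise.

  require 'ai4r'

  data = Ai4r::Data::DataSet.new(
    data_labels: %w[x y],
    data_items: [[1, 1], [1, 2], [2, 1], [2, 2], [8, 8], [8, 9], [9, 8], [50, 50]]
  )

  dbscan = Ai4r::Clusterers::DBSCAN.new
  dbscan.set_parameters(epsilon: 4, min_points: 2) # radius 2, at least 2 neighbours
  dbscan.build(data)                               # cluster count is not preset

  puts dbscan.number_of_clusters # discovered from density, e.g. 2 here
  puts dbscan.labels.inspect     # e.g. [1, 1, 1, 1, 2, 2, 2, :noise]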