ai4ruby 1.11

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (79) hide show
  1. data/README.rdoc +47 -0
  2. data/examples/classifiers/id3_data.csv +121 -0
  3. data/examples/classifiers/id3_example.rb +29 -0
  4. data/examples/classifiers/naive_bayes_data.csv +11 -0
  5. data/examples/classifiers/naive_bayes_example.rb +16 -0
  6. data/examples/classifiers/results.txt +31 -0
  7. data/examples/genetic_algorithm/genetic_algorithm_example.rb +37 -0
  8. data/examples/genetic_algorithm/travel_cost.csv +16 -0
  9. data/examples/neural_network/backpropagation_example.rb +67 -0
  10. data/examples/neural_network/patterns_with_base_noise.rb +68 -0
  11. data/examples/neural_network/patterns_with_noise.rb +66 -0
  12. data/examples/neural_network/training_patterns.rb +68 -0
  13. data/examples/neural_network/xor_example.rb +35 -0
  14. data/examples/som/som_data.rb +156 -0
  15. data/examples/som/som_multi_node_example.rb +22 -0
  16. data/examples/som/som_single_example.rb +24 -0
  17. data/lib/ai4r.rb +33 -0
  18. data/lib/ai4r/classifiers/classifier.rb +62 -0
  19. data/lib/ai4r/classifiers/hyperpipes.rb +118 -0
  20. data/lib/ai4r/classifiers/ib1.rb +121 -0
  21. data/lib/ai4r/classifiers/id3.rb +326 -0
  22. data/lib/ai4r/classifiers/multilayer_perceptron.rb +135 -0
  23. data/lib/ai4r/classifiers/naive_bayes.rb +259 -0
  24. data/lib/ai4r/classifiers/one_r.rb +110 -0
  25. data/lib/ai4r/classifiers/prism.rb +197 -0
  26. data/lib/ai4r/classifiers/zero_r.rb +73 -0
  27. data/lib/ai4r/clusterers/average_linkage.rb +59 -0
  28. data/lib/ai4r/clusterers/bisecting_k_means.rb +93 -0
  29. data/lib/ai4r/clusterers/centroid_linkage.rb +66 -0
  30. data/lib/ai4r/clusterers/clusterer.rb +61 -0
  31. data/lib/ai4r/clusterers/complete_linkage.rb +67 -0
  32. data/lib/ai4r/clusterers/diana.rb +139 -0
  33. data/lib/ai4r/clusterers/k_means.rb +126 -0
  34. data/lib/ai4r/clusterers/median_linkage.rb +61 -0
  35. data/lib/ai4r/clusterers/single_linkage.rb +194 -0
  36. data/lib/ai4r/clusterers/ward_linkage.rb +64 -0
  37. data/lib/ai4r/clusterers/ward_linkage_hierarchical.rb +31 -0
  38. data/lib/ai4r/clusterers/weighted_average_linkage.rb +61 -0
  39. data/lib/ai4r/data/data_set.rb +266 -0
  40. data/lib/ai4r/data/parameterizable.rb +64 -0
  41. data/lib/ai4r/data/proximity.rb +100 -0
  42. data/lib/ai4r/data/statistics.rb +77 -0
  43. data/lib/ai4r/experiment/classifier_evaluator.rb +95 -0
  44. data/lib/ai4r/genetic_algorithm/genetic_algorithm.rb +270 -0
  45. data/lib/ai4r/neural_network/backpropagation.rb +326 -0
  46. data/lib/ai4r/neural_network/hopfield.rb +149 -0
  47. data/lib/ai4r/som/layer.rb +68 -0
  48. data/lib/ai4r/som/node.rb +96 -0
  49. data/lib/ai4r/som/som.rb +155 -0
  50. data/lib/ai4r/som/two_phase_layer.rb +90 -0
  51. data/test/classifiers/hyperpipes_test.rb +84 -0
  52. data/test/classifiers/ib1_test.rb +78 -0
  53. data/test/classifiers/id3_test.rb +208 -0
  54. data/test/classifiers/multilayer_perceptron_test.rb +79 -0
  55. data/test/classifiers/naive_bayes_test.rb +43 -0
  56. data/test/classifiers/one_r_test.rb +62 -0
  57. data/test/classifiers/prism_test.rb +85 -0
  58. data/test/classifiers/zero_r_test.rb +49 -0
  59. data/test/clusterers/average_linkage_test.rb +51 -0
  60. data/test/clusterers/bisecting_k_means_test.rb +66 -0
  61. data/test/clusterers/centroid_linkage_test.rb +53 -0
  62. data/test/clusterers/complete_linkage_test.rb +57 -0
  63. data/test/clusterers/diana_test.rb +69 -0
  64. data/test/clusterers/k_means_test.rb +100 -0
  65. data/test/clusterers/median_linkage_test.rb +53 -0
  66. data/test/clusterers/single_linkage_test.rb +122 -0
  67. data/test/clusterers/ward_linkage_hierarchical_test.rb +61 -0
  68. data/test/clusterers/ward_linkage_test.rb +53 -0
  69. data/test/clusterers/weighted_average_linkage_test.rb +53 -0
  70. data/test/data/data_set_test.rb +96 -0
  71. data/test/data/proximity_test.rb +81 -0
  72. data/test/data/statistics_test.rb +65 -0
  73. data/test/experiment/classifier_evaluator_test.rb +76 -0
  74. data/test/genetic_algorithm/chromosome_test.rb +58 -0
  75. data/test/genetic_algorithm/genetic_algorithm_test.rb +81 -0
  76. data/test/neural_network/backpropagation_test.rb +82 -0
  77. data/test/neural_network/hopfield_test.rb +72 -0
  78. data/test/som/som_test.rb +97 -0
  79. metadata +168 -0
@@ -0,0 +1,61 @@
1
+ # Author:: Sergio Fierens
2
+ # License:: MPL 1.1
3
+ # Project:: ai4r
4
+ # Url:: http://ai4r.rubyforge.org/
5
+ #
6
+ # You can redistribute it and/or modify it under the terms of
7
+ # the Mozilla Public License version 1.1 as published by the
8
+ # Mozilla Foundation at http://www.mozilla.org/MPL/MPL-1.1.txt
9
+
10
+ require File.dirname(__FILE__) + '/../data/parameterizable'
11
+
12
+ module Ai4r
13
+ module Clusterers
14
+
15
+ # The purpose of this class is to define a common API for Clusterers.
16
+ # All methods in this class (other than eval) must be implemented in
17
+ # subclasses.
18
+ class Clusterer
19
+
20
+ include Ai4r::Data::Parameterizable
21
+
22
# Builds the clusterer from the data examples found in data_set,
# grouping the items into "number_of_clusters" clusters.
#
# This base implementation is only a placeholder for the common API:
# it always raises NotImplementedError.
def build(data_set, number_of_clusters)
  raise(NotImplementedError)
end
28
+
29
# Classifies the given data item, returning the cluster it belongs to.
#
# This base implementation is only a placeholder for the common API:
# it always raises NotImplementedError.
def eval(data_item)
  raise(NotImplementedError)
end
33
+
34
+ protected
35
# Useful as a default distance function for clustering algorithms.
#
# Walks the positions of a and, for every position where both a and b
# hold a Numeric, accumulates the squared difference. Positions where
# either side is not Numeric are skipped.
#
# Note that no square root is taken: the Float returned is the *squared*
# euclidean distance.
def euclidean_distance(a, b)
  a.each_index.inject(0.0) do |total, position|
    left = a[position]
    right = b[position]
    next total unless left.is_a?(Numeric) && right.is_a?(Numeric)
    total + ((left - right) * (left - right))
  end
end
45
+
46
# Returns the index of the smallest element of array, compared with <.
# When the minimum occurs more than once the first occurrence wins, and
# an empty array yields 0.
def get_min_index(array)
  array.each_index.inject(0) do |best, candidate|
    array[candidate] < array[best] ? candidate : best
  end
end
58
+
59
+ end
60
+ end
61
+ end
@@ -0,0 +1,67 @@
1
+ # Author:: Sergio Fierens (implementation)
2
+ # License:: MPL 1.1
3
+ # Project:: ai4r
4
+ # Url:: http://ai4r.rubyforge.org/
5
+ #
6
+ # You can redistribute it and/or modify it under the terms of
7
+ # the Mozilla Public License version 1.1 as published by the
8
+ # Mozilla Foundation at http://www.mozilla.org/MPL/MPL-1.1.txt
9
+
10
+ require File.dirname(__FILE__) + '/../data/data_set'
11
+ require File.dirname(__FILE__) + '/../clusterers/single_linkage'
12
+
13
+ module Ai4r
14
+ module Clusterers
15
+
16
+ # Implementation of a Hierarchical clusterer with complete linkage (Everitt
17
+ # et al., 2001 ; Jain and Dubes, 1988 ; Sorensen, 1948 ).
18
+ # Hierarchical clusterers create one cluster per element, and then
19
+ # progressively merge clusters, until the required number of clusters
20
+ # is reached.
21
+ # With complete linkage, the distance between two clusters is computed as
22
+ # the maximum distance between elements of each cluster.
23
+ #
24
+ # D(cx, (ci U cj)) = max(D(cx, ci), D(cx, cj))
25
+ class CompleteLinkage < SingleLinkage
26
+
27
+ parameters_info :distance_function =>
28
+ "Custom implementation of distance function. " +
29
+ "It must be a closure receiving two data items and return the " +
30
+ "distance bewteen them. By default, this algorithm uses " +
31
+ "ecuclidean distance of numeric attributes to the power of 2."
32
+
33
+
34
# Build a new clusterer, using data examples found in data_set.
# Items will be clustered in "number_of_clusters" different clusters.
#
# Pure delegation: both arguments are handed unchanged to the
# superclass implementation and its result is returned.
def build(data_set, number_of_clusters)
  super(data_set, number_of_clusters)
end
40
+
41
# Classifies the given data item, returning the cluster index it belongs
# to (0-based).
#
# Pure delegation: the item is handed unchanged to the superclass
# implementation and its result is returned.
def eval(data_item)
  super(data_item)
end
46
+
47
+ protected
48
+
49
# Returns the distance between cluster cx and the new cluster (ci U cj)
# using complete linkage: the larger of the two distances read from the
# distance matrix for (cx, ci) and (cx, cj).
def linkage_distance(cx, ci, cj)
  to_ci = read_distance_matrix(cx, ci)
  to_cj = read_distance_matrix(cx, cj)
  [to_ci, to_cj].max
end
55
+
56
# Returns the largest distance, as measured by @distance_function,
# between data_item and any item of cluster.data_items.
# The running maximum starts at 0, so an empty cluster yields 0.
def distance_between_item_and_cluster(data_item, cluster)
  cluster.data_items.inject(0) do |farthest, another_item|
    candidate = @distance_function.call(data_item, another_item)
    candidate > farthest ? candidate : farthest
  end
end
64
+
65
+ end
66
+ end
67
+ end
@@ -0,0 +1,139 @@
1
+ # Author:: Sergio Fierens (implementation)
2
+ # License:: MPL 1.1
3
+ # Project:: ai4r
4
+ # Url:: http://ai4r.rubyforge.org/
5
+ #
6
+ # You can redistribute it and/or modify it under the terms of
7
+ # the Mozilla Public License version 1.1 as published by the
8
+ # Mozilla Foundation at http://www.mozilla.org/MPL/MPL-1.1.txt
9
+
10
+ require File.dirname(__FILE__) + '/../data/data_set'
11
+ require File.dirname(__FILE__) + '/../data/proximity'
12
+ require File.dirname(__FILE__) + '/../clusterers/clusterer'
13
+
14
+ module Ai4r
15
+ module Clusterers
16
+
17
+ # DIANA (Divisive ANAlysis) (Kaufman and Rousseeuw, 1990;
18
+ # Macnaughton - Smith et al. 1964) is a Divisive Hierarchical
19
+ # Clusterer. It begins with only one cluster with all data items,
20
+ # and divides the clusters until the desired number of clusters is reached.
21
+ class Diana < Clusterer
22
+
23
+ attr_reader :data_set, :number_of_clusters, :clusters
24
+
25
+ parameters_info :distance_function =>
26
+ "Custom implementation of distance function. " +
27
+ "It must be a closure receiving two data items and return the " +
28
+ "distance bewteen them. By default, this algorithm uses " +
29
+ "ecuclidean distance of numeric attributes to the power of 2."
30
+
31
# Installs the default distance function: a two-argument lambda that
# keeps only the Numeric attributes of each data item and passes both
# filtered arrays to Ai4r::Data::Proximity.squared_euclidean_distance.
def initialize
  numeric_attributes = lambda do |data_item|
    data_item.select { |attribute| attribute.is_a?(Numeric) }
  end
  @distance_function = lambda do |a, b|
    Ai4r::Data::Proximity.squared_euclidean_distance(
      numeric_attributes.call(a), numeric_attributes.call(b))
  end
end
38
+
39
# Build a new clusterer, using divisive analysis (DIANA algorithm).
#
# Starts with a single cluster holding a copy of the whole data set
# (data_set[0..-1]). While there are fewer clusters than requested, the
# cluster with the largest diameter is split: a splinter cluster is
# seeded from it, and items keep moving from the parent to the splinter
# for as long as the best distance difference is not negative.
#
# Returns self.
def build(data_set, number_of_clusters)
  @data_set = data_set
  @number_of_clusters = number_of_clusters
  @clusters = [@data_set[0..-1]]

  until @clusters.length >= @number_of_clusters
    parent = @clusters[max_diameter_cluster(@clusters)]
    splinter = init_splinter_cluster(parent)

    # Re-evaluate after every move: both clusters have changed.
    gain, position = max_distance_difference(parent, splinter)
    until gain < 0
      splinter << parent.data_items[position]
      parent.data_items.delete_at(position)
      gain, position = max_distance_difference(parent, splinter)
    end

    @clusters << splinter
  end

  self
end
60
+
61
# Classifies the given data item, returning the cluster index it belongs
# to (0-based).
#
# For every cluster the summed distance from data_item to its items is
# divided by the number of items; the index of the smallest of those
# averages is returned.
def eval(data_item)
  average_distances = @clusters.map do |cluster|
    distance_sum(data_item, cluster) / cluster.data_items.length
  end
  get_min_index(average_distances)
end
68
+
69
+ protected
70
+
71
# Returns the index of the cluster with the largest diameter.
# Only a diameter strictly greater than the best seen so far (starting
# from 0) replaces it, so the first of several equal maxima wins and
# index 0 is returned when no diameter exceeds 0.
def max_diameter_cluster(clusters)
  widest = [0, 0] # [index, diameter]
  clusters.each_with_index do |cluster, position|
    diameter = cluster_diameter(cluster)
    widest = [position, diameter] if diameter > widest.last
  end
  widest.first
end
84
+
85
# Max distance between 2 items in a cluster.
# Every item is compared, through @distance_function, with each item
# that precedes it in cluster.data_items. The result is 0 when the
# cluster holds fewer than two items.
def cluster_diameter(cluster)
  items = cluster.data_items
  items.each_with_index.inject(0) do |widest, (item_a, item_a_pos)|
    items.first(item_a_pos).inject(widest) do |current, item_b|
      separation = @distance_function.call(item_a, item_b)
      separation > current ? separation : current
    end
  end
end
96
+
97
# Creates the splinter cluster from the item whose summed distance to
# the items of cluster_to_split is the largest (the first such item on
# ties; index 0 when no sum exceeds 0.0).
# The splinter is whatever cluster_to_split[index] returns, and the
# chosen item is then removed from cluster_to_split.data_items.
def init_splinter_cluster(cluster_to_split)
  farthest_index = 0
  largest_sum = 0.0
  cluster_to_split.data_items.each_with_index do |item, position|
    total = distance_sum(item, cluster_to_split)
    next unless total > largest_sum
    largest_sum = total
    farthest_index = position
  end

  # Take the splinter before deleting, while the index is still valid.
  seed = cluster_to_split[farthest_index]
  cluster_to_split.data_items.delete_at(farthest_index)
  seed
end
111
+
112
# For every item of cluster_to_split, computes its average distance to
# the rest of that cluster minus its average distance to the items of
# splinter_cluster, and returns the largest such difference together
# with the index of the item: [difference, index].
# A positive value means that the item is closer to the splinter group
# than to its current cluster.
# When no difference beats negative infinity (for example, an empty
# cluster_to_split) the result is [-Infinity, 0].
def max_distance_difference(cluster_to_split, splinter_cluster)
  best_diff = -Float::INFINITY
  best_index = 0
  cluster_to_split.data_items.each_with_index do |item, position|
    # The item itself contributes 0 to the sum, hence length - 1.
    within = distance_sum(item, cluster_to_split) /
             (cluster_to_split.data_items.length - 1)
    to_splinter = distance_sum(item, splinter_cluster) /
                  splinter_cluster.data_items.length
    gap = within - to_splinter
    next unless gap > best_diff
    best_diff = gap
    best_index = position
  end
  [best_diff, best_index]
end
129
+
130
# Sum up the distance, as measured by @distance_function, between
# item_a and every item of cluster.data_items.
# The accumulator starts at 0.0, so an empty cluster yields 0.0.
def distance_sum(item_a, cluster)
  total = 0.0
  cluster.data_items.each do |item_b|
    total = total + @distance_function.call(item_a, item_b)
  end
  total
end
136
+
137
+ end
138
+ end
139
+ end
@@ -0,0 +1,126 @@
1
+ # Author:: Sergio Fierens (implementation)
2
+ # License:: MPL 1.1
3
+ # Project:: ai4r
4
+ # Url:: http://ai4r.rubyforge.org/
5
+ #
6
+ # You can redistribute it and/or modify it under the terms of
7
+ # the Mozilla Public License version 1.1 as published by the
8
+ # Mozilla Foundation at http://www.mozilla.org/MPL/MPL-1.1.txt
9
+
10
+ require File.dirname(__FILE__) + '/../data/data_set'
11
+ require File.dirname(__FILE__) + '/../clusterers/clusterer'
12
+
13
+ module Ai4r
14
+ module Clusterers
15
+
16
+ # The k-means algorithm is an algorithm to cluster n objects
17
+ # based on attributes into k partitions, with k < n.
18
+ #
19
+ # More about K Means algorithm:
20
+ # http://en.wikipedia.org/wiki/K-means_algorithm
21
+ class KMeans < Clusterer
22
+
23
+ attr_reader :data_set, :number_of_clusters
24
+ attr_reader :clusters, :centroids, :iterations
25
+
26
+ parameters_info :max_iterations => "Maximum number of iterations to " +
27
+ "build the clusterer. By default it is uncapped.",
28
+ :distance_function => "Custom implementation of distance function. " +
29
+ "It must be a closure receiving two data items and return the " +
30
+ "distance bewteen them. By default, this algorithm uses " +
31
+ "ecuclidean distance of numeric attributes to the power of 2.",
32
+ :centroid_function => "Custom implementation to calculate the " +
33
+ "centroid of a cluster. It must be a closure receiving an array of " +
34
+ "data sets, and return an array of data items, representing the " +
35
+ "centroids of for each data set. " +
36
+ "By default, this algorithm returns a data items using the mode "+
37
+ "or mean of each attribute on each data set."
38
+
39
# Sets the defaults: no custom distance function, no iteration cap, no
# previous centroids, and a centroid function that maps every data set
# it receives to the result of its get_mean_or_mode method.
def initialize
  @distance_function = @max_iterations = @old_centroids = nil
  @centroid_function = lambda do |data_sets|
    data_sets.map { |cluster| cluster.get_mean_or_mode }
  end
end
47
+
48
+
49
# Build a new clusterer, using data examples found in data_set.
# Items will be clustered in "number_of_clusters" different clusters.
#
# Picks the initial centroids, then alternates between assigning items
# to clusters and recomputing the centroids until the stop criteria are
# met. Returns self.
def build(data_set, number_of_clusters)
  @data_set = data_set
  @number_of_clusters = number_of_clusters
  @iterations = 0
  # Forget the centroids of any earlier build. Otherwise, if the freshly
  # picked initial centroids happen to equal the previous final ones, the
  # stop criteria would be met at once and the clusters of the earlier
  # build would be left in place.
  @old_centroids = nil

  calc_initial_centroids
  until stop_criteria_met
    calculate_membership_clusters
    recompute_centroids
  end

  self
end
65
+
66
# Classifies the given data item, returning the cluster index it belongs
# to (0-based): the index of the centroid with the smallest distance to
# the item.
def eval(data_item)
  distances_to_centroids = @centroids.map do |centroid|
    distance(data_item, centroid)
  end
  get_min_index(distances_to_centroids)
end
72
+
73
# This function calculates the distance between 2 different instances.
# When a :distance_function closure has been provided it is called with
# both instances; otherwise the result of euclidean_distance is returned
# (the euclidean distance to the power of 2).
# You can provide a more convenient distance implementation by:
#
# 1- Overwriting this method
#
# 2- Providing a closure to the :distance_function parameter
def distance(a, b)
  @distance_function ? @distance_function.call(a, b) : euclidean_distance(a, b)
end
85
+
86
+ protected
87
+
88
# Picks the initial centroids at random among the data items.
# Random indexes are drawn until either the requested number of
# centroids has been collected or every index has been tried. An item
# equal to an already chosen centroid is not added again.
# @number_of_clusters is finally lowered to the number of centroids
# actually found.
def calc_initial_centroids
  @centroids = []
  visited = []
  until @centroids.length >= @number_of_clusters ||
        visited.length >= @data_set.data_items.length
    pick = rand(@data_set.data_items.length)
    next if visited.include?(pick)
    visited << pick
    candidate = @data_set.data_items[pick]
    @centroids << candidate unless @centroids.include?(candidate)
  end
  @number_of_clusters = @centroids.length
end
103
+
104
# Tells whether the build loop must stop: either the centroids did not
# change since the previous iteration, or a maximum number of iterations
# is set and has been reached.
def stop_criteria_met
  return true if @old_centroids == @centroids
  @max_iterations && (@max_iterations <= @iterations)
end
108
+
109
# Rebuilds @clusters: creates @number_of_clusters new data sets carrying
# the labels of @data_set, then appends every data item to the cluster
# whose index eval returns for it.
def calculate_membership_clusters
  @clusters = (0...@number_of_clusters).map do
    Ai4r::Data::DataSet.new :data_labels => @data_set.data_labels
  end
  @data_set.data_items.each do |data_item|
    target = eval(data_item)
    @clusters[target] << data_item
  end
end
117
+
118
# Remembers the current centroids as @old_centroids, counts one more
# iteration, and replaces @centroids with whatever @centroid_function
# returns for the current clusters.
def recompute_centroids
  @old_centroids = @centroids
  @iterations = @iterations + 1
  refreshed = @centroid_function.call(@clusters)
  @centroids = refreshed
end
123
+
124
+ end
125
+ end
126
+ end
@@ -0,0 +1,61 @@
1
+ # Author:: Sergio Fierens (implementation)
2
+ # License:: MPL 1.1
3
+ # Project:: ai4r
4
+ # Url:: http://ai4r.rubyforge.org/
5
+ #
6
+ # You can redistribute it and/or modify it under the terms of
7
+ # the Mozilla Public License version 1.1 as published by the
8
+ # Mozilla Foundation at http://www.mozilla.org/MPL/MPL-1.1.txt
9
+
10
+ require File.dirname(__FILE__) + '/../data/data_set'
11
+ require File.dirname(__FILE__) + '/../clusterers/single_linkage'
12
+
13
+ module Ai4r
14
+ module Clusterers
15
+
16
+ # Implementation of an Agglomerative Hierarchical clusterer with
17
+ # median linkage algorithm, aka weighted pair group method centroid
18
+ # or WPGMC (Everitt et al., 2001 ; Gower, 1967 ; Jain and Dubes, 1988 ).
19
+ # Hierarchical clusterers create one cluster per element, and then
20
+ # progressively merge clusters, until the required number of clusters
21
+ # is reached.
22
+ # Similar to centroid linkage, but using fixed weights:
23
+ #
24
+ # D(cx, (ci U cj)) = (1/2)*D(cx, ci) +
25
+ # (1/2)*D(cx, cj) -
26
+ # (1/4)*D(ci, cj)
27
+ class MedianLinkage < SingleLinkage
28
+
29
+ parameters_info :distance_function =>
30
+ "Custom implementation of distance function. " +
31
+ "It must be a closure receiving two data items and return the " +
32
+ "distance bewteen them. By default, this algorithm uses " +
33
+ "ecuclidean distance of numeric attributes to the power of 2."
34
+
35
# Build a new clusterer, using data examples found in data_set.
# Items will be clustered in "number_of_clusters" different clusters.
#
# Pure delegation: both arguments are handed unchanged to the
# superclass implementation and its result is returned.
def build(data_set, number_of_clusters)
  super(data_set, number_of_clusters)
end
41
+
42
# This algorithm does not allow classification of new data items once
# it has been built. Rebuild the clusterer including your data element
# instead.
#
# Always raises a RuntimeError explaining that eval is not supported.
def eval(data_item)
  # Kernel#raise is lower case; the capitalised "Raise" was an undefined
  # method and surfaced as a NoMethodError instead of this message.
  raise "Eval of new data is not supported by this algorithm."
end
47
+
48
+ protected
49
+
50
# Returns the distance between cluster cx and cluster (ci U cj), using
# median linkage:
#
#   0.5 * D(cx, ci) + 0.5 * D(cx, cj) - 0.25 * D(ci, cj)
#
# where each D is read from the distance matrix.
def linkage_distance(cx, ci, cj)
  to_ci = read_distance_matrix(cx, ci)
  to_cj = read_distance_matrix(cx, cj)
  between_merged = read_distance_matrix(ci, cj)
  0.5 * to_ci + 0.5 * to_cj - 0.25 * between_merged
end
57
+
58
+ end
59
+ end
60
+ end
61
+