ai4r 1.4 → 1.5

Sign up to get free protection for your applications and to get access to all the features.
Files changed (62) hide show
  1. data/README.rdoc +24 -3
  2. data/examples/decision_trees/id3_example.rb +1 -1
  3. data/examples/genetic_algorithm/genetic_algorithm_example.rb +1 -1
  4. data/lib/ai4r.rb +11 -0
  5. data/lib/ai4r/classifiers/classifier.rb +2 -0
  6. data/lib/ai4r/classifiers/id3.rb +3 -2
  7. data/lib/ai4r/classifiers/multilayer_perceptron.rb +135 -0
  8. data/lib/ai4r/classifiers/one_r.rb +2 -1
  9. data/lib/ai4r/classifiers/prism.rb +2 -1
  10. data/lib/ai4r/classifiers/zero_r.rb +2 -1
  11. data/lib/ai4r/clusterers/average_linkage.rb +60 -0
  12. data/lib/ai4r/clusterers/bisecting_k_means.rb +17 -39
  13. data/lib/ai4r/clusterers/clusterer.rb +25 -0
  14. data/lib/ai4r/clusterers/complete_linkage.rb +62 -0
  15. data/lib/ai4r/clusterers/k_means.rb +18 -25
  16. data/lib/ai4r/clusterers/single_linkage.rb +179 -0
  17. data/lib/ai4r/data/data_set.rb +33 -41
  18. data/lib/ai4r/data/proximity.rb +82 -0
  19. data/lib/ai4r/data/statistics.rb +77 -0
  20. data/lib/ai4r/experiment/classifier_evaluator.rb +95 -0
  21. data/lib/ai4r/genetic_algorithm/genetic_algorithm.rb +2 -4
  22. data/site/build/site/en/build/tmp/build-info.xml +5 -0
  23. data/site/build/site/en/build/tmp/plugins-1.xml +212 -0
  24. data/site/build/site/en/build/tmp/plugins-2.xml +252 -0
  25. data/site/build/site/en/build/tmp/projfilters.properties +41 -0
  26. data/site/build/site/en/downloads.html +1 -1
  27. data/site/build/site/en/geneticAlgorithms.html +1 -1
  28. data/site/build/site/en/index.html +44 -7
  29. data/site/build/site/en/index.pdf +278 -155
  30. data/site/build/site/en/linkmap.html +2 -2
  31. data/site/build/site/en/linkmap.pdf +12 -12
  32. data/site/build/site/en/machineLearning.html +1 -1
  33. data/site/build/site/en/neuralNetworks.html +1 -1
  34. data/site/build/site/en/sourceCode.html +244 -0
  35. data/site/build/site/en/sourceCode.pdf +278 -0
  36. data/site/build/site/en/svn.html +34 -42
  37. data/site/build/site/en/svn.pdf +86 -114
  38. data/site/build/tmp/cocoon-work/cache-dir/cocoon-ehcache-1.data +0 -0
  39. data/site/build/tmp/cocoon-work/cache-dir/cocoon-ehcache-1.index +0 -0
  40. data/site/build/tmp/projfilters.properties +1 -1
  41. data/site/build/webapp/WEB-INF/logs/core.log +628 -629
  42. data/site/build/webapp/WEB-INF/logs/error.log +213 -213
  43. data/site/src/documentation/content/xdocs/index.xml +20 -1
  44. data/site/src/documentation/content/xdocs/site.xml +1 -1
  45. data/site/src/documentation/content/xdocs/sourceCode.xml +43 -0
  46. data/site/src/documentation/resources/images/sigmoid.png +0 -0
  47. data/test/classifiers/id3_test.rb +0 -1
  48. data/test/classifiers/multilayer_perceptron_test.rb +79 -0
  49. data/test/classifiers/one_r_test.rb +0 -2
  50. data/test/classifiers/prism_test.rb +0 -2
  51. data/test/classifiers/zero_r_test.rb +0 -2
  52. data/test/clusterers/average_linkage_test.rb +45 -0
  53. data/test/clusterers/bisecting_k_means_test.rb +0 -2
  54. data/test/clusterers/complete_linkage_test.rb +45 -0
  55. data/test/clusterers/k_means_test.rb +0 -2
  56. data/test/clusterers/single_linkage_test.rb +113 -0
  57. data/test/data/data_set_test.rb +3 -15
  58. data/test/data/proximity_test.rb +71 -0
  59. data/test/data/statistics_test.rb +65 -0
  60. data/test/experiment/classifier_evaluator_test.rb +76 -0
  61. metadata +27 -6
  62. data/site/src/documentation/content/xdocs/svn.xml +0 -41
@@ -7,7 +7,6 @@
7
7
  # the Mozilla Public License version 1.1 as published by the
8
8
  # Mozilla Foundation at http://www.mozilla.org/MPL/MPL-1.1.txt
9
9
 
10
- require "set"
11
10
  require File.dirname(__FILE__) + '/../data/data_set'
12
11
  require File.dirname(__FILE__) + '/../clusterers/clusterer'
13
12
 
@@ -29,7 +28,23 @@ module Ai4r
29
28
  :distance_function => "Custom implementation of distance function. " +
30
29
  "It must be a closure receiving two data items and return the " +
31
30
  "distance bewteen them. By default, this algorithm uses " +
32
- "ecuclidean distance of numeric attributes to the power of 2."
31
+ "ecuclidean distance of numeric attributes to the power of 2.",
32
+ :centroid_function => "Custom implementation to calculate the " +
33
+ "centroid of a cluster. It must be a closure receiving an array of " +
34
+ "data sets, and return an array of data items, representing the " +
35
+ "centroids of for each data set. " +
36
+ "By default, this algorithm returns a data items using the mode "+
37
+ "or mean of each attribute on each data set."
38
+
39
+ def initialize
40
+ @distance_function = nil
41
+ @max_iterations = nil
42
+ @old_centroids = nil
43
+ @centroid_function = lambda do |data_sets|
44
+ data_sets.collect{ |data_set| data_set.get_mean_or_mode}
45
+ end
46
+ end
47
+
33
48
 
34
49
  # Build a new clusterer, using data examples found in data_set.
35
50
  # Items will be clustered in "number_of_clusters" different
@@ -69,15 +84,6 @@ module Ai4r
69
84
  end
70
85
 
71
86
  protected
72
- def euclidean_distance(a, b)
73
- dist = 0.0
74
- a.each_index do |index|
75
- if a[index].is_a?(Numeric) && b[index].is_a?(Numeric)
76
- dist = dist + ((a[index]-b[index])*(a[index]-b[index]))
77
- end
78
- end
79
- return dist
80
- end
81
87
 
82
88
  def calc_initial_centroids
83
89
  @centroids = []
@@ -111,21 +117,8 @@ module Ai4r
111
117
 
112
118
  def recompute_centroids
113
119
  @old_centroids = @centroids
114
- @centroids = @clusters.collect { |cluster| cluster.get_mean_or_mode }
115
120
  @iterations += 1
116
- end
117
-
118
- def get_min_index(array)
119
- min = array.first
120
- index = 0
121
- array.each_index do |i|
122
- x = array[i]
123
- if x < min
124
- min = x
125
- index = i
126
- end
127
- end
128
- return index
121
+ @centroids = @centroid_function.call(@clusters)
129
122
  end
130
123
 
131
124
  end
@@ -0,0 +1,179 @@
1
+ # Author:: Sergio Fierens (implementation)
2
+ # License:: MPL 1.1
3
+ # Project:: ai4r
4
+ # Url:: http://ai4r.rubyforge.org/
5
+ #
6
+ # You can redistribute it and/or modify it under the terms of
7
+ # the Mozilla Public License version 1.1 as published by the
8
+ # Mozilla Foundation at http://www.mozilla.org/MPL/MPL-1.1.txt
9
+
10
+ require File.dirname(__FILE__) + '/../data/data_set'
11
+ require File.dirname(__FILE__) + '/../clusterers/clusterer'
12
+
13
+ module Ai4r
14
+ module Clusterers
15
+
16
+ # Implementation of a Hierarchical clusterer with single linkage.
17
+ # Hierarchical clusterers create one cluster per element, and then
18
+ # progressively merge clusters, until the required number of clusters
19
+ # is reached.
20
+ # With single linkage, the distance between two clusters is computed as the
21
+ # distance between the two closest elements in the two clusters.
22
+ class SingleLinkage < Clusterer
23
+
24
+ attr_reader :data_set, :number_of_clusters, :clusters
25
+
26
+ parameters_info :distance_function =>
27
+ "Custom implementation of distance function. " +
28
+ "It must be a closure receiving two data items and return the " +
29
+ "distance bewteen them. By default, this algorithm uses " +
30
+ "ecuclidean distance of numeric attributes to the power of 2."
31
+
32
+ def initialize
33
+ @distance_function = nil
34
+ end
35
+
36
+ # Build a new clusterer, using data examples found in data_set.
37
+ # Items will be clustered in "number_of_clusters" different
38
+ # clusters.
39
+ def build(data_set, number_of_clusters)
40
+ @data_set = data_set
41
+ @number_of_clusters = number_of_clusters
42
+
43
+ index_clusters = create_initial_index_clusters
44
+ create_distance_matrix(data_set)
45
+ while index_clusters.length > @number_of_clusters
46
+ clusters_to_merge = get_closest_clusters(index_clusters)
47
+ index_clusters = merge_clusters(clusters_to_merge, index_clusters)
48
+ end
49
+ @clusters = build_clusters_from_index_clusters index_clusters
50
+
51
+ return self
52
+ end
53
+
54
+ # Classifies the given data item, returning the cluster index it belongs
55
+ # to (0-based).
56
+ def eval(data_item)
57
+ get_min_index(@clusters.collect {|cluster|
58
+ distance_between_item_and_cluster(data_item, cluster)})
59
+ end
60
+
61
+ # This function calculates the distance between 2 different
62
+ # instances. By default, it returns the euclidean distance to the
63
+ # power of 2.
64
+ # You can provide a more convenient distance implementation:
65
+ #
66
+ # 1- Overwriting this method
67
+ #
68
+ # 2- Providing a closure to the :distance_function parameter
69
+ def distance(a, b)
70
+ return @distance_function.call(a, b) if @distance_function
71
+ return euclidean_distance(a, b)
72
+ end
73
+
74
+ protected
75
+
76
+ # returns [ [0], [1], [2], ... , [n-1] ]
77
+ # where n is the number of data items in the data set
78
+ def create_initial_index_clusters
79
+ index_clusters = []
80
+ @data_set.data_items.length.times {|i| index_clusters << [i]}
81
+ return index_clusters
82
+ end
83
+
84
+ # Create a partial distance matrix:
85
+ # [
86
+ # [d(1,0)],
87
+ # [d(2,0)], [d(2,1)],
88
+ # [d(3,0)], [d(3,1)], [d(3,2)],
89
+ # ...
90
+ # [d(n-1,0)], [d(n-1,1)], [d(n-1,2)], ... , [d(n-1,n-2)]
91
+ # ]
92
+ # where n is the number of data items in the data set
93
+ def create_distance_matrix(data_set)
94
+ @distance_matrix = Array.new(data_set.data_items.length-1) {|index| Array.new(index+1)}
95
+ data_set.data_items.each_with_index do |a, i|
96
+ i.times do |j|
97
+ b = data_set.data_items[j]
98
+ @distance_matrix[i-1][j] = distance(a, b)
99
+ end
100
+ end
101
+ end
102
+
103
+ # Returns the distance between element data_item[index_a] and
104
+ # data_item[index_b] using the distance matrix
105
+ def read_distance_matrix(index_a, index_b)
106
+ return 0 if index_a == index_b
107
+ index_a, index_b = index_b, index_a if index_b > index_a
108
+ return @distance_matrix[index_a-1][index_b]
109
+ end
110
+
111
+ # clusters_to_merge = [index_cluster_a, index_cluster_b].
112
+ # cluster_a and cluster_b are removed from index_clusters,
113
+ # and a new cluster with all members of cluster_a and cluster_b
114
+ # is added.
115
+ # It returns the new clusters array.
116
+ def merge_clusters(clusters_to_merge, index_clusters)
117
+ index_a = clusters_to_merge.first
118
+ index_b = clusters_to_merge.last
119
+ index_a, index_b = index_b, index_a if index_b > index_a
120
+ new_index_cluster = index_clusters[index_a] +
121
+ index_clusters[index_b]
122
+ index_clusters.delete_at index_a
123
+ index_clusters.delete_at index_b
124
+ index_clusters << new_index_cluster
125
+ return index_clusters
126
+ end
127
+
128
+ # Given an array with clusters of data_items indexes,
129
+ # it returns an array of data_items clusters
130
+ def build_clusters_from_index_clusters(index_clusters)
131
+ @distance_matrix = nil
132
+ return index_clusters.collect do |index_cluster|
133
+ Ai4r::Data::DataSet.new(:data_labels => @data_set.data_labels,
134
+ :data_items => index_cluster.collect {|i| @data_set.data_items[i]})
135
+ end
136
+ end
137
+
138
+ # Returns an array with the indexes of the two closest
139
+ # clusters => [index_cluster_a, index_cluster_b]
140
+ def get_closest_clusters(index_clusters)
141
+ min_distance = 1.0/0
142
+ closest_clusters = [1, 0]
143
+ index_clusters.each_with_index do |cluster_a, index_a|
144
+ index_a.times do |index_b|
145
+ cluster_b = index_clusters[index_b]
146
+ cluster_distance = calc_index_clusters_distance(cluster_a, cluster_b)
147
+ if cluster_distance < min_distance
148
+ closest_clusters = [index_a, index_b]
149
+ min_distance = cluster_distance
150
+ end
151
+ end
152
+ end
153
+ return closest_clusters
154
+ end
155
+
156
+ # Calculate cluster distance using the single linkage method
157
+ def calc_index_clusters_distance(cluster_a, cluster_b)
158
+ min_dist = 1.0/0
159
+ cluster_a.each do |index_a|
160
+ cluster_b.each do |index_b|
161
+ dist = read_distance_matrix(index_a, index_b)
162
+ min_dist = dist if dist < min_dist
163
+ end
164
+ end
165
+ return min_dist
166
+ end
167
+
168
+ def distance_between_item_and_cluster(data_item, cluster)
169
+ min_dist = 1.0/0
170
+ cluster.data_items.each do |another_item|
171
+ dist = distance(data_item, another_item)
172
+ min_dist = dist if dist < min_dist
173
+ end
174
+ return min_dist
175
+ end
176
+
177
+ end
178
+ end
179
+ end
@@ -9,11 +9,19 @@
9
9
 
10
10
  require 'csv'
11
11
  require 'set'
12
+ require File.dirname(__FILE__) + '/statistics'
12
13
 
13
14
  module Ai4r
14
15
  module Data
16
+
17
+ # A data set is a collection of N data items. Each data item is
18
+ # described by a set of attributes, represented as an array.
19
+ # Optionally, you can assign a label to the attributes, using
20
+ # the data_labels property.
15
21
  class DataSet
16
22
 
23
+ @@number_regex = /(((\b[0-9]+)?\.)?\b[0-9]+([eE][-+]?[0-9]+)?\b)/
24
+
17
25
  attr_reader :data_labels, :data_items
18
26
 
19
27
  # Create a new DataSet. By default, empty.
@@ -24,7 +32,7 @@ module Ai4r
24
32
  # If you provide data items, but no data labels, the data set will
25
33
  # use the default data label values (see set_data_labels)
26
34
  def initialize(options = {})
27
- @data_labels = options[:data_labels] || []
35
+ @data_labels = []
28
36
  @data_items = options[:data_items] || []
29
37
  set_data_labels(options[:data_labels]) if options[:data_labels]
30
38
  set_data_items(options[:data_items]) if options[:data_items]
@@ -38,7 +46,7 @@ module Ai4r
38
46
  end
39
47
 
40
48
  # Load data items from csv file
41
- def load_data_from_csv(filepath)
49
+ def load_csv(filepath)
42
50
  items = []
43
51
  CSV::Reader.parse(File.open(filepath, 'r')) do |row|
44
52
  items << row
@@ -47,12 +55,21 @@ module Ai4r
47
55
  end
48
56
 
49
57
  # Load data items from csv file. The first row is used as data labels.
50
- def load_data_and_labels_from_csv(filepath)
51
- load_data_from_csv(filepath)
58
+ def load_csv_with_labels(filepath)
59
+ load_csv(filepath)
52
60
  @data_labels = @data_items.shift
53
61
  return self
54
62
  end
55
63
 
64
+ # Same as load_csv, but it will try to convert cell contents to numbers.
65
+ def parse_csv(filepath)
66
+ items = []
67
+ CSV::Reader.parse(File.open(filepath, 'r')) do |row|
68
+ items << row.collect{|x| (x.match(@@number_regex)) ? x.to_f : x.data }
69
+ end
70
+ set_data_items(items)
71
+ end
72
+
56
73
  # Set data labels.
57
74
  # Data labels must have the following format:
58
75
  # [ 'city', 'age_range', 'gender', 'marketing_target' ]
@@ -144,7 +161,7 @@ module Ai4r
144
161
  # get_index("gender")
145
162
  # => 2
146
163
  def get_index(attr)
147
- return (attr.is_a?(String)) ? @data_labels.index(attr) : attr
164
+ return (attr.is_a?(Fixnum) || attr.is_a?(Range)) ? attr : @data_labels.index(attr)
148
165
  end
149
166
 
150
167
  # Raise an exception if there is no data item.
@@ -168,44 +185,19 @@ module Ai4r
168
185
  @data_items << data_item
169
186
  end
170
187
  end
171
-
172
- def get_attribute_mean(attribute)
173
- index = get_index(attribute)
174
- mean = 0.0
175
- @data_items.each { |data_item| mean += data_item[index] }
176
- mean /= @data_items.length
177
- return mean
178
- end
179
-
180
- def get_attribute_mode(attribute)
181
- index = get_index(attribute)
182
- domain = build_domain(attribute)
183
- count = {}
184
- domain.each {|value| count[value]=0}
185
- @data_items.each { |data_item| count[data_item[index]] += 1 }
186
- max_count = 0
187
- mode = nil
188
- count.each_pair do |value, value_count|
189
- if value_count > max_count
190
- mode = value
191
- max_count = value_count
192
- end
193
- end
194
- return mode
195
- end
196
-
197
- def get_attribute_mean_or_mode(attribute)
198
- index = get_index(attribute)
199
- if @data_items.first[index].is_a?(Numeric)
200
- return get_attribute_mean(attribute)
201
- else
202
- return get_attribute_mode(attribute)
203
- end
204
- end
205
-
188
+
189
+ # Returns an array with the mean value of numeric attributes, and
190
+ # the most frequent value of non numeric attributes
206
191
  def get_mean_or_mode
207
192
  mean = []
208
- num_attributes.times {|i| mean[i] = get_attribute_mean_or_mode(i) }
193
+ num_attributes.times do |i|
194
+ mean[i] =
195
+ if @data_items.first[i].is_a?(Numeric)
196
+ Statistics.mean(self, i)
197
+ else
198
+ Statistics.mode(self, i)
199
+ end
200
+ end
209
201
  return mean
210
202
  end
211
203
 
@@ -0,0 +1,82 @@
1
+ # Author:: Sergio Fierens
2
+ # License:: MPL 1.1
3
+ # Project:: ai4r
4
+ # Url:: http://ai4r.rubyforge.org/
5
+ #
6
+ # You can redistribute it and/or modify it under the terms of
7
+ # the Mozilla Public License version 1.1 as published by the
8
+ # Mozilla Foundation at http://www.mozilla.org/MPL/MPL-1.1.txt
9
+
10
+ module Ai4r
11
+ module Data
12
+
13
+ # This module provides classical distance functions
14
+ module Proximity
15
+
16
+ # This is a faster computational replacement for euclidean distance.
17
+ # Parameters a and b are vectors with continuous attributes.
18
+ def self.squared_euclidean_distance(a, b)
19
+ sum = 0.0
20
+ a.each_with_index do |item_a, i|
21
+ item_b = b[i]
22
+ sum += (item_a - item_b)**2
23
+ end
24
+ return sum
25
+ end
26
+
27
+ # Euclidean distance, or L2 norm.
28
+ # Parameters a and b are vectors with continuous attributes.
29
+ # Euclidean distance tends to form hyperspherical
30
+ # clusters(Clustering, Xu and Wunsch, 2009).
31
+ # Translations and rotations do not cause a
32
+ # distortion in distance relation (Duda et al, 2001)
33
+ # If attributes are measured with different units,
34
+ # attributes with larger values and variance will
35
+ # dominate the metric.
36
+ def self.euclidean_distance(a, b)
37
+ Math.sqrt(squared_euclidean_distance(a, b))
38
+ end
39
+
40
+
41
+ # city block, Manhattan distance, or L1 norm.
42
+ # Parameters a and b are vectors with continuous attributes.
43
+ def self.manhattan_distance(a, b)
44
+ sum = 0.0
45
+ a.each_with_index do |item_a, i|
46
+ item_b = b[i]
47
+ sum += (item_a - item_b).abs
48
+ end
49
+ return sum
50
+ end
51
+
52
+ # Sup distance, or L-infinity norm
53
+ # Parameters a and b are vectors with continuous attributes.
54
+ def self.sup_distance(a, b)
55
+ distance = 0.0
56
+ a.each_with_index do |item_a, i|
57
+ item_b = b[i]
58
+ diff = (item_a - item_b).abs
59
+ distance = diff if diff > distance
60
+ end
61
+ return distance
62
+ end
63
+
64
+ # The Hamming distance between two attributes vectors of equal
65
+ # length is the number of positions at which the corresponding
66
+ # attribute values differ.
67
+ # This distance function is frequently used with binary attributes,
68
+ # though it can be used with other discrete attributes.
69
+ def self.hamming_distance(a,b)
70
+ count = 0
71
+ a.each_index do |i|
72
+ count += 1 if a[i] != b[i]
73
+ end
74
+ return count
75
+ end
76
+
77
+ end
78
+
79
+ end
80
+
81
+ end
82
+