ai4r 1.4 → 1.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (62) hide show
  1. data/README.rdoc +24 -3
  2. data/examples/decision_trees/id3_example.rb +1 -1
  3. data/examples/genetic_algorithm/genetic_algorithm_example.rb +1 -1
  4. data/lib/ai4r.rb +11 -0
  5. data/lib/ai4r/classifiers/classifier.rb +2 -0
  6. data/lib/ai4r/classifiers/id3.rb +3 -2
  7. data/lib/ai4r/classifiers/multilayer_perceptron.rb +135 -0
  8. data/lib/ai4r/classifiers/one_r.rb +2 -1
  9. data/lib/ai4r/classifiers/prism.rb +2 -1
  10. data/lib/ai4r/classifiers/zero_r.rb +2 -1
  11. data/lib/ai4r/clusterers/average_linkage.rb +60 -0
  12. data/lib/ai4r/clusterers/bisecting_k_means.rb +17 -39
  13. data/lib/ai4r/clusterers/clusterer.rb +25 -0
  14. data/lib/ai4r/clusterers/complete_linkage.rb +62 -0
  15. data/lib/ai4r/clusterers/k_means.rb +18 -25
  16. data/lib/ai4r/clusterers/single_linkage.rb +179 -0
  17. data/lib/ai4r/data/data_set.rb +33 -41
  18. data/lib/ai4r/data/proximity.rb +82 -0
  19. data/lib/ai4r/data/statistics.rb +77 -0
  20. data/lib/ai4r/experiment/classifier_evaluator.rb +95 -0
  21. data/lib/ai4r/genetic_algorithm/genetic_algorithm.rb +2 -4
  22. data/site/build/site/en/build/tmp/build-info.xml +5 -0
  23. data/site/build/site/en/build/tmp/plugins-1.xml +212 -0
  24. data/site/build/site/en/build/tmp/plugins-2.xml +252 -0
  25. data/site/build/site/en/build/tmp/projfilters.properties +41 -0
  26. data/site/build/site/en/downloads.html +1 -1
  27. data/site/build/site/en/geneticAlgorithms.html +1 -1
  28. data/site/build/site/en/index.html +44 -7
  29. data/site/build/site/en/index.pdf +278 -155
  30. data/site/build/site/en/linkmap.html +2 -2
  31. data/site/build/site/en/linkmap.pdf +12 -12
  32. data/site/build/site/en/machineLearning.html +1 -1
  33. data/site/build/site/en/neuralNetworks.html +1 -1
  34. data/site/build/site/en/sourceCode.html +244 -0
  35. data/site/build/site/en/sourceCode.pdf +278 -0
  36. data/site/build/site/en/svn.html +34 -42
  37. data/site/build/site/en/svn.pdf +86 -114
  38. data/site/build/tmp/cocoon-work/cache-dir/cocoon-ehcache-1.data +0 -0
  39. data/site/build/tmp/cocoon-work/cache-dir/cocoon-ehcache-1.index +0 -0
  40. data/site/build/tmp/projfilters.properties +1 -1
  41. data/site/build/webapp/WEB-INF/logs/core.log +628 -629
  42. data/site/build/webapp/WEB-INF/logs/error.log +213 -213
  43. data/site/src/documentation/content/xdocs/index.xml +20 -1
  44. data/site/src/documentation/content/xdocs/site.xml +1 -1
  45. data/site/src/documentation/content/xdocs/sourceCode.xml +43 -0
  46. data/site/src/documentation/resources/images/sigmoid.png +0 -0
  47. data/test/classifiers/id3_test.rb +0 -1
  48. data/test/classifiers/multilayer_perceptron_test.rb +79 -0
  49. data/test/classifiers/one_r_test.rb +0 -2
  50. data/test/classifiers/prism_test.rb +0 -2
  51. data/test/classifiers/zero_r_test.rb +0 -2
  52. data/test/clusterers/average_linkage_test.rb +45 -0
  53. data/test/clusterers/bisecting_k_means_test.rb +0 -2
  54. data/test/clusterers/complete_linkage_test.rb +45 -0
  55. data/test/clusterers/k_means_test.rb +0 -2
  56. data/test/clusterers/single_linkage_test.rb +113 -0
  57. data/test/data/data_set_test.rb +3 -15
  58. data/test/data/proximity_test.rb +71 -0
  59. data/test/data/statistics_test.rb +65 -0
  60. data/test/experiment/classifier_evaluator_test.rb +76 -0
  61. metadata +27 -6
  62. data/site/src/documentation/content/xdocs/svn.xml +0 -41
@@ -7,7 +7,6 @@
7
7
  # the Mozilla Public License version 1.1 as published by the
8
8
  # Mozilla Foundation at http://www.mozilla.org/MPL/MPL-1.1.txt
9
9
 
10
- require "set"
11
10
  require File.dirname(__FILE__) + '/../data/data_set'
12
11
  require File.dirname(__FILE__) + '/../clusterers/clusterer'
13
12
 
@@ -29,7 +28,23 @@ module Ai4r
29
28
  :distance_function => "Custom implementation of distance function. " +
30
29
  "It must be a closure receiving two data items and return the " +
31
30
  "distance bewteen them. By default, this algorithm uses " +
32
- "ecuclidean distance of numeric attributes to the power of 2."
31
+ "ecuclidean distance of numeric attributes to the power of 2.",
32
+ :centroid_function => "Custom implementation to calculate the " +
33
+ "centroid of a cluster. It must be a closure receiving an array of " +
34
+ "data sets, and return an array of data items, representing the " +
35
+ "centroids of for each data set. " +
36
+ "By default, this algorithm returns a data items using the mode "+
37
+ "or mean of each attribute on each data set."
38
+
39
# Creates a clusterer with its optional settings left unset (nil) and a
# default centroid function that maps every data set to the result of
# its get_mean_or_mode call.
def initialize
  @distance_function = @max_iterations = @old_centroids = nil
  @centroid_function = lambda { |data_sets|
    data_sets.map { |set| set.get_mean_or_mode }
  }
end
47
+
33
48
 
34
49
  # Build a new clusterer, using data examples found in data_set.
35
50
  # Items will be clustered in "number_of_clusters" different
@@ -69,15 +84,6 @@ module Ai4r
69
84
  end
70
85
 
71
86
  protected
72
- def euclidean_distance(a, b)
73
- dist = 0.0
74
- a.each_index do |index|
75
- if a[index].is_a?(Numeric) && b[index].is_a?(Numeric)
76
- dist = dist + ((a[index]-b[index])*(a[index]-b[index]))
77
- end
78
- end
79
- return dist
80
- end
81
87
 
82
88
  def calc_initial_centroids
83
89
  @centroids = []
@@ -111,21 +117,8 @@ module Ai4r
111
117
 
112
118
  def recompute_centroids
113
119
  @old_centroids = @centroids
114
- @centroids = @clusters.collect { |cluster| cluster.get_mean_or_mode }
115
120
  @iterations += 1
116
- end
117
-
118
- def get_min_index(array)
119
- min = array.first
120
- index = 0
121
- array.each_index do |i|
122
- x = array[i]
123
- if x < min
124
- min = x
125
- index = i
126
- end
127
- end
128
- return index
121
+ @centroids = @centroid_function.call(@clusters)
129
122
  end
130
123
 
131
124
  end
@@ -0,0 +1,179 @@
1
+ # Author:: Sergio Fierens (implementation)
2
+ # License:: MPL 1.1
3
+ # Project:: ai4r
4
+ # Url:: http://ai4r.rubyforge.org/
5
+ #
6
+ # You can redistribute it and/or modify it under the terms of
7
+ # the Mozilla Public License version 1.1 as published by the
8
+ # Mozilla Foundation at http://www.mozilla.org/MPL/MPL-1.1.txt
9
+
10
+ require File.dirname(__FILE__) + '/../data/data_set'
11
+ require File.dirname(__FILE__) + '/../clusterers/clusterer'
12
+
13
module Ai4r
  module Clusterers

    # Hierarchical clusterer with single linkage.
    #
    # Every data item starts in its own cluster; the two closest clusters
    # are then merged repeatedly until the requested number of clusters is
    # reached. With single linkage, the distance between two clusters is the
    # distance between the two closest elements of the two clusters.
    class SingleLinkage < Clusterer

      attr_reader :data_set, :number_of_clusters, :clusters

      parameters_info :distance_function =>
        "Custom implementation of distance function. " +
        "It must be a closure receiving two data items and return the " +
        "distance bewteen them. By default, this algorithm uses " +
        "ecuclidean distance of numeric attributes to the power of 2."

      def initialize
        @distance_function = nil
      end

      # Build a new clusterer, using the data examples found in data_set.
      # Items are grouped into "number_of_clusters" different clusters.
      #
      # Merging stops as soon as a single cluster is left, so asking for
      # fewer than one cluster yields one cluster instead of failing, and
      # an empty data set yields no clusters.
      #
      # Returns self.
      def build(data_set, number_of_clusters)
        @data_set = data_set
        @number_of_clusters = number_of_clusters

        index_clusters = create_initial_index_clusters
        create_distance_matrix(data_set)
        # A lone cluster has no partner to merge with; without the second
        # condition get_closest_clusters would hand back its [1, 0]
        # placeholder and merge_clusters would fail on a missing cluster.
        while index_clusters.length > @number_of_clusters &&
              index_clusters.length > 1
          clusters_to_merge = get_closest_clusters(index_clusters)
          index_clusters = merge_clusters(clusters_to_merge, index_clusters)
        end
        @clusters = build_clusters_from_index_clusters(index_clusters)

        self
      end

      # Classifies the given data item, returning the cluster index it
      # belongs to (0-based).
      # NOTE(review): get_min_index is not defined in this class;
      # presumably it is inherited from Clusterer - confirm.
      def eval(data_item)
        distances = @clusters.collect do |cluster|
          distance_between_item_and_cluster(data_item, cluster)
        end
        get_min_index(distances)
      end

      # Calculates the distance between 2 different instances. By default,
      # it returns the euclidean distance to the power of 2.
      # You can provide a more convenient distance implementation by:
      #
      # 1- Overwriting this method
      #
      # 2- Providing a closure to the :distance_function parameter
      #
      # NOTE(review): euclidean_distance is not defined in this class;
      # presumably it is inherited from Clusterer - confirm.
      def distance(a, b)
        return @distance_function.call(a, b) if @distance_function
        euclidean_distance(a, b)
      end

      protected

      # Returns [ [0], [1], [2], ... , [n-1] ]
      # where n is the number of data items in the data set.
      def create_initial_index_clusters
        Array.new(@data_set.data_items.length) { |i| [i] }
      end

      # Creates a partial (lower triangular) distance matrix:
      #   [
      #     [d(1,0)],
      #     [d(2,0), d(2,1)],
      #     [d(3,0), d(3,1), d(3,2)],
      #     ...
      #     [d(n-1,0), d(n-1,1), d(n-1,2), ... , d(n-1,n-2)]
      #   ]
      # where n is the number of data items in the data set.
      def create_distance_matrix(data_set)
        items = data_set.data_items
        # Item 0 has no row of its own. Clamp the row count at zero because
        # Array.new raises ArgumentError for the -1 an empty data set
        # would otherwise produce.
        row_count = [items.length - 1, 0].max
        @distance_matrix = Array.new(row_count) { |index| Array.new(index + 1) }
        items.each_with_index do |a, i|
          i.times do |j|
            @distance_matrix[i - 1][j] = distance(a, items[j])
          end
        end
      end

      # Returns the distance between data_items[index_a] and
      # data_items[index_b], read from the distance matrix.
      def read_distance_matrix(index_a, index_b)
        return 0 if index_a == index_b
        # Only pairs with the larger index first are stored.
        index_a, index_b = index_b, index_a if index_b > index_a
        @distance_matrix[index_a - 1][index_b]
      end

      # clusters_to_merge = [index_cluster_a, index_cluster_b].
      # cluster_a and cluster_b are removed from index_clusters, and a new
      # cluster with all members of cluster_a and cluster_b is appended.
      # index_clusters is modified in place and returned.
      def merge_clusters(clusters_to_merge, index_clusters)
        index_a = clusters_to_merge.first
        index_b = clusters_to_merge.last
        # Delete the higher position first so the lower one stays valid.
        index_a, index_b = index_b, index_a if index_b > index_a
        new_index_cluster = index_clusters[index_a] + index_clusters[index_b]
        index_clusters.delete_at(index_a)
        index_clusters.delete_at(index_b)
        index_clusters << new_index_cluster
        index_clusters
      end

      # Given an array with clusters of data item indexes, returns an array
      # of DataSet clusters holding the corresponding data items.
      # The distance matrix is dropped here, as it is no longer read once
      # the clusters are built.
      def build_clusters_from_index_clusters(index_clusters)
        @distance_matrix = nil
        index_clusters.collect do |index_cluster|
          Ai4r::Data::DataSet.new(
            :data_labels => @data_set.data_labels,
            :data_items => index_cluster.collect { |i| @data_set.data_items[i] })
        end
      end

      # Returns an array with the positions of the two closest clusters
      # => [index_cluster_a, index_cluster_b]
      # When no pair beats an infinite distance (for instance with fewer
      # than two clusters) the initial value [1, 0] is returned.
      def get_closest_clusters(index_clusters)
        min_distance = 1.0/0
        closest_clusters = [1, 0]
        index_clusters.each_with_index do |cluster_a, index_a|
          index_a.times do |index_b|
            cluster_b = index_clusters[index_b]
            cluster_distance = calc_index_clusters_distance(cluster_a, cluster_b)
            if cluster_distance < min_distance
              closest_clusters = [index_a, index_b]
              min_distance = cluster_distance
            end
          end
        end
        closest_clusters
      end

      # Calculates the cluster distance using the single linkage method:
      # the smallest matrix distance between a member of cluster_a and a
      # member of cluster_b (both given as arrays of data item indexes).
      def calc_index_clusters_distance(cluster_a, cluster_b)
        min_dist = 1.0/0
        cluster_a.each do |index_a|
          cluster_b.each do |index_b|
            dist = read_distance_matrix(index_a, index_b)
            min_dist = dist if dist < min_dist
          end
        end
        min_dist
      end

      # Smallest distance between data_item and any item of the given
      # cluster; infinity when the cluster has no items.
      def distance_between_item_and_cluster(data_item, cluster)
        min_dist = 1.0/0
        cluster.data_items.each do |another_item|
          dist = distance(data_item, another_item)
          min_dist = dist if dist < min_dist
        end
        min_dist
      end

    end
  end
end
@@ -9,11 +9,19 @@
9
9
 
10
10
  require 'csv'
11
11
  require 'set'
12
+ require File.dirname(__FILE__) + '/statistics'
12
13
 
13
14
  module Ai4r
14
15
  module Data
16
+
17
+ # A data set is a collection of N data items. Each data item is
18
+ # described by a set of attributes, represented as an array.
19
+ # Optionally, you can assign a label to the attributes, using
20
+ # the data_labels property.
15
21
  class DataSet
16
22
 
23
+ @@number_regex = /(((\b[0-9]+)?\.)?\b[0-9]+([eE][-+]?[0-9]+)?\b)/
24
+
17
25
  attr_reader :data_labels, :data_items
18
26
 
19
27
  # Create a new DataSet. By default, empty.
@@ -24,7 +32,7 @@ module Ai4r
24
32
  # If you provide data items, but no data labels, the data set will
25
33
  # use the default data label values (see set_data_labels)
26
34
  def initialize(options = {})
27
- @data_labels = options[:data_labels] || []
35
+ @data_labels = []
28
36
  @data_items = options[:data_items] || []
29
37
  set_data_labels(options[:data_labels]) if options[:data_labels]
30
38
  set_data_items(options[:data_items]) if options[:data_items]
@@ -38,7 +46,7 @@ module Ai4r
38
46
  end
39
47
 
40
48
  # Load data items from csv file
41
- def load_data_from_csv(filepath)
49
+ def load_csv(filepath)
42
50
  items = []
43
51
  CSV::Reader.parse(File.open(filepath, 'r')) do |row|
44
52
  items << row
@@ -47,12 +55,21 @@ module Ai4r
47
55
  end
48
56
 
49
57
  # Load data items from csv file. The first row is used as data labels.
50
- def load_data_and_labels_from_csv(filepath)
51
- load_data_from_csv(filepath)
58
def load_csv_with_labels(filepath)
  load_csv(filepath)
  # The first loaded row is taken out of the items and kept as the labels.
  header_row = @data_items.shift
  @data_labels = header_row
  self
end
55
63
 
64
+ # Same as load_csv, but it will try to convert cell contents as numbers.
65
def parse_csv(filepath)
  # Convert a cell only when the WHOLE cell is a number (optionally signed
  # and surrounded by whitespace). An unanchored match would also hit
  # cells that merely contain digits, such as "2009-01-01" or
  # "10 Downing St", and replace them with their to_f value.
  numeric_cell = /\A\s*[-+]?#{@@number_regex}\s*\z/
  items = []
  # Block form so the file handle is closed once parsing finishes,
  # even if the parser raises.
  File.open(filepath, 'r') do |file|
    CSV::Reader.parse(file) do |row|
      items << row.collect { |x| x.match(numeric_cell) ? x.to_f : x.data }
    end
  end
  set_data_items(items)
end
72
+
56
73
  # Set data labels.
57
74
  # Data labels must have the following format:
58
75
  # [ 'city', 'age_range', 'gender', 'marketing_target' ]
@@ -144,7 +161,7 @@ module Ai4r
144
161
  # get_index("gender")
145
162
  # => 2
146
163
  def get_index(attr)
147
- return (attr.is_a?(String)) ? @data_labels.index(attr) : attr
164
+ return (attr.is_a?(Fixnum) || attr.is_a?(Range)) ? attr : @data_labels.index(attr)
148
165
  end
149
166
 
150
167
  # Raise an exception if there is no data item.
@@ -168,44 +185,19 @@ module Ai4r
168
185
  @data_items << data_item
169
186
  end
170
187
  end
171
-
172
- def get_attribute_mean(attribute)
173
- index = get_index(attribute)
174
- mean = 0.0
175
- @data_items.each { |data_item| mean += data_item[index] }
176
- mean /= @data_items.length
177
- return mean
178
- end
179
-
180
- def get_attribute_mode(attribute)
181
- index = get_index(attribute)
182
- domain = build_domain(attribute)
183
- count = {}
184
- domain.each {|value| count[value]=0}
185
- @data_items.each { |data_item| count[data_item[index]] += 1 }
186
- max_count = 0
187
- mode = nil
188
- count.each_pair do |value, value_count|
189
- if value_count > max_count
190
- mode = value
191
- max_count = value_count
192
- end
193
- end
194
- return mode
195
- end
196
-
197
- def get_attribute_mean_or_mode(attribute)
198
- index = get_index(attribute)
199
- if @data_items.first[index].is_a?(Numeric)
200
- return get_attribute_mean(attribute)
201
- else
202
- return get_attribute_mode(attribute)
203
- end
204
- end
205
-
188
+
189
+ # Returns an array with the mean value of numeric attributes, and
190
+ # the most frequent value of non numeric attributes
206
191
  def get_mean_or_mode
207
192
  mean = []
208
- num_attributes.times {|i| mean[i] = get_attribute_mean_or_mode(i) }
193
+ num_attributes.times do |i|
194
+ mean[i] =
195
+ if @data_items.first[i].is_a?(Numeric)
196
+ Statistics.mean(self, i)
197
+ else
198
+ Statistics.mode(self, i)
199
+ end
200
+ end
209
201
  return mean
210
202
  end
211
203
 
@@ -0,0 +1,82 @@
1
+ # Author:: Sergio Fierens
2
+ # License:: MPL 1.1
3
+ # Project:: ai4r
4
+ # Url:: http://ai4r.rubyforge.org/
5
+ #
6
+ # You can redistribute it and/or modify it under the terms of
7
+ # the Mozilla Public License version 1.1 as published by the
8
+ # Mozilla Foundation at http://www.mozilla.org/MPL/MPL-1.1.txt
9
+
10
module Ai4r
  module Data

    # Classical distance functions between two attribute vectors.
    # Each function walks the elements of a and reads the element of b at
    # the same position.
    module Proximity

      # Sum of the squared differences of the paired elements.
      # A computationally cheaper replacement for the euclidean distance
      # (no square root is taken).
      # Parameters a and b are vectors with continuous attributes.
      def self.squared_euclidean_distance(a, b)
        a.each_with_index.inject(0.0) do |sum, (item_a, i)|
          sum + (item_a - b[i])**2
        end
      end

      # Euclidean distance, or L2 norm.
      # Parameters a and b are vectors with continuous attributes.
      # Euclidean distance tends to form hyperspherical
      # clusters (Clustering, Xu and Wunsch, 2009).
      # Translations and rotations do not cause a
      # distortion in distance relation (Duda et al, 2001).
      # If attributes are measured with different units,
      # attributes with larger values and variance will
      # dominate the metric.
      def self.euclidean_distance(a, b)
        squared = squared_euclidean_distance(a, b)
        Math.sqrt(squared)
      end

      # City block, Manhattan distance, or L1 norm: the sum of the absolute
      # differences of the paired elements.
      # Parameters a and b are vectors with continuous attributes.
      def self.manhattan_distance(a, b)
        a.each_with_index.inject(0.0) do |sum, (item_a, i)|
          sum + (item_a - b[i]).abs
        end
      end

      # Sup distance, or L-infinity norm: the largest absolute difference
      # among the paired elements (0.0 when a is empty).
      # Parameters a and b are vectors with continuous attributes.
      def self.sup_distance(a, b)
        a.each_with_index.inject(0.0) do |largest, (item_a, i)|
          diff = (item_a - b[i]).abs
          diff > largest ? diff : largest
        end
      end

      # The Hamming distance between two attribute vectors of equal
      # length is the number of positions at which the corresponding
      # attributes are different.
      # This distance function is frequently used with binary attributes,
      # though it can be used with other discrete attributes.
      def self.hamming_distance(a, b)
        a.each_index.count { |i| a[i] != b[i] }
      end

    end

  end

end
82
+