ai4ruby 1.11

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (79) hide show
  1. data/README.rdoc +47 -0
  2. data/examples/classifiers/id3_data.csv +121 -0
  3. data/examples/classifiers/id3_example.rb +29 -0
  4. data/examples/classifiers/naive_bayes_data.csv +11 -0
  5. data/examples/classifiers/naive_bayes_example.rb +16 -0
  6. data/examples/classifiers/results.txt +31 -0
  7. data/examples/genetic_algorithm/genetic_algorithm_example.rb +37 -0
  8. data/examples/genetic_algorithm/travel_cost.csv +16 -0
  9. data/examples/neural_network/backpropagation_example.rb +67 -0
  10. data/examples/neural_network/patterns_with_base_noise.rb +68 -0
  11. data/examples/neural_network/patterns_with_noise.rb +66 -0
  12. data/examples/neural_network/training_patterns.rb +68 -0
  13. data/examples/neural_network/xor_example.rb +35 -0
  14. data/examples/som/som_data.rb +156 -0
  15. data/examples/som/som_multi_node_example.rb +22 -0
  16. data/examples/som/som_single_example.rb +24 -0
  17. data/lib/ai4r.rb +33 -0
  18. data/lib/ai4r/classifiers/classifier.rb +62 -0
  19. data/lib/ai4r/classifiers/hyperpipes.rb +118 -0
  20. data/lib/ai4r/classifiers/ib1.rb +121 -0
  21. data/lib/ai4r/classifiers/id3.rb +326 -0
  22. data/lib/ai4r/classifiers/multilayer_perceptron.rb +135 -0
  23. data/lib/ai4r/classifiers/naive_bayes.rb +259 -0
  24. data/lib/ai4r/classifiers/one_r.rb +110 -0
  25. data/lib/ai4r/classifiers/prism.rb +197 -0
  26. data/lib/ai4r/classifiers/zero_r.rb +73 -0
  27. data/lib/ai4r/clusterers/average_linkage.rb +59 -0
  28. data/lib/ai4r/clusterers/bisecting_k_means.rb +93 -0
  29. data/lib/ai4r/clusterers/centroid_linkage.rb +66 -0
  30. data/lib/ai4r/clusterers/clusterer.rb +61 -0
  31. data/lib/ai4r/clusterers/complete_linkage.rb +67 -0
  32. data/lib/ai4r/clusterers/diana.rb +139 -0
  33. data/lib/ai4r/clusterers/k_means.rb +126 -0
  34. data/lib/ai4r/clusterers/median_linkage.rb +61 -0
  35. data/lib/ai4r/clusterers/single_linkage.rb +194 -0
  36. data/lib/ai4r/clusterers/ward_linkage.rb +64 -0
  37. data/lib/ai4r/clusterers/ward_linkage_hierarchical.rb +31 -0
  38. data/lib/ai4r/clusterers/weighted_average_linkage.rb +61 -0
  39. data/lib/ai4r/data/data_set.rb +266 -0
  40. data/lib/ai4r/data/parameterizable.rb +64 -0
  41. data/lib/ai4r/data/proximity.rb +100 -0
  42. data/lib/ai4r/data/statistics.rb +77 -0
  43. data/lib/ai4r/experiment/classifier_evaluator.rb +95 -0
  44. data/lib/ai4r/genetic_algorithm/genetic_algorithm.rb +270 -0
  45. data/lib/ai4r/neural_network/backpropagation.rb +326 -0
  46. data/lib/ai4r/neural_network/hopfield.rb +149 -0
  47. data/lib/ai4r/som/layer.rb +68 -0
  48. data/lib/ai4r/som/node.rb +96 -0
  49. data/lib/ai4r/som/som.rb +155 -0
  50. data/lib/ai4r/som/two_phase_layer.rb +90 -0
  51. data/test/classifiers/hyperpipes_test.rb +84 -0
  52. data/test/classifiers/ib1_test.rb +78 -0
  53. data/test/classifiers/id3_test.rb +208 -0
  54. data/test/classifiers/multilayer_perceptron_test.rb +79 -0
  55. data/test/classifiers/naive_bayes_test.rb +43 -0
  56. data/test/classifiers/one_r_test.rb +62 -0
  57. data/test/classifiers/prism_test.rb +85 -0
  58. data/test/classifiers/zero_r_test.rb +49 -0
  59. data/test/clusterers/average_linkage_test.rb +51 -0
  60. data/test/clusterers/bisecting_k_means_test.rb +66 -0
  61. data/test/clusterers/centroid_linkage_test.rb +53 -0
  62. data/test/clusterers/complete_linkage_test.rb +57 -0
  63. data/test/clusterers/diana_test.rb +69 -0
  64. data/test/clusterers/k_means_test.rb +100 -0
  65. data/test/clusterers/median_linkage_test.rb +53 -0
  66. data/test/clusterers/single_linkage_test.rb +122 -0
  67. data/test/clusterers/ward_linkage_hierarchical_test.rb +61 -0
  68. data/test/clusterers/ward_linkage_test.rb +53 -0
  69. data/test/clusterers/weighted_average_linkage_test.rb +53 -0
  70. data/test/data/data_set_test.rb +96 -0
  71. data/test/data/proximity_test.rb +81 -0
  72. data/test/data/statistics_test.rb +65 -0
  73. data/test/experiment/classifier_evaluator_test.rb +76 -0
  74. data/test/genetic_algorithm/chromosome_test.rb +58 -0
  75. data/test/genetic_algorithm/genetic_algorithm_test.rb +81 -0
  76. data/test/neural_network/backpropagation_test.rb +82 -0
  77. data/test/neural_network/hopfield_test.rb +72 -0
  78. data/test/som/som_test.rb +97 -0
  79. metadata +168 -0
@@ -0,0 +1,194 @@
1
+ # Author:: Sergio Fierens (implementation)
2
+ # License:: MPL 1.1
3
+ # Project:: ai4r
4
+ # Url:: http://ai4r.rubyforge.org/
5
+ #
6
+ # You can redistribute it and/or modify it under the terms of
7
+ # the Mozilla Public License version 1.1 as published by the
8
+ # Mozilla Foundation at http://www.mozilla.org/MPL/MPL-1.1.txt
9
+
10
+ require File.dirname(__FILE__) + '/../data/data_set'
11
+ require File.dirname(__FILE__) + '/../data/proximity'
12
+ require File.dirname(__FILE__) + '/../clusterers/clusterer'
13
+
14
+ module Ai4r
15
+ module Clusterers
16
+
17
+ # Implementation of a Hierarchical clusterer with single linkage (Everitt et
18
+ # al., 2001 ; Johnson, 1967 ; Jain and Dubes, 1988 ; Sneath, 1957 )
19
+ # Hierarchical clusterers create one cluster per element, and then
20
+ # progressively merge clusters, until the required number of clusters
21
+ # is reached.
22
+ # With single linkage, the distance between two clusters is computed as the
23
+ # distance between the two closest elements in the two clusters.
24
+ #
25
+ # D(cx, (ci U cj)) = min(D(cx, ci), D(cx, cj))
26
class SingleLinkage < Clusterer

  attr_reader :data_set, :number_of_clusters, :clusters

  parameters_info :distance_function =>
    "Custom implementation of distance function. " +
    "It must be a closure receiving two data items and return the " +
    "distance bewteen them. By default, this algorithm uses " +
    "ecuclidean distance of numeric attributes to the power of 2."

  # Installs the default distance function: the squared euclidean distance
  # computed over the Numeric attributes of both items (non numeric
  # attributes are filtered out before the distance is computed).
  def initialize
    numeric_only = lambda { |item| item.select { |value| value.is_a? Numeric } }
    @distance_function = lambda do |a, b|
      Ai4r::Data::Proximity.squared_euclidean_distance(
        numeric_only.call(a), numeric_only.call(b))
    end
  end

  # Builds the clusterer from the examples found in data_set.
  # Every item starts in its own cluster; the two closest clusters are
  # merged repeatedly until only "number_of_clusters" clusters remain.
  # Returns self.
  def build(data_set, number_of_clusters)
    @data_set = data_set
    @number_of_clusters = number_of_clusters

    @index_clusters = create_initial_index_clusters
    create_distance_matrix(data_set)
    while @index_clusters.length > @number_of_clusters
      first, second = get_closest_clusters(@index_clusters)
      # The matrix must be updated before the clusters are merged, because
      # linkage distances are read using the pre-merge cluster indexes.
      update_distance_matrix(first, second)
      merge_clusters(first, second, @index_clusters)
    end
    @clusters = build_clusters_from_index_clusters(@index_clusters)

    self
  end

  # Classifies data_item: returns the (0-based) index of the cluster whose
  # nearest member is closest to it.
  def eval(data_item)
    distances = @clusters.collect do |cluster|
      distance_between_item_and_cluster(data_item, cluster)
    end
    get_min_index(distances)
  end

  protected

  # Returns [ [0], [1], [2], ... , [n-1] ]
  # where n is the number of data items in the data set.
  def create_initial_index_clusters
    Array.new(@data_set.data_items.length) { |i| [i] }
  end

  # Creates a partial (lower triangular) distance matrix:
  #   [
  #     [d(1,0)],
  #     [d(2,0)], [d(2,1)],
  #     [d(3,0)], [d(3,1)], [d(3,2)],
  #     ...
  #     [d(n-1,0)], [d(n-1,1)], [d(n-1,2)], ... , [d(n-1,n-2)]
  #   ]
  # where n is the number of data items in the data set.
  # Item 0 has no row of its own, so item i lives in row i-1.
  def create_distance_matrix(data_set)
    items = data_set.data_items
    @distance_matrix = Array.new(items.length - 1) { |row| Array.new(row + 1) }
    items.each_with_index do |item, i|
      (0...i).each do |j|
        @distance_matrix[i - 1][j] = @distance_function.call(item, items[j])
      end
    end
  end

  # Returns the distance between element index_a and element index_b
  # as stored in the distance matrix (0 when both indexes are equal).
  def read_distance_matrix(index_a, index_b)
    return 0 if index_a == index_b
    # Only the lower triangle is stored: the larger index selects the row.
    row, column = (index_b > index_a) ? [index_b, index_a] : [index_a, index_b]
    @distance_matrix[row - 1][column]
  end

  # ci and cj are the indexes of the clusters that are going to
  # be merged. Distances from/to ci and cj are removed, and the
  # distances from/to the new cluster (ci U cj) are appended as the
  # last row of the matrix.
  def update_distance_matrix(ci, cj)
    high, low = (cj > ci) ? [cj, ci] : [ci, cj]
    merged_row = []
    (0..@distance_matrix.length).each do |cx|
      next if cx == high || cx == low
      merged_row << linkage_distance(cx, high, low)
    end
    # Drop the rows of both merged clusters. Cluster 0 owns no row, so when
    # it is merged the row removed instead is the first remaining one: it
    # belongs to the cluster that becomes the new cluster 0.
    @distance_matrix.delete_at(high - 1)
    @distance_matrix.delete_at(low == 0 ? 0 : low - 1)
    # Drop both columns, higher index first so the lower one stays valid.
    @distance_matrix.each do |row|
      row.delete_at(high)
      row.delete_at(low)
    end
    @distance_matrix << merged_row
  end

  # Returns the distance between cluster cx and the new cluster (ci U cj),
  # using single linkage.
  def linkage_distance(cx, ci, cj)
    to_ci = read_distance_matrix(cx, ci)
    to_cj = read_distance_matrix(cx, cj)
    [to_ci, to_cj].min
  end

  # The clusters at index_a and index_b are removed from index_clusters,
  # and a new cluster with all their members is appended.
  # It modifies (and returns) the index clusters array.
  def merge_clusters(index_a, index_b, index_clusters)
    high, low = (index_b > index_a) ? [index_b, index_a] : [index_a, index_b]
    merged = index_clusters[high] + index_clusters[low]
    # Remove the higher index first so the lower index is not shifted.
    [high, low].each { |position| index_clusters.delete_at(position) }
    index_clusters << merged
  end

  # Given an array with clusters of data item indexes, returns an array
  # of DataSet clusters. The distance matrix is discarded at this point.
  def build_clusters_from_index_clusters(index_clusters)
    @distance_matrix = nil
    index_clusters.collect do |index_cluster|
      members = index_cluster.collect { |i| @data_set.data_items[i] }
      Ai4r::Data::DataSet.new(:data_labels => @data_set.data_labels,
                              :data_items => members)
    end
  end

  # Returns an array with the indexes of the two closest
  # clusters => [index_cluster_a, index_cluster_b]
  def get_closest_clusters(index_clusters)
    best_distance = 1.0/0
    best_pair = [1, 0]
    (1...index_clusters.length).each do |index_a|
      (0...index_a).each do |index_b|
        candidate = read_distance_matrix(index_a, index_b)
        next unless candidate < best_distance
        best_pair = [index_a, index_b]
        best_distance = candidate
      end
    end
    best_pair
  end

  # Returns the smallest distance between data_item and any item of
  # cluster (infinity when the cluster has no items).
  def distance_between_item_and_cluster(data_item, cluster)
    cluster.data_items.inject(1.0/0) do |closest, another_item|
      dist = @distance_function.call(data_item, another_item)
      (dist < closest) ? dist : closest
    end
  end

end
193
+ end
194
+ end
@@ -0,0 +1,64 @@
1
+ # Author:: Sergio Fierens (implementation)
2
+ # License:: MPL 1.1
3
+ # Project:: ai4r
4
+ # Url:: http://ai4r.rubyforge.org/
5
+ #
6
+ # You can redistribute it and/or modify it under the terms of
7
+ # the Mozilla Public License version 1.1 as published by the
8
+ # Mozilla Foundation at http://www.mozilla.org/MPL/MPL-1.1.txt
9
+
10
+ require File.dirname(__FILE__) + '/../data/data_set'
11
+ require File.dirname(__FILE__) + '/../clusterers/single_linkage'
12
+
13
+ module Ai4r
14
+ module Clusterers
15
+
16
+ # Implementation of an Agglomerative Hierarchical clusterer with
17
+ # Ward's method linkage algorithm, aka the minimum variance method (Everitt
18
+ # et al., 2001 ; Jain and Dubes, 1988 ; Ward, 1963 ).
19
+ # Hierarchical clusterers create one cluster per element, and then
20
+ # progressively merge clusters, until the required number of clusters
21
+ # is reached.
22
+ # The objective of this method is to minimize the variance.
23
+ #
24
+ # D(cx, (ci U cj)) = ((ni+nx)/(ni+nj+nx))*D(cx, ci) +
25
+ # ((nj+nx)/(ni+nj+nx))*D(cx, cj) -
26
+ # (nx/(ni+nj)^2)*D(ci, cj)
27
class WardLinkage < SingleLinkage

  parameters_info :distance_function =>
    "Custom implementation of distance function. " +
    "It must be a closure receiving two data items and return the " +
    "distance bewteen them. By default, this algorithm uses " +
    "ecuclidean distance of numeric attributes to the power of 2."

  # Build a new clusterer, using data examples found in data_set.
  # Items will be clustered in "number_of_clusters" different
  # clusters. Delegates entirely to the parent implementation; only the
  # linkage distance differs.
  def build(data_set, number_of_clusters)
    super
  end

  # This algorithm does not allow classification of new data items
  # once it has been built. Rebuild the clusterer including your data
  # element instead.
  #
  # Always raises RuntimeError.
  def eval(data_item)
    # Kernel#raise is lower case; a capitalised "Raise" is an undefined
    # method and would surface as NoMethodError instead of this message.
    raise "Eval of new data is not supported by this algorithm."
  end

  protected

  # Returns the distance between cluster cx and cluster (ci U cj),
  # using Ward's method linkage. With ni, nj and nx being the number of
  # items in clusters ci, cj and cx:
  #
  #   D(cx, ci U cj) = ( (ni+nx)*D(cx, ci) + (nj+nx)*D(cx, cj) ) / (ni+nj+nx)
  #                    - nx*D(ci, cj) / (ni+nj)^2
  #
  # NOTE(review): the textbook Lance-Williams update for Ward's method
  # weights the last term by nx/(ni+nj+nx) rather than nx/(ni+nj)^2.
  # The arithmetic is left as it was; confirm which form is intended.
  def linkage_distance(cx, ci, cj)
    ni = @index_clusters[ci].length
    nj = @index_clusters[cj].length
    nx = @index_clusters[cx].length
    ( ( ( 1.0 * (ni+nx) * read_distance_matrix(cx, ci) ) +
        ( 1.0 * (nj+nx) * read_distance_matrix(cx, cj) ) ) / (ni + nj + nx) -
      ( 1.0 * nx * read_distance_matrix(ci, cj) / (ni+nj)**2 ) )
  end

end
62
+ end
63
+ end
64
+
@@ -0,0 +1,31 @@
1
+ # Author:: Peter Lubell-Doughtie
2
+ # License:: BSD 3 Clause
3
+ # Project:: ai4r
4
+ # Url:: http://peet.ldee.org
5
+
6
+ require File.dirname(__FILE__) + '/../clusterers/ward_linkage'
7
+
8
+ module Ai4r
9
+ module Clusterers
10
+
11
+ # Hierarchical version to store classes as merges occur.
12
class WardLinkageHierarchical < WardLinkage

  # Snapshots of the index clusters taken just before each merge, in merge
  # order. Every entry is an array of clusters, each cluster being an array
  # of data item indexes.
  attr_reader :cluster_tree

  def initialize
    @cluster_tree = []
    super
  end

  # Build a new clusterer, using data examples found in data_set.
  # Items will be clustered in "number_of_clusters" different clusters.
  #
  # The cluster tree is reset first, so building the same instance again
  # does not append the new merge history to the snapshots of a previous
  # run. A fresh array is assigned (rather than clearing the old one) so a
  # tree obtained from an earlier build is left untouched.
  def build(data_set, number_of_clusters)
    @cluster_tree = []
    super
  end

  protected

  # Records the current index clusters and then performs the merge.
  def merge_clusters(index_a, index_b, index_clusters)
    # A shallow copy is stored because the parent implementation mutates
    # the outer index_clusters array when merging.
    @cluster_tree << index_clusters.dup
    super
  end
end
29
+ end
30
+ end
31
+
@@ -0,0 +1,61 @@
1
+ # Author:: Sergio Fierens (implementation)
2
+ # License:: MPL 1.1
3
+ # Project:: ai4r
4
+ # Url:: http://ai4r.rubyforge.org/
5
+ #
6
+ # You can redistribute it and/or modify it under the terms of
7
+ # the Mozilla Public License version 1.1 as published by the
8
+ # Mozilla Foundation at http://www.mozilla.org/MPL/MPL-1.1.txt
9
+
10
+ require File.dirname(__FILE__) + '/../data/data_set'
11
+ require File.dirname(__FILE__) + '/../clusterers/single_linkage'
12
+
13
+ module Ai4r
14
+ module Clusterers
15
+
16
+ # Implementation of an Agglomerative Hierarchical clusterer with
17
+ # weighted average linkage algorithm, aka weighted pair group method
18
+ # average or WPGMA (Jain and Dubes, 1988 ; McQuitty, 1966 )
19
+ # Hierarchical clusterers create one cluster per element, and then
20
+ # progressively merge clusters, until the required number of clusters
21
+ # is reached.
22
+ # Similar to AverageLinkage, but the distances between clusters are
23
+ # weighted based on the number of data items in each of them.
24
+ #
25
+ # D(cx, (ci U cj)) = ( ni * D(cx, ci) + nj * D(cx, cj)) / (ni + nj)
26
class WeightedAverageLinkage < SingleLinkage

  parameters_info :distance_function =>
    "Custom implementation of distance function. " +
    "It must be a closure receiving two data items and return the " +
    "distance bewteen them. By default, this algorithm uses " +
    "ecuclidean distance of numeric attributes to the power of 2."

  # Build a new clusterer, using data examples found in data_set.
  # Items will be clustered in "number_of_clusters" different
  # clusters. Delegates entirely to the parent implementation; only the
  # linkage distance differs.
  def build(data_set, number_of_clusters)
    super
  end

  # This algorithm does not allow classification of new data items
  # once it has been built. Rebuild the clusterer including your data
  # element instead.
  #
  # Always raises RuntimeError.
  def eval(data_item)
    # Kernel#raise is lower case; a capitalised "Raise" is an undefined
    # method and would surface as NoMethodError instead of this message.
    raise "Eval of new data is not supported by this algorithm."
  end

  protected

  # Returns the distance between cluster cx and cluster (ci U cj): the
  # average of D(cx, ci) and D(cx, cj), each weighted by the number of
  # items (ni, nj) in the corresponding cluster:
  #
  #   D(cx, ci U cj) = ( ni * D(cx, ci) + nj * D(cx, cj) ) / (ni + nj)
  #
  # NOTE(review): textbook WPGMA gives both distances an equal weight of
  # 1/2; the size-weighted form used here is the one usually listed as
  # UPGMA. Left unchanged; confirm which is intended.
  def linkage_distance(cx, ci, cj)
    ni = @index_clusters[ci].length
    nj = @index_clusters[cj].length
    # The leading 1.0 forces floating point division.
    (1.0 * ni * read_distance_matrix(cx, ci) +
      nj * read_distance_matrix(cx, cj)) / (ni + nj)
  end

end
59
+ end
60
+ end
61
+
@@ -0,0 +1,266 @@
1
+ # Author:: Sergio Fierens
2
+ # License:: MPL 1.1
3
+ # Project:: ai4r
4
+ # Url:: http://ai4r.rubyforge.org/
5
+ #
6
+ # You can redistribute it and/or modify it under the terms of
7
+ # the Mozilla Public License version 1.1 as published by the
8
+ # Mozilla Foundation at http://www.mozilla.org/MPL/MPL-1.1.txt
9
+
10
+ require 'csv'
11
+ require 'set'
12
+ require File.dirname(__FILE__) + '/statistics'
13
+
14
+ module Ai4r
15
+ module Data
16
+
17
+ # A data set is a collection of N data items. Each data item is
18
+ # described by a set of attributes, represented as an array.
19
+ # Optionally, you can assign a label to the attributes, using
20
+ # the data_labels property.
21
class DataSet

  # Matches a numeric literal anywhere inside a string. parse_csv no longer
  # consults it (an unanchored match turned cells such as "<30" into 0.0);
  # it stays defined in case code outside this class body reads it.
  @@number_regex = /(((\b[0-9]+)?\.)?\b[0-9]+([eE][-+]?[0-9]+)?\b)/

  attr_reader :data_labels, :data_items

  # Create a new DataSet. By default, empty.
  # Optionally, you can provide the initial data items and data labels.
  #
  # e.g. DataSet.new(:data_items => data_items, :data_labels => labels)
  #
  # If you provide data items, but no data labels, the data set will
  # use the default data label values (see set_data_labels).
  #
  # Raises ArgumentError when the items or labels fail validation.
  def initialize(options = {})
    @data_labels = []
    @data_items = options[:data_items] || []
    set_data_labels(options[:data_labels]) if options[:data_labels]
    set_data_items(options[:data_items]) if options[:data_items]
  end

  # Retrieve a new DataSet, with the item(s) selected by the provided
  # index. You can specify an index range, too.
  def [](index)
    # Integer instead of Fixnum: Fixnum is no longer defined on current
    # Ruby versions, while Integer exists on every version.
    selected_items = index.is_a?(Integer) ?
      [@data_items[index]] : @data_items[index]
    DataSet.new(:data_items => selected_items,
                :data_labels => @data_labels)
  end

  # Load data items from csv file. Cells are kept as read (strings, or
  # nil for empty cells). Returns self.
  def load_csv(filepath)
    items = []
    open_csv_file(filepath) do |entry|
      items << entry
    end
    set_data_items(items)
  end

  # Opens a csv file and reads it row by row.
  # For each row, the block is called and the row is passed to it.
  # ruby1.8 and 1.9 safe.
  def open_csv_file(filepath, &block)
    # Block form of File.open, so the handle is closed once parsing ends,
    # even when the caller's block raises.
    File.open(filepath, 'r') do |file|
      if CSV.const_defined? :Reader
        CSV::Reader.parse(file) do |row|
          block.call row
        end
      else
        CSV.parse(file) do |row|
          block.call row
        end
      end
    end
  end

  # Load data items from csv file. The first row is used as data labels.
  # Returns self.
  def load_csv_with_labels(filepath)
    load_csv(filepath)
    @data_labels = @data_items.shift
    self
  end

  # Same as load_csv, but cells whose whole content is a number are
  # converted to Float. Every other cell (text, text that merely contains
  # digits such as "<30", or an empty cell) is kept unchanged.
  # Returns self.
  def parse_csv(filepath)
    items = []
    open_csv_file(filepath) do |row|
      items << row.collect { |cell| numeric_or_original(cell) }
    end
    set_data_items(items)
  end

  # Set data labels.
  # Data labels must have the following format:
  #     [ 'city', 'age_range', 'gender', 'marketing_target'  ]
  #
  # If you do not provide labels for your data, the following labels will
  # be created by default:
  #     [ 'attribute_1', 'attribute_2', 'attribute_3', 'class_value'  ]
  #
  # Raises ArgumentError if data items are already present and the number
  # of labels does not match their number of attributes. Returns self.
  def set_data_labels(labels)
    check_data_labels(labels)
    @data_labels = labels
    self
  end

  # Set the data items.
  # M data items with N attributes must have the following
  # format:
  #
  #     [   [ATT1_VAL1, ATT2_VAL1, ATT3_VAL1, ... , ATTN_VAL1,  CLASS_VAL1],
  #         [ATT1_VAL2, ATT2_VAL2, ATT3_VAL2, ... , ATTN_VAL2,  CLASS_VAL2],
  #         ...
  #         [ATT1_VALM, ATT2_VALM, ATT3_VALM, ... , ATTN_VALM, CLASS_VALM],
  #     ]
  #
  # e.g.
  #     [   ['New York',  '<30',      'M', 'Y'],
  #          ['Chicago',     '<30',      'M', 'Y'],
  #          ['Chicago',     '<30',      'F', 'Y'],
  #          ['New York',  '<30',      'M', 'Y'],
  #          ['New York',  '<30',      'M', 'Y'],
  #          ['Chicago',     '[30-50)',  'M', 'Y'],
  #          ['New York',  '[30-50)',  'F', 'N'],
  #          ['Chicago',     '[30-50)',  'F', 'Y'],
  #          ['New York',  '[30-50)',  'F', 'N'],
  #          ['Chicago',     '[50-80]', 'M', 'N'],
  #          ['New York',  '[50-80]', 'F', 'N'],
  #          ['New York',  '[50-80]', 'M', 'N'],
  #          ['Chicago',     '[50-80]', 'M', 'N'],
  #          ['New York',  '[50-80]', 'F', 'N'],
  #          ['Chicago',     '>80',      'F', 'Y']
  #        ]
  #
  # Raises ArgumentError if the items are empty or inconsistent.
  # This method returns the data set (self), allowing method chaining.
  def set_data_items(items)
    check_data_items(items)
    @data_labels = default_data_labels(items) if @data_labels.empty?
    @data_items = items
    self
  end

  # Returns an array with the domain of each attribute:
  # * Set instance containing all possible values for nominal attributes
  # * Array with min and max values for numeric attributes (i.e. [min, max])
  #
  # Return example:
  # => [#<Set: {"New York", "Chicago"}>,
  #     #<Set: {"<30", "[30-50)", "[50-80]", ">80"}>,
  #     #<Set: {"M", "F"}>,
  #     [5, 85],
  #     #<Set: {"Y", "N"}>]
  def build_domains
    @data_labels.collect { |attr_label| build_domain(attr_label) }
  end

  # Returns the domain of a single attribute.
  # The parameter can be an attribute label or index (0 based).
  # * Set instance containing all possible values for nominal attributes
  # * Array with min and max values for numeric attributes (i.e. [min, max])
  #
  # Whether the attribute is numeric is decided from the first data item.
  #
  #   build_domain("city")
  #   => #<Set: {"New York", "Chicago"}>
  #
  #   build_domain("age")
  #   => [5, 85]
  #
  #   build_domain(2) # In this example, the third attribute is gender
  #   => #<Set: {"M", "F"}>
  def build_domain(attr)
    index = get_index(attr)
    if @data_items.first[index].is_a?(Numeric)
      [Statistics.min(self, index), Statistics.max(self, index)]
    else
      @data_items.inject(Set.new) { |domain, x| domain << x[index] }
    end
  end

  # Returns attributes number, including class attribute
  # (0 when the data set has no items).
  def num_attributes
    @data_items.empty? ? 0 : @data_items.first.size
  end

  # Returns the index of a given attribute (0-based).
  # Integers and ranges are returned unchanged; any other value is looked
  # up in the data labels (nil when no label matches).
  # For example, if "gender" is the third attribute, then:
  #   get_index("gender")
  #   => 2
  def get_index(attr)
    # Integer instead of Fixnum, see #[].
    (attr.is_a?(Integer) || attr.is_a?(Range)) ? attr : @data_labels.index(attr)
  end

  # Raise an exception if there is no data item.
  def check_not_empty
    if @data_items.empty?
      raise ArgumentError, "Examples data set must not be empty."
    end
  end

  # Add a data item to the data set.
  # Raises ArgumentError if the item is nil, not Enumerable, empty, or has
  # a different number of attributes than the items already stored.
  def <<(data_item)
    if data_item.nil? || !data_item.is_a?(Enumerable) || data_item.empty?
      raise ArgumentError, "Data must be a non empty array."
    elsif @data_items.empty?
      # First item: go through set_data_items so default labels get built.
      set_data_items([data_item])
    elsif data_item.length != num_attributes
      raise ArgumentError, "Number of attributes do not match. " +
        "#{data_item.length} attributes provided, " +
        "#{num_attributes} attributes expected."
    else
      @data_items << data_item
    end
  end

  # Returns an array with the mean value of numeric attributes, and
  # the most frequent value of non numeric attributes.
  # Whether an attribute is numeric is decided from the first data item.
  def get_mean_or_mode
    mean = []
    num_attributes.times do |i|
      mean[i] =
        if @data_items.first[i].is_a?(Numeric)
          Statistics.mean(self, i)
        else
          Statistics.mode(self, i)
        end
    end
    mean
  end

  protected

  # Returns cell converted to Float when its whole content is a number,
  # and cell itself otherwise. Float() rejects text that merely contains
  # digits (ArgumentError) and nil (TypeError), so both are kept as read.
  def numeric_or_original(cell)
    Float(cell)
  rescue ArgumentError, TypeError
    cell
  end

  # Raises ArgumentError unless data_items is a non empty collection whose
  # first element is Enumerable and whose elements all have the same length.
  def check_data_items(data_items)
    if !data_items || data_items.empty?
      raise ArgumentError, "Examples data set must not be empty."
    elsif !data_items.first.is_a?(Enumerable)
      raise ArgumentError, "Unkown format for example data."
    end
    attributes_num = data_items.first.length
    data_items.each_index do |index|
      if data_items[index].length != attributes_num
        raise ArgumentError,
          "Quantity of attributes is inconsistent. " +
          "The first item has #{attributes_num} attributes "+
          "and row #{index} has #{data_items[index].length} attributes"
      end
    end
  end

  # Raises ArgumentError if data items are present and the number of
  # labels differs from the number of attributes of the first item.
  def check_data_labels(labels)
    if !@data_items.empty?
      if labels.length != @data_items.first.length
        raise ArgumentError,
          "Number of labels and attributes do not match. " +
          "#{labels.length} labels and " +
          "#{@data_items.first.length} attributes found."
      end
    end
  end

  # Builds the default labels for data_items: "attribute_1" ...
  # "attribute_N" for every attribute but the last one, which is labeled
  # "class_value".
  def default_data_labels(data_items)
    data_labels = []
    data_items[0][0..-2].each_index do |i|
      data_labels[i] = "attribute_#{i+1}"
    end
    data_labels[data_labels.length] = "class_value"
    data_labels
  end

end
265
+ end
266
+ end