ai4ruby 1.11
Sign up to get free protection for your applications and to get access to all the features.
- data/README.rdoc +47 -0
- data/examples/classifiers/id3_data.csv +121 -0
- data/examples/classifiers/id3_example.rb +29 -0
- data/examples/classifiers/naive_bayes_data.csv +11 -0
- data/examples/classifiers/naive_bayes_example.rb +16 -0
- data/examples/classifiers/results.txt +31 -0
- data/examples/genetic_algorithm/genetic_algorithm_example.rb +37 -0
- data/examples/genetic_algorithm/travel_cost.csv +16 -0
- data/examples/neural_network/backpropagation_example.rb +67 -0
- data/examples/neural_network/patterns_with_base_noise.rb +68 -0
- data/examples/neural_network/patterns_with_noise.rb +66 -0
- data/examples/neural_network/training_patterns.rb +68 -0
- data/examples/neural_network/xor_example.rb +35 -0
- data/examples/som/som_data.rb +156 -0
- data/examples/som/som_multi_node_example.rb +22 -0
- data/examples/som/som_single_example.rb +24 -0
- data/lib/ai4r.rb +33 -0
- data/lib/ai4r/classifiers/classifier.rb +62 -0
- data/lib/ai4r/classifiers/hyperpipes.rb +118 -0
- data/lib/ai4r/classifiers/ib1.rb +121 -0
- data/lib/ai4r/classifiers/id3.rb +326 -0
- data/lib/ai4r/classifiers/multilayer_perceptron.rb +135 -0
- data/lib/ai4r/classifiers/naive_bayes.rb +259 -0
- data/lib/ai4r/classifiers/one_r.rb +110 -0
- data/lib/ai4r/classifiers/prism.rb +197 -0
- data/lib/ai4r/classifiers/zero_r.rb +73 -0
- data/lib/ai4r/clusterers/average_linkage.rb +59 -0
- data/lib/ai4r/clusterers/bisecting_k_means.rb +93 -0
- data/lib/ai4r/clusterers/centroid_linkage.rb +66 -0
- data/lib/ai4r/clusterers/clusterer.rb +61 -0
- data/lib/ai4r/clusterers/complete_linkage.rb +67 -0
- data/lib/ai4r/clusterers/diana.rb +139 -0
- data/lib/ai4r/clusterers/k_means.rb +126 -0
- data/lib/ai4r/clusterers/median_linkage.rb +61 -0
- data/lib/ai4r/clusterers/single_linkage.rb +194 -0
- data/lib/ai4r/clusterers/ward_linkage.rb +64 -0
- data/lib/ai4r/clusterers/ward_linkage_hierarchical.rb +31 -0
- data/lib/ai4r/clusterers/weighted_average_linkage.rb +61 -0
- data/lib/ai4r/data/data_set.rb +266 -0
- data/lib/ai4r/data/parameterizable.rb +64 -0
- data/lib/ai4r/data/proximity.rb +100 -0
- data/lib/ai4r/data/statistics.rb +77 -0
- data/lib/ai4r/experiment/classifier_evaluator.rb +95 -0
- data/lib/ai4r/genetic_algorithm/genetic_algorithm.rb +270 -0
- data/lib/ai4r/neural_network/backpropagation.rb +326 -0
- data/lib/ai4r/neural_network/hopfield.rb +149 -0
- data/lib/ai4r/som/layer.rb +68 -0
- data/lib/ai4r/som/node.rb +96 -0
- data/lib/ai4r/som/som.rb +155 -0
- data/lib/ai4r/som/two_phase_layer.rb +90 -0
- data/test/classifiers/hyperpipes_test.rb +84 -0
- data/test/classifiers/ib1_test.rb +78 -0
- data/test/classifiers/id3_test.rb +208 -0
- data/test/classifiers/multilayer_perceptron_test.rb +79 -0
- data/test/classifiers/naive_bayes_test.rb +43 -0
- data/test/classifiers/one_r_test.rb +62 -0
- data/test/classifiers/prism_test.rb +85 -0
- data/test/classifiers/zero_r_test.rb +49 -0
- data/test/clusterers/average_linkage_test.rb +51 -0
- data/test/clusterers/bisecting_k_means_test.rb +66 -0
- data/test/clusterers/centroid_linkage_test.rb +53 -0
- data/test/clusterers/complete_linkage_test.rb +57 -0
- data/test/clusterers/diana_test.rb +69 -0
- data/test/clusterers/k_means_test.rb +100 -0
- data/test/clusterers/median_linkage_test.rb +53 -0
- data/test/clusterers/single_linkage_test.rb +122 -0
- data/test/clusterers/ward_linkage_hierarchical_test.rb +61 -0
- data/test/clusterers/ward_linkage_test.rb +53 -0
- data/test/clusterers/weighted_average_linkage_test.rb +53 -0
- data/test/data/data_set_test.rb +96 -0
- data/test/data/proximity_test.rb +81 -0
- data/test/data/statistics_test.rb +65 -0
- data/test/experiment/classifier_evaluator_test.rb +76 -0
- data/test/genetic_algorithm/chromosome_test.rb +58 -0
- data/test/genetic_algorithm/genetic_algorithm_test.rb +81 -0
- data/test/neural_network/backpropagation_test.rb +82 -0
- data/test/neural_network/hopfield_test.rb +72 -0
- data/test/som/som_test.rb +97 -0
- metadata +168 -0
@@ -0,0 +1,194 @@
|
|
1
|
+
# Author:: Sergio Fierens (implementation)
|
2
|
+
# License:: MPL 1.1
|
3
|
+
# Project:: ai4r
|
4
|
+
# Url:: http://ai4r.rubyforge.org/
|
5
|
+
#
|
6
|
+
# You can redistribute it and/or modify it under the terms of
|
7
|
+
# the Mozilla Public License version 1.1 as published by the
|
8
|
+
# Mozilla Foundation at http://www.mozilla.org/MPL/MPL-1.1.txt
|
9
|
+
|
10
|
+
require File.dirname(__FILE__) + '/../data/data_set'
|
11
|
+
require File.dirname(__FILE__) + '/../data/proximity'
|
12
|
+
require File.dirname(__FILE__) + '/../clusterers/clusterer'
|
13
|
+
|
14
|
+
module Ai4r
|
15
|
+
module Clusterers
|
16
|
+
|
17
|
+
# Implementation of a Hierarchical clusterer with single linkage (Everitt et
|
18
|
+
# al., 2001 ; Johnson, 1967 ; Jain and Dubes, 1988 ; Sneath, 1957 )
|
19
|
+
# Hierarchical clusterers create one cluster per element, and then
|
20
|
+
# progressively merge clusters, until the required number of clusters
|
21
|
+
# is reached.
|
22
|
+
# With single linkage, the distance between two clusters is computed as the
|
23
|
+
# distance between the two closest elements in the two clusters.
|
24
|
+
#
|
25
|
+
# D(cx, (ci U cj)) = min(D(cx, ci), D(cx, cj))
|
26
|
+
class SingleLinkage < Clusterer
|
27
|
+
|
28
|
+
attr_reader :data_set, :number_of_clusters, :clusters
|
29
|
+
|
30
|
+
parameters_info :distance_function =>
|
31
|
+
"Custom implementation of distance function. " +
|
32
|
+
"It must be a closure receiving two data items and return the " +
|
33
|
+
"distance bewteen them. By default, this algorithm uses " +
|
34
|
+
"ecuclidean distance of numeric attributes to the power of 2."
|
35
|
+
|
36
|
+
# Installs the default distance function: squared euclidean distance
# computed only over the numeric attributes of the two data items.
def initialize
  numeric_attributes = lambda { |item| item.select { |value| value.is_a?(Numeric) } }
  @distance_function = lambda do |a, b|
    Ai4r::Data::Proximity.squared_euclidean_distance(
      numeric_attributes.call(a), numeric_attributes.call(b))
  end
end
|
43
|
+
|
44
|
+
# Clusters the items of data_set into number_of_clusters groups.
# Starts with one cluster per data item and keeps merging the two closest
# clusters until only number_of_clusters remain. Returns self.
def build(data_set, number_of_clusters)
  @data_set = data_set
  @number_of_clusters = number_of_clusters

  @index_clusters = create_initial_index_clusters
  create_distance_matrix(data_set)
  until @index_clusters.length <= @number_of_clusters
    first, second = get_closest_clusters(@index_clusters)
    # The matrix must be updated before the merge: linkage distances are
    # computed against the pre-merge cluster indexes.
    update_distance_matrix(first, second)
    merge_clusters(first, second, @index_clusters)
  end
  @clusters = build_clusters_from_index_clusters(@index_clusters)

  self
end
|
62
|
+
|
63
|
+
# Returns the 0-based index of the cluster the given data item is
# closest to.
def eval(data_item)
  distances = @clusters.collect do |cluster|
    distance_between_item_and_cluster(data_item, cluster)
  end
  get_min_index(distances)
end
|
69
|
+
|
70
|
+
protected
|
71
|
+
|
72
|
+
# Builds the starting partition: one single-element cluster per data item,
# i.e. [ [0], [1], [2], ... , [n-1] ] for a data set with n items.
def create_initial_index_clusters
  Array.new(@data_set.data_items.length) { |i| [i] }
end
|
79
|
+
|
80
|
+
# Create a partial (lower triangular) distance matrix:
#   [
#     [d(1,0)],
#     [d(2,0)], [d(2,1)],
#     [d(3,0)], [d(3,1)], [d(3,2)],
#     ...
#     [d(n-1,0)], [d(n-1,1)], [d(n-1,2)], ... , [d(n-1,n-2)]
#   ]
# where n is the number of data items in the data set.
# Row i-1 holds the distances from item i to every item j < i, computed
# with @distance_function. A data set with zero or one item yields an
# empty matrix.
def create_distance_matrix(data_set)
  items = data_set.data_items
  # Clamp the row count: an empty data set would otherwise ask for
  # Array.new(-1) and raise ArgumentError.
  row_count = [items.length - 1, 0].max
  @distance_matrix = Array.new(row_count) { |row| Array.new(row + 1) }
  items.each_with_index do |a, i|
    i.times do |j|
      @distance_matrix[i - 1][j] = @distance_function.call(a, items[j])
    end
  end
end
|
98
|
+
|
99
|
+
# Looks up the distance between data_item[index_a] and data_item[index_b]
# in the triangular distance matrix. Argument order does not matter; the
# distance from an element to itself is 0.
def read_distance_matrix(index_a, index_b)
  return 0 if index_a == index_b
  lower, higher = [index_a, index_b].minmax
  # Row (higher - 1) stores the distances from `higher` to all lower indexes.
  @distance_matrix[higher - 1][lower]
end
|
106
|
+
|
107
|
+
# ci and cj are the indexes of the clusters that are going to
# be merged. We need to remove distances from/to ci and cj,
# and add distances from/to the new cluster (ci U cj), which becomes the
# last row of the matrix.
def update_distance_matrix(ci, cj)
  ci, cj = cj, ci if cj > ci
  cluster_count = @distance_matrix.length + 1
  survivors = (0...cluster_count).reject { |cx| cx == ci || cx == cj }
  # Must be computed before any deletion: linkage_distance reads the
  # matrix using the pre-merge indexes.
  merged_row = survivors.collect { |cx| linkage_distance(cx, ci, cj) }

  # Cluster k lives in row k-1 (cluster 0 has no row). When cluster 0 is
  # removed, the next surviving cluster becomes the new row-less cluster 0,
  # so its row is dropped instead.
  first_row = (cj == 0 && ci == 1) ? 1 : ci - 1
  second_row = cj == 0 ? 0 : cj - 1
  @distance_matrix.delete_at(first_row)
  @distance_matrix.delete_at(second_row)

  # Drop the columns of both merged clusters, higher index first so the
  # lower index is still valid.
  @distance_matrix.each do |row|
    row.delete_at(ci)
    row.delete_at(cj)
  end
  @distance_matrix << merged_row
end
|
134
|
+
|
135
|
+
# Single linkage: the distance between cluster cx and the merged cluster
# (ci U cj) is the smaller of the distances from cx to ci and from cx to cj.
def linkage_distance(cx, ci, cj)
  distance_to_ci = read_distance_matrix(cx, ci)
  distance_to_cj = read_distance_matrix(cx, cj)
  [distance_to_ci, distance_to_cj].min
end
|
141
|
+
|
142
|
+
# Replaces the clusters at index_a and index_b with a single cluster
# holding the members of both, appended at the end of index_clusters.
# The array is modified in place and also returned.
def merge_clusters(index_a, index_b, index_clusters)
  index_a, index_b = index_b, index_a if index_b > index_a
  merged = index_clusters[index_a] + index_clusters[index_b]
  # Delete the higher index first so the lower one is not shifted.
  [index_a, index_b].each { |position| index_clusters.delete_at(position) }
  index_clusters.push(merged)
end
|
155
|
+
|
156
|
+
# Turns an array of index clusters into an array of DataSet clusters, each
# carrying the labels of the original data set and the data items its
# indexes refer to. The distance matrix is discarded as a side effect.
def build_clusters_from_index_clusters(index_clusters)
  @distance_matrix = nil
  index_clusters.collect do |member_indexes|
    members = member_indexes.collect { |i| @data_set.data_items[i] }
    Ai4r::Data::DataSet.new(:data_labels => @data_set.data_labels,
                            :data_items => members)
  end
end
|
165
|
+
|
166
|
+
# Returns an array with the indexes of the two closest
# clusters => [index_cluster_a, index_cluster_b], with
# index_cluster_a > index_cluster_b. On ties the first pair found wins;
# [1, 0] is returned when no pair has a finite distance.
def get_closest_clusters(index_clusters)
  best_pair = [1, 0]
  best_distance = 1.0 / 0
  index_clusters.each_index do |index_a|
    (0...index_a).each do |index_b|
      candidate = read_distance_matrix(index_a, index_b)
      next unless candidate < best_distance
      best_pair = [index_a, index_b]
      best_distance = candidate
    end
  end
  best_pair
end
|
182
|
+
|
183
|
+
# Distance from data_item to the nearest member of cluster, measured with
# @distance_function. Returns Infinity for a cluster without data items.
def distance_between_item_and_cluster(data_item, cluster)
  cluster.data_items.inject(1.0 / 0) do |closest, member|
    distance = @distance_function.call(data_item, member)
    distance < closest ? distance : closest
  end
end
|
191
|
+
|
192
|
+
end
|
193
|
+
end
|
194
|
+
end
|
@@ -0,0 +1,64 @@
|
|
1
|
+
# Author:: Sergio Fierens (implementation)
|
2
|
+
# License:: MPL 1.1
|
3
|
+
# Project:: ai4r
|
4
|
+
# Url:: http://ai4r.rubyforge.org/
|
5
|
+
#
|
6
|
+
# You can redistribute it and/or modify it under the terms of
|
7
|
+
# the Mozilla Public License version 1.1 as published by the
|
8
|
+
# Mozilla Foundation at http://www.mozilla.org/MPL/MPL-1.1.txt
|
9
|
+
|
10
|
+
require File.dirname(__FILE__) + '/../data/data_set'
|
11
|
+
require File.dirname(__FILE__) + '/../clusterers/single_linkage'
|
12
|
+
|
13
|
+
module Ai4r
|
14
|
+
module Clusterers
|
15
|
+
|
16
|
+
# Implementation of an Agglomerative Hierarchical clusterer with
|
17
|
+
# Ward's method linkage algorithm, aka the minimum variance method (Everitt
|
18
|
+
# et al., 2001 ; Jain and Dubes, 1988 ; Ward, 1963 ).
|
19
|
+
# Hierarchical clusterers create one cluster per element, and then
|
20
|
+
# progressively merge clusters, until the required number of clusters
|
21
|
+
# is reached.
|
22
|
+
# The objective of this method is to minimize the variance.
|
23
|
+
#
|
24
|
+
# D(cx, (ci U cj)) = ((ni+nx)/(ni+nj+nx))*D(cx, ci) +
|
25
|
+
# ((nj+nx)/(ni+nj+nx))*D(cx, cj) -
|
26
|
+
# (nx/(ni+nj)^2)*D(ci, cj)
|
27
|
+
class WardLinkage < SingleLinkage
|
28
|
+
|
29
|
+
parameters_info :distance_function =>
|
30
|
+
"Custom implementation of distance function. " +
|
31
|
+
"It must be a closure receiving two data items and return the " +
|
32
|
+
"distance bewteen them. By default, this algorithm uses " +
|
33
|
+
"ecuclidean distance of numeric attributes to the power of 2."
|
34
|
+
|
35
|
+
# Clusters the items of data_set into number_of_clusters groups.
# Delegates entirely to the parent implementation; only the linkage
# distance differs in this class.
def build(data_set, number_of_clusters)
  super(data_set, number_of_clusters)
end
|
41
|
+
|
42
|
+
# This algorithm does not allow classification of new data items
# once it has been built. Rebuild the clusterer including your data element.
#
# Always raises RuntimeError.
def eval(data_item)
  # `Raise` (capitalized) is not a Ruby method and produced a NoMethodError
  # instead of the intended error message.
  raise "Eval of new data is not supported by this algorithm."
end
|
47
|
+
|
48
|
+
protected
|
49
|
+
|
50
|
+
# Distance between cluster cx and the merged cluster (ci U cj) using the
# Ward's method update implemented here:
#
#   ((ni+nx)*D(cx,ci) + (nj+nx)*D(cx,cj)) / (ni+nj+nx) - nx*D(ci,cj) / (ni+nj)^2
#
# where ni, nj and nx are the number of items in each cluster.
# NOTE(review): the usual Lance-Williams form of Ward's update divides the
# last term by (ni+nj+nx) rather than (ni+nj)^2 -- confirm this is intended.
def linkage_distance(cx, ci, cj)
  ni = @index_clusters[ci].length
  nj = @index_clusters[cj].length
  nx = @index_clusters[cx].length
  weighted_sum = (1.0 * (ni + nx) * read_distance_matrix(cx, ci)) +
                 (1.0 * (nj + nx) * read_distance_matrix(cx, cj))
  correction = 1.0 * nx * read_distance_matrix(ci, cj) / (ni + nj)**2
  weighted_sum / (ni + nj + nx) - correction
end
|
60
|
+
|
61
|
+
end
|
62
|
+
end
|
63
|
+
end
|
64
|
+
|
@@ -0,0 +1,31 @@
|
|
1
|
+
# Author:: Peter Lubell-Doughtie
|
2
|
+
# License:: BSD 3 Clause
|
3
|
+
# Project:: ai4r
|
4
|
+
# Url:: http://peet.ldee.org
|
5
|
+
|
6
|
+
require File.dirname(__FILE__) + '/../clusterers/ward_linkage'
|
7
|
+
|
8
|
+
module Ai4r
|
9
|
+
module Clusterers
|
10
|
+
|
11
|
+
# Hierarchical version to store classes as merges occur.
|
12
|
+
class WardLinkageHierarchical < WardLinkage
|
13
|
+
|
14
|
+
attr_reader :cluster_tree
|
15
|
+
|
16
|
+
# Starts with an empty cluster tree, then runs the parent initializer.
def initialize
  @cluster_tree = Array.new
  super()
end
|
20
|
+
|
21
|
+
protected
|
22
|
+
|
23
|
+
# Records a shallow copy of the current index clusters in @cluster_tree
# before delegating the actual merge to the parent implementation.
def merge_clusters(index_a, index_b, index_clusters)
  snapshot = index_clusters.dup
  @cluster_tree.push(snapshot)
  super(index_a, index_b, index_clusters)
end
|
28
|
+
end
|
29
|
+
end
|
30
|
+
end
|
31
|
+
|
@@ -0,0 +1,61 @@
|
|
1
|
+
# Author:: Sergio Fierens (implementation)
|
2
|
+
# License:: MPL 1.1
|
3
|
+
# Project:: ai4r
|
4
|
+
# Url:: http://ai4r.rubyforge.org/
|
5
|
+
#
|
6
|
+
# You can redistribute it and/or modify it under the terms of
|
7
|
+
# the Mozilla Public License version 1.1 as published by the
|
8
|
+
# Mozilla Foundation at http://www.mozilla.org/MPL/MPL-1.1.txt
|
9
|
+
|
10
|
+
require File.dirname(__FILE__) + '/../data/data_set'
|
11
|
+
require File.dirname(__FILE__) + '/../clusterers/single_linkage'
|
12
|
+
|
13
|
+
module Ai4r
|
14
|
+
module Clusterers
|
15
|
+
|
16
|
+
# Implementation of an Agglomerative Hierarchical clusterer with
|
17
|
+
# weighted average linkage algorithm, aka weighted pair group method
|
18
|
+
# average or WPGMA (Jain and Dubes, 1988 ; McQuitty, 1966 )
|
19
|
+
# Hierarchical clusterers create one cluster per element, and then
|
20
|
+
# progressively merge clusters, until the required number of clusters
|
21
|
+
# is reached.
|
22
|
+
# Similar to AverageLinkage, but the distances between clusters are
|
23
|
+
# weighted based on the number of data items in each of them.
|
24
|
+
#
|
25
|
+
# D(cx, (ci U cj)) = ( ni * D(cx, ci) + nj * D(cx, cj)) / (ni + nj)
|
26
|
+
class WeightedAverageLinkage < SingleLinkage
|
27
|
+
|
28
|
+
parameters_info :distance_function =>
|
29
|
+
"Custom implementation of distance function. " +
|
30
|
+
"It must be a closure receiving two data items and return the " +
|
31
|
+
"distance bewteen them. By default, this algorithm uses " +
|
32
|
+
"ecuclidean distance of numeric attributes to the power of 2."
|
33
|
+
|
34
|
+
# Clusters the items of data_set into number_of_clusters groups.
# Delegates entirely to the parent implementation; only the linkage
# distance differs in this class.
def build(data_set, number_of_clusters)
  super(data_set, number_of_clusters)
end
|
40
|
+
|
41
|
+
# This algorithm does not allow classification of new data items
# once it has been built. Rebuild the clusterer including your data element.
#
# Always raises RuntimeError.
def eval(data_item)
  # `Raise` (capitalized) is not a Ruby method and produced a NoMethodError
  # instead of the intended error message.
  raise "Eval of new data is not supported by this algorithm."
end
|
46
|
+
|
47
|
+
protected
|
48
|
+
|
49
|
+
# Distance between cluster cx and the merged cluster (ci U cj): the
# average of D(cx, ci) and D(cx, cj), each weighted by the number of
# items in ci and cj respectively.
def linkage_distance(cx, ci, cj)
  ni = @index_clusters[ci].length
  nj = @index_clusters[cj].length
  weighted_to_ci = 1.0 * ni * read_distance_matrix(cx, ci)
  weighted_to_cj = nj * read_distance_matrix(cx, cj)
  (weighted_to_ci + weighted_to_cj) / (ni + nj)
end
|
57
|
+
|
58
|
+
end
|
59
|
+
end
|
60
|
+
end
|
61
|
+
|
@@ -0,0 +1,266 @@
|
|
1
|
+
# Author:: Sergio Fierens
|
2
|
+
# License:: MPL 1.1
|
3
|
+
# Project:: ai4r
|
4
|
+
# Url:: http://ai4r.rubyforge.org/
|
5
|
+
#
|
6
|
+
# You can redistribute it and/or modify it under the terms of
|
7
|
+
# the Mozilla Public License version 1.1 as published by the
|
8
|
+
# Mozilla Foundation at http://www.mozilla.org/MPL/MPL-1.1.txt
|
9
|
+
|
10
|
+
require 'csv'
|
11
|
+
require 'set'
|
12
|
+
require File.dirname(__FILE__) + '/statistics'
|
13
|
+
|
14
|
+
module Ai4r
|
15
|
+
module Data
|
16
|
+
|
17
|
+
# A data set is a collection of N data items. Each data item is
|
18
|
+
# described by a set of attributes, represented as an array.
|
19
|
+
# Optionally, you can assign a label to the attributes, using
|
20
|
+
# the data_labels property.
|
21
|
+
class DataSet
|
22
|
+
|
23
|
+
@@number_regex = /(((\b[0-9]+)?\.)?\b[0-9]+([eE][-+]?[0-9]+)?\b)/
|
24
|
+
|
25
|
+
attr_reader :data_labels, :data_items
|
26
|
+
|
27
|
+
# Create a new DataSet. By default, empty.
# Optionally, you can provide the initial data items and data labels.
#
# e.g. DataSet.new(:data_items => data_items, :data_labels => labels)
#
# If you provide data items, but no data labels, the data set will
# use the default data label values (see set_data_labels)
def initialize(options = {})
  items = options[:data_items]
  labels = options[:data_labels]
  @data_labels = []
  # Assigned before the setters run so set_data_labels can validate the
  # label count against the items.
  @data_items = items || []
  set_data_labels(labels) if labels
  set_data_items(items) if items
end
|
40
|
+
|
41
|
+
# Retrieve a new DataSet, with the item(s) selected by the provided
# index. You can specify an index range, too.
# The new data set shares the data labels of this one.
def [](index)
  # Integer instead of Fixnum: Fixnum was removed in Ruby 3.2 and never
  # covered big integers. A single index is wrapped so the result is
  # always an array of items.
  selected_items = index.is_a?(Integer) ? [@data_items[index]] : @data_items[index]
  DataSet.new(:data_items => selected_items,
              :data_labels => @data_labels)
end
|
49
|
+
|
50
|
+
# Reads every row of the csv file at filepath and installs the rows as
# the data items of this data set (cell values are kept as read).
def load_csv(filepath)
  rows = []
  open_csv_file(filepath) { |row| rows.push(row) }
  set_data_items(rows)
end
|
58
|
+
|
59
|
+
# Opens a csv file and reads it row by row.
# For each row, the block is called with the parsed row.
# Works with both the Ruby 1.8 CSV::Reader API and the CSV.parse API of
# later versions. The file handle is closed when parsing finishes or
# raises.
def open_csv_file(filepath, &block)
  # Block form of File.open guarantees the handle is closed; the previous
  # File.open(...) without a block leaked it.
  File.open(filepath, 'r') do |file|
    if CSV.const_defined? :Reader
      CSV::Reader.parse(file) { |row| block.call row }
    else
      CSV.parse(file) { |row| block.call row }
    end
  end
end
|
73
|
+
|
74
|
+
# Loads the csv file at filepath, then removes its first row from the
# data items and uses it as the data labels. Returns self.
def load_csv_with_labels(filepath)
  load_csv(filepath)
  header_row = @data_items.shift
  @data_labels = header_row
  self
end
|
80
|
+
|
81
|
+
# Same as load_csv, but it will try to convert cell contents to numbers:
# cells matching the class-level number regex are converted with to_f,
# every other cell is kept as read. Empty cells (nil) are kept as nil.
def parse_csv(filepath)
  items = []
  open_csv_file(filepath) do |row|
    converted = row.collect do |cell|
      if cell.nil?
        # Empty csv fields are parsed as nil; nil has no #match.
        nil
      elsif cell.match(@@number_regex)
        cell.to_f
      elsif cell.respond_to?(:data)
        # Ruby 1.8 CSV::Cell objects expose their content through #data.
        cell.data
      else
        # Later CSV versions yield plain Strings, which have no #data.
        cell
      end
    end
    items << converted
  end
  set_data_items(items)
end
|
89
|
+
|
90
|
+
# Set data labels.
# Data labels must have the following format:
#     [ 'city', 'age_range', 'gender', 'marketing_target'  ]
#
# If you do not provide labels for your data, the following labels will
# be created by default:
#     [ 'attribute_1', 'attribute_2', 'attribute_3', 'class_value'  ]
#
# The labels are validated with check_data_labels before being stored.
# Returns self, allowing method chaining.
def set_data_labels(labels)
  check_data_labels(labels)
  @data_labels = labels
  self
end
|
102
|
+
|
103
|
+
# Set the data items.
|
104
|
+
# M data items with N attributes must have the following
|
105
|
+
# format:
|
106
|
+
#
|
107
|
+
# [ [ATT1_VAL1, ATT2_VAL1, ATT3_VAL1, ... , ATTN_VAL1, CLASS_VAL1],
|
108
|
+
# [ATT1_VAL2, ATT2_VAL2, ATT3_VAL2, ... , ATTN_VAL2, CLASS_VAL2],
|
109
|
+
# ...
|
110
|
+
# [ATTM1_VALM, ATT2_VALM, ATT3_VALM, ... , ATTN_VALM, CLASS_VALM],
|
111
|
+
# ]
|
112
|
+
#
|
113
|
+
# e.g.
|
114
|
+
# [ ['New York', '<30', 'M', 'Y'],
|
115
|
+
# ['Chicago', '<30', 'M', 'Y'],
|
116
|
+
# ['Chicago', '<30', 'F', 'Y'],
|
117
|
+
# ['New York', '<30', 'M', 'Y'],
|
118
|
+
# ['New York', '<30', 'M', 'Y'],
|
119
|
+
# ['Chicago', '[30-50)', 'M', 'Y'],
|
120
|
+
# ['New York', '[30-50)', 'F', 'N'],
|
121
|
+
# ['Chicago', '[30-50)', 'F', 'Y'],
|
122
|
+
# ['New York', '[30-50)', 'F', 'N'],
|
123
|
+
# ['Chicago', '[50-80]', 'M', 'N'],
|
124
|
+
# ['New York', '[50-80]', 'F', 'N'],
|
125
|
+
# ['New York', '[50-80]', 'M', 'N'],
|
126
|
+
# ['Chicago', '[50-80]', 'M', 'N'],
|
127
|
+
# ['New York', '[50-80]', 'F', 'N'],
|
128
|
+
# ['Chicago', '>80', 'F', 'Y']
|
129
|
+
# ]
|
130
|
+
#
|
131
|
+
# This method returns the data set (self), allowing method chaining.
|
132
|
+
def set_data_items(items)
  check_data_items(items)
  # Default labels are generated only when none have been set yet.
  if @data_labels.empty?
    @data_labels = default_data_labels(items)
  end
  @data_items = items
  self
end
|
138
|
+
|
139
|
+
# Returns an array with the domain of each attribute:
|
140
|
+
# * Set instance containing all possible values for nominal attributes
|
141
|
+
# * Array with min and max values for numeric attributes (i.e. [min, max])
|
142
|
+
#
|
143
|
+
# Return example:
|
144
|
+
# => [#<Set: {"New York", "Chicago"}>,
|
145
|
+
# #<Set: {"<30", "[30-50)", "[50-80]", ">80"}>,
|
146
|
+
# #<Set: {"M", "F"}>,
|
147
|
+
# [5, 85],
|
148
|
+
# #<Set: {"Y", "N"}>]
|
149
|
+
def build_domains
  # One domain per data label, in label order.
  @data_labels.map { |label| build_domain(label) }
end
|
152
|
+
|
153
|
+
# Returns a Set instance containing all possible values for an attribute
|
154
|
+
# The parameter can be an attribute label or index (0 based).
|
155
|
+
# * Set instance containing all possible values for nominal attributes
|
156
|
+
# * Array with min and max values for numeric attributes (i.e. [min, max])
|
157
|
+
#
|
158
|
+
# build_domain("city")
|
159
|
+
# => #<Set: {"New York", "Chicago"}>
|
160
|
+
#
|
161
|
+
# build_domain("age")
|
162
|
+
# => [5, 85]
|
163
|
+
#
|
164
|
+
# build_domain(2) # In this example, the third attribute is gender
|
165
|
+
# => #<Set: {"M", "F"}>
|
166
|
+
def build_domain(attr)
  index = get_index(attr)
  # The first data item decides whether the attribute is treated as
  # numeric ([min, max]) or nominal (Set of observed values).
  unless @data_items.first[index].is_a?(Numeric)
    return Set.new(@data_items.collect { |item| item[index] })
  end
  [Statistics.min(self, index), Statistics.max(self, index)]
end
|
174
|
+
|
175
|
+
# Number of attributes per data item (class attribute included), taken
# from the first data item; 0 when the data set has no items.
def num_attributes
  return 0 if @data_items.empty?
  @data_items.first.size
end
|
179
|
+
|
180
|
+
# Returns the index of a given attribute (0-based).
# For example, if "gender" is the third attribute, then:
#   get_index("gender")
#   => 2
#
# Integer and Range arguments are returned unchanged; any other value is
# looked up in the data labels (nil when no label matches).
def get_index(attr)
  # Integer instead of Fixnum: Fixnum was removed in Ruby 3.2.
  return attr if attr.is_a?(Integer) || attr.is_a?(Range)
  @data_labels.index(attr)
end
|
187
|
+
|
188
|
+
# Guard for operations that need data: raises ArgumentError when the data
# set holds no data items, otherwise does nothing.
def check_not_empty
  raise ArgumentError, "Examples data set must not be empty." if @data_items.empty?
end
|
194
|
+
|
195
|
+
# Add a data item to the data set.
#
# data_item must be a non empty Enumerable with as many attributes as the
# items already stored. The first item added to an empty data set goes
# through set_data_items, so it is validated and default labels are created.
#
# Returns self in every case, so calls can be chained
# (data_set << item_a << item_b) with each item validated.
# Raises ArgumentError for nil, non Enumerable, empty or wrongly sized items.
def <<(data_item)
  if data_item.nil? || !data_item.is_a?(Enumerable) || data_item.empty?
    raise ArgumentError, "Data must be a non empty array."
  elsif @data_items.empty?
    set_data_items([data_item])
  elsif data_item.length != num_attributes
    raise ArgumentError, "Number of attributes do not match. " +
          "#{data_item.length} attributes provided, " +
          "#{num_attributes} attributes expected."
  else
    @data_items << data_item
  end
  # Previously the raw items array leaked out of the last branch, so a
  # chained << appended to it directly and skipped validation.
  self
end
|
209
|
+
|
210
|
+
# Returns one summary value per attribute: the mean for attributes whose
# value in the first data item is Numeric, the most frequent value (mode)
# for all other attributes.
def get_mean_or_mode
  reference_item = @data_items.first
  Array.new(num_attributes) do |i|
    if reference_item[i].is_a?(Numeric)
      Statistics.mean(self, i)
    else
      Statistics.mode(self, i)
    end
  end
end
|
224
|
+
|
225
|
+
protected
|
226
|
+
|
227
|
+
# Validates a candidate list of data items. Raises ArgumentError when the
# list is nil or empty, when its first element is not Enumerable, or when
# any row has a different number of attributes than the first one.
def check_data_items(data_items)
  if !data_items || data_items.empty?
    raise ArgumentError, "Examples data set must not be empty."
  end
  unless data_items.first.is_a?(Enumerable)
    raise ArgumentError, "Unkown format for example data."
  end
  attributes_num = data_items.first.length
  data_items.each_with_index do |item, index|
    next if item.length == attributes_num
    raise ArgumentError,
          "Quantity of attributes is inconsistent. " +
          "The first item has #{attributes_num} attributes " +
          "and row #{index} has #{item.length} attributes"
  end
end
|
243
|
+
|
244
|
+
# Validates labels against the stored data items: raises ArgumentError
# when items exist and the number of labels differs from the number of
# attributes of the first item. Nothing is checked while there are no items.
def check_data_labels(labels)
  return if @data_items.empty?
  provided = labels.length
  expected = @data_items.first.length
  return if provided == expected
  raise ArgumentError,
        "Number of labels and attributes do not match. " +
        "#{provided} labels and " +
        "#{expected} attributes found."
end
|
254
|
+
|
255
|
+
# Generates default labels from the first data item: "attribute_1",
# "attribute_2", ... for every attribute but the last one, which is
# labelled "class_value".
def default_data_labels(data_items)
  attribute_labels = data_items[0][0..-2].each_index.collect do |i|
    "attribute_#{i + 1}"
  end
  attribute_labels << "class_value"
end
|
263
|
+
|
264
|
+
end
|
265
|
+
end
|
266
|
+
end
|