ai4ruby 1.11
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.rdoc +47 -0
- data/examples/classifiers/id3_data.csv +121 -0
- data/examples/classifiers/id3_example.rb +29 -0
- data/examples/classifiers/naive_bayes_data.csv +11 -0
- data/examples/classifiers/naive_bayes_example.rb +16 -0
- data/examples/classifiers/results.txt +31 -0
- data/examples/genetic_algorithm/genetic_algorithm_example.rb +37 -0
- data/examples/genetic_algorithm/travel_cost.csv +16 -0
- data/examples/neural_network/backpropagation_example.rb +67 -0
- data/examples/neural_network/patterns_with_base_noise.rb +68 -0
- data/examples/neural_network/patterns_with_noise.rb +66 -0
- data/examples/neural_network/training_patterns.rb +68 -0
- data/examples/neural_network/xor_example.rb +35 -0
- data/examples/som/som_data.rb +156 -0
- data/examples/som/som_multi_node_example.rb +22 -0
- data/examples/som/som_single_example.rb +24 -0
- data/lib/ai4r.rb +33 -0
- data/lib/ai4r/classifiers/classifier.rb +62 -0
- data/lib/ai4r/classifiers/hyperpipes.rb +118 -0
- data/lib/ai4r/classifiers/ib1.rb +121 -0
- data/lib/ai4r/classifiers/id3.rb +326 -0
- data/lib/ai4r/classifiers/multilayer_perceptron.rb +135 -0
- data/lib/ai4r/classifiers/naive_bayes.rb +259 -0
- data/lib/ai4r/classifiers/one_r.rb +110 -0
- data/lib/ai4r/classifiers/prism.rb +197 -0
- data/lib/ai4r/classifiers/zero_r.rb +73 -0
- data/lib/ai4r/clusterers/average_linkage.rb +59 -0
- data/lib/ai4r/clusterers/bisecting_k_means.rb +93 -0
- data/lib/ai4r/clusterers/centroid_linkage.rb +66 -0
- data/lib/ai4r/clusterers/clusterer.rb +61 -0
- data/lib/ai4r/clusterers/complete_linkage.rb +67 -0
- data/lib/ai4r/clusterers/diana.rb +139 -0
- data/lib/ai4r/clusterers/k_means.rb +126 -0
- data/lib/ai4r/clusterers/median_linkage.rb +61 -0
- data/lib/ai4r/clusterers/single_linkage.rb +194 -0
- data/lib/ai4r/clusterers/ward_linkage.rb +64 -0
- data/lib/ai4r/clusterers/ward_linkage_hierarchical.rb +31 -0
- data/lib/ai4r/clusterers/weighted_average_linkage.rb +61 -0
- data/lib/ai4r/data/data_set.rb +266 -0
- data/lib/ai4r/data/parameterizable.rb +64 -0
- data/lib/ai4r/data/proximity.rb +100 -0
- data/lib/ai4r/data/statistics.rb +77 -0
- data/lib/ai4r/experiment/classifier_evaluator.rb +95 -0
- data/lib/ai4r/genetic_algorithm/genetic_algorithm.rb +270 -0
- data/lib/ai4r/neural_network/backpropagation.rb +326 -0
- data/lib/ai4r/neural_network/hopfield.rb +149 -0
- data/lib/ai4r/som/layer.rb +68 -0
- data/lib/ai4r/som/node.rb +96 -0
- data/lib/ai4r/som/som.rb +155 -0
- data/lib/ai4r/som/two_phase_layer.rb +90 -0
- data/test/classifiers/hyperpipes_test.rb +84 -0
- data/test/classifiers/ib1_test.rb +78 -0
- data/test/classifiers/id3_test.rb +208 -0
- data/test/classifiers/multilayer_perceptron_test.rb +79 -0
- data/test/classifiers/naive_bayes_test.rb +43 -0
- data/test/classifiers/one_r_test.rb +62 -0
- data/test/classifiers/prism_test.rb +85 -0
- data/test/classifiers/zero_r_test.rb +49 -0
- data/test/clusterers/average_linkage_test.rb +51 -0
- data/test/clusterers/bisecting_k_means_test.rb +66 -0
- data/test/clusterers/centroid_linkage_test.rb +53 -0
- data/test/clusterers/complete_linkage_test.rb +57 -0
- data/test/clusterers/diana_test.rb +69 -0
- data/test/clusterers/k_means_test.rb +100 -0
- data/test/clusterers/median_linkage_test.rb +53 -0
- data/test/clusterers/single_linkage_test.rb +122 -0
- data/test/clusterers/ward_linkage_hierarchical_test.rb +61 -0
- data/test/clusterers/ward_linkage_test.rb +53 -0
- data/test/clusterers/weighted_average_linkage_test.rb +53 -0
- data/test/data/data_set_test.rb +96 -0
- data/test/data/proximity_test.rb +81 -0
- data/test/data/statistics_test.rb +65 -0
- data/test/experiment/classifier_evaluator_test.rb +76 -0
- data/test/genetic_algorithm/chromosome_test.rb +58 -0
- data/test/genetic_algorithm/genetic_algorithm_test.rb +81 -0
- data/test/neural_network/backpropagation_test.rb +82 -0
- data/test/neural_network/hopfield_test.rb +72 -0
- data/test/som/som_test.rb +97 -0
- metadata +168 -0
@@ -0,0 +1,194 @@
+# Author:: Sergio Fierens (implementation)
+# License:: MPL 1.1
+# Project:: ai4r
+# Url:: http://ai4r.rubyforge.org/
+#
+# You can redistribute it and/or modify it under the terms of
+# the Mozilla Public License version 1.1 as published by the
+# Mozilla Foundation at http://www.mozilla.org/MPL/MPL-1.1.txt
+
+require File.dirname(__FILE__) + '/../data/data_set'
+require File.dirname(__FILE__) + '/../data/proximity'
+require File.dirname(__FILE__) + '/../clusterers/clusterer'
+
+module Ai4r
+  module Clusterers
+
+    # Implementation of a Hierarchical clusterer with single linkage (Everitt et
+    # al., 2001 ; Johnson, 1967 ; Jain and Dubes, 1988 ; Sneath, 1957 )
+    # Hierarchical clusteres create one cluster per element, and then
+    # progressively merge clusters, until the required number of clusters
+    # is reached.
+    # With single linkage, the distance between two clusters is computed as the
+    # distance between the two closest elements in the two clusters.
+    #
+    # D(cx, (ci U cj) = min(D(cx, ci), D(cx, cj))
+    class SingleLinkage < Clusterer
+
+      attr_reader :data_set, :number_of_clusters, :clusters
+
+      parameters_info :distance_function =>
+        "Custom implementation of distance function. " +
+        "It must be a closure receiving two data items and return the " +
+        "distance bewteen them. By default, this algorithm uses " +
+        "ecuclidean distance of numeric attributes to the power of 2."
+
+      def initialize
+        @distance_function = lambda do |a,b|
+          Ai4r::Data::Proximity.squared_euclidean_distance(
+            a.select {|att_a| att_a.is_a? Numeric} ,
+            b.select {|att_b| att_b.is_a? Numeric})
+        end
+      end
+
+      # Build a new clusterer, using data examples found in data_set.
+      # Items will be clustered in "number_of_clusters" different
+      # clusters.
+      def build(data_set, number_of_clusters)
+        @data_set = data_set
+        @number_of_clusters = number_of_clusters
+
+        @index_clusters = create_initial_index_clusters
+        create_distance_matrix(data_set)
+        while @index_clusters.length > @number_of_clusters
+          ci, cj = get_closest_clusters(@index_clusters)
+          update_distance_matrix(ci, cj)
+          merge_clusters(ci, cj, @index_clusters)
+        end
+        @clusters = build_clusters_from_index_clusters @index_clusters
+
+        return self
+      end
+
+      # Classifies the given data item, returning the cluster index it belongs
+      # to (0-based).
+      def eval(data_item)
+        get_min_index(@clusters.collect {|cluster|
+          distance_between_item_and_cluster(data_item, cluster)})
+      end
+
+      protected
+
+      # returns [ [0], [1], [2], ... , [n-1] ]
+      # where n is the number of data items in the data set
+      def create_initial_index_clusters
+        index_clusters = []
+        @data_set.data_items.length.times {|i| index_clusters << [i]}
+        return index_clusters
+      end
+
+      # Create a partial distance matrix:
+      #   [
+      #     [d(1,0)],
+      #     [d(2,0)], [d(2,1)],
+      #     [d(3,0)], [d(3,1)], [d(3,2)],
+      #     ...
+      #     [d(n-1,0)], [d(n-1,1)], [d(n-1,2)], ... , [d(n-1,n-2)]
+      #   ]
+      # where n is the number of data items in the data set
+      def create_distance_matrix(data_set)
+        @distance_matrix = Array.new(data_set.data_items.length-1) {|index| Array.new(index+1)}
+        data_set.data_items.each_with_index do |a, i|
+          i.times do |j|
+            b = data_set.data_items[j]
+            @distance_matrix[i-1][j] = @distance_function.call(a, b)
+          end
+        end
+      end
+
+      # Returns the distance between element data_item[index_a] and
+      # data_item[index_b] using the distance matrix
+      def read_distance_matrix(index_a, index_b)
+        return 0 if index_a == index_b
+        index_a, index_b = index_b, index_a if index_b > index_a
+        return @distance_matrix[index_a-1][index_b]
+      end
+
+      # ci and cj are the indexes of the clusters that are going to
+      # be merged. We need to remove distances from/to ci and ci,
+      # and add distances from/to new cluster (ci U cj)
+      def update_distance_matrix(ci, cj)
+        ci, cj = cj, ci if cj > ci
+        distances_to_new_cluster = Array.new
+        (@distance_matrix.length+1).times do |cx|
+          if cx!= ci && cx!=cj
+            distances_to_new_cluster << linkage_distance(cx, ci, cj)
+          end
+        end
+        if cj==0 && ci==1
+          @distance_matrix.delete_at(1)
+          @distance_matrix.delete_at(0)
+        elsif cj==0
+          @distance_matrix.delete_at(ci-1)
+          @distance_matrix.delete_at(0)
+        else
+          @distance_matrix.delete_at(ci-1)
+          @distance_matrix.delete_at(cj-1)
+        end
+        @distance_matrix.each do |d|
+          d.delete_at(ci)
+          d.delete_at(cj)
+        end
+        @distance_matrix << distances_to_new_cluster
+      end
+
+      # return distance between cluster cx and new cluster (ci U cj),
+      # using single linkage
+      def linkage_distance(cx, ci, cj)
+        [read_distance_matrix(cx, ci),
+          read_distance_matrix(cx, cj)].min
+      end
+
+      # cluster_a and cluster_b are removed from index_cluster,
+      # and a new cluster with all members of cluster_a and cluster_b
+      # is added.
+      # It modifies index clusters array.
+      def merge_clusters(index_a, index_b, index_clusters)
+        index_a, index_b = index_b, index_a if index_b > index_a
+        new_index_cluster = index_clusters[index_a] +
+          index_clusters[index_b]
+        index_clusters.delete_at index_a
+        index_clusters.delete_at index_b
+        index_clusters << new_index_cluster
+        return index_clusters
+      end
+
+      # Given an array with clusters of data_items indexes,
+      # it returns an array of data_items clusters
+      def build_clusters_from_index_clusters(index_clusters)
+        @distance_matrix = nil
+        return index_clusters.collect do |index_cluster|
+          Ai4r::Data::DataSet.new(:data_labels => @data_set.data_labels,
+            :data_items => index_cluster.collect {|i| @data_set.data_items[i]})
+        end
+      end
+
+      # Returns ans array with the indexes of the two closest
+      # clusters => [index_cluster_a, index_cluster_b]
+      def get_closest_clusters(index_clusters)
+        min_distance = 1.0/0
+        closest_clusters = [1, 0]
+        index_clusters.each_index do |index_a|
+          index_a.times do |index_b|
+            cluster_distance = read_distance_matrix(index_a, index_b)
+            if cluster_distance < min_distance
+              closest_clusters = [index_a, index_b]
+              min_distance = cluster_distance
+            end
+          end
+        end
+        return closest_clusters
+      end
+
+      def distance_between_item_and_cluster(data_item, cluster)
+        min_dist = 1.0/0
+        cluster.data_items.each do |another_item|
+          dist = @distance_function.call(data_item, another_item)
+          min_dist = dist if dist < min_dist
+        end
+        return min_dist
+      end
+
+    end
+  end
+end
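The comments above describe the single-linkage merge rule and the triangular distance matrix, but not how the class is driven. Below is a minimal usage sketch, assuming the gem's top-level entry point (data/lib/ai4r.rb, listed above) loads the data and clusterer classes; the sample points and cluster count are illustrative only.

# Sketch: cluster six 2-D points into two groups with SingleLinkage.
# Assumes `require 'ai4r'` pulls in Ai4r::Data::DataSet and the clusterers.
require 'ai4r'

points = [[1, 1], [1, 2], [2, 1], [8, 8], [8, 9], [9, 8]]
data_set = Ai4r::Data::DataSet.new(:data_items => points)

clusterer = Ai4r::Clusterers::SingleLinkage.new.build(data_set, 2)

clusterer.clusters.each_with_index do |cluster, i|
  puts "Cluster #{i}: #{cluster.data_items.inspect}"
end

# eval returns the 0-based index of the nearest cluster for a new item.
puts clusterer.eval([1.5, 1.5])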
@@ -0,0 +1,64 @@
+# Author:: Sergio Fierens (implementation)
+# License:: MPL 1.1
+# Project:: ai4r
+# Url:: http://ai4r.rubyforge.org/
+#
+# You can redistribute it and/or modify it under the terms of
+# the Mozilla Public License version 1.1 as published by the
+# Mozilla Foundation at http://www.mozilla.org/MPL/MPL-1.1.txt
+
+require File.dirname(__FILE__) + '/../data/data_set'
+require File.dirname(__FILE__) + '/../clusterers/single_linkage'
+
+module Ai4r
+  module Clusterers
+
+    # Implementation of an Agglomerative Hierarchical clusterer with
+    # Ward's method linkage algorithm, aka the minimum variance method (Everitt
+    # et al., 2001 ; Jain and Dubes, 1988 ; Ward, 1963 ).
+    # Hierarchical clusteres create one cluster per element, and then
+    # progressively merge clusters, until the required number of clusters
+    # is reached.
+    # The objective of this method is to minime the variance.
+    #
+    # D(cx, (ci U cj)) = (ni/(ni+nj+nx))*D(cx, ci) +
+    #                    (nj/(ni+nj+nx))*D(cx, cj) -
+    #                    (nx/(ni+nj)^2)*D(ci, cj)
+    class WardLinkage < SingleLinkage
+
+      parameters_info :distance_function =>
+        "Custom implementation of distance function. " +
+        "It must be a closure receiving two data items and return the " +
+        "distance bewteen them. By default, this algorithm uses " +
+        "ecuclidean distance of numeric attributes to the power of 2."
+
+      # Build a new clusterer, using data examples found in data_set.
+      # Items will be clustered in "number_of_clusters" different
+      # clusters.
+      def build(data_set, number_of_clusters)
+        super
+      end
+
+      # This algorithms does not allow classification of new data items
+      # once it has been built. Rebuild the cluster including you data element.
+      def eval(data_item)
+        Raise "Eval of new data is not supported by this algorithm."
+      end
+
+      protected
+
+      # return distance between cluster cx and cluster (ci U cj),
+      # using ward's method linkage
+      def linkage_distance(cx, ci, cj)
+        ni = @index_clusters[ci].length
+        nj = @index_clusters[cj].length
+        nx = @index_clusters[cx].length
+        ( ( ( 1.0* (ni+nx) * read_distance_matrix(cx, ci) ) +
+          ( 1.0* (nj+nx) * read_distance_matrix(cx, cj) ) ) / (ni + nj + nx) -
+          ( 1.0 * nx * read_distance_matrix(ci, cj) / (ni+nj)**2 ) )
+      end
+
+    end
+  end
+end
+
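WardLinkage plugs its minimum-variance update into the agglomerative loop inherited from SingleLinkage, so construction looks the same; the visible difference for callers is that eval is not supported. A short sketch, with illustrative data:

# Sketch: Ward's method over four points; only the linkage distance differs
# from SingleLinkage, the build/clusters API is inherited unchanged.
require 'ai4r'

data_set = Ai4r::Data::DataSet.new(:data_items => [[0, 0], [0, 1], [10, 10], [10, 11]])
clusterer = Ai4r::Clusterers::WardLinkage.new.build(data_set, 2)
clusterer.clusters.each { |cluster| p cluster.data_items }
# clusterer.eval(...) is intentionally unsupported here; rebuild with the new item instead.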
@@ -0,0 +1,31 @@
+# Author:: Peter Lubell-Doughtie
+# License:: BSD 3 Clause
+# Project:: ai4r
+# Url:: http://peet.ldee.org
+
+require File.dirname(__FILE__) + '/../clusterers/ward_linkage'
+
+module Ai4r
+  module Clusterers
+
+    # Hierarchical version to store classes as merges occur.
+    class WardLinkageHierarchical < WardLinkage
+
+      attr_reader :cluster_tree
+
+      def initialize
+        @cluster_tree = []
+        super
+      end
+
+      protected
+
+      def merge_clusters(index_a, index_b, index_clusters)
+        # store current index_clusters
+        @cluster_tree << index_clusters.dup
+        super
+      end
+    end
+  end
+end
+
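Because merge_clusters snapshots the index clusters before every merge, the full agglomeration history can be replayed after building. A sketch, with illustrative data:

# Sketch: inspect the merge history recorded by WardLinkageHierarchical.
require 'ai4r'

data_set = Ai4r::Data::DataSet.new(:data_items => [[0, 0], [0, 1], [5, 5], [5, 6]])
clusterer = Ai4r::Clusterers::WardLinkageHierarchical.new.build(data_set, 1)

# One snapshot per merge, taken just before the merge happened.
clusterer.cluster_tree.each_with_index do |snapshot, step|
  puts "Before merge #{step + 1}: #{snapshot.inspect}"
end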
@@ -0,0 +1,61 @@
+# Author:: Sergio Fierens (implementation)
+# License:: MPL 1.1
+# Project:: ai4r
+# Url:: http://ai4r.rubyforge.org/
+#
+# You can redistribute it and/or modify it under the terms of
+# the Mozilla Public License version 1.1 as published by the
+# Mozilla Foundation at http://www.mozilla.org/MPL/MPL-1.1.txt
+
+require File.dirname(__FILE__) + '/../data/data_set'
+require File.dirname(__FILE__) + '/../clusterers/single_linkage'
+
+module Ai4r
+  module Clusterers
+
+    # Implementation of an Agglomerative Hierarchical clusterer with
+    # weighted average linkage algorithm, aka weighted pair group method
+    # average or WPGMA (Jain and Dubes, 1988 ; McQuitty, 1966 )
+    # Hierarchical clusteres create one cluster per element, and then
+    # progressively merge clusters, until the required number of clusters
+    # is reached.
+    # Similar to AverageLinkage, but the distances between clusters are
+    # weighted based on the number of data items in each of them.
+    #
+    # D(cx, (ci U cj)) = ( ni * D(cx, ci) + nj * D(cx, cj)) / (ni + nj)
+    class WeightedAverageLinkage < SingleLinkage
+
+      parameters_info :distance_function =>
+        "Custom implementation of distance function. " +
+        "It must be a closure receiving two data items and return the " +
+        "distance bewteen them. By default, this algorithm uses " +
+        "ecuclidean distance of numeric attributes to the power of 2."
+
+      # Build a new clusterer, using data examples found in data_set.
+      # Items will be clustered in "number_of_clusters" different
+      # clusters.
+      def build(data_set, number_of_clusters)
+        super
+      end
+
+      # This algorithms does not allow classification of new data items
+      # once it has been built. Rebuild the cluster including you data element.
+      def eval(data_item)
+        Raise "Eval of new data is not supported by this algorithm."
+      end
+
+      protected
+
+      # return distance between cluster cx and cluster (ci U cj),
+      # using weighted average linkage
+      def linkage_distance(cx, ci, cj)
+        ni = @index_clusters[ci].length
+        nj = @index_clusters[cj].length
+        (1.0 * ni * read_distance_matrix(cx, ci)+
+          nj * read_distance_matrix(cx, cj))/(ni+nj)
+      end
+
+    end
+  end
+end
+
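The update rule above is a weighted average of the two existing distances, so the merged cluster's distance is pulled toward whichever side holds more items. A small worked check mirroring linkage_distance, with illustrative numbers:

# Worked check of D(cx, ci U cj) = (ni*D(cx, ci) + nj*D(cx, cj)) / (ni + nj).
# Suppose ci has ni = 2 items at distance 4.0 from cx, and cj has nj = 1 item at distance 10.0.
ni, nj = 2, 1
d_cx_ci, d_cx_cj = 4.0, 10.0

d_merged = (ni * d_cx_ci + nj * d_cx_cj) / (ni + nj).to_f
puts d_merged  # => 6.0, i.e. biased toward the larger cluster ci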
@@ -0,0 +1,266 @@
+# Author:: Sergio Fierens
+# License:: MPL 1.1
+# Project:: ai4r
+# Url:: http://ai4r.rubyforge.org/
+#
+# You can redistribute it and/or modify it under the terms of
+# the Mozilla Public License version 1.1 as published by the
+# Mozilla Foundation at http://www.mozilla.org/MPL/MPL-1.1.txt
+
+require 'csv'
+require 'set'
+require File.dirname(__FILE__) + '/statistics'
+
+module Ai4r
+  module Data
+
+    # A data set is a collection of N data items. Each data item is
+    # described by a set of attributes, represented as an array.
+    # Optionally, you can assign a label to the attributes, using
+    # the data_labels property.
+    class DataSet
+
+      @@number_regex = /(((\b[0-9]+)?\.)?\b[0-9]+([eE][-+]?[0-9]+)?\b)/
+
+      attr_reader :data_labels, :data_items
+
+      # Create a new DataSet. By default, empty.
+      # Optionaly, you can provide the initial data items and data labels.
+      #
+      # e.g. DataSet.new(:data_items => data_items, :data_labels => labels)
+      #
+      # If you provide data items, but no data labels, the data set will
+      # use the default data label values (see set_data_labels)
+      def initialize(options = {})
+        @data_labels = []
+        @data_items = options[:data_items] || []
+        set_data_labels(options[:data_labels]) if options[:data_labels]
+        set_data_items(options[:data_items]) if options[:data_items]
+      end
+
+      # Retrieve a new DataSet, with the item(s) selected by the provided
+      # index. You can specify an index range, too.
+      def [](index)
+        selected_items = (index.is_a?(Fixnum)) ?
+          [@data_items[index]] : @data_items[index]
+        return DataSet.new(:data_items => selected_items,
+          :data_labels =>@data_labels)
+      end
+
+      # Load data items from csv file
+      def load_csv(filepath)
+        items = []
+        open_csv_file(filepath) do |entry|
+          items << entry
+        end
+        set_data_items(items)
+      end
+
+      # opens a csv-file and reads it line by line
+      # for each line, a block is called and the row is passed to the block
+      # ruby1.8 and 1.9 safe
+      def open_csv_file(filepath, &block)
+        if CSV.const_defined? :Reader
+          CSV::Reader.parse(File.open(filepath, 'r')) do |row|
+            block.call row
+          end
+        else
+          CSV.parse(File.open(filepath, 'r')) do |row|
+            block.call row
+          end
+        end
+      end
+
+      # Load data items from csv file. The first row is used as data labels.
+      def load_csv_with_labels(filepath)
+        load_csv(filepath)
+        @data_labels = @data_items.shift
+        return self
+      end
+
+      # Same as load_csv, but it will try to convert cell contents as numbers.
+      def parse_csv(filepath)
+        items = []
+        open_csv_file(filepath) do |row|
+          items << row.collect{|x| (x.match(@@number_regex)) ? x.to_f : x.data }
+        end
+        set_data_items(items)
+      end
+
+      # Set data labels.
+      # Data labels must have the following format:
+      # [ 'city', 'age_range', 'gender', 'marketing_target' ]
+      #
+      # If you do not provide labels for you data, the following labels will
+      # be created by default:
+      # [ 'attribute_1', 'attribute_2', 'attribute_3', 'class_value' ]
+      def set_data_labels(labels)
+        check_data_labels(labels)
+        @data_labels = labels
+        return self
+      end
+
+      # Set the data items.
+      # M data items with N attributes must have the following
+      # format:
+      #
+      # [ [ATT1_VAL1, ATT2_VAL1, ATT3_VAL1, ... , ATTN_VAL1, CLASS_VAL1],
+      #   [ATT1_VAL2, ATT2_VAL2, ATT3_VAL2, ... , ATTN_VAL2, CLASS_VAL2],
+      #   ...
+      #   [ATTM1_VALM, ATT2_VALM, ATT3_VALM, ... , ATTN_VALM, CLASS_VALM],
+      # ]
+      #
+      # e.g.
+      # [ ['New York', '<30', 'M', 'Y'],
+      #   ['Chicago', '<30', 'M', 'Y'],
+      #   ['Chicago', '<30', 'F', 'Y'],
+      #   ['New York', '<30', 'M', 'Y'],
+      #   ['New York', '<30', 'M', 'Y'],
+      #   ['Chicago', '[30-50)', 'M', 'Y'],
+      #   ['New York', '[30-50)', 'F', 'N'],
+      #   ['Chicago', '[30-50)', 'F', 'Y'],
+      #   ['New York', '[30-50)', 'F', 'N'],
+      #   ['Chicago', '[50-80]', 'M', 'N'],
+      #   ['New York', '[50-80]', 'F', 'N'],
+      #   ['New York', '[50-80]', 'M', 'N'],
+      #   ['Chicago', '[50-80]', 'M', 'N'],
+      #   ['New York', '[50-80]', 'F', 'N'],
+      #   ['Chicago', '>80', 'F', 'Y']
+      # ]
+      #
+      # This method returns the classifier (self), allowing method chaining.
+      def set_data_items(items)
+        check_data_items(items)
+        @data_labels = default_data_labels(items) if @data_labels.empty?
+        @data_items = items
+        return self
+      end
+
+      # Returns an array with the domain of each attribute:
+      # * Set instance containing all possible values for nominal attributes
+      # * Array with min and max values for numeric attributes (i.e. [min, max])
+      #
+      # Return example:
+      # => [#<Set: {"New York", "Chicago"}>,
+      #     #<Set: {"<30", "[30-50)", "[50-80]", ">80"}>,
+      #     #<Set: {"M", "F"}>,
+      #     [5, 85],
+      #     #<Set: {"Y", "N"}>]
+      def build_domains
+        @data_labels.collect {|attr_label| build_domain(attr_label) }
+      end
+
+      # Returns a Set instance containing all possible values for an attribute
+      # The parameter can be an attribute label or index (0 based).
+      # * Set instance containing all possible values for nominal attributes
+      # * Array with min and max values for numeric attributes (i.e. [min, max])
+      #
+      # build_domain("city")
+      # => #<Set: {"New York", "Chicago"}>
+      #
+      # build_domain("age")
+      # => [5, 85]
+      #
+      # build_domain(2) # In this example, the third attribute is gender
+      # => #<Set: {"M", "F"}>
+      def build_domain(attr)
+        index = get_index(attr)
+        if @data_items.first[index].is_a?(Numeric)
+          return [Statistics.min(self, index), Statistics.max(self, index)]
+        else
+          return @data_items.inject(Set.new){|domain, x| domain << x[index]}
+        end
+      end
+
+      # Returns attributes number, including class attribute
+      def num_attributes
+        return (@data_items.empty?) ? 0 : @data_items.first.size
+      end
+
+      # Returns the index of a given attribute (0-based).
+      # For example, if "gender" is the third attribute, then:
+      # get_index("gender")
+      # => 2
+      def get_index(attr)
+        return (attr.is_a?(Fixnum) || attr.is_a?(Range)) ? attr : @data_labels.index(attr)
+      end
+
+      # Raise an exception if there is no data item.
+      def check_not_empty
+        if @data_items.empty?
+          raise ArgumentError, "Examples data set must not be empty."
+        end
+      end
+
+      # Add a data item to the data set
+      def << data_item
+        if data_item.nil? || !data_item.is_a?(Enumerable) || data_item.empty?
+          raise ArgumentError, "Data must not be an non empty array."
+        elsif @data_items.empty?
+          set_data_items([data_item])
+        elsif data_item.length != num_attributes
+          raise ArgumentError, "Number of attributes do not match. " +
+            "#{data_item.length} attributes provided, " +
+            "#{num_attributes} attributes expected."
+        else
+          @data_items << data_item
+        end
+      end
+
+      # Returns an array with the mean value of numeric attributes, and
+      # the most frequent value of non numeric attributes
+      def get_mean_or_mode
+        mean = []
+        num_attributes.times do |i|
+          mean[i] =
+            if @data_items.first[i].is_a?(Numeric)
+              Statistics.mean(self, i)
+            else
+              Statistics.mode(self, i)
+            end
+        end
+        return mean
+      end
+
+      protected
+
+      def check_data_items(data_items)
+        if !data_items || data_items.empty?
+          raise ArgumentError, "Examples data set must not be empty."
+        elsif !data_items.first.is_a?(Enumerable)
+          raise ArgumentError, "Unkown format for example data."
+        end
+        attributes_num = data_items.first.length
+        data_items.each_index do |index|
+          if data_items[index].length != attributes_num
+            raise ArgumentError,
+              "Quantity of attributes is inconsistent. " +
+              "The first item has #{attributes_num} attributes "+
+              "and row #{index} has #{data_items[index].length} attributes"
+          end
+        end
+      end
+
+      def check_data_labels(labels)
+        if !@data_items.empty?
+          if labels.length != @data_items.first.length
+            raise ArgumentError,
+              "Number of labels and attributes do not match. " +
+              "#{labels.length} labels and " +
+              "#{@data_items.first.length} attributes found."
+          end
+        end
+      end
+
+      def default_data_labels(data_items)
+        data_labels = []
+        data_items[0][0..-2].each_index do |i|
+          data_labels[i] = "attribute_#{i+1}"
+        end
+        data_labels[data_labels.length]="class_value"
+        return data_labels
+      end
+
+    end
+  end
+end
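DataSet is the input type shared by the classifiers and clusterers in this gem, so a quick construction sketch rounds out the listing. The labels and rows below come from the doc comments above; the CSV file name is hypothetical.

# Sketch: build a DataSet in memory and query it, following the API above.
require 'ai4r'

data_set = Ai4r::Data::DataSet.new(
  :data_labels => ['city', 'age_range', 'gender', 'marketing_target'],
  :data_items  => [['New York', '<30', 'M', 'Y'],
                   ['Chicago',  '<30', 'F', 'Y'],
                   ['Chicago',  '>80', 'F', 'N']])

p data_set.num_attributes          # => 4 (class attribute included)
p data_set.build_domain('gender')  # => #<Set: {"M", "F"}>
p data_set.get_mean_or_mode        # mode of each attribute (all nominal here)

# Loading from a CSV whose first row holds the labels (file name is hypothetical):
# data_set = Ai4r::Data::DataSet.new.load_csv_with_labels('marketing_data.csv')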