nirvdrum-ai4r 1.9.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (150) hide show
  1. data/.gitignore +1 -0
  2. data/.rakeTasks +7 -0
  3. data/README.rdoc +56 -0
  4. data/Rakefile.rb +42 -0
  5. data/VERSION +1 -0
  6. data/ai4r.gemspec +221 -0
  7. data/change_log +49 -0
  8. data/examples/classifiers/id3_data.csv +121 -0
  9. data/examples/classifiers/id3_example.rb +29 -0
  10. data/examples/classifiers/naive_bayes_data.csv +11 -0
  11. data/examples/classifiers/naive_bayes_example.rb +16 -0
  12. data/examples/classifiers/results.txt +31 -0
  13. data/examples/genetic_algorithm/genetic_algorithm_example.rb +37 -0
  14. data/examples/genetic_algorithm/travel_cost.csv +16 -0
  15. data/examples/neural_network/backpropagation_example.rb +67 -0
  16. data/examples/neural_network/patterns_with_base_noise.rb +68 -0
  17. data/examples/neural_network/patterns_with_noise.rb +66 -0
  18. data/examples/neural_network/training_patterns.rb +68 -0
  19. data/examples/neural_network/xor_example.rb +35 -0
  20. data/examples/som/som_data.rb +156 -0
  21. data/examples/som/som_multi_node_example.rb +22 -0
  22. data/examples/som/som_single_example.rb +24 -0
  23. data/lib/ai4r.rb +32 -0
  24. data/lib/ai4r/classifiers/classifier.rb +59 -0
  25. data/lib/ai4r/classifiers/hyperpipes.rb +118 -0
  26. data/lib/ai4r/classifiers/id3.rb +326 -0
  27. data/lib/ai4r/classifiers/multilayer_perceptron.rb +135 -0
  28. data/lib/ai4r/classifiers/naive_bayes.rb +259 -0
  29. data/lib/ai4r/classifiers/one_r.rb +110 -0
  30. data/lib/ai4r/classifiers/prism.rb +197 -0
  31. data/lib/ai4r/classifiers/zero_r.rb +73 -0
  32. data/lib/ai4r/clusterers/average_linkage.rb +59 -0
  33. data/lib/ai4r/clusterers/bisecting_k_means.rb +93 -0
  34. data/lib/ai4r/clusterers/centroid_linkage.rb +66 -0
  35. data/lib/ai4r/clusterers/clusterer.rb +61 -0
  36. data/lib/ai4r/clusterers/complete_linkage.rb +67 -0
  37. data/lib/ai4r/clusterers/diana.rb +139 -0
  38. data/lib/ai4r/clusterers/k_means.rb +126 -0
  39. data/lib/ai4r/clusterers/median_linkage.rb +61 -0
  40. data/lib/ai4r/clusterers/single_linkage.rb +194 -0
  41. data/lib/ai4r/clusterers/ward_linkage.rb +64 -0
  42. data/lib/ai4r/clusterers/weighted_average_linkage.rb +61 -0
  43. data/lib/ai4r/data/data_set.rb +266 -0
  44. data/lib/ai4r/data/parameterizable.rb +64 -0
  45. data/lib/ai4r/data/proximity.rb +100 -0
  46. data/lib/ai4r/data/statistics.rb +77 -0
  47. data/lib/ai4r/experiment/classifier_evaluator.rb +95 -0
  48. data/lib/ai4r/genetic_algorithm/genetic_algorithm.rb +270 -0
  49. data/lib/ai4r/neural_network/backpropagation.rb +293 -0
  50. data/lib/ai4r/neural_network/hopfield.rb +149 -0
  51. data/lib/ai4r/som/layer.rb +68 -0
  52. data/lib/ai4r/som/node.rb +96 -0
  53. data/lib/ai4r/som/som.rb +155 -0
  54. data/lib/ai4r/som/two_phase_layer.rb +90 -0
  55. data/site/forrest.properties +152 -0
  56. data/site/forrest.properties.dispatcher.properties +25 -0
  57. data/site/forrest.properties.xml +29 -0
  58. data/site/src/documentation/README.txt +7 -0
  59. data/site/src/documentation/classes/CatalogManager.properties +62 -0
  60. data/site/src/documentation/content/locationmap.xml +72 -0
  61. data/site/src/documentation/content/xdocs/downloads.html +9 -0
  62. data/site/src/documentation/content/xdocs/geneticAlgorithms.xml +294 -0
  63. data/site/src/documentation/content/xdocs/index.xml +155 -0
  64. data/site/src/documentation/content/xdocs/machineLearning.xml +131 -0
  65. data/site/src/documentation/content/xdocs/neuralNetworks.xml +270 -0
  66. data/site/src/documentation/content/xdocs/site.xml +54 -0
  67. data/site/src/documentation/content/xdocs/sourceCode.xml +43 -0
  68. data/site/src/documentation/content/xdocs/tabs.xml +35 -0
  69. data/site/src/documentation/resources/images/ai4r-logo.png +0 -0
  70. data/site/src/documentation/resources/images/c.png +0 -0
  71. data/site/src/documentation/resources/images/c_wbn.png +0 -0
  72. data/site/src/documentation/resources/images/c_wn.png +0 -0
  73. data/site/src/documentation/resources/images/ellipse-2.svg +30 -0
  74. data/site/src/documentation/resources/images/ero.gif +0 -0
  75. data/site/src/documentation/resources/images/europe2.png +0 -0
  76. data/site/src/documentation/resources/images/europe3.png +0 -0
  77. data/site/src/documentation/resources/images/fitness.png +0 -0
  78. data/site/src/documentation/resources/images/genetic_algorithms_example.png +0 -0
  79. data/site/src/documentation/resources/images/icon-a.png +0 -0
  80. data/site/src/documentation/resources/images/icon-b.png +0 -0
  81. data/site/src/documentation/resources/images/icon.png +0 -0
  82. data/site/src/documentation/resources/images/jadeferret.png +0 -0
  83. data/site/src/documentation/resources/images/my_email.png +0 -0
  84. data/site/src/documentation/resources/images/neural_network_example.png +0 -0
  85. data/site/src/documentation/resources/images/project-logo.png +0 -0
  86. data/site/src/documentation/resources/images/rubyforge.png +0 -0
  87. data/site/src/documentation/resources/images/s.png +0 -0
  88. data/site/src/documentation/resources/images/s_wbn.png +0 -0
  89. data/site/src/documentation/resources/images/s_wn.png +0 -0
  90. data/site/src/documentation/resources/images/sigmoid.png +0 -0
  91. data/site/src/documentation/resources/images/sub-dir/icon-c.png +0 -0
  92. data/site/src/documentation/resources/images/t.png +0 -0
  93. data/site/src/documentation/resources/images/t_wbn.png +0 -0
  94. data/site/src/documentation/resources/images/t_wn.png +0 -0
  95. data/site/src/documentation/resources/schema/catalog.xcat +29 -0
  96. data/site/src/documentation/resources/schema/hello-v10.dtd +51 -0
  97. data/site/src/documentation/resources/schema/symbols-project-v10.ent +26 -0
  98. data/site/src/documentation/resources/stylesheets/hello2document.xsl +33 -0
  99. data/site/src/documentation/sitemap.xmap +66 -0
  100. data/site/src/documentation/skinconf.xml +418 -0
  101. data/site/src/documentation/translations/langcode.xml +29 -0
  102. data/site/src/documentation/translations/languages_de.xml +24 -0
  103. data/site/src/documentation/translations/languages_en.xml +24 -0
  104. data/site/src/documentation/translations/languages_es.xml +22 -0
  105. data/site/src/documentation/translations/languages_fr.xml +24 -0
  106. data/site/src/documentation/translations/languages_nl.xml +24 -0
  107. data/site/src/documentation/translations/menu.xml +33 -0
  108. data/site/src/documentation/translations/menu_af.xml +33 -0
  109. data/site/src/documentation/translations/menu_de.xml +33 -0
  110. data/site/src/documentation/translations/menu_es.xml +33 -0
  111. data/site/src/documentation/translations/menu_fr.xml +33 -0
  112. data/site/src/documentation/translations/menu_it.xml +33 -0
  113. data/site/src/documentation/translations/menu_nl.xml +33 -0
  114. data/site/src/documentation/translations/menu_no.xml +33 -0
  115. data/site/src/documentation/translations/menu_ru.xml +33 -0
  116. data/site/src/documentation/translations/menu_sk.xml +33 -0
  117. data/site/src/documentation/translations/tabs.xml +22 -0
  118. data/site/src/documentation/translations/tabs_de.xml +22 -0
  119. data/site/src/documentation/translations/tabs_es.xml +22 -0
  120. data/site/src/documentation/translations/tabs_fr.xml +22 -0
  121. data/site/src/documentation/translations/tabs_nl.xml +22 -0
  122. data/test/classifiers/hyperpipes_test.rb +84 -0
  123. data/test/classifiers/id3_test.rb +208 -0
  124. data/test/classifiers/multilayer_perceptron_test.rb +79 -0
  125. data/test/classifiers/naive_bayes_test.rb +43 -0
  126. data/test/classifiers/one_r_test.rb +62 -0
  127. data/test/classifiers/prism_test.rb +85 -0
  128. data/test/classifiers/zero_r_test.rb +50 -0
  129. data/test/clusterers/average_linkage_test.rb +51 -0
  130. data/test/clusterers/bisecting_k_means_test.rb +66 -0
  131. data/test/clusterers/centroid_linkage_test.rb +53 -0
  132. data/test/clusterers/complete_linkage_test.rb +57 -0
  133. data/test/clusterers/diana_test.rb +69 -0
  134. data/test/clusterers/k_means_test.rb +100 -0
  135. data/test/clusterers/median_linkage_test.rb +53 -0
  136. data/test/clusterers/single_linkage_test.rb +122 -0
  137. data/test/clusterers/ward_linkage_test.rb +53 -0
  138. data/test/clusterers/weighted_average_linkage_test.rb +53 -0
  139. data/test/data/data_set.csv +121 -0
  140. data/test/data/data_set_test.rb +96 -0
  141. data/test/data/proximity_test.rb +81 -0
  142. data/test/data/statistics_data_set.csv +5 -0
  143. data/test/data/statistics_test.rb +65 -0
  144. data/test/experiment/classifier_evaluator_test.rb +76 -0
  145. data/test/genetic_algorithm/chromosome_test.rb +58 -0
  146. data/test/genetic_algorithm/genetic_algorithm_test.rb +81 -0
  147. data/test/neural_network/backpropagation_test.rb +69 -0
  148. data/test/neural_network/hopfield_test.rb +72 -0
  149. data/test/som/som_test.rb +97 -0
  150. metadata +238 -0
@@ -0,0 +1,194 @@
1
+ # Author:: Sergio Fierens (implementation)
2
+ # License:: MPL 1.1
3
+ # Project:: ai4r
4
+ # Url:: http://ai4r.rubyforge.org/
5
+ #
6
+ # You can redistribute it and/or modify it under the terms of
7
+ # the Mozilla Public License version 1.1 as published by the
8
+ # Mozilla Foundation at http://www.mozilla.org/MPL/MPL-1.1.txt
9
+
10
+ require File.dirname(__FILE__) + '/../data/data_set'
11
+ require File.dirname(__FILE__) + '/../data/proximity'
12
+ require File.dirname(__FILE__) + '/../clusterers/clusterer'
13
+
14
+ module Ai4r
15
+ module Clusterers
16
+
17
# Implementation of a hierarchical clusterer with single linkage (Everitt et
# al., 2001 ; Johnson, 1967 ; Jain and Dubes, 1988 ; Sneath, 1957 ).
# Hierarchical clusterers create one cluster per element, and then
# progressively merge clusters, until the required number of clusters
# is reached.
# With single linkage, the distance between two clusters is computed as the
# distance between the two closest elements in the two clusters.
#
#   D(cx, (ci U cj)) = min(D(cx, ci), D(cx, cj))
class SingleLinkage < Clusterer

  attr_reader :data_set, :number_of_clusters, :clusters

  parameters_info :distance_function =>
    "Custom implementation of distance function. " +
    "It must be a closure receiving two data items and return the " +
    "distance bewteen them. By default, this algorithm uses " +
    "ecuclidean distance of numeric attributes to the power of 2."

  # Installs the default distance function: the squared euclidean distance
  # computed over the numeric attributes of both data items (non numeric
  # attributes are filtered out before the distance is computed).
  def initialize
    @distance_function = lambda do |a, b|
      Ai4r::Data::Proximity.squared_euclidean_distance(
        a.select { |att_a| att_a.is_a? Numeric },
        b.select { |att_b| att_b.is_a? Numeric })
    end
  end

  # Build a new clusterer, using data examples found in data_set.
  # Items will be clustered in "number_of_clusters" different
  # clusters. If the data set holds fewer items than number_of_clusters,
  # every item ends up in its own cluster.
  #
  # Raises ArgumentError if the data set has no items or if
  # number_of_clusters is lower than 1.
  # Returns self, allowing method chaining.
  def build(data_set, number_of_clusters)
    if data_set.data_items.empty?
      raise ArgumentError, "Data set must not be empty."
    end
    if number_of_clusters < 1
      raise ArgumentError, "Number of clusters must be at least 1."
    end

    @data_set = data_set
    @number_of_clusters = number_of_clusters

    @index_clusters = create_initial_index_clusters
    create_distance_matrix(data_set)
    while @index_clusters.length > @number_of_clusters
      ci, cj = get_closest_clusters(@index_clusters)
      # The distance matrix is updated before the clusters are merged:
      # the linkage distance is computed against the pre-merge clusters.
      update_distance_matrix(ci, cj)
      merge_clusters(ci, cj, @index_clusters)
    end
    @clusters = build_clusters_from_index_clusters @index_clusters

    return self
  end

  # Classifies the given data item, returning the cluster index it belongs
  # to (0-based). The chosen cluster is the one holding the item closest to
  # data_item.
  # NOTE(review): get_min_index is not defined in this class; presumably it
  # is inherited from Clusterer - confirm.
  def eval(data_item)
    get_min_index(@clusters.collect { |cluster|
      distance_between_item_and_cluster(data_item, cluster) })
  end

  protected

  # returns [ [0], [1], [2], ... , [n-1] ]
  # where n is the number of data items in the data set
  def create_initial_index_clusters
    index_clusters = []
    @data_set.data_items.length.times { |i| index_clusters << [i] }
    return index_clusters
  end

  # Create a partial (lower triangular) distance matrix:
  #   [
  #     [d(1,0)],
  #     [d(2,0)], [d(2,1)],
  #     [d(3,0)], [d(3,1)], [d(3,2)],
  #     ...
  #     [d(n-1,0)], [d(n-1,1)], [d(n-1,2)], ... , [d(n-1,n-2)]
  #   ]
  # where n is the number of data items in the data set.
  # Row (i-1) holds the distances from item i to items 0..i-1, so item 0
  # has no row of its own.
  def create_distance_matrix(data_set)
    @distance_matrix = Array.new(data_set.data_items.length - 1) { |index| Array.new(index + 1) }
    data_set.data_items.each_with_index do |a, i|
      i.times do |j|
        b = data_set.data_items[j]
        @distance_matrix[i - 1][j] = @distance_function.call(a, b)
      end
    end
  end

  # Returns the distance between element data_item[index_a] and
  # data_item[index_b] using the distance matrix.
  # The matrix only stores d(a, b) for a > b, so the indexes are swapped
  # when needed, and the distance of an element to itself is 0.
  def read_distance_matrix(index_a, index_b)
    return 0 if index_a == index_b
    index_a, index_b = index_b, index_a if index_b > index_a
    return @distance_matrix[index_a - 1][index_b]
  end

  # ci and cj are the indexes of the clusters that are going to
  # be merged. We need to remove distances from/to ci and cj,
  # and add distances from/to new cluster (ci U cj), which becomes the
  # last cluster.
  def update_distance_matrix(ci, cj)
    ci, cj = cj, ci if cj > ci # from here on, ci > cj

    # Distances to the merged cluster must be computed before any row or
    # column is removed, while the old indexes are still valid.
    distances_to_new_cluster = []
    (@distance_matrix.length + 1).times do |cx|
      next if cx == ci || cx == cj
      distances_to_new_cluster << linkage_distance(cx, ci, cj)
    end

    # Remove the rows. Cluster c owns row (c - 1); cluster 0 owns no row,
    # so when cluster 0 goes away, the row to drop is the one of the
    # cluster that becomes the new cluster 0 (the first remaining row).
    # The higher row is deleted first so the lower index stays valid.
    @distance_matrix.delete_at(ci - 1)
    @distance_matrix.delete_at(cj == 0 ? 0 : cj - 1)

    # Remove the columns (out of range deletions are no-ops on short rows).
    @distance_matrix.each do |d|
      d.delete_at(ci)
      d.delete_at(cj)
    end

    @distance_matrix << distances_to_new_cluster
  end

  # return distance between cluster cx and new cluster (ci U cj),
  # using single linkage
  def linkage_distance(cx, ci, cj)
    [read_distance_matrix(cx, ci),
     read_distance_matrix(cx, cj)].min
  end

  # The clusters at index_a and index_b are removed from index_clusters,
  # and a new cluster with all their members is appended at the end.
  # It modifies the index_clusters array, and returns it.
  def merge_clusters(index_a, index_b, index_clusters)
    index_a, index_b = index_b, index_a if index_b > index_a
    new_index_cluster = index_clusters[index_a] +
      index_clusters[index_b]
    # Delete the higher index first so the lower one is not shifted.
    index_clusters.delete_at index_a
    index_clusters.delete_at index_b
    index_clusters << new_index_cluster
    return index_clusters
  end

  # Given an array with clusters of data_items indexes,
  # it returns an array of data_items clusters (one DataSet per cluster).
  # The distance matrix is discarded here, as it is no longer needed.
  def build_clusters_from_index_clusters(index_clusters)
    @distance_matrix = nil
    return index_clusters.collect do |index_cluster|
      Ai4r::Data::DataSet.new(:data_labels => @data_set.data_labels,
        :data_items => index_cluster.collect { |i| @data_set.data_items[i] })
    end
  end

  # Returns an array with the indexes of the two closest
  # clusters => [index_cluster_a, index_cluster_b]
  # where index_cluster_a > index_cluster_b.
  def get_closest_clusters(index_clusters)
    min_distance = 1.0/0
    closest_clusters = [1, 0]
    index_clusters.each_index do |index_a|
      index_a.times do |index_b|
        cluster_distance = read_distance_matrix(index_a, index_b)
        if cluster_distance < min_distance
          closest_clusters = [index_a, index_b]
          min_distance = cluster_distance
        end
      end
    end
    return closest_clusters
  end

  # Returns the distance between data_item and the closest item of the
  # given cluster (infinity if the cluster has no items).
  def distance_between_item_and_cluster(data_item, cluster)
    min_dist = 1.0/0
    cluster.data_items.each do |another_item|
      dist = @distance_function.call(data_item, another_item)
      min_dist = dist if dist < min_dist
    end
    return min_dist
  end

end
193
+ end
194
+ end
@@ -0,0 +1,64 @@
1
+ # Author:: Sergio Fierens (implementation)
2
+ # License:: MPL 1.1
3
+ # Project:: ai4r
4
+ # Url:: http://ai4r.rubyforge.org/
5
+ #
6
+ # You can redistribute it and/or modify it under the terms of
7
+ # the Mozilla Public License version 1.1 as published by the
8
+ # Mozilla Foundation at http://www.mozilla.org/MPL/MPL-1.1.txt
9
+
10
+ require File.dirname(__FILE__) + '/../data/data_set'
11
+ require File.dirname(__FILE__) + '/../clusterers/single_linkage'
12
+
13
+ module Ai4r
14
+ module Clusterers
15
+
16
# Implementation of an Agglomerative Hierarchical clusterer with
# Ward's method linkage algorithm, aka the minimum variance method (Everitt
# et al., 2001 ; Jain and Dubes, 1988 ; Ward, 1963 ).
# Hierarchical clusterers create one cluster per element, and then
# progressively merge clusters, until the required number of clusters
# is reached.
# The objective of this method is to minimize the variance.
#
# The distance update implemented by linkage_distance is:
#
#   D(cx, (ci U cj)) = ((ni+nx)/(ni+nj+nx))*D(cx, ci) +
#                      ((nj+nx)/(ni+nj+nx))*D(cx, cj) -
#                      (nx/(ni+nj)^2)*D(ci, cj)
#
# NOTE(review): the usual Lance-Williams formulation of Ward's method
# divides the last term by (ni+nj+nx) rather than (ni+nj)^2 - confirm
# whether the current denominator is intentional before changing it.
class WardLinkage < SingleLinkage

  parameters_info :distance_function =>
    "Custom implementation of distance function. " +
    "It must be a closure receiving two data items and return the " +
    "distance bewteen them. By default, this algorithm uses " +
    "ecuclidean distance of numeric attributes to the power of 2."

  # Build a new clusterer, using data examples found in data_set.
  # Items will be clustered in "number_of_clusters" different
  # clusters. Delegates to the parent implementation.
  def build(data_set, number_of_clusters)
    super
  end

  # This algorithm does not allow classification of new data items
  # once it has been built. Rebuild the cluster including your data element.
  # Always raises a RuntimeError.
  def eval(data_item)
    raise "Eval of new data is not supported by this algorithm."
  end

  protected

  # return distance between cluster cx and cluster (ci U cj),
  # using ward's method linkage.
  # ni, nj and nx are the number of items in clusters ci, cj and cx.
  def linkage_distance(cx, ci, cj)
    ni = @index_clusters[ci].length
    nj = @index_clusters[cj].length
    nx = @index_clusters[cx].length
    weighted_distances =
      ( ( 1.0 * (ni + nx) * read_distance_matrix(cx, ci) ) +
        ( 1.0 * (nj + nx) * read_distance_matrix(cx, cj) ) ) / (ni + nj + nx)
    merged_correction =
      1.0 * nx * read_distance_matrix(ci, cj) / (ni + nj)**2
    weighted_distances - merged_correction
  end

end
62
+ end
63
+ end
64
+
@@ -0,0 +1,61 @@
1
+ # Author:: Sergio Fierens (implementation)
2
+ # License:: MPL 1.1
3
+ # Project:: ai4r
4
+ # Url:: http://ai4r.rubyforge.org/
5
+ #
6
+ # You can redistribute it and/or modify it under the terms of
7
+ # the Mozilla Public License version 1.1 as published by the
8
+ # Mozilla Foundation at http://www.mozilla.org/MPL/MPL-1.1.txt
9
+
10
+ require File.dirname(__FILE__) + '/../data/data_set'
11
+ require File.dirname(__FILE__) + '/../clusterers/single_linkage'
12
+
13
+ module Ai4r
14
+ module Clusterers
15
+
16
# Implementation of an Agglomerative Hierarchical clusterer with
# weighted average linkage algorithm, aka weighted pair group method
# average or WPGMA (Jain and Dubes, 1988 ; McQuitty, 1966 )
# Hierarchical clusterers create one cluster per element, and then
# progressively merge clusters, until the required number of clusters
# is reached.
# Similar to AverageLinkage, but the distances between clusters are
# weighted based on the number of data items in each of them.
#
#   D(cx, (ci U cj)) = ( ni * D(cx, ci) + nj * D(cx, cj)) / (ni + nj)
class WeightedAverageLinkage < SingleLinkage

  parameters_info :distance_function =>
    "Custom implementation of distance function. " +
    "It must be a closure receiving two data items and return the " +
    "distance bewteen them. By default, this algorithm uses " +
    "ecuclidean distance of numeric attributes to the power of 2."

  # Build a new clusterer, using data examples found in data_set.
  # Items will be clustered in "number_of_clusters" different
  # clusters. Delegates to the parent implementation.
  def build(data_set, number_of_clusters)
    super
  end

  # This algorithm does not allow classification of new data items
  # once it has been built. Rebuild the cluster including your data element.
  # Always raises a RuntimeError.
  def eval(data_item)
    raise "Eval of new data is not supported by this algorithm."
  end

  protected

  # return distance between cluster cx and cluster (ci U cj),
  # using weighted average linkage.
  # ni and nj are the number of items in clusters ci and cj.
  def linkage_distance(cx, ci, cj)
    ni = @index_clusters[ci].length
    nj = @index_clusters[cj].length
    # The leading 1.0 forces floating point division.
    (1.0 * ni * read_distance_matrix(cx, ci) +
      nj * read_distance_matrix(cx, cj)) / (ni + nj)
  end

end
59
+ end
60
+ end
61
+
@@ -0,0 +1,266 @@
1
+ # Author:: Sergio Fierens
2
+ # License:: MPL 1.1
3
+ # Project:: ai4r
4
+ # Url:: http://ai4r.rubyforge.org/
5
+ #
6
+ # You can redistribute it and/or modify it under the terms of
7
+ # the Mozilla Public License version 1.1 as published by the
8
+ # Mozilla Foundation at http://www.mozilla.org/MPL/MPL-1.1.txt
9
+
10
+ require 'csv'
11
+ require 'set'
12
+ require File.dirname(__FILE__) + '/statistics'
13
+
14
+ module Ai4r
15
+ module Data
16
+
17
# A data set is a collection of N data items. Each data item is
# described by a set of attributes, represented as an array.
# Optionally, you can assign a label to the attributes, using
# the data_labels property.
class DataSet

  # Used by parse_csv to detect cells that contain a number.
  # NOTE(review): the pattern is not anchored, so a cell such as "<30"
  # matches too and is converted with to_f (yielding 0.0) - confirm
  # whether only fully numeric cells should be converted.
  @@number_regex = /(((\b[0-9]+)?\.)?\b[0-9]+([eE][-+]?[0-9]+)?\b)/

  attr_reader :data_labels, :data_items

  # Create a new DataSet. By default, empty.
  # Optionally, you can provide the initial data items and data labels.
  #
  # e.g. DataSet.new(:data_items => data_items, :data_labels => labels)
  #
  # If you provide data items, but no data labels, the data set will
  # use the default data label values (see set_data_labels)
  def initialize(options = {})
    @data_labels = []
    # Items are assigned first so that the labels can be checked against them.
    @data_items = options[:data_items] || []
    set_data_labels(options[:data_labels]) if options[:data_labels]
    set_data_items(options[:data_items]) if options[:data_items]
  end

  # Retrieve a new DataSet, with the item(s) selected by the provided
  # index. You can specify an index range, too.
  def [](index)
    # Integer (rather than Fixnum, removed in Ruby 3.2) keeps this working
    # on every Ruby version.
    selected_items = index.is_a?(Integer) ?
      [@data_items[index]] : @data_items[index]
    return DataSet.new(:data_items => selected_items,
      :data_labels => @data_labels)
  end

  # Load data items from csv file.
  # Returns self.
  def load_csv(filepath)
    items = []
    open_csv_file(filepath) do |entry|
      items << entry
    end
    set_data_items(items)
  end

  # Opens a csv-file and reads it line by line.
  # For each line, a block is called and the row is passed to the block.
  # ruby1.8 and 1.9 safe. The file is closed once it has been read, even
  # if the block raises.
  def open_csv_file(filepath, &block)
    File.open(filepath, 'r') do |file|
      if CSV.const_defined? :Reader
        # Ruby 1.8 CSV API
        CSV::Reader.parse(file) { |row| block.call row }
      else
        CSV.parse(file) { |row| block.call row }
      end
    end
  end

  # Load data items from csv file. The first row is used as data labels.
  # Returns self.
  def load_csv_with_labels(filepath)
    load_csv(filepath)
    @data_labels = @data_items.shift
    return self
  end

  # Same as load_csv, but it will try to convert cell contents as numbers.
  # Cells matching the number pattern are converted with to_f; any other
  # cell (including empty cells, read as nil) is kept as it is.
  def parse_csv(filepath)
    items = []
    open_csv_file(filepath) do |row|
      items << row.collect do |x|
        if x && x.match(@@number_regex)
          x.to_f
        else
          # Ruby 1.8 CSV cells expose their content through #data.
          x.respond_to?(:data) ? x.data : x
        end
      end
    end
    set_data_items(items)
  end

  # Set data labels.
  # Data labels must have the following format:
  #     [ 'city', 'age_range', 'gender', 'marketing_target'  ]
  #
  # If you do not provide labels for your data, the following labels will
  # be created by default:
  #     [ 'attribute_1', 'attribute_2', 'attribute_3', 'class_value'  ]
  #
  # Raises ArgumentError if the number of labels does not match the number
  # of attributes of the current data items.
  # Returns self, allowing method chaining.
  def set_data_labels(labels)
    check_data_labels(labels)
    @data_labels = labels
    return self
  end

  # Set the data items.
  # M data items with N attributes must have the following
  # format:
  #
  #     [   [ATT1_VAL1, ATT2_VAL1, ATT3_VAL1, ... , ATTN_VAL1,  CLASS_VAL1],
  #         [ATT1_VAL2, ATT2_VAL2, ATT3_VAL2, ... , ATTN_VAL2,  CLASS_VAL2],
  #         ...
  #         [ATT1_VALM, ATT2_VALM, ATT3_VALM, ... , ATTN_VALM, CLASS_VALM],
  #     ]
  #
  # e.g.
  #     [   ['New York',  '<30',      'M', 'Y'],
  #         ['Chicago',   '<30',      'M', 'Y'],
  #         ['Chicago',   '<30',      'F', 'Y'],
  #         ['New York',  '<30',      'M', 'Y'],
  #         ['New York',  '<30',      'M', 'Y'],
  #         ['Chicago',   '[30-50)',  'M', 'Y'],
  #         ['New York',  '[30-50)',  'F', 'N'],
  #         ['Chicago',   '[30-50)',  'F', 'Y'],
  #         ['New York',  '[30-50)',  'F', 'N'],
  #         ['Chicago',   '[50-80]',  'M', 'N'],
  #         ['New York',  '[50-80]',  'F', 'N'],
  #         ['New York',  '[50-80]',  'M', 'N'],
  #         ['Chicago',   '[50-80]',  'M', 'N'],
  #         ['New York',  '[50-80]',  'F', 'N'],
  #         ['Chicago',   '>80',      'F', 'Y']
  #       ]
  #
  # Raises ArgumentError if items is empty or its rows are inconsistent.
  # This method returns the data set (self), allowing method chaining.
  def set_data_items(items)
    check_data_items(items)
    @data_labels = default_data_labels(items) if @data_labels.empty?
    @data_items = items
    return self
  end

  # Returns an array with the domain of each attribute:
  # * Set instance containing all possible values for nominal attributes
  # * Array with min and max values for numeric attributes (i.e. [min, max])
  #
  # Return example:
  # => [#<Set: {"New York", "Chicago"}>,
  #     #<Set: {"<30", "[30-50)", "[50-80]", ">80"}>,
  #     #<Set: {"M", "F"}>,
  #     [5, 85],
  #     #<Set: {"Y", "N"}>]
  def build_domains
    @data_labels.collect { |attr_label| build_domain(attr_label) }
  end

  # Returns the domain of an attribute.
  # The parameter can be an attribute label or index (0 based).
  # * Set instance containing all possible values for nominal attributes
  # * Array with min and max values for numeric attributes (i.e. [min, max])
  #
  # Whether the attribute is numeric is decided by looking at the first
  # data item only.
  #
  #   build_domain("city")
  #   => #<Set: {"New York", "Chicago"}>
  #
  #   build_domain("age")
  #   => [5, 85]
  #
  #   build_domain(2) # In this example, the third attribute is gender
  #   => #<Set: {"M", "F"}>
  def build_domain(attr)
    index = get_index(attr)
    if @data_items.first[index].is_a?(Numeric)
      return [Statistics.min(self, index), Statistics.max(self, index)]
    else
      return @data_items.inject(Set.new) { |domain, x| domain << x[index] }
    end
  end

  # Returns attributes number, including class attribute
  # (0 when the data set is empty).
  def num_attributes
    return (@data_items.empty?) ? 0 : @data_items.first.size
  end

  # Returns the index of a given attribute (0-based).
  # Integers and ranges are returned untouched; anything else is looked up
  # in the data labels (nil if there is no such label).
  # For example, if "gender" is the third attribute, then:
  #   get_index("gender")
  #   => 2
  def get_index(attr)
    return (attr.is_a?(Integer) || attr.is_a?(Range)) ? attr : @data_labels.index(attr)
  end

  # Raise an exception if there is no data item.
  def check_not_empty
    if @data_items.empty?
      raise ArgumentError, "Examples data set must not be empty."
    end
  end

  # Add a data item to the data set.
  # Raises ArgumentError if data_item is not a non empty enumerable, or if
  # its number of attributes differs from the items already stored.
  def <<(data_item)
    if data_item.nil? || !data_item.is_a?(Enumerable) || data_item.empty?
      raise ArgumentError, "Data must be a non empty array."
    elsif @data_items.empty?
      # First item: go through set_data_items so default labels get created.
      set_data_items([data_item])
    elsif data_item.length != num_attributes
      raise ArgumentError, "Number of attributes do not match. " +
        "#{data_item.length} attributes provided, " +
        "#{num_attributes} attributes expected."
    else
      @data_items << data_item
    end
  end

  # Returns an array with the mean value of numeric attributes, and
  # the most frequent value of non numeric attributes.
  # Whether an attribute is numeric is decided by looking at the first
  # data item only.
  def get_mean_or_mode
    mean = []
    num_attributes.times do |i|
      mean[i] =
        if @data_items.first[i].is_a?(Numeric)
          Statistics.mean(self, i)
        else
          Statistics.mode(self, i)
        end
    end
    return mean
  end

  protected

  # Raises ArgumentError unless data_items is a non empty collection of
  # enumerables that all have the same length.
  def check_data_items(data_items)
    if !data_items || data_items.empty?
      raise ArgumentError, "Examples data set must not be empty."
    elsif !data_items.first.is_a?(Enumerable)
      raise ArgumentError, "Unkown format for example data."
    end
    attributes_num = data_items.first.length
    data_items.each_index do |index|
      if data_items[index].length != attributes_num
        raise ArgumentError,
          "Quantity of attributes is inconsistent. " +
          "The first item has #{attributes_num} attributes " +
          "and row #{index} has #{data_items[index].length} attributes"
      end
    end
  end

  # Raises ArgumentError if there are data items and the number of labels
  # differs from the number of attributes of the first item.
  def check_data_labels(labels)
    if !@data_items.empty?
      if labels.length != @data_items.first.length
        raise ArgumentError,
          "Number of labels and attributes do not match. " +
          "#{labels.length} labels and " +
          "#{@data_items.first.length} attributes found."
      end
    end
  end

  # Builds the default labels for the given items: "attribute_1" ..
  # "attribute_N" for every attribute but the last one, which is labeled
  # "class_value".
  def default_data_labels(data_items)
    data_labels = []
    data_items[0][0..-2].each_index do |i|
      data_labels[i] = "attribute_#{i+1}"
    end
    data_labels[data_labels.length] = "class_value"
    return data_labels
  end

end
265
+ end
266
+ end