nirvdrum-ai4r 1.9.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (150) hide show
  1. data/.gitignore +1 -0
  2. data/.rakeTasks +7 -0
  3. data/README.rdoc +56 -0
  4. data/Rakefile.rb +42 -0
  5. data/VERSION +1 -0
  6. data/ai4r.gemspec +221 -0
  7. data/change_log +49 -0
  8. data/examples/classifiers/id3_data.csv +121 -0
  9. data/examples/classifiers/id3_example.rb +29 -0
  10. data/examples/classifiers/naive_bayes_data.csv +11 -0
  11. data/examples/classifiers/naive_bayes_example.rb +16 -0
  12. data/examples/classifiers/results.txt +31 -0
  13. data/examples/genetic_algorithm/genetic_algorithm_example.rb +37 -0
  14. data/examples/genetic_algorithm/travel_cost.csv +16 -0
  15. data/examples/neural_network/backpropagation_example.rb +67 -0
  16. data/examples/neural_network/patterns_with_base_noise.rb +68 -0
  17. data/examples/neural_network/patterns_with_noise.rb +66 -0
  18. data/examples/neural_network/training_patterns.rb +68 -0
  19. data/examples/neural_network/xor_example.rb +35 -0
  20. data/examples/som/som_data.rb +156 -0
  21. data/examples/som/som_multi_node_example.rb +22 -0
  22. data/examples/som/som_single_example.rb +24 -0
  23. data/lib/ai4r.rb +32 -0
  24. data/lib/ai4r/classifiers/classifier.rb +59 -0
  25. data/lib/ai4r/classifiers/hyperpipes.rb +118 -0
  26. data/lib/ai4r/classifiers/id3.rb +326 -0
  27. data/lib/ai4r/classifiers/multilayer_perceptron.rb +135 -0
  28. data/lib/ai4r/classifiers/naive_bayes.rb +259 -0
  29. data/lib/ai4r/classifiers/one_r.rb +110 -0
  30. data/lib/ai4r/classifiers/prism.rb +197 -0
  31. data/lib/ai4r/classifiers/zero_r.rb +73 -0
  32. data/lib/ai4r/clusterers/average_linkage.rb +59 -0
  33. data/lib/ai4r/clusterers/bisecting_k_means.rb +93 -0
  34. data/lib/ai4r/clusterers/centroid_linkage.rb +66 -0
  35. data/lib/ai4r/clusterers/clusterer.rb +61 -0
  36. data/lib/ai4r/clusterers/complete_linkage.rb +67 -0
  37. data/lib/ai4r/clusterers/diana.rb +139 -0
  38. data/lib/ai4r/clusterers/k_means.rb +126 -0
  39. data/lib/ai4r/clusterers/median_linkage.rb +61 -0
  40. data/lib/ai4r/clusterers/single_linkage.rb +194 -0
  41. data/lib/ai4r/clusterers/ward_linkage.rb +64 -0
  42. data/lib/ai4r/clusterers/weighted_average_linkage.rb +61 -0
  43. data/lib/ai4r/data/data_set.rb +266 -0
  44. data/lib/ai4r/data/parameterizable.rb +64 -0
  45. data/lib/ai4r/data/proximity.rb +100 -0
  46. data/lib/ai4r/data/statistics.rb +77 -0
  47. data/lib/ai4r/experiment/classifier_evaluator.rb +95 -0
  48. data/lib/ai4r/genetic_algorithm/genetic_algorithm.rb +270 -0
  49. data/lib/ai4r/neural_network/backpropagation.rb +293 -0
  50. data/lib/ai4r/neural_network/hopfield.rb +149 -0
  51. data/lib/ai4r/som/layer.rb +68 -0
  52. data/lib/ai4r/som/node.rb +96 -0
  53. data/lib/ai4r/som/som.rb +155 -0
  54. data/lib/ai4r/som/two_phase_layer.rb +90 -0
  55. data/site/forrest.properties +152 -0
  56. data/site/forrest.properties.dispatcher.properties +25 -0
  57. data/site/forrest.properties.xml +29 -0
  58. data/site/src/documentation/README.txt +7 -0
  59. data/site/src/documentation/classes/CatalogManager.properties +62 -0
  60. data/site/src/documentation/content/locationmap.xml +72 -0
  61. data/site/src/documentation/content/xdocs/downloads.html +9 -0
  62. data/site/src/documentation/content/xdocs/geneticAlgorithms.xml +294 -0
  63. data/site/src/documentation/content/xdocs/index.xml +155 -0
  64. data/site/src/documentation/content/xdocs/machineLearning.xml +131 -0
  65. data/site/src/documentation/content/xdocs/neuralNetworks.xml +270 -0
  66. data/site/src/documentation/content/xdocs/site.xml +54 -0
  67. data/site/src/documentation/content/xdocs/sourceCode.xml +43 -0
  68. data/site/src/documentation/content/xdocs/tabs.xml +35 -0
  69. data/site/src/documentation/resources/images/ai4r-logo.png +0 -0
  70. data/site/src/documentation/resources/images/c.png +0 -0
  71. data/site/src/documentation/resources/images/c_wbn.png +0 -0
  72. data/site/src/documentation/resources/images/c_wn.png +0 -0
  73. data/site/src/documentation/resources/images/ellipse-2.svg +30 -0
  74. data/site/src/documentation/resources/images/ero.gif +0 -0
  75. data/site/src/documentation/resources/images/europe2.png +0 -0
  76. data/site/src/documentation/resources/images/europe3.png +0 -0
  77. data/site/src/documentation/resources/images/fitness.png +0 -0
  78. data/site/src/documentation/resources/images/genetic_algorithms_example.png +0 -0
  79. data/site/src/documentation/resources/images/icon-a.png +0 -0
  80. data/site/src/documentation/resources/images/icon-b.png +0 -0
  81. data/site/src/documentation/resources/images/icon.png +0 -0
  82. data/site/src/documentation/resources/images/jadeferret.png +0 -0
  83. data/site/src/documentation/resources/images/my_email.png +0 -0
  84. data/site/src/documentation/resources/images/neural_network_example.png +0 -0
  85. data/site/src/documentation/resources/images/project-logo.png +0 -0
  86. data/site/src/documentation/resources/images/rubyforge.png +0 -0
  87. data/site/src/documentation/resources/images/s.png +0 -0
  88. data/site/src/documentation/resources/images/s_wbn.png +0 -0
  89. data/site/src/documentation/resources/images/s_wn.png +0 -0
  90. data/site/src/documentation/resources/images/sigmoid.png +0 -0
  91. data/site/src/documentation/resources/images/sub-dir/icon-c.png +0 -0
  92. data/site/src/documentation/resources/images/t.png +0 -0
  93. data/site/src/documentation/resources/images/t_wbn.png +0 -0
  94. data/site/src/documentation/resources/images/t_wn.png +0 -0
  95. data/site/src/documentation/resources/schema/catalog.xcat +29 -0
  96. data/site/src/documentation/resources/schema/hello-v10.dtd +51 -0
  97. data/site/src/documentation/resources/schema/symbols-project-v10.ent +26 -0
  98. data/site/src/documentation/resources/stylesheets/hello2document.xsl +33 -0
  99. data/site/src/documentation/sitemap.xmap +66 -0
  100. data/site/src/documentation/skinconf.xml +418 -0
  101. data/site/src/documentation/translations/langcode.xml +29 -0
  102. data/site/src/documentation/translations/languages_de.xml +24 -0
  103. data/site/src/documentation/translations/languages_en.xml +24 -0
  104. data/site/src/documentation/translations/languages_es.xml +22 -0
  105. data/site/src/documentation/translations/languages_fr.xml +24 -0
  106. data/site/src/documentation/translations/languages_nl.xml +24 -0
  107. data/site/src/documentation/translations/menu.xml +33 -0
  108. data/site/src/documentation/translations/menu_af.xml +33 -0
  109. data/site/src/documentation/translations/menu_de.xml +33 -0
  110. data/site/src/documentation/translations/menu_es.xml +33 -0
  111. data/site/src/documentation/translations/menu_fr.xml +33 -0
  112. data/site/src/documentation/translations/menu_it.xml +33 -0
  113. data/site/src/documentation/translations/menu_nl.xml +33 -0
  114. data/site/src/documentation/translations/menu_no.xml +33 -0
  115. data/site/src/documentation/translations/menu_ru.xml +33 -0
  116. data/site/src/documentation/translations/menu_sk.xml +33 -0
  117. data/site/src/documentation/translations/tabs.xml +22 -0
  118. data/site/src/documentation/translations/tabs_de.xml +22 -0
  119. data/site/src/documentation/translations/tabs_es.xml +22 -0
  120. data/site/src/documentation/translations/tabs_fr.xml +22 -0
  121. data/site/src/documentation/translations/tabs_nl.xml +22 -0
  122. data/test/classifiers/hyperpipes_test.rb +84 -0
  123. data/test/classifiers/id3_test.rb +208 -0
  124. data/test/classifiers/multilayer_perceptron_test.rb +79 -0
  125. data/test/classifiers/naive_bayes_test.rb +43 -0
  126. data/test/classifiers/one_r_test.rb +62 -0
  127. data/test/classifiers/prism_test.rb +85 -0
  128. data/test/classifiers/zero_r_test.rb +50 -0
  129. data/test/clusterers/average_linkage_test.rb +51 -0
  130. data/test/clusterers/bisecting_k_means_test.rb +66 -0
  131. data/test/clusterers/centroid_linkage_test.rb +53 -0
  132. data/test/clusterers/complete_linkage_test.rb +57 -0
  133. data/test/clusterers/diana_test.rb +69 -0
  134. data/test/clusterers/k_means_test.rb +100 -0
  135. data/test/clusterers/median_linkage_test.rb +53 -0
  136. data/test/clusterers/single_linkage_test.rb +122 -0
  137. data/test/clusterers/ward_linkage_test.rb +53 -0
  138. data/test/clusterers/weighted_average_linkage_test.rb +53 -0
  139. data/test/data/data_set.csv +121 -0
  140. data/test/data/data_set_test.rb +96 -0
  141. data/test/data/proximity_test.rb +81 -0
  142. data/test/data/statistics_data_set.csv +5 -0
  143. data/test/data/statistics_test.rb +65 -0
  144. data/test/experiment/classifier_evaluator_test.rb +76 -0
  145. data/test/genetic_algorithm/chromosome_test.rb +58 -0
  146. data/test/genetic_algorithm/genetic_algorithm_test.rb +81 -0
  147. data/test/neural_network/backpropagation_test.rb +69 -0
  148. data/test/neural_network/hopfield_test.rb +72 -0
  149. data/test/som/som_test.rb +97 -0
  150. metadata +238 -0
@@ -0,0 +1,197 @@
1
+ # Author:: Sergio Fierens (Implementation only, Cendrowska is
2
+ # the creator of the algorithm)
3
+ # License:: MPL 1.1
4
+ # Project:: ai4r
5
+ # Url:: http://ai4r.rubyforge.org/
6
+ #
7
+ # You can redistribute it and/or modify it under the terms of
8
+ # the Mozilla Public License version 1.1 as published by the
9
+ # Mozilla Foundation at http://www.mozilla.org/MPL/MPL-1.1.txt
10
+ #
11
+ # J. Cendrowska (1987). PRISM: An algorithm for inducing modular rules.
12
+ # International Journal of Man-Machine Studies. 27(4):349-370.
13
+
14
+ require File.dirname(__FILE__) + '/../data/data_set'
15
+ require File.dirname(__FILE__) + '/../classifiers/classifier'
16
+
17
+ module Ai4r
18
+ module Classifiers
19
+
20
+ # = Introduction
21
+ # This is an implementation of the PRISM algorithm (Cendrowska, 1987)
22
+ # Given a set of preclassified examples, it builds a set of rules
23
+ # to predict the class of other instances.
24
+ #
25
+ # J. Cendrowska (1987). PRISM: An algorithm for inducing modular rules.
26
+ # International Journal of Man-Machine Studies. 27(4):349-370.
27
+ class Prism < Classifier
28
+
29
+ attr_reader :data_set, :rules
30
+
31
# Train the classifier: induce an ordered list of rules from +data_set+.
# The last attribute of every data item is treated as the item class.
#
# For each class value (the last domain returned by build_domains), rules
# are generated while some remaining instance still has that class; the
# instances covered by a freshly built rule are dropped before the next
# rule is built. The rules are stored in @rules, in creation order.
#
# data_set:: the training DataSet; check_not_empty is invoked on it first.
#
# Returns self.
def build(data_set)
  data_set.check_not_empty
  @data_set = data_set
  class_values = @data_set.build_domains.last
  # Work on a copy of the item list so the data set itself is not altered.
  remaining = @data_set.data_items.dup
  @rules = []
  class_values.each do |class_value|
    while has_class_value(remaining, class_value)
      rule = build_rule(class_value, remaining)
      @rules.push(rule)
      remaining = remaining.reject { |item| matches_conditions(item, rule[:conditions]) }
    end
  end
  self
end
49
+
50
# Predict the class of a new instance.
# e.g.
#   classifier.eval(['New York', '<30', 'F'])  # => 'Y'
#
# Rules are tried in the order they were generated; the class value of
# the first rule whose conditions all match is returned, or nil when no
# rule matches.
def eval(instace)
  winner = @rules.find { |rule| matches_conditions(instace, rule[:conditions]) }
  winner ? winner[:class_value] : nil
end
59
+
60
# Render the induced rules as a string of ruby code.
# e.g.
#
#   classifier.get_rules
#     # =>  if age_range == '<30' then marketing_target = 'Y'
#           elsif age_range == '>80' then marketing_target = 'Y'
#           elsif city == 'Chicago' and age_range == '[30-50)' then marketing_target = 'Y'
#           else marketing_target = 'N'
#           end
#
# It is a nice way to inspect induction results, and also to execute them:
#     age_range = '[30-50)'
#     city = 'New York'
#     eval(classifier.get_rules)
#     puts marketing_target
#       'Y'
#
# The first rule becomes the "if" branch and the rules in between become
# "elsif" branches. When there is more than one rule, the last rule is
# emitted as the "else" branch, i.e. its conditions are not printed.
def get_rules
  head = @rules.first
  lines = ["if #{join_terms(head)} then #{then_clause(head)}"]
  @rules[1...-1].each do |rule|
    lines << "elsif #{join_terms(rule)} then #{then_clause(rule)}"
  end
  lines << "else #{then_clause(@rules.last)}" if @rules.size > 1
  lines << "end"
  lines.join("\n")
end
85
+
86
+ protected
87
+
88
# Return the value stored in +data+ for the attribute labelled +attr+.
# The position of the attribute is looked up through @data_set.get_index.
def get_attr_value(data, attr)
  position = @data_set.get_index(attr)
  data[position]
end
91
+
92
# True when at least one of +instances+ has +class_value+ as its last
# element (the class attribute); false otherwise, including for an empty
# list.
def has_class_value(instances, class_value)
  instances.any? { |item| item.last == class_value }
end
96
+
97
# A rule is "perfect" for +instances+ when it covers no instance of a
# different class, i.e. no item both matches the rule conditions and has
# a last element other than the rule's :class_value.
def is_perfect(instances, rule)
  expected_class = rule[:class_value]
  instances.none? do |item|
    item.last != expected_class && matches_conditions(item, rule[:conditions])
  end
end
104
+
105
# True when +data+ satisfies every entry of +conditions+, a hash mapping
# attribute labels to the required attribute value. An empty hash
# matches any item.
def matches_conditions(data, conditions)
  conditions.none? do |attr_label, required_value|
    get_attr_value(data, attr_label) != required_value
  end
end
111
+
112
# Build one rule predicting +class_value+.
#
# Starting from an empty condition set, the best single condition (see
# get_condition) is added repeatedly until the rule covers no instance of
# another class in +instances+ (is_perfect), or until every attribute has
# been used.
#
# Each attribute is used at most once per rule: once a condition on an
# attribute has been chosen, that attribute is removed from the candidate
# list. Without this, the "no attributes left" stop condition could never
# trigger and contradictory data (same attribute values, different
# classes) made the loop run forever.
#
# class_value:: class the rule must predict.
# instances::   data items still to be covered.
#
# Returns a hash {:class_value => ..., :conditions => {label => value}}.
def build_rule(class_value, instances)
  rule = { :class_value => class_value, :conditions => {} }
  rule_instances = instances.dup
  # Every label except the last one, which holds the class.
  attributes = @data_set.data_labels[0...-1]
  until is_perfect(instances, rule) || attributes.empty?
    freq_table = build_freq_table(rule_instances, attributes, class_value)
    condition = get_condition(freq_table)
    # No candidate condition (nothing left to count): stop refining.
    break if condition.nil?
    rule[:conditions].merge!(condition)
    # Non-destructive removal, so the data set's own label list is untouched.
    attributes -= condition.keys
    rule_instances = rule_instances.select do |data|
      matches_conditions(data, condition)
    end
  end
  rule
end
126
+
127
# Returns a structure with the following format:
# => {attr1_label => { attr1_value1 => [p, t], attr1_value2 => [p, t], ... },
#     attr2_label => { attr2_value1 => [p, t], attr2_value2 => [p, t], ... },
#     ...
#     }
# where p is the number of instances classified as class_value
# with that attribute value, and t is the total number of instances with
# that attribute value.
#
# Only the labels listed in +attributes+ are counted. An empty
# +rule_instances+ yields an empty hash.
def build_freq_table(rule_instances, attributes, class_value)
  table = {}
  rule_instances.each do |item|
    attributes.each do |label|
      # [0, 0] is only ever read as a default; a fresh pair is stored below.
      counts = table[label] || Hash.new([0, 0])
      value = get_attr_value(item, label)
      positives, total = counts[value]
      positives += 1 if item.last == class_value
      counts[value] = [positives, total + 1]
      table[label] = counts
    end
  end
  table
end
148
+
149
# Returns a single conditional term: {attrN_label => attrN_valueM},
# selecting the attribute value with the best pt pair according to
# better_pt (occurrences of the attribute value classified as
# class_value / occurrences of the attribute value).
#
# Candidates are compared in the iteration order of +freq_table+, and a
# later candidate only wins when better_pt says it is better than the
# current best. Returns nil when no candidate is better than the initial
# [0, 0] pair (e.g. an empty table).
def get_condition(freq_table)
  candidates = freq_table.flat_map do |attr_label, attr_freqs|
    attr_freqs.map { |attr_value, pt| [attr_label, attr_value, pt] }
  end
  start = [[0, 0], nil]
  _top_pt, chosen = candidates.inject(start) do |(top_pt, top_condition), (attr_label, attr_value, pt)|
    better_pt(pt, top_pt) ? [pt, { attr_label => attr_value }] : [top_pt, top_condition]
  end
  chosen
end
166
+
167
# Compare two [p, t] pairs, where
#   p = occurrences of an attribute value with the instance classified as
#       class_value
#   t = occurrences of the attribute value
#
# pt is better than best_pt when:
#   1- its ratio p/t is higher, or
#   2- its ratio is equal and it has a higher p.
#
# A pair with t == 0 is never better; any pair with t != 0 is better
# than a best_pt whose t is 0. Ratios are compared by cross
# multiplication, so no division is performed.
def better_pt(pt, best_pt)
  hits, total = pt[0], pt[1]
  best_hits, best_total = best_pt[0], best_pt[1]
  return false if total == 0
  return true if best_total == 0
  left = hits * best_total
  right = best_hits * total
  left > right || (left == right && hits > best_hits)
end
181
+
182
# Render the conditions of +rule+ as a ruby boolean expression, e.g.
#   city == 'Chicago' and age_range == '[30-50)'
# Terms are joined with " and "; a rule without conditions yields "".
def join_terms(rule)
  comparisons = rule[:conditions].map do |attr_label, attr_value|
    "#{attr_label} == '#{attr_value}'"
  end
  comparisons.join(" and ")
end
189
+
190
# Render the assignment performed by +rule+, e.g.
#   marketing_target = 'Y'
# The variable name is the last label of the data set (the class label).
def then_clause(rule)
  class_label = @data_set.data_labels.last
  predicted = rule[:class_value]
  "#{class_label} = '#{predicted}'"
end
193
+
194
+ end
195
+ end
196
+ end
197
+
@@ -0,0 +1,73 @@
1
+ # Author:: Sergio Fierens (Implementation only)
2
+ # License:: MPL 1.1
3
+ # Project:: ai4r
4
+ # Url:: http://ai4r.rubyforge.org/
5
+ #
6
+ # You can redistribute it and/or modify it under the terms of
7
+ # the Mozilla Public License version 1.1 as published by the
8
+ # Mozilla Foundation at http://www.mozilla.org/MPL/MPL-1.1.txt
9
+
10
+ require File.dirname(__FILE__) + '/../data/data_set.rb'
11
+ require File.dirname(__FILE__) + '/../classifiers/classifier'
12
+
13
+ module Ai4r
14
+ module Classifiers
15
+
16
+ # = Introduction
17
+ #
18
+ # The idea behind the ZeroR classifier is to identify the
19
+ # most common class value in the training set.
20
+ # It always returns that value when evaluating an instance.
21
+ # It is frequently used as a baseline for evaluating other machine learning
22
+ # algorithms.
23
+ class ZeroR < Classifier
24
+
25
+ attr_reader :data_set, :class_value
26
+
27
# Train the ZeroR classifier on +data_set+. The last attribute of each
# item is considered the item class.
#
# The items are scanned once while keeping a running count per class;
# @class_value is updated every time a class count strictly exceeds the
# highest count seen so far, so on ties the class that reached the top
# count first is kept.
#
# data_set:: the training DataSet; check_not_empty is invoked on it first.
#
# Returns self.
def build(data_set)
  data_set.check_not_empty
  @data_set = data_set
  counts = Hash.new(0)
  highest = 0
  @class_value = nil
  @data_set.data_items.each do |item|
    label = item.last
    counts[label] += 1
    next unless highest < counts[label]
    highest = counts[label]
    @class_value = label
  end
  self
end
47
+
48
# Predict the class of a new item.
# e.g.
#   classifier.eval(['New York', '<30', 'F'])  # => 'Y'
#
# The argument is ignored: the class value selected by #build is returned
# for any input.
def eval(data)
  return @class_value
end
54
+
55
# Render the (single) generated rule as ruby code.
# e.g.
#
#   classifier.get_rules
#     # =>  marketing_target='Y'
#
# It is a nice way to inspect induction results, and also to execute them:
#     marketing_target = nil
#     eval classifier.get_rules
#     puts marketing_target
#       # =>  'Y'
#
# The variable name is the last label of the data set and the value is
# the class selected by #build.
def get_rules
  class_label = @data_set.data_labels.last
  "#{class_label} = '#{@class_value}'"
end
69
+
70
+ end
71
+
72
+ end
73
+ end
@@ -0,0 +1,59 @@
1
+ # Author:: Sergio Fierens (implementation)
2
+ # License:: MPL 1.1
3
+ # Project:: ai4r
4
+ # Url:: http://ai4r.rubyforge.org/
5
+ #
6
+ # You can redistribute it and/or modify it under the terms of
7
+ # the Mozilla Public License version 1.1 as published by the
8
+ # Mozilla Foundation at http://www.mozilla.org/MPL/MPL-1.1.txt
9
+
10
+ require File.dirname(__FILE__) + '/../data/data_set'
11
+ require File.dirname(__FILE__) + '/../clusterers/single_linkage'
12
+
13
+ module Ai4r
14
+ module Clusterers
15
+
16
+ # Implementation of a Hierarchical clusterer with group average
17
+ # linkage, AKA unweighted pair group method average or UPGMA (Everitt
18
+ # et al., 2001 ; Jain and Dubes, 1988 ; Sokal and Michener, 1958).
19
+ # Hierarchical clusterers create one cluster per element, and then
20
+ # progressively merge clusters, until the required number of clusters
21
+ # is reached.
22
+ # With average linkage, the distance between a cluster cx and
23
+ # cluster (ci U cj) is the average of the distances between cx and ci, and
24
+ # cx and cj.
25
+ #
26
+ # D(cx, (ci U cj)) = (D(cx, ci) + D(cx, cj)) / 2
27
+ class AverageLinkage < SingleLinkage
28
+
29
+ parameters_info :distance_function =>
30
+ "Custom implementation of distance function. " +
31
+ "It must be a closure receiving two data items and return the " +
32
+ "distance bewteen them. By default, this algorithm uses " +
33
+ "ecuclidean distance of numeric attributes to the power of 2."
34
+
35
# Build a new clusterer, using the data examples found in data_set.
# Items will be clustered in "number_of_clusters" different clusters.
#
# All the work is delegated to the superclass implementation; only the
# linkage_distance used while merging differs in this class.
def build(data_set, number_of_clusters)
  super(data_set, number_of_clusters)
end
41
+
42
# This algorithm does not allow classification of new data items
# once it has been built. Rebuild the clusterer including your data
# element instead.
#
# Always raises RuntimeError. (The original spelled the keyword "Raise",
# which Ruby treats as a call to an undefined method and reports as
# NoMethodError without the intended message.)
def eval(data_item)
  raise "Eval of new data is not supported by this algorithm."
end
47
+
48
+ protected
49
+
50
# Return the distance between cluster cx and cluster (ci U cj),
# using average linkage:
#
#   D(cx, (ci U cj)) = (D(cx, ci) + D(cx, cj)) / 2
#
# cx, ci, cj:: cluster indexes understood by read_distance_matrix.
#
# The sum is divided by 2.0 so that Integer distances are not truncated
# by integer division (e.g. 3 and 4 average to 3.5, not 3).
def linkage_distance(cx, ci, cj)
  (read_distance_matrix(cx, ci) +
    read_distance_matrix(cx, cj)) / 2.0
end
56
+
57
+ end
58
+ end
59
+ end
@@ -0,0 +1,93 @@
1
+ # Author:: Sergio Fierens (implementation)
2
+ # License:: MPL 1.1
3
+ # Project:: ai4r
4
+ # Url:: http://ai4r.rubyforge.org/
5
+ #
6
+ # You can redistribute it and/or modify it under the terms of
7
+ # the Mozilla Public License version 1.1 as published by the
8
+ # Mozilla Foundation at http://www.mozilla.org/MPL/MPL-1.1.txt
9
+
10
+ require File.dirname(__FILE__) + '/../data/data_set'
11
+ require File.dirname(__FILE__) + '/../clusterers/k_means'
12
+
13
+ module Ai4r
14
+ module Clusterers
15
+
16
+ # The Bisecting k-means algorithm is a variation of the "k-means" algorithm,
17
+ # somewhat less sensitive to the initial selection of centroids than the
18
+ # original.
19
+ #
20
+ # More about K Means algorithm:
21
+ # http://en.wikipedia.org/wiki/K-means_algorithm
22
+ class BisectingKMeans < KMeans
23
+
24
+ attr_reader :data_set, :number_of_clusters, :clusters, :centroids
25
+ attr_accessor :max_iterations, :distance_function, :refine
26
+
27
+ parameters_info :max_iterations => "Maximum number of iterations to " +
28
+ "build the clusterer. By default it is uncapped.",
29
+ :distance_function => "Custom implementation of distance function. " +
30
+ "It must be a closure receiving two data items and return the " +
31
+ "distance bewteen them. By default, this algorithm uses " +
32
+ "ecuclidean distance of numeric attributes to the power of 2.",
33
+ :centroid_function => "Custom implementation to calculate the " +
34
+ "centroid of a cluster. It must be a closure receiving an array of " +
35
+ "data sets, and return an array of data items, representing the " +
36
+ "centroids of for each data set. " +
37
+ "By default, this algorithm returns a data items using the mode "+
38
+ "or mean of each attribute on each data set.",
39
+ :refine => "Boolean value. True by default. It will run the " +
40
+ "classic K Means algorithm, using as initial centroids the " +
41
+ "result of the bisecting approach."
42
+
43
+
44
# Create the clusterer with refinement switched on, which is the default
# documented for the :refine parameter ("True by default").
#
# This constructor used to be misspelled "intialize", so Ruby never
# called it and @refine stayed nil, silently disabling the refinement
# step in #build.
def initialize
  # Let the parent class set up whatever state it needs first.
  super()
  @refine = true
end

# Deprecated: former, misspelled name of #initialize. Kept only so that
# code calling it explicitly keeps working; it just enables refinement.
def intialize
  @refine = true
end
47
+
48
# Build a new clusterer, using the data examples found in data_set.
# Items will be clustered in "number_of_clusters" different clusters.
#
# Starts with a single cluster holding the whole data set and, while
# there are fewer clusters than requested, replaces the biggest cluster
# with the two clusters a KMeans run (configured with this object's
# parameters) produces for it. Centroids are kept in step with clusters.
# When @refine is set, the superclass build is then run; it starts from
# the centroids found here (see calc_initial_centroids).
#
# Returns self.
def build(data_set, number_of_clusters)
  @data_set = data_set
  @number_of_clusters = number_of_clusters

  @clusters = [@data_set]
  @centroids = [@data_set.get_mean_or_mode]
  until @clusters.length >= @number_of_clusters
    target = find_biggest_cluster_index(@clusters)
    configured = KMeans.new.set_parameters(get_parameters)
    halves = configured.build(@clusters[target], 2)
    # Swap the bisected cluster (and its centroid) for its two halves.
    @clusters.delete_at(target)
    @centroids.delete_at(target)
    @clusters.concat(halves.clusters)
    @centroids.concat(halves.centroids)
  end

  super if @refine

  self
end
72
+
73
+ protected
74
# Initial centroids for the refinement run: reuse the centroids already
# stored in @centroids instead of computing new ones.
def calc_initial_centroids
  return @centroids
end
77
+
78
# Return the index, within +clusters+, of the cluster holding the most
# data items. On ties the lowest index wins; 0 is returned when the list
# is empty or every cluster is empty.
def find_biggest_cluster_index(clusters)
  best_index = 0
  best_size = 0
  clusters.each_with_index do |cluster, index|
    size = cluster.data_items.length
    next unless best_size < size
    best_size = size
    best_index = index
  end
  best_index
end
90
+
91
+ end
92
+ end
93
+ end
@@ -0,0 +1,66 @@
1
+ # Author:: Sergio Fierens (implementation)
2
+ # License:: MPL 1.1
3
+ # Project:: ai4r
4
+ # Url:: http://ai4r.rubyforge.org/
5
+ #
6
+ # You can redistribute it and/or modify it under the terms of
7
+ # the Mozilla Public License version 1.1 as published by the
8
+ # Mozilla Foundation at http://www.mozilla.org/MPL/MPL-1.1.txt
9
+
10
+ require File.dirname(__FILE__) + '/../data/data_set'
11
+ require File.dirname(__FILE__) + '/../clusterers/single_linkage'
12
+
13
+ module Ai4r
14
+ module Clusterers
15
+
16
+ # Implementation of an Agglomerative Hierarchical clusterer with
17
+ # centroid linkage algorithm, aka unweighted pair group method
18
+ # centroid (UPGMC) (Everitt et al., 2001 ; Jain and Dubes, 1988 ;
19
+ # Sokal and Michener, 1958 )
20
+ # Hierarchical clusterers create one cluster per element, and then
21
+ # progressively merge clusters, until the required number of clusters
22
+ # is reached.
23
+ # The distance between clusters is the squared euclidean distance
24
+ # between their centroids.
25
+ #
26
+ # D(cx, (ci U cj)) = | mx - mij |^2
27
+ # D(cx, (ci U cj)) = (ni/(ni+nj))*D(cx, ci) +
28
+ # (nj/(ni+nj))*D(cx, cj) -
29
+ # (ni*nj/(ni+nj)^2)*D(ci, cj)
30
+ class CentroidLinkage < SingleLinkage
31
+
32
+ parameters_info :distance_function =>
33
+ "Custom implementation of distance function. " +
34
+ "It must be a closure receiving two data items and return the " +
35
+ "distance bewteen them. By default, this algorithm uses " +
36
+ "ecuclidean distance of numeric attributes to the power of 2."
37
+
38
# Build a new clusterer, using the data examples found in data_set.
# Items will be clustered in "number_of_clusters" different clusters.
#
# All the work is delegated to the superclass implementation; only the
# linkage_distance used while merging differs in this class.
def build(data_set, number_of_clusters)
  super(data_set, number_of_clusters)
end
44
+
45
# This algorithm does not allow classification of new data items
# once it has been built. Rebuild the clusterer including your data
# element instead.
#
# Always raises RuntimeError. (The original spelled the keyword "Raise",
# which Ruby treats as a call to an undefined method and reports as
# NoMethodError without the intended message.)
def eval(data_item)
  raise "Eval of new data is not supported by this algorithm."
end
50
+
51
+ protected
52
+
53
# Return the distance between cluster cx and cluster (ci U cj),
# using centroid linkage:
#
#   D(cx, (ci U cj)) = (ni/(ni+nj))*D(cx, ci) +
#                      (nj/(ni+nj))*D(cx, cj) -
#                      (ni*nj/(ni+nj)^2)*D(ci, cj)
#
# where ni and nj are the sizes of clusters ci and cj, read from
# @index_clusters. The 1.0 factor forces floating point arithmetic.
def linkage_distance(cx, ci, cj)
  size_i = @index_clusters[ci].length
  size_j = @index_clusters[cj].length
  weighted_sum = size_i * read_distance_matrix(cx, ci) +
                 size_j * read_distance_matrix(cx, cj)
  correction = 1.0 * size_i * size_j * read_distance_matrix(ci, cj) / (size_i + size_j)
  (weighted_sum - correction) / (size_i + size_j)
end
62
+
63
+ end
64
+ end
65
+ end
66
+