ai4r 1.2 → 1.3

Sign up to get free protection for your applications and to get access to all the features.
Files changed (139) hide show
  1. data/README.rdoc +12 -25
  2. data/examples/decision_trees/id3_example.rb +6 -9
  3. data/examples/decision_trees/results.txt +2 -0
  4. data/examples/genetic_algorithm/genetic_algorithm_example.rb +11 -13
  5. data/examples/neural_network/xor_example.rb +25 -0
  6. data/lib/ai4r.rb +10 -0
  7. data/lib/ai4r/classifiers/classifier.rb +46 -0
  8. data/lib/ai4r/classifiers/id3.rb +27 -58
  9. data/lib/ai4r/classifiers/one_r.rb +19 -58
  10. data/lib/ai4r/classifiers/prism.rb +21 -57
  11. data/lib/ai4r/classifiers/zero_r.rb +16 -48
  12. data/lib/ai4r/clusterers/bisecting_k_means.rb +115 -0
  13. data/lib/ai4r/clusterers/clusterer.rb +55 -0
  14. data/lib/ai4r/clusterers/k_means.rb +164 -0
  15. data/lib/ai4r/data/data_set.rb +250 -0
  16. data/lib/ai4r/genetic_algorithm/genetic_algorithm.rb +19 -19
  17. data/lib/ai4r/neural_network/backpropagation.rb +23 -24
  18. data/site/build/site/en/broken-links.xml +2 -0
  19. data/site/build/site/en/downloads.html +200 -0
  20. data/site/build/site/en/downloads.pdf +151 -0
  21. data/site/build/site/en/forum.html +197 -0
  22. data/site/build/site/en/forum.pdf +151 -0
  23. data/site/build/site/en/geneticAlgorithms.html +591 -0
  24. data/site/build/site/en/geneticAlgorithms.pdf +934 -0
  25. data/site/build/site/en/images/ai4r-logo.png +0 -0
  26. data/site/build/site/en/images/built-with-forrest-button.png +0 -0
  27. data/site/build/site/en/images/c.png +0 -0
  28. data/site/build/site/en/images/c_wbn.png +0 -0
  29. data/site/build/site/en/images/c_wn.png +0 -0
  30. data/site/build/site/en/images/ero.gif +0 -0
  31. data/site/build/site/en/images/europe2.png +0 -0
  32. data/site/build/site/en/images/europe3.png +0 -0
  33. data/site/build/site/en/images/fitness.png +0 -0
  34. data/site/build/site/en/images/genetic_algorithms_example.png +0 -0
  35. data/site/build/site/en/images/instruction_arrow.png +0 -0
  36. data/site/build/site/en/images/jadeferret.png +0 -0
  37. data/site/build/site/en/images/my_email.png +0 -0
  38. data/site/build/site/en/images/neural_network_example.png +0 -0
  39. data/site/build/site/en/images/rubyforge.png +0 -0
  40. data/site/build/site/en/images/s.png +0 -0
  41. data/site/build/site/en/images/s_wbn.png +0 -0
  42. data/site/build/site/en/images/s_wn.png +0 -0
  43. data/site/build/site/en/images/sigmoid.png +0 -0
  44. data/site/build/site/en/images/t.png +0 -0
  45. data/site/build/site/en/images/t_wbn.png +0 -0
  46. data/site/build/site/en/images/t_wn.png +0 -0
  47. data/site/build/site/en/index.html +336 -0
  48. data/site/build/site/en/index.pdf +508 -0
  49. data/site/build/site/en/linkmap.html +263 -0
  50. data/site/build/site/en/linkmap.pdf +94 -0
  51. data/site/build/site/en/locationmap.xml +72 -0
  52. data/site/build/site/en/machineLearning.html +339 -0
  53. data/site/build/site/en/machineLearning.pdf +337 -0
  54. data/site/build/site/en/neuralNetworks.html +484 -0
  55. data/site/build/site/en/neuralNetworks.pdf +604 -0
  56. data/site/build/site/en/skin/CommonMessages_de.xml +23 -0
  57. data/site/build/site/en/skin/CommonMessages_en_US.xml +23 -0
  58. data/site/build/site/en/skin/CommonMessages_es.xml +23 -0
  59. data/site/build/site/en/skin/CommonMessages_fr.xml +23 -0
  60. data/site/build/site/en/skin/basic.css +166 -0
  61. data/site/build/site/en/skin/breadcrumbs-optimized.js +90 -0
  62. data/site/build/site/en/skin/breadcrumbs.js +237 -0
  63. data/site/build/site/en/skin/fontsize.js +166 -0
  64. data/site/build/site/en/skin/getBlank.js +40 -0
  65. data/site/build/site/en/skin/getMenu.js +45 -0
  66. data/site/build/site/en/skin/images/README.txt +1 -0
  67. data/site/build/site/en/skin/images/add.jpg +0 -0
  68. data/site/build/site/en/skin/images/built-with-forrest-button.png +0 -0
  69. data/site/build/site/en/skin/images/chapter.gif +0 -0
  70. data/site/build/site/en/skin/images/chapter_open.gif +0 -0
  71. data/site/build/site/en/skin/images/current.gif +0 -0
  72. data/site/build/site/en/skin/images/error.png +0 -0
  73. data/site/build/site/en/skin/images/external-link.gif +0 -0
  74. data/site/build/site/en/skin/images/fix.jpg +0 -0
  75. data/site/build/site/en/skin/images/forrest-credit-logo.png +0 -0
  76. data/site/build/site/en/skin/images/hack.jpg +0 -0
  77. data/site/build/site/en/skin/images/header_white_line.gif +0 -0
  78. data/site/build/site/en/skin/images/info.png +0 -0
  79. data/site/build/site/en/skin/images/instruction_arrow.png +0 -0
  80. data/site/build/site/en/skin/images/label.gif +0 -0
  81. data/site/build/site/en/skin/images/page.gif +0 -0
  82. data/site/build/site/en/skin/images/pdfdoc.gif +0 -0
  83. data/site/build/site/en/skin/images/poddoc.png +0 -0
  84. data/site/build/site/en/skin/images/printer.gif +0 -0
  85. data/site/build/site/en/skin/images/rc-b-l-15-1body-2menu-3menu.png +0 -0
  86. data/site/build/site/en/skin/images/rc-b-r-15-1body-2menu-3menu.png +0 -0
  87. data/site/build/site/en/skin/images/rc-b-r-5-1header-2tab-selected-3tab-selected.png +0 -0
  88. data/site/build/site/en/skin/images/rc-t-l-5-1header-2searchbox-3searchbox.png +0 -0
  89. data/site/build/site/en/skin/images/rc-t-l-5-1header-2tab-selected-3tab-selected.png +0 -0
  90. data/site/build/site/en/skin/images/rc-t-l-5-1header-2tab-unselected-3tab-unselected.png +0 -0
  91. data/site/build/site/en/skin/images/rc-t-r-15-1body-2menu-3menu.png +0 -0
  92. data/site/build/site/en/skin/images/rc-t-r-5-1header-2searchbox-3searchbox.png +0 -0
  93. data/site/build/site/en/skin/images/rc-t-r-5-1header-2tab-selected-3tab-selected.png +0 -0
  94. data/site/build/site/en/skin/images/rc-t-r-5-1header-2tab-unselected-3tab-unselected.png +0 -0
  95. data/site/build/site/en/skin/images/remove.jpg +0 -0
  96. data/site/build/site/en/skin/images/rss.png +0 -0
  97. data/site/build/site/en/skin/images/spacer.gif +0 -0
  98. data/site/build/site/en/skin/images/success.png +0 -0
  99. data/site/build/site/en/skin/images/txtdoc.png +0 -0
  100. data/site/build/site/en/skin/images/update.jpg +0 -0
  101. data/site/build/site/en/skin/images/valid-html401.png +0 -0
  102. data/site/build/site/en/skin/images/vcss.png +0 -0
  103. data/site/build/site/en/skin/images/warning.png +0 -0
  104. data/site/build/site/en/skin/images/xmldoc.gif +0 -0
  105. data/site/build/site/en/skin/menu.js +48 -0
  106. data/site/build/site/en/skin/note.txt +50 -0
  107. data/site/build/site/en/skin/print.css +54 -0
  108. data/site/build/site/en/skin/profile.css +163 -0
  109. data/site/build/site/en/skin/prototype.js +1257 -0
  110. data/site/build/site/en/skin/screen.css +587 -0
  111. data/site/build/site/en/svn.html +252 -0
  112. data/site/build/site/en/svn.pdf +306 -0
  113. data/site/build/site/en/wholesite.pdf +1915 -0
  114. data/site/build/tmp/brokenlinks.xml +2 -0
  115. data/site/build/tmp/cocoon-work/cache-dir/cocoon-ehcache-1.data +0 -0
  116. data/site/build/tmp/cocoon-work/cache-dir/cocoon-ehcache-1.index +0 -0
  117. data/site/build/tmp/locationmap.xml +14 -14
  118. data/site/build/tmp/output.xmap +23 -23
  119. data/site/build/tmp/pluginlist2fetchbuild.xml +144 -144
  120. data/site/build/tmp/projfilters.properties +41 -41
  121. data/site/build/webapp/WEB-INF/logs/core.log +593 -679
  122. data/site/build/webapp/WEB-INF/logs/error.log +362 -279
  123. data/site/build/webapp/WEB-INF/logs/sitemap.log +368 -1015
  124. data/site/src/documentation/content/xdocs/index.xml +18 -10
  125. data/site/src/documentation/content/xdocs/machineLearning.xml +4 -3
  126. data/site/src/documentation/content/xdocs/site.xml +2 -1
  127. data/site/src/documentation/resources/images/sigmoid.png +0 -0
  128. data/test/classifiers/id3_test.rb +45 -44
  129. data/test/classifiers/one_r_test.rb +19 -17
  130. data/test/classifiers/prism_test.rb +22 -20
  131. data/test/classifiers/zero_r_test.rb +15 -12
  132. data/test/clusterers/bisecting_k_means_test.rb +59 -0
  133. data/test/clusterers/k_means_test.rb +93 -0
  134. data/test/data/data_set_test.rb +92 -0
  135. metadata +252 -128
  136. data/lib/ai4r/classifiers/classifier_helper.rb +0 -54
  137. data/site/src/documentation/content/xdocs/forum.html +0 -9
  138. data/site/src/documentation/resources/images/Thumbs.db +0 -0
  139. data/site/src/documentation/resources/images/sub-dir/Thumbs.db +0 -0
@@ -11,7 +11,8 @@
11
11
  # J. Cendrowska (1987). PRISM: An algorithm for inducing modular rules.
12
12
  # International Journal of Man-Machine Studies. 27(4):349-370.
13
13
 
14
- require File.dirname(__FILE__) + '/classifier_helper'
14
+ require File.dirname(__FILE__) + '/../data/data_set'
15
+ require File.dirname(__FILE__) + '/../classifiers/classifier'
15
16
 
16
17
  module Ai4r
17
18
  module Classifiers
@@ -23,50 +24,17 @@ module Ai4r
23
24
  #
24
25
  # J. Cendrowska (1987). PRISM: An algorithm for inducing modular rules.
25
26
  # International Journal of Man-Machine Studies. 27(4):349-370.
26
- class Prism
27
+ class Prism < Classifier
27
28
 
28
- attr_accessor :data_labels, :rules
29
- include ClassifierHelper
29
+ attr_reader :data_set, :rules
30
30
 
31
- # Build a new Prism classifier. If your data is classified with N attributed
32
- # and M examples, then your data examples must have the following format:
33
- #
34
- # [ [ATT1_VAL1, ATT2_VAL1, ATT3_VAL1, ... , ATTN_VAL1, CLASS_VAL1],
35
- # [ATT1_VAL2, ATT2_VAL2, ATT3_VAL2, ... , ATTN_VAL2, CLASS_VAL2],
36
- # ...
37
- # [ATTM1_VALM, ATT2_VALM, ATT3_VALM, ... , ATTN_VALM, CLASS_VALM],
38
- # ]
39
- #
40
- # e.g.
41
- # [ ['New York', '<30', 'M', 'Y'],
42
- # ['Chicago', '<30', 'M', 'Y'],
43
- # ['Chicago', '<30', 'F', 'Y'],
44
- # ['New York', '<30', 'M', 'Y'],
45
- # ['New York', '<30', 'M', 'Y'],
46
- # ['Chicago', '[30-50)', 'M', 'Y'],
47
- # ['New York', '[30-50)', 'F', 'N'],
48
- # ['Chicago', '[30-50)', 'F', 'Y'],
49
- # ['New York', '[30-50)', 'F', 'N'],
50
- # ['Chicago', '[50-80]', 'M', 'N'],
51
- # ['New York', '[50-80]', 'F', 'N'],
52
- # ['New York', '[50-80]', 'M', 'N'],
53
- # ['Chicago', '[50-80]', 'M', 'N'],
54
- # ['New York', '[50-80]', 'F', 'N'],
55
- # ['Chicago', '>80', 'F', 'Y']
56
- # ]
57
- #
58
- # Data labels must have the following format:
59
- # [ 'city', 'age_range', 'gender', 'marketing_target' ]
60
- #
61
- # If you do not provide labels for you data, the following labels will
62
- # be created by default:
63
- # [ 'attribute_1', 'attribute_2', 'attribute_3', 'class_value' ]
64
- #
65
- def build(data_examples, data_labels=nil)
66
- check_data_examples(data_examples)
67
- @data_labels = (data_labels) ? data_labels : default_data_labels(data_examples)
68
- domains = build_domains(data_examples)
69
- instances = data_examples.collect {|data| data }
31
+ # Build a new Prism classifier. You must provide a DataSet instance
32
+ # as parameter.
33
+ def build(data_set)
34
+ data_set.check_not_empty
35
+ @data_set = data_set
36
+ domains = @data_set.build_domains
37
+ instances = @data_set.data_items.collect {|data| data }
70
38
  @rules = []
71
39
  domains.last.each do |class_value|
72
40
  while(has_class_value(instances, class_value))
@@ -91,7 +59,7 @@ module Ai4r
91
59
  # This method returns the generated rules in ruby code.
92
60
  # e.g.
93
61
  #
94
- # classifier.to_s
62
+ # classifier.get_rules
95
63
  # # => if age_range == '<30' then marketing_target = 'Y'
96
64
  # elsif age_range == '>80' then marketing_target = 'Y'
97
65
  # elsif city == 'Chicago' and age_range == '[30-50)' then marketing_target = 'Y'
@@ -101,10 +69,10 @@ module Ai4r
101
69
  # It is a nice way to inspect induction results, and also to execute them:
102
70
  # age_range = '[30-50)'
103
71
  # city = 'New York'
104
- # eval(classifier.to_s)
72
+ # eval(classifier.get_rules)
105
73
  # puts marketing_target
106
74
  # 'Y'
107
- def to_s
75
+ def get_rules
108
76
  out = "if #{join_terms(@rules.first)} then #{then_clause(@rules.first)}"
109
77
  @rules[1...-1].each do |rule|
110
78
  out += "\nelsif #{join_terms(rule)} then #{then_clause(rule)}"
@@ -116,6 +84,10 @@ module Ai4r
116
84
 
117
85
  protected
118
86
 
87
+ def get_attr_value(data, attr)
88
+ data[@data_set.get_index(attr)]
89
+ end
90
+
119
91
  def has_class_value(instances, class_value)
120
92
  instances.each { |data| return true if data.last == class_value}
121
93
  return false
@@ -131,23 +103,15 @@ module Ai4r
131
103
 
132
104
  def matches_conditions(data, conditions)
133
105
  conditions.each_pair do |attr_label, attr_value|
134
- return false if data[get_attr_index(attr_label)] != attr_value
106
+ return false if get_attr_value(data, attr_label) != attr_value
135
107
  end
136
108
  return true
137
109
  end
138
110
 
139
- def get_attr_index(attr_label)
140
- return @data_labels.index(attr_label)
141
- end
142
-
143
- def get_attr_value(data, attr_label)
144
- return data[get_attr_index(attr_label)]
145
- end
146
-
147
111
  def build_rule(class_value, instances)
148
112
  rule = {:class_value => class_value, :conditions => {}}
149
113
  rule_instances = instances.collect {|data| data }
150
- attributes = @data_labels[0...-1].collect {|label| label }
114
+ attributes = @data_set.data_labels[0...-1].collect {|label| label }
151
115
  until(is_perfect(instances, rule) || attributes.empty?)
152
116
  freq_table = build_freq_table(rule_instances, attributes, class_value)
153
117
  condition = get_condition(freq_table)
@@ -223,7 +187,7 @@ module Ai4r
223
187
  end
224
188
 
225
189
  def then_clause(rule)
226
- "#{@data_labels.last} = '#{rule[:class_value]}'"
190
+ "#{@data_set.data_labels.last} = '#{rule[:class_value]}'"
227
191
  end
228
192
 
229
193
  end
@@ -7,10 +7,12 @@
7
7
  # the Mozilla Public License version 1.1 as published by the
8
8
  # Mozilla Foundation at http://www.mozilla.org/MPL/MPL-1.1.txt
9
9
 
10
- require File.dirname(__FILE__) + '/classifier_helper'
10
+ require File.dirname(__FILE__) + '/../data/data_set.rb'
11
+ require File.dirname(__FILE__) + '/../classifiers/classifier'
11
12
 
12
13
  module Ai4r
13
14
  module Classifiers
15
+
14
16
  # = Introduction
15
17
  #
16
18
  # The idea behind the ZeroR classifier is to identify the
@@ -18,53 +20,19 @@ module Ai4r
18
20
  # It always returns that value when evaluating an instance.
19
21
  # It is frequently used as a baseline for evaluating other machine learning
20
22
  # algorithms.
21
- class ZeroR
23
+ class ZeroR < Classifier
22
24
 
23
- attr_accessor :data_labels, :class_value
24
-
25
- include ClassifierHelper
25
+ attr_reader :data_set, :class_value
26
26
 
27
- # Build a new ZeroR classifier. If your data is classified with N attributed
28
- # and M examples, then your data examples must have the following format:
29
- #
30
- # [ [ATT1_VAL1, ATT2_VAL1, ATT3_VAL1, ... , ATTN_VAL1, CLASS_VAL1],
31
- # [ATT1_VAL2, ATT2_VAL2, ATT3_VAL2, ... , ATTN_VAL2, CLASS_VAL2],
32
- # ...
33
- # [ATTM1_VALM, ATT2_VALM, ATT3_VALM, ... , ATTN_VALM, CLASS_VALM],
34
- # ]
35
- #
36
- # e.g.
37
- # [ ['New York', '<30', 'M', 'Y'],
38
- # ['Chicago', '<30', 'M', 'Y'],
39
- # ['Chicago', '<30', 'F', 'Y'],
40
- # ['New York', '<30', 'M', 'Y'],
41
- # ['New York', '<30', 'M', 'Y'],
42
- # ['Chicago', '[30-50)', 'M', 'Y'],
43
- # ['New York', '[30-50)', 'F', 'N'],
44
- # ['Chicago', '[30-50)', 'F', 'Y'],
45
- # ['New York', '[30-50)', 'F', 'N'],
46
- # ['Chicago', '[50-80]', 'M', 'N'],
47
- # ['New York', '[50-80]', 'F', 'N'],
48
- # ['New York', '[50-80]', 'M', 'N'],
49
- # ['Chicago', '[50-80]', 'M', 'N'],
50
- # ['New York', '[50-80]', 'F', 'N'],
51
- # ['Chicago', '>80', 'F', 'Y']
52
- # ]
53
- #
54
- # Data labels must have the following format:
55
- # [ 'city', 'age_range', 'gender', 'marketing_target' ]
56
- #
57
- # If you do not provide labels for you data, the following labels will
58
- # be created by default:
59
- # [ 'attribute_1', 'attribute_2', 'attribute_3', 'class_value' ]
60
- #
61
- def build(data_examples, data_labels=nil)
62
- check_data_examples(data_examples)
63
- @data_labels = (data_labels) ? data_labels : default_data_labels(data_examples)
27
+ # Build a new ZeroR classifier. You must provide a DataSet instance
28
+ # as parameter.
29
+ def build(data_set)
30
+ data_set.check_not_empty
31
+ @data_set = data_set
64
32
  frequence = {}
65
33
  max_freq = 0
66
- @class_value
67
- data_examples.each do |example|
34
+ @class_value = nil
35
+ @data_set.data_items.each do |example|
68
36
  class_value = example.last
69
37
  class_frequency = frequence[class_value]
70
38
  class_frequency = (class_frequency) ? class_frequency+1 : 1
@@ -86,16 +54,16 @@ module Ai4r
86
54
  # This method returns the generated rules in ruby code.
87
55
  # e.g.
88
56
  #
89
- # classifier.to_s
57
+ # classifier.get_rules
90
58
  # # => marketing_target='Y'
91
59
  #
92
60
  # It is a nice way to inspect induction results, and also to execute them:
93
61
  # marketing_target = nil
94
- # eval classifier.to_s
62
+ # eval classifier.get_rules
95
63
  # puts marketing_target
96
64
  # # => 'Y'
97
- def to_s
98
- return "#{@data_labels.last} = '#{@class_value}'"
65
+ def get_rules
66
+ return "#{@data_set.data_labels.last} = '#{@class_value}'"
99
67
  end
100
68
 
101
69
  end
@@ -0,0 +1,115 @@
1
+ # Author:: Sergio Fierens (implementation)
2
+ # License:: MPL 1.1
3
+ # Project:: ai4r
4
+ # Url:: http://ai4r.rubyforge.org/
5
+ #
6
+ # You can redistribute it and/or modify it under the terms of
7
+ # the Mozilla Public License version 1.1 as published by the
8
+ # Mozilla Foundation at http://www.mozilla.org/MPL/MPL-1.1.txt
9
+
10
+ require "set"
11
+ require File.dirname(__FILE__) + '/../data/data_set'
12
+ require File.dirname(__FILE__) + '/../clusterers/k_means'
13
+
14
+ module Ai4r
15
+ module Clusterers
16
+
17
+ # The Bisecting k-means algorithm is a variation of the "k-means" algorithm,
18
+ # somewhat less sensitive to the initial selection of centroids than the
19
+ # original.
20
+ #
21
+ # More about K Means algorithm:
22
+ # http://en.wikipedia.org/wiki/K-means_algorithm
23
+ class BisectingKMeans < KMeans
24
+
25
+ attr_reader :data_set, :number_of_clusters, :clusters, :centroids
26
+ attr_accessor :max_iterations, :distance_function, :refine
27
+
28
+ def intialize
29
+ @refine = true
30
+ end
31
+
32
+ # Build a new clusterer, using data examples found in data_set.
33
+ # Items will be clustered in "number_of_clusters" different
34
+ # clusters.
35
+ def build(data_set, number_of_clusters)
36
+ @data_set = data_set
37
+ @number_of_clusters = number_of_clusters
38
+
39
+ @clusters = [@data_set]
40
+ @centroids = [@data_set.get_mean_or_mode]
41
+ while @clusters.length < @number_of_clusters
42
+ biggest_cluster_index = find_biggest_cluster_index(@clusters)
43
+ clusterer = KMeans.new.
44
+ set_parameters(get_parameters).
45
+ build(@clusters[biggest_cluster_index], 2)
46
+ @clusters.delete_at(biggest_cluster_index)
47
+ @centroids.delete_at(biggest_cluster_index)
48
+ @clusters.concat(clusterer.clusters)
49
+ @centroids.concat(clusterer.centroids)
50
+ end
51
+
52
+ super if @refine
53
+
54
+ return self
55
+ end
56
+
57
+ # Get info on what can be parameterized on this clusterer algorithm.
58
+ # It returns a hash with the following format:
59
+ # { :param_name => "Info on the parameter" }
60
+ def get_parameters_info
61
+ { :max_iterations => "Maximum number of iterations used to bisect a " +
62
+ "cluster. By default it is uncapped.",
63
+ :distance_function => "Custom implementation of distance function. " +
64
+ "It must be a closure receiving two data items and return the " +
65
+ "distance bewteen them. By default, this algorithm uses " +
66
+ "ecuclidean distance of numeric attributes to the power of 2.",
67
+ :refine => "Boolean value. True by default. It will run the " +
68
+ "classic K Means algorithm, using as initial centroids the " +
69
+ "result of the bisecting approach."
70
+ }
71
+ end
72
+
73
+ # Set parameters on this clusterer instance.
74
+ # You must provide a hash with the following format:
75
+ # { :param_name => parameter_value }
76
+ #
77
+ # Use get_parameters_info to know what parameters are accepted.
78
+ def set_parameters(parameters)
79
+ super
80
+ if parameters.has_key?(:refine)
81
+ @refine = parameters[:refine]
82
+ end
83
+ return self
84
+ end
85
+
86
+ # Get parameter values on this clusterer instance.
87
+ # Returns a hash with the following format:
88
+ # { :param_name => parameter_value }
89
+ def get_parameters
90
+ params = super
91
+ params[:refine] = @refine
92
+ return params
93
+ end
94
+
95
+ protected
96
+ def calc_initial_centroids
97
+ @centroids # Use existing centroids
98
+ end
99
+
100
+ def find_biggest_cluster_index(clusters)
101
+ max_index = 0
102
+ max_length = 0
103
+ clusters.each_index do |cluster_index|
104
+ cluster = clusters[cluster_index]
105
+ if max_length < cluster.data_items.length
106
+ max_length = cluster.data_items.length
107
+ max_index = cluster_index
108
+ end
109
+ end
110
+ return max_index
111
+ end
112
+
113
+ end
114
+ end
115
+ end
@@ -0,0 +1,55 @@
1
+ # Author:: Sergio Fierens
2
+ # License:: MPL 1.1
3
+ # Project:: ai4r
4
+ # Url:: http://ai4r.rubyforge.org/
5
+ #
6
+ # You can redistribute it and/or modify it under the terms of
7
+ # the Mozilla Public License version 1.1 as published by the
8
+ # Mozilla Foundation at http://www.mozilla.org/MPL/MPL-1.1.txt
9
+
10
+ module Ai4r
11
+ module Clusterers
12
+
13
+ # The purpose of this class is to define a common API for Clusterers.
14
+ # All methods in this class (other than eval) must be implemented in
15
+ # subclasses.
16
+ class Clusterer
17
+
18
+ # Build a new clusterer, using data examples found in data_set.
19
+ # Data items will be clustered in "number_of_clusters" different
20
+ # clusters.
21
+ def build(data_set, number_of_clusters)
22
+ raise NotImplementedError
23
+ end
24
+
25
+ # Classifies the given data item, returning the cluster it belongs to.
26
+ def eval(data_item)
27
+ raise NotImplementedError
28
+ end
29
+
30
+ # Get info on what can be parameterized on this clusterer.
31
+ # It returns a hash with the following format:
32
+ # { :param_name => "Info on the parameter" }
33
+ def get_parameters_info
34
+ raise NotImplementedError
35
+ end
36
+
37
+ # Set parameter values on this clusterer instance.
38
+ # You must provide a hash with the folowing format:
39
+ # { :param_name => parameter_value }
40
+ def set_parameters(parameters)
41
+ raise NotImplementedError
42
+ end
43
+
44
+ # Get parameter values on this clusterer instance.
45
+ # Returns a hash with the folowing format:
46
+ # { :param_name => parameter_value }
47
+ def get_parameters
48
+ raise NotImplementedError
49
+ end
50
+
51
+
52
+ end
53
+
54
+ end
55
+ end
@@ -0,0 +1,164 @@
1
+ # Author:: Sergio Fierens (implementation)
2
+ # License:: MPL 1.1
3
+ # Project:: ai4r
4
+ # Url:: http://ai4r.rubyforge.org/
5
+ #
6
+ # You can redistribute it and/or modify it under the terms of
7
+ # the Mozilla Public License version 1.1 as published by the
8
+ # Mozilla Foundation at http://www.mozilla.org/MPL/MPL-1.1.txt
9
+
10
+ require "set"
11
+ require File.dirname(__FILE__) + '/../data/data_set'
12
+ require File.dirname(__FILE__) + '/../clusterers/clusterer'
13
+
14
+ module Ai4r
15
+ module Clusterers
16
+
17
+ # The k-means algorithm is an algorithm to cluster n objects
18
+ # based on attributes into k partitions, with k < n.
19
+ #
20
+ # More about K Means algorithm:
21
+ # http://en.wikipedia.org/wiki/K-means_algorithm
22
+ class KMeans < Clusterer
23
+
24
+ attr_reader :data_set, :number_of_clusters
25
+ attr_reader :clusters, :centroids, :iterations
26
+ attr_accessor :max_iterations
27
+ attr_accessor :distance_function
28
+
29
+ # Build a new clusterer, using data examples found in data_set.
30
+ # Items will be clustered in "number_of_clusters" different
31
+ # clusters.
32
+ def build(data_set, number_of_clusters)
33
+ @data_set = data_set
34
+ @number_of_clusters = number_of_clusters
35
+ @iterations = 0
36
+
37
+ calc_initial_centroids
38
+ while(not stop_criteria_met)
39
+ calculate_membership_clusters
40
+ recompute_centroids
41
+ end
42
+
43
+ return self
44
+ end
45
+
46
+ # Classifies the given data item, returning the cluster index it belongs
47
+ # to (0-based).
48
+ def eval(data_item)
49
+ get_min_index(@centroids.collect {|centroid|
50
+ distance(data_item, centroid)})
51
+ end
52
+
53
+ # Get info on what can be parameterized on this clusterer algorithm.
54
+ # It returns a hash with the following format:
55
+ # { :param_name => "Info on the parameter" }
56
+ def get_parameters_info
57
+ { :max_iterations => "Maximum number of iterations to build the " +
58
+ "clusterer. By default it is uncapped.",
59
+ :distance_function => "Custom implementation of distance function. " +
60
+ "It must be a closure receiving two data items and return the " +
61
+ "distance bewteen them. By default, this algorithm uses " +
62
+ "ecuclidean distance of numeric attributes to the power of 2."
63
+ }
64
+ end
65
+
66
+ # Set parameters on this clusterer instance.
67
+ # You must provide a hash with the following format:
68
+ # { :param_name => parameter_value }
69
+ #
70
+ # Use get_parameters_info to know what parameters are accepted.
71
+ def set_parameters(parameters)
72
+ if parameters.has_key?(:max_iterations)
73
+ @max_iterations = parameters[:max_iterations]
74
+ end
75
+ if parameters.has_key?(:distance_function)
76
+ @distance_function = parameters[:distance_function]
77
+ end
78
+ return self
79
+ end
80
+
81
+ # Get parameter values on this clusterer instance.
82
+ # Returns a hash with the following format:
83
+ # { :param_name => parameter_value }
84
+ def get_parameters
85
+ { :max_iterations => @max_iterations,
86
+ :distance_function => @distance_function }
87
+ end
88
+
89
+ # This function calculates the distance between 2 different
90
+ # instances. By default, it returns the euclidean distance to the
91
+ # power of 2.
92
+ # You can provide a more convenient distance implementation:
93
+ #
94
+ # 1- Overwriting this method
95
+ #
96
+ # 2- Providing a closure to the :distance_function parameter
97
+ def distance(a, b)
98
+ return @distance_function.call(a, b) if @distance_function
99
+ return euclidean_distance(a, b)
100
+ end
101
+
102
+ protected
103
+ def euclidean_distance(a, b)
104
+ dist = 0.0
105
+ a.each_index do |index|
106
+ if a[index].is_a?(Numeric) && b[index].is_a?(Numeric)
107
+ dist = dist + ((a[index]-b[index])*(a[index]-b[index]))
108
+ end
109
+ end
110
+ return dist
111
+ end
112
+
113
+ def calc_initial_centroids
114
+ @centroids = []
115
+ tried_indexes = []
116
+ while @centroids.length < @number_of_clusters &&
117
+ tried_indexes.length < @data_set.data_items.length
118
+ random_index = rand(@data_set.data_items.length)
119
+ if !tried_indexes.include?(random_index)
120
+ tried_indexes << random_index
121
+ if !@centroids.include? @data_set.data_items[random_index]
122
+ @centroids << @data_set.data_items[random_index]
123
+ end
124
+ end
125
+ end
126
+ @number_of_clusters = @centroids.length
127
+ end
128
+
129
+ def stop_criteria_met
130
+ @old_centroids == @centroids ||
131
+ (@max_iterations && (@max_iterations <= @iterations))
132
+ end
133
+
134
+ def calculate_membership_clusters
135
+ @clusters = Array.new(@number_of_clusters) do
136
+ Ai4r::Data::DataSet.new :data_labels => @data_set.data_labels
137
+ end
138
+ @data_set.data_items.each do |data_item|
139
+ @clusters[eval(data_item)] << data_item
140
+ end
141
+ end
142
+
143
+ def recompute_centroids
144
+ @old_centroids = @centroids
145
+ @centroids = @clusters.collect { |cluster| cluster.get_mean_or_mode }
146
+ @iterations += 1
147
+ end
148
+
149
+ def get_min_index(array)
150
+ min = array.first
151
+ index = 0
152
+ array.each_index do |i|
153
+ x = array[i]
154
+ if x < min
155
+ min = x
156
+ index = i
157
+ end
158
+ end
159
+ return index
160
+ end
161
+
162
+ end
163
+ end
164
+ end