ai4r 1.2 → 1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (139) hide show
  1. data/README.rdoc +12 -25
  2. data/examples/decision_trees/id3_example.rb +6 -9
  3. data/examples/decision_trees/results.txt +2 -0
  4. data/examples/genetic_algorithm/genetic_algorithm_example.rb +11 -13
  5. data/examples/neural_network/xor_example.rb +25 -0
  6. data/lib/ai4r.rb +10 -0
  7. data/lib/ai4r/classifiers/classifier.rb +46 -0
  8. data/lib/ai4r/classifiers/id3.rb +27 -58
  9. data/lib/ai4r/classifiers/one_r.rb +19 -58
  10. data/lib/ai4r/classifiers/prism.rb +21 -57
  11. data/lib/ai4r/classifiers/zero_r.rb +16 -48
  12. data/lib/ai4r/clusterers/bisecting_k_means.rb +115 -0
  13. data/lib/ai4r/clusterers/clusterer.rb +55 -0
  14. data/lib/ai4r/clusterers/k_means.rb +164 -0
  15. data/lib/ai4r/data/data_set.rb +250 -0
  16. data/lib/ai4r/genetic_algorithm/genetic_algorithm.rb +19 -19
  17. data/lib/ai4r/neural_network/backpropagation.rb +23 -24
  18. data/site/build/site/en/broken-links.xml +2 -0
  19. data/site/build/site/en/downloads.html +200 -0
  20. data/site/build/site/en/downloads.pdf +151 -0
  21. data/site/build/site/en/forum.html +197 -0
  22. data/site/build/site/en/forum.pdf +151 -0
  23. data/site/build/site/en/geneticAlgorithms.html +591 -0
  24. data/site/build/site/en/geneticAlgorithms.pdf +934 -0
  25. data/site/build/site/en/images/ai4r-logo.png +0 -0
  26. data/site/build/site/en/images/built-with-forrest-button.png +0 -0
  27. data/site/build/site/en/images/c.png +0 -0
  28. data/site/build/site/en/images/c_wbn.png +0 -0
  29. data/site/build/site/en/images/c_wn.png +0 -0
  30. data/site/build/site/en/images/ero.gif +0 -0
  31. data/site/build/site/en/images/europe2.png +0 -0
  32. data/site/build/site/en/images/europe3.png +0 -0
  33. data/site/build/site/en/images/fitness.png +0 -0
  34. data/site/build/site/en/images/genetic_algorithms_example.png +0 -0
  35. data/site/build/site/en/images/instruction_arrow.png +0 -0
  36. data/site/build/site/en/images/jadeferret.png +0 -0
  37. data/site/build/site/en/images/my_email.png +0 -0
  38. data/site/build/site/en/images/neural_network_example.png +0 -0
  39. data/site/build/site/en/images/rubyforge.png +0 -0
  40. data/site/build/site/en/images/s.png +0 -0
  41. data/site/build/site/en/images/s_wbn.png +0 -0
  42. data/site/build/site/en/images/s_wn.png +0 -0
  43. data/site/build/site/en/images/sigmoid.png +0 -0
  44. data/site/build/site/en/images/t.png +0 -0
  45. data/site/build/site/en/images/t_wbn.png +0 -0
  46. data/site/build/site/en/images/t_wn.png +0 -0
  47. data/site/build/site/en/index.html +336 -0
  48. data/site/build/site/en/index.pdf +508 -0
  49. data/site/build/site/en/linkmap.html +263 -0
  50. data/site/build/site/en/linkmap.pdf +94 -0
  51. data/site/build/site/en/locationmap.xml +72 -0
  52. data/site/build/site/en/machineLearning.html +339 -0
  53. data/site/build/site/en/machineLearning.pdf +337 -0
  54. data/site/build/site/en/neuralNetworks.html +484 -0
  55. data/site/build/site/en/neuralNetworks.pdf +604 -0
  56. data/site/build/site/en/skin/CommonMessages_de.xml +23 -0
  57. data/site/build/site/en/skin/CommonMessages_en_US.xml +23 -0
  58. data/site/build/site/en/skin/CommonMessages_es.xml +23 -0
  59. data/site/build/site/en/skin/CommonMessages_fr.xml +23 -0
  60. data/site/build/site/en/skin/basic.css +166 -0
  61. data/site/build/site/en/skin/breadcrumbs-optimized.js +90 -0
  62. data/site/build/site/en/skin/breadcrumbs.js +237 -0
  63. data/site/build/site/en/skin/fontsize.js +166 -0
  64. data/site/build/site/en/skin/getBlank.js +40 -0
  65. data/site/build/site/en/skin/getMenu.js +45 -0
  66. data/site/build/site/en/skin/images/README.txt +1 -0
  67. data/site/build/site/en/skin/images/add.jpg +0 -0
  68. data/site/build/site/en/skin/images/built-with-forrest-button.png +0 -0
  69. data/site/build/site/en/skin/images/chapter.gif +0 -0
  70. data/site/build/site/en/skin/images/chapter_open.gif +0 -0
  71. data/site/build/site/en/skin/images/current.gif +0 -0
  72. data/site/build/site/en/skin/images/error.png +0 -0
  73. data/site/build/site/en/skin/images/external-link.gif +0 -0
  74. data/site/build/site/en/skin/images/fix.jpg +0 -0
  75. data/site/build/site/en/skin/images/forrest-credit-logo.png +0 -0
  76. data/site/build/site/en/skin/images/hack.jpg +0 -0
  77. data/site/build/site/en/skin/images/header_white_line.gif +0 -0
  78. data/site/build/site/en/skin/images/info.png +0 -0
  79. data/site/build/site/en/skin/images/instruction_arrow.png +0 -0
  80. data/site/build/site/en/skin/images/label.gif +0 -0
  81. data/site/build/site/en/skin/images/page.gif +0 -0
  82. data/site/build/site/en/skin/images/pdfdoc.gif +0 -0
  83. data/site/build/site/en/skin/images/poddoc.png +0 -0
  84. data/site/build/site/en/skin/images/printer.gif +0 -0
  85. data/site/build/site/en/skin/images/rc-b-l-15-1body-2menu-3menu.png +0 -0
  86. data/site/build/site/en/skin/images/rc-b-r-15-1body-2menu-3menu.png +0 -0
  87. data/site/build/site/en/skin/images/rc-b-r-5-1header-2tab-selected-3tab-selected.png +0 -0
  88. data/site/build/site/en/skin/images/rc-t-l-5-1header-2searchbox-3searchbox.png +0 -0
  89. data/site/build/site/en/skin/images/rc-t-l-5-1header-2tab-selected-3tab-selected.png +0 -0
  90. data/site/build/site/en/skin/images/rc-t-l-5-1header-2tab-unselected-3tab-unselected.png +0 -0
  91. data/site/build/site/en/skin/images/rc-t-r-15-1body-2menu-3menu.png +0 -0
  92. data/site/build/site/en/skin/images/rc-t-r-5-1header-2searchbox-3searchbox.png +0 -0
  93. data/site/build/site/en/skin/images/rc-t-r-5-1header-2tab-selected-3tab-selected.png +0 -0
  94. data/site/build/site/en/skin/images/rc-t-r-5-1header-2tab-unselected-3tab-unselected.png +0 -0
  95. data/site/build/site/en/skin/images/remove.jpg +0 -0
  96. data/site/build/site/en/skin/images/rss.png +0 -0
  97. data/site/build/site/en/skin/images/spacer.gif +0 -0
  98. data/site/build/site/en/skin/images/success.png +0 -0
  99. data/site/build/site/en/skin/images/txtdoc.png +0 -0
  100. data/site/build/site/en/skin/images/update.jpg +0 -0
  101. data/site/build/site/en/skin/images/valid-html401.png +0 -0
  102. data/site/build/site/en/skin/images/vcss.png +0 -0
  103. data/site/build/site/en/skin/images/warning.png +0 -0
  104. data/site/build/site/en/skin/images/xmldoc.gif +0 -0
  105. data/site/build/site/en/skin/menu.js +48 -0
  106. data/site/build/site/en/skin/note.txt +50 -0
  107. data/site/build/site/en/skin/print.css +54 -0
  108. data/site/build/site/en/skin/profile.css +163 -0
  109. data/site/build/site/en/skin/prototype.js +1257 -0
  110. data/site/build/site/en/skin/screen.css +587 -0
  111. data/site/build/site/en/svn.html +252 -0
  112. data/site/build/site/en/svn.pdf +306 -0
  113. data/site/build/site/en/wholesite.pdf +1915 -0
  114. data/site/build/tmp/brokenlinks.xml +2 -0
  115. data/site/build/tmp/cocoon-work/cache-dir/cocoon-ehcache-1.data +0 -0
  116. data/site/build/tmp/cocoon-work/cache-dir/cocoon-ehcache-1.index +0 -0
  117. data/site/build/tmp/locationmap.xml +14 -14
  118. data/site/build/tmp/output.xmap +23 -23
  119. data/site/build/tmp/pluginlist2fetchbuild.xml +144 -144
  120. data/site/build/tmp/projfilters.properties +41 -41
  121. data/site/build/webapp/WEB-INF/logs/core.log +593 -679
  122. data/site/build/webapp/WEB-INF/logs/error.log +362 -279
  123. data/site/build/webapp/WEB-INF/logs/sitemap.log +368 -1015
  124. data/site/src/documentation/content/xdocs/index.xml +18 -10
  125. data/site/src/documentation/content/xdocs/machineLearning.xml +4 -3
  126. data/site/src/documentation/content/xdocs/site.xml +2 -1
  127. data/site/src/documentation/resources/images/sigmoid.png +0 -0
  128. data/test/classifiers/id3_test.rb +45 -44
  129. data/test/classifiers/one_r_test.rb +19 -17
  130. data/test/classifiers/prism_test.rb +22 -20
  131. data/test/classifiers/zero_r_test.rb +15 -12
  132. data/test/clusterers/bisecting_k_means_test.rb +59 -0
  133. data/test/clusterers/k_means_test.rb +93 -0
  134. data/test/data/data_set_test.rb +92 -0
  135. metadata +252 -128
  136. data/lib/ai4r/classifiers/classifier_helper.rb +0 -54
  137. data/site/src/documentation/content/xdocs/forum.html +0 -9
  138. data/site/src/documentation/resources/images/Thumbs.db +0 -0
  139. data/site/src/documentation/resources/images/sub-dir/Thumbs.db +0 -0
@@ -11,7 +11,8 @@
11
11
  # J. Cendrowska (1987). PRISM: An algorithm for inducing modular rules.
12
12
  # International Journal of Man-Machine Studies. 27(4):349-370.
13
13
 
14
- require File.dirname(__FILE__) + '/classifier_helper'
14
+ require File.dirname(__FILE__) + '/../data/data_set'
15
+ require File.dirname(__FILE__) + '/../classifiers/classifier'
15
16
 
16
17
  module Ai4r
17
18
  module Classifiers
@@ -23,50 +24,17 @@ module Ai4r
23
24
  #
24
25
  # J. Cendrowska (1987). PRISM: An algorithm for inducing modular rules.
25
26
  # International Journal of Man-Machine Studies. 27(4):349-370.
26
- class Prism
27
+ class Prism < Classifier
27
28
 
28
- attr_accessor :data_labels, :rules
29
- include ClassifierHelper
29
+ attr_reader :data_set, :rules
30
30
 
31
- # Build a new Prism classifier. If your data is classified with N attributed
32
- # and M examples, then your data examples must have the following format:
33
- #
34
- # [ [ATT1_VAL1, ATT2_VAL1, ATT3_VAL1, ... , ATTN_VAL1, CLASS_VAL1],
35
- # [ATT1_VAL2, ATT2_VAL2, ATT3_VAL2, ... , ATTN_VAL2, CLASS_VAL2],
36
- # ...
37
- # [ATTM1_VALM, ATT2_VALM, ATT3_VALM, ... , ATTN_VALM, CLASS_VALM],
38
- # ]
39
- #
40
- # e.g.
41
- # [ ['New York', '<30', 'M', 'Y'],
42
- # ['Chicago', '<30', 'M', 'Y'],
43
- # ['Chicago', '<30', 'F', 'Y'],
44
- # ['New York', '<30', 'M', 'Y'],
45
- # ['New York', '<30', 'M', 'Y'],
46
- # ['Chicago', '[30-50)', 'M', 'Y'],
47
- # ['New York', '[30-50)', 'F', 'N'],
48
- # ['Chicago', '[30-50)', 'F', 'Y'],
49
- # ['New York', '[30-50)', 'F', 'N'],
50
- # ['Chicago', '[50-80]', 'M', 'N'],
51
- # ['New York', '[50-80]', 'F', 'N'],
52
- # ['New York', '[50-80]', 'M', 'N'],
53
- # ['Chicago', '[50-80]', 'M', 'N'],
54
- # ['New York', '[50-80]', 'F', 'N'],
55
- # ['Chicago', '>80', 'F', 'Y']
56
- # ]
57
- #
58
- # Data labels must have the following format:
59
- # [ 'city', 'age_range', 'gender', 'marketing_target' ]
60
- #
61
- # If you do not provide labels for you data, the following labels will
62
- # be created by default:
63
- # [ 'attribute_1', 'attribute_2', 'attribute_3', 'class_value' ]
64
- #
65
- def build(data_examples, data_labels=nil)
66
- check_data_examples(data_examples)
67
- @data_labels = (data_labels) ? data_labels : default_data_labels(data_examples)
68
- domains = build_domains(data_examples)
69
- instances = data_examples.collect {|data| data }
31
+ # Build a new Prism classifier. You must provide a DataSet instance
32
+ # as parameter.
33
+ def build(data_set)
34
+ data_set.check_not_empty
35
+ @data_set = data_set
36
+ domains = @data_set.build_domains
37
+ instances = @data_set.data_items.collect {|data| data }
70
38
  @rules = []
71
39
  domains.last.each do |class_value|
72
40
  while(has_class_value(instances, class_value))
@@ -91,7 +59,7 @@ module Ai4r
91
59
  # This method returns the generated rules in ruby code.
92
60
  # e.g.
93
61
  #
94
- # classifier.to_s
62
+ # classifier.get_rules
95
63
  # # => if age_range == '<30' then marketing_target = 'Y'
96
64
  # elsif age_range == '>80' then marketing_target = 'Y'
97
65
  # elsif city == 'Chicago' and age_range == '[30-50)' then marketing_target = 'Y'
@@ -101,10 +69,10 @@ module Ai4r
101
69
  # It is a nice way to inspect induction results, and also to execute them:
102
70
  # age_range = '[30-50)'
103
71
  # city = 'New York'
104
- # eval(classifier.to_s)
72
+ # eval(classifier.get_rules)
105
73
  # puts marketing_target
106
74
  # 'Y'
107
- def to_s
75
+ def get_rules
108
76
  out = "if #{join_terms(@rules.first)} then #{then_clause(@rules.first)}"
109
77
  @rules[1...-1].each do |rule|
110
78
  out += "\nelsif #{join_terms(rule)} then #{then_clause(rule)}"
@@ -116,6 +84,10 @@ module Ai4r
116
84
 
117
85
  protected
118
86
 
87
+ def get_attr_value(data, attr)
88
+ data[@data_set.get_index(attr)]
89
+ end
90
+
119
91
  def has_class_value(instances, class_value)
120
92
  instances.each { |data| return true if data.last == class_value}
121
93
  return false
@@ -131,23 +103,15 @@ module Ai4r
131
103
 
132
104
  def matches_conditions(data, conditions)
133
105
  conditions.each_pair do |attr_label, attr_value|
134
- return false if data[get_attr_index(attr_label)] != attr_value
106
+ return false if get_attr_value(data, attr_label) != attr_value
135
107
  end
136
108
  return true
137
109
  end
138
110
 
139
- def get_attr_index(attr_label)
140
- return @data_labels.index(attr_label)
141
- end
142
-
143
- def get_attr_value(data, attr_label)
144
- return data[get_attr_index(attr_label)]
145
- end
146
-
147
111
  def build_rule(class_value, instances)
148
112
  rule = {:class_value => class_value, :conditions => {}}
149
113
  rule_instances = instances.collect {|data| data }
150
- attributes = @data_labels[0...-1].collect {|label| label }
114
+ attributes = @data_set.data_labels[0...-1].collect {|label| label }
151
115
  until(is_perfect(instances, rule) || attributes.empty?)
152
116
  freq_table = build_freq_table(rule_instances, attributes, class_value)
153
117
  condition = get_condition(freq_table)
@@ -223,7 +187,7 @@ module Ai4r
223
187
  end
224
188
 
225
189
  def then_clause(rule)
226
- "#{@data_labels.last} = '#{rule[:class_value]}'"
190
+ "#{@data_set.data_labels.last} = '#{rule[:class_value]}'"
227
191
  end
228
192
 
229
193
  end
@@ -7,10 +7,12 @@
7
7
  # the Mozilla Public License version 1.1 as published by the
8
8
  # Mozilla Foundation at http://www.mozilla.org/MPL/MPL-1.1.txt
9
9
 
10
- require File.dirname(__FILE__) + '/classifier_helper'
10
+ require File.dirname(__FILE__) + '/../data/data_set.rb'
11
+ require File.dirname(__FILE__) + '/../classifiers/classifier'
11
12
 
12
13
  module Ai4r
13
14
  module Classifiers
15
+
14
16
  # = Introduction
15
17
  #
16
18
  # The idea behind the ZeroR classifier is to identify the
@@ -18,53 +20,19 @@ module Ai4r
18
20
  # It always returns that value when evaluating an instance.
19
21
  # It is frequently used as a baseline for evaluating other machine learning
20
22
  # algorithms.
21
- class ZeroR
23
+ class ZeroR < Classifier
22
24
 
23
- attr_accessor :data_labels, :class_value
24
-
25
- include ClassifierHelper
25
+ attr_reader :data_set, :class_value
26
26
 
27
- # Build a new ZeroR classifier. If your data is classified with N attributed
28
- # and M examples, then your data examples must have the following format:
29
- #
30
- # [ [ATT1_VAL1, ATT2_VAL1, ATT3_VAL1, ... , ATTN_VAL1, CLASS_VAL1],
31
- # [ATT1_VAL2, ATT2_VAL2, ATT3_VAL2, ... , ATTN_VAL2, CLASS_VAL2],
32
- # ...
33
- # [ATTM1_VALM, ATT2_VALM, ATT3_VALM, ... , ATTN_VALM, CLASS_VALM],
34
- # ]
35
- #
36
- # e.g.
37
- # [ ['New York', '<30', 'M', 'Y'],
38
- # ['Chicago', '<30', 'M', 'Y'],
39
- # ['Chicago', '<30', 'F', 'Y'],
40
- # ['New York', '<30', 'M', 'Y'],
41
- # ['New York', '<30', 'M', 'Y'],
42
- # ['Chicago', '[30-50)', 'M', 'Y'],
43
- # ['New York', '[30-50)', 'F', 'N'],
44
- # ['Chicago', '[30-50)', 'F', 'Y'],
45
- # ['New York', '[30-50)', 'F', 'N'],
46
- # ['Chicago', '[50-80]', 'M', 'N'],
47
- # ['New York', '[50-80]', 'F', 'N'],
48
- # ['New York', '[50-80]', 'M', 'N'],
49
- # ['Chicago', '[50-80]', 'M', 'N'],
50
- # ['New York', '[50-80]', 'F', 'N'],
51
- # ['Chicago', '>80', 'F', 'Y']
52
- # ]
53
- #
54
- # Data labels must have the following format:
55
- # [ 'city', 'age_range', 'gender', 'marketing_target' ]
56
- #
57
- # If you do not provide labels for you data, the following labels will
58
- # be created by default:
59
- # [ 'attribute_1', 'attribute_2', 'attribute_3', 'class_value' ]
60
- #
61
- def build(data_examples, data_labels=nil)
62
- check_data_examples(data_examples)
63
- @data_labels = (data_labels) ? data_labels : default_data_labels(data_examples)
27
+ # Build a new ZeroR classifier. You must provide a DataSet instance
28
+ # as parameter.
29
+ def build(data_set)
30
+ data_set.check_not_empty
31
+ @data_set = data_set
64
32
  frequence = {}
65
33
  max_freq = 0
66
- @class_value
67
- data_examples.each do |example|
34
+ @class_value = nil
35
+ @data_set.data_items.each do |example|
68
36
  class_value = example.last
69
37
  class_frequency = frequence[class_value]
70
38
  class_frequency = (class_frequency) ? class_frequency+1 : 1
@@ -86,16 +54,16 @@ module Ai4r
86
54
  # This method returns the generated rules in ruby code.
87
55
  # e.g.
88
56
  #
89
- # classifier.to_s
57
+ # classifier.get_rules
90
58
  # # => marketing_target='Y'
91
59
  #
92
60
  # It is a nice way to inspect induction results, and also to execute them:
93
61
  # marketing_target = nil
94
- # eval classifier.to_s
62
+ # eval classifier.get_rules
95
63
  # puts marketing_target
96
64
  # # => 'Y'
97
- def to_s
98
- return "#{@data_labels.last} = '#{@class_value}'"
65
+ def get_rules
66
+ return "#{@data_set.data_labels.last} = '#{@class_value}'"
99
67
  end
100
68
 
101
69
  end
@@ -0,0 +1,115 @@
1
+ # Author:: Sergio Fierens (implementation)
2
+ # License:: MPL 1.1
3
+ # Project:: ai4r
4
+ # Url:: http://ai4r.rubyforge.org/
5
+ #
6
+ # You can redistribute it and/or modify it under the terms of
7
+ # the Mozilla Public License version 1.1 as published by the
8
+ # Mozilla Foundation at http://www.mozilla.org/MPL/MPL-1.1.txt
9
+
10
+ require "set"
11
+ require File.dirname(__FILE__) + '/../data/data_set'
12
+ require File.dirname(__FILE__) + '/../clusterers/k_means'
13
+
14
+ module Ai4r
15
+ module Clusterers
16
+
17
+ # The Bisecting k-means algorithm is a variation of the "k-means" algorithm,
18
+ # somewhat less sensitive to the initial selection of centroids than the
19
+ # original.
20
+ #
21
+ # More about K Means algorithm:
22
+ # http://en.wikipedia.org/wiki/K-means_algorithm
23
class BisectingKMeans < KMeans

  attr_reader :data_set, :number_of_clusters, :clusters, :centroids
  attr_accessor :max_iterations, :distance_function, :refine

  # Creates a clusterer with the refinement pass switched on, which is the
  # default promised by get_parameters_info.
  def initialize
    super()
    @refine = true
  end

  # Misspelled name under which the constructor used to be defined. Ruby
  # never invoked it from +new+, so @refine stayed nil and the refinement
  # pass was skipped unless it was requested explicitly. Kept only so that
  # code calling it by hand keeps working.
  def intialize
    @refine = true
  end

  # Build a new clusterer, using data examples found in data_set.
  # Items will be clustered in "number_of_clusters" different
  # clusters.
  #
  # Starting from a single cluster holding the whole data set, the biggest
  # cluster is repeatedly split in two with a plain KMeans run until the
  # requested number of clusters is reached. When @refine is set, the
  # resulting centroids seed a final KMeans pass over the whole data set.
  #
  # Returns self.
  def build(data_set, number_of_clusters)
    @data_set = data_set
    @number_of_clusters = number_of_clusters

    @clusters = [@data_set]
    @centroids = [@data_set.get_mean_or_mode]
    while @clusters.length < @number_of_clusters
      biggest_cluster_index = find_biggest_cluster_index(@clusters)
      clusterer = KMeans.new.
        set_parameters(get_parameters).
        build(@clusters[biggest_cluster_index], 2)
      halves = clusterer.clusters
      # KMeans drops duplicated initial centroids, so a cluster made of
      # identical (or a single) item comes back as one cluster. Swapping it
      # for itself would never increase the count: stop instead of looping
      # forever.
      break if halves.nil? || halves.length < 2
      @clusters.delete_at(biggest_cluster_index)
      @centroids.delete_at(biggest_cluster_index)
      @clusters.concat(halves)
      @centroids.concat(clusterer.centroids)
    end

    # Report the number of clusters actually produced, and keep the
    # refinement pass in step with the centroids it starts from.
    @number_of_clusters = @clusters.length
    super(@data_set, @number_of_clusters) if @refine

    self
  end

  # Get info on what can be parameterized on this clusterer algorithm.
  # It returns a hash with the following format:
  # { :param_name => "Info on the parameter" }
  def get_parameters_info
    { :max_iterations => "Maximum number of iterations used to bisect a " +
        "cluster. By default it is uncapped.",
      :distance_function => "Custom implementation of distance function. " +
        "It must be a closure receiving two data items and return the " +
        "distance bewteen them. By default, this algorithm uses " +
        "ecuclidean distance of numeric attributes to the power of 2.",
      :refine => "Boolean value. True by default. It will run the " +
        "classic K Means algorithm, using as initial centroids the " +
        "result of the bisecting approach."
    }
  end

  # Set parameters on this clusterer instance.
  # You must provide a hash with the following format:
  # { :param_name => parameter_value }
  #
  # Use get_parameters_info to know what parameters are accepted.
  # Returns self, so calls can be chained.
  def set_parameters(parameters)
    super
    @refine = parameters[:refine] if parameters.has_key?(:refine)
    self
  end

  # Get parameter values on this clusterer instance.
  # Returns a hash with the following format:
  # { :param_name => parameter_value }
  def get_parameters
    params = super
    params[:refine] = @refine
    params
  end

  protected

  # The refinement pass starts from the centroids produced by bisecting,
  # instead of picking random data items.
  def calc_initial_centroids
    @centroids # Use existing centroids
  end

  # Returns the index of the cluster holding the most data items. The first
  # one wins on ties; 0 is returned when every cluster is empty.
  def find_biggest_cluster_index(clusters)
    max_index = 0
    max_length = 0
    clusters.each_with_index do |cluster, cluster_index|
      length = cluster.data_items.length
      if max_length < length
        max_length = length
        max_index = cluster_index
      end
    end
    max_index
  end

end
114
+ end
115
+ end
@@ -0,0 +1,55 @@
1
+ # Author:: Sergio Fierens
2
+ # License:: MPL 1.1
3
+ # Project:: ai4r
4
+ # Url:: http://ai4r.rubyforge.org/
5
+ #
6
+ # You can redistribute it and/or modify it under the terms of
7
+ # the Mozilla Public License version 1.1 as published by the
8
+ # Mozilla Foundation at http://www.mozilla.org/MPL/MPL-1.1.txt
9
+
10
+ module Ai4r
11
+ module Clusterers
12
+
13
+ # The purpose of this class is to define a common API for Clusterers.
14
+ # All methods in this class (other than eval) must be implemented in
15
+ # subclasses.
16
+ class Clusterer
17
+
18
+ # Build a new clusterer, using data examples found in data_set.
19
+ # Data items will be clustered in "number_of_clusters" different
20
+ # clusters. Abstract here: raises NotImplementedError until overridden.
21
+ def build(data_set, number_of_clusters)
22
+ raise NotImplementedError
23
+ end
24
+
25
+ # Classifies the given data item, returning the cluster it belongs to. Raises NotImplementedError here.
26
+ def eval(data_item)
27
+ raise NotImplementedError
28
+ end
29
+
30
+ # Get info on what can be parameterized on this clusterer.
31
+ # It returns a hash with the following format:
32
+ # { :param_name => "Info on the parameter" }
33
+ def get_parameters_info
34
+ raise NotImplementedError
35
+ end
36
+
37
+ # Set parameter values on this clusterer instance.
38
+ # You must provide a hash with the following format:
39
+ # { :param_name => parameter_value }
40
+ def set_parameters(parameters)
41
+ raise NotImplementedError
42
+ end
43
+
44
+ # Get parameter values on this clusterer instance.
45
+ # Returns a hash with the following format:
46
+ # { :param_name => parameter_value }
47
+ def get_parameters
48
+ raise NotImplementedError
49
+ end
50
+
51
+
52
+ end
53
+
54
+ end
55
+ end
@@ -0,0 +1,164 @@
1
+ # Author:: Sergio Fierens (implementation)
2
+ # License:: MPL 1.1
3
+ # Project:: ai4r
4
+ # Url:: http://ai4r.rubyforge.org/
5
+ #
6
+ # You can redistribute it and/or modify it under the terms of
7
+ # the Mozilla Public License version 1.1 as published by the
8
+ # Mozilla Foundation at http://www.mozilla.org/MPL/MPL-1.1.txt
9
+
10
+ require "set"
11
+ require File.dirname(__FILE__) + '/../data/data_set'
12
+ require File.dirname(__FILE__) + '/../clusterers/clusterer'
13
+
14
+ module Ai4r
15
+ module Clusterers
16
+
17
+ # The k-means algorithm is an algorithm to cluster n objects
18
+ # based on attributes into k partitions, with k < n.
19
+ #
20
+ # More about K Means algorithm:
21
+ # http://en.wikipedia.org/wiki/K-means_algorithm
22
class KMeans < Clusterer

  attr_reader :data_set, :number_of_clusters
  attr_reader :clusters, :centroids, :iterations
  attr_accessor :max_iterations
  attr_accessor :distance_function

  # Build a new clusterer, using data examples found in data_set.
  # Items will be clustered in "number_of_clusters" different
  # clusters.
  #
  # Membership and centroids are recomputed until the centroids stop
  # changing or max_iterations (when set) is reached. Returns self.
  def build(data_set, number_of_clusters)
    @data_set = data_set
    @number_of_clusters = number_of_clusters
    @iterations = 0
    # Forget the centroids of any earlier run on this instance, so the
    # convergence test cannot be satisfied by stale state before the first
    # iteration of this build.
    @old_centroids = nil

    calc_initial_centroids
    until stop_criteria_met
      calculate_membership_clusters
      recompute_centroids
    end

    self
  end

  # Classifies the given data item, returning the cluster index it belongs
  # to (0-based), i.e. the index of the closest centroid.
  def eval(data_item)
    distances = @centroids.collect { |centroid| distance(data_item, centroid) }
    get_min_index(distances)
  end

  # Get info on what can be parameterized on this clusterer algorithm.
  # It returns a hash with the following format:
  # { :param_name => "Info on the parameter" }
  def get_parameters_info
    { :max_iterations => "Maximum number of iterations to build the " +
        "clusterer. By default it is uncapped.",
      :distance_function => "Custom implementation of distance function. " +
        "It must be a closure receiving two data items and return the " +
        "distance bewteen them. By default, this algorithm uses " +
        "ecuclidean distance of numeric attributes to the power of 2."
    }
  end

  # Set parameters on this clusterer instance.
  # You must provide a hash with the following format:
  # { :param_name => parameter_value }
  #
  # Use get_parameters_info to know what parameters are accepted.
  # Keys other than :max_iterations and :distance_function are ignored.
  # Returns self, so calls can be chained.
  def set_parameters(parameters)
    if parameters.has_key?(:max_iterations)
      @max_iterations = parameters[:max_iterations]
    end
    if parameters.has_key?(:distance_function)
      @distance_function = parameters[:distance_function]
    end
    self
  end

  # Get parameter values on this clusterer instance.
  # Returns a hash with the following format:
  # { :param_name => parameter_value }
  def get_parameters
    { :max_iterations => @max_iterations,
      :distance_function => @distance_function }
  end

  # This function calculates the distance between 2 different
  # instances. By default, it returns the euclidean distance to the
  # power of 2.
  # You can provide a more convenient distance implementation:
  #
  # 1- Overwriting this method
  #
  # 2- Providing a closure to the :distance_function parameter
  def distance(a, b)
    return @distance_function.call(a, b) if @distance_function
    euclidean_distance(a, b)
  end

  protected

  # Sum of squared differences over the positions where both items hold a
  # Numeric value; every other position is ignored.
  def euclidean_distance(a, b)
    dist = 0.0
    a.each_index do |index|
      if a[index].is_a?(Numeric) && b[index].is_a?(Numeric)
        delta = a[index] - b[index]
        dist += delta * delta
      end
    end
    dist
  end

  # Picks distinct data items at random as the initial centroids. When the
  # data set holds fewer distinct items than requested, the number of
  # clusters is lowered to the number of centroids actually found.
  def calc_initial_centroids
    @centroids = []
    items = @data_set.data_items
    tried_indexes = Set.new
    while @centroids.length < @number_of_clusters &&
        tried_indexes.length < items.length
      random_index = rand(items.length)
      # Set#add? returns nil when the index was already drawn.
      next unless tried_indexes.add?(random_index)
      candidate = items[random_index]
      @centroids << candidate unless @centroids.include?(candidate)
    end
    @number_of_clusters = @centroids.length
  end

  # True once the centroids did not move during the last iteration, or the
  # iteration cap (if any) has been reached.
  def stop_criteria_met
    @old_centroids == @centroids ||
      (@max_iterations && (@max_iterations <= @iterations))
  end

  # Rebuilds @clusters from scratch, placing every data item in the
  # cluster of its closest centroid.
  def calculate_membership_clusters
    @clusters = Array.new(@number_of_clusters) do
      Ai4r::Data::DataSet.new :data_labels => @data_set.data_labels
    end
    @data_set.data_items.each do |data_item|
      @clusters[eval(data_item)] << data_item
    end
  end

  # Remembers the current centroids, then replaces them by the mean/mode of
  # each cluster and counts the iteration.
  def recompute_centroids
    @old_centroids = @centroids
    @centroids = @clusters.collect { |cluster| cluster.get_mean_or_mode }
    @iterations += 1
  end

  # Index of the smallest element (the first one on ties). Returns 0 for
  # an empty array.
  def get_min_index(array)
    array.index(array.min) || 0
  end

end
163
+ end
164
+ end