ai4r 1.5 → 1.6

Sign up to get free protection for your applications and to get access to all the features.
Files changed (216) hide show
  1. data/examples/clusterers/simple_website_clustering.rb +47 -0
  2. data/lib/ai4r.rb +7 -0
  3. data/lib/ai4r/classifiers/hyperpipes.rb +118 -0
  4. data/lib/ai4r/clusterers/average_linkage.rb +22 -23
  5. data/lib/ai4r/clusterers/centroid_linkage.rb +66 -0
  6. data/lib/ai4r/clusterers/complete_linkage.rb +17 -12
  7. data/lib/ai4r/clusterers/diana.rb +139 -0
  8. data/lib/ai4r/clusterers/median_linkage.rb +61 -0
  9. data/lib/ai4r/clusterers/single_linkage.rb +57 -42
  10. data/lib/ai4r/clusterers/ward_linkage.rb +64 -0
  11. data/lib/ai4r/clusterers/weighted_average_linkage.rb +61 -0
  12. data/lib/ai4r/data/constants.rb +18 -0
  13. data/lib/ai4r/data/data_set.rb +5 -3
  14. data/lib/ai4r/data/proximity.rb +18 -0
  15. data/test/clusterers/average_linkage_test.rb +14 -11
  16. data/test/clusterers/bisecting_k_means_test.rb +9 -0
  17. data/test/clusterers/centroid_linkage_test.rb +50 -0
  18. data/test/clusterers/complete_linkage_test.rb +14 -5
  19. data/test/clusterers/diana_test.rb +69 -0
  20. data/test/clusterers/k_means_test.rb +9 -0
  21. data/test/clusterers/median_linkage_test.rb +50 -0
  22. data/test/clusterers/single_linkage_test.rb +15 -6
  23. data/test/clusterers/ward_linkage_test.rb +50 -0
  24. data/test/clusterers/weighted_average_linkage_test.rb +50 -0
  25. data/test/data/data_set_test.rb +14 -0
  26. data/test/data/proximity_test.rb +10 -0
  27. metadata +87 -298
  28. data/site/build/site/en/broken-links.xml +0 -2
  29. data/site/build/site/en/build/tmp/build-info.xml +0 -5
  30. data/site/build/site/en/build/tmp/plugins-1.xml +0 -212
  31. data/site/build/site/en/build/tmp/plugins-2.xml +0 -252
  32. data/site/build/site/en/build/tmp/projfilters.properties +0 -41
  33. data/site/build/site/en/downloads.html +0 -200
  34. data/site/build/site/en/downloads.pdf +0 -151
  35. data/site/build/site/en/geneticAlgorithms.html +0 -591
  36. data/site/build/site/en/geneticAlgorithms.pdf +0 -934
  37. data/site/build/site/en/images/ai4r-logo.png +0 -0
  38. data/site/build/site/en/images/built-with-forrest-button.png +0 -0
  39. data/site/build/site/en/images/c.png +0 -0
  40. data/site/build/site/en/images/c_wbn.png +0 -0
  41. data/site/build/site/en/images/c_wn.png +0 -0
  42. data/site/build/site/en/images/ero.gif +0 -0
  43. data/site/build/site/en/images/europe2.png +0 -0
  44. data/site/build/site/en/images/europe3.png +0 -0
  45. data/site/build/site/en/images/fitness.png +0 -0
  46. data/site/build/site/en/images/genetic_algorithms_example.png +0 -0
  47. data/site/build/site/en/images/instruction_arrow.png +0 -0
  48. data/site/build/site/en/images/jadeferret.png +0 -0
  49. data/site/build/site/en/images/my_email.png +0 -0
  50. data/site/build/site/en/images/neural_network_example.png +0 -0
  51. data/site/build/site/en/images/rubyforge.png +0 -0
  52. data/site/build/site/en/images/s.png +0 -0
  53. data/site/build/site/en/images/s_wbn.png +0 -0
  54. data/site/build/site/en/images/s_wn.png +0 -0
  55. data/site/build/site/en/images/sigmoid.png +0 -0
  56. data/site/build/site/en/images/t.png +0 -0
  57. data/site/build/site/en/images/t_wbn.png +0 -0
  58. data/site/build/site/en/images/t_wn.png +0 -0
  59. data/site/build/site/en/index.html +0 -390
  60. data/site/build/site/en/index.pdf +0 -657
  61. data/site/build/site/en/linkmap.html +0 -261
  62. data/site/build/site/en/linkmap.pdf +0 -94
  63. data/site/build/site/en/locationmap.xml +0 -72
  64. data/site/build/site/en/machineLearning.html +0 -340
  65. data/site/build/site/en/machineLearning.pdf +0 -337
  66. data/site/build/site/en/neuralNetworks.html +0 -521
  67. data/site/build/site/en/neuralNetworks.pdf +0 -671
  68. data/site/build/site/en/skin/CommonMessages_de.xml +0 -23
  69. data/site/build/site/en/skin/CommonMessages_en_US.xml +0 -23
  70. data/site/build/site/en/skin/CommonMessages_es.xml +0 -23
  71. data/site/build/site/en/skin/CommonMessages_fr.xml +0 -23
  72. data/site/build/site/en/skin/basic.css +0 -166
  73. data/site/build/site/en/skin/breadcrumbs-optimized.js +0 -90
  74. data/site/build/site/en/skin/breadcrumbs.js +0 -237
  75. data/site/build/site/en/skin/fontsize.js +0 -166
  76. data/site/build/site/en/skin/getBlank.js +0 -40
  77. data/site/build/site/en/skin/getMenu.js +0 -45
  78. data/site/build/site/en/skin/images/README.txt +0 -1
  79. data/site/build/site/en/skin/images/add.jpg +0 -0
  80. data/site/build/site/en/skin/images/built-with-forrest-button.png +0 -0
  81. data/site/build/site/en/skin/images/chapter.gif +0 -0
  82. data/site/build/site/en/skin/images/chapter_open.gif +0 -0
  83. data/site/build/site/en/skin/images/current.gif +0 -0
  84. data/site/build/site/en/skin/images/error.png +0 -0
  85. data/site/build/site/en/skin/images/external-link.gif +0 -0
  86. data/site/build/site/en/skin/images/fix.jpg +0 -0
  87. data/site/build/site/en/skin/images/forrest-credit-logo.png +0 -0
  88. data/site/build/site/en/skin/images/hack.jpg +0 -0
  89. data/site/build/site/en/skin/images/header_white_line.gif +0 -0
  90. data/site/build/site/en/skin/images/info.png +0 -0
  91. data/site/build/site/en/skin/images/instruction_arrow.png +0 -0
  92. data/site/build/site/en/skin/images/label.gif +0 -0
  93. data/site/build/site/en/skin/images/page.gif +0 -0
  94. data/site/build/site/en/skin/images/pdfdoc.gif +0 -0
  95. data/site/build/site/en/skin/images/poddoc.png +0 -0
  96. data/site/build/site/en/skin/images/printer.gif +0 -0
  97. data/site/build/site/en/skin/images/rc-b-l-15-1body-2menu-3menu.png +0 -0
  98. data/site/build/site/en/skin/images/rc-b-r-15-1body-2menu-3menu.png +0 -0
  99. data/site/build/site/en/skin/images/rc-b-r-5-1header-2tab-selected-3tab-selected.png +0 -0
  100. data/site/build/site/en/skin/images/rc-t-l-5-1header-2searchbox-3searchbox.png +0 -0
  101. data/site/build/site/en/skin/images/rc-t-l-5-1header-2tab-selected-3tab-selected.png +0 -0
  102. data/site/build/site/en/skin/images/rc-t-l-5-1header-2tab-unselected-3tab-unselected.png +0 -0
  103. data/site/build/site/en/skin/images/rc-t-r-15-1body-2menu-3menu.png +0 -0
  104. data/site/build/site/en/skin/images/rc-t-r-5-1header-2searchbox-3searchbox.png +0 -0
  105. data/site/build/site/en/skin/images/rc-t-r-5-1header-2tab-selected-3tab-selected.png +0 -0
  106. data/site/build/site/en/skin/images/rc-t-r-5-1header-2tab-unselected-3tab-unselected.png +0 -0
  107. data/site/build/site/en/skin/images/remove.jpg +0 -0
  108. data/site/build/site/en/skin/images/rss.png +0 -0
  109. data/site/build/site/en/skin/images/spacer.gif +0 -0
  110. data/site/build/site/en/skin/images/success.png +0 -0
  111. data/site/build/site/en/skin/images/txtdoc.png +0 -0
  112. data/site/build/site/en/skin/images/update.jpg +0 -0
  113. data/site/build/site/en/skin/images/valid-html401.png +0 -0
  114. data/site/build/site/en/skin/images/vcss.png +0 -0
  115. data/site/build/site/en/skin/images/warning.png +0 -0
  116. data/site/build/site/en/skin/images/xmldoc.gif +0 -0
  117. data/site/build/site/en/skin/menu.js +0 -48
  118. data/site/build/site/en/skin/note.txt +0 -50
  119. data/site/build/site/en/skin/print.css +0 -54
  120. data/site/build/site/en/skin/profile.css +0 -163
  121. data/site/build/site/en/skin/prototype.js +0 -1257
  122. data/site/build/site/en/skin/screen.css +0 -587
  123. data/site/build/site/en/sourceCode.html +0 -244
  124. data/site/build/site/en/sourceCode.pdf +0 -278
  125. data/site/build/site/en/svn.html +0 -244
  126. data/site/build/site/en/svn.pdf +0 -278
  127. data/site/build/tmp/brokenlinks.xml +0 -2
  128. data/site/build/tmp/build-info.xml +0 -5
  129. data/site/build/tmp/cocoon-work/cache-dir/cocoon-ehcache-1.data +0 -0
  130. data/site/build/tmp/cocoon-work/cache-dir/cocoon-ehcache-1.index +0 -0
  131. data/site/build/tmp/input.xmap +0 -32
  132. data/site/build/tmp/internal.xmap +0 -32
  133. data/site/build/tmp/locationmap.xml +0 -29
  134. data/site/build/tmp/output.xmap +0 -38
  135. data/site/build/tmp/pluginlist2fetchbuild.xml +0 -144
  136. data/site/build/tmp/plugins-1.xml +0 -201
  137. data/site/build/tmp/plugins-2.xml +0 -401
  138. data/site/build/tmp/projfilters.properties +0 -41
  139. data/site/build/tmp/resources.xmap +0 -32
  140. data/site/build/webapp/WEB-INF/logs/access.log +0 -0
  141. data/site/build/webapp/WEB-INF/logs/core.log +0 -775
  142. data/site/build/webapp/WEB-INF/logs/debug.log +0 -0
  143. data/site/build/webapp/WEB-INF/logs/error.log +0 -213
  144. data/site/build/webapp/WEB-INF/logs/flow.log +0 -0
  145. data/site/build/webapp/WEB-INF/logs/idgen.log +0 -0
  146. data/site/build/webapp/WEB-INF/logs/linkrewriter.log +0 -0
  147. data/site/build/webapp/WEB-INF/logs/locationmap.log +0 -0
  148. data/site/build/webapp/WEB-INF/logs/sitemap.log +0 -0
  149. data/site/build/webapp/WEB-INF/logs/xmlform.log +0 -0
  150. data/site/forrest.properties +0 -152
  151. data/site/forrest.properties.dispatcher.properties +0 -25
  152. data/site/forrest.properties.xml +0 -29
  153. data/site/src/documentation/README.txt +0 -7
  154. data/site/src/documentation/classes/CatalogManager.properties +0 -62
  155. data/site/src/documentation/content/locationmap.xml +0 -72
  156. data/site/src/documentation/content/xdocs/downloads.html +0 -9
  157. data/site/src/documentation/content/xdocs/geneticAlgorithms.xml +0 -294
  158. data/site/src/documentation/content/xdocs/index.xml +0 -129
  159. data/site/src/documentation/content/xdocs/machineLearning.xml +0 -131
  160. data/site/src/documentation/content/xdocs/neuralNetworks.xml +0 -270
  161. data/site/src/documentation/content/xdocs/site.xml +0 -54
  162. data/site/src/documentation/content/xdocs/sourceCode.xml +0 -43
  163. data/site/src/documentation/content/xdocs/tabs.xml +0 -35
  164. data/site/src/documentation/resources/images/ai4r-logo.png +0 -0
  165. data/site/src/documentation/resources/images/c.png +0 -0
  166. data/site/src/documentation/resources/images/c_wbn.png +0 -0
  167. data/site/src/documentation/resources/images/c_wn.png +0 -0
  168. data/site/src/documentation/resources/images/ellipse-2.svg +0 -30
  169. data/site/src/documentation/resources/images/ero.gif +0 -0
  170. data/site/src/documentation/resources/images/europe2.png +0 -0
  171. data/site/src/documentation/resources/images/europe3.png +0 -0
  172. data/site/src/documentation/resources/images/fitness.png +0 -0
  173. data/site/src/documentation/resources/images/genetic_algorithms_example.png +0 -0
  174. data/site/src/documentation/resources/images/icon-a.png +0 -0
  175. data/site/src/documentation/resources/images/icon-b.png +0 -0
  176. data/site/src/documentation/resources/images/icon.png +0 -0
  177. data/site/src/documentation/resources/images/jadeferret.png +0 -0
  178. data/site/src/documentation/resources/images/my_email.png +0 -0
  179. data/site/src/documentation/resources/images/neural_network_example.png +0 -0
  180. data/site/src/documentation/resources/images/project-logo.png +0 -0
  181. data/site/src/documentation/resources/images/rubyforge.png +0 -0
  182. data/site/src/documentation/resources/images/s.png +0 -0
  183. data/site/src/documentation/resources/images/s_wbn.png +0 -0
  184. data/site/src/documentation/resources/images/s_wn.png +0 -0
  185. data/site/src/documentation/resources/images/sigmoid.png +0 -0
  186. data/site/src/documentation/resources/images/sub-dir/icon-c.png +0 -0
  187. data/site/src/documentation/resources/images/t.png +0 -0
  188. data/site/src/documentation/resources/images/t_wbn.png +0 -0
  189. data/site/src/documentation/resources/images/t_wn.png +0 -0
  190. data/site/src/documentation/resources/schema/catalog.xcat +0 -29
  191. data/site/src/documentation/resources/schema/hello-v10.dtd +0 -51
  192. data/site/src/documentation/resources/schema/symbols-project-v10.ent +0 -26
  193. data/site/src/documentation/resources/stylesheets/hello2document.xsl +0 -33
  194. data/site/src/documentation/sitemap.xmap +0 -66
  195. data/site/src/documentation/skinconf.xml +0 -418
  196. data/site/src/documentation/translations/langcode.xml +0 -29
  197. data/site/src/documentation/translations/languages_de.xml +0 -24
  198. data/site/src/documentation/translations/languages_en.xml +0 -24
  199. data/site/src/documentation/translations/languages_es.xml +0 -22
  200. data/site/src/documentation/translations/languages_fr.xml +0 -24
  201. data/site/src/documentation/translations/languages_nl.xml +0 -24
  202. data/site/src/documentation/translations/menu.xml +0 -33
  203. data/site/src/documentation/translations/menu_af.xml +0 -33
  204. data/site/src/documentation/translations/menu_de.xml +0 -33
  205. data/site/src/documentation/translations/menu_es.xml +0 -33
  206. data/site/src/documentation/translations/menu_fr.xml +0 -33
  207. data/site/src/documentation/translations/menu_it.xml +0 -33
  208. data/site/src/documentation/translations/menu_nl.xml +0 -33
  209. data/site/src/documentation/translations/menu_no.xml +0 -33
  210. data/site/src/documentation/translations/menu_ru.xml +0 -33
  211. data/site/src/documentation/translations/menu_sk.xml +0 -33
  212. data/site/src/documentation/translations/tabs.xml +0 -22
  213. data/site/src/documentation/translations/tabs_de.xml +0 -22
  214. data/site/src/documentation/translations/tabs_es.xml +0 -22
  215. data/site/src/documentation/translations/tabs_fr.xml +0 -22
  216. data/site/src/documentation/translations/tabs_nl.xml +0 -22
@@ -0,0 +1,47 @@
1
+ require File.dirname(__FILE__) + '/google_search'
2
+ require File.dirname(__FILE__) + '/build_keywords'
3
+ require File.dirname(__FILE__) + '/../../lib/ai4r/clusterers/average_linkage'
4
+ require 'rubygems'
5
+ require 'hpricot'
6
+ require 'net/http'
7
+ require 'benchmark'
8
+
9
# Host names of the web sites to cluster. The list is frozen so that it
# cannot be modified by accident while the script runs.
SITES_TO_CLASSIFY = [
  "www.foxnews.com", "www.usatoday.com", "scm.jadeferret.com",
  "www.accurev.com", "www.lastminute.com", "subversion.tigris.org",
  "news.yahoo.com", "news.bbc.co.uk", "www.orbitz.com"
].freeze
14
+
15
# Return array of keywords for the site.
#
# Requests the "/" page of +site+ and reads the content attribute of its
# first <meta name="keywords"> element. Keywords are comma separated; each
# one is returned stripped of surrounding white space and lower-cased.
# Returns an empty array when the page has no such element (or the element
# has no content attribute), so that one site without keywords does not
# abort the whole run.
def get_keywords(site)
  response = Net::HTTP.get_response(site, "/")
  meta = Hpricot(response.body).search("meta[@name='keywords']")[0]
  content = meta && meta.attributes["content"]
  return [] if content.nil?

  content.split(",").collect { |k| k.strip.downcase }
end
24
+
25
# Get keywords data for each website
Site = Struct.new("Site", :name, :keywords)
sites = SITES_TO_CLASSIFY.collect do |site_name|
  Site.new(site_name, get_keywords(site_name))
end
data_set = Ai4r::Data::DataSet.new(:data_items => sites,
                                   :data_labels => Site.members)

# The distance between sites depends on the keywords collected from internet.
# Both arguments are Site structs, so both keyword lists are read through the
# :keywords member.
keywords_distance_function = lambda do |x, y|
  Ai4r::Data::Proximity.simple_matching(x.keywords, y.keywords)
end

# Create the clusters
clusterer = Ai4r::Clusterers::AverageLinkage.new
clusterer.distance_function = keywords_distance_function
clusterer.build(data_set, 3)

# Print results: one line with the site names of each cluster
clusterer.clusters.each do |cluster|
  puts cluster.data_items.collect { |item| item.name }.join(", ")
  puts "============"
end
@@ -1,6 +1,7 @@
1
1
  # Data
2
2
  require "ai4r/data/data_set"
3
3
  require "ai4r/data/statistics"
4
+ require "ai4r/data/proximity"
4
5
  require "ai4r/data/parameterizable"
5
6
  # Clusterers
6
7
  require "ai4r/clusterers/clusterer"
@@ -9,12 +10,18 @@ require "ai4r/clusterers/bisecting_k_means"
9
10
  require "ai4r/clusterers/single_linkage"
10
11
  require "ai4r/clusterers/complete_linkage"
11
12
  require "ai4r/clusterers/average_linkage"
13
+ require "ai4r/clusterers/weighted_average_linkage"
14
+ require "ai4r/clusterers/centroid_linkage"
15
+ require "ai4r/clusterers/median_linkage"
16
+ require "ai4r/clusterers/ward_linkage"
17
+ require "ai4r/clusterers/diana"
12
18
  # Classifiers
13
19
  require "ai4r/classifiers/classifier"
14
20
  require "ai4r/classifiers/id3"
15
21
  require "ai4r/classifiers/prism"
16
22
  require "ai4r/classifiers/one_r"
17
23
  require "ai4r/classifiers/zero_r"
24
+ require "ai4r/classifiers/hyperpipes"
18
25
  # Neural networks
19
26
  require "ai4r/neural_network/backpropagation"
20
27
  # Genetic Algorithms
@@ -0,0 +1,118 @@
1
+ # Author:: Sergio Fierens (Implementation only)
2
+ # License:: MPL 1.1
3
+ # Project:: ai4r
4
+ # Url:: http://ai4r.rubyforge.org/
5
+ #
6
+ # You can redistribute it and/or modify it under the terms of
7
+ # the Mozilla Public License version 1.1 as published by the
8
+ # Mozilla Foundation at http://www.mozilla.org/MPL/MPL-1.1.txt
9
+
10
+ require 'set'
11
+ require File.dirname(__FILE__) + '/../data/constants'
12
+ require File.dirname(__FILE__) + '/../data/data_set'
13
+ require File.dirname(__FILE__) + '/../classifiers/classifier'
14
+
15
module Ai4r
  module Classifiers

    include Ai4r::Data

    # = Introduction
    #
    # A fast classifier algorithm, created by Lucio de Souza Coelho
    # and Len Trigg.
    #
    # One "pipe" is kept per class value. A pipe holds one entry for every
    # attribute except the last one: either a numeric range (:min / :max)
    # or a hash flagging the nominal values seen in the training items of
    # that class.
    class Hyperpipes < Classifier

      attr_reader :data_set, :pipes

      # Build a new Hyperpipes classifier. You must provide a DataSet instance
      # as parameter. The last attribute of each item is considered as
      # the item class.
      # Returns self.
      def build(data_set)
        @data_set = data_set
        @domains = data_set.build_domains

        @pipes = {}
        # build_pipe looks at the first data item to decide the kind of each
        # attribute, so it receives the data set, not the domains.
        @domains.last.each {|cat| @pipes[cat] = build_pipe(@data_set)}
        @data_set.data_items.each {|item| update_pipe(@pipes[item.last], item) }

        return self
      end

      # You can evaluate new data, predicting its class.
      # e.g.
      #   classifier.eval(['New York', '<30', 'F'])  # => 'Y'
      #
      # Every attribute that falls inside the pipe of a class (numeric
      # bounds are inclusive) gives that class one vote. The class with
      # the most votes is returned.
      def eval(data)
        votes = {}
        @pipes.each do |category, pipe|
          # Start every class at zero so that a prediction is still returned
          # when no attribute matches any pipe.
          votes[category] = 0
          pipe.each_with_index do |bounds, i|
            if data[i].is_a? Numeric
              votes[category]+=1 if data[i]>=bounds[:min] && data[i]<=bounds[:max]
            else
              votes[category]+=1 if bounds[data[i]]
            end
          end
        end
        return votes.to_a.max {|x, y| x.last <=> y.last}.first
      end

      # This method returns the generated rules in ruby code, one statement
      # per line. The code expects one local variable per attribute, named
      # after the data labels, and its last expression is the predicted class.
      # e.g. (illustrative labels and values)
      #
      #   classifier.get_rules
      #     # =>  votes = Hash.new {0}
      #           votes['Y'] = 0
      #           votes['Y'] += 1 if {"New York"=>true}[city]
      #           votes['Y'] += 1 if age >= 18 && age <= 30
      #           ...
      #           votes.to_a.max {|x, y| x.last <=> y.last}.first
      #
      # It is a nice way to inspect induction results, and also to execute them:
      #   city, age = 'New York', 25
      #   marketing_target = eval classifier.get_rules
      #   puts marketing_target
      #     # =>  'Y'
      def get_rules
        rules = []
        rules << "votes = Hash.new {0}"
        data = @data_set.data_items.first
        labels = @data_set.data_labels.collect {|l| l.to_s}
        @pipes.each do |category, pipe|
          # Same zero start as in eval, so the generated code also answers
          # when nothing matches.
          rules << "votes['#{category}'] = 0"
          pipe.each_with_index do |bounds, i|
            rule = "votes['#{category}'] += 1 "
            if data[i].is_a? Numeric
              rule += "if #{labels[i]} >= #{bounds[:min]} && #{labels[i]} <= #{bounds[:max]}"
            else
              # Index the hash with the attribute variable, not with its name.
              rule += "if #{bounds.inspect}[#{labels[i]}]"
            end
            rules << rule
          end
        end
        rules << "votes.to_a.max {|x, y| x.last <=> y.last}.first"
        return rules.join("\n")
      end

      protected

      # Create an empty pipe: one entry per attribute (class attribute
      # excluded). The kind of each entry is decided from the first data item:
      # an inverted numeric range for Numeric values, a hash defaulting to
      # false for anything else.
      def build_pipe(data_set)
        data_set.data_items.first[0...-1].collect do |att|
          if att.is_a? Numeric
            {:min=>POSITIVE_INFINITY, :max=>NEGATIVE_INFINITY}
          else
            Hash.new(false)
          end
        end
      end

      # Widen +pipe+ so that it contains every attribute value of +data_item+
      # (class attribute excluded).
      def update_pipe(pipe, data_item)
        data_item[0...-1].each_with_index do |att, i|
          if att.is_a? Numeric
            pipe[i][:min] = att if att < pipe[i][:min]
            pipe[i][:max] = att if att > pipe[i][:max]
          else
            pipe[i][att] = true
          end
        end
      end

    end
  end
end
@@ -13,14 +13,25 @@ require File.dirname(__FILE__) + '/../clusterers/single_linkage'
13
13
  module Ai4r
14
14
  module Clusterers
15
15
 
16
- # Implementation of a Hierarchical clusterer with complete linkage.
16
+ # Implementation of a Hierarchical clusterer with group average
17
+ # linkage, AKA unweighted pair group method average or UPGMA (Everitt
18
+ # et al., 2001 ; Jain and Dubes, 1988 ; Sokal and Michener, 1958).
17
19
  # Hierarchical clusterers create one cluster per element, and then
18
20
  # progressively merge clusters, until the required number of clusters
19
21
  # is reached.
20
- # With average linkage, the distance between two clusters is computed as
21
- # the average distance between elements of each cluster.
22
+ # With average linkage, the distance between a cluster cx and
23
+ # cluster (ci U cj) is the average of the distances between cx and ci, and
24
+ # cx and cj.
25
+ #
26
+ # D(cx, (ci U cj)) = (D(cx, ci) + D(cx, cj)) / 2
22
27
  class AverageLinkage < SingleLinkage
23
28
 
29
+ parameters_info :distance_function =>
30
+ "Custom implementation of distance function. " +
31
+ "It must be a closure receiving two data items and return the " +
32
+ "distance bewteen them. By default, this algorithm uses " +
33
+ "ecuclidean distance of numeric attributes to the power of 2."
34
+
24
35
  # Build a new clusterer, using data examples found in data_set.
25
36
  # Items will be clustered in "number_of_clusters" different
26
37
  # clusters.
@@ -28,31 +39,19 @@ module Ai4r
28
39
  super
29
40
  end
30
41
 
31
- # Classifies the given data item, returning the cluster index it belongs
32
- # to (0-based).
42
+ # This algorithm does not allow classification of new data items
43
+ # once it has been built. Rebuild the clusterer including your data element.
33
44
  def eval(data_item)
34
- super
45
+ Raise "Eval of new data is not supported by this algorithm."
35
46
  end
36
47
 
37
48
  protected
38
49
 
39
- # Calculate cluster distance using the average linkage method
40
- def calc_index_clusters_distance(cluster_a, cluster_b)
41
- dist_sum = 0.0
42
- cluster_a.each do |index_a|
43
- cluster_b.each do |index_b|
44
- dist_sum += read_distance_matrix(index_a, index_b)
45
- end
46
- end
47
- return dist_sum/(cluster_a.length*cluster_b.length)
48
- end
49
-
50
- def distance_between_item_and_cluster(data_item, cluster)
51
- dist_sum = 0.0
52
- cluster.data_items.each do |another_item|
53
- dist_sum += distance(data_item, another_item)
54
- end
55
- return dist_sum/cluster.data_items.length
50
+ # return distance between cluster cx and cluster (ci U cj),
51
+ # using average linkage
52
+ def linkage_distance(cx, ci, cj)
53
+ (read_distance_matrix(cx, ci)+
54
+ read_distance_matrix(cx, cj))/2
56
55
  end
57
56
 
58
57
  end
@@ -0,0 +1,66 @@
1
+ # Author:: Sergio Fierens (implementation)
2
+ # License:: MPL 1.1
3
+ # Project:: ai4r
4
+ # Url:: http://ai4r.rubyforge.org/
5
+ #
6
+ # You can redistribute it and/or modify it under the terms of
7
+ # the Mozilla Public License version 1.1 as published by the
8
+ # Mozilla Foundation at http://www.mozilla.org/MPL/MPL-1.1.txt
9
+
10
+ require File.dirname(__FILE__) + '/../data/data_set'
11
+ require File.dirname(__FILE__) + '/../clusterers/single_linkage'
12
+
13
module Ai4r
  module Clusterers

    # Implementation of an Agglomerative Hierarchical clusterer with
    # centroid linkage algorithm, aka unweighted pair group method
    # centroid (UPGMC) (Everitt et al., 2001 ; Jain and Dubes, 1988 ;
    # Sokal and Michener, 1958 )
    # Hierarchical clusterers create one cluster per element, and then
    # progressively merge clusters, until the required number of clusters
    # is reached.
    # The distance between clusters is the squared euclidean distance
    # between their centroids.
    #
    #   D(cx, (ci U cj)) = | mx - mij |^2
    #   D(cx, (ci U cj)) = (ni/(ni+nj))*D(cx, ci) +
    #                      (nj/(ni+nj))*D(cx, cj) -
    #                      (ni*nj/(ni+nj)^2)*D(ci, cj)
    class CentroidLinkage < SingleLinkage

      parameters_info :distance_function =>
          "Custom implementation of distance function. " +
          "It must be a closure receiving two data items and return the " +
          "distance bewteen them. By default, this algorithm uses " +
          "ecuclidean distance of numeric attributes to the power of 2."

      # Build a new clusterer, using data examples found in data_set.
      # Items will be clustered in "number_of_clusters" different
      # clusters.
      def build(data_set, number_of_clusters)
        super
      end

      # This algorithm does not allow classification of new data items
      # once it has been built. Rebuild the clusterer including your data
      # element.
      # Always raises a RuntimeError.
      def eval(data_item)
        raise "Eval of new data is not supported by this algorithm."
      end

      protected

      # return distance between cluster cx and cluster (ci U cj),
      # using centroid linkage.
      # cx, ci and cj are cluster indexes: they are used to look up
      # @index_clusters and the distance matrix.
      def linkage_distance(cx, ci, cj)
        ni = @index_clusters[ci].length
        nj = @index_clusters[cj].length
        # The 1.0 factor forces float arithmetic in the correction term.
        ( ni * read_distance_matrix(cx, ci) +
          nj * read_distance_matrix(cx, cj) -
          1.0 * ni * nj * read_distance_matrix(ci, cj) / (ni+nj)) / (ni+nj)
      end

    end
  end
end
66
+
@@ -13,14 +13,24 @@ require File.dirname(__FILE__) + '/../clusterers/single_linkage'
13
13
  module Ai4r
14
14
  module Clusterers
15
15
 
16
- # Implementation of a Hierarchical clusterer with complete linkage.
16
+ # Implementation of a Hierarchical clusterer with complete linkage (Everitt
17
+ # et al., 2001 ; Jain and Dubes, 1988 ; Sorensen, 1948 ).
17
18
  # Hierarchical clusterers create one cluster per element, and then
18
19
  # progressively merge clusters, until the required number of clusters
19
20
  # is reached.
20
21
  # With complete linkage, the distance between two clusters is computed as
21
22
  # the maximum distance between elements of each cluster.
23
+ #
24
+ # D(cx, (ci U cj)) = max(D(cx, ci), D(cx, cj))
22
25
  class CompleteLinkage < SingleLinkage
23
26
 
27
+ parameters_info :distance_function =>
28
+ "Custom implementation of distance function. " +
29
+ "It must be a closure receiving two data items and return the " +
30
+ "distance bewteen them. By default, this algorithm uses " +
31
+ "ecuclidean distance of numeric attributes to the power of 2."
32
+
33
+
24
34
  # Build a new clusterer, using data examples found in data_set.
25
35
  # Items will be clustered in "number_of_clusters" different
26
36
  # clusters.
@@ -36,22 +46,17 @@ module Ai4r
36
46
 
37
47
  protected
38
48
 
39
- # Calculate cluster distance using the complete linkage method
40
- def calc_index_clusters_distance(cluster_a, cluster_b)
41
- max_dist = 0
42
- cluster_a.each do |index_a|
43
- cluster_b.each do |index_b|
44
- dist = read_distance_matrix(index_a, index_b)
45
- max_dist = dist if dist > max_dist
46
- end
47
- end
48
- return max_dist
49
+ # return distance between cluster cx and new cluster (ci U cj),
50
+ # using complete linkage
51
+ def linkage_distance(cx, ci, cj)
52
+ [read_distance_matrix(cx, ci),
53
+ read_distance_matrix(cx, cj)].max
49
54
  end
50
55
 
51
56
  def distance_between_item_and_cluster(data_item, cluster)
52
57
  max_dist = 0
53
58
  cluster.data_items.each do |another_item|
54
- dist = distance(data_item, another_item)
59
+ dist = @distance_function.call(data_item, another_item)
55
60
  max_dist = dist if dist > max_dist
56
61
  end
57
62
  return max_dist
@@ -0,0 +1,139 @@
1
+ # Author:: Sergio Fierens (implementation)
2
+ # License:: MPL 1.1
3
+ # Project:: ai4r
4
+ # Url:: http://ai4r.rubyforge.org/
5
+ #
6
+ # You can redistribute it and/or modify it under the terms of
7
+ # the Mozilla Public License version 1.1 as published by the
8
+ # Mozilla Foundation at http://www.mozilla.org/MPL/MPL-1.1.txt
9
+
10
+ require File.dirname(__FILE__) + '/../data/data_set'
11
+ require File.dirname(__FILE__) + '/../data/proximity'
12
+ require File.dirname(__FILE__) + '/../clusterers/clusterer'
13
+
14
module Ai4r
  module Clusterers

    # DIANA (Divisive ANAlysis) (Kaufman and Rousseeuw, 1990;
    # Macnaughton - Smith et al. 1964) is a Divisive Hierarchical
    # Clusterer. It starts from a single cluster holding every data item
    # and keeps splitting clusters until the requested number of clusters
    # is reached.
    class Diana < Clusterer

      attr_reader :data_set, :number_of_clusters, :clusters

      parameters_info :distance_function =>
          "Custom implementation of distance function. " +
          "It must be a closure receiving two data items and return the " +
          "distance bewteen them. By default, this algorithm uses " +
          "ecuclidean distance of numeric attributes to the power of 2."

      # Install the default distance function: squared euclidean distance
      # computed over the Numeric attributes of both items.
      def initialize
        @distance_function = lambda do |a, b|
          numeric_a = a.select { |value| value.is_a? Numeric }
          numeric_b = b.select { |value| value.is_a? Numeric }
          Ai4r::Data::Proximity.squared_euclidean_distance(numeric_a, numeric_b)
        end
      end

      # Build a new clusterer, using divisive analysis (DIANA algorithm).
      # On every round the cluster with the largest diameter is split: a
      # splinter cluster is seeded with one item and then grows while
      # max_distance_difference reports a non negative difference.
      # Returns self.
      def build(data_set, number_of_clusters)
        @data_set = data_set
        @number_of_clusters = number_of_clusters
        @clusters = [@data_set[0..-1]]

        until @clusters.length >= @number_of_clusters
          parent = @clusters[max_diameter_cluster(@clusters)]
          splinter = init_splinter_cluster(parent)
          while true
            gain, position = max_distance_difference(parent, splinter)
            break if gain < 0
            splinter << parent.data_items[position]
            parent.data_items.delete_at(position)
          end
          @clusters << splinter
        end

        self
      end

      # Classifies the given data item, returning the cluster index it belongs
      # to (0-based). The chosen cluster is the one with the smallest average
      # distance between its items and data_item.
      def eval(data_item)
        mean_distances = @clusters.collect do |cluster|
          distance_sum(data_item, cluster) / cluster.data_items.length
        end
        get_min_index(mean_distances)
      end

      protected

      # return the index of the cluster with max diameter
      def max_diameter_cluster(clusters)
        widest_index = 0
        widest = 0
        clusters.each_with_index do |cluster, index|
          diameter = cluster_diameter(cluster)
          next unless diameter > widest
          widest_index = index
          widest = diameter
        end
        widest_index
      end

      # Max distance between 2 items in a cluster
      def cluster_diameter(cluster)
        items = cluster.data_items
        widest = 0
        items.each_with_index do |item_a, position|
          # Only the items placed before item_a: every pair is visited once.
          items[0, position].each do |item_b|
            distance = @distance_function.call(item_a, item_b)
            widest = distance if distance > widest
          end
        end
        widest
      end

      # Create a cluster with the item with max distance
      # to the rest of the cluster's items.
      # That item is removed from the initial cluster.
      def init_splinter_cluster(cluster_to_split)
        farthest_sum = 0.0
        farthest_index = 0
        cluster_to_split.data_items.each_with_index do |item, index|
          total = distance_sum(item, cluster_to_split)
          if total > farthest_sum
            farthest_sum = total
            farthest_index = index
          end
        end
        splinter_cluster = cluster_to_split[farthest_index]
        cluster_to_split.data_items.delete_at(farthest_index)
        splinter_cluster
      end

      # Return the max average distance between any item of
      # cluster_to_split and the rest of items in that cluster,
      # minus the average distance with the items of splinter_cluster,
      # and the index of the item.
      # A positive value means that the item is closer to the
      # splinter group than to its current cluster.
      def max_distance_difference(cluster_to_split, splinter_cluster)
        best_diff = -1.0/0
        best_index = 0
        cluster_to_split.data_items.each_with_index do |item, index|
          own_mean = distance_sum(item, cluster_to_split) / (cluster_to_split.data_items.length-1)
          splinter_mean = distance_sum(item, splinter_cluster) / (splinter_cluster.data_items.length)
          diff = own_mean - splinter_mean
          if diff > best_diff
            best_diff = diff
            best_index = index
          end
        end
        [best_diff, best_index]
      end

      # Sum up the distance between an item and all the items in a cluster
      def distance_sum(item_a, cluster)
        total = 0.0
        cluster.data_items.each do |item_b|
          total += @distance_function.call(item_a, item_b)
        end
        total
      end

    end
  end
end