ai4r 1.5 → 1.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (216) hide show
  1. data/examples/clusterers/simple_website_clustering.rb +47 -0
  2. data/lib/ai4r.rb +7 -0
  3. data/lib/ai4r/classifiers/hyperpipes.rb +118 -0
  4. data/lib/ai4r/clusterers/average_linkage.rb +22 -23
  5. data/lib/ai4r/clusterers/centroid_linkage.rb +66 -0
  6. data/lib/ai4r/clusterers/complete_linkage.rb +17 -12
  7. data/lib/ai4r/clusterers/diana.rb +139 -0
  8. data/lib/ai4r/clusterers/median_linkage.rb +61 -0
  9. data/lib/ai4r/clusterers/single_linkage.rb +57 -42
  10. data/lib/ai4r/clusterers/ward_linkage.rb +64 -0
  11. data/lib/ai4r/clusterers/weighted_average_linkage.rb +61 -0
  12. data/lib/ai4r/data/constants.rb +18 -0
  13. data/lib/ai4r/data/data_set.rb +5 -3
  14. data/lib/ai4r/data/proximity.rb +18 -0
  15. data/test/clusterers/average_linkage_test.rb +14 -11
  16. data/test/clusterers/bisecting_k_means_test.rb +9 -0
  17. data/test/clusterers/centroid_linkage_test.rb +50 -0
  18. data/test/clusterers/complete_linkage_test.rb +14 -5
  19. data/test/clusterers/diana_test.rb +69 -0
  20. data/test/clusterers/k_means_test.rb +9 -0
  21. data/test/clusterers/median_linkage_test.rb +50 -0
  22. data/test/clusterers/single_linkage_test.rb +15 -6
  23. data/test/clusterers/ward_linkage_test.rb +50 -0
  24. data/test/clusterers/weighted_average_linkage_test.rb +50 -0
  25. data/test/data/data_set_test.rb +14 -0
  26. data/test/data/proximity_test.rb +10 -0
  27. metadata +87 -298
  28. data/site/build/site/en/broken-links.xml +0 -2
  29. data/site/build/site/en/build/tmp/build-info.xml +0 -5
  30. data/site/build/site/en/build/tmp/plugins-1.xml +0 -212
  31. data/site/build/site/en/build/tmp/plugins-2.xml +0 -252
  32. data/site/build/site/en/build/tmp/projfilters.properties +0 -41
  33. data/site/build/site/en/downloads.html +0 -200
  34. data/site/build/site/en/downloads.pdf +0 -151
  35. data/site/build/site/en/geneticAlgorithms.html +0 -591
  36. data/site/build/site/en/geneticAlgorithms.pdf +0 -934
  37. data/site/build/site/en/images/ai4r-logo.png +0 -0
  38. data/site/build/site/en/images/built-with-forrest-button.png +0 -0
  39. data/site/build/site/en/images/c.png +0 -0
  40. data/site/build/site/en/images/c_wbn.png +0 -0
  41. data/site/build/site/en/images/c_wn.png +0 -0
  42. data/site/build/site/en/images/ero.gif +0 -0
  43. data/site/build/site/en/images/europe2.png +0 -0
  44. data/site/build/site/en/images/europe3.png +0 -0
  45. data/site/build/site/en/images/fitness.png +0 -0
  46. data/site/build/site/en/images/genetic_algorithms_example.png +0 -0
  47. data/site/build/site/en/images/instruction_arrow.png +0 -0
  48. data/site/build/site/en/images/jadeferret.png +0 -0
  49. data/site/build/site/en/images/my_email.png +0 -0
  50. data/site/build/site/en/images/neural_network_example.png +0 -0
  51. data/site/build/site/en/images/rubyforge.png +0 -0
  52. data/site/build/site/en/images/s.png +0 -0
  53. data/site/build/site/en/images/s_wbn.png +0 -0
  54. data/site/build/site/en/images/s_wn.png +0 -0
  55. data/site/build/site/en/images/sigmoid.png +0 -0
  56. data/site/build/site/en/images/t.png +0 -0
  57. data/site/build/site/en/images/t_wbn.png +0 -0
  58. data/site/build/site/en/images/t_wn.png +0 -0
  59. data/site/build/site/en/index.html +0 -390
  60. data/site/build/site/en/index.pdf +0 -657
  61. data/site/build/site/en/linkmap.html +0 -261
  62. data/site/build/site/en/linkmap.pdf +0 -94
  63. data/site/build/site/en/locationmap.xml +0 -72
  64. data/site/build/site/en/machineLearning.html +0 -340
  65. data/site/build/site/en/machineLearning.pdf +0 -337
  66. data/site/build/site/en/neuralNetworks.html +0 -521
  67. data/site/build/site/en/neuralNetworks.pdf +0 -671
  68. data/site/build/site/en/skin/CommonMessages_de.xml +0 -23
  69. data/site/build/site/en/skin/CommonMessages_en_US.xml +0 -23
  70. data/site/build/site/en/skin/CommonMessages_es.xml +0 -23
  71. data/site/build/site/en/skin/CommonMessages_fr.xml +0 -23
  72. data/site/build/site/en/skin/basic.css +0 -166
  73. data/site/build/site/en/skin/breadcrumbs-optimized.js +0 -90
  74. data/site/build/site/en/skin/breadcrumbs.js +0 -237
  75. data/site/build/site/en/skin/fontsize.js +0 -166
  76. data/site/build/site/en/skin/getBlank.js +0 -40
  77. data/site/build/site/en/skin/getMenu.js +0 -45
  78. data/site/build/site/en/skin/images/README.txt +0 -1
  79. data/site/build/site/en/skin/images/add.jpg +0 -0
  80. data/site/build/site/en/skin/images/built-with-forrest-button.png +0 -0
  81. data/site/build/site/en/skin/images/chapter.gif +0 -0
  82. data/site/build/site/en/skin/images/chapter_open.gif +0 -0
  83. data/site/build/site/en/skin/images/current.gif +0 -0
  84. data/site/build/site/en/skin/images/error.png +0 -0
  85. data/site/build/site/en/skin/images/external-link.gif +0 -0
  86. data/site/build/site/en/skin/images/fix.jpg +0 -0
  87. data/site/build/site/en/skin/images/forrest-credit-logo.png +0 -0
  88. data/site/build/site/en/skin/images/hack.jpg +0 -0
  89. data/site/build/site/en/skin/images/header_white_line.gif +0 -0
  90. data/site/build/site/en/skin/images/info.png +0 -0
  91. data/site/build/site/en/skin/images/instruction_arrow.png +0 -0
  92. data/site/build/site/en/skin/images/label.gif +0 -0
  93. data/site/build/site/en/skin/images/page.gif +0 -0
  94. data/site/build/site/en/skin/images/pdfdoc.gif +0 -0
  95. data/site/build/site/en/skin/images/poddoc.png +0 -0
  96. data/site/build/site/en/skin/images/printer.gif +0 -0
  97. data/site/build/site/en/skin/images/rc-b-l-15-1body-2menu-3menu.png +0 -0
  98. data/site/build/site/en/skin/images/rc-b-r-15-1body-2menu-3menu.png +0 -0
  99. data/site/build/site/en/skin/images/rc-b-r-5-1header-2tab-selected-3tab-selected.png +0 -0
  100. data/site/build/site/en/skin/images/rc-t-l-5-1header-2searchbox-3searchbox.png +0 -0
  101. data/site/build/site/en/skin/images/rc-t-l-5-1header-2tab-selected-3tab-selected.png +0 -0
  102. data/site/build/site/en/skin/images/rc-t-l-5-1header-2tab-unselected-3tab-unselected.png +0 -0
  103. data/site/build/site/en/skin/images/rc-t-r-15-1body-2menu-3menu.png +0 -0
  104. data/site/build/site/en/skin/images/rc-t-r-5-1header-2searchbox-3searchbox.png +0 -0
  105. data/site/build/site/en/skin/images/rc-t-r-5-1header-2tab-selected-3tab-selected.png +0 -0
  106. data/site/build/site/en/skin/images/rc-t-r-5-1header-2tab-unselected-3tab-unselected.png +0 -0
  107. data/site/build/site/en/skin/images/remove.jpg +0 -0
  108. data/site/build/site/en/skin/images/rss.png +0 -0
  109. data/site/build/site/en/skin/images/spacer.gif +0 -0
  110. data/site/build/site/en/skin/images/success.png +0 -0
  111. data/site/build/site/en/skin/images/txtdoc.png +0 -0
  112. data/site/build/site/en/skin/images/update.jpg +0 -0
  113. data/site/build/site/en/skin/images/valid-html401.png +0 -0
  114. data/site/build/site/en/skin/images/vcss.png +0 -0
  115. data/site/build/site/en/skin/images/warning.png +0 -0
  116. data/site/build/site/en/skin/images/xmldoc.gif +0 -0
  117. data/site/build/site/en/skin/menu.js +0 -48
  118. data/site/build/site/en/skin/note.txt +0 -50
  119. data/site/build/site/en/skin/print.css +0 -54
  120. data/site/build/site/en/skin/profile.css +0 -163
  121. data/site/build/site/en/skin/prototype.js +0 -1257
  122. data/site/build/site/en/skin/screen.css +0 -587
  123. data/site/build/site/en/sourceCode.html +0 -244
  124. data/site/build/site/en/sourceCode.pdf +0 -278
  125. data/site/build/site/en/svn.html +0 -244
  126. data/site/build/site/en/svn.pdf +0 -278
  127. data/site/build/tmp/brokenlinks.xml +0 -2
  128. data/site/build/tmp/build-info.xml +0 -5
  129. data/site/build/tmp/cocoon-work/cache-dir/cocoon-ehcache-1.data +0 -0
  130. data/site/build/tmp/cocoon-work/cache-dir/cocoon-ehcache-1.index +0 -0
  131. data/site/build/tmp/input.xmap +0 -32
  132. data/site/build/tmp/internal.xmap +0 -32
  133. data/site/build/tmp/locationmap.xml +0 -29
  134. data/site/build/tmp/output.xmap +0 -38
  135. data/site/build/tmp/pluginlist2fetchbuild.xml +0 -144
  136. data/site/build/tmp/plugins-1.xml +0 -201
  137. data/site/build/tmp/plugins-2.xml +0 -401
  138. data/site/build/tmp/projfilters.properties +0 -41
  139. data/site/build/tmp/resources.xmap +0 -32
  140. data/site/build/webapp/WEB-INF/logs/access.log +0 -0
  141. data/site/build/webapp/WEB-INF/logs/core.log +0 -775
  142. data/site/build/webapp/WEB-INF/logs/debug.log +0 -0
  143. data/site/build/webapp/WEB-INF/logs/error.log +0 -213
  144. data/site/build/webapp/WEB-INF/logs/flow.log +0 -0
  145. data/site/build/webapp/WEB-INF/logs/idgen.log +0 -0
  146. data/site/build/webapp/WEB-INF/logs/linkrewriter.log +0 -0
  147. data/site/build/webapp/WEB-INF/logs/locationmap.log +0 -0
  148. data/site/build/webapp/WEB-INF/logs/sitemap.log +0 -0
  149. data/site/build/webapp/WEB-INF/logs/xmlform.log +0 -0
  150. data/site/forrest.properties +0 -152
  151. data/site/forrest.properties.dispatcher.properties +0 -25
  152. data/site/forrest.properties.xml +0 -29
  153. data/site/src/documentation/README.txt +0 -7
  154. data/site/src/documentation/classes/CatalogManager.properties +0 -62
  155. data/site/src/documentation/content/locationmap.xml +0 -72
  156. data/site/src/documentation/content/xdocs/downloads.html +0 -9
  157. data/site/src/documentation/content/xdocs/geneticAlgorithms.xml +0 -294
  158. data/site/src/documentation/content/xdocs/index.xml +0 -129
  159. data/site/src/documentation/content/xdocs/machineLearning.xml +0 -131
  160. data/site/src/documentation/content/xdocs/neuralNetworks.xml +0 -270
  161. data/site/src/documentation/content/xdocs/site.xml +0 -54
  162. data/site/src/documentation/content/xdocs/sourceCode.xml +0 -43
  163. data/site/src/documentation/content/xdocs/tabs.xml +0 -35
  164. data/site/src/documentation/resources/images/ai4r-logo.png +0 -0
  165. data/site/src/documentation/resources/images/c.png +0 -0
  166. data/site/src/documentation/resources/images/c_wbn.png +0 -0
  167. data/site/src/documentation/resources/images/c_wn.png +0 -0
  168. data/site/src/documentation/resources/images/ellipse-2.svg +0 -30
  169. data/site/src/documentation/resources/images/ero.gif +0 -0
  170. data/site/src/documentation/resources/images/europe2.png +0 -0
  171. data/site/src/documentation/resources/images/europe3.png +0 -0
  172. data/site/src/documentation/resources/images/fitness.png +0 -0
  173. data/site/src/documentation/resources/images/genetic_algorithms_example.png +0 -0
  174. data/site/src/documentation/resources/images/icon-a.png +0 -0
  175. data/site/src/documentation/resources/images/icon-b.png +0 -0
  176. data/site/src/documentation/resources/images/icon.png +0 -0
  177. data/site/src/documentation/resources/images/jadeferret.png +0 -0
  178. data/site/src/documentation/resources/images/my_email.png +0 -0
  179. data/site/src/documentation/resources/images/neural_network_example.png +0 -0
  180. data/site/src/documentation/resources/images/project-logo.png +0 -0
  181. data/site/src/documentation/resources/images/rubyforge.png +0 -0
  182. data/site/src/documentation/resources/images/s.png +0 -0
  183. data/site/src/documentation/resources/images/s_wbn.png +0 -0
  184. data/site/src/documentation/resources/images/s_wn.png +0 -0
  185. data/site/src/documentation/resources/images/sigmoid.png +0 -0
  186. data/site/src/documentation/resources/images/sub-dir/icon-c.png +0 -0
  187. data/site/src/documentation/resources/images/t.png +0 -0
  188. data/site/src/documentation/resources/images/t_wbn.png +0 -0
  189. data/site/src/documentation/resources/images/t_wn.png +0 -0
  190. data/site/src/documentation/resources/schema/catalog.xcat +0 -29
  191. data/site/src/documentation/resources/schema/hello-v10.dtd +0 -51
  192. data/site/src/documentation/resources/schema/symbols-project-v10.ent +0 -26
  193. data/site/src/documentation/resources/stylesheets/hello2document.xsl +0 -33
  194. data/site/src/documentation/sitemap.xmap +0 -66
  195. data/site/src/documentation/skinconf.xml +0 -418
  196. data/site/src/documentation/translations/langcode.xml +0 -29
  197. data/site/src/documentation/translations/languages_de.xml +0 -24
  198. data/site/src/documentation/translations/languages_en.xml +0 -24
  199. data/site/src/documentation/translations/languages_es.xml +0 -22
  200. data/site/src/documentation/translations/languages_fr.xml +0 -24
  201. data/site/src/documentation/translations/languages_nl.xml +0 -24
  202. data/site/src/documentation/translations/menu.xml +0 -33
  203. data/site/src/documentation/translations/menu_af.xml +0 -33
  204. data/site/src/documentation/translations/menu_de.xml +0 -33
  205. data/site/src/documentation/translations/menu_es.xml +0 -33
  206. data/site/src/documentation/translations/menu_fr.xml +0 -33
  207. data/site/src/documentation/translations/menu_it.xml +0 -33
  208. data/site/src/documentation/translations/menu_nl.xml +0 -33
  209. data/site/src/documentation/translations/menu_no.xml +0 -33
  210. data/site/src/documentation/translations/menu_ru.xml +0 -33
  211. data/site/src/documentation/translations/menu_sk.xml +0 -33
  212. data/site/src/documentation/translations/tabs.xml +0 -22
  213. data/site/src/documentation/translations/tabs_de.xml +0 -22
  214. data/site/src/documentation/translations/tabs_es.xml +0 -22
  215. data/site/src/documentation/translations/tabs_fr.xml +0 -22
  216. data/site/src/documentation/translations/tabs_nl.xml +0 -22
@@ -0,0 +1,47 @@
1
+ require File.dirname(__FILE__) + '/google_search'
2
+ require File.dirname(__FILE__) + '/build_keywords'
3
+ require File.dirname(__FILE__) + '/../../lib/ai4r/clusterers/average_linkage'
4
+ require 'rubygems'
5
+ require 'hpricot'
6
+ require 'net/http'
7
+ require 'benchmark'
8
+
9
# Host names of the websites whose home pages are fetched and clustered.
SITES_TO_CLASSIFY = %w[
  www.foxnews.com
  www.usatoday.com
  scm.jadeferret.com
  www.accurev.com
  www.lastminute.com
  subversion.tigris.org
  news.yahoo.com
  news.bbc.co.uk
  www.orbitz.com
]
14
+
15
# Downloads the home page ("/") of +site+ and returns the entries of its
# <meta name="keywords"> element as an array of strings.
# Each keyword is stripped of surrounding white space and lower-cased.
def get_keywords(site)
  page_body = Net::HTTP.get_response(site, "/").body
  # Only the first keywords meta element is considered
  keywords_tag = Hpricot(page_body).search("meta[@name='keywords']")[0]
  # The content attribute holds the keywords as a comma separated list
  raw_keywords = keywords_tag.attributes["content"].split(",")
  raw_keywords.collect { |keyword| keyword.strip.downcase }
end
24
+
25
# Get keywords data for each website.
# Every site becomes a Site struct holding its host name and keyword list.
Site = Struct.new("Site", :name, :keywords)
sites = SITES_TO_CLASSIFY.collect do |site_name|
  Site.new(site_name, get_keywords(site_name))
end
data_set = Ai4r::Data::DataSet.new(:data_items => sites,
                                   :data_labels => Site.members)

# The distance between sites depends on the keywords collected from internet.
# Both arguments are Site structs, so both keyword lists are read through
# the #keywords member (Site has no #keyword member).
keywords_distance_function = lambda do |x, y|
  return Ai4r::Data::Proximity.simple_matching(x.keywords, y.keywords)
end

# Create the clusters: group the sites in 3 clusters, using the
# keyword based distance instead of the default distance function
clusterer = Ai4r::Clusterers::AverageLinkage.new
clusterer.distance_function = keywords_distance_function
clusterer.build(data_set, 3)

# Print results: one line with the site names of each cluster
clusterer.clusters.each do |cluster|
  puts cluster.data_items.collect { |item| item.name }.join(", ")
  puts "============"
end
@@ -1,6 +1,7 @@
1
1
  # Data
2
2
  require "ai4r/data/data_set"
3
3
  require "ai4r/data/statistics"
4
+ require "ai4r/data/proximity"
4
5
  require "ai4r/data/parameterizable"
5
6
  # Clusterers
6
7
  require "ai4r/clusterers/clusterer"
@@ -9,12 +10,18 @@ require "ai4r/clusterers/bisecting_k_means"
9
10
  require "ai4r/clusterers/single_linkage"
10
11
  require "ai4r/clusterers/complete_linkage"
11
12
  require "ai4r/clusterers/average_linkage"
13
+ require "ai4r/clusterers/weighted_average_linkage"
14
+ require "ai4r/clusterers/centroid_linkage"
15
+ require "ai4r/clusterers/median_linkage"
16
+ require "ai4r/clusterers/ward_linkage"
17
+ require "ai4r/clusterers/diana"
12
18
  # Classifiers
13
19
  require "ai4r/classifiers/classifier"
14
20
  require "ai4r/classifiers/id3"
15
21
  require "ai4r/classifiers/prism"
16
22
  require "ai4r/classifiers/one_r"
17
23
  require "ai4r/classifiers/zero_r"
24
+ require "ai4r/classifiers/hyperpipes"
18
25
  # Neural networks
19
26
  require "ai4r/neural_network/backpropagation"
20
27
  # Genetic Algorithms
@@ -0,0 +1,118 @@
1
+ # Author:: Sergio Fierens (Implementation only)
2
+ # License:: MPL 1.1
3
+ # Project:: ai4r
4
+ # Url:: http://ai4r.rubyforge.org/
5
+ #
6
+ # You can redistribute it and/or modify it under the terms of
7
+ # the Mozilla Public License version 1.1 as published by the
8
+ # Mozilla Foundation at http://www.mozilla.org/MPL/MPL-1.1.txt
9
+
10
+ require 'set'
11
+ require File.dirname(__FILE__) + '/../data/constants'
12
+ require File.dirname(__FILE__) + '/../data/data_set'
13
+ require File.dirname(__FILE__) + '/../classifiers/classifier'
14
+
15
module Ai4r
  module Classifiers

    include Ai4r::Data

    # = Introduction
    #
    # A fast classifier algorithm, created by Lucio de Souza Coelho
    # and Len Trigg.
    #
    # One "pipe" is built per category (the categories are the values of
    # the last domain returned by DataSet#build_domains). A pipe has one
    # entry per non-class attribute: a {:min, :max} range for numeric
    # attributes, or a hash flagging the observed values for any other
    # attribute. New data is assigned to the category whose pipe matches
    # the largest number of attributes.
    class Hyperpipes < Classifier

      attr_reader :data_set, :pipes

      # Build a new Hyperpipes classifier. You must provide a DataSet instance
      # as parameter. The last attribute of each item is considered as
      # the item class.
      # Returns the classifier itself.
      def build(data_set)
        @data_set = data_set
        @domains = data_set.build_domains

        @pipes = {}
        # build_pipe reads the data items, so it needs the data set
        # (not the domains) as its argument
        @domains.last.each { |cat| @pipes[cat] = build_pipe(@data_set) }
        @data_set.data_items.each { |item| update_pipe(@pipes[item.last], item) }

        return self
      end

      # You can evaluate new data, predicting its class.
      # Every attribute that falls inside a category pipe (numeric value
      # within the observed [min, max] range, or non numeric value already
      # observed) gives one vote to that category. The category with most
      # votes is returned.
      # e.g.
      #   classifier.eval(['New York', '<30', 'F'])  # => 'Y'
      def eval(data)
        votes = Hash.new {0}
        @pipes.each do |category, pipe|
          pipe.each_with_index do |bounds, i|
            if data[i].is_a? Numeric
              # Bounds are inclusive: the observed min and max values
              # themselves belong to the pipe
              votes[category] += 1 if data[i] >= bounds[:min] && data[i] <= bounds[:max]
            else
              votes[category] += 1 if bounds[data[i]]
            end
          end
        end
        return votes.to_a.max {|x, y| x.last <=> y.last}.first
      end

      # This method returns the generated rules in ruby code, one
      # statement per line. The rules expect one local variable per
      # attribute, named after the data set labels, and assign the
      # predicted category to the variable named after the last label.
      # e.g. (the exact hash formatting depends on Hash#inspect)
      #
      #   classifier.get_rules
      #     # =>  votes = Hash.new {0}
      #           votes['Y'] += 1 if {"New York"=>true}[city]
      #           votes['Y'] += 1 if age >= 18 && age <= 35
      #           ...
      #           marketing_target = votes.to_a.max {|x, y| x.last <=> y.last}.first
      #
      # It is a nice way to inspect induction results, and also to execute them:
      #     marketing_target = nil
      #     eval classifier.get_rules
      #     puts marketing_target
      #       # =>  'Y'
      def get_rules
        rules = []
        rules << "votes = Hash.new {0}"
        # The first data item is used to tell numeric attributes apart
        data = @data_set.data_items.first
        labels = @data_set.data_labels.collect {|l| l.to_s}
        @pipes.each do |category, pipe|
          pipe.each_with_index do |bounds, i|
            rule = "votes['#{category}'] += 1 "
            if data[i].is_a? Numeric
              rule += "if #{labels[i]} >= #{bounds[:min]} && #{labels[i]} <= #{bounds[:max]}"
            else
              # Look up the value of the attribute variable, not its name
              rule += "if #{bounds.inspect}[#{labels[i]}]"
            end
            rules << rule
          end
        end
        rules << "#{labels.last} = votes.to_a.max {|x, y| x.last <=> y.last}.first"
        # Double quotes are required to join with a real line break
        return rules.join("\n")
      end

      protected

      # Returns an empty pipe: an array with one entry per non-class
      # attribute. The first data item is used to decide the entry type:
      # an inverted (empty) numeric range for numeric attributes, or a
      # hash defaulting to false for any other attribute.
      def build_pipe(data_set)
        data_set.data_items.first[0...-1].collect do |att|
          if att.is_a? Numeric
            {:min=>POSITIVE_INFINITY, :max=>NEGATIVE_INFINITY}
          else
            Hash.new(false)
          end
        end
      end

      # Widens the given pipe so it includes the attributes of data_item
      # (the last attribute, the class, is ignored).
      def update_pipe(pipe, data_item)
        data_item[0...-1].each_with_index do |att, i|
          if att.is_a? Numeric
            pipe[i][:min] = att if att < pipe[i][:min]
            pipe[i][:max] = att if att > pipe[i][:max]
          else
            pipe[i][att] = true
          end
        end
      end

    end
  end
end
@@ -13,14 +13,25 @@ require File.dirname(__FILE__) + '/../clusterers/single_linkage'
13
13
  module Ai4r
14
14
  module Clusterers
15
15
 
16
- # Implementation of a Hierarchical clusterer with complete linkage.
16
+ # Implementation of a Hierarchical clusterer with group average
17
+ # linkage, AKA unweighted pair group method average or UPGMA (Everitt
18
+ # et al., 2001 ; Jain and Dubes, 1988 ; Sokal and Michener, 1958).
17
19
  # Hierarchical clusterers create one cluster per element, and then
18
20
  # progressively merge clusters, until the required number of clusters
19
21
  # is reached.
20
- # With average linkage, the distance between two clusters is computed as
21
- # the average distance between elements of each cluster.
22
+ # With average linkage, the distance between a cluster cx and
23
+ # cluster (ci U cj) is the average of the distance between cx and ci,
24
+ # and the distance between cx and cj.
25
+ #
26
+ # D(cx, (ci U cj)) = (D(cx, ci) + D(cx, cj)) / 2
22
27
  class AverageLinkage < SingleLinkage
23
28
 
29
+ parameters_info :distance_function =>
30
+ "Custom implementation of distance function. " +
31
+ "It must be a closure receiving two data items and return the " +
32
+ "distance bewteen them. By default, this algorithm uses " +
33
+ "ecuclidean distance of numeric attributes to the power of 2."
34
+
24
35
  # Build a new clusterer, using data examples found in data_set.
25
36
  # Items will be clustered in "number_of_clusters" different
26
37
  # clusters.
@@ -28,31 +39,19 @@ module Ai4r
28
39
  super
29
40
  end
30
41
 
31
- # Classifies the given data item, returning the cluster index it belongs
32
- # to (0-based).
42
+ # This algorithm does not allow classification of new data items
43
+ # once it has been built. Rebuild the clusters including your data element.
33
44
  def eval(data_item)
34
- super
45
+ Raise "Eval of new data is not supported by this algorithm."
35
46
  end
36
47
 
37
48
  protected
38
49
 
39
- # Calculate cluster distance using the average linkage method
40
- def calc_index_clusters_distance(cluster_a, cluster_b)
41
- dist_sum = 0.0
42
- cluster_a.each do |index_a|
43
- cluster_b.each do |index_b|
44
- dist_sum += read_distance_matrix(index_a, index_b)
45
- end
46
- end
47
- return dist_sum/(cluster_a.length*cluster_b.length)
48
- end
49
-
50
- def distance_between_item_and_cluster(data_item, cluster)
51
- dist_sum = 0.0
52
- cluster.data_items.each do |another_item|
53
- dist_sum += distance(data_item, another_item)
54
- end
55
- return dist_sum/cluster.data_items.length
50
# Returns the distance between cluster cx and the merged cluster (ci U cj),
# using average linkage: the mean of the distances read from the distance
# matrix for (cx, ci) and (cx, cj).
# The sum is divided by a Float so the mean is not truncated when a
# custom distance function returns Integer values.
def linkage_distance(cx, ci, cj)
  (read_distance_matrix(cx, ci) +
    read_distance_matrix(cx, cj)) / 2.0
end
57
56
 
58
57
  end
@@ -0,0 +1,66 @@
1
+ # Author:: Sergio Fierens (implementation)
2
+ # License:: MPL 1.1
3
+ # Project:: ai4r
4
+ # Url:: http://ai4r.rubyforge.org/
5
+ #
6
+ # You can redistribute it and/or modify it under the terms of
7
+ # the Mozilla Public License version 1.1 as published by the
8
+ # Mozilla Foundation at http://www.mozilla.org/MPL/MPL-1.1.txt
9
+
10
+ require File.dirname(__FILE__) + '/../data/data_set'
11
+ require File.dirname(__FILE__) + '/../clusterers/single_linkage'
12
+
13
module Ai4r
  module Clusterers

    # Implementation of an Agglomerative Hierarchical clusterer with
    # centroid linkage algorithm, aka unweighted pair group method
    # centroid (UPGMC) (Everitt et al., 2001 ; Jain and Dubes, 1988 ;
    # Sokal and Michener, 1958 )
    # Hierarchical clusterers create one cluster per element, and then
    # progressively merge clusters, until the required number of clusters
    # is reached.
    # The distance between clusters is the squared euclidean distance
    # between their centroids.
    #
    #   D(cx, (ci U cj)) = | mx - mij |^2
    #   D(cx, (ci U cj)) = (ni/(ni+nj))*D(cx, ci) +
    #                      (nj/(ni+nj))*D(cx, cj) -
    #                      (ni*nj/(ni+nj)^2)*D(ci, cj)
    class CentroidLinkage < SingleLinkage

      parameters_info :distance_function =>
          "Custom implementation of distance function. " +
          "It must be a closure receiving two data items and return the " +
          "distance bewteen them. By default, this algorithm uses " +
          "ecuclidean distance of numeric attributes to the power of 2."

      # Build a new clusterer, using data examples found in data_set.
      # Items will be clustered in "number_of_clusters" different
      # clusters.
      def build(data_set, number_of_clusters)
        super
      end

      # This algorithm does not allow classification of new data items
      # once it has been built. Rebuild the clusters including your data
      # element. Always raises a RuntimeError.
      def eval(data_item)
        # Kernel#raise is lower case; "Raise" would fail with NoMethodError
        raise "Eval of new data is not supported by this algorithm."
      end

      protected

      # return distance between cluster cx and cluster (ci U cj),
      # using centroid linkage.
      # ni and nj are the number of items of clusters ci and cj.
      def linkage_distance(cx, ci, cj)
        ni = @index_clusters[ci].length
        nj = @index_clusters[cj].length
        # The 1.0 factor forces a float division of the last term
        ( ni * read_distance_matrix(cx, ci) +
          nj * read_distance_matrix(cx, cj) -
          1.0 * ni * nj * read_distance_matrix(ci, cj) / (ni+nj)) / (ni+nj)
      end

    end
  end
end
66
+
@@ -13,14 +13,24 @@ require File.dirname(__FILE__) + '/../clusterers/single_linkage'
13
13
  module Ai4r
14
14
  module Clusterers
15
15
 
16
- # Implementation of a Hierarchical clusterer with complete linkage.
16
+ # Implementation of a Hierarchical clusterer with complete linkage (Everitt
17
+ # et al., 2001 ; Jain and Dubes, 1988 ; Sorensen, 1948 ).
17
18
  # Hierarchical clusterers create one cluster per element, and then
18
19
  # progressively merge clusters, until the required number of clusters
19
20
  # is reached.
20
21
  # With complete linkage, the distance between two clusters is computed as
21
22
  # the maximum distance between elements of each cluster.
23
+ #
24
+ # D(cx, (ci U cj)) = max(D(cx, ci), D(cx, cj))
22
25
  class CompleteLinkage < SingleLinkage
23
26
 
27
+ parameters_info :distance_function =>
28
+ "Custom implementation of distance function. " +
29
+ "It must be a closure receiving two data items and return the " +
30
+ "distance bewteen them. By default, this algorithm uses " +
31
+ "ecuclidean distance of numeric attributes to the power of 2."
32
+
33
+
24
34
  # Build a new clusterer, using data examples found in data_set.
25
35
  # Items will be clustered in "number_of_clusters" different
26
36
  # clusters.
@@ -36,22 +46,17 @@ module Ai4r
36
46
 
37
47
  protected
38
48
 
39
- # Calculate cluster distance using the complete linkage method
40
- def calc_index_clusters_distance(cluster_a, cluster_b)
41
- max_dist = 0
42
- cluster_a.each do |index_a|
43
- cluster_b.each do |index_b|
44
- dist = read_distance_matrix(index_a, index_b)
45
- max_dist = dist if dist > max_dist
46
- end
47
- end
48
- return max_dist
49
# Returns the distance between cluster cx and the new cluster (ci U cj),
# using complete linkage: the larger of the two distances read from the
# distance matrix for (cx, ci) and (cx, cj).
def linkage_distance(cx, ci, cj)
  distance_to_ci = read_distance_matrix(cx, ci)
  distance_to_cj = read_distance_matrix(cx, cj)
  [distance_to_ci, distance_to_cj].max
end
50
55
 
51
56
  def distance_between_item_and_cluster(data_item, cluster)
52
57
  max_dist = 0
53
58
  cluster.data_items.each do |another_item|
54
- dist = distance(data_item, another_item)
59
+ dist = @distance_function.call(data_item, another_item)
55
60
  max_dist = dist if dist > max_dist
56
61
  end
57
62
  return max_dist
@@ -0,0 +1,139 @@
1
+ # Author:: Sergio Fierens (implementation)
2
+ # License:: MPL 1.1
3
+ # Project:: ai4r
4
+ # Url:: http://ai4r.rubyforge.org/
5
+ #
6
+ # You can redistribute it and/or modify it under the terms of
7
+ # the Mozilla Public License version 1.1 as published by the
8
+ # Mozilla Foundation at http://www.mozilla.org/MPL/MPL-1.1.txt
9
+
10
+ require File.dirname(__FILE__) + '/../data/data_set'
11
+ require File.dirname(__FILE__) + '/../data/proximity'
12
+ require File.dirname(__FILE__) + '/../clusterers/clusterer'
13
+
14
module Ai4r
  module Clusterers

    # DIANA (Divisive ANAlysis) (Kaufman and Rousseeuw, 1990;
    # Macnaughton - Smith et al. 1964) is a Divisive Hierarchical
    # Clusterer. It starts with a single cluster holding every data item,
    # and keeps splitting clusters until the requested number of
    # clusters is reached.
    class Diana < Clusterer

      attr_reader :data_set, :number_of_clusters, :clusters

      parameters_info :distance_function =>
          "Custom implementation of distance function. " +
          "It must be a closure receiving two data items and return the " +
          "distance bewteen them. By default, this algorithm uses " +
          "ecuclidean distance of numeric attributes to the power of 2."

      # Sets the default distance function: the squared euclidean
      # distance computed over the numeric attributes of both items.
      def initialize
        @distance_function = lambda do |a, b|
          numeric_a = a.select { |value| value.is_a? Numeric }
          numeric_b = b.select { |value| value.is_a? Numeric }
          Ai4r::Data::Proximity.squared_euclidean_distance(numeric_a, numeric_b)
        end
      end

      # Build a new clusterer, using divisive analysis (DIANA algorithm).
      # On every step the cluster with the largest diameter is split in
      # two: a splinter cluster is started with one of its items, and items
      # are moved to the splinter cluster while the best candidate has a
      # non negative distance difference. Returns the clusterer itself.
      def build(data_set, number_of_clusters)
        @data_set = data_set
        @number_of_clusters = number_of_clusters
        # Start with one cluster: a data set built from all the items
        @clusters = [@data_set[0..-1]]

        until @clusters.length >= @number_of_clusters
          source = @clusters[max_diameter_cluster(@clusters)]
          splinter = init_splinter_cluster(source)
          while true
            gain, position = max_distance_difference(source, splinter)
            break if gain < 0
            splinter << source.data_items[position]
            source.data_items.delete_at(position)
          end
          @clusters << splinter
        end

        self
      end

      # Classifies the given data item, returning the cluster index it belongs
      # to (0-based). The chosen cluster is the one selected by
      # get_min_index over the average distances between the data item
      # and the items of each cluster.
      def eval(data_item)
        average_distances = @clusters.collect do |cluster|
          distance_sum(data_item, cluster) / cluster.data_items.length
        end
        get_min_index(average_distances)
      end

      protected

      # Returns the index of the cluster with max diameter
      # (index 0 when no cluster has a diameter greater than 0).
      def max_diameter_cluster(clusters)
        widest_index = 0
        widest_diameter = 0
        clusters.each_with_index do |cluster, index|
          diameter = cluster_diameter(cluster)
          next unless diameter > widest_diameter
          widest_index = index
          widest_diameter = diameter
        end
        widest_index
      end

      # Max distance between 2 items in a cluster
      def cluster_diameter(cluster)
        items = cluster.data_items
        diameter = 0
        items.each_with_index do |item_a, position|
          # Each pair is visited once: item_a against the items before it
          items[0...position].each do |item_b|
            distance = @distance_function.call(item_a, item_b)
            diameter = distance if distance > diameter
          end
        end
        diameter
      end

      # Create a cluster with the item with max distance
      # to the rest of the cluster's items.
      # That item is removed from the initial cluster.
      def init_splinter_cluster(cluster_to_split)
        farthest_sum = 0.0
        farthest_index = 0
        cluster_to_split.data_items.each_with_index do |item, index|
          total = distance_sum(item, cluster_to_split)
          if total > farthest_sum
            farthest_sum = total
            farthest_index = index
          end
        end
        splinter = cluster_to_split[farthest_index]
        cluster_to_split.data_items.delete_at(farthest_index)
        splinter
      end

      # Return the max average distance between any item of
      # cluster_to_split and the rest of items in that cluster,
      # minus the average distance with the items of splinter_cluster,
      # and the index of the item.
      # A positive value means that the item is closer to the
      # splinter group than to its current cluster.
      def max_distance_difference(cluster_to_split, splinter_cluster)
        best_diff = -1.0/0
        best_index = 0
        cluster_to_split.data_items.each_with_index do |item, index|
          # The item itself adds nothing to the sum, so it is left out of the count
          own_average = distance_sum(item, cluster_to_split) /
            (cluster_to_split.data_items.length - 1)
          splinter_average = distance_sum(item, splinter_cluster) /
            (splinter_cluster.data_items.length)
          difference = own_average - splinter_average
          if difference > best_diff
            best_diff = difference
            best_index = index
          end
        end
        [best_diff, best_index]
      end

      # Sum up the distance between an item and all the items in a cluster
      def distance_sum(item_a, cluster)
        total = 0.0
        cluster.data_items.each do |item_b|
          total += @distance_function.call(item_a, item_b)
        end
        total
      end

    end
  end
end