rbcluster 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,5 @@
1
# Entry point of the gem: pulls in the version constant, the
# rbcluster/rbcluster feature (presumably the compiled extension -- confirm)
# and the Ruby-side Tree class, in that order.
%w[rbcluster/version rbcluster/rbcluster rbcluster/tree].each do |feature|
  require feature
end

# Make the Cluster module reachable under the gem's own name as well.
RbCluster = Cluster
@@ -0,0 +1,20 @@
1
module Cluster
  # Holds the list of Node objects that make up a clustering tree.
  #
  # NOTE: the class is a stub. Tree.new currently always raises
  # NotImplementedError before looking at its argument; the validation and
  # assignment below document the intended behaviour once the stub is lifted.
  class Tree
    # @param nodes [Array<Node>] the nodes of the tree
    # @raise [NotImplementedError] always, for now
    # @raise [ArgumentError] (intended) if any element is not a Node
    def initialize(nodes)
      raise NotImplementedError, "patches welcome :)"

      # Everything below is unreachable until the raise above is removed.
      nodes.each_with_index do |node, idx|
        unless node.kind_of?(Node)
          # Interpolate Node itself: Node.class is always Class, which made
          # the message read "expected Class, got ...".
          raise ArgumentError, "expected #{Node}, got #{node.class} at index #{idx}"
        end
      end

      @nodes = nodes
    end

    # @return [Integer] the number of nodes held by the tree
    def size
      @nodes.size
    end

  end
end
@@ -0,0 +1,3 @@
1
module Cluster
  # Version of the rbcluster gem; read by the gemspec. Frozen so the shared
  # constant cannot be mutated in place by accident.
  VERSION = "0.0.1".freeze
end
@@ -0,0 +1,24 @@
1
# -*- encoding: utf-8 -*-
# Gem specification for rbcluster. The version is read from
# lib/rbcluster/version.rb, so lib/ is put on the load path first.
$LOAD_PATH.push(File.expand_path("../lib", __FILE__))
require "rbcluster/version"

Gem::Specification.new do |spec|
  spec.name        = "rbcluster"
  spec.version     = Cluster::VERSION
  spec.platform    = Gem::Platform::RUBY
  spec.authors     = ["Jari Bakken", "Michiel Jan Laurens de Hoon"]
  spec.email       = ["jari.bakken@gmail.com"]
  spec.homepage    = "http://bonsai.hgc.jp/~mdehoon/software/cluster/software.htm"
  spec.summary     = "Ruby bindings for the Cluster C library"
  spec.description = "This gem provides a Ruby extension to the clustering routines in the C Clustering Library (which also backs e.g. Python's pycluster and Perl's Algorithm::Cluster)."

  spec.rubyforge_project = "rbcluster"

  # File lists come from git, so only tracked files are packaged.
  spec.files         = %x(git ls-files).split("\n")
  spec.test_files    = %x(git ls-files -- spec/*).split("\n")
  spec.require_paths = ["lib"]
  spec.extensions    = %x(git ls-files -- ext/**/extconf.rb).split("\n")

  spec.add_development_dependency "rake-compiler"
  spec.add_development_dependency "rspec", "~> 2.6.0"
end
@@ -0,0 +1,6 @@
1
+ require 'spec_helper'
2
+
3
# Placeholder group: no examples have been written for this function yet.
describe("Cluster.clustercentroids") { pending }
6
+
@@ -0,0 +1,106 @@
1
+ require 'spec_helper'
2
+
3
describe "Cluster.clusterdistance" do
  # Calls Cluster.clusterdistance for two clusters (arrays of row indices)
  # with the option set shared by every expectation in this file.
  # NOTE(review): 'e' / 'a' presumably select Euclidean distance and the
  # arithmetic-mean method of the C Clustering Library -- confirm.
  def distance_between(data, first, second, mask, weight)
    options = {
      :mask      => mask,
      :weight    => weight,
      :dist      => 'e',
      :method    => 'a',
      :transpose => false
    }
    Cluster.clusterdistance data, first, second, options
  end

  # Builds a mask with the same shape as +data+ in which every entry is 1.
  def full_mask(data)
    data.map { |row| Array.new(row.size, 1) }
  end

  it "calculates distances for data set 1" do
    weight = [1, 1, 1, 1, 1]
    data   = [[ 1.1, 2.2, 3.3, 4.4, 5.5],
              [ 3.1, 3.2, 1.3, 2.4, 1.5],
              [ 4.1, 2.2, 0.3, 5.4, 0.5],
              [12.1, 2.0, 0.0, 5.0, 0.0]]
    mask   = full_mask(data)

    # Cluster assignments
    c1, c2, c3 = [0], [1, 2], [3]

    distance_between(data, c1, c2, mask, weight).should be_within(0.001).of(6.650)
    distance_between(data, c1, c3, mask, weight).should be_within(0.001).of(32.508)
    distance_between(data, c2, c3, mask, weight).should be_within(0.001).of(15.118)
  end

  it "calculates distances for data set 2" do
    weight = [1, 1]
    data   = [[1.1, 1.2],
              [1.4, 1.3],
              [1.1, 1.5],
              [2.0, 1.5],
              [1.7, 1.9],
              [1.7, 1.9],
              [5.7, 5.9],
              [5.7, 5.9],
              [3.1, 3.3],
              [5.4, 5.3],
              [5.1, 5.5],
              [5.0, 5.5],
              [5.1, 5.2]]
    mask   = full_mask(data)

    # Cluster assignments
    c1, c2, c3 = [0, 1, 2, 3], [4, 5, 6, 7], [8]

    distance_between(data, c1, c2, mask, weight).should be_within(0.001).of(5.833)
    distance_between(data, c1, c3, mask, weight).should be_within(0.001).of(3.298)
    distance_between(data, c2, c3, mask, weight).should be_within(0.001).of(0.360)
  end
end
106
+
@@ -0,0 +1,6 @@
1
+ require 'spec_helper'
2
+
3
# Placeholder group: no examples have been written for this function yet.
describe("Cluster.clustermedoids") { pending }
6
+
@@ -0,0 +1,6 @@
1
+ require 'spec_helper'
2
+
3
# Placeholder group: no examples have been written for this function yet.
describe("Cluster.cuttree") { pending }
6
+
@@ -0,0 +1,95 @@
1
+ require 'spec_helper'
2
+
3
describe "Cluster.kcluster" do
  # Runs Cluster.kcluster on +data+ with an all-ones mask and the explicit
  # option set used by the first two examples; returns the raw result, which
  # the examples destructure into clusterids, error and nfound.
  def run_kcluster(data, weight, nclusters)
    all_ones = data.map { |row| Array.new(row.size, 1) }

    Cluster.kcluster data, :clusters  => nclusters,
                           :mask      => all_ones,
                           :weight    => weight,
                           :transpose => false,
                           :passes    => 100,
                           :method    => 'a',
                           :dist      => 'e'
  end

  # Checks that +clusterids+ describes the same partition as +correct+.
  # The labels themselves may differ, so each reference label is first mapped
  # to the label kcluster gave to that reference cluster's first member.
  def expect_same_partition(clusterids, correct, nclusters)
    label_for = nclusters.times.map { |n| clusterids[correct.index(n)] }

    clusterids.each_with_index do |assigned, row|
      assigned.should == label_for[correct[row]]
    end
  end

  it "should run kcluster for the given data" do
    nclusters = 3
    # First data set
    weight = [1, 1, 1, 1, 1]
    data   = [[ 1.1, 2.2, 3.3, 4.4, 5.5],
              [ 3.1, 3.2, 1.3, 2.4, 1.5],
              [ 4.1, 2.2, 0.3, 5.4, 0.5],
              [12.1, 2.0, 0.0, 5.0, 0.0]]

    clusterids, _error, _nfound = run_kcluster(data, weight, nclusters)

    clusterids.size.should == data.size
    expect_same_partition(clusterids, [0, 1, 1, 2], nclusters)
  end

  it "should run kcluster for a second set of data" do
    nclusters = 3
    weight = [1, 1]
    data   = [[1.1, 1.2],
              [1.4, 1.3],
              [1.1, 1.5],
              [2.0, 1.5],
              [1.7, 1.9],
              [1.7, 1.9],
              [5.7, 5.9],
              [5.7, 5.9],
              [3.1, 3.3],
              [5.4, 5.3],
              [5.1, 5.5],
              [5.0, 5.5],
              [5.1, 5.2]]

    clusterids, _error, _nfound = run_kcluster(data, weight, nclusters)

    clusterids.size.should == data.size
    expect_same_partition(clusterids, [0, 0, 0, 0, 0, 0, 1, 1, 2, 1, 1, 1, 1], nclusters)
  end

  it "raises ArgumentError if passed inconsistent data" do
    ragged = [[1, 2, 3], [1, 2, 3, 4]]

    lambda {
      Cluster.kcluster ragged, {}
    }.should raise_error(ArgumentError, "expected 3 columns, row has 4")
  end

  it "will use default options" do
    data = [[1, 1, 1], [10, 10, 0], [0, 0, 0]]
    clusterids, _error, _nfound = Cluster.kcluster(data, :passes => 1000)

    clusterids.should be_kind_of(Array)
    # Either labelling of the two groups is acceptable.
    acceptable = [[0, 1, 0], [1, 0, 1]]
    acceptable.should include(clusterids)
  end
end
@@ -0,0 +1,86 @@
1
+ require 'spec_helper'
2
+
3
describe "Cluster.kmedoids" do
  it "should calculate kmedoids from a distance matrix" do
    data = [[2.2, 3.3, 4.4],
            [2.1, 1.4, 5.6],
            [7.8, 9.0, 1.2],
            [4.5, 2.3, 1.5],
            [4.2, 2.4, 1.9],
            [3.6, 3.1, 9.3],
            [2.3, 1.2, 3.9],
            [4.2, 9.6, 9.3],
            [1.7, 8.9, 1.1]]

    mask = [[1, 1, 1],
            [1, 1, 1],
            [0, 1, 1],
            [1, 1, 1],
            [1, 1, 1],
            [0, 1, 0],
            [1, 1, 1],
            [1, 0, 1],
            [1, 1, 1]]

    weight = [2.0, 1.0, 0.5]
    matrix = Cluster.distancematrix data, :mask => mask, :weight => weight

    # Expected entries below the diagonal: the first list belongs to
    # matrix[1], the second to matrix[2], and so on; position j within a
    # list is column j.
    expected_below_diagonal = [
      [1.243],
      [25.073, 44.960],
      [4.510, 5.924, 29.957],
      [3.410, 4.761, 29.203, 0.077],
      [0.040, 2.890, 34.810, 0.640, 0.490],
      [1.301, 0.447, 42.990, 3.934, 3.046, 3.610],
      [8.002, 6.266, 65.610, 12.240, 10.952, 0.000, 8.720],
      [10.659, 19.056, 0.010, 16.949, 15.734, 33.640, 18.266, 18.448]
    ]

    expected_below_diagonal.each_with_index do |expected_row, offset|
      row = offset + 1
      expected_row.each_with_index do |expected, col|
        matrix[row][col].should be_within(0.001).of(expected)
      end
    end

    clusterid, error, _nfound = Cluster.kmedoids matrix, :passes => 1000

    # Expected value of clusterid at each index.
    [5, 5, 2, 5, 5, 5, 5, 5, 2].each_with_index do |expected, item|
      clusterid[item].should == expected
    end

    error.should be_within(0.001).of(7.680)
  end
end
@@ -0,0 +1,26 @@
1
+ require 'spec_helper'
2
+
3
describe "Cluster.{median,mean}" do
  # Four sample series of different lengths, mixing integers and floats.
  let(:data) do
    [
      [34.3, 3, 2],
      [5, 10, 15, 20],
      [1, 2, 3, 5, 7, 11, 13, 17],
      [100, 19, 3, 1.5, 1.4, 1, 1, 1]
    ]
  end

  it "calculates the median" do
    # One expected median per series, compared exactly.
    [3.0, 12.5, 6.0, 1.45].each_with_index do |expected, series|
      Cluster.median(data[series]).should == expected
    end
  end

  it "calculates the mean" do
    # One expected mean per series, compared with a 0.001 tolerance.
    [13.1, 12.5, 7.375, 15.988].each_with_index do |expected, series|
      Cluster.mean(data[series]).should be_within(0.001).of(expected)
    end
  end
end
@@ -0,0 +1,27 @@
1
+ require 'spec_helper'
2
+
3
module Cluster
  # Examples for Node's constructor, readers and writers.
  describe Node do
    it "creates a new node with left/right" do
      n = Node.new(2, 3)
      n.left.should == 2
      n.right.should == 3
    end

    it "takes an optional distance" do
      n = Node.new(2, 3, 0.91)

      n.left.should == 2
      n.right.should == 3
      n.distance.should == 0.91
    end

    it "is mutable" do
      n = Node.new(2, 3, 0.91)

      n.left = 4
      n.right = 5
      n.distance = 2.1

      # Read the values back; without these checks the example could not
      # fail on a writer that silently ignores the assignment.
      n.left.should == 4
      n.right.should == 5
      n.distance.should == 2.1
    end
  end
end
@@ -0,0 +1,113 @@
1
+ require 'spec_helper'
2
+
3
describe "Cluster.pca" do
  # Compares actual[i] with expected[i] (tolerance 0.001) for every index
  # present in +expected+; extra entries in +actual+ are not inspected.
  def expect_vector(actual, expected)
    expected.each_with_index do |value, i|
      actual[i].should be_within(0.001).of(value)
    end
  end

  # Row-wise version of expect_vector for nested arrays.
  def expect_rows(actual, expected_rows)
    expected_rows.each_with_index do |expected, i|
      expect_vector(actual[i], expected)
    end
  end

  it "performs principal component analysis where nrows > ncols" do
    data = [
      [3.1, 1.2],
      [1.4, 1.3],
      [1.1, 1.5],
      [2.0, 1.5],
      [1.7, 1.9],
      [1.7, 1.9],
      [5.7, 5.9],
      [5.7, 5.9],
      [3.1, 3.3],
      [5.4, 5.3],
      [5.1, 5.5],
      [5.0, 5.5],
      [5.1, 5.2],
    ]

    mean, coordinates, pc, eigenvalues = Cluster.pca(data)

    expect_vector mean, [3.5461538461538464, 3.5307692307692311]

    expect_rows coordinates, [
      [ 2.0323189722653883,   1.2252420399694917],
      [ 3.0936985166252251,  -0.10647619705157851],
      [ 3.1453186907749426,  -0.46331699855941139],
      [ 2.5440202962223761,   0.20633980959571077],
      [ 2.4468278463376221,  -0.28412285736824866],
      [ 2.4468278463376221,  -0.28412285736824866],
      [-3.2018619434743254,   0.019692314198662915],
      [-3.2018619434743254,   0.019692314198662915],
      [ 0.46978641990344067, -0.17778754731982949],
      [-2.5549912731867215,   0.19733897451533403],
      [-2.5033710990370044,  -0.15950182699250004],
      [-2.4365601663089413,  -0.23390813900973562],
      [-2.2801521629852974,   0.0409309711916888]
    ]

    expect_rows pc, [
      [-0.66810932728062988, -0.74406312017235743],
      [ 0.74406312017235743, -0.66810932728062988]
    ]

    expect_vector eigenvalues, [9.3110471246032844, 1.4437456297481428]
  end

  it "performs principal component analysis where ncols > nrows" do
    data = [[2.3, 4.5, 1.2, 6.7, 5.3, 7.1],
            [1.3, 6.5, 2.2, 5.7, 6.2, 9.1],
            [3.2, 7.2, 3.2, 7.4, 7.3, 8.9],
            [4.2, 5.2, 9.2, 4.4, 6.3, 7.2]]

    mean, coordinates, pc, eigenvalues = Cluster.pca(data)

    expect_vector mean, [2.7500, 5.8500, 3.9500, 6.0500, 6.2750, 8.0750]

    expect_rows coordinates, [
      [ 2.6460846688406905, -2.1421701432732418,  -0.56620932754145858,  0.0],
      [ 2.0644120899917544,  0.55542108669180323,  1.4818772348457117,   0.0],
      [ 1.0686641862092987,  1.9994412069101073,  -1.000720598980291,    0.0],
      [-5.77916094504174,   -0.41269215032867046,  0.085052691676038017, 0.0]
    ]

    # Only pc[0..2], columns 0..3, are checked.
    expect_rows pc, [
      [-0.26379660005997291, 0.064814972617134495, -0.91763310094893846,   0.26145408875373249],
      [ 0.05073770520434398, 0.68616983388698793,   0.13819106187213354,   0.19782544121828985],
      [-0.63000893660095947, 0.091155993862151397,  0.045630391256086845, -0.67456694780914772]
    ]

    # As the last eigenvalue is zero, the corresponding eigenvector is
    # strongly affected by roundoff error, and is not being tested here.
    # For PCA, this doesn't matter since all data have a zero coefficient
    # along this eigenvector.

    expect_vector eigenvalues, [6.7678878332578778, 3.0108911400291856, 1.8775592718563467, 0.0]
  end
end
113
+