rbcluster 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,5 @@
1
+ require 'rbcluster/version'
2
+ require 'rbcluster/rbcluster'
3
+ require 'rbcluster/tree'
4
+
5
+ RbCluster = Cluster
@@ -0,0 +1,20 @@
module Cluster
  # Wrapper intended to hold a list of Node objects.
  #
  # Construction is currently disabled: #initialize raises
  # NotImplementedError before it looks at its argument, so no instance can
  # be built through Tree.new.
  class Tree
    # @param nodes [#each_with_index] collection whose elements are all
    #   expected to be Node instances
    # @raise [NotImplementedError] always, until the stub guard is removed
    # @raise [ArgumentError] (once the guard is removed) if any element is
    #   not a Node; the message names the offending index
    def initialize(nodes)
      raise NotImplementedError, "patches welcome :)"

      # Unreachable while the guard above is in place; kept so the intended
      # validation is ready when the constructor is enabled.
      nodes.each_with_index do |node, idx|
        next if node.kind_of?(Node)

        # Interpolate the Node constant itself: `Node.class` is always
        # `Class`, which would make the message read "expected Class".
        raise ArgumentError, "expected #{Node}, got #{node.class} at index #{idx}"
      end

      @nodes = nodes
    end

    # @return whatever `@nodes.size` reports, i.e. the number of stored nodes
    def size
      @nodes.size
    end
  end
end
@@ -0,0 +1,3 @@
# Namespace of the rbcluster library.
module Cluster
  # Release number of the gem; the gemspec assigns this constant to
  # `s.version`.
  VERSION = '0.0.1'
end
@@ -0,0 +1,24 @@
# -*- encoding: utf-8 -*-
# Gem specification for rbcluster.

# Put ./lib on the load path so the version constant can be required below.
lib_dir = File.expand_path("../lib", __FILE__)
$LOAD_PATH.push(lib_dir)
require "rbcluster/version"

# Runs a shell command and returns its output as a list of lines.
list_files = lambda { |command| `#{command}`.split("\n") }

Gem::Specification.new do |spec|
  spec.name        = "rbcluster"
  spec.version     = Cluster::VERSION
  spec.platform    = Gem::Platform::RUBY
  spec.authors     = ["Jari Bakken", "Michiel Jan Laurens de Hoon"]
  spec.email       = ["jari.bakken@gmail.com"]
  spec.homepage    = "http://bonsai.hgc.jp/~mdehoon/software/cluster/software.htm"
  spec.summary     = "Ruby bindings for the Cluster C library"
  spec.description = "This gem provides a Ruby extension to the clustering routines in the C Clustering Library (which also backs e.g. Python's pycluster and Perl's Algorithm::Cluster)."

  spec.rubyforge_project = "rbcluster"

  # File lists come from git, so the gem must be built from a git checkout.
  spec.files         = list_files.call("git ls-files")
  spec.test_files    = list_files.call("git ls-files -- spec/*")
  spec.require_paths = ["lib"]
  spec.extensions    = list_files.call("git ls-files -- ext/**/extconf.rb")

  spec.add_development_dependency "rake-compiler"
  spec.add_development_dependency "rspec", "~> 2.6.0"
end
@@ -0,0 +1,6 @@
1
+ require 'spec_helper'
2
+
# Placeholder group: no examples exist for Cluster.clustercentroids yet.
# NOTE(review): the bare `pending` presumably registers a pending example so
# the gap shows up in the spec run; confirm against the RSpec version in use.
describe 'Cluster.clustercentroids' do
  pending
end
6
+
@@ -0,0 +1,106 @@
1
+ require 'spec_helper'
2
+
describe "Cluster.clusterdistance" do
  # Calls Cluster.clusterdistance for the clusters +first+ and +second+ with
  # the option set shared by every assertion in this group.
  def distance_between(data, first, second, mask, weight)
    Cluster.clusterdistance(data, first, second,
                            :mask      => mask,
                            :weight    => weight,
                            :dist      => 'e',
                            :method    => 'a',
                            :transpose => false)
  end

  it "calculates distances for data set 1" do
    weight = Array.new(5, 1)
    data   = [[  1.1, 2.2, 3.3, 4.4, 5.5 ],
              [  3.1, 3.2, 1.3, 2.4, 1.5 ],
              [  4.1, 2.2, 0.3, 5.4, 0.5 ],
              [ 12.1, 2.0, 0.0, 5.0, 0.0 ]]
    # Every cell is flagged 1.
    mask   = Array.new(data.size) { Array.new(5, 1) }

    # Cluster assignments
    c1 = [0]
    c2 = [1, 2]
    c3 = [3]

    # Checked in this order: (c1, c2), (c1, c3), (c2, c3).
    [[c1, c2, 6.650],
     [c1, c3, 32.508],
     [c2, c3, 15.118]].each do |first, second, expected|
      distance = distance_between(data, first, second, mask, weight)
      distance.should be_within(0.001).of(expected)
    end
  end

  it "calculates distances for data set 2" do
    weight = [1, 1]
    data   = [[ 1.1, 1.2 ],
              [ 1.4, 1.3 ],
              [ 1.1, 1.5 ],
              [ 2.0, 1.5 ],
              [ 1.7, 1.9 ],
              [ 1.7, 1.9 ],
              [ 5.7, 5.9 ],
              [ 5.7, 5.9 ],
              [ 3.1, 3.3 ],
              [ 5.4, 5.3 ],
              [ 5.1, 5.5 ],
              [ 5.0, 5.5 ],
              [ 5.1, 5.2 ]]
    # Every cell is flagged 1.
    mask   = Array.new(data.size) { [1, 1] }

    # Cluster assignments
    c1 = [0, 1, 2, 3]
    c2 = [4, 5, 6, 7]
    c3 = [8]

    # Checked in this order: (c1, c2), (c1, c3), (c2, c3).
    [[c1, c2, 5.833],
     [c1, c3, 3.298],
     [c2, c3, 0.360]].each do |first, second, expected|
      distance = distance_between(data, first, second, mask, weight)
      distance.should be_within(0.001).of(expected)
    end
  end
end
106
+
@@ -0,0 +1,6 @@
1
+ require 'spec_helper'
2
+
# Placeholder group: no examples exist for Cluster.clustermedoids yet.
# NOTE(review): the bare `pending` presumably registers a pending example so
# the gap shows up in the spec run; confirm against the RSpec version in use.
describe 'Cluster.clustermedoids' do
  pending
end
6
+
@@ -0,0 +1,6 @@
1
+ require 'spec_helper'
2
+
# Placeholder group: no examples exist for Cluster.cuttree yet.
# NOTE(review): the bare `pending` presumably registers a pending example so
# the gap shows up in the spec run; confirm against the RSpec version in use.
describe 'Cluster.cuttree' do
  pending
end
6
+
@@ -0,0 +1,95 @@
1
+ require 'spec_helper'
2
+
describe "Cluster.kcluster" do
  # Runs Cluster.kcluster with the explicit option set used by the two
  # data-set examples below.
  def run_kcluster(data, nclusters, mask, weight)
    Cluster.kcluster(data,
                     :clusters  => nclusters,
                     :mask      => mask,
                     :weight    => weight,
                     :transpose => false,
                     :passes    => 100,
                     :method    => 'a',
                     :dist      => 'e')
  end

  # Compares the returned ids with the reference labelling +correct+ through
  # a mapping built from the first row of each reference cluster, so only
  # the grouping has to match, not the concrete id values.
  def expect_same_partition(clusterids, correct, nclusters)
    mapping = nclusters.times.map { |n| clusterids[correct.index(n)] }
    clusterids.each_index do |i|
      clusterids[i].should == mapping[correct[i]]
    end
  end

  it "should run kcluster for the given data" do
    nclusters = 3
    # First data set
    weight = Array.new(5, 1)
    data   = [[  1.1, 2.2, 3.3, 4.4, 5.5 ],
              [  3.1, 3.2, 1.3, 2.4, 1.5 ],
              [  4.1, 2.2, 0.3, 5.4, 0.5 ],
              [ 12.1, 2.0, 0.0, 5.0, 0.0 ]]
    mask   = Array.new(data.size) { Array.new(5, 1) }

    clusterids, _error, _nfound = run_kcluster(data, nclusters, mask, weight)

    clusterids.size.should == data.size
    expect_same_partition(clusterids, [0, 1, 1, 2], nclusters)
  end

  it "should run kcluster for a second set of data" do
    nclusters = 3
    weight = [1, 1]
    data   = [[ 1.1, 1.2 ],
              [ 1.4, 1.3 ],
              [ 1.1, 1.5 ],
              [ 2.0, 1.5 ],
              [ 1.7, 1.9 ],
              [ 1.7, 1.9 ],
              [ 5.7, 5.9 ],
              [ 5.7, 5.9 ],
              [ 3.1, 3.3 ],
              [ 5.4, 5.3 ],
              [ 5.1, 5.5 ],
              [ 5.0, 5.5 ],
              [ 5.1, 5.2 ]]
    mask   = Array.new(data.size) { [1, 1] }

    clusterids, _error, _nfound = run_kcluster(data, nclusters, mask, weight)

    clusterids.size.should == data.size
    expect_same_partition(clusterids,
                          [0, 0, 0, 0, 0, 0, 1, 1, 2, 1, 1, 1, 1],
                          nclusters)
  end

  it "raises ArgumentError if passed inconsistent data" do
    ragged = [[1, 2, 3], [1, 2, 3, 4]]

    lambda {
      Cluster.kcluster(ragged, {})
    }.should raise_error(ArgumentError, "expected 3 columns, row has 4")
  end

  it "will use default options" do
    data = [[1, 1, 1], [10, 10, 0], [0, 0, 0]]
    clusterids, _error, _nfound = Cluster.kcluster(data, :passes => 1000)

    clusterids.should be_kind_of(Array)
    # Either labelling of the same two-way split is accepted.
    [[0, 1, 0], [1, 0, 1]].should include(clusterids)
  end
end
@@ -0,0 +1,86 @@
1
+ require 'spec_helper'
2
+
describe "Cluster.kmedoids" do
  it "should calculate kmedoids from a distance matrix" do
    data = [[2.2, 3.3, 4.4],
            [2.1, 1.4, 5.6],
            [7.8, 9.0, 1.2],
            [4.5, 2.3, 1.5],
            [4.2, 2.4, 1.9],
            [3.6, 3.1, 9.3],
            [2.3, 1.2, 3.9],
            [4.2, 9.6, 9.3],
            [1.7, 8.9, 1.1]]

    mask = [[1, 1, 1],
            [1, 1, 1],
            [0, 1, 1],
            [1, 1, 1],
            [1, 1, 1],
            [0, 1, 0],
            [1, 1, 1],
            [1, 0, 1],
            [1, 1, 1]]

    weight = [2.0, 1.0, 0.5]
    matrix = Cluster.distancematrix(data, :mask => mask, :weight => weight)

    # Expected lower triangle of the distance matrix: the entry at position
    # [k][j] below is the expected value of matrix[k + 1][j].
    expected_rows = [
      [ 1.243],
      [25.073, 44.960],
      [ 4.510,  5.924, 29.957],
      [ 3.410,  4.761, 29.203,  0.077],
      [ 0.040,  2.890, 34.810,  0.640,  0.490],
      [ 1.301,  0.447, 42.990,  3.934,  3.046,  3.610],
      [ 8.002,  6.266, 65.610, 12.240, 10.952,  0.000,  8.720],
      [10.659, 19.056,  0.010, 16.949, 15.734, 33.640, 18.266, 18.448]
    ]

    expected_rows.each_with_index do |expected_row, offset|
      row = offset + 1
      expected_row.each_with_index do |expected, col|
        matrix[row][col].should be_within(0.001).of(expected)
      end
    end

    clusterid, error, _nfound = Cluster.kmedoids(matrix, :passes => 1000)

    [5, 5, 2, 5, 5, 5, 5, 5, 2].each_with_index do |expected_id, row|
      clusterid[row].should == expected_id
    end

    error.should be_within(0.001).of(7.680)
  end
end
@@ -0,0 +1,26 @@
1
+ require 'spec_helper'
2
+
describe "Cluster.{median,mean}" do
  # Four input rows of different lengths, mixing integers and floats.
  let(:data) do
    [
      [ 34.3, 3, 2 ],
      [ 5, 10, 15, 20 ],
      [ 1, 2, 3, 5, 7, 11, 13, 17 ],
      [ 100, 19, 3, 1.5, 1.4, 1, 1, 1 ],
    ]
  end

  it "calculates the median" do
    # Expected median of data[0], data[1], data[2], data[3], in that order.
    [3.0, 12.5, 6.0, 1.45].each_with_index do |expected, row|
      Cluster.median(data[row]).should == expected
    end
  end

  it "calculates the mean" do
    # Expected mean of data[0], data[1], data[2], data[3], in that order.
    [13.1, 12.5, 7.375, 15.988].each_with_index do |expected, row|
      Cluster.mean(data[row]).should be_within(0.001).of(expected)
    end
  end
end
@@ -0,0 +1,27 @@
1
+ require 'spec_helper'
2
+
module Cluster
  # Examples for Cluster::Node: construction from left/right (plus an
  # optional distance) and the attribute writers.
  describe Node do
    it "creates a new node with left/right" do
      n = Node.new(2, 3)
      n.left.should == 2
      n.right.should == 3
    end

    it "takes an optional distance" do
      n = Node.new(2, 3, 0.91)

      n.left.should == 2
      n.right.should == 3
      n.distance.should == 0.91
    end

    it "is mutable" do
      n = Node.new(2, 3, 0.91)

      n.left = 4
      n.right = 5
      n.distance = 2.1

      # Read the values back: without these checks the example would pass
      # even if the writers silently ignored the assignment.
      n.left.should == 4
      n.right.should == 5
      n.distance.should == 2.1
    end
  end
end
@@ -0,0 +1,113 @@
1
+ require 'spec_helper'
2
+
describe "Cluster.pca" do
  # Checks each listed value of a flat result against the expected list.
  def expect_list_close(actual, expected)
    expected.each_with_index do |value, i|
      actual[i].should be_within(0.001).of(value)
    end
  end

  # Checks each listed cell of a nested result, row by row. Only cells
  # present in +expected+ are compared.
  def expect_rows_close(actual, expected)
    expected.each_with_index do |row, i|
      row.each_with_index do |value, j|
        actual[i][j].should be_within(0.001).of(value)
      end
    end
  end

  it "performs principal component analysis where nrows > ncols" do
    data = [
      [ 3.1, 1.2 ],
      [ 1.4, 1.3 ],
      [ 1.1, 1.5 ],
      [ 2.0, 1.5 ],
      [ 1.7, 1.9 ],
      [ 1.7, 1.9 ],
      [ 5.7, 5.9 ],
      [ 5.7, 5.9 ],
      [ 3.1, 3.3 ],
      [ 5.4, 5.3 ],
      [ 5.1, 5.5 ],
      [ 5.0, 5.5 ],
      [ 5.1, 5.2 ],
    ]

    mean, coordinates, pc, eigenvalues = Cluster.pca(data)

    expect_list_close(mean, [3.5461538461538464, 3.5307692307692311])

    expect_rows_close(coordinates, [
      [  2.0323189722653883,  1.2252420399694917   ],
      [  3.0936985166252251, -0.10647619705157851  ],
      [  3.1453186907749426, -0.46331699855941139  ],
      [  2.5440202962223761,  0.20633980959571077  ],
      [  2.4468278463376221, -0.28412285736824866  ],
      [  2.4468278463376221, -0.28412285736824866  ],
      [ -3.2018619434743254,  0.019692314198662915 ],
      [ -3.2018619434743254,  0.019692314198662915 ],
      [  0.46978641990344067, -0.17778754731982949 ],
      [ -2.5549912731867215,  0.19733897451533403  ],
      [ -2.5033710990370044, -0.15950182699250004  ],
      [ -2.4365601663089413, -0.23390813900973562  ],
      [ -2.2801521629852974,  0.0409309711916888   ],
    ])

    expect_rows_close(pc, [
      [ -0.66810932728062988, -0.74406312017235743 ],
      [  0.74406312017235743, -0.66810932728062988 ],
    ])

    expect_list_close(eigenvalues, [9.3110471246032844, 1.4437456297481428])
  end

  it "performs principal component analysis where ncols > nrows" do
    data = [[ 2.3, 4.5, 1.2, 6.7, 5.3, 7.1 ],
            [ 1.3, 6.5, 2.2, 5.7, 6.2, 9.1 ],
            [ 3.2, 7.2, 3.2, 7.4, 7.3, 8.9 ],
            [ 4.2, 5.2, 9.2, 4.4, 6.3, 7.2 ]]

    mean, coordinates, pc, eigenvalues = Cluster.pca(data)

    expect_list_close(mean, [2.7500, 5.8500, 3.9500, 6.0500, 6.2750, 8.0750])

    expect_rows_close(coordinates, [
      [  2.6460846688406905, -2.1421701432732418,  -0.56620932754145858, 0.0 ],
      [  2.0644120899917544,  0.55542108669180323,  1.4818772348457117,  0.0 ],
      [  1.0686641862092987,  1.9994412069101073,  -1.000720598980291,   0.0 ],
      [ -5.77916094504174,   -0.41269215032867046,  0.085052691676038017, 0.0 ],
    ])

    # Only the first four entries of the first three rows are checked.
    expect_rows_close(pc, [
      [ -0.26379660005997291, 0.064814972617134495, -0.91763310094893846,   0.26145408875373249 ],
      [  0.05073770520434398, 0.68616983388698793,   0.13819106187213354,   0.19782544121828985 ],
      [ -0.63000893660095947, 0.091155993862151397,  0.045630391256086845, -0.67456694780914772 ],
    ])

    # As the last eigenvalue is zero, the corresponding eigenvector is
    # strongly affected by roundoff error, and is not being tested here.
    # For PCA, this doesn't matter since all data have a zero coefficient
    # along this eigenvector.

    expect_list_close(eigenvalues, [6.7678878332578778,
                                    3.0108911400291856,
                                    1.8775592718563467,
                                    0.0])
  end
end
113
+