flock 0.4.1 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/API.rdoc +22 -0
- data/README.rdoc +44 -14
- data/Rakefile +5 -0
- data/VERSION +1 -1
- data/ext/cluster.c +2993 -2680
- data/ext/cluster.h +1 -1
- data/ext/flock.c +235 -67
- data/ext/kmeanspp.c +129 -0
- data/flock.gemspec +13 -18
- data/lib/flock.rb +229 -32
- metadata +10 -15
- data/examples/dense.rb +0 -38
- data/examples/som.rb +0 -13
- data/examples/sparse.rb +0 -22
- data/examples/treecluster.rb +0 -13
data/ext/kmeanspp.c
ADDED
@@ -0,0 +1,129 @@
+#include <stdlib.h>
+
+extern double uniform();
+typedef struct clusterpoint {
+  double dist;
+  int n, chosen, closest;
+} clusterpoint;
+
+int compare(const void *ptr1, const void *ptr2) {
+  clusterpoint *p1 = (clusterpoint *)ptr1, *p2 = (clusterpoint *)ptr2;
+  return p1->dist == p2->dist ? 0 : p1->dist < p2->dist ? -1 : 1;
+}
+
+double compute_distances(int ndata, int npoints,
+  double **data, int **mask, double weight[], int transpose, clusterpoint dists[],
+  double (*metric)(int, double**, double**, int**, int**, const double[], int, int, int)) {
+
+  int i, j, closest = 0;
+  double min, dist, total = 0;
+
+  // compute distances to chosen point
+  for (i = 0; i < npoints; i++) {
+    if (dists[i].chosen) continue;
+
+    min = -1;
+    for (j = 0; j < npoints; j++) {
+      if (!dists[j].chosen) continue;
+
+      dist = metric(ndata, data, data, mask, mask, weight, dists[i].n, dists[j].n, transpose);
+      if (min < 0 || min > dist) {
+        min = dist;
+        closest = j;
+      }
+    }
+
+    dists[i].dist = min * min;
+    dists[i].closest = closest;
+    total += dists[i].dist;
+  }
+
+  return total;
+}
+
+void weightedassign(int nclusters, int nrows, int ncolumns,
+  double** data, int** mask, double weight[], int transpose,
+  double (*metric)(int, double**, double**, int**, int**, const double[], int, int, int),
+  int clusterid[]) {
+
+  int i, n, chosen = (int)((double)nrows*uniform());
+  int ndata = (transpose == 0 ? ncolumns : nrows), npoints = (transpose == 0 ? nrows : ncolumns);
+  double total = 0, cutoff, curr;
+  clusterpoint dists[npoints];
+
+  for (i = 0; i < npoints; i++) {
+    dists[i].n = i;
+    dists[i].chosen = 0;
+    dists[i].dist = 0;
+  }
+
+  // setup 1st centroid
+  n = 1;
+  clusterid[chosen] = 0;
+  dists[chosen].chosen = 1;
+
+  // pick k-points for k-clusters with a probability weighted by square of distance from closest centroid.
+  while (n < nclusters) {
+    total = compute_distances(ndata, npoints, data, mask, weight, transpose, dists, metric);
+    qsort((void*)dists, npoints, sizeof(clusterpoint), compare);
+
+    curr = 0;
+    cutoff = total * uniform();
+    for (i = 0; i < npoints; i++) {
+      if (dists[i].chosen) continue;
+      curr += dists[i].dist;
+      if (curr >= cutoff || i == (npoints - 1)) {
+        clusterid[dists[i].n] = n++;
+        dists[i].chosen = 1;
+        dists[i].dist = 0;
+        break;
+      }
+    }
+  }
+
+  // assign remaining points to closest cluster
+  compute_distances(ndata, npoints, data, mask, weight, transpose, dists, metric);
+  for (n = 0; n < npoints; n++) {
+    if (dists[n].chosen) continue;
+    clusterid[dists[n].n] = clusterid[dists[dists[n].closest].n];
+  }
+}
+
+void spreadoutassign(int nclusters, int nrows, int ncolumns,
+  double** data, int** mask, double weight[], int transpose,
+  double (*metric)(int, double**, double**, int**, int**, const double[], int, int, int),
+  int clusterid[]) {
+
+  int i, n, chosen = 0;
+  int ndata = (transpose == 0 ? ncolumns : nrows), npoints = (transpose == 0 ? nrows : ncolumns);
+  clusterpoint dists[npoints];
+
+  for (i = 0; i < npoints; i++) {
+    dists[i].n = i;
+    dists[i].chosen = 0;
+    dists[i].dist = 0;
+  }
+
+  // setup 1st centroid
+  n = 1;
+  clusterid[chosen] = 0;
+  dists[chosen].chosen = 1;
+
+  // pick k-points for k-clusters with max distance from all centers.
+  chosen = npoints - 1;
+  while (n < nclusters) {
+    compute_distances(ndata, npoints, data, mask, weight, transpose, dists, metric);
+    qsort((void*)dists, npoints, sizeof(clusterpoint), compare);
+
+    clusterid[dists[chosen].n] = n++;
+    dists[chosen].chosen = 1;
+    dists[chosen].dist = 0;
+  }
+
+  // assign remaining points to closest cluster
+  compute_distances(ndata, npoints, data, mask, weight, transpose, dists, metric);
+  for (n = 0; n < npoints; n++) {
+    if (dists[n].chosen) continue;
+    clusterid[dists[n].n] = clusterid[dists[dists[n].closest].n];
+  }
+}
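The added ext/kmeanspp.c backs two new seeding strategies for k-means: weightedassign picks each next centre with probability proportional to its squared distance from the nearest already-chosen centre (the k-means++ rule), while spreadoutassign always picks the point farthest from the chosen centres. From Ruby these are selected through the seed: option of Flock.kcluster, as the updated RDoc in lib/flock.rb below documents. A minimal sketch of the call, assuming the extension is compiled; the printed result is illustrative only:

    require 'flock'

    data = []
    data << %w(apple orange)
    data << %w(black white)
    data << %w(white cyan)
    data << %w(apple orange)
    data << %w(apple)

    # default random seeding vs. the two strategies implemented in kmeanspp.c
    random   = Flock.kcluster(2, data, sparse: true, seed: Flock::SEED_RANDOM)
    plusplus = Flock.kcluster(2, data, sparse: true, seed: Flock::SEED_KMEANS_PLUSPLUS)
    spread   = Flock.kcluster(2, data, sparse: true, seed: Flock::SEED_SPREADOUT)

    p plusplus[:cluster]   # e.g. [0, 1, 1, 0, 0] -- one cluster index per data point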
data/flock.gemspec
CHANGED
@@ -1,15 +1,15 @@
 # Generated by jeweler
 # DO NOT EDIT THIS FILE DIRECTLY
-# Instead, edit Jeweler::Tasks in Rakefile, and run
+# Instead, edit Jeweler::Tasks in Rakefile, and run 'rake gemspec'
 # -*- encoding: utf-8 -*-

 Gem::Specification.new do |s|
   s.name = %q{flock}
-  s.version = "0.
+  s.version = "0.5.0"

   s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
   s.authors = ["Bharanee Rathna"]
-  s.date = %q{2011-
+  s.date = %q{2011-07-26}
   s.description = %q{A thin ruby binding to Cluster 3.0}
   s.email = ["deepfryed@gmail.com"]
   s.extensions = ["ext/extconf.rb"]
@@ -17,27 +17,22 @@ Gem::Specification.new do |s|
     "README.rdoc"
   ]
   s.files = [
+    "API.rdoc",
     "README.rdoc",
-
-
-
-
-
-
-
-
+    "Rakefile",
+    "VERSION",
+    "ext/cluster.c",
+    "ext/cluster.h",
+    "ext/extconf.rb",
+    "ext/flock.c",
+    "ext/kmeanspp.c",
+    "flock.gemspec",
+    "lib/flock.rb"
   ]
   s.homepage = %q{http://github.com/deepfryed/flock}
-  s.rdoc_options = ["--charset=UTF-8"]
   s.require_paths = ["lib"]
   s.rubygems_version = %q{1.3.7}
   s.summary = %q{Ruby bindings to Cluster 3.0.}
-  s.test_files = [
-    "examples/sparse.rb",
-    "examples/som.rb",
-    "examples/dense.rb",
-    "examples/treecluster.rb"
-  ]

   if s.respond_to? :specification_version then
     current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
data/lib/flock.rb
CHANGED
@@ -1,50 +1,247 @@
 require_relative '../ext/flock'
+
+# Ruby bindings to data clustering algorithms provided by
+# {Cluster 3.0}[http://bonsai.hgc.jp/~mdehoon/software/cluster/software.htm]
+#
+# == Algorithms implemented
+#
+# * K-Means, K-Medians, K-Means++
+# * Self-Organizing Maps
+# * Tree Cluster or Hierarchical Clustering
+#
+# == Synopsis
+#
+#   require 'pp'
+#   require 'flock'
+#
+#   # sparse data.
+#   data = []
+#   data << %w(apple orange)
+#   data << %w(black white)
+#   data << %w(white cyan)
+#   data << %w(apple orange)
+#   data << %w(apple)
+#
+#   pp Flock.kcluster(2, data, sparse: true, seed: Flock::SEED_RANDOM)
+#   pp Flock.kcluster(2, data, sparse: true, seed: Flock::SEED_KMEANS_PLUSPLUS)
+#   pp Flock.kcluster(2, data, sparse: true, seed: Flock::SEED_SPREADOUT)
+#
+#   # dense data.
+#   data    = Array.new(13) {[]}
+#   mask    = Array.new(13) {[]}
+#   weights = Array.new(13) {1.0}
+#
+#   data[0][0] = 0.1; data[0][1] = 0.0;
+#   data[1][0] = 1.4; data[1][1] = 1.3;
+#   data[2][0] = 1.2; data[2][1] = 2.5;
+#   data[3][0] = 2.3; data[3][1] = 1.5;
+#   data[4][0] = 1.7; data[4][1] = 0.7;
+#   data[5][0] = 0.0; data[5][1] = 3.9;
+#   data[6][0] = 6.7; data[6][1] = 3.9;
+#
+#   mask[0][0] = 1; mask[0][1] = 1;
+#   mask[1][0] = 1; mask[1][1] = 1;
+#   mask[2][0] = 1; mask[2][1] = 1;
+#   mask[3][0] = 1; mask[3][1] = 1;
+#   mask[4][0] = 1; mask[4][1] = 1;
+#   mask[5][0] = 0; mask[5][1] = 1;
+#   mask[6][0] = 1; mask[6][1] = 1;
+#
+#   pp Flock.kcluster(2, data, mask: mask, weights: weights)
+#
+#
+# == See
+# * examples/* for more examples.
+# * README.rdoc for more details.
+# * API.rdoc is a public API overview.
 module Flock

-
-
-
-
-
-
+  # Cluster using k-means and k-medians.
+  #
+  # @example
+  #
+  #   data = []
+  #   data << %w(apple orange)
+  #   data << %w(black white)
+  #   data << %w(white cyan)
+  #   data << %w(apple orange)
+  #   data << %w(apple)
+  #   result = Flock.kcluster(2, data, sparse: true, seed: Flock::SEED_RANDOM)
+  #
+  # @param [Fixnum] size number of clusters the data points are grouped into.
+  # @param [Array] data An array of arrays of sparse or dense data, or an array of hashes of sparse data. Dense data
+  #   should always be in numeric form. Sparse data values are converted to a dense row format
+  #   by looking at the unique values and then converting each data point into a numeric vector
+  #   that represents the presence or absence of a value in that data point.
+  # @option options [Array] :mask An array of arrays of 1s and 0s denoting if an element in the datapoint is
+  #   to be used for computing distance (defaults to: all 1 vectors).
+  # @option options [Array] :weights Numeric weight for each data point (defaults to: all 1 vector).
+  # @option options [true, false] :transpose Transpose the dense data matrix (defaults to: false).
+  # @option options [Fixnum] :iterations Number of iterations to be run (defaults to: 100).
+  # @option options [Fixnum] :method Clustering method
+  #   - Flock::METHOD_AVERAGE (default)
+  #   - Flock::METHOD_MEDIAN
+  # @option options [Fixnum] :metric Distance measure, one of the following
+  #   - Flock::METRIC_EUCLIDIAN (default)
+  #   - Flock::METRIC_CITY_BLOCK
+  #   - Flock::METRIC_CORRELATION
+  #   - Flock::METRIC_ABSOLUTE_CORRELATION
+  #   - Flock::METRIC_UNCENTERED_CORRELATION
+  #   - Flock::METRIC_ABSOLUTE_UNCENTERED_CORRELATION
+  #   - Flock::METRIC_SPEARMAN
+  #   - Flock::METRIC_KENDALL
+  # @option options [Fixnum] :seed Initial seeding of clusters
+  #   - Flock::SEED_RANDOM (default)
+  #   - Flock::SEED_KMEANS_PLUSPLUS
+  #   - Flock::SEED_SPREADOUT
+  # @return [Hash]
+  #   {
+  #     :cluster  => [Array],
+  #     :centroid => [Array<Array>],
+  #     :error    => [Numeric],
+  #     :repeated => [Fixnum]
+  #   }
+  def self.kcluster size, data, options = {}
+    options[:sparse] = true if sparse?(data[0])
+    if options[:sparse]
+      data, options[:weights] = densify(data, options[:weights])
+      options[:mask] = nil
     end
-
+    do_kcluster(size, data, options)
   end

-
-
-
-
-
-
+  # Arranges data points on a 2D grid without having to specify a fixed cluster size. So in theory you could have
+  # a maximum of nxm clusters.
+  #
+  # @example
+  #
+  #   data = []
+  #   data << %w(apple orange)
+  #   data << %w(black white)
+  #   data << %w(white cyan)
+  #   data << %w(apple orange)
+  #   data << %w(apple)
+  #   result = Flock.self_organizing_map(2, 2, data, sparse: true)
+  #
+  # @param [Fixnum] nx Grid size in 1st dimension (x)
+  # @param [Fixnum] ny Grid size in 2nd dimension (y)
+  # @param [Array] data See Flock#kcluster
+  # @option options [Array] :mask See Flock#kcluster
+  # @option options [true, false] :transpose See Flock#kcluster
+  # @option options [Fixnum] :iterations See Flock#kcluster
+  # @option options [Fixnum] :metric See Flock#kcluster
+  # @option options [Numeric] :tau Initial tau value for distance metric.
+  # @return [Hash]
+  #   {
+  #     :cluster  => [Array<Array>],
+  #     :centroid => [Array<Array>]
+  #   }
+  def self.self_organizing_map nx, ny, data, options = {}
+    options[:sparse] = true if sparse?(data[0])
+    if options[:sparse]
+      data, options[:weights] = densify(data, options[:weights])
+      options[:mask] = nil
     end
-
+    do_self_organizing_map(nx, ny, data, options)
   end

-
-
-
-
-
-
-
+  # Clusters data into hierarchies and then returns the clusters required using cut-tree.
+  #
+  # @example
+  #
+  #   data = []
+  #   data << %w(apple orange)
+  #   data << %w(black white)
+  #   data << %w(white cyan)
+  #   data << %w(apple orange)
+  #   data << %w(apple)
+  #   result = Flock.treecluster(2, data, sparse: true)
+  #
+  # @param [Fixnum] size Number of clusters required. (See Flock#kcluster)
+  # @param [Array] data See Flock#kcluster
+  # @option options [Array] :mask See Flock#kcluster
+  # @option options [true, false] :transpose See Flock#kcluster
+  # @option options [Fixnum] :iterations See Flock#kcluster
+  # @option options [Fixnum] :metric See Flock#kcluster
+  # @option options [Fixnum] :method Method to use for treecluster
+  #   - Flock::METHOD_SINGLE_LINKAGE
+  #   - Flock::METHOD_MAXIMUM_LINKAGE
+  #   - Flock::METHOD_AVERAGE_LINKAGE (default)
+  #   - Flock::METHOD_CENTROID_LINKAGE
+  # @return [Hash]
+  #   {
+  #     :cluster => [Array]
+  #   }
+  def self.treecluster size, data, options = {}
+    options[:sparse] = true if sparse?(data[0])
+    if options[:sparse]
+      data, options[:weights] = densify(data, options[:weights])
+      options[:mask] = nil
     end
+    do_treecluster(size, data, options)
+  end

-
+  # @deprecated use {kcluster} instead.
+  def self.kmeans size, data, options = {}
+    kcluster(size, data, options)
   end

-
-
-
+  # @deprecated use {kcluster}(size, data, sparse: true, ...) instead.
+  def self.sparse_kmeans size, data, options = {}
+    kcluster(size, data, options.merge(sparse: true))
   end

-
-
-
+  # @deprecated use {treecluster}(size, data, sparse: true, ...) instead.
+  def self.sparse_treecluster size, data, options = {}
+    treecluster(size, data, options.merge(sparse: true))
   end

-
-
-
+  # @deprecated use {self_organizing_map}(nx, ny, data, sparse: true, ...) instead.
+  def self.sparse_self_organizing_map nx, ny, data, options = {}
+    self_organizing_map(nx, ny, data, options.merge(sparse: true))
   end
-
+
+  private
+
+  def self.sparse? row
+    row.kind_of?(Hash) or !row[0].kind_of?(Numeric)
+  end
+
+  def self.sparse_array? row
+    !row.kind_of?(Hash)
+  end
+
+  def self.sparse_hash_to_data sparse_data
+    dims = Hash[sparse_data.map(&:keys).flatten.uniq.map.with_index{|k,v| [k,v]}]
+    data = sparse_data.map do |sv|
+      vector = Array.new(dims.size) {0}
+      sv.each {|k,v| vector[dims[k]] = v }
+      vector
+    end
+
+    [dims, data]
+  end
+
+  def self.sparse_array_to_data sparse_data
+    dims = Hash[sparse_data.flatten.uniq.map.with_index{|k,v| [k,v]}]
+    data = sparse_data.map do |sv|
+      vector = Array.new(dims.size) {0}
+      sv.each {|k| vector[dims[k]] = 1 }
+      vector
+    end
+
+    [dims, data]
+  end
+
+  def self.densify sparse_data, weights = nil
+    dims, data = sparse_array?(sparse_data[0]) ? sparse_array_to_data(sparse_data) : sparse_hash_to_data(sparse_data)
+
+    if weights
+      resampled = Array.new(dims.size) {1}
+      weights.each {|k,v| resampled[dims[k]] = v }
+      weights = resampled
+    end
+
+    [data, weights]
+  end
+end # Flock
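The private helpers added at the bottom of lib/flock.rb are what the sparse: true path relies on: sparse_array_to_data (and its hash counterpart sparse_hash_to_data) index the unique values across all rows and turn each row into a presence/absence (or value) vector before densify hands the data to the C extension. A rough worked trace of that conversion for the synopsis data, written out as comments since the helpers are internal:

    data = []
    data << %w(apple orange)
    data << %w(black white)
    data << %w(white cyan)
    data << %w(apple orange)
    data << %w(apple)

    # unique values get column indices in first-seen order:
    #   {"apple"=>0, "orange"=>1, "black"=>2, "white"=>3, "cyan"=>4}
    # each row becomes a presence/absence vector:
    #   %w(apple orange) -> [1, 1, 0, 0, 0]
    #   %w(black white)  -> [0, 0, 1, 1, 0]
    #   %w(white cyan)   -> [0, 0, 0, 1, 1]
    #   %w(apple orange) -> [1, 1, 0, 0, 0]
    #   %w(apple)        -> [1, 0, 0, 0, 0]

    # kcluster performs this densify step automatically when it detects sparse data:
    Flock.kcluster(2, data, sparse: true)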