fastcluster 0.9
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG +2 -0
- data/Manifest +14 -0
- data/README.rdoc +59 -0
- data/Rakefile +12 -0
- data/ext/clusterer.c +325 -0
- data/ext/extconf.rb +4 -0
- data/fastcluster.gemspec +31 -0
- data/lib/fastcluster.rb +4 -0
- data/lib/fastcluster/cluster.rb +25 -0
- data/spec/lib/fastcluster/cluster_spec.rb +0 -0
- data/spec/lib/fastcluster/clusterer_spec.rb +222 -0
- data/spec/spec.opts +5 -0
- data/spec/spec_helper.rb +10 -0
- data/spec/test_data.rb +28 -0
- data/test.rb +69 -0
- metadata +80 -0
data/CHANGELOG
ADDED
data/Manifest
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
CHANGELOG
|
2
|
+
Manifest
|
3
|
+
README.rdoc
|
4
|
+
Rakefile
|
5
|
+
ext/clusterer.c
|
6
|
+
ext/extconf.rb
|
7
|
+
lib/fastcluster.rb
|
8
|
+
lib/fastcluster/cluster.rb
|
9
|
+
spec/lib/fastcluster/cluster_spec.rb
|
10
|
+
spec/lib/fastcluster/clusterer_spec.rb
|
11
|
+
spec/spec.opts
|
12
|
+
spec/spec_helper.rb
|
13
|
+
spec/test_data.rb
|
14
|
+
test.rb
|
data/README.rdoc
ADDED
@@ -0,0 +1,59 @@
|
|
1
|
+
|
2
|
+
= Simple and fast clustering library
|
3
|
+
|
4
|
+
* http://github.com/jemmyw/fastcluster
|
5
|
+
|
6
|
+
== DESCRIPTION
|
7
|
+
|
8
|
+
This gem provides a really simple way to cluster 2 dimensional points. It is based
|
9
|
+
on the Hierclust[http://hierclust.rubyforge.org/] gem by Brandt Kurowski,
|
10
|
+
except that it does not cluster hierarchically. The aim of this gem is to provide
|
11
|
+
the same clustering algorithm but to be as fast as possible.
|
12
|
+
|
13
|
+
== INSTALL:
|
14
|
+
|
15
|
+
* sudo gem install fastcluster
|
16
|
+
|
17
|
+
== EXAMPLE:
|
18
|
+
|
19
|
+
require 'fastcluster'
|
20
|
+
points = [[1, 1], [1, 2], [5, 9]]
|
21
|
+
clusterer = Fastcluster::Clusterer.new(3, 0, points)
|
22
|
+
clusterer.clusters.each do |cluster|
|
23
|
+
puts cluster.x
|
24
|
+
puts cluster.y
|
25
|
+
puts cluster.size
|
26
|
+
end
|
27
|
+
|
28
|
+
== ALGORITHM:
|
29
|
+
|
30
|
+
1. All points are initially clusters with size 1
|
31
|
+
2. Precluster - create a grid of size [resolution] and cluster the points in each grid space automatically
|
32
|
+
3. Combine two closest clusters, the new cluster has the summed size and the averaged distance (size weighted)
|
33
|
+
between the clusters.
|
34
|
+
4. Loop to 3 until no two clusters are less than [separation] apart or only one cluster remains
|
35
|
+
|
36
|
+
== LICENSE:
|
37
|
+
|
38
|
+
(The MIT License)
|
39
|
+
|
40
|
+
Copyright (c) 2009 Jeremy Wells
|
41
|
+
|
42
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
43
|
+
a copy of this software and associated documentation files (the
|
44
|
+
'Software'), to deal in the Software without restriction, including
|
45
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
46
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
47
|
+
permit persons to whom the Software is furnished to do so, subject to
|
48
|
+
the following conditions:
|
49
|
+
|
50
|
+
The above copyright notice and this permission notice shall be
|
51
|
+
included in all copies or substantial portions of the Software.
|
52
|
+
|
53
|
+
THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND,
|
54
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
55
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
56
|
+
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
|
57
|
+
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
|
58
|
+
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
|
59
|
+
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/Rakefile
ADDED
@@ -0,0 +1,12 @@
|
|
1
|
+
require 'rubygems'
|
2
|
+
require 'spec'
|
3
|
+
require 'spec/rake/spectask'
|
4
|
+
require 'echoe'
|
5
|
+
|
6
|
+
Echoe.new("fastcluster") do |p|
|
7
|
+
p.author = "Jeremy Wells"
|
8
|
+
p.email = "jeremy@boost.co.nz"
|
9
|
+
p.summary = "A clustering library for 2 dimensional points"
|
10
|
+
p.description = "A clustering library for 2 dimensional points"
|
11
|
+
p.url = "http://github.com/jemmyw/fastcluster"
|
12
|
+
end
|
data/ext/clusterer.c
ADDED
@@ -0,0 +1,325 @@
|
|
1
|
+
#include <ruby.h>
|
2
|
+
#include <stdio.h>
|
3
|
+
#include <stdlib.h>
|
4
|
+
#include <math.h>
|
5
|
+
|
6
|
+
/*
|
7
|
+
*
|
8
|
+
* Algorithm:
|
9
|
+
* all points are initially clusters with size 1
|
10
|
+
* precluster - create a grid of size @resolution and cluster the points in each grid space automatically
|
11
|
+
* loop until no two clusters are less than @separation apart
|
12
|
+
* combine two closest clusters, the new cluster has the summed size and the averaged distance (size weighted)
|
13
|
+
* between the clusters.
|
14
|
+
**/
|
15
|
+
typedef struct {
|
16
|
+
double x;
|
17
|
+
double y;
|
18
|
+
long size;
|
19
|
+
} CLUSTER;
|
20
|
+
|
21
|
+
/*
|
22
|
+
* An array of points to be clustered.
|
23
|
+
*/
|
24
|
+
static VALUE fc_get_points(VALUE self) {
|
25
|
+
return rb_iv_get(self, "@points");
|
26
|
+
}
|
27
|
+
|
28
|
+
/*
|
29
|
+
* call-seq:
|
30
|
+
* add(x, y) -> nil
|
31
|
+
*
|
32
|
+
* Add a point to this clusterer.
|
33
|
+
*/
|
34
|
+
static VALUE fc_add_point(VALUE self, VALUE x, VALUE y) {
  /* Store the point as a two-element [x, y] Ruby array and append it to
   * the @points array held by this clusterer. */
  VALUE point = rb_ary_new3(2, x, y);
  rb_ary_push(fc_get_points(self), point);

  /* The method is documented as returning nil. */
  return Qnil;
}
|
42
|
+
|
43
|
+
/*
|
44
|
+
* call-seq:
|
45
|
+
* <<(point) -> nil
|
46
|
+
*
|
47
|
+
* Add a point to this clusterer. The point must be in the format
|
48
|
+
* of an array with two numbers.
|
49
|
+
*
|
50
|
+
* Example:
|
51
|
+
* clusterer << [1, 2]
|
52
|
+
*/
|
53
|
+
static VALUE fc_append_point(VALUE self, VALUE point) {
|
54
|
+
VALUE pointArray = fc_get_points(self);
|
55
|
+
rb_ary_push(pointArray, point);
|
56
|
+
return Qnil;
|
57
|
+
}
|
58
|
+
|
59
|
+
/*
|
60
|
+
* Calculate the distance (pythag) between two cluster points
|
61
|
+
*/
|
62
|
+
static double fc_get_distance_between(CLUSTER * one, CLUSTER * two) {
|
63
|
+
double rr = pow((long)one->x - (long)two->x, 2) + pow((long)one->y - (long)two->y, 2);
|
64
|
+
return sqrt(rr);
|
65
|
+
}
|
66
|
+
|
67
|
+
/*
|
68
|
+
* Add a point to a cluster. This increments the size and calculates the average between
|
69
|
+
* the current cluster position and the new point.
|
70
|
+
*/
|
71
|
+
static void fc_add_to_cluster(CLUSTER * dst, double x, double y) {
|
72
|
+
dst->x = ((dst->x * dst->size) + x) / (dst->size + 1);
|
73
|
+
dst->y = ((dst->y * dst->size) + y) / (dst->size + 1);
|
74
|
+
dst->size++;
|
75
|
+
}
|
76
|
+
|
77
|
+
/*
|
78
|
+
* Combine two clusters into one with an average center point
|
79
|
+
*/
|
80
|
+
static void fc_combine_clusters(CLUSTER * dst, CLUSTER * src) {
|
81
|
+
dst->x = (dst->x*dst->size + src->x*src->size) / (dst->size+src->size);
|
82
|
+
dst->y = (dst->y*dst->size + src->y*src->size) / (dst->size+src->size);
|
83
|
+
dst->size = dst->size + src->size;
|
84
|
+
}
|
85
|
+
|
86
|
+
/*
|
87
|
+
* Get the maximum grid size
|
88
|
+
*/
|
89
|
+
static long fc_get_max_grid(long resolution, CLUSTER * point_array, long num_points) {
|
90
|
+
int i;
|
91
|
+
int max_grid = 0;
|
92
|
+
for(i = 0; i < num_points; i++) {
|
93
|
+
CLUSTER * point = &point_array[i];
|
94
|
+
int xg = point->x/resolution;
|
95
|
+
int yg = point->y/resolution;
|
96
|
+
if(xg>max_grid)
|
97
|
+
max_grid = xg;
|
98
|
+
if(yg>max_grid)
|
99
|
+
max_grid = yg;
|
100
|
+
}
|
101
|
+
return max_grid+1;
|
102
|
+
}
|
103
|
+
|
104
|
+
/*
|
105
|
+
* call-seq:
|
106
|
+
* new(separation = 0, resolution = 0, points = nil)
|
107
|
+
*
|
108
|
+
* Create a new Clusterer. The new method accepts 3 optional arguments, separation,
|
109
|
+
* resolution and points.
|
110
|
+
*
|
111
|
+
* <tt>separation</tt> - The distance between clusters. The higher this number, the
|
112
|
+
* fewer clusters there will be. If this is 0 then all points are merged into a single cluster.
|
113
|
+
*
|
114
|
+
* <tt>resolution</tt> - If specified then the points are placed on a grid with each grid square
|
115
|
+
* being this size. Points falling in the same grid square are automatically clustered.
|
116
|
+
* This option should be specified when clustering a large number of points to reduce processing time.
|
117
|
+
*
|
118
|
+
* <tt>points</tt> - An array of points. Each array item must be an array with
|
119
|
+
* two numbers (x, y). Example: <code>[[1, 2], [3, 4]]</code>.
|
120
|
+
*/
|
121
|
+
static VALUE fc_initialize_clusterer(int argc, VALUE *argv, VALUE self) {
|
122
|
+
if(argc > 0)
|
123
|
+
rb_iv_set(self, "@separation", argv[0]);
|
124
|
+
else
|
125
|
+
rb_iv_set(self, "@separation", INT2FIX(0));
|
126
|
+
|
127
|
+
if(argc > 1)
|
128
|
+
rb_iv_set(self, "@resolution", argv[1]);
|
129
|
+
else
|
130
|
+
rb_iv_set(self, "@resolution", INT2FIX(0));
|
131
|
+
|
132
|
+
VALUE pointArray = rb_ary_new();
|
133
|
+
rb_iv_set(self, "@points", pointArray);
|
134
|
+
|
135
|
+
if(argc > 2) {
|
136
|
+
if(TYPE(argv[2]) == T_ARRAY) {
|
137
|
+
rb_iv_set(self, "@points", argv[2]);
|
138
|
+
}
|
139
|
+
}
|
140
|
+
|
141
|
+
return Qnil;
|
142
|
+
}
|
143
|
+
|
144
|
+
/*
|
145
|
+
* Turn the ruby array of points (format [[x,y], [x,y]]) into an array of
|
146
|
+
* CLUSTER
|
147
|
+
*/
|
148
|
+
/*
 * Fill arrayPtr[0..num_points-1] from rubyArray, which is expected to hold
 * points in the format [[x, y], [x, y], ...]. Every resulting CLUSTER has
 * size 1.
 *
 * Raises TypeError when an element is not an Array (or a coordinate is not
 * numeric) and ArgumentError when an element has fewer than two entries,
 * instead of reading past the end of the Ruby array's storage.
 */
static void fc_native_point_array(CLUSTER * arrayPtr, VALUE rubyArray, long num_points) {
  long i; /* same width as num_points */

  for(i = 0; i < num_points; i++) {
    /* rb_ary_entry is bounds-checked and does not depend on the RArray
     * struct layout. */
    VALUE point = rb_ary_entry(rubyArray, i);

    Check_Type(point, T_ARRAY);
    if(RARRAY_LEN(point) < 2)
      rb_raise(rb_eArgError, "point %ld must have two coordinates (x, y)", i);

    arrayPtr[i].x = NUM2DBL(rb_ary_entry(point, 0));
    arrayPtr[i].y = NUM2DBL(rb_ary_entry(point, 1));
    arrayPtr[i].size = 1;
  }
}
|
160
|
+
|
161
|
+
static CLUSTER *fc_calculate_clusters(long separation, long resolution, CLUSTER * point_array, int num_points, long * cluster_size) {
|
162
|
+
int max_grid = fc_get_max_grid(resolution, &point_array[0], num_points);
|
163
|
+
int i, j;
|
164
|
+
long preclust_size = 0;
|
165
|
+
|
166
|
+
CLUSTER * cluster;
|
167
|
+
CLUSTER * clusters;
|
168
|
+
|
169
|
+
if(resolution > 0) {
|
170
|
+
CLUSTER grid_array[max_grid][max_grid];
|
171
|
+
|
172
|
+
for(i=0;i<max_grid;i++) {
|
173
|
+
for(j=0;j<max_grid;j++) {
|
174
|
+
grid_array[i][j].size = 0;
|
175
|
+
}
|
176
|
+
}
|
177
|
+
|
178
|
+
for(i = 0; i < num_points; i++) {
|
179
|
+
cluster = &point_array[i];
|
180
|
+
|
181
|
+
int gx = floor(cluster->x/resolution);
|
182
|
+
int gy = floor(cluster->y/resolution);
|
183
|
+
|
184
|
+
fc_add_to_cluster(&grid_array[gx][gy], cluster->x, cluster->y);
|
185
|
+
|
186
|
+
if(grid_array[gx][gy].size == 1) preclust_size++;
|
187
|
+
}
|
188
|
+
|
189
|
+
clusters = malloc(preclust_size * sizeof(CLUSTER));
|
190
|
+
|
191
|
+
int max_grid_total = max_grid * max_grid;
|
192
|
+
CLUSTER * gridPtr = grid_array[0];
|
193
|
+
|
194
|
+
int incr = 0;
|
195
|
+
for(i=0;i<max_grid_total;i++) {
|
196
|
+
if(gridPtr[i].size > 0) {
|
197
|
+
clusters[incr] = gridPtr[i];
|
198
|
+
incr++;
|
199
|
+
}
|
200
|
+
}
|
201
|
+
} else {
|
202
|
+
preclust_size = num_points;
|
203
|
+
clusters = malloc(preclust_size * sizeof(CLUSTER));
|
204
|
+
memcpy(&clusters[0], &point_array[0], preclust_size * sizeof(CLUSTER));
|
205
|
+
}
|
206
|
+
|
207
|
+
double distance_sep = 0;
|
208
|
+
long current_cluster_size = 0;
|
209
|
+
int found;
|
210
|
+
long nearest_origin;
|
211
|
+
long nearest_other;
|
212
|
+
|
213
|
+
do {
|
214
|
+
// calculate distance sep
|
215
|
+
distance_sep = 0;
|
216
|
+
nearest_other = 0;
|
217
|
+
|
218
|
+
for(i=0;i<preclust_size;i++){
|
219
|
+
for(j=i+1;j<preclust_size;j++){
|
220
|
+
double distance = fc_get_distance_between(&clusters[i], &clusters[j]);
|
221
|
+
|
222
|
+
// printf("distance between %f, %f and %f, %f is %f\n", clusters[i].x, clusters[i].y, clusters[j].x, clusters[j].y, distance);
|
223
|
+
|
224
|
+
if(distance_sep == 0 || distance < distance_sep) {
|
225
|
+
distance_sep = distance;
|
226
|
+
|
227
|
+
if(distance < separation || separation == 0) {
|
228
|
+
nearest_origin = i;
|
229
|
+
nearest_other = j;
|
230
|
+
}
|
231
|
+
}
|
232
|
+
}
|
233
|
+
}
|
234
|
+
|
235
|
+
if(nearest_other > 0) {
|
236
|
+
fc_combine_clusters(&clusters[nearest_origin], &clusters[nearest_other]);
|
237
|
+
|
238
|
+
CLUSTER *newarr = malloc(preclust_size * sizeof(CLUSTER));
|
239
|
+
memcpy(&newarr[0], &clusters[0], nearest_other * sizeof(CLUSTER));
|
240
|
+
memcpy(&newarr[nearest_other], &clusters[nearest_other+1], (preclust_size - (nearest_other + 1)) * sizeof(CLUSTER));
|
241
|
+
|
242
|
+
void *_tmp = realloc(clusters, ((preclust_size-1) * sizeof(CLUSTER)));
|
243
|
+
clusters = (CLUSTER*)_tmp;
|
244
|
+
preclust_size = preclust_size - 1;
|
245
|
+
|
246
|
+
for(i=0;i<preclust_size;i++)
|
247
|
+
clusters[i] = newarr[i];
|
248
|
+
|
249
|
+
free(newarr);
|
250
|
+
}
|
251
|
+
|
252
|
+
} while((separation == 0 || distance_sep < separation) && preclust_size > 1);
|
253
|
+
|
254
|
+
*cluster_size = preclust_size;
|
255
|
+
return clusters;
|
256
|
+
}
|
257
|
+
|
258
|
+
static VALUE fc_get_cluster_class() {
|
259
|
+
ID cluster_module_id = rb_intern("Fastcluster");
|
260
|
+
ID cluster_class_id = rb_intern("Cluster");
|
261
|
+
VALUE cluster_module = rb_const_get(rb_cObject, cluster_module_id);
|
262
|
+
return rb_const_get(cluster_module, cluster_class_id);
|
263
|
+
}
|
264
|
+
|
265
|
+
/*
|
266
|
+
* Return the clusters found for the points in this clusterer. This will be an
|
267
|
+
* array of Cluster objects.
|
268
|
+
*
|
269
|
+
* Example:
|
270
|
+
* clusterer = Fastcluster::Clusterer.new(3, 0, [[1, 1], [1, 2], [5, 9]])
|
271
|
+
* clusterer.clusters -> [(1.00, 1.50): 2, (5.00, 9.00): 1]
|
272
|
+
*/
|
273
|
+
static VALUE fc_get_clusters(VALUE self) {
|
274
|
+
// Get the separation and resolution from ruby
|
275
|
+
long separation = NUM2INT(rb_iv_get(self, "@separation"));
|
276
|
+
long resolution = NUM2INT(rb_iv_get(self, "@resolution"));
|
277
|
+
int i;
|
278
|
+
|
279
|
+
// Create a native array of clusters from the ruby array of points
|
280
|
+
VALUE pointArray = fc_get_points(self);
|
281
|
+
long num_points = RARRAY(pointArray)->len;
|
282
|
+
CLUSTER native_point_array[num_points];
|
283
|
+
|
284
|
+
fc_native_point_array(&native_point_array[0], pointArray, num_points);
|
285
|
+
|
286
|
+
// Calculate the clusters
|
287
|
+
CLUSTER * clusters = NULL;
|
288
|
+
long cluster_size;
|
289
|
+
|
290
|
+
clusters = fc_calculate_clusters(separation, resolution, &native_point_array[0], num_points, &cluster_size);
|
291
|
+
|
292
|
+
// Create ruby array of clusters to return
|
293
|
+
VALUE cluster_class = fc_get_cluster_class();
|
294
|
+
VALUE ruby_cluster_array = rb_ary_new2(cluster_size);
|
295
|
+
|
296
|
+
for(i=0;i<cluster_size;i++) {
|
297
|
+
int arg_count = 3;
|
298
|
+
VALUE arg_array[arg_count];
|
299
|
+
|
300
|
+
arg_array[0] = rb_float_new(clusters[i].x);
|
301
|
+
arg_array[1] = rb_float_new(clusters[i].y);
|
302
|
+
arg_array[2] = INT2FIX(clusters[i].size);
|
303
|
+
|
304
|
+
VALUE cluster_obj = rb_class_new_instance(arg_count, arg_array, cluster_class);
|
305
|
+
rb_ary_push(ruby_cluster_array, cluster_obj);
|
306
|
+
}
|
307
|
+
|
308
|
+
// Free the clusters array
|
309
|
+
free(clusters);
|
310
|
+
|
311
|
+
return ruby_cluster_array;
|
312
|
+
}
|
313
|
+
|
314
|
+
void Init_clusterer() {
|
315
|
+
VALUE clustererModule = rb_define_module("Fastcluster");
|
316
|
+
VALUE clustererClass = rb_define_class_under(clustererModule, "Clusterer", rb_cObject);
|
317
|
+
|
318
|
+
rb_define_method(clustererClass, "initialize", fc_initialize_clusterer, -1);
|
319
|
+
rb_define_method(clustererClass, "add", fc_add_point, 2);
|
320
|
+
|
321
|
+
rb_define_method(clustererClass, "<<", fc_append_point, 1);
|
322
|
+
|
323
|
+
rb_define_method(clustererClass, "clusters", fc_get_clusters, 0);
|
324
|
+
rb_define_method(clustererClass, "points", fc_get_points, 0);
|
325
|
+
}
|
data/ext/extconf.rb
ADDED
data/fastcluster.gemspec
ADDED
@@ -0,0 +1,31 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
|
3
|
+
Gem::Specification.new do |s|
|
4
|
+
s.name = %q{fastcluster}
|
5
|
+
s.version = "0.9"
|
6
|
+
|
7
|
+
s.required_rubygems_version = Gem::Requirement.new(">= 1.2") if s.respond_to? :required_rubygems_version=
|
8
|
+
s.authors = ["Jeremy Wells"]
|
9
|
+
s.date = %q{2009-10-24}
|
10
|
+
s.description = %q{A clustering library for 2 dimensional points}
|
11
|
+
s.email = %q{jeremy@boost.co.nz}
|
12
|
+
s.extensions = ["ext/extconf.rb"]
|
13
|
+
s.extra_rdoc_files = ["CHANGELOG", "README.rdoc", "ext/clusterer.c", "ext/extconf.rb", "lib/fastcluster.rb", "lib/fastcluster/cluster.rb"]
|
14
|
+
s.files = ["CHANGELOG", "Manifest", "README.rdoc", "Rakefile", "ext/clusterer.c", "ext/extconf.rb", "lib/fastcluster.rb", "lib/fastcluster/cluster.rb", "spec/lib/fastcluster/cluster_spec.rb", "spec/lib/fastcluster/clusterer_spec.rb", "spec/spec.opts", "spec/spec_helper.rb", "spec/test_data.rb", "test.rb", "fastcluster.gemspec"]
|
15
|
+
s.homepage = %q{http://github.com/jemmyw/fastcluster}
|
16
|
+
s.rdoc_options = ["--line-numbers", "--inline-source", "--title", "Fastcluster", "--main", "README.rdoc"]
|
17
|
+
s.require_paths = ["lib", "ext"]
|
18
|
+
s.rubyforge_project = %q{fastcluster}
|
19
|
+
s.rubygems_version = %q{1.3.5}
|
20
|
+
s.summary = %q{A clustering library for 2 dimensional points}
|
21
|
+
|
22
|
+
if s.respond_to? :specification_version then
|
23
|
+
current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
|
24
|
+
s.specification_version = 3
|
25
|
+
|
26
|
+
if Gem::Version.new(Gem::RubyGemsVersion) >= Gem::Version.new('1.2.0') then
|
27
|
+
else
|
28
|
+
end
|
29
|
+
else
|
30
|
+
end
|
31
|
+
end
|
data/lib/fastcluster.rb
ADDED
@@ -0,0 +1,25 @@
|
|
1
|
+
module Fastcluster
  # A cluster described by a position (+x+, +y+) and a +size+.
  #
  # Clusters are Comparable and are ordered by +size+ only, so two clusters
  # with the same size compare as equal regardless of their position.
  class Cluster
    include Comparable

    attr_reader :x, :y, :size

    # x, y - position of the cluster
    # size - size of the cluster
    def initialize(x, y, size)
      @x = x
      @y = y
      @size = size
    end

    # Compares by size. Returns nil when +other+ has no +size+, which lets
    # Comparable#== answer false (and the ordering operators raise
    # ArgumentError) instead of failing with NoMethodError.
    def <=>(other)
      return nil unless other.respond_to?(:size)

      size <=> other.size
    end

    # Same text as #to_s.
    def inspect
      to_s
    end

    # Formats the cluster as "(x, y): size" with two decimal places for the
    # coordinates, e.g. "(1.00, 1.50): 2".
    def to_s
      format('(%0.2f, %0.2f): %d', @x, @y, @size)
    end
  end
end
|
File without changes
|
@@ -0,0 +1,222 @@
|
|
1
|
+
require File.dirname(__FILE__) + '/../../spec_helper'
|
2
|
+
require 'benchmark'
|
3
|
+
|
4
|
+
describe Fastcluster::Clusterer do
|
5
|
+
before do
|
6
|
+
@points = POINTS
|
7
|
+
end
|
8
|
+
|
9
|
+
it 'should allow setting points in initializer' do
|
10
|
+
@clusterer = Fastcluster::Clusterer.new(105, 5, @points)
|
11
|
+
@clusterer.points.size.should == 168
|
12
|
+
end
|
13
|
+
|
14
|
+
describe 'instance' do
|
15
|
+
before do
|
16
|
+
@clusterer = Fastcluster::Clusterer.new(105, 5)
|
17
|
+
end
|
18
|
+
|
19
|
+
describe '#add' do
|
20
|
+
it 'should add an x y point to the clusterer' do
|
21
|
+
@clusterer.add(5, 10)
|
22
|
+
@clusterer.points.size.should == 1
|
23
|
+
@clusterer.points.first.should == [5, 10]
|
24
|
+
end
|
25
|
+
end
|
26
|
+
|
27
|
+
describe '#<<' do
|
28
|
+
it 'should add the value to the clusterer' do
|
29
|
+
@clusterer << [5, 10]
|
30
|
+
@clusterer.points.size.should == 1
|
31
|
+
@clusterer.points.first.should == [5, 10]
|
32
|
+
end
|
33
|
+
end
|
34
|
+
end
|
35
|
+
|
36
|
+
describe '#clusters' do
|
37
|
+
describe 'with large test' do
|
38
|
+
before do
|
39
|
+
@clusterer = Fastcluster::Clusterer.new(105, 5, @points)
|
40
|
+
@clusters = @clusterer.clusters.sort{|a, b| a.size == b.size ? a.x <=> b.x : a.size <=> b.size }
|
41
|
+
end
|
42
|
+
|
43
|
+
it 'should take less than 1 second' do
|
44
|
+
time = Benchmark.measure { @clusterer.clusters }
|
45
|
+
time.total.should be < 1
|
46
|
+
end
|
47
|
+
|
48
|
+
it 'should have as many cluster points as data points' do
|
49
|
+
@clusters.inject(0){|m, n| m + n.size }.should == @points.size
|
50
|
+
end
|
51
|
+
|
52
|
+
it 'should have 23 clusters' do
|
53
|
+
@clusters.size.should == 23
|
54
|
+
end
|
55
|
+
|
56
|
+
it 'should have a cluster of 108 points at 836 by 178' do
|
57
|
+
@clusters.last.size.should == 108
|
58
|
+
@clusters.last.x.should be_close(836, 1)
|
59
|
+
@clusters.last.y.should be_close(178, 1)
|
60
|
+
end
|
61
|
+
|
62
|
+
it 'should have a cluster of 1 point at 97 by 1203' do
|
63
|
+
@clusters.first.size.should == 1
|
64
|
+
@clusters.first.x.should be_close(97, 1)
|
65
|
+
@clusters.first.y.should be_close(1203, 1)
|
66
|
+
end
|
67
|
+
end
|
68
|
+
|
69
|
+
describe "with two points" do
|
70
|
+
before do
|
71
|
+
@clusterer = Fastcluster::Clusterer.new(0, 0, [[1, 5], [2, 8]])
|
72
|
+
end
|
73
|
+
|
74
|
+
it "should return one cluster" do
|
75
|
+
@clusterer.clusters.size.should == 1
|
76
|
+
end
|
77
|
+
|
78
|
+
it "should have two points in the cluster" do
|
79
|
+
@clusterer.clusters.first.size.should == 2
|
80
|
+
end
|
81
|
+
end
|
82
|
+
|
83
|
+
describe "with three points" do
|
84
|
+
before do
|
85
|
+
@clusterer = Fastcluster::Clusterer.new(0, 0, [[1, 2], [5, 6], [2, 3]])
|
86
|
+
@clusters = @clusterer.clusters
|
87
|
+
end
|
88
|
+
|
89
|
+
it "should return one cluster" do
|
90
|
+
@clusters.size.should == 1
|
91
|
+
end
|
92
|
+
|
93
|
+
it "containing three items" do
|
94
|
+
@clusters.first.size.should == 3
|
95
|
+
end
|
96
|
+
end
|
97
|
+
|
98
|
+
describe "with four points" do
|
99
|
+
before do
|
100
|
+
@points = [
|
101
|
+
[0, 1],
|
102
|
+
[1, 0],
|
103
|
+
[3, 4],
|
104
|
+
[4, 3],
|
105
|
+
]
|
106
|
+
end
|
107
|
+
|
108
|
+
describe "and no separation" do
|
109
|
+
before do
|
110
|
+
@clusterer = Fastcluster::Clusterer.new(0, 0, @points)
|
111
|
+
end
|
112
|
+
|
113
|
+
it "should return one cluster" do
|
114
|
+
@clusterer.clusters.size.should == 1
|
115
|
+
end
|
116
|
+
end
|
117
|
+
|
118
|
+
describe "and separation 1" do
|
119
|
+
before do
|
120
|
+
require 'lib/fastcluster'
|
121
|
+
@clusterer = Fastcluster::Clusterer.new(1, 0, @points)
|
122
|
+
end
|
123
|
+
|
124
|
+
it "should return all four individual points" do
|
125
|
+
@clusterer.clusters.size.should == 4
|
126
|
+
end
|
127
|
+
end
|
128
|
+
|
129
|
+
describe "and separation 2" do
|
130
|
+
before do
|
131
|
+
@clusterer = Fastcluster::Clusterer.new(2, 0, @points)
|
132
|
+
end
|
133
|
+
|
134
|
+
it "should return two clusters" do
|
135
|
+
@clusterer.clusters.size.should == 2
|
136
|
+
end
|
137
|
+
end
|
138
|
+
end
|
139
|
+
|
140
|
+
describe "with eight points" do
|
141
|
+
before do
|
142
|
+
@points = [
|
143
|
+
[0, 1],
|
144
|
+
[1, 0],
|
145
|
+
[3, 4],
|
146
|
+
[4, 3],
|
147
|
+
[7, 8],
|
148
|
+
[8, 7],
|
149
|
+
[8, 9],
|
150
|
+
[9, 8]
|
151
|
+
]
|
152
|
+
end
|
153
|
+
|
154
|
+
describe "and no separation" do
|
155
|
+
before do
|
156
|
+
@clusterer = Fastcluster::Clusterer.new(0, 0)
|
157
|
+
@points.each do |point|
|
158
|
+
@clusterer << point
|
159
|
+
end
|
160
|
+
end
|
161
|
+
|
162
|
+
it "should return one cluster when no minimum separation is given" do
|
163
|
+
@clusterer.clusters.size.should == 1
|
164
|
+
end
|
165
|
+
end
|
166
|
+
|
167
|
+
describe "and separation 1" do
|
168
|
+
before do
|
169
|
+
@clusterer = Fastcluster::Clusterer.new(1, 0)
|
170
|
+
@points.each do |point|
|
171
|
+
@clusterer << point
|
172
|
+
end
|
173
|
+
end
|
174
|
+
|
175
|
+
it "should have all eight points in individual clusters" do
|
176
|
+
@clusterer.clusters.size.should == 8
|
177
|
+
end
|
178
|
+
end
|
179
|
+
|
180
|
+
describe "and separation 3" do
|
181
|
+
describe "with no resolution limit" do
|
182
|
+
before do
|
183
|
+
@clusterer = Fastcluster::Clusterer.new(3, 0)
|
184
|
+
@points.each do |point|
|
185
|
+
@clusterer << point
|
186
|
+
end
|
187
|
+
@clusters = @clusterer.clusters.sort
|
188
|
+
end
|
189
|
+
|
190
|
+
it "should have three clusters" do
|
191
|
+
@clusters.size.should == 3
|
192
|
+
end
|
193
|
+
|
194
|
+
it "should have clusters size 2, 2, and 4 " do
|
195
|
+
@clusters[0].size.should == 2
|
196
|
+
@clusters[1].size.should == 2
|
197
|
+
@clusters[2].size.should == 4
|
198
|
+
end
|
199
|
+
end
|
200
|
+
|
201
|
+
describe "with coarse resolution" do
|
202
|
+
before do
|
203
|
+
@clusterer = Fastcluster::Clusterer.new(3, 5)
|
204
|
+
@points.each do |point|
|
205
|
+
@clusterer << point
|
206
|
+
end
|
207
|
+
@clusters = @clusterer.clusters.sort
|
208
|
+
end
|
209
|
+
|
210
|
+
it "should have three clusters" do
|
211
|
+
@clusters.size.should == 2
|
212
|
+
end
|
213
|
+
|
214
|
+
it "should have clusters size 2, 2, and 4 " do
|
215
|
+
@clusters[0].size.should == 4
|
216
|
+
@clusters[1].size.should == 4
|
217
|
+
end
|
218
|
+
end
|
219
|
+
end
|
220
|
+
end
|
221
|
+
end
|
222
|
+
end
|
data/spec/spec.opts
ADDED
data/spec/spec_helper.rb
ADDED
data/spec/test_data.rb
ADDED
@@ -0,0 +1,28 @@
|
|
1
|
+
POINTS = [[815, 183], [860, 176], [793, 176], [847, 176], [813, 176], [865, 183],
|
2
|
+
[804, 185], [813, 181], [797, 181], [193, 133], [905, 168], [821, 173],
|
3
|
+
[804, 178], [799, 180], [175, 360], [880, 176], [826, 171], [843, 181],
|
4
|
+
[857, 171], [443, 495], [857, 174], [382, 132], [97, 1203], [218, 281],
|
5
|
+
[97, 1778], [814, 181], [474, 133], [797, 179], [844, 180], [812, 178],
|
6
|
+
[846, 18], [795, 183], [862, 182], [840, 175], [788, 176], [478, 140],
|
7
|
+
[860, 173], [974, 16], [833, 181], [288, 130], [831, 874], [884, 174],
|
8
|
+
[834, 179], [836, 182], [181, 354], [346, 46], [855, 171], [855, 180],
|
9
|
+
[959, 177], [792, 181], [138, 1774], [283, 783], [815, 877], [807, 178],
|
10
|
+
[430, 497], [819, 181], [801, 176], [836, 182], [825, 176], [873, 179],
|
11
|
+
[847, 182], [124, 1678], [157, 132], [835, 177], [827, 180], [532, 1191],
|
12
|
+
[873, 178], [174, 128], [805, 1328], [798, 178], [502, 659], [804, 180],
|
13
|
+
[960, 178], [886, 182], [867, 183], [875, 179], [854, 176], [849, 185],
|
14
|
+
[136, 1783], [800, 181], [810, 180], [312, 782], [865, 177], [745, 125],
|
15
|
+
[833, 178], [882, 181], [834, 177], [821, 184], [899, 182], [26, 124],
|
16
|
+
[859, 182], [892, 174], [172, 134], [822, 182], [396, 135], [830, 184],
|
17
|
+
[792, 185], [427, 488], [818, 173], [832, 177], [406, 129], [852, 181],
|
18
|
+
[805, 177], [820, 175], [19, 14], [839, 183], [264, 127], [800, 172],
|
19
|
+
[811, 176], [827, 178], [896, 177], [863, 183], [813, 177], [458, 495],
|
20
|
+
[808, 181], [850, 748], [810, 184], [850, 181], [886, 179], [446, 497],
|
21
|
+
[851, 180], [592, 132], [300, 538], [794, 180], [815, 180], [57, 1772],
|
22
|
+
[799, 175], [821, 182], [135, 1681], [830, 483], [796, 173], [887, 40],
|
23
|
+
[823, 179], [864, 179], [322, 30], [886, 182], [808, 178], [823, 189],
|
24
|
+
[230, 360], [224, 275], [875, 179], [133, 1773], [844, 175], [433, 495],
|
25
|
+
[13, 15], [856, 178], [871, 181], [658, 130], [839, 183], [822, 83],
|
26
|
+
[827, 175], [848, 179], [894, 184], [833, 177], [828, 176], [482, 135],
|
27
|
+
[841, 177], [817, 184], [470, 140], [800, 180], [857, 15], [807, 187],
|
28
|
+
[425, 497], [64, 131], [852, 580], [883, 183], [836, 181], [878, 177]]
|
data/test.rb
ADDED
@@ -0,0 +1,69 @@
|
|
1
|
+
require File.dirname(__FILE__) + '/lib/fastcluster'
|
2
|
+
require 'benchmark'
|
3
|
+
|
4
|
+
points = [[815, 183], [860, 176], [793, 176], [847, 176], [813, 176], [865, 183],
|
5
|
+
[804, 185], [813, 181], [797, 181], [193, 133], [905, 168], [821, 173],
|
6
|
+
[804, 178], [799, 180], [175, 360], [880, 176], [826, 171], [843, 181],
|
7
|
+
[857, 171], [443, 495], [857, 174], [382, 132], [97, 1203], [218, 281],
|
8
|
+
[97, 1778], [814, 181], [474, 133], [797, 179], [844, 180], [812, 178],
|
9
|
+
[846, 18], [795, 183], [862, 182], [840, 175], [788, 176], [478, 140],
|
10
|
+
[860, 173], [974, 16], [833, 181], [288, 130], [831, 874], [884, 174],
|
11
|
+
[834, 179], [836, 182], [181, 354], [346, 46], [855, 171], [855, 180],
|
12
|
+
[959, 177], [792, 181], [138, 1774], [283, 783], [815, 877], [807, 178],
|
13
|
+
[430, 497], [819, 181], [801, 176], [836, 182], [825, 176], [873, 179],
|
14
|
+
[847, 182], [124, 1678], [157, 132], [835, 177], [827, 180], [532, 1191],
|
15
|
+
[873, 178], [174, 128], [805, 1328], [798, 178], [502, 659], [804, 180],
|
16
|
+
[960, 178], [886, 182], [867, 183], [875, 179], [854, 176], [849, 185],
|
17
|
+
[136, 1783], [800, 181], [810, 180], [312, 782], [865, 177], [745, 125],
|
18
|
+
[833, 178], [882, 181], [834, 177], [821, 184], [899, 182], [26, 124],
|
19
|
+
[859, 182], [892, 174], [172, 134], [822, 182], [396, 135], [830, 184],
|
20
|
+
[792, 185], [427, 488], [818, 173], [832, 177], [406, 129], [852, 181],
|
21
|
+
[805, 177], [820, 175], [19, 14], [839, 183], [264, 127], [800, 172],
|
22
|
+
[811, 176], [827, 178], [896, 177], [863, 183], [813, 177], [458, 495],
|
23
|
+
[808, 181], [850, 748], [810, 184], [850, 181], [886, 179], [446, 497],
|
24
|
+
[851, 180], [592, 132], [300, 538], [794, 180], [815, 180], [57, 1772],
|
25
|
+
[799, 175], [821, 182], [135, 1681], [830, 483], [796, 173], [887, 40],
|
26
|
+
[823, 179], [864, 179], [322, 30], [886, 182], [808, 178], [823, 189],
|
27
|
+
[230, 360], [224, 275], [875, 179], [133, 1773], [844, 175], [433, 495],
|
28
|
+
[13, 15], [856, 178], [871, 181], [658, 130], [839, 183], [822, 83],
|
29
|
+
[827, 175], [848, 179], [894, 184], [833, 177], [828, 176], [482, 135],
|
30
|
+
[841, 177], [817, 184], [470, 140], [800, 180], [857, 15], [807, 187],
|
31
|
+
[425, 497], [64, 131], [852, 580], [883, 183], [836, 181], [878, 177],
|
32
|
+
[815, 183], [860, 176], [793, 176], [847, 176], [813, 176], [865, 183],
|
33
|
+
[804, 185], [813, 181], [797, 181], [193, 133], [905, 168], [821, 173],
|
34
|
+
[804, 178], [799, 180], [175, 360], [880, 176], [826, 171], [843, 181],
|
35
|
+
[857, 171], [443, 495], [857, 174], [382, 132], [97, 1203], [218, 281],
|
36
|
+
[97, 1778], [814, 181], [474, 133], [797, 179], [844, 180], [812, 178],
|
37
|
+
[846, 18], [795, 183], [862, 182], [840, 175], [788, 176], [478, 140],
|
38
|
+
[860, 173], [974, 16], [833, 181], [288, 130], [831, 874], [884, 174],
|
39
|
+
[834, 179], [836, 182], [181, 354], [346, 46], [855, 171], [855, 180],
|
40
|
+
[959, 177], [792, 181], [138, 1774], [283, 783], [815, 877], [807, 178],
|
41
|
+
[430, 497], [819, 181], [801, 176], [836, 182], [825, 176], [873, 179],
|
42
|
+
[847, 182], [124, 1678], [157, 132], [835, 177], [827, 180], [532, 1191],
|
43
|
+
[873, 178], [174, 128], [805, 1328], [798, 178], [502, 659], [804, 180],
|
44
|
+
[960, 178], [886, 182], [867, 183], [875, 179], [854, 176], [849, 185],
|
45
|
+
[136, 1783], [800, 181], [810, 180], [312, 782], [865, 177], [745, 125],
|
46
|
+
[833, 178], [882, 181], [834, 177], [821, 184], [899, 182], [26, 124],
|
47
|
+
[859, 182], [892, 174], [172, 134], [822, 182], [396, 135], [830, 184],
|
48
|
+
[792, 185], [427, 488], [818, 173], [832, 177], [406, 129], [852, 181],
|
49
|
+
[805, 177], [820, 175], [19, 14], [839, 183], [264, 127], [800, 172],
|
50
|
+
[811, 176], [827, 178], [896, 177], [863, 183], [813, 177], [458, 495],
|
51
|
+
[808, 181], [850, 748], [810, 184], [850, 181], [886, 179], [446, 497],
|
52
|
+
[851, 180], [592, 132], [300, 538], [794, 180], [815, 180], [57, 1772],
|
53
|
+
[799, 175], [821, 182], [135, 1681], [830, 483], [796, 173], [887, 40],
|
54
|
+
[823, 179], [864, 179], [322, 30], [886, 182], [808, 178], [823, 189],
|
55
|
+
[230, 360], [224, 275], [875, 179], [133, 1773], [844, 175], [433, 495],
|
56
|
+
[13, 15], [856, 178], [871, 181], [658, 130], [839, 183], [822, 83],
|
57
|
+
[827, 175], [848, 179], [894, 184], [833, 177], [828, 176], [482, 135],
|
58
|
+
[841, 177], [817, 184], [470, 140], [800, 180], [857, 15], [807, 187],
|
59
|
+
[425, 497], [64, 131], [852, 580], [883, 183], [836, 181], [878, 177]]
|
60
|
+
|
61
|
+
puts Benchmark.measure {
|
62
|
+
clusterer = Fastcluster::Clusterer.new(105, 5, points)
|
63
|
+
clusters = clusterer.clusters
|
64
|
+
|
65
|
+
clusters.sort{|a,b| a.size == b.size ? a.x <=> b.x : a.size <=> b.size }.each do |cluster|
|
66
|
+
puts cluster
|
67
|
+
end
|
68
|
+
|
69
|
+
}
|
metadata
ADDED
@@ -0,0 +1,80 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: fastcluster
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: "0.9"
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Jeremy Wells
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
|
12
|
+
date: 2009-10-24 00:00:00 +13:00
|
13
|
+
default_executable:
|
14
|
+
dependencies: []
|
15
|
+
|
16
|
+
description: A clustering library for 2 dimensional points
|
17
|
+
email: jeremy@boost.co.nz
|
18
|
+
executables: []
|
19
|
+
|
20
|
+
extensions:
|
21
|
+
- ext/extconf.rb
|
22
|
+
extra_rdoc_files:
|
23
|
+
- CHANGELOG
|
24
|
+
- README.rdoc
|
25
|
+
- ext/clusterer.c
|
26
|
+
- ext/extconf.rb
|
27
|
+
- lib/fastcluster.rb
|
28
|
+
- lib/fastcluster/cluster.rb
|
29
|
+
files:
|
30
|
+
- CHANGELOG
|
31
|
+
- Manifest
|
32
|
+
- README.rdoc
|
33
|
+
- Rakefile
|
34
|
+
- ext/clusterer.c
|
35
|
+
- ext/extconf.rb
|
36
|
+
- lib/fastcluster.rb
|
37
|
+
- lib/fastcluster/cluster.rb
|
38
|
+
- spec/lib/fastcluster/cluster_spec.rb
|
39
|
+
- spec/lib/fastcluster/clusterer_spec.rb
|
40
|
+
- spec/spec.opts
|
41
|
+
- spec/spec_helper.rb
|
42
|
+
- spec/test_data.rb
|
43
|
+
- test.rb
|
44
|
+
- fastcluster.gemspec
|
45
|
+
has_rdoc: true
|
46
|
+
homepage: http://github.com/jemmyw/fastcluster
|
47
|
+
licenses: []
|
48
|
+
|
49
|
+
post_install_message:
|
50
|
+
rdoc_options:
|
51
|
+
- --line-numbers
|
52
|
+
- --inline-source
|
53
|
+
- --title
|
54
|
+
- Fastcluster
|
55
|
+
- --main
|
56
|
+
- README.rdoc
|
57
|
+
require_paths:
|
58
|
+
- lib
|
59
|
+
- ext
|
60
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
61
|
+
requirements:
|
62
|
+
- - ">="
|
63
|
+
- !ruby/object:Gem::Version
|
64
|
+
version: "0"
|
65
|
+
version:
|
66
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
67
|
+
requirements:
|
68
|
+
- - ">="
|
69
|
+
- !ruby/object:Gem::Version
|
70
|
+
version: "1.2"
|
71
|
+
version:
|
72
|
+
requirements: []
|
73
|
+
|
74
|
+
rubyforge_project: fastcluster
|
75
|
+
rubygems_version: 1.3.5
|
76
|
+
signing_key:
|
77
|
+
specification_version: 3
|
78
|
+
summary: A clustering library for 2 dimensional points
|
79
|
+
test_files: []
|
80
|
+
|