flock 0.4.1 → 0.5.0

ext/kmeanspp.c (new file)
@@ -0,0 +1,129 @@
+ #include <stdlib.h>
+
+ extern double uniform();
+ typedef struct clusterpoint {
+   double dist;
+   int n, chosen, closest;
+ } clusterpoint;
+
+ int compare(const void *ptr1, const void *ptr2) {
+   clusterpoint *p1 = (clusterpoint *)ptr1, *p2 = (clusterpoint *)ptr2;
+   return p1->dist == p2->dist ? 0 : p1->dist < p2->dist ? -1 : 1;
+ }
+
+ double compute_distances(int ndata, int npoints,
+   double **data, int **mask, double weight[], int transpose, clusterpoint dists[],
+   double (*metric)(int, double**, double**, int**, int**, const double[], int, int, int)) {
+
+   int i, j, closest = 0;
+   double min, dist, total = 0;
+
+   // compute distances to chosen point
+   for (i = 0; i < npoints; i++) {
+     if (dists[i].chosen) continue;
+
+     min = -1;
+     for (j = 0; j < npoints; j++) {
+       if (!dists[j].chosen) continue;
+
+       dist = metric(ndata, data, data, mask, mask, weight, dists[i].n, dists[j].n, transpose);
+       if (min < 0 || min > dist) {
+         min = dist;
+         closest = j;
+       }
+     }
+
+     dists[i].dist = min * min;
+     dists[i].closest = closest;
+     total += dists[i].dist;
+   }
+
+   return total;
+ }
+
+ void weightedassign(int nclusters, int nrows, int ncolumns,
+   double** data, int** mask, double weight[], int transpose,
+   double (*metric)(int, double**, double**, int**, int**, const double[], int, int, int),
+   int clusterid[]) {
+
+   int i, n, chosen = (int)((double)nrows*uniform());
+   int ndata = (transpose == 0 ? ncolumns : nrows), npoints = (transpose == 0 ? nrows : ncolumns);
+   double total = 0, cutoff, curr;
+   clusterpoint dists[npoints];
+
+   for (i = 0; i < npoints; i++) {
+     dists[i].n = i;
+     dists[i].chosen = 0;
+     dists[i].dist = 0;
+   }
+
+   // setup 1st centroid
+   n = 1;
+   clusterid[chosen] = 0;
+   dists[chosen].chosen = 1;
+
+   // pick k-points for k-clusters with a probability weighted by square of distance from closest centroid.
+   while (n < nclusters) {
+     total = compute_distances(ndata, npoints, data, mask, weight, transpose, dists, metric);
+     qsort((void*)dists, npoints, sizeof(clusterpoint), compare);
+
+     curr = 0;
+     cutoff = total * uniform();
+     for (i = 0; i < npoints; i++) {
+       if (dists[i].chosen) continue;
+       curr += dists[i].dist;
+       if (curr >= cutoff || i == (npoints - 1)) {
+         clusterid[dists[i].n] = n++;
+         dists[i].chosen = 1;
+         dists[i].dist = 0;
+         break;
+       }
+     }
+   }
+
+   // assign remaining points to closest cluster
+   compute_distances(ndata, npoints, data, mask, weight, transpose, dists, metric);
+   for (n = 0; n < npoints; n++) {
+     if (dists[n].chosen) continue;
+     clusterid[dists[n].n] = clusterid[dists[dists[n].closest].n];
+   }
+ }
+
+ void spreadoutassign(int nclusters, int nrows, int ncolumns,
+   double** data, int** mask, double weight[], int transpose,
+   double (*metric)(int, double**, double**, int**, int**, const double[], int, int, int),
+   int clusterid[]) {
+
+   int i, n, chosen = 0;
+   int ndata = (transpose == 0 ? ncolumns : nrows), npoints = (transpose == 0 ? nrows : ncolumns);
+   clusterpoint dists[npoints];
+
+   for (i = 0; i < npoints; i++) {
+     dists[i].n = i;
+     dists[i].chosen = 0;
+     dists[i].dist = 0;
+   }
+
+   // setup 1st centroid
+   n = 1;
+   clusterid[chosen] = 0;
+   dists[chosen].chosen = 1;
+
+   // pick k-points for k-clusters with max distance from all centers.
+   chosen = npoints - 1;
+   while (n < nclusters) {
+     compute_distances(ndata, npoints, data, mask, weight, transpose, dists, metric);
+     qsort((void*)dists, npoints, sizeof(clusterpoint), compare);
+
+     clusterid[dists[chosen].n] = n++;
+     dists[chosen].chosen = 1;
+     dists[chosen].dist = 0;
+   }
+
+   // assign remaining points to closest cluster
+   compute_distances(ndata, npoints, data, mask, weight, transpose, dists, metric);
+   for (n = 0; n < npoints; n++) {
+     if (dists[n].chosen) continue;
+     clusterid[dists[n].n] = clusterid[dists[dists[n].closest].n];
+   }
+ }
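
The two functions above back the new seeding modes exposed to Ruby as Flock::SEED_KMEANS_PLUSPLUS and
Flock::SEED_SPREADOUT (documented in lib/flock.rb below): weightedassign picks each additional seed with
probability proportional to the squared distance from its nearest already-chosen seed (the k-means++ rule),
while spreadoutassign always takes the point farthest from the chosen seeds. A minimal Ruby sketch of the
weighted pick illustrates the same idea; the distance and weighted_seeds helpers here are illustrative only,
use plain Euclidean distance, and are not part of the gem:

    # Illustrative sketch of the k-means++ seeding rule, not the gem's implementation.
    def distance(a, b)
      Math.sqrt(a.zip(b).inject(0.0) { |s, (x, y)| s + (x - y)**2 })
    end

    def weighted_seeds(points, k)
      seeds = [points.sample]
      while seeds.size < k
        # squared distance from each point to its closest already-chosen seed
        dists  = points.map { |p| seeds.map { |s| distance(p, s) }.min**2 }
        cutoff = rand * dists.inject(:+)
        # walk the cumulative weights until the random cutoff is crossed
        running = 0.0
        index   = points.each_index.find { |i| (running += dists[i]) >= cutoff } || points.size - 1
        seeds << points[index]
      end
      seeds
    end

The C code additionally skips points that are already chosen and computes distances through the library's
pluggable metric function and mask arrays rather than plain Euclidean distance.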

flock.gemspec
@@ -1,15 +1,15 @@
  # Generated by jeweler
  # DO NOT EDIT THIS FILE DIRECTLY
- # Instead, edit Jeweler::Tasks in Rakefile, and run the gemspec command
+ # Instead, edit Jeweler::Tasks in Rakefile, and run 'rake gemspec'
  # -*- encoding: utf-8 -*-

  Gem::Specification.new do |s|
    s.name = %q{flock}
-   s.version = "0.4.1"
+   s.version = "0.5.0"

    s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
    s.authors = ["Bharanee Rathna"]
-   s.date = %q{2011-04-28}
+   s.date = %q{2011-07-26}
    s.description = %q{A thin ruby binding to Cluster 3.0}
    s.email = ["deepfryed@gmail.com"]
    s.extensions = ["ext/extconf.rb"]
@@ -17,27 +17,22 @@ Gem::Specification.new do |s|
      "README.rdoc"
    ]
    s.files = [
+     "API.rdoc",
      "README.rdoc",
-     "Rakefile",
-     "VERSION",
-     "ext/cluster.c",
-     "ext/cluster.h",
-     "ext/extconf.rb",
-     "ext/flock.c",
-     "flock.gemspec",
-     "lib/flock.rb"
+     "Rakefile",
+     "VERSION",
+     "ext/cluster.c",
+     "ext/cluster.h",
+     "ext/extconf.rb",
+     "ext/flock.c",
+     "ext/kmeanspp.c",
+     "flock.gemspec",
+     "lib/flock.rb"
    ]
    s.homepage = %q{http://github.com/deepfryed/flock}
-   s.rdoc_options = ["--charset=UTF-8"]
    s.require_paths = ["lib"]
    s.rubygems_version = %q{1.3.7}
    s.summary = %q{Ruby bindings to Cluster 3.0.}
-   s.test_files = [
-     "examples/sparse.rb",
-     "examples/som.rb",
-     "examples/dense.rb",
-     "examples/treecluster.rb"
-   ]

    if s.respond_to? :specification_version then
      current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION

lib/flock.rb
@@ -1,50 +1,247 @@
  require_relative '../ext/flock'
+
+ # Ruby bindings to data clustering algorithms provided by
+ # {Cluster 3.0}[http://bonsai.hgc.jp/~mdehoon/software/cluster/software.htm]
+ #
+ # == Algorithms implemented
+ #
+ # * K-Means, K-Medians, K-Means++
+ # * Self-Organizing Maps
+ # * Tree Cluster or Hierarchical Clustering
+ #
+ # == Synopsis
+ #
+ #   require 'pp'
+ #   require 'flock'
+ #
+ #   # sparse data.
+ #   data = []
+ #   data << %w(apple orange)
+ #   data << %w(black white)
+ #   data << %w(white cyan)
+ #   data << %w(apple orange)
+ #   data << %w(apple)
+ #
+ #   pp Flock.kcluster(2, data, sparse: true, seed: Flock::SEED_RANDOM)
+ #   pp Flock.kcluster(2, data, sparse: true, seed: Flock::SEED_KMEANS_PLUSPLUS)
+ #   pp Flock.kcluster(2, data, sparse: true, seed: Flock::SEED_SPREADOUT)
+ #
+ #   # dense data.
+ #   data    = Array.new(13) {[]}
+ #   mask    = Array.new(13) {[]}
+ #   weights = Array.new(13) {1.0}
+ #
+ #   data[0][0] = 0.1; data[0][1] = 0.0;
+ #   data[1][0] = 1.4; data[1][1] = 1.3;
+ #   data[2][0] = 1.2; data[2][1] = 2.5;
+ #   data[3][0] = 2.3; data[3][1] = 1.5;
+ #   data[4][0] = 1.7; data[4][1] = 0.7;
+ #   data[5][0] = 0.0; data[5][1] = 3.9;
+ #   data[6][0] = 6.7; data[6][1] = 3.9;
+ #
+ #   mask[0][0] = 1; mask[0][1] = 1;
+ #   mask[1][0] = 1; mask[1][1] = 1;
+ #   mask[2][0] = 1; mask[2][1] = 1;
+ #   mask[3][0] = 1; mask[3][1] = 1;
+ #   mask[4][0] = 1; mask[4][1] = 1;
+ #   mask[5][0] = 0; mask[5][1] = 1;
+ #   mask[6][0] = 1; mask[6][1] = 1;
+ #
+ #   pp Flock.kcluster(2, data, mask: mask, weights: weights)
+ #
+ #
+ # == See
+ # * examples/* for more examples.
+ # * README.rdoc for more details.
+ # * API.rdoc is a public API overview.
  module Flock

-   def self.sparse_hash_to_data sparse_data
-     dims = Hash[sparse_data.map(&:keys).flatten.uniq.map.with_index{|k,v| [k,v]}]
-     data = sparse_data.map do |sv|
-       vector = Array.new(dims.size) {0}
-       sv.each {|k,v| vector[dims[k]] = v }
-       vector
+   # Cluster using k-means and k-medians.
+   #
+   # @example
+   #
+   #   data = []
+   #   data << %w(apple orange)
+   #   data << %w(black white)
+   #   data << %w(white cyan)
+   #   data << %w(apple orange)
+   #   data << %w(apple)
+   #   result = Flock.kcluster(2, data, sparse: true, seed: Flock::SEED_RANDOM)
+   #
+   # @param [Fixnum] size number of clusters the data points are grouped into.
+   # @param [Array] data An array of arrays of sparse or dense data, or an array of hashes of sparse data. Dense data
+   #                     should always be in numeric form. Sparse data values are converted to a dense row format
+   #                     by looking at the unique values and then converting each data point into a numeric vector
+   #                     that represents the presence or absence of a value in that data point.
+   # @option options [Array] :mask An array of arrays of 1s and 0s denoting if an element in the datapoint is
+   #                               to be used for computing distance (defaults to: all 1 vectors).
+   # @option options [Array] :weights Numeric weight for each data point (defaults to: all 1 vector).
+   # @option options [true, false] :transpose Transpose the dense data matrix (defaults to: false).
+   # @option options [Fixnum] :iterations Number of iterations to be run (defaults to: 100).
+   # @option options [Fixnum] :method Clustering method
+   #   - Flock::METHOD_AVERAGE (default)
+   #   - Flock::METHOD_MEDIAN
+   # @option options [Fixnum] :metric Distance measure, one of the following
+   #   - Flock::METRIC_EUCLIDIAN (default)
+   #   - Flock::METRIC_CITY_BLOCK
+   #   - Flock::METRIC_CORRELATION
+   #   - Flock::METRIC_ABSOLUTE_CORRELATION
+   #   - Flock::METRIC_UNCENTERED_CORRELATION
+   #   - Flock::METRIC_ABSOLUTE_UNCENTERED_CORRELATION
+   #   - Flock::METRIC_SPEARMAN
+   #   - Flock::METRIC_KENDALL
+   # @option options [Fixnum] :seed Initial seeding of clusters
+   #   - Flock::SEED_RANDOM (default)
+   #   - Flock::SEED_KMEANS_PLUSPLUS
+   #   - Flock::SEED_SPREADOUT
+   # @return [Hash]
+   #   {
+   #     :cluster  => [Array],
+   #     :centroid => [Array<Array>],
+   #     :error    => [Numeric],
+   #     :repeated => [Fixnum]
+   #   }
+   def self.kcluster size, data, options = {}
+     options[:sparse] = true if sparse?(data[0])
+     if options[:sparse]
+       data, options[:weights] = densify(data, options[:weights])
+       options[:mask] = nil
      end
-     [dims, data]
+     do_kcluster(size, data, options)
    end

-   def self.sparse_array_to_data sparse_data
-     dims = Hash[sparse_data.flatten.uniq.map.with_index{|k,v| [k,v]}]
-     data = sparse_data.map do |sv|
-       vector = Array.new(dims.size) {0}
-       sv.each {|k| vector[dims[k]] = 1 }
-       vector
+   # Arranges data points on a 2D grid without having to specify a fixed cluster size. So in theory you could have
+   # a maximum of nxm clusters.
+   #
+   # @example
+   #
+   #   data = []
+   #   data << %w(apple orange)
+   #   data << %w(black white)
+   #   data << %w(white cyan)
+   #   data << %w(apple orange)
+   #   data << %w(apple)
+   #   result = Flock.self_organizing_map(2, 2, data, sparse: true)
+   #
+   # @param [Fixnum] nx Grid size in 1st dimension (x)
+   # @param [Fixnum] ny Grid size in 2nd dimension (y)
+   # @param [Array] data See Flock#kcluster
+   # @option options [Array] :mask See Flock#kcluster
+   # @option options [true, false] :transpose See Flock#kcluster
+   # @option options [Fixnum] :iterations See Flock#kcluster
+   # @option options [Fixnum] :metric See Flock#kcluster
+   # @option options [Numeric] :tau Initial tau value for distance metric.
+   # @return [Hash]
+   #   {
+   #     :cluster  => [Array<Array>],
+   #     :centroid => [Array<Array>]
+   #   }
+   def self.self_organizing_map nx, ny, data, options = {}
+     options[:sparse] = true if sparse?(data[0])
+     if options[:sparse]
+       data, options[:weights] = densify(data, options[:weights])
+       options[:mask] = nil
      end
-     [dims, data]
+     do_self_organizing_map(nx, ny, data, options)
    end

-   def self.densify sparse_data, weights = nil
-     dims, data = sparse_data[0].kind_of?(Array) ? sparse_array_to_data(sparse_data) : sparse_hash_to_data(sparse_data)
-
-     if weights
-       resampled = Array.new(dims.size) {1}
-       weights.each {|k,v| resampled[dims[k]] = v }
-       weights = resampled
+   # Clusters data into hierarchies and then returns the clusters required using cut-tree.
+   #
+   # @example
+   #
+   #   data = []
+   #   data << %w(apple orange)
+   #   data << %w(black white)
+   #   data << %w(white cyan)
+   #   data << %w(apple orange)
+   #   data << %w(apple)
+   #   result = Flock.treecluster(2, data, sparse: true)
+   #
+   # @param [Fixnum] size Number of clusters required. (See Flock#kcluster)
+   # @param [Array] data See Flock#kcluster
+   # @option options [Array] :mask See Flock#kcluster
+   # @option options [true, false] :transpose See Flock#kcluster
+   # @option options [Fixnum] :iterations See Flock#kcluster
+   # @option options [Fixnum] :metric See Flock#kcluster
+   # @option options [Fixnum] :method Method to use for treecluster
+   #   - Flock::METHOD_SINGLE_LINKAGE
+   #   - Flock::METHOD_MAXIMUM_LINKAGE
+   #   - Flock::METHOD_AVERAGE_LINKAGE (default)
+   #   - Flock::METHOD_CENTROID_LINKAGE
+   # @return [Hash]
+   #   {
+   #     :cluster => [Array]
+   #   }
+   def self.treecluster size, data, options = {}
+     options[:sparse] = true if sparse?(data[0])
+     if options[:sparse]
+       data, options[:weights] = densify(data, options[:weights])
+       options[:mask] = nil
      end
+     do_treecluster(size, data, options)
+   end

-     [data, weights]
+   # @deprecated use {kcluster} instead.
+   def self.kmeans size, data, options = {}
+     kcluster(size, data, options)
    end

-   def self.sparse_kmeans size, sparse_data, options = {}
-     data, options[:weights] = densify(sparse_data, options[:weights])
-     kmeans(size, data, data, options)
+   # @deprecated use {kcluster}(size, data, sparse: true, ...) instead.
+   def self.sparse_kmeans size, data, options = {}
+     kcluster(size, data, options.merge(sparse: true))
    end

-   def self.sparse_self_organizing_map nx, ny, sparse_data, options = {}
-     data, options[:weights] = densify(sparse_data, options[:weights])
-     self_organizing_map(nx, ny, data, data, options)
+   # @deprecated use {treecluster}(size, data, sparse: true, ...) instead.
+   def self.sparse_treecluster size, data, options = {}
+     treecluster(size, data, options.merge(sparse: true))
    end

-   def self.sparse_treecluster size, sparse_data, options = {}
-     data, options[:weights] = densify(sparse_data, options[:weights])
-     treecluster(size, data, data, options)
+   # @deprecated use {self_organizing_map}(nx, ny, data, sparse: true, ...) instead.
+   def self.sparse_self_organizing_map nx, ny, data, options = {}
+     self_organizing_map(nx, ny, data, options.merge(sparse: true))
    end
- end
+
+   private
+
+   def self.sparse? row
+     row.kind_of?(Hash) or !row[0].kind_of?(Numeric)
+   end
+
+   def self.sparse_array? row
+     !row.kind_of?(Hash)
+   end
+
+   def self.sparse_hash_to_data sparse_data
+     dims = Hash[sparse_data.map(&:keys).flatten.uniq.map.with_index{|k,v| [k,v]}]
+     data = sparse_data.map do |sv|
+       vector = Array.new(dims.size) {0}
+       sv.each {|k,v| vector[dims[k]] = v }
+       vector
+     end
+
+     [dims, data]
+   end
+
+   def self.sparse_array_to_data sparse_data
+     dims = Hash[sparse_data.flatten.uniq.map.with_index{|k,v| [k,v]}]
+     data = sparse_data.map do |sv|
+       vector = Array.new(dims.size) {0}
+       sv.each {|k| vector[dims[k]] = 1 }
+       vector
+     end
+
+     [dims, data]
+   end
+
+   def self.densify sparse_data, weights = nil
+     dims, data = sparse_array?(sparse_data[0]) ? sparse_array_to_data(sparse_data) : sparse_hash_to_data(sparse_data)
+
+     if weights
+       resampled = Array.new(dims.size) {1}
+       weights.each {|k,v| resampled[dims[k]] = v }
+       weights = resampled
+     end
+
+     [data, weights]
+   end
+ end # Flock
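
With these changes the public entry points are kcluster, self_organizing_map and treecluster: sparse input is
detected automatically by sparse? (any row that is a Hash or whose first element is non-numeric), densified by
the private helpers, and the old sparse_* methods survive only as deprecated wrappers. A short usage sketch,
based on the synopsis above; the exact cluster numbering depends on the random seeding:

    require 'flock'

    # %w(apple orange) style rows are turned by densify into binary presence
    # vectors over the union of observed values, e.g.
    #   apple orange black white cyan
    #   [1,    1,     0,    0,    0]   # %w(apple orange)
    #   [0,    0,     1,    1,    0]   # %w(black white)
    data = [%w(apple orange), %w(black white), %w(white cyan), %w(apple orange), %w(apple)]

    old_style = Flock.sparse_kmeans(2, data)                                # deprecated wrapper
    new_style = Flock.kcluster(2, data, seed: Flock::SEED_KMEANS_PLUSPLUS)  # sparse input auto-detected

    new_style[:cluster]  # => one cluster index per input row, e.g. [0, 1, 1, 0, 0]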