flock 0.4.1 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,129 @@
1
+ #include <stdlib.h>
2
+
3
+ extern double uniform();
4
/* Per-point bookkeeping shared by the seeding routines in this file. */
typedef struct clusterpoint {
    double dist;    /* value stored by compute_distances(): square of the smallest metric result */
    int n;          /* index of the point in the caller's data (assigned before any sorting) */
    int chosen;     /* non-zero once the point has been picked as a seed */
    int closest;    /* slot in the dists[] array of the nearest chosen point */
} clusterpoint;
8
+
9
+ int compare(const void *ptr1, const void *ptr2) {
10
+ clusterpoint *p1 = (clusterpoint *)ptr1, *p2 = (clusterpoint *)ptr2;
11
+ return p1->dist == p2->dist ? 0 : p1->dist < p2->dist ? -1 : 1;
12
+ }
13
+
14
+ double compute_distances(int ndata, int npoints,
15
+ double **data, int **mask, double weight[], int transpose, clusterpoint dists[],
16
+ double (*metric)(int, double**, double**, int**, int**, const double[], int, int, int)) {
17
+
18
+ int i, j, closest = 0;
19
+ double min, dist, total = 0;
20
+
21
+ // compute distances to chosen point
22
+ for (i = 0; i < npoints; i++) {
23
+ if (dists[i].chosen) continue;
24
+
25
+ min = -1;
26
+ for (j = 0; j < npoints; j++) {
27
+ if (!dists[j].chosen) continue;
28
+
29
+ dist = metric(ndata, data, data, mask, mask, weight, dists[i].n, dists[j].n, transpose);
30
+ if (min < 0 || min > dist) {
31
+ min = dist;
32
+ closest = j;
33
+ }
34
+ }
35
+
36
+ dists[i].dist = min * min;
37
+ dists[i].closest = closest;
38
+ total += dists[i].dist;
39
+ }
40
+
41
+ return total;
42
+ }
43
+
44
+ void weightedassign(int nclusters, int nrows, int ncolumns,
45
+ double** data, int** mask, double weight[], int transpose,
46
+ double (*metric)(int, double**, double**, int**, int**, const double[], int, int, int),
47
+ int clusterid[]) {
48
+
49
+ int i, n, chosen = (int)((double)nrows*uniform());
50
+ int ndata = (transpose == 0 ? ncolumns : nrows), npoints = (transpose == 0 ? nrows : ncolumns);
51
+ double total = 0, cutoff, curr;
52
+ clusterpoint dists[npoints];
53
+
54
+ for (i = 0; i < npoints; i++) {
55
+ dists[i].n = i;
56
+ dists[i].chosen = 0;
57
+ dists[i].dist = 0;
58
+ }
59
+
60
+ // setup 1st centroid
61
+ n = 1;
62
+ clusterid[chosen] = 0;
63
+ dists[chosen].chosen = 1;
64
+
65
+ // pick k-points for k-clusters with a probability weighted by square of distance from closest centroid.
66
+ while (n < nclusters) {
67
+ total = compute_distances(ndata, npoints, data, mask, weight, transpose, dists, metric);
68
+ qsort((void*)dists, npoints, sizeof(clusterpoint), compare);
69
+
70
+ curr = 0;
71
+ cutoff = total * uniform();
72
+ for (i = 0; i < npoints; i++) {
73
+ if (dists[i].chosen) continue;
74
+ curr += dists[i].dist;
75
+ if (curr >= cutoff || i == (npoints - 1)) {
76
+ clusterid[dists[i].n] = n++;
77
+ dists[i].chosen = 1;
78
+ dists[i].dist = 0;
79
+ break;
80
+ }
81
+ }
82
+ }
83
+
84
+ // assign remaining points to closest cluster
85
+ compute_distances(ndata, npoints, data, mask, weight, transpose, dists, metric);
86
+ for (n = 0; n < npoints; n++) {
87
+ if (dists[n].chosen) continue;
88
+ clusterid[dists[n].n] = clusterid[dists[dists[n].closest].n];
89
+ }
90
+ }
91
+
92
+ void spreadoutassign(int nclusters, int nrows, int ncolumns,
93
+ double** data, int** mask, double weight[], int transpose,
94
+ double (*metric)(int, double**, double**, int**, int**, const double[], int, int, int),
95
+ int clusterid[]) {
96
+
97
+ int i, n, chosen = 0;
98
+ int ndata = (transpose == 0 ? ncolumns : nrows), npoints = (transpose == 0 ? nrows : ncolumns);
99
+ clusterpoint dists[npoints];
100
+
101
+ for (i = 0; i < npoints; i++) {
102
+ dists[i].n = i;
103
+ dists[i].chosen = 0;
104
+ dists[i].dist = 0;
105
+ }
106
+
107
+ // setup 1st centroid
108
+ n = 1;
109
+ clusterid[chosen] = 0;
110
+ dists[chosen].chosen = 1;
111
+
112
+ // pick k-points for k-clusters with max distance from all centers.
113
+ chosen = npoints - 1;
114
+ while (n < nclusters) {
115
+ compute_distances(ndata, npoints, data, mask, weight, transpose, dists, metric);
116
+ qsort((void*)dists, npoints, sizeof(clusterpoint), compare);
117
+
118
+ clusterid[dists[chosen].n] = n++;
119
+ dists[chosen].chosen = 1;
120
+ dists[chosen].dist = 0;
121
+ }
122
+
123
+ // assign remaining points to closest cluster
124
+ compute_distances(ndata, npoints, data, mask, weight, transpose, dists, metric);
125
+ for (n = 0; n < npoints; n++) {
126
+ if (dists[n].chosen) continue;
127
+ clusterid[dists[n].n] = clusterid[dists[dists[n].closest].n];
128
+ }
129
+ }
@@ -1,15 +1,15 @@
1
1
  # Generated by jeweler
2
2
  # DO NOT EDIT THIS FILE DIRECTLY
3
- # Instead, edit Jeweler::Tasks in Rakefile, and run the gemspec command
3
+ # Instead, edit Jeweler::Tasks in Rakefile, and run 'rake gemspec'
4
4
  # -*- encoding: utf-8 -*-
5
5
 
6
6
  Gem::Specification.new do |s|
7
7
  s.name = %q{flock}
8
- s.version = "0.4.1"
8
+ s.version = "0.5.0"
9
9
 
10
10
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
11
  s.authors = ["Bharanee Rathna"]
12
- s.date = %q{2011-04-28}
12
+ s.date = %q{2011-07-26}
13
13
  s.description = %q{A thin ruby binding to Cluster 3.0}
14
14
  s.email = ["deepfryed@gmail.com"]
15
15
  s.extensions = ["ext/extconf.rb"]
@@ -17,27 +17,22 @@ Gem::Specification.new do |s|
17
17
  "README.rdoc"
18
18
  ]
19
19
  s.files = [
20
+ "API.rdoc",
20
21
  "README.rdoc",
21
- "Rakefile",
22
- "VERSION",
23
- "ext/cluster.c",
24
- "ext/cluster.h",
25
- "ext/extconf.rb",
26
- "ext/flock.c",
27
- "flock.gemspec",
28
- "lib/flock.rb"
22
+ "Rakefile",
23
+ "VERSION",
24
+ "ext/cluster.c",
25
+ "ext/cluster.h",
26
+ "ext/extconf.rb",
27
+ "ext/flock.c",
28
+ "ext/kmeanspp.c",
29
+ "flock.gemspec",
30
+ "lib/flock.rb"
29
31
  ]
30
32
  s.homepage = %q{http://github.com/deepfryed/flock}
31
- s.rdoc_options = ["--charset=UTF-8"]
32
33
  s.require_paths = ["lib"]
33
34
  s.rubygems_version = %q{1.3.7}
34
35
  s.summary = %q{Ruby bindings to Cluster 3.0.}
35
- s.test_files = [
36
- "examples/sparse.rb",
37
- "examples/som.rb",
38
- "examples/dense.rb",
39
- "examples/treecluster.rb"
40
- ]
41
36
 
42
37
  if s.respond_to? :specification_version then
43
38
  current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
@@ -1,50 +1,247 @@
1
1
  require_relative '../ext/flock'
2
+
3
+ # Ruby bindings to data clustering algorithms provided by
4
+ # {Cluster 3.0}[http://bonsai.hgc.jp/~mdehoon/software/cluster/software.htm]
5
+ #
6
+ # == Algorithms implemented
7
+ #
8
+ # * K-Means, K-Medians, K-Means++
9
+ # * Self-Organizing Maps
10
+ # * Tree Cluster or Hierarchical Clustering
11
+ #
12
+ # == Synopsis
13
+ #
14
+ # require 'pp'
15
+ # require 'flock'
16
+ #
17
+ # # sparse data.
18
+ # data = []
19
+ # data << %w(apple orange)
20
+ # data << %w(black white)
21
+ # data << %w(white cyan)
22
+ # data << %w(apple orange)
23
+ # data << %w(apple)
24
+ #
25
+ # pp Flock.kcluster(2, data, sparse: true, seed: Flock::SEED_RANDOM)
26
+ # pp Flock.kcluster(2, data, sparse: true, seed: Flock::SEED_KMEANS_PLUSPLUS)
27
+ # pp Flock.kcluster(2, data, sparse: true, seed: Flock::SEED_SPREADOUT)
28
+ #
29
+ # # dense data.
30
+ # data = Array.new(13) {[]}
31
+ # mask = Array.new(13) {[]}
32
+ # weights = Array.new(13) {1.0}
33
+ #
34
+ # data[0][0] = 0.1; data[0][1] = 0.0;
35
+ # data[1][0] = 1.4; data[1][1] = 1.3;
36
+ # data[2][0] = 1.2; data[2][1] = 2.5;
37
+ # data[3][0] = 2.3; data[3][1] = 1.5;
38
+ # data[4][0] = 1.7; data[4][1] = 0.7;
39
+ # data[5][0] = 0.0; data[5][1] = 3.9;
40
+ # data[6][0] = 6.7; data[6][1] = 3.9;
41
+ #
42
+ # mask[0][0] = 1; mask[0][1] = 1;
43
+ # mask[1][0] = 1; mask[1][1] = 1;
44
+ # mask[2][0] = 1; mask[2][1] = 1;
45
+ # mask[3][0] = 1; mask[3][1] = 1;
46
+ # mask[4][0] = 1; mask[4][1] = 1;
47
+ # mask[5][0] = 0; mask[5][1] = 1;
48
+ # mask[6][0] = 1; mask[6][1] = 1;
49
+ #
50
+ # pp Flock.kcluster(2, data, mask: mask, weights: weights)
51
+ #
52
+ #
53
+ # == See
54
+ # * examples/* for more examples.
55
+ # * README.rdoc for more details.
56
+ # * API.rdoc is a public API overview.
2
57
  module Flock
3
58
 
4
- def self.sparse_hash_to_data sparse_data
5
- dims = Hash[sparse_data.map(&:keys).flatten.uniq.map.with_index{|k,v| [k,v]}]
6
- data = sparse_data.map do |sv|
7
- vector = Array.new(dims.size) {0}
8
- sv.each {|k,v| vector[dims[k]] = v }
9
- vector
59
+ # Cluster using k-means and k-medians.
60
+ #
61
+ # @example
62
+ #
63
+ # data = []
64
+ # data << %w(apple orange)
65
+ # data << %w(black white)
66
+ # data << %w(white cyan)
67
+ # data << %w(apple orange)
68
+ # data << %w(apple)
69
+ # result = Flock.kcluster(2, data, sparse: true, seed: Flock::SEED_RANDOM)
70
+ #
71
+ # @param [Fixnum] size number of clusters the data points are grouped into.
72
+ # @param [Array] data An array of arrays of sparse or dense data, or an array of hashes of sparse data. Dense data
73
+ # should always be in numeric form. Sparse data values are converted to a dense row format
74
+ # by looking at the unique values and then converting each data point into a numeric vector
75
+ # that represents the presence or absence of a value in that data point.
76
+ # @option options [Array] :mask An array of arrays of 1s and 0s denoting if an element in the datapoint is
77
+ # to be used for computing distance (defaults to: all 1 vectors).
78
+ # @option options [Array, Hash] :weights Numeric weight for each dimension of a data point; for sparse data, a Hash mapping a feature to its weight (defaults to: all 1 vector).
79
+ # @option options [true, false] :transpose Transpose the dense data matrix (defaults to: false).
80
+ # @option options [Fixnum] :iterations Number of iterations to be run (defaults to: 100).
81
+ # @option options [Fixnum] :method Clustering method
82
+ # - Flock::METHOD_AVERAGE (default)
83
+ # - Flock::METHOD_MEDIAN
84
+ # @option options [Fixnum] :metric Distance measure, one of the following
85
+ # - Flock::METRIC_EUCLIDIAN (default)
86
+ # - Flock::METRIC_CITY_BLOCK
87
+ # - Flock::METRIC_CORRELATION
88
+ # - Flock::METRIC_ABSOLUTE_CORRELATION
89
+ # - Flock::METRIC_UNCENTERED_CORRELATION
90
+ # - Flock::METRIC_ABSOLUTE_UNCENTERED_CORRELATION
91
+ # - Flock::METRIC_SPEARMAN
92
+ # - Flock::METRIC_KENDALL
93
+ # @option options [Fixnum] :seed Initial seeding of clusters
94
+ # - Flock::SEED_RANDOM (default)
95
+ # - Flock::SEED_KMEANS_PLUSPLUS
96
+ # - Flock::SEED_SPREADOUT
97
+ # @return [Hash]
98
+ # {
99
+ # :cluster => [Array],
100
+ # :centroid => [Array<Array>],
101
+ # :error => [Numeric],
102
+ # :repeated => [Fixnum]
103
+ # }
104
def self.kcluster size, data, options = {}
  # Work on a copy: the sparse path below rewrites :sparse, :weights and :mask,
  # and a caller reusing one options hash across calls must not get those
  # rewritten values (e.g. already-densified weights) fed back in.
  options = options.dup
  options[:sparse] = true if sparse?(data[0])
  if options[:sparse]
    data, options[:weights] = densify(data, options[:weights])
    # A caller-supplied mask no longer lines up with the densified matrix.
    options[:mask] = nil
  end
  do_kcluster(size, data, options)
end
13
112
 
14
- def self.sparse_array_to_data sparse_data
15
- dims = Hash[sparse_data.flatten.uniq.map.with_index{|k,v| [k,v]}]
16
- data = sparse_data.map do |sv|
17
- vector = Array.new(dims.size) {0}
18
- sv.each {|k| vector[dims[k]] = 1 }
19
- vector
113
+ # Arranges data points on a 2D grid without having to specify a fixed cluster size. So in theory you could have
114
+ # a maximum of nx * ny clusters.
115
+ #
116
+ # @example
117
+ #
118
+ # data = []
119
+ # data << %w(apple orange)
120
+ # data << %w(black white)
121
+ # data << %w(white cyan)
122
+ # data << %w(apple orange)
123
+ # data << %w(apple)
124
+ # result = Flock.self_organizing_map(2, 2, data, sparse: true)
125
+ #
126
+ # @param [Fixnum] nx Grid size in 1st dimension (x)
127
+ # @param [Fixnum] ny Grid size in 2nd dimension (y)
128
+ # @param [Array] data See Flock#kcluster
129
+ # @option options [Array] :mask See Flock#kcluster
130
+ # @option options [true, false] :transpose See Flock#kcluster
131
+ # @option options [Fixnum] :iterations See Flock#kcluster
132
+ # @option options [Fixnum] :metric See Flock#kcluster
133
+ # @option options [Numeric] :tau Initial tau value for distance metric.
134
+ # @return [Hash]
135
+ # {
136
+ # :cluster => [Array<Array>],
137
+ # :centroid => [Array<Array>]
138
+ # }
139
def self.self_organizing_map nx, ny, data, options = {}
  # Work on a copy: the sparse path below rewrites :sparse, :weights and :mask,
  # and a caller reusing one options hash across calls must not get those
  # rewritten values fed back in.
  options = options.dup
  options[:sparse] = true if sparse?(data[0])
  if options[:sparse]
    data, options[:weights] = densify(data, options[:weights])
    # A caller-supplied mask no longer lines up with the densified matrix.
    options[:mask] = nil
  end
  do_self_organizing_map(nx, ny, data, options)
end
23
147
 
24
- def self.densify sparse_data, weights = nil
25
- dims, data = sparse_data[0].kind_of?(Array) ? sparse_array_to_data(sparse_data) : sparse_hash_to_data(sparse_data)
26
-
27
- if weights
28
- resampled = Array.new(dims.size) {1}
29
- weights.each {|k,v| resampled[dims[k]] = v }
30
- weights = resampled
148
+ # Clusters data into hierarchies and then returns the clusters required using cut-tree.
149
+ #
150
+ # @example
151
+ #
152
+ # data = []
153
+ # data << %w(apple orange)
154
+ # data << %w(black white)
155
+ # data << %w(white cyan)
156
+ # data << %w(apple orange)
157
+ # data << %w(apple)
158
+ # result = Flock.treecluster(2, data, sparse: true)
159
+ #
160
+ # @param [Fixnum] size Number of clusters required. (See Flock#kcluster)
161
+ # @param [Array] data See Flock#kcluster
162
+ # @option options [Array] :mask See Flock#kcluster
163
+ # @option options [true, false] :transpose See Flock#kcluster
164
+ # @option options [Fixnum] :iterations See Flock#kcluster
165
+ # @option options [Fixnum] :metric See Flock#kcluster
166
+ # @option options [Fixnum] :method Method to use for treecluster
167
+ # - Flock::METHOD_SINGLE_LINKAGE
168
+ # - Flock::METHOD_MAXIMUM_LINKAGE
169
+ # - Flock::METHOD_AVERAGE_LINKAGE (default)
170
+ # - Flock::METHOD_CENTROID_LINKAGE
171
+ # @return [Hash]
172
+ # {
173
+ # :cluster => [Array]
174
+ # }
175
def self.treecluster size, data, options = {}
  # Work on a copy: the sparse path below rewrites :sparse, :weights and :mask,
  # and a caller reusing one options hash across calls must not get those
  # rewritten values fed back in.
  options = options.dup
  options[:sparse] = true if sparse?(data[0])
  if options[:sparse]
    data, options[:weights] = densify(data, options[:weights])
    # A caller-supplied mask no longer lines up with the densified matrix.
    options[:mask] = nil
  end
  do_treecluster(size, data, options)
end
32
183
 
33
- [data, weights]
184
+ # @deprecated use {kcluster} instead.
185
def self.kmeans size, data, options = {}
  # Legacy name kept for existing callers; forwards every argument untouched.
  kcluster size, data, options
end
35
188
 
36
- def self.sparse_kmeans size, sparse_data, options = {}
37
- data, options[:weights] = densify(sparse_data, options[:weights])
38
- kmeans(size, data, data, options)
189
+ # @deprecated use {kcluster}(size, data, sparse: true, ...) instead.
190
def self.sparse_kmeans size, data, options = {}
  # merge builds a new hash, so the caller's options are not modified here.
  sparse_options = options.merge(sparse: true)
  kcluster(size, data, sparse_options)
end
40
193
 
41
- def self.sparse_self_organizing_map nx, ny, sparse_data, options = {}
42
- data, options[:weights] = densify(sparse_data, options[:weights])
43
- self_organizing_map(nx, ny, data, data, options)
194
+ # @deprecated use {treecluster}(size, data, sparse: true, ...) instead.
195
def self.sparse_treecluster size, data, options = {}
  # merge builds a new hash, so the caller's options are not modified here.
  sparse_options = options.merge(sparse: true)
  treecluster(size, data, sparse_options)
end
45
198
 
46
- def self.sparse_treecluster size, sparse_data, options = {}
47
- data, options[:weights] = densify(sparse_data, options[:weights])
48
- treecluster(size, data, data, options)
199
+ # @deprecated use {self_organizing_map}(nx, ny, data, sparse: true, ...) instead.
200
def self.sparse_self_organizing_map nx, ny, data, options = {}
  # merge builds a new hash, so the caller's options are not modified here.
  sparse_options = options.merge(sparse: true)
  self_organizing_map(nx, ny, data, sparse_options)
end
50
- end
203
+
204
+ private
205
+
206
def self.sparse? row
  # A Hash row always counts as sparse; any other row counts as sparse
  # unless its first element is Numeric.
  return true if row.kind_of?(Hash)

  !row[0].kind_of?(Numeric)
end
209
+
210
def self.sparse_array? row
  # Anything that is not a Hash is handled as an array-style sparse row.
  row.kind_of?(Hash) ? false : true
end
213
+
214
def self.sparse_hash_to_data sparse_data
  # Give every distinct key a column index in first-seen order. Keys are
  # collected one by one rather than with a recursive flatten, which would
  # split an Array-valued key into its elements and leave the real key
  # without a column.
  dims = {}
  sparse_data.each do |row|
    row.each_key { |key| dims[key] ||= dims.size }
  end

  # One dense vector per row: 0 by default, the row's own value where set.
  data = sparse_data.map do |row|
    vector = Array.new(dims.size, 0)
    row.each { |key, value| vector[dims[key]] = value }
    vector
  end

  [dims, data]
end
224
+
225
def self.sparse_array_to_data sparse_data
  # Give every distinct feature a column index in first-seen order. Features
  # are collected one by one rather than with a recursive flatten, which
  # would split a nested Array feature into its elements and leave the real
  # feature without a column.
  dims = {}
  sparse_data.each do |row|
    row.each { |feature| dims[feature] ||= dims.size }
  end

  # One dense vector per row: 1 where the feature is present, 0 elsewhere.
  data = sparse_data.map do |row|
    vector = Array.new(dims.size, 0)
    row.each { |feature| vector[dims[feature]] = 1 }
    vector
  end

  [dims, data]
end
235
+
236
def self.densify sparse_data, weights = nil
  # The first row decides which converter handles the whole data set.
  dims, data =
    if sparse_array?(sparse_data[0])
      sparse_array_to_data(sparse_data)
    else
      sparse_hash_to_data(sparse_data)
    end

  if weights
    # Re-express the key => weight pairs as a dense vector in column order;
    # columns without an explicit weight default to 1.
    dense_weights = Array.new(dims.size, 1)
    weights.each { |key, value| dense_weights[dims[key]] = value }
    weights = dense_weights
  end

  [data, weights]
end
247
+ end # Flock