reem 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/Changes.md +0 -0
- data/Gemfile +9 -0
- data/LICENSE +21 -0
- data/README.md +0 -0
- data/Rakefile +9 -0
- data/lib/reem.rb +8 -0
- data/lib/reem/.clusterer.rb.swp +0 -0
- data/lib/reem/cholesky_factorization.rb +33 -0
- data/lib/reem/clusterer.java +183 -0
- data/lib/reem/clusterer.rb +10 -0
- data/lib/reem/lower_triangular_matrix.rb +27 -0
- data/lib/reem/multivariate_gaussian.rb +13 -0
- data/lib/reem/version.rb +7 -0
- data/reem.gemspec +18 -0
- data/test/clusterer_test.rb +20 -0
- data/test/helper.rb +7 -0
- data/test/lower_triangular_matrix_test.rb +17 -0
- data/test/multivariate_gaussian_test.rb +27 -0
- metadata +95 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 21a616566baa006fcc7d121ca0d863beb7ce12f0
|
4
|
+
data.tar.gz: 7a7f6be47462eefe9901979414cf2cdf3a204a7e
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 58f172e95a0878b849778ea9029d81d32c0119d284c73f2967c1cf8010e888793839bbb086dc9603bec0234519bcfbe086ec7dd7e1c13e1fc5ee5f5b5987b00e
|
7
|
+
data.tar.gz: c59f57537da4203167f48da999f7deefc15912a864074b8244fb7152a6b821131a02a67d35ab61a8fbc984acdeb6836979e9022f9184e7b7d5d93bf32c2b5555
|
data/Changes.md
ADDED
File without changes
|
data/Gemfile
ADDED
@@ -0,0 +1,9 @@
|
|
1
|
+
# Bundler manifest for the gem. The `gemspec` line pulls the dependency list
# from the gemspec, so only platform-specific extras are declared here.
source 'https://rubygems.org'
gemspec

# Gems declared only for the :rbx (Rubinius) platform.
platforms :rbx do
  gem 'rubysl', '~> 2.0' # if using anything in the ruby standard library
  gem 'psych' # if using yaml
  gem 'minitest' # if using minitest
  gem 'rubinius-developer_tools' # if using any of coverage, debugger, profiler
end
|
data/LICENSE
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
The MIT License (MIT)
|
2
|
+
|
3
|
+
Copyright (c) 2014 Matthew Kirk
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
7
|
+
in the Software without restriction, including without limitation the rights
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
10
|
+
furnished to do so, subject to the following conditions:
|
11
|
+
|
12
|
+
The above copyright notice and this permission notice shall be included in
|
13
|
+
all copies or substantial portions of the Software.
|
14
|
+
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
21
|
+
THE SOFTWARE.
|
data/README.md
ADDED
File without changes
|
data/Rakefile
ADDED
data/lib/reem.rb
ADDED
Binary file
|
@@ -0,0 +1,33 @@
|
|
1
|
+
module Reem
  # In-place Cholesky factorization (A = L * L^T) of the matrix handed to the
  # constructor. The lower factor is built inside a LowerTriangularMatrix and
  # the upper factor is its transpose.
  #
  # Elements are addressed as [row, column]. The input is assumed to be
  # symmetric positive definite; a negative pivot makes Math.sqrt raise
  # Math::DomainError.
  class CholeskyFactorization
    # @return [LowerTriangularMatrix] the lower factor L
    # @return [Object] the transpose of L, as returned by LowerTriangularMatrix#transpose
    attr_reader :lower, :upper

    # @param matrix the matrix to factorize (passed straight to
    #   LowerTriangularMatrix.new)
    def initialize(matrix)
      @matrix = matrix
      @lower = LowerTriangularMatrix.new(matrix)
      decompose!
    end

    # Runs the outer-product form of the factorization column by column and
    # stores the transpose of the result in @upper.
    def decompose!
      @lower.n.times do |j|
        # Sqrt the diagonal
        pivot = Math.sqrt(@lower[j, j])
        @lower[j, j] = pivot

        # Divide the subdiagonal column by the diagonal
        (j + 1...@lower.m).each do |i|
          @lower[i, j] = @lower[i, j] / pivot
        end

        # Symmetric rank 1 update: subtract the outer product of the
        # subdiagonal part of column j from the remaining lower triangle.
        # Both factors come from column j (rows l and k).
        (j + 1...@lower.n).each do |k|
          (k...@lower.m).each do |l|
            @lower[l, k] = @lower[l, k] - @lower[l, j] * @lower[k, j]
          end
        end
      end

      @upper = @lower.transpose
    end
  end
end
|
@@ -0,0 +1,183 @@
|
|
1
|
+
/**
 * Expectation-maximization clusterer. {@link #estimate(DataSet)} seeds a
 * mixture of multivariate Gaussians from a k-means partition of the data and
 * then re-estimates the mixture until the mean log likelihood changes by less
 * than the tolerance or the iteration limit is reached.
 */
public class EMClusterer extends AbstractConditionalDistribution implements FunctionApproximater {
    /**
     * The default tolerance, used by the no-argument constructor
     */
    private static final double TOLERANCE = 1E-6;
    /**
     * The default maximum number of iterations, used by the no-argument constructor
     */
    private static final int MAX_ITERATIONS = 1000;
    /**
     * The mixture distribution
     */
    private MixtureDistribution mixture;
    /**
     * The number of clusters
     */
    private int k;
    /**
     * The threshold on the change in mean log likelihood below which estimation stops
     */
    private double tolerance;

    /**
     * The max iterations
     */
    private int maxIterations;

    /**
     * How many iterations it took
     */
    private int iterations;

    /**
     * Whether to print stuff
     */
    private boolean debug = false;

    /**
     * Make a new em clusterer
     * @param k the number of clusters
     * @param tolerance the tolerance
     * @param maxIterations the maximum number of re-estimation iterations
     */
    public EMClusterer(int k, double tolerance, int maxIterations) {
        this.k = k;
        this.tolerance = tolerance;
        this.maxIterations = maxIterations;
    }

    /**
     * Make a new clusterer with 2 clusters, the default tolerance and the
     * default maximum number of iterations
     */
    public EMClusterer() {
        this(2, TOLERANCE, MAX_ITERATIONS);
    }

    /**
     * Build a discrete distribution over the mixture components for an
     * instance by normalizing each component's probability of the instance.
     * @param instance the instance to evaluate
     * @return the normalized per-component distribution
     * @see func.Classifier#classDistribution(shared.Instance)
     */
    public Distribution distributionFor(Instance instance) {
        // calculate the log probs
        // NOTE(review): only the component log probabilities are read here;
        // the mixture's component weights are not applied - confirm intended
        double[] probs = new double[mixture.getComponents().length];
        double maxLog = Double.NEGATIVE_INFINITY;
        for (int i = 0; i < probs.length; i++) {
            probs[i] = mixture.getComponents()[i].logp(instance);
            maxLog = Math.max(maxLog, probs[i]);
        }
        // turn into real probs
        // the largest log value is subtracted first, so the largest term is exp(0)
        double sum = 0;
        for (int i = 0; i < probs.length; i++) {
            probs[i] = Math.exp(probs[i] - maxLog);
            sum += probs[i];
        }
        // normalize
        for (int i = 0; i < probs.length; i++) {
            probs[i] /= sum;
        }
        return new DiscreteDistribution(probs);
    }

    /**
     * Fit the mixture to a data set: partition with k-means, estimate one
     * Gaussian per partition with weight-proportional priors, then
     * re-estimate the mixture until convergence or the iteration limit.
     * @param set the data set to cluster
     * @see func.FunctionApproximater#estimate(shared.DataSet)
     */
    public void estimate(DataSet set) {
        // kmeans initialization
        KMeansClusterer kmeans = new KMeansClusterer(k);
        kmeans.estimate(set);
        double[] prior = new double[k];
        double weightSum = 0;
        int[] counts = new int[k];
        int[] classifications = new int[set.size()];
        // first pass: record each instance's cluster, per-cluster counts and
        // per-cluster weight totals
        for (int i = 0; i < set.size(); i++) {
            classifications[i] = kmeans.value(set.get(i)).getDiscrete();
            counts[classifications[i]]++;
            prior[classifications[i]] += set.get(i).getWeight();
            weightSum += set.get(i).getWeight();
        }
        // create data sets for each of the classes
        Instance[][] instances = new Instance[k][];
        for (int i = 0; i < instances.length; i++) {
            instances[i] = new Instance[counts[i]];
        }
        // counts is reused as the next free slot index for each cluster
        Arrays.fill(counts, 0);
        for (int i = 0; i < set.size(); i++) {
            instances[classifications[i]][counts[classifications[i]]] = set.get(i);
            counts[classifications[i]]++;
        }
        MultivariateGaussian[] initial = new MultivariateGaussian[k];
        for (int i = 0; i < initial.length; i++) {
            initial[i] = new MultivariateGaussian();
            initial[i].setDebug(debug);
            initial[i].estimate(new DataSet(instances[i]));
            // turn the accumulated cluster weight into a fraction of the total
            prior[i] /= weightSum;
        }
        mixture = new MixtureDistribution(initial, prior);
        // reestimate
        boolean done = false;
        double lastLogLikelihood = 0;
        iterations = 0;
        while (!done) {
            if (debug) {
                System.out.println("On iteration " + iterations);
                System.out.println(mixture);
            }
            mixture.estimate(set);
            // mean log likelihood of the data under the re-estimated mixture
            double logLikelihood = 0;
            for (int j = 0; j < set.size(); j++) {
                logLikelihood += mixture.logp(set.get(j));
            }
            logLikelihood /= set.size();
            // the convergence test is skipped on the first pass because there
            // is no previous likelihood to compare against
            done = (iterations > 0 && Math.abs(logLikelihood - lastLogLikelihood) < tolerance)
                || (iterations + 1 >= maxIterations);
            lastLogLikelihood = logLikelihood;
            iterations++;
        }
    }

    /**
     * Return the mode of the per-component distribution for an instance.
     * @param i the instance to classify
     * @return the mode of {@link #distributionFor(Instance)}
     * @see func.FunctionApproximater#value(shared.Instance)
     */
    public Instance value(Instance i) {
        return distributionFor(i).mode();
    }

    /**
     * Get the number of iterations it took
     * @return the number
     */
    public int getIterations() {
        return iterations;
    }

    /**
     * Is debug mode on
     * @return true if it is
     */
    public boolean isDebug() {
        return debug;
    }

    /**
     * Set debug mode on or off
     * @param b the debug mode
     */
    public void setDebug(boolean b) {
        debug = b;
    }

    /**
     * Get the mixture
     * @return the mixture
     */
    public MixtureDistribution getMixture() {
        return mixture;
    }

    /**
     * @see java.lang.Object#toString()
     */
    public String toString() {
        return mixture.toString();
    }

}
|
@@ -0,0 +1,27 @@
|
|
1
|
+
module Reem
  # Lower-triangular copy of a matrix, held in a float NArray (@ltm).
  #
  # NArray is indexed as [column, row]. The element accessors below take
  # (row, column) instead, which is the order CholeskyFactorization uses when
  # it reads and writes entries.
  class LowerTriangularMatrix
    # Copies the entries on and below the diagonal; everything above the
    # diagonal stays at the 0.0 that NArray.float starts with.
    #
    # @param matrix an NArray-like object responding to #sizes and #[]
    def initialize(matrix)
      columns, rows = matrix.sizes
      @ltm = NArray.float(columns, rows)

      rows.times do |row|
        # Clamp to the last real column so non-square input stays in range.
        last_column = [row, columns - 1].min
        (0..last_column).each do |column|
          @ltm[column, row] = matrix[column, row]
        end
      end
    end

    # @return [Integer] number of columns
    def n
      @ltm.sizes[0]
    end

    # @return [Integer] number of rows
    def m
      @ltm.sizes[1]
    end

    # @param row [Integer]
    # @param column [Integer]
    # @return [Float] the entry at (row, column)
    def [](row, column)
      @ltm[column, row]
    end

    # Writes the entry at (row, column) and drops the memoized transpose so
    # a later #transpose reflects the change.
    def []=(row, column, value)
      @transpose = nil
      @ltm[column, row] = value
    end

    # @return [NArray] the transposed (upper-triangular) matrix, memoized
    #   until the next write
    def transpose
      @transpose ||= begin
        columns, rows = @ltm.sizes
        # The transpose swaps the dimensions.
        flipped = NArray.float(rows, columns)
        columns.times do |column|
          rows.times do |row|
            flipped[row, column] = @ltm[column, row]
          end
        end
        flipped
      end
    end
  end
end
|
data/lib/reem/version.rb
ADDED
data/reem.gemspec
ADDED
@@ -0,0 +1,18 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
# Gem specification for reem. The version constant is loaded from
# lib/reem/version.rb and the file lists come from the git index.
require File.expand_path('../lib/reem/version', __FILE__)

Gem::Specification.new do |gem|
  gem.authors = ["Matt Kirk"]
  gem.email = ["matt@matthewkirk.com"]
  gem.description = gem.summary = "Simple EM Clustering for Ruby"
  gem.homepage = "http://github.com/hexgnu/reem"
  gem.license = "MIT"

  # Vim swap files (.swp, .swo, ...) that were committed by accident must not
  # be packaged into the gem.
  swap_file = /\.sw[a-p]\z/
  gem.files = `git ls-files`.split("\n").reject { |path| path =~ swap_file }
  gem.test_files = `git ls-files -- test/*`.split("\n")
  gem.name = "reem"
  gem.require_paths = ["lib"]
  gem.version = Reem::VERSION
  gem.add_dependency 'narray'
  gem.add_development_dependency 'minitest'
end
|
@@ -0,0 +1,20 @@
|
|
1
|
+
require 'helper'

# Spec for Reem::Clusterer: samples from two well-separated Gaussians and
# hands the combined data set to the clusterer. Currently skipped.
describe Reem::Clusterer do
  it 'separates two obvious gaussians into two clusters' do
    skip
    mga = ::Reem::MultivariateGaussian.new(Vector[100,100,100], Matrix.identity(3) * 0.01)
    mgb = ::Reem::MultivariateGaussian.new(Vector[-1,-1,-1], Matrix.identity(3) * 10)

    # NOTE(review): the sample count was never defined in the original (it
    # read `instances.length` before `instances` existed); 100 is a
    # placeholder - confirm the intended size.
    # NOTE(review): Distribution.random.next_boolean is not defined in any file
    # visible here - confirm it exists before un-skipping.
    instances = 100.times.map do
      if Distribution.random.next_boolean
        mga.sample(nil)
      else
        mgb.sample(nil)
      end
    end
    set = ::Reem::DataSet.new(instances)
    em = ::Reem::Clusterer.new
    puts em.estimate(set)
  end
end
|
data/test/helper.rb
ADDED
@@ -0,0 +1,17 @@
|
|
1
|
+
require 'helper'

# Spec for Reem::LowerTriangularMatrix: checks the stored lower triangle and
# the transposed (upper) result for a fixed 3x3 input.
describe ::Reem::LowerTriangularMatrix do
  let(:matrix) { NArray[[1,2,3], [4,5,6], [7,8,9]] }
  let(:lower_triangle) { NArray[[1.0,0.0,0.0], [4.0,5.0,0.0], [7.0,8.0,9.0]] }
  let(:upper_triangle) { NArray[[1.0, 4.0, 7.0], [0.0, 5.0, 8.0], [0.0, 0.0, 9.0]]}
  # A fresh instance per example, built from the shared input matrix.
  let(:triangular) { ::Reem::LowerTriangularMatrix.new(matrix) }

  it 'should convert to a lower triangular matrix' do
    stored = triangular.instance_variable_get("@ltm")
    stored.must_equal lower_triangle
  end

  it 'transposes' do
    flipped = triangular.transpose
    flipped.must_equal upper_triangle
  end
end
|
@@ -0,0 +1,27 @@
|
|
1
|
+
require 'helper'

# Smoke spec for Reem::MultivariateGaussian: draws samples from a known
# Gaussian, fits a second Gaussian to them and prints the results.
# Currently skipped.
describe ::Reem::MultivariateGaussian do
  it 'blends' do
    skip
    mga = ::Reem::MultivariateGaussian.new(Vector[100,100,100], Matrix.identity(3) * 0.01)

    instances = 20.times.map do
      mga.sample
    end

    set = ::Reem::DataSet.new(instances)
    # The methods below are called on an instance, so build one rather than
    # binding the class itself.
    # NOTE(review): assumes a zero-argument constructor is supported - confirm.
    mg = ::Reem::MultivariateGaussian.new
    mg.estimate(set)

    puts mg
    puts "Most likely #{mg.mode}"

    10.times do
      puts mg.sample
    end

    instances.each do |instance|
      puts "Probability of \n #{instance} \n #{mg.p(instance)}"
    end
  end
end
|
metadata
ADDED
@@ -0,0 +1,95 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: reem
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Matt Kirk
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2014-03-11 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: narray
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - '>='
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '0'
|
20
|
+
type: :runtime
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - '>='
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '0'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: minitest
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - '>='
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '0'
|
34
|
+
type: :development
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - '>='
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '0'
|
41
|
+
description: Simple EM Clustering for Ruby
|
42
|
+
email:
|
43
|
+
- matt@matthewkirk.com
|
44
|
+
executables: []
|
45
|
+
extensions: []
|
46
|
+
extra_rdoc_files: []
|
47
|
+
files:
|
48
|
+
- Changes.md
|
49
|
+
- Gemfile
|
50
|
+
- LICENSE
|
51
|
+
- README.md
|
52
|
+
- Rakefile
|
53
|
+
- lib/reem.rb
|
54
|
+
- lib/reem/.clusterer.rb.swp
|
55
|
+
- lib/reem/cholesky_factorization.rb
|
56
|
+
- lib/reem/clusterer.java
|
57
|
+
- lib/reem/clusterer.rb
|
58
|
+
- lib/reem/lower_triangular_matrix.rb
|
59
|
+
- lib/reem/multivariate_gaussian.rb
|
60
|
+
- lib/reem/version.rb
|
61
|
+
- reem.gemspec
|
62
|
+
- test/clusterer_test.rb
|
63
|
+
- test/helper.rb
|
64
|
+
- test/lower_triangular_matrix_test.rb
|
65
|
+
- test/multivariate_gaussian_test.rb
|
66
|
+
homepage: http://github.com/hexgnu/reem
|
67
|
+
licenses:
|
68
|
+
- MIT
|
69
|
+
metadata: {}
|
70
|
+
post_install_message:
|
71
|
+
rdoc_options: []
|
72
|
+
require_paths:
|
73
|
+
- lib
|
74
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
75
|
+
requirements:
|
76
|
+
- - '>='
|
77
|
+
- !ruby/object:Gem::Version
|
78
|
+
version: '0'
|
79
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
80
|
+
requirements:
|
81
|
+
- - '>='
|
82
|
+
- !ruby/object:Gem::Version
|
83
|
+
version: '0'
|
84
|
+
requirements: []
|
85
|
+
rubyforge_project:
|
86
|
+
rubygems_version: 2.0.2
|
87
|
+
signing_key:
|
88
|
+
specification_version: 4
|
89
|
+
summary: Simple EM Clustering for Ruby
|
90
|
+
test_files:
|
91
|
+
- test/clusterer_test.rb
|
92
|
+
- test/helper.rb
|
93
|
+
- test/lower_triangular_matrix_test.rb
|
94
|
+
- test/multivariate_gaussian_test.rb
|
95
|
+
has_rdoc:
|