reem 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 21a616566baa006fcc7d121ca0d863beb7ce12f0
4
+ data.tar.gz: 7a7f6be47462eefe9901979414cf2cdf3a204a7e
5
+ SHA512:
6
+ metadata.gz: 58f172e95a0878b849778ea9029d81d32c0119d284c73f2967c1cf8010e888793839bbb086dc9603bec0234519bcfbe086ec7dd7e1c13e1fc5ee5f5b5987b00e
7
+ data.tar.gz: c59f57537da4203167f48da999f7deefc15912a864074b8244fb7152a6b821131a02a67d35ab61a8fbc984acdeb6836979e9022f9184e7b7d5d93bf32c2b5555
data/Changes.md ADDED
File without changes
data/Gemfile ADDED
@@ -0,0 +1,9 @@
# Gems are fetched from RubyGems; the gem's own dependencies are taken
# from the gemspec in this directory.
source 'https://rubygems.org'
gemspec

# Additional gems declared only for the :rbx platform.
platforms(:rbx) do
  # rubysl: if using anything in the ruby standard library
  gem('rubysl', '~> 2.0')

  # psych:                    if using yaml
  # minitest:                 if using minitest
  # rubinius-developer_tools: if using any of coverage, debugger, profiler
  %w[psych minitest rubinius-developer_tools].each do |gem_name|
    gem(gem_name)
  end
end
data/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ The MIT License (MIT)
2
+
3
+ Copyright (c) 2014 Matthew Kirk
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in
13
+ all copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21
+ THE SOFTWARE.
data/README.md ADDED
File without changes
data/Rakefile ADDED
@@ -0,0 +1,9 @@
require 'rake/testtask'

# `rake test`: runs every file matching test/**/*_test.rb with Ruby
# warnings switched on and the test/ directory added to the load path.
Rake::TestTask.new(:test) do |t|
  t.pattern = 'test/**/*_test.rb'
  t.warning = true
  t.libs.push('test')
end

# A bare `rake` runs the test task.
task default: :test
data/lib/reem.rb ADDED
@@ -0,0 +1,8 @@
1
+ require 'narray'
2
+
# Top-level namespace of the reem gem.
#
# Each constant below is registered with autoload, so its file is only
# required the first time the constant is referenced.
module Reem
  # lib/reem/version.rb defines the VERSION string constant (not a
  # `Version` module), so that is the name that has to be registered.
  autoload :VERSION, 'reem/version'
  autoload :LowerTriangularMatrix, 'reem/lower_triangular_matrix'
  autoload :CholeskyFactorization, 'reem/cholesky_factorization'
  autoload :Clusterer, 'reem/clusterer'
  autoload :MultivariateGaussian, 'reem/multivariate_gaussian'
end
Binary file
@@ -0,0 +1,33 @@
module Reem
  # Cholesky factorization of a matrix: computes a lower triangular factor
  # (and its transpose) column by column.
  #
  # The factor is held in a LowerTriangularMatrix, which must respond to
  # +n+ (used as the column count), +m+ (used as the row count),
  # <tt>[row, column]</tt>, <tt>[row, column] = value</tt> and +transpose+.
  #
  # No positive-definiteness check is made: a negative pivot makes
  # Math.sqrt raise Math::DomainError.
  class CholeskyFactorization
    # lower: the LowerTriangularMatrix holding the computed factor.
    # upper: the value returned by +lower.transpose+ after decomposition.
    attr_reader :lower, :upper

    # matrix - the matrix to factorize; handed to LowerTriangularMatrix.new.
    #
    # The decomposition runs immediately.
    def initialize(matrix)
      @matrix = matrix
      @lower = LowerTriangularMatrix.new(matrix)
      decompose!
    end

    # Factorizes @lower in place and stores its transpose in @upper.
    #
    # Returns the transpose.
    def decompose!
      @lower.n.times do |j|
        # Sqrt the diagonal
        pivot = Math.sqrt(@lower[j, j])
        @lower[j, j] = pivot

        # Divide the subdiagonal column by the diagonal
        (j + 1...@lower.m).each do |i|
          @lower[i, j] = @lower[i, j] / pivot
        end

        # Symmetric rank 1 update: subtract the cross product of the
        # subdiagonal column j from the remaining lower triangle, i.e.
        # L[l,k] -= L[l,j] * L[k,j] for every k > j and l >= k.
        (j + 1...@lower.n).each do |k|
          (k...@lower.m).each do |l|
            @lower[l, k] = @lower[l, k] - @lower[l, j] * @lower[k, j]
          end
        end
      end

      @upper = @lower.transpose
    end
  end
end
@@ -0,0 +1,183 @@
1
+ public class EMClusterer extends AbstractConditionalDistribution implements FunctionApproximater {
2
+ /**
3
+ * The tolerance
4
+ */
5
+ private static final double TOLERANCE = 1E-6;
6
+ /**
7
+ * The tolerance
8
+ */
9
+ private static final int MAX_ITERATIONS = 1000;
10
+ /**
11
+ * The mixture distribution
12
+ */
13
+ private MixtureDistribution mixture;
14
+ /**
15
+ * The number of clusters
16
+ */
17
+ private int k;
18
+ /**
19
+ * The threshold
20
+ */
21
+ private double tolerance;
22
+
23
+ /**
24
+ * The max iterations
25
+ */
26
+ private int maxIterations;
27
+
28
+ /**
29
+ * How many iterations it took
30
+ */
31
+ private int iterations;
32
+
33
+ /**
34
+ * Whether to print stuff
35
+ */
36
+ private boolean debug = false;
37
+
38
+ /**
39
+ * Make a new em clusterer
40
+ * @param k the number of clusters
41
+ * @param tolerance the tolerance
42
+ */
43
+ public EMClusterer(int k, double tolerance, int maxIterations) {
44
+ this.k = k;
45
+ this.tolerance = tolerance;
46
+ this.maxIterations = maxIterations;
47
+ }
48
+
49
+ /**
50
+ * Make a new clusterer
51
+ */
52
+ public EMClusterer() {
53
+ this(2, TOLERANCE, MAX_ITERATIONS);
54
+ }
55
+
56
+ /**
57
+ * @see func.Classifier#classDistribution(shared.Instance)
58
+ */
59
+ public Distribution distributionFor(Instance instance) {
60
+ // calculate the log probs
61
+ double[] probs = new double[mixture.getComponents().length];
62
+ double maxLog = Double.NEGATIVE_INFINITY;
63
+ for (int i = 0; i < probs.length; i++) {
64
+ probs[i] = mixture.getComponents()[i].logp(instance);
65
+ maxLog = Math.max(maxLog, probs[i]);
66
+ }
67
+ // turn into real probs
68
+ double sum = 0;
69
+ for (int i = 0; i < probs.length; i++) {
70
+ probs[i] = Math.exp(probs[i] - maxLog);
71
+ sum += probs[i];
72
+ }
73
+ // normalize
74
+ for (int i = 0; i < probs.length; i++) {
75
+ probs[i] /= sum;
76
+ }
77
+ return new DiscreteDistribution(probs);
78
+ }
79
+
80
+ /**
81
+ * @see func.FunctionApproximater#estimate(shared.DataSet)
82
+ */
83
+ public void estimate(DataSet set) {
84
+ // kmeans initialization
85
+ KMeansClusterer kmeans = new KMeansClusterer(k);
86
+ kmeans.estimate(set);
87
+ double[] prior = new double[k];
88
+ double weightSum = 0;
89
+ int[] counts = new int[k];
90
+ int[] classifications = new int[set.size()];
91
+ for (int i = 0; i < set.size(); i++) {
92
+ classifications[i] = kmeans.value(set.get(i)).getDiscrete();
93
+ counts[classifications[i]]++;
94
+ prior[classifications[i]] += set.get(i).getWeight();
95
+ weightSum += set.get(i).getWeight();
96
+ }
97
+ // create data sets for each of the classes
98
+ Instance[][] instances = new Instance[k][];
99
+ for (int i = 0; i < instances.length; i++) {
100
+ instances[i] = new Instance[counts[i]];
101
+ }
102
+ Arrays.fill(counts, 0);
103
+ for (int i = 0; i < set.size(); i++) {
104
+ instances[classifications[i]][counts[classifications[i]]] = set.get(i);
105
+ counts[classifications[i]]++;
106
+ }
107
+ MultivariateGaussian[] initial = new MultivariateGaussian[k];
108
+ for (int i = 0; i < initial.length; i++) {
109
+ initial[i] = new MultivariateGaussian();
110
+ initial[i].setDebug(debug);
111
+ initial[i].estimate(new DataSet(instances[i]));
112
+ prior[i] /= weightSum;
113
+ }
114
+ mixture = new MixtureDistribution(initial, prior);
115
+ // reestimate
116
+ boolean done = false;
117
+ double lastLogLikelihood = 0;
118
+ iterations = 0;
119
+ while (!done) {
120
+ if (debug) {
121
+ System.out.println("On iteration " + iterations);
122
+ System.out.println(mixture);
123
+ }
124
+ mixture.estimate(set);
125
+ double logLikelihood = 0;
126
+ for (int j = 0; j < set.size(); j++) {
127
+ logLikelihood += mixture.logp(set.get(j));
128
+ }
129
+ logLikelihood /= set.size();
130
+ done = (iterations > 0 && Math.abs(logLikelihood - lastLogLikelihood) < tolerance)
131
+ || (iterations + 1 >= maxIterations);
132
+ lastLogLikelihood = logLikelihood;
133
+ iterations++;
134
+ }
135
+ }
136
+
137
+ /**
138
+ * @see func.FunctionApproximater#value(shared.Instance)
139
+ */
140
+ public Instance value(Instance i) {
141
+ return distributionFor(i).mode();
142
+ }
143
+
144
+ /**
145
+ * Get the number of iterations it took
146
+ * @return the number
147
+ */
148
+ public int getIterations() {
149
+ return iterations;
150
+ }
151
+
152
+ /**
153
+ * Is debug mode on
154
+ * @return true if it is
155
+ */
156
+ public boolean isDebug() {
157
+ return debug;
158
+ }
159
+
160
+ /**
161
+ * Set debug mode on or off
162
+ * @param b the debug mode
163
+ */
164
+ public void setDebug(boolean b) {
165
+ debug = b;
166
+ }
167
+
168
+ /**
169
+ * Get the mixture
170
+ * @return the mixture
171
+ */
172
+ public MixtureDistribution getMixture() {
173
+ return mixture;
174
+ }
175
+
176
+ /**
177
+ * @see java.lang.Object#toString()
178
+ */
179
+ public String toString() {
180
+ return mixture.toString();
181
+ }
182
+
183
+ }
@@ -0,0 +1,10 @@
module Reem
  # Configuration holder for the clusterer: number of clusters, tolerance
  # and iteration cap. No estimation logic is implemented in this class yet.
  class Clusterer
    # Default tolerance. Numeric literals are already immutable, so no
    # explicit freeze is needed.
    TOLERANCE = 1E-6
    # Default maximum number of iterations.
    MAX_ITERATIONS = 1000

    attr_reader :k, :tolerance, :max_iterations

    # k              - number of clusters (default 2).
    # tolerance      - tolerance value (default TOLERANCE).
    # max_iterations - iteration cap (default MAX_ITERATIONS).
    #
    # The arguments are stored so they can be read back; previously they
    # were accepted and silently discarded.
    def initialize(k = 2, tolerance = TOLERANCE, max_iterations = MAX_ITERATIONS)
      @k = k
      @tolerance = tolerance
      @max_iterations = max_iterations
    end
  end
end
@@ -0,0 +1,27 @@
module Reem
  # Float copy of the lower triangle (diagonal included) of a 2-D NArray;
  # every entry above the diagonal is left at 0.0.
  #
  # The backing NArray is indexed [column, row], matching the order of
  # the pair returned by NArray#sizes. The public accessors of this class
  # take (row, column).
  class LowerTriangularMatrix
    # matrix - a 2-D NArray; only entries with column <= row are copied.
    def initialize(matrix)
      columns, rows = matrix.sizes
      @ltm = NArray.float(columns, rows)
      @transpose = nil

      rows.times do |row|
        # For non-square input never run past the last real column.
        last_column = [row, columns - 1].min
        (0..last_column).each do |column|
          @ltm[column, row] = matrix[column, row]
        end
      end
    end

    # Number of columns.
    def n
      @ltm.sizes[0]
    end

    # Number of rows.
    def m
      @ltm.sizes[1]
    end

    # Reads the entry at (row, column).
    def [](row, column)
      @ltm[column, row]
    end

    # Writes the entry at (row, column) and drops the memoized transpose
    # so a later #transpose reflects the change.
    def []=(row, column, value)
      @transpose = nil
      @ltm[column, row] = value
    end

    # Returns the transpose as a plain NArray (an upper triangular matrix).
    # The result is memoized until the next write through #[]=.
    def transpose
      @transpose ||= begin
        columns, rows = @ltm.sizes
        # The transpose has the dimensions swapped.
        transposed = NArray.float(rows, columns)
        rows.times do |row|
          columns.times do |column|
            transposed[row, column] = @ltm[column, row]
          end
        end
        transposed
      end
    end
  end
end
@@ -0,0 +1,13 @@
module Reem
  # Holds the parameters of a multivariate Gaussian: a mean vector, a
  # covariance matrix and a floor value. No sampling or density logic is
  # implemented in this class yet.
  class MultivariateGaussian
    # Default floor value.
    # NOTE(review): presumably a lower bound used to regularize the
    # covariance - confirm once the estimation code is ported.
    FLOOR = 0.01
    # NOTE(review): presumably the factor by which the floor is adjusted -
    # confirm once the estimation code is ported.
    FLOOR_CHANGE = 10
    # Original (misspelled) constant name, kept so existing references
    # keep resolving.
    FLOOR_CHAGNE = FLOOR_CHANGE

    attr_reader :mean, :covariance, :floor

    # vector_mean - the mean vector.
    # covariance  - the covariance matrix.
    # floor       - the floor value (default FLOOR).
    def initialize(vector_mean, covariance, floor = FLOOR)
      @mean = vector_mean
      @covariance = covariance
      @floor = floor
    end
  end
end
@@ -0,0 +1,7 @@
module Reem
  # Gem version string, "major.minor.patch".
  #
  # Built from an Array rather than Hash#values so the component order is
  # fixed on every Ruby version.
  VERSION = [
    0, # major
    0, # minor
    1  # patch
  ].join('.').freeze
end
data/reem.gemspec ADDED
@@ -0,0 +1,18 @@
# -*- encoding: utf-8 -*-
require File.expand_path('../lib/reem/version', __FILE__)

# Gem specification for reem. The file list comes from git, so the gem
# must be built from inside a git checkout.
Gem::Specification.new do |gem|
  gem.authors       = ["Matt Kirk"]
  gem.email         = ["matt@matthewkirk.com"]
  gem.description   = gem.summary = "Simple EM Clustering for Ruby"
  gem.homepage      = "http://github.com/hexgnu/reem"
  gem.license       = "MIT"

  # Ship every tracked file except editor swap files (e.g. a committed
  # lib/reem/.clusterer.rb.swp), which are binary scratch data.
  gem.files         = `git ls-files`.split("\n").reject { |path| path =~ /\.sw[a-p]\z/ }
  gem.test_files    = `git ls-files -- test/*`.split("\n")
  gem.name          = "reem"
  gem.require_paths = ["lib"]
  gem.version       = Reem::VERSION
  gem.add_dependency 'narray'
  gem.add_development_dependency 'minitest'
end
@@ -0,0 +1,20 @@
require 'helper'
require 'matrix' # Vector and Matrix come from the standard library

describe Reem::Clusterer do
  it 'separates two obvious gaussians into two clusters' do
    # Skipped: everything below depends on API that the gem does not
    # provide yet (MultivariateGaussian#sample, Reem::DataSet,
    # Clusterer#estimate).
    skip
    mga = ::Reem::MultivariateGaussian.new(Vector[100, 100, 100], Matrix.identity(3) * 0.01)
    mgb = ::Reem::MultivariateGaussian.new(Vector[-1, -1, -1], Matrix.identity(3) * 10)

    # The sample size is an arbitrary choice: the previous expression
    # asked the not-yet-assigned `instances` for its own length.
    sample_count = 100

    # Draw each instance from one of the two gaussians with equal odds.
    instances = Array.new(sample_count) do
      if [true, false].sample
        mga.sample(nil)
      else
        mgb.sample(nil)
      end
    end
    set = ::Reem::DataSet.new(instances)
    em = ::Reem::Clusterer.new
    puts em.estimate(set)
  end
end
data/test/helper.rb ADDED
@@ -0,0 +1,7 @@
1
+ $TESTING = true
2
+
3
+ require 'minitest/autorun'
4
+ require 'minitest/pride'
5
+ require 'bundler'
6
+ Bundler.require
7
+ require 'reem'
@@ -0,0 +1,17 @@
require 'helper'

# Checks that LowerTriangularMatrix keeps only the lower triangle of its
# input and that its transpose is the matching upper triangle.
describe ::Reem::LowerTriangularMatrix do
  subject { ::Reem::LowerTriangularMatrix.new(source) }

  let(:source) { NArray[[1,2,3], [4,5,6], [7,8,9]] }

  let(:expected_lower) do
    NArray[[1.0,0.0,0.0], [4.0,5.0,0.0], [7.0,8.0,9.0]]
  end

  let(:expected_upper) do
    NArray[[1.0, 4.0, 7.0], [0.0, 5.0, 8.0], [0.0, 0.0, 9.0]]
  end

  it 'should convert to a lower triangular matrix' do
    # The stored array is not exposed publicly, so read it directly.
    stored = subject.instance_variable_get("@ltm")
    stored.must_equal expected_lower
  end

  it 'transposes' do
    flipped = subject.transpose
    flipped.must_equal expected_upper
  end
end
@@ -0,0 +1,27 @@
require 'helper'
require 'matrix' # Vector and Matrix come from the standard library

describe ::Reem::MultivariateGaussian do
  it 'blends' do
    # Skipped: everything below depends on API that the gem does not
    # provide yet (sample, estimate, mode, p, Reem::DataSet).
    skip
    mga = ::Reem::MultivariateGaussian.new(Vector[100,100,100], Matrix.identity(3) * 0.01)

    instances = Array.new(20) { mga.sample }

    set = ::Reem::DataSet.new(instances)
    # NOTE(review): this binds the class itself, not an instance, so the
    # calls below are class-level. Presumably a fresh gaussian was meant;
    # decide how it should be constructed before enabling the test.
    mg = ::Reem::MultivariateGaussian
    mg.estimate(set)

    puts mg
    puts "Most likely #{mg.mode}"

    10.times { puts mg.sample }

    instances.each do |instance|
      puts "Probability of \n #{instance} \n #{mg.p(instance)}"
    end
  end
end
metadata ADDED
@@ -0,0 +1,95 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: reem
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - Matt Kirk
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2014-03-11 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: narray
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - '>='
18
+ - !ruby/object:Gem::Version
19
+ version: '0'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - '>='
25
+ - !ruby/object:Gem::Version
26
+ version: '0'
27
+ - !ruby/object:Gem::Dependency
28
+ name: minitest
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - '>='
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - '>='
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ description: Simple EM Clustering for Ruby
42
+ email:
43
+ - matt@matthewkirk.com
44
+ executables: []
45
+ extensions: []
46
+ extra_rdoc_files: []
47
+ files:
48
+ - Changes.md
49
+ - Gemfile
50
+ - LICENSE
51
+ - README.md
52
+ - Rakefile
53
+ - lib/reem.rb
54
+ - lib/reem/.clusterer.rb.swp
55
+ - lib/reem/cholesky_factorization.rb
56
+ - lib/reem/clusterer.java
57
+ - lib/reem/clusterer.rb
58
+ - lib/reem/lower_triangular_matrix.rb
59
+ - lib/reem/multivariate_gaussian.rb
60
+ - lib/reem/version.rb
61
+ - reem.gemspec
62
+ - test/clusterer_test.rb
63
+ - test/helper.rb
64
+ - test/lower_triangular_matrix_test.rb
65
+ - test/multivariate_gaussian_test.rb
66
+ homepage: http://github.com/hexgnu/reem
67
+ licenses:
68
+ - MIT
69
+ metadata: {}
70
+ post_install_message:
71
+ rdoc_options: []
72
+ require_paths:
73
+ - lib
74
+ required_ruby_version: !ruby/object:Gem::Requirement
75
+ requirements:
76
+ - - '>='
77
+ - !ruby/object:Gem::Version
78
+ version: '0'
79
+ required_rubygems_version: !ruby/object:Gem::Requirement
80
+ requirements:
81
+ - - '>='
82
+ - !ruby/object:Gem::Version
83
+ version: '0'
84
+ requirements: []
85
+ rubyforge_project:
86
+ rubygems_version: 2.0.2
87
+ signing_key:
88
+ specification_version: 4
89
+ summary: Simple EM Clustering for Ruby
90
+ test_files:
91
+ - test/clusterer_test.rb
92
+ - test/helper.rb
93
+ - test/lower_triangular_matrix_test.rb
94
+ - test/multivariate_gaussian_test.rb
95
+ has_rdoc: