ruby-em_algorithm 0.0.2
Sign up to get free protection for your applications and to get access to all the features.
- data/Gemfile +6 -0
- data/Gemfile.lock +30 -0
- data/README.md +44 -0
- data/Rakefile +7 -0
- data/example/.ex1.rb.swp +0 -0
- data/example/.ex2.rb.swp +0 -0
- data/example/.ex3-tmp.rb.swp +0 -0
- data/example/.ex3.rb.swp +0 -0
- data/example/data/2dim-gmm-new.txt +1267 -0
- data/example/data/2dim-gmm-simple.txt +676 -0
- data/example/data/2dim-gmm-test.txt +6565 -0
- data/example/data/2dim-gmm-test2.txt +2782 -0
- data/example/data/2dim-gmm-test3.csv +1641 -0
- data/example/data/2dim-gmm-test3.txt +2782 -0
- data/example/data/2dim-gmm-test4.csv +868 -0
- data/example/data/2dim-gmm-test4.txt +4924 -0
- data/example/data/2dim-gmm-without_weight-small.txt +2401 -0
- data/example/data/2dim-gmm-without_weight.txt +18001 -0
- data/example/data/2dim-gmm.txt +1267 -0
- data/example/data/gmm-new.txt +10001 -0
- data/example/data/gmm-simple.txt +676 -0
- data/example/data/gmm.txt +10001 -0
- data/example/data/old-gmm.txt +10000 -0
- data/example/ex1.rb +20 -0
- data/example/ex1.rb~ +20 -0
- data/example/ex2.rb +33 -0
- data/example/ex2.rb~ +33 -0
- data/example/ex3-tmp.rb +23 -0
- data/example/ex3-tmp.rb~ +25 -0
- data/example/ex3.rb +43 -0
- data/example/ex3.rb~ +43 -0
- data/example/tools/.2dim.rb.swp +0 -0
- data/example/tools/2dim.rb +69 -0
- data/example/tools/2dim.rb~ +69 -0
- data/example/tools/boxmuller.rb +28 -0
- data/example/tools/boxmuller.rb~ +28 -0
- data/example/tools/conv_from_yaml.rb +8 -0
- data/example/tools/conv_from_yaml_to_csv.rb +8 -0
- data/example/tools/conv_to_yaml.rb +17 -0
- data/example/tools/ellipsoid.gnuplot +63 -0
- data/example/tools/ellipsoid.gnuplot~ +64 -0
- data/example/tools/histogram.rb +19 -0
- data/example/tools/histogram2d.rb +20 -0
- data/example/tools/histogram2d.rb~ +18 -0
- data/example/tools/kmeans.rb +34 -0
- data/example/tools/mean.rb +19 -0
- data/example/tools/table.data +4618 -0
- data/example/tools/tmp.txt +69632 -0
- data/example/tools/xmeans.R +608 -0
- data/example/tools/xmeans.rb +35 -0
- data/lib/em_algorithm/.base.rb.swp +0 -0
- data/lib/em_algorithm/base.rb +116 -0
- data/lib/em_algorithm/base.rb~ +116 -0
- data/lib/em_algorithm/convergence/.chi_square.rb.swp +0 -0
- data/lib/em_algorithm/convergence/.likelihood.rb.swp +0 -0
- data/lib/em_algorithm/convergence/check_method.rb +4 -0
- data/lib/em_algorithm/convergence/check_method.rb~ +0 -0
- data/lib/em_algorithm/convergence/chi_square.rb +40 -0
- data/lib/em_algorithm/convergence/chi_square.rb~ +40 -0
- data/lib/em_algorithm/convergence/likelihood.rb +35 -0
- data/lib/em_algorithm/convergence/likelihood.rb~ +35 -0
- data/lib/em_algorithm/models/.gaussian.rb.swp +0 -0
- data/lib/em_algorithm/models/.md_gaussian.rb.swp +0 -0
- data/lib/em_algorithm/models/.mixture.rb.swp +0 -0
- data/lib/em_algorithm/models/.model.rb.swp +0 -0
- data/lib/em_algorithm/models/gaussian.rb +47 -0
- data/lib/em_algorithm/models/gaussian.rb~ +47 -0
- data/lib/em_algorithm/models/md_gaussian.rb +67 -0
- data/lib/em_algorithm/models/md_gaussian.rb~ +67 -0
- data/lib/em_algorithm/models/mixture.rb +122 -0
- data/lib/em_algorithm/models/mixture.rb~ +122 -0
- data/lib/em_algorithm/models/model.rb +19 -0
- data/lib/em_algorithm/models/model.rb~ +19 -0
- data/lib/ruby-em_algorithm.rb +3 -0
- data/lib/ruby-em_algorithm/version.rb +3 -0
- data/ruby-em_algorithm.gemspec +21 -0
- data/spec/spec_helper.rb +9 -0
- metadata +178 -0
@@ -0,0 +1,35 @@
|
|
1
|
+
#!/usr/bin/env ruby

# Clusters a data set with the R x-means implementation in ./xmeans.R and
# prints, for every cluster, the centre reported by R followed by the mean
# and covariance computed here from the raw data.
#
# usage: xmeans.rb <basename>
#   <basename>.csv  is read by R (read.csv)
#   <basename>.txt  is read here as YAML; each entry becomes one GSL vector
require 'yaml'
require 'rsruby'
require 'gsl'

include GSL

basename = ARGV[0]
abort "usage: #{File.basename($0)} <basename>  (reads <basename>.csv and <basename>.txt)" unless basename

r = RSRuby.instance
# NOTE(review): basename is interpolated into the R source unescaped; a name
# containing a single quote will break the command.
c = r.eval_R(<<-RCOMMAND)
a <- read.csv('#{basename}.csv')
source('./xmeans.R')
xmeans(a,2,20)
RCOMMAND

data_array = YAML.load_file("#{basename}.txt").map { |v| Vector[v] }
dim = data_array[0].size

# One accumulator per cluster: running sum of the vectors and running sum of
# their outer products.
cluster = Array.new(c["size"].size) do
  { "mu_sum" => 0.0, "sigma_sum" => Matrix.alloc(dim, dim) }
end

# c["cluster"][di] is the cluster number of datum di; the numbers are
# 1-based, hence the "- 1" when indexing the accumulators.
c["cluster"].each_with_index do |num, di|
  x = data_array[di]
  cluster[num - 1]["mu_sum"] += x
  cluster[num - 1]["sigma_sum"] += x.trans * x
end

c["size"].each_with_index do |size, num|
  mu = cluster[num]["mu_sum"] / size
  cluster[num]["mu"] = mu
  # Covariance = E[x^T x] - mu^T mu. Dividing the outer-product sum by the
  # size alone yields the raw second moment, not the covariance.
  cluster[num]["sigma"] = cluster[num]["sigma_sum"] / size - mu.trans * mu
end

# Cluster centres as reported by R.
c["centers"].each_with_index do |cen, ci|
  puts "### cluster #{ci}"
  p cen
end

# Mean and covariance recomputed from the data.
cluster.each_with_index do |clu, ci|
  puts "### cluster #{ci}"
  p clu["mu"].to_a
  p clu["sigma"].to_a
end
|
Binary file
|
@@ -0,0 +1,116 @@
|
|
1
|
+
require 'em_algorithm/models/model'
|
2
|
+
require 'em_algorithm/models/gaussian'
|
3
|
+
require 'em_algorithm/models/md_gaussian'
|
4
|
+
require 'em_algorithm/models/mixture'
|
5
|
+
require 'em_algorithm/convergence/check_method'
|
6
|
+
require 'em_algorithm/convergence/likelihood'
|
7
|
+
require 'em_algorithm/convergence/chi_square'
|
8
|
+
|
9
|
+
module EMAlgorithm
  include Math
  include GSL

  # Upper bound on the number of iterations attempted by Base#run!.
  MAX_ITERATION = 10000

  # Runs the EM iteration: alternates estep/mstep on a model until the
  # likelihood check reports convergence or MAX_ITERATION is reached.
  class Base

    attr_accessor :model, :original_data_array, :data_array, :likelihood, :num_step, :const

    # * Model limitation
    #   Currently only the Gaussian mixture model is supported.
    #   To fit a single Gaussian, use a mixture model that has exactly one
    #   Gaussian entry with weight 1.0 (this is also the default model).
    #
    # * Input data format
    #   Either a probability distribution or a target value distribution can
    #   be estimated. To estimate a target value distribution, pass
    #   :value_distribution_estimation => true and append the target value
    #   and its corresponding area size to every input vector x:
    #     x[-2]: target value
    #     x[-1]: corresponding area size
    #
    # * Options (all optional)
    #   :model                         => model to fit
    #   :data_array                    => input data (default: [])
    #   :value_distribution_estimation => see above (default: false)
    #   :debug                         => trace progress on $stderr (default: true)
    def initialize(options = {})
      # The default mixture is only built when the caller did not pass a model.
      @model = options.fetch(:model) do
        Mixture.new(:models => [Gaussian.new(0.0, 9.0)], :weights => [1.0])
      end
      @original_data_array = options.fetch(:data_array) { [] }
      @value_distribution_estimation = options.fetch(:value_distribution_estimation, false)
      @data_array =
        if @value_distribution_estimation
          value_to_frequency(@original_data_array)
        else
          @original_data_array
        end
      @likelihood = Likelihood.new(@data_array)
      @debug = options.fetch(:debug, true)
      @const = 1.0
    end

    # E step: asks the model for the posterior data array (stored in
    # @posterior_data_array) and lets it update its temporary weights.
    def estep
      @model.clear_temp_weight_per_datum!
      @posterior_data_array = @model.calculate_posterior_data_array(@data_array)
      @model.update_temp_weights!(@data_array, @posterior_data_array)
    end

    # M step: lets the model update its parameters from the data.
    # In debug mode the model is dumped to $stderr first.
    def mstep
      $stderr.puts @model.inspect if @debug
      @model.update_parameters!(@data_array)
    end

    # Iterates estep/mstep until @likelihood reports convergence, for at most
    # MAX_ITERATION iterations.
    #
    # On convergence @num_step holds the index of the converging iteration
    # and, for value distribution estimation, @const holds the result of
    # distribution_to_value_ratio. If the limit is reached without
    # convergence, @num_step is nil and @const is left untouched.
    #
    # Returns the model.
    def run!
      # Forget the step count of any previous run so a non-converging run
      # cannot report a stale value.
      @num_step = nil
      MAX_ITERATION.times do |i|
        $stderr.puts "step#{i}" if @debug

        # check convergence
        @likelihood.calculate(@model)
        $stderr.puts @likelihood.debug_output if @debug
        if @likelihood.converged?
          @num_step = i
          @const = distribution_to_value_ratio if @value_distribution_estimation
          break
        end

        if @debug
          $stderr.puts @model.debug_output
          $stderr.puts ""
        end
        estep
        mstep
      end
      @model
    end

    # Expands value-carrying input vectors into a plain sample list: each x
    # is stripped of its last two entries (target value and area size) and
    # appended round(target value) times. When a single coordinate remains,
    # that coordinate itself is appended instead of a one-element slice.
    def value_to_frequency(data_array)
      modified_data_array = []
      data_array.each do |x|
        x[x.size - 2].round.times do
          # Sliced anew on every repetition so each entry is its own object.
          coordinates = x[0..(x.size - 3)]
          coordinates = coordinates.first if coordinates.size == 1
          modified_data_array << coordinates
        end
      end
      modified_data_array
    end

    # The probability distribution integrates to 1.0, so the integral of the
    # target value (sum of target value * area size over the original data)
    # is the ratio between the target value distribution and the probability
    # distribution.
    def distribution_to_value_ratio
      @original_data_array.inject(0.0) do |integrated_value, x|
        integrated_value + x[x.size - 2] * x[x.size - 1]
      end
    end
  end
end
|
@@ -0,0 +1,116 @@
|
|
1
|
+
require 'em_algorithm/models/model'
|
2
|
+
require 'em_algorithm/models/gaussian'
|
3
|
+
require 'em_algorithm/models/md_gaussian'
|
4
|
+
require 'em_algorithm/models/mixture'
|
5
|
+
require 'em_algorithm/convergence/check_method'
|
6
|
+
require 'em_algorithm/convergence/likelihood'
|
7
|
+
require 'em_algorithm/convergence/chi_square'
|
8
|
+
|
9
|
+
module EMAlgorithm
  include Math
  include GSL

  # Upper bound on the number of iterations attempted by Base#run!.
  MAX_ITERATION = 10000

  # Runs the EM iteration: alternates estep/mstep on a model until the
  # likelihood check reports convergence or MAX_ITERATION is reached.
  class Base

    attr_accessor :model, :original_data_array, :data_array, :likelihood, :num_step, :const

    # * Model limitation
    #   Currently only the Gaussian mixture model is supported.
    #   To fit a single Gaussian, use a mixture model that has exactly one
    #   Gaussian entry with weight 1.0 (this is also the default model).
    #
    # * Input data format
    #   Either a probability distribution or a target value distribution can
    #   be estimated. To estimate a target value distribution, pass
    #   :value_distribution_estimation => true and append the target value
    #   and its corresponding area size to every input vector x:
    #     x[-2]: target value
    #     x[-1]: corresponding area size
    #
    # * Options (all optional)
    #   :model                         => model to fit
    #   :data_array                    => input data (default: [])
    #   :value_distribution_estimation => see above (default: false)
    #   :debug                         => trace progress on $stderr (default: true)
    def initialize(options = {})
      # The default mixture is only built when the caller did not pass a model.
      @model = options.fetch(:model) do
        Mixture.new(:models => [Gaussian.new(0.0, 9.0)], :weights => [1.0])
      end
      @original_data_array = options.fetch(:data_array) { [] }
      @value_distribution_estimation = options.fetch(:value_distribution_estimation, false)
      @data_array =
        if @value_distribution_estimation
          value_to_frequency(@original_data_array)
        else
          @original_data_array
        end
      @likelihood = Likelihood.new(@data_array)
      @debug = options.fetch(:debug, true)
      @const = 1.0
    end

    # E step: asks the model for the posterior data array (stored in
    # @posterior_data_array) and lets it update its temporary weights.
    def estep
      @model.clear_temp_weight_per_datum!
      @posterior_data_array = @model.calculate_posterior_data_array(@data_array)
      @model.update_temp_weights!(@data_array, @posterior_data_array)
    end

    # M step: lets the model update its parameters from the data.
    # In debug mode the model is dumped to $stderr first.
    def mstep
      $stderr.puts @model.inspect if @debug
      @model.update_parameters!(@data_array)
    end

    # Iterates estep/mstep until @likelihood reports convergence, for at most
    # MAX_ITERATION iterations.
    #
    # On convergence @num_step holds the index of the converging iteration
    # and, for value distribution estimation, @const holds the result of
    # distribution_to_value_ratio. If the limit is reached without
    # convergence, @num_step is nil and @const is left untouched.
    #
    # Returns the model.
    def run!
      # Forget the step count of any previous run so a non-converging run
      # cannot report a stale value.
      @num_step = nil
      MAX_ITERATION.times do |i|
        $stderr.puts "step#{i}" if @debug

        # check convergence
        @likelihood.calculate(@model)
        $stderr.puts @likelihood.debug_output if @debug
        if @likelihood.converged?
          @num_step = i
          @const = distribution_to_value_ratio if @value_distribution_estimation
          break
        end

        if @debug
          $stderr.puts @model.debug_output
          $stderr.puts ""
        end
        estep
        mstep
      end
      @model
    end

    # Expands value-carrying input vectors into a plain sample list: each x
    # is stripped of its last two entries (target value and area size) and
    # appended round(target value) times. When a single coordinate remains,
    # that coordinate itself is appended instead of a one-element slice.
    def value_to_frequency(data_array)
      modified_data_array = []
      data_array.each do |x|
        x[x.size - 2].round.times do
          # Sliced anew on every repetition so each entry is its own object.
          coordinates = x[0..(x.size - 3)]
          coordinates = coordinates.first if coordinates.size == 1
          modified_data_array << coordinates
        end
      end
      modified_data_array
    end

    # The probability distribution integrates to 1.0, so the integral of the
    # target value (sum of target value * area size over the original data)
    # is the ratio between the target value distribution and the probability
    # distribution.
    def distribution_to_value_ratio
      @original_data_array.inject(0.0) do |integrated_value, x|
        integrated_value + x[x.size - 2] * x[x.size - 1]
      end
    end
  end
end
|
Binary file
|
Binary file
|
File without changes
|
@@ -0,0 +1,40 @@
|
|
1
|
+
module EMAlgorithm
  # Convergence check based on a chi-square statistic between observed
  # values and the values predicted by a model.
  #
  # Every datum x carries its observed value in the last position; the
  # preceding entries are the coordinates handed to model.pdf.
  class ChiSquare < CheckMethod
    # The check counts as converged when the statistic drops below this value...
    STAT_THRESHOLD = 0.05
    # ...or when it changes by less than this value between two calculations.
    CONV_THRESHOLD = 0.01

    # Statistic of every calculate call so far, oldest first.
    attr_accessor :history

    def initialize(data_array)
      @data_array = data_array
      @history = []
    end

    # Calculates the chi-square statistic of the data against const * pdf,
    # appends it to the history and returns it.
    #
    # model - object responding to pdf(coordinates)
    # const - factor applied to the pdf to obtain the estimated value
    #
    # Data whose observed value is <= 1.0 do not contribute.
    def calculate(model, const)
      chi_square = 0.0
      @data_array.each do |x|
        value = x[x.size - 1]
        # Skip before touching the model: ignored data need no pdf evaluation.
        next if value <= 1.0
        estimated = const * model.pdf(x[0..(x.size - 2)])
        chi_square += (value - estimated)**2 / estimated
      end
      @history << chi_square
      chi_square
    end

    # Most recently calculated statistic, or nil before the first calculation.
    def value
      @history.last
    end

    # True when the latest statistic is below STAT_THRESHOLD or differs from
    # the previous one by less than CONV_THRESHOLD. At least two calculations
    # are required; with fewer the answer is always false.
    def converged?
      return false if @history.length < 2
      (@history[-1] < STAT_THRESHOLD) || ((@history[-1] - @history[-2]).abs < CONV_THRESHOLD)
    end

    # Returns the debug line for the latest statistic. The caller decides
    # where to print it, matching Likelihood#debug_output.
    def debug_output
      "ChiSquare: #{value}"
    end
  end
end
|
@@ -0,0 +1,40 @@
|
|
1
|
+
module EMAlgorithm
  # Convergence check based on a chi-square statistic between observed
  # values and the values predicted by a model.
  #
  # Every datum x carries its observed value in the last position; the
  # preceding entries are the coordinates handed to model.pdf.
  class ChiSquare < CheckMethod
    # The check counts as converged when the statistic drops below this value...
    STAT_THRESHOLD = 0.05
    # ...or when it changes by less than this value between two calculations.
    CONV_THRESHOLD = 0.01

    # Statistic of every calculate call so far, oldest first.
    attr_accessor :history

    def initialize(data_array)
      @data_array = data_array
      @history = []
    end

    # Calculates the chi-square statistic of the data against const * pdf,
    # appends it to the history and returns it.
    #
    # model - object responding to pdf(coordinates)
    # const - factor applied to the pdf to obtain the estimated value
    #
    # Data whose observed value is <= 1.0 do not contribute.
    def calculate(model, const)
      chi_square = 0.0
      @data_array.each do |x|
        value = x[x.size - 1]
        # Skip before touching the model: ignored data need no pdf evaluation.
        next if value <= 1.0
        estimated = const * model.pdf(x[0..(x.size - 2)])
        chi_square += (value - estimated)**2 / estimated
      end
      @history << chi_square
      chi_square
    end

    # Most recently calculated statistic, or nil before the first calculation.
    def value
      @history.last
    end

    # True when the latest statistic is below STAT_THRESHOLD or differs from
    # the previous one by less than CONV_THRESHOLD. At least two calculations
    # are required; with fewer the answer is always false.
    def converged?
      return false if @history.length < 2
      (@history[-1] < STAT_THRESHOLD) || ((@history[-1] - @history[-2]).abs < CONV_THRESHOLD)
    end

    # Returns the debug line for the latest statistic. The caller decides
    # where to print it, matching Likelihood#debug_output.
    def debug_output
      "ChiSquare: #{value}"
    end
  end
end
|
@@ -0,0 +1,35 @@
|
|
1
|
+
module EMAlgorithm
  # Convergence check based on the log likelihood of the data under a model.
  class Likelihood < CheckMethod
    # Converged when two consecutive log likelihoods differ by less than this.
    # (A tighter 0.0001 was tried previously.)
    THRESHOLD = 0.01

    # Log likelihood of every calculate call so far, oldest first.
    attr_accessor :history

    def initialize(data_array)
      @data_array = data_array
      @history = []
    end

    # Calculates the log likelihood, i.e. the sum of log(model.pdf(x)) over
    # all data, appends it to the history and returns it.
    #
    # model - object responding to pdf(x)
    def calculate(model)
      # Math.log is called explicitly so the method does not depend on Math
      # having been mixed into the receiver's ancestry.
      log_likelihood = @data_array.inject(0.0) do |sum, x|
        sum + Math.log(model.pdf(x))
      end
      @history << log_likelihood
      log_likelihood
    end

    # Most recently calculated log likelihood, or nil before the first
    # calculation.
    def value
      @history.last
    end

    # True when the last two log likelihoods differ by less than THRESHOLD.
    # At least two calculations are required; with fewer the answer is
    # always false.
    def converged?
      return false if @history.length < 2
      (@history[-1] - @history[-2]).abs < THRESHOLD
    end

    # Returns the debug line for the latest log likelihood.
    def debug_output
      "Likelihood: #{value}"
    end
  end
end
|
@@ -0,0 +1,35 @@
|
|
1
|
+
module EMAlgorithm
  # Convergence check based on the log likelihood of the data under a model.
  class Likelihood < CheckMethod
    # Converged when two consecutive log likelihoods differ by less than this.
    # (A tighter 0.0001 was tried previously.)
    THRESHOLD = 0.01

    # Log likelihood of every calculate call so far, oldest first.
    attr_accessor :history

    def initialize(data_array)
      @data_array = data_array
      @history = []
    end

    # Calculates the log likelihood, i.e. the sum of log(model.pdf(x)) over
    # all data, appends it to the history and returns it.
    #
    # model - object responding to pdf(x)
    def calculate(model)
      # Math.log is called explicitly so the method does not depend on Math
      # having been mixed into the receiver's ancestry.
      log_likelihood = @data_array.inject(0.0) do |sum, x|
        sum + Math.log(model.pdf(x))
      end
      @history << log_likelihood
      log_likelihood
    end

    # Most recently calculated log likelihood, or nil before the first
    # calculation.
    def value
      @history.last
    end

    # True when the last two log likelihoods differ by less than THRESHOLD.
    # At least two calculations are required; with fewer the answer is
    # always false.
    def converged?
      return false if @history.length < 2
      (@history[-1] - @history[-2]).abs < THRESHOLD
    end

    # Returns the debug line for the latest log likelihood.
    def debug_output
      "Likelihood: #{value}"
    end
  end
end
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|