ruby-em_algorithm 0.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Gemfile +6 -0
- data/Gemfile.lock +30 -0
- data/README.md +44 -0
- data/Rakefile +7 -0
- data/example/.ex1.rb.swp +0 -0
- data/example/.ex2.rb.swp +0 -0
- data/example/.ex3-tmp.rb.swp +0 -0
- data/example/.ex3.rb.swp +0 -0
- data/example/data/2dim-gmm-new.txt +1267 -0
- data/example/data/2dim-gmm-simple.txt +676 -0
- data/example/data/2dim-gmm-test.txt +6565 -0
- data/example/data/2dim-gmm-test2.txt +2782 -0
- data/example/data/2dim-gmm-test3.csv +1641 -0
- data/example/data/2dim-gmm-test3.txt +2782 -0
- data/example/data/2dim-gmm-test4.csv +868 -0
- data/example/data/2dim-gmm-test4.txt +4924 -0
- data/example/data/2dim-gmm-without_weight-small.txt +2401 -0
- data/example/data/2dim-gmm-without_weight.txt +18001 -0
- data/example/data/2dim-gmm.txt +1267 -0
- data/example/data/gmm-new.txt +10001 -0
- data/example/data/gmm-simple.txt +676 -0
- data/example/data/gmm.txt +10001 -0
- data/example/data/old-gmm.txt +10000 -0
- data/example/ex1.rb +20 -0
- data/example/ex1.rb~ +20 -0
- data/example/ex2.rb +33 -0
- data/example/ex2.rb~ +33 -0
- data/example/ex3-tmp.rb +23 -0
- data/example/ex3-tmp.rb~ +25 -0
- data/example/ex3.rb +43 -0
- data/example/ex3.rb~ +43 -0
- data/example/tools/.2dim.rb.swp +0 -0
- data/example/tools/2dim.rb +69 -0
- data/example/tools/2dim.rb~ +69 -0
- data/example/tools/boxmuller.rb +28 -0
- data/example/tools/boxmuller.rb~ +28 -0
- data/example/tools/conv_from_yaml.rb +8 -0
- data/example/tools/conv_from_yaml_to_csv.rb +8 -0
- data/example/tools/conv_to_yaml.rb +17 -0
- data/example/tools/ellipsoid.gnuplot +63 -0
- data/example/tools/ellipsoid.gnuplot~ +64 -0
- data/example/tools/histogram.rb +19 -0
- data/example/tools/histogram2d.rb +20 -0
- data/example/tools/histogram2d.rb~ +18 -0
- data/example/tools/kmeans.rb +34 -0
- data/example/tools/mean.rb +19 -0
- data/example/tools/table.data +4618 -0
- data/example/tools/tmp.txt +69632 -0
- data/example/tools/xmeans.R +608 -0
- data/example/tools/xmeans.rb +35 -0
- data/lib/em_algorithm/.base.rb.swp +0 -0
- data/lib/em_algorithm/base.rb +116 -0
- data/lib/em_algorithm/base.rb~ +116 -0
- data/lib/em_algorithm/convergence/.chi_square.rb.swp +0 -0
- data/lib/em_algorithm/convergence/.likelihood.rb.swp +0 -0
- data/lib/em_algorithm/convergence/check_method.rb +4 -0
- data/lib/em_algorithm/convergence/check_method.rb~ +0 -0
- data/lib/em_algorithm/convergence/chi_square.rb +40 -0
- data/lib/em_algorithm/convergence/chi_square.rb~ +40 -0
- data/lib/em_algorithm/convergence/likelihood.rb +35 -0
- data/lib/em_algorithm/convergence/likelihood.rb~ +35 -0
- data/lib/em_algorithm/models/.gaussian.rb.swp +0 -0
- data/lib/em_algorithm/models/.md_gaussian.rb.swp +0 -0
- data/lib/em_algorithm/models/.mixture.rb.swp +0 -0
- data/lib/em_algorithm/models/.model.rb.swp +0 -0
- data/lib/em_algorithm/models/gaussian.rb +47 -0
- data/lib/em_algorithm/models/gaussian.rb~ +47 -0
- data/lib/em_algorithm/models/md_gaussian.rb +67 -0
- data/lib/em_algorithm/models/md_gaussian.rb~ +67 -0
- data/lib/em_algorithm/models/mixture.rb +122 -0
- data/lib/em_algorithm/models/mixture.rb~ +122 -0
- data/lib/em_algorithm/models/model.rb +19 -0
- data/lib/em_algorithm/models/model.rb~ +19 -0
- data/lib/ruby-em_algorithm.rb +3 -0
- data/lib/ruby-em_algorithm/version.rb +3 -0
- data/ruby-em_algorithm.gemspec +21 -0
- data/spec/spec_helper.rb +9 -0
- metadata +178 -0
@@ -0,0 +1,35 @@
|
|
1
|
+
#!/usr/bin/env ruby

# Clusters a 2-file data set with the R x-means implementation in ./xmeans.R
# and prints, per cluster, the centers reported by R followed by the mean and
# covariance recomputed here from the cluster assignment.
#
# Usage: xmeans.rb BASENAME
#   BASENAME.csv - read by R (read.csv) and passed to xmeans(a, 2, 20)
#   BASENAME.txt - YAML list of data points, loaded here to compute statistics
#
# The script must be run from the directory that contains xmeans.R because R
# sources it by the relative path './xmeans.R'.

require 'yaml'
require 'rsruby'
require 'gsl'

include GSL

# Without an argument the original silently tried to read ".csv" / ".txt".
if ARGV[0].nil? || ARGV[0].empty?
  abort "usage: #{$0} BASENAME  (expects BASENAME.csv and BASENAME.txt)"
end
basename = ARGV[0]

r = RSRuby.instance
# NOTE(review): basename is interpolated into R source verbatim; a path
# containing a single quote will break (or alter) the R command.
c = r.eval_R(<<-RCOMMAND)
a <- read.csv('#{basename}.csv')
source('./xmeans.R')
xmeans(a,2,20)
RCOMMAND

# NOTE(review): YAML.load_file can instantiate arbitrary objects on some Ruby
# versions; only feed this script data files you trust.
data_array = YAML.load_file("#{basename}.txt").map {|v| Vector[v] }
dim = data_array[0].size

# One accumulator per cluster reported by R.
cluster = Array.new(c["size"].size) do
  {"mu_sum" => 0.0, "sigma_sum" => Matrix.alloc(dim, dim)}
end

# c["cluster"][i] is the 1-based cluster number assigned to data point i.
c["cluster"].each_with_index do |num, di|
  x = data_array[di]
  cluster[num - 1]["mu_sum"] += x
  # x.trans * x is the outer product x^T x, so this sums second moments.
  cluster[num - 1]["sigma_sum"] += x.trans * x
end

c["size"].each_with_index do |size, num|
  mu = cluster[num]["mu_sum"] / size
  cluster[num]["mu"] = mu
  # Covariance = E[x^T x] - mu^T mu. The original printed the uncentered
  # second moment E[x^T x] under the name "sigma".
  cluster[num]["sigma"] = cluster[num]["sigma_sum"] / size - mu.trans * mu
end

# Centers as reported by R.
c["centers"].each_with_index do |cen, ci|
  puts "### cluster #{ci}"
  p cen
end

# Mean and covariance recomputed from the assignment.
cluster.each_with_index do |clu, ci|
  puts "### cluster #{ci}"
  p clu["mu"].to_a
  p clu["sigma"].to_a
end
|
Binary file
|
@@ -0,0 +1,116 @@
|
|
1
|
+
require 'em_algorithm/models/model'
|
2
|
+
require 'em_algorithm/models/gaussian'
|
3
|
+
require 'em_algorithm/models/md_gaussian'
|
4
|
+
require 'em_algorithm/models/mixture'
|
5
|
+
require 'em_algorithm/convergence/check_method'
|
6
|
+
require 'em_algorithm/convergence/likelihood'
|
7
|
+
require 'em_algorithm/convergence/chi_square'
|
8
|
+
|
9
|
+
module EMAlgorithm
  include Math
  include GSL

  # Upper bound on the number of iterations Base#run! performs.
  MAX_ITERATION = 10000

  # Runs the EM loop (E-step / M-step) on a model until the log likelihood
  # tracked by a Likelihood checker reports convergence or MAX_ITERATION
  # iterations have been performed.
  class Base

    attr_accessor :model, :original_data_array, :data_array, :likelihood, :num_step, :const

    # * Model limitation
    #   Currently only the Gaussian mixture model is supported.
    #   To fit a single Gaussian, use a mixture model that has exactly one
    #   Gaussian entry with weight 1.0 (this is also the default model).
    #
    # * Input data format
    #   Either a probability distribution or a target value distribution can
    #   be estimated. For the latter, pass :value_distribution_estimation =>
    #   true and append two elements to every input vector x:
    #     x[-2]: target value
    #     x[-1]: size of the area that value corresponds to
    #
    # * Options (all optional)
    #   :model                         - model to fit
    #   :data_array                    - input data (default [])
    #   :value_distribution_estimation - see above (default false)
    #   :debug                         - trace progress on $stderr (default true)
    def initialize(options = {})
      opts = {
        :data_array => [],
        :value_distribution_estimation => false,
        :debug => true
      }.merge(options)
      # Build the default model only when the caller did not supply one.
      @model = opts.fetch(:model) do
        Mixture.new(:models => [Gaussian.new(0.0, 9.0)], :weights => [1.0])
      end
      @original_data_array = opts[:data_array]
      @value_distribution_estimation = opts[:value_distribution_estimation]
      @data_array =
        if @value_distribution_estimation
          value_to_frequency(@original_data_array)
        else
          @original_data_array
        end
      @likelihood = Likelihood.new(@data_array)
      @debug = opts[:debug]
      @const = 1.0
    end

    # E-step: resets the model's temporary per-datum weights, stores the
    # model's posterior for every datum in @posterior_data_array and hands it
    # back to the model via update_temp_weights!.
    def estep
      @model.clear_temp_weight_per_datum!
      @posterior_data_array = @model.calculate_posterior_data_array(@data_array)
      @model.update_temp_weights!(@data_array, @posterior_data_array)
    end

    # M-step: asks the model to update its parameters from @data_array.
    # In debug mode the model is dumped to $stderr first.
    def mstep
      $stderr.puts @model.inspect if @debug
      @model.update_parameters!(@data_array)
    end

    # Iterates E-step and M-step until @likelihood reports convergence or
    # MAX_ITERATION iterations have run, then returns the model.
    #
    # Afterwards @num_step holds the iteration index at which convergence was
    # detected, or MAX_ITERATION when the loop ran out without converging
    # (the original left it nil in that case). When value distribution
    # estimation is enabled, @const is set to the distribution-to-value
    # ratio; it depends only on the input data, so it is computed whether or
    # not the loop converged.
    def run!
      converged_at = nil
      MAX_ITERATION.times do |i|
        $stderr.puts "step#{i}" if @debug
        # check convergence
        @likelihood.calculate(@model)
        $stderr.puts @likelihood.debug_output if @debug
        if @likelihood.converged?
          converged_at = i
          break
        end
        if @debug
          $stderr.puts @model.debug_output
          $stderr.puts ""
        end
        estep
        mstep
      end
      @num_step = converged_at || MAX_ITERATION
      @const = distribution_to_value_ratio if @value_distribution_estimation
      @model
    end

    # Expands value-weighted input into plain samples: every x contributes
    # x[-2].round copies of its leading elements (everything except the
    # last two). A single leading element is unwrapped to a scalar.
    def value_to_frequency(data_array)
      modified_data_array = []
      data_array.each do |x|
        repetitions = x[x.size-2].round
        repetitions.times do
          # Slice per copy, as the original did, so entries stay distinct objects.
          x_without_value = x[0..(x.size-3)]
          x_without_value = x_without_value.first if x_without_value.size == 1
          modified_data_array << x_without_value
        end
      end
      modified_data_array
    end

    # The probability distribution integrates to 1.0, so the integral of the
    # target value (sum of value * area size over the original input) is the
    # ratio between the target value distribution and the probability
    # distribution.
    def distribution_to_value_ratio
      @original_data_array.inject(0.0) do |integrated_value, x|
        integrated_value + x[x.size-2] * x[x.size-1]
      end
    end
  end
end
|
@@ -0,0 +1,116 @@
|
|
1
|
+
require 'em_algorithm/models/model'
|
2
|
+
require 'em_algorithm/models/gaussian'
|
3
|
+
require 'em_algorithm/models/md_gaussian'
|
4
|
+
require 'em_algorithm/models/mixture'
|
5
|
+
require 'em_algorithm/convergence/check_method'
|
6
|
+
require 'em_algorithm/convergence/likelihood'
|
7
|
+
require 'em_algorithm/convergence/chi_square'
|
8
|
+
|
9
|
+
module EMAlgorithm
  include Math
  include GSL

  # Upper bound on the number of iterations Base#run! performs.
  MAX_ITERATION = 10000

  # Runs the EM loop (E-step / M-step) on a model until the log likelihood
  # tracked by a Likelihood checker reports convergence or MAX_ITERATION
  # iterations have been performed.
  class Base

    attr_accessor :model, :original_data_array, :data_array, :likelihood, :num_step, :const

    # * Model limitation
    #   Currently only the Gaussian mixture model is supported.
    #   To fit a single Gaussian, use a mixture model that has exactly one
    #   Gaussian entry with weight 1.0 (this is also the default model).
    #
    # * Input data format
    #   Either a probability distribution or a target value distribution can
    #   be estimated. For the latter, pass :value_distribution_estimation =>
    #   true and append two elements to every input vector x:
    #     x[-2]: target value
    #     x[-1]: size of the area that value corresponds to
    #
    # * Options (all optional)
    #   :model                         - model to fit
    #   :data_array                    - input data (default [])
    #   :value_distribution_estimation - see above (default false)
    #   :debug                         - trace progress on $stderr (default true)
    def initialize(options = {})
      opts = {
        :data_array => [],
        :value_distribution_estimation => false,
        :debug => true
      }.merge(options)
      # Build the default model only when the caller did not supply one.
      @model = opts.fetch(:model) do
        Mixture.new(:models => [Gaussian.new(0.0, 9.0)], :weights => [1.0])
      end
      @original_data_array = opts[:data_array]
      @value_distribution_estimation = opts[:value_distribution_estimation]
      @data_array =
        if @value_distribution_estimation
          value_to_frequency(@original_data_array)
        else
          @original_data_array
        end
      @likelihood = Likelihood.new(@data_array)
      @debug = opts[:debug]
      @const = 1.0
    end

    # E-step: resets the model's temporary per-datum weights, stores the
    # model's posterior for every datum in @posterior_data_array and hands it
    # back to the model via update_temp_weights!.
    def estep
      @model.clear_temp_weight_per_datum!
      @posterior_data_array = @model.calculate_posterior_data_array(@data_array)
      @model.update_temp_weights!(@data_array, @posterior_data_array)
    end

    # M-step: asks the model to update its parameters from @data_array.
    # In debug mode the model is dumped to $stderr first.
    def mstep
      $stderr.puts @model.inspect if @debug
      @model.update_parameters!(@data_array)
    end

    # Iterates E-step and M-step until @likelihood reports convergence or
    # MAX_ITERATION iterations have run, then returns the model.
    #
    # Afterwards @num_step holds the iteration index at which convergence was
    # detected, or MAX_ITERATION when the loop ran out without converging
    # (the original left it nil in that case). When value distribution
    # estimation is enabled, @const is set to the distribution-to-value
    # ratio; it depends only on the input data, so it is computed whether or
    # not the loop converged.
    def run!
      converged_at = nil
      MAX_ITERATION.times do |i|
        $stderr.puts "step#{i}" if @debug
        # check convergence
        @likelihood.calculate(@model)
        $stderr.puts @likelihood.debug_output if @debug
        if @likelihood.converged?
          converged_at = i
          break
        end
        if @debug
          $stderr.puts @model.debug_output
          $stderr.puts ""
        end
        estep
        mstep
      end
      @num_step = converged_at || MAX_ITERATION
      @const = distribution_to_value_ratio if @value_distribution_estimation
      @model
    end

    # Expands value-weighted input into plain samples: every x contributes
    # x[-2].round copies of its leading elements (everything except the
    # last two). A single leading element is unwrapped to a scalar.
    def value_to_frequency(data_array)
      modified_data_array = []
      data_array.each do |x|
        repetitions = x[x.size-2].round
        repetitions.times do
          # Slice per copy, as the original did, so entries stay distinct objects.
          x_without_value = x[0..(x.size-3)]
          x_without_value = x_without_value.first if x_without_value.size == 1
          modified_data_array << x_without_value
        end
      end
      modified_data_array
    end

    # The probability distribution integrates to 1.0, so the integral of the
    # target value (sum of value * area size over the original input) is the
    # ratio between the target value distribution and the probability
    # distribution.
    def distribution_to_value_ratio
      @original_data_array.inject(0.0) do |integrated_value, x|
        integrated_value + x[x.size-2] * x[x.size-1]
      end
    end
  end
end
|
Binary file
|
Binary file
|
File without changes
|
@@ -0,0 +1,40 @@
|
|
1
|
+
module EMAlgorithm
  # Convergence check based on a chi-square statistic between observed
  # values and the values predicted by a model scaled by a constant.
  class ChiSquare < CheckMethod
    # The statistic itself is considered small enough below this value.
    STAT_THRESHOLD = 0.05
    # Two consecutive statistics closer than this count as converged.
    CONV_THRESHOLD = 0.01

    # Every statistic computed so far, oldest first.
    attr_accessor :history

    # data_array: list of vectors whose last element is the observed value
    # and whose leading elements are passed to model.pdf.
    def initialize(data_array)
      @data_array = data_array
      @history = []
    end

    # Calculates the chi-square statistic for +model+ scaled by +const+,
    # appends it to the history and returns it.
    #
    # Points whose observed value is <= 1.0 are skipped; the model's pdf is
    # evaluated only for the points that are actually used.
    def calculate(model, const)
      chi_square = 0.0
      @data_array.each do |x|
        value = x[x.size-1]
        next if value <= 1.0
        estimated = const * model.pdf(x[0..(x.size-2)])
        chi_square += (value - estimated)**2 / estimated
      end
      @history << chi_square
      chi_square
    end

    # Most recently calculated statistic, or nil before the first calculate.
    def value
      @history.last
    end

    # True when the latest statistic is below STAT_THRESHOLD or differs from
    # the previous one by less than CONV_THRESHOLD. Always false until at
    # least two statistics are recorded (the original raised NoMethodError
    # when called on an empty history).
    def converged?
      return false if @history.length < 2
      (@history[-1] < STAT_THRESHOLD) || ((@history[-1] - @history[-2]).abs < CONV_THRESHOLD)
    end

    # Returns a one-line description of the current statistic. The caller
    # decides where to print it, matching Likelihood#debug_output; the
    # original wrote to $stderr itself and returned nil.
    def debug_output
      "ChiSquare: #{value}"
    end
  end
end
|
@@ -0,0 +1,40 @@
|
|
1
|
+
module EMAlgorithm
  # Convergence check based on a chi-square statistic between observed
  # values and the values predicted by a model scaled by a constant.
  class ChiSquare < CheckMethod
    # The statistic itself is considered small enough below this value.
    STAT_THRESHOLD = 0.05
    # Two consecutive statistics closer than this count as converged.
    CONV_THRESHOLD = 0.01

    # Every statistic computed so far, oldest first.
    attr_accessor :history

    # data_array: list of vectors whose last element is the observed value
    # and whose leading elements are passed to model.pdf.
    def initialize(data_array)
      @data_array = data_array
      @history = []
    end

    # Calculates the chi-square statistic for +model+ scaled by +const+,
    # appends it to the history and returns it.
    #
    # Points whose observed value is <= 1.0 are skipped; the model's pdf is
    # evaluated only for the points that are actually used.
    def calculate(model, const)
      chi_square = 0.0
      @data_array.each do |x|
        value = x[x.size-1]
        next if value <= 1.0
        estimated = const * model.pdf(x[0..(x.size-2)])
        chi_square += (value - estimated)**2 / estimated
      end
      @history << chi_square
      chi_square
    end

    # Most recently calculated statistic, or nil before the first calculate.
    def value
      @history.last
    end

    # True when the latest statistic is below STAT_THRESHOLD or differs from
    # the previous one by less than CONV_THRESHOLD. Always false until at
    # least two statistics are recorded (the original raised NoMethodError
    # when called on an empty history).
    def converged?
      return false if @history.length < 2
      (@history[-1] < STAT_THRESHOLD) || ((@history[-1] - @history[-2]).abs < CONV_THRESHOLD)
    end

    # Returns a one-line description of the current statistic. The caller
    # decides where to print it, matching Likelihood#debug_output; the
    # original wrote to $stderr itself and returned nil.
    def debug_output
      "ChiSquare: #{value}"
    end
  end
end
|
@@ -0,0 +1,35 @@
|
|
1
|
+
module EMAlgorithm
  # Convergence check based on the change of the log likelihood of the data
  # between two consecutive calculations.
  class Likelihood < CheckMethod
    #THRESHOLD = 0.0001
    # Two consecutive log likelihoods closer than this count as converged.
    THRESHOLD = 0.01

    # Every log likelihood computed so far, oldest first.
    attr_accessor :history

    # data_array: the data points handed one by one to model.pdf.
    def initialize(data_array)
      @data_array = data_array
      @history = []
    end

    # Calculates the log likelihood of the data under +model+ (the sum of
    # log(model.pdf(x)) over all data points), appends it to the history and
    # returns it.
    #
    # Math.log is called explicitly so the method does not depend on Math
    # having been mixed into Object by the caller.
    def calculate(model)
      likelihood = @data_array.inject(0.0) do |sum, x|
        sum + Math.log(model.pdf(x))
      end
      @history << likelihood
      likelihood
    end

    # Most recently calculated log likelihood, or nil before the first
    # calculate.
    def value
      @history.last
    end

    # True when the last two log likelihoods differ by less than THRESHOLD.
    # Always false until at least two values are recorded (the original
    # raised NoMethodError when called on an empty history).
    def converged?
      return false if @history.length < 2
      (@history[-1] - @history[-2]).abs < THRESHOLD
    end

    # Returns a one-line description of the current log likelihood.
    def debug_output
      "Likelihood: #{value}"
    end
  end
end
|
@@ -0,0 +1,35 @@
|
|
1
|
+
module EMAlgorithm
  # Convergence check based on the change of the log likelihood of the data
  # between two consecutive calculations.
  class Likelihood < CheckMethod
    #THRESHOLD = 0.0001
    # Two consecutive log likelihoods closer than this count as converged.
    THRESHOLD = 0.01

    # Every log likelihood computed so far, oldest first.
    attr_accessor :history

    # data_array: the data points handed one by one to model.pdf.
    def initialize(data_array)
      @data_array = data_array
      @history = []
    end

    # Calculates the log likelihood of the data under +model+ (the sum of
    # log(model.pdf(x)) over all data points), appends it to the history and
    # returns it.
    #
    # Math.log is called explicitly so the method does not depend on Math
    # having been mixed into Object by the caller.
    def calculate(model)
      likelihood = @data_array.inject(0.0) do |sum, x|
        sum + Math.log(model.pdf(x))
      end
      @history << likelihood
      likelihood
    end

    # Most recently calculated log likelihood, or nil before the first
    # calculate.
    def value
      @history.last
    end

    # True when the last two log likelihoods differ by less than THRESHOLD.
    # Always false until at least two values are recorded (the original
    # raised NoMethodError when called on an empty history).
    def converged?
      return false if @history.length < 2
      (@history[-1] - @history[-2]).abs < THRESHOLD
    end

    # Returns a one-line description of the current log likelihood.
    def debug_output
      "Likelihood: #{value}"
    end
  end
end
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|