ruby-em_algorithm 0.0.2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (78) hide show
  1. data/Gemfile +6 -0
  2. data/Gemfile.lock +30 -0
  3. data/README.md +44 -0
  4. data/Rakefile +7 -0
  5. data/example/.ex1.rb.swp +0 -0
  6. data/example/.ex2.rb.swp +0 -0
  7. data/example/.ex3-tmp.rb.swp +0 -0
  8. data/example/.ex3.rb.swp +0 -0
  9. data/example/data/2dim-gmm-new.txt +1267 -0
  10. data/example/data/2dim-gmm-simple.txt +676 -0
  11. data/example/data/2dim-gmm-test.txt +6565 -0
  12. data/example/data/2dim-gmm-test2.txt +2782 -0
  13. data/example/data/2dim-gmm-test3.csv +1641 -0
  14. data/example/data/2dim-gmm-test3.txt +2782 -0
  15. data/example/data/2dim-gmm-test4.csv +868 -0
  16. data/example/data/2dim-gmm-test4.txt +4924 -0
  17. data/example/data/2dim-gmm-without_weight-small.txt +2401 -0
  18. data/example/data/2dim-gmm-without_weight.txt +18001 -0
  19. data/example/data/2dim-gmm.txt +1267 -0
  20. data/example/data/gmm-new.txt +10001 -0
  21. data/example/data/gmm-simple.txt +676 -0
  22. data/example/data/gmm.txt +10001 -0
  23. data/example/data/old-gmm.txt +10000 -0
  24. data/example/ex1.rb +20 -0
  25. data/example/ex1.rb~ +20 -0
  26. data/example/ex2.rb +33 -0
  27. data/example/ex2.rb~ +33 -0
  28. data/example/ex3-tmp.rb +23 -0
  29. data/example/ex3-tmp.rb~ +25 -0
  30. data/example/ex3.rb +43 -0
  31. data/example/ex3.rb~ +43 -0
  32. data/example/tools/.2dim.rb.swp +0 -0
  33. data/example/tools/2dim.rb +69 -0
  34. data/example/tools/2dim.rb~ +69 -0
  35. data/example/tools/boxmuller.rb +28 -0
  36. data/example/tools/boxmuller.rb~ +28 -0
  37. data/example/tools/conv_from_yaml.rb +8 -0
  38. data/example/tools/conv_from_yaml_to_csv.rb +8 -0
  39. data/example/tools/conv_to_yaml.rb +17 -0
  40. data/example/tools/ellipsoid.gnuplot +63 -0
  41. data/example/tools/ellipsoid.gnuplot~ +64 -0
  42. data/example/tools/histogram.rb +19 -0
  43. data/example/tools/histogram2d.rb +20 -0
  44. data/example/tools/histogram2d.rb~ +18 -0
  45. data/example/tools/kmeans.rb +34 -0
  46. data/example/tools/mean.rb +19 -0
  47. data/example/tools/table.data +4618 -0
  48. data/example/tools/tmp.txt +69632 -0
  49. data/example/tools/xmeans.R +608 -0
  50. data/example/tools/xmeans.rb +35 -0
  51. data/lib/em_algorithm/.base.rb.swp +0 -0
  52. data/lib/em_algorithm/base.rb +116 -0
  53. data/lib/em_algorithm/base.rb~ +116 -0
  54. data/lib/em_algorithm/convergence/.chi_square.rb.swp +0 -0
  55. data/lib/em_algorithm/convergence/.likelihood.rb.swp +0 -0
  56. data/lib/em_algorithm/convergence/check_method.rb +4 -0
  57. data/lib/em_algorithm/convergence/check_method.rb~ +0 -0
  58. data/lib/em_algorithm/convergence/chi_square.rb +40 -0
  59. data/lib/em_algorithm/convergence/chi_square.rb~ +40 -0
  60. data/lib/em_algorithm/convergence/likelihood.rb +35 -0
  61. data/lib/em_algorithm/convergence/likelihood.rb~ +35 -0
  62. data/lib/em_algorithm/models/.gaussian.rb.swp +0 -0
  63. data/lib/em_algorithm/models/.md_gaussian.rb.swp +0 -0
  64. data/lib/em_algorithm/models/.mixture.rb.swp +0 -0
  65. data/lib/em_algorithm/models/.model.rb.swp +0 -0
  66. data/lib/em_algorithm/models/gaussian.rb +47 -0
  67. data/lib/em_algorithm/models/gaussian.rb~ +47 -0
  68. data/lib/em_algorithm/models/md_gaussian.rb +67 -0
  69. data/lib/em_algorithm/models/md_gaussian.rb~ +67 -0
  70. data/lib/em_algorithm/models/mixture.rb +122 -0
  71. data/lib/em_algorithm/models/mixture.rb~ +122 -0
  72. data/lib/em_algorithm/models/model.rb +19 -0
  73. data/lib/em_algorithm/models/model.rb~ +19 -0
  74. data/lib/ruby-em_algorithm.rb +3 -0
  75. data/lib/ruby-em_algorithm/version.rb +3 -0
  76. data/ruby-em_algorithm.gemspec +21 -0
  77. data/spec/spec_helper.rb +9 -0
  78. metadata +178 -0
@@ -0,0 +1,35 @@
1
#!/usr/bin/env ruby

# Runs the x-means routine from ./xmeans.R (through RSRuby) on "<ARGV[0]>.csv",
# then accumulates per-cluster sums from the same data loaded as YAML from
# "<ARGV[0]>.txt" and prints both the centers reported by R and the averages
# computed here.

require 'yaml'
require 'rsruby'
require 'gsl'

include GSL

# Evaluate the R snippet; its result is indexed below by "size", "cluster"
# and "centers".
r_session = RSRuby.instance
result = r_session.eval_R(<<-RCOMMAND)
a <- read.csv('#{ARGV[0]}.csv')
source('./xmeans.R')
xmeans(a,2,20)
RCOMMAND

# One GSL vector per datum.
data_array = YAML.load_file("#{ARGV[0]}.txt").map { |row| Vector[row] }

# One accumulator hash per cluster reported by R.
cluster = Array.new(result["size"].size) do
  {
    "mu_sum" => 0.0,
    "sigma_sum" => Matrix.alloc(data_array[0].size, data_array[0].size)
  }
end

# result["cluster"][i] is used as the 1-based cluster number of datum i.
result["cluster"].each_with_index do |cluster_number, datum_index|
  datum = data_array[datum_index]
  stats = cluster[cluster_number - 1]
  stats["mu_sum"] += datum
  # NOTE(review): accumulates datum.trans * datum with no mean subtraction, so
  # "sigma" below looks like an averaged product rather than a covariance --
  # confirm this is intended.
  stats["sigma_sum"] += datum.trans * datum
end

# Turn the accumulated sums into per-cluster averages.
result["size"].each_with_index do |member_count, cluster_index|
  stats = cluster[cluster_index]
  stats["mu"] = stats["mu_sum"] / member_count
  stats["sigma"] = stats["sigma_sum"] / member_count
end

# Centers as reported by R.
result["centers"].each_with_index do |center, index|
  puts "### cluster #{index}"
  p center
end

# Averages computed above.
cluster.each_with_index do |stats, index|
  puts "### cluster #{index}"
  p stats["mu"].to_a
  p stats["sigma"].to_a
end
@@ -0,0 +1,116 @@
1
+ require 'em_algorithm/models/model'
2
+ require 'em_algorithm/models/gaussian'
3
+ require 'em_algorithm/models/md_gaussian'
4
+ require 'em_algorithm/models/mixture'
5
+ require 'em_algorithm/convergence/check_method'
6
+ require 'em_algorithm/convergence/likelihood'
7
+ require 'em_algorithm/convergence/chi_square'
8
+
9
module EMAlgorithm
  include Math
  include GSL

  # Upper bound on the number of iterations Base#run! performs.
  MAX_ITERATION = 10000

  # Runs the EM iteration (estep / mstep) on a model until the log likelihood
  # reports convergence or MAX_ITERATION iterations have been performed.
  class Base

    attr_accessor :model, :original_data_array, :data_array, :likelihood, :num_step, :const

    # * Model limitation
    #   Currently only the Gaussian mixture model is supported.
    #   To use a simple Gaussian, pass a mixture model that has a single
    #   Gaussian entry with weight 1.0.
    #
    # * Input data format
    #   Either the probability distribution or the target value distribution
    #   can be estimated. To estimate the target value distribution, each input
    #   vector x must carry the target value and its corresponding area size:
    #     x[-2]: target value
    #     x[-1]: corresponding area size
    #
    # options - Hash; recognised keys are :model, :data_array,
    #           :value_distribution_estimation and :debug. Missing keys fall
    #           back to the defaults below.
    def initialize(options = {})
      opts = {
        :model => Mixture.new(:models => [Gaussian.new(0.0, 9.0)], :weights => [1.0]),
        :data_array => [],
        :value_distribution_estimation => false,
        :debug => true
      }.merge(options)
      @model = opts[:model]
      @original_data_array = opts[:data_array]
      @value_distribution_estimation = opts[:value_distribution_estimation]
      @data_array =
        if @value_distribution_estimation
          value_to_frequency(@original_data_array)
        else
          @original_data_array
        end
      @likelihood = Likelihood.new(@data_array)
      @debug = opts[:debug]
      @const = 1.0
    end

    # E step: calculate @posterior_data_array and update the model's
    # temporary weights from it.
    def estep
      @model.clear_temp_weight_per_datum!
      @posterior_data_array = @model.calculate_posterior_data_array(@data_array)
      @model.update_temp_weights!(@data_array, @posterior_data_array)
    end

    # M step: calculate the posterior model parameters.
    def mstep
      $stderr.puts @model.inspect if @debug
      @model.update_parameters!(@data_array)
    end

    # Iterates until the likelihood converges or MAX_ITERATION is reached.
    #
    # Afterwards num_step holds the index of the iteration at which
    # convergence was detected, or MAX_ITERATION when it never was. When value
    # distribution estimation is enabled, const is set in both cases: it is
    # derived from the input data only, so it does not depend on convergence.
    #
    # Returns the model.
    def run!
      @num_step = MAX_ITERATION
      MAX_ITERATION.times do |i|
        $stderr.puts "step#{i}" if @debug
        # check convergence
        @likelihood.calculate(@model)
        $stderr.puts @likelihood.debug_output if @debug
        if @likelihood.converged?
          @num_step = i
          break
        end
        if @debug
          $stderr.puts @model.debug_output
          $stderr.puts ""
        end
        estep
        mstep
      end
      @const = distribution_to_value_ratio if @value_distribution_estimation
      @model
    end

    # Expands each datum into repeated samples: the coordinates of x (all but
    # its last two entries) are appended x[-2].round times. A single remaining
    # coordinate is unwrapped to a scalar.
    def value_to_frequency(data_array)
      modified_data_array = []
      data_array.each do |x|
        x[x.size-2].round.times do
          x_without_value = x[0..(x.size-3)]
          x_without_value = x_without_value.first if x_without_value.size == 1
          modified_data_array << x_without_value
        end
      end
      modified_data_array
    end

    # The integral of the probability distribution equals 1.0, so the
    # integral of the target value (sum of value * area size over the original
    # data) gives the ratio between the probability distribution and the
    # target value distribution.
    def distribution_to_value_ratio
      @original_data_array.inject(0.0) do |integrated_value, x|
        integrated_value + x[x.size-2] * x[x.size-1]
      end
    end
  end
end
@@ -0,0 +1,116 @@
1
+ require 'em_algorithm/models/model'
2
+ require 'em_algorithm/models/gaussian'
3
+ require 'em_algorithm/models/md_gaussian'
4
+ require 'em_algorithm/models/mixture'
5
+ require 'em_algorithm/convergence/check_method'
6
+ require 'em_algorithm/convergence/likelihood'
7
+ require 'em_algorithm/convergence/chi_square'
8
+
9
module EMAlgorithm
  include Math
  include GSL

  # Upper bound on the number of iterations Base#run! performs.
  MAX_ITERATION = 10000

  # Runs the EM iteration (estep / mstep) on a model until the log likelihood
  # reports convergence or MAX_ITERATION iterations have been performed.
  class Base

    attr_accessor :model, :original_data_array, :data_array, :likelihood, :num_step, :const

    # * Model limitation
    #   Currently only the Gaussian mixture model is supported.
    #   To use a simple Gaussian, pass a mixture model that has a single
    #   Gaussian entry with weight 1.0.
    #
    # * Input data format
    #   Either the probability distribution or the target value distribution
    #   can be estimated. To estimate the target value distribution, each input
    #   vector x must carry the target value and its corresponding area size:
    #     x[-2]: target value
    #     x[-1]: corresponding area size
    #
    # options - Hash; recognised keys are :model, :data_array,
    #           :value_distribution_estimation and :debug. Missing keys fall
    #           back to the defaults below.
    def initialize(options = {})
      opts = {
        :model => Mixture.new(:models => [Gaussian.new(0.0, 9.0)], :weights => [1.0]),
        :data_array => [],
        :value_distribution_estimation => false,
        :debug => true
      }.merge(options)
      @model = opts[:model]
      @original_data_array = opts[:data_array]
      @value_distribution_estimation = opts[:value_distribution_estimation]
      @data_array =
        if @value_distribution_estimation
          value_to_frequency(@original_data_array)
        else
          @original_data_array
        end
      @likelihood = Likelihood.new(@data_array)
      @debug = opts[:debug]
      @const = 1.0
    end

    # E step: calculate @posterior_data_array and update the model's
    # temporary weights from it.
    def estep
      @model.clear_temp_weight_per_datum!
      @posterior_data_array = @model.calculate_posterior_data_array(@data_array)
      @model.update_temp_weights!(@data_array, @posterior_data_array)
    end

    # M step: calculate the posterior model parameters.
    def mstep
      $stderr.puts @model.inspect if @debug
      @model.update_parameters!(@data_array)
    end

    # Iterates until the likelihood converges or MAX_ITERATION is reached.
    #
    # Afterwards num_step holds the index of the iteration at which
    # convergence was detected, or MAX_ITERATION when it never was. When value
    # distribution estimation is enabled, const is set in both cases: it is
    # derived from the input data only, so it does not depend on convergence.
    #
    # Returns the model.
    def run!
      @num_step = MAX_ITERATION
      MAX_ITERATION.times do |i|
        $stderr.puts "step#{i}" if @debug
        # check convergence
        @likelihood.calculate(@model)
        $stderr.puts @likelihood.debug_output if @debug
        if @likelihood.converged?
          @num_step = i
          break
        end
        if @debug
          $stderr.puts @model.debug_output
          $stderr.puts ""
        end
        estep
        mstep
      end
      @const = distribution_to_value_ratio if @value_distribution_estimation
      @model
    end

    # Expands each datum into repeated samples: the coordinates of x (all but
    # its last two entries) are appended x[-2].round times. A single remaining
    # coordinate is unwrapped to a scalar.
    def value_to_frequency(data_array)
      modified_data_array = []
      data_array.each do |x|
        x[x.size-2].round.times do
          x_without_value = x[0..(x.size-3)]
          x_without_value = x_without_value.first if x_without_value.size == 1
          modified_data_array << x_without_value
        end
      end
      modified_data_array
    end

    # The integral of the probability distribution equals 1.0, so the
    # integral of the target value (sum of value * area size over the original
    # data) gives the ratio between the probability distribution and the
    # target value distribution.
    def distribution_to_value_ratio
      @original_data_array.inject(0.0) do |integrated_value, x|
        integrated_value + x[x.size-2] * x[x.size-1]
      end
    end
  end
end
@@ -0,0 +1,4 @@
1
module EMAlgorithm
  # Common superclass of the convergence checks (ChiSquare and Likelihood
  # inherit from it). It defines no behavior of its own.
  class CheckMethod; end
end
@@ -0,0 +1,40 @@
1
module EMAlgorithm
  # Convergence check based on a chi-square statistic between the observed
  # values stored in the data and the values estimated from a model's pdf.
  class ChiSquare < CheckMethod
    # converged? is true once the latest statistic drops below this value ...
    STAT_THRESHOLD = 0.05
    # ... or once two successive statistics differ by less than this value.
    CONV_THRESHOLD = 0.01

    # Every statistic computed so far, oldest first.
    attr_accessor :history

    # data_array - data whose elements x hold the observed value in their
    #              last entry and the pdf argument in the preceding entries.
    def initialize(data_array)
      @data_array = data_array
      @history = []
    end

    # Calculates chi square for the given model and appends it to history.
    #
    # model - object responding to pdf(x).
    # const - factor applied to the pdf to obtain the estimated value.
    #
    # Data whose observed value is <= 1.0 are skipped; the pdf is only
    # evaluated for data that contribute to the sum.
    #
    # Returns the statistic.
    def calculate(model, const)
      chi_square = @data_array.inject(0.0) do |sum, x|
        value = x[x.size-1]
        next sum if value <= 1.0
        estimated = const * model.pdf(x[0..(x.size-2)])
        sum + (value - estimated)**2 / estimated
      end
      @history << chi_square
      chi_square
    end

    # Returns the most recent statistic (nil before the first calculate).
    def value
      @history.last
    end

    # Returns false until at least two statistics are available; two are
    # needed for the successive-difference test. After that, true when the
    # latest statistic is below STAT_THRESHOLD or changed by less than
    # CONV_THRESHOLD.
    def converged?
      return false if @history.length < 2
      (@history[-1] < STAT_THRESHOLD) || ((@history[-1] - @history[-2]).abs < CONV_THRESHOLD)
    end

    # Returns the debug line as a String, like Likelihood#debug_output, so a
    # caller can decide where to print it.
    def debug_output
      "ChiSquare: #{value}"
    end
  end
end
@@ -0,0 +1,40 @@
1
module EMAlgorithm
  # Convergence check based on a chi-square statistic between the observed
  # values stored in the data and the values estimated from a model's pdf.
  class ChiSquare < CheckMethod
    # converged? is true once the latest statistic drops below this value ...
    STAT_THRESHOLD = 0.05
    # ... or once two successive statistics differ by less than this value.
    CONV_THRESHOLD = 0.01

    # Every statistic computed so far, oldest first.
    attr_accessor :history

    # data_array - data whose elements x hold the observed value in their
    #              last entry and the pdf argument in the preceding entries.
    def initialize(data_array)
      @data_array = data_array
      @history = []
    end

    # Calculates chi square for the given model and appends it to history.
    #
    # model - object responding to pdf(x).
    # const - factor applied to the pdf to obtain the estimated value.
    #
    # Data whose observed value is <= 1.0 are skipped; the pdf is only
    # evaluated for data that contribute to the sum.
    #
    # Returns the statistic.
    def calculate(model, const)
      chi_square = @data_array.inject(0.0) do |sum, x|
        value = x[x.size-1]
        next sum if value <= 1.0
        estimated = const * model.pdf(x[0..(x.size-2)])
        sum + (value - estimated)**2 / estimated
      end
      @history << chi_square
      chi_square
    end

    # Returns the most recent statistic (nil before the first calculate).
    def value
      @history.last
    end

    # Returns false until at least two statistics are available; two are
    # needed for the successive-difference test. After that, true when the
    # latest statistic is below STAT_THRESHOLD or changed by less than
    # CONV_THRESHOLD.
    def converged?
      return false if @history.length < 2
      (@history[-1] < STAT_THRESHOLD) || ((@history[-1] - @history[-2]).abs < CONV_THRESHOLD)
    end

    # Returns the debug line as a String, like Likelihood#debug_output, so a
    # caller can decide where to print it.
    def debug_output
      "ChiSquare: #{value}"
    end
  end
end
@@ -0,0 +1,35 @@
1
module EMAlgorithm
  # Convergence check based on the log likelihood of the data under a model.
  class Likelihood < CheckMethod
    #THRESHOLD = 0.0001
    # converged? is true once two successive log likelihoods differ by less
    # than this value.
    THRESHOLD = 0.01

    # Every log likelihood computed so far, oldest first.
    attr_accessor :history

    # data_array - the data; each element is passed as-is to model.pdf.
    def initialize(data_array)
      @data_array = data_array
      @history = []
    end

    # Calculates the log likelihood (sum of log(model.pdf(x)) over the data)
    # and appends it to history.
    #
    # Math.log is called explicitly: this class does not include Math, so a
    # bare `log` would only resolve if the caller had mixed Math into Object.
    #
    # Returns the log likelihood.
    def calculate(model)
      log_likelihood = @data_array.inject(0.0) do |sum, x|
        sum + Math.log(model.pdf(x))
      end
      @history << log_likelihood
      log_likelihood
    end

    # Returns the most recent log likelihood (nil before the first calculate).
    def value
      @history.last
    end

    # Returns false until at least two values are available; two are needed to
    # form a difference. After that, true when the last two values differ by
    # less than THRESHOLD.
    def converged?
      return false if @history.length < 2
      (@history[-1] - @history[-2]).abs < THRESHOLD
    end

    # Returns the debug line as a String; the caller prints it.
    def debug_output
      "Likelihood: #{value}"
    end
  end
end
@@ -0,0 +1,35 @@
1
module EMAlgorithm
  # Convergence check based on the log likelihood of the data under a model.
  class Likelihood < CheckMethod
    #THRESHOLD = 0.0001
    # converged? is true once two successive log likelihoods differ by less
    # than this value.
    THRESHOLD = 0.01

    # Every log likelihood computed so far, oldest first.
    attr_accessor :history

    # data_array - the data; each element is passed as-is to model.pdf.
    def initialize(data_array)
      @data_array = data_array
      @history = []
    end

    # Calculates the log likelihood (sum of log(model.pdf(x)) over the data)
    # and appends it to history.
    #
    # Math.log is called explicitly: this class does not include Math, so a
    # bare `log` would only resolve if the caller had mixed Math into Object.
    #
    # Returns the log likelihood.
    def calculate(model)
      log_likelihood = @data_array.inject(0.0) do |sum, x|
        sum + Math.log(model.pdf(x))
      end
      @history << log_likelihood
      log_likelihood
    end

    # Returns the most recent log likelihood (nil before the first calculate).
    def value
      @history.last
    end

    # Returns false until at least two values are available; two are needed to
    # form a difference. After that, true when the last two values differ by
    # less than THRESHOLD.
    def converged?
      return false if @history.length < 2
      (@history[-1] - @history[-2]).abs < THRESHOLD
    end

    # Returns the debug line as a String; the caller prints it.
    def debug_output
      "Likelihood: #{value}"
    end
  end
end