ruby-em_algorithm 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (78) hide show
  1. data/Gemfile +6 -0
  2. data/Gemfile.lock +30 -0
  3. data/README.md +44 -0
  4. data/Rakefile +7 -0
  5. data/example/.ex1.rb.swp +0 -0
  6. data/example/.ex2.rb.swp +0 -0
  7. data/example/.ex3-tmp.rb.swp +0 -0
  8. data/example/.ex3.rb.swp +0 -0
  9. data/example/data/2dim-gmm-new.txt +1267 -0
  10. data/example/data/2dim-gmm-simple.txt +676 -0
  11. data/example/data/2dim-gmm-test.txt +6565 -0
  12. data/example/data/2dim-gmm-test2.txt +2782 -0
  13. data/example/data/2dim-gmm-test3.csv +1641 -0
  14. data/example/data/2dim-gmm-test3.txt +2782 -0
  15. data/example/data/2dim-gmm-test4.csv +868 -0
  16. data/example/data/2dim-gmm-test4.txt +4924 -0
  17. data/example/data/2dim-gmm-without_weight-small.txt +2401 -0
  18. data/example/data/2dim-gmm-without_weight.txt +18001 -0
  19. data/example/data/2dim-gmm.txt +1267 -0
  20. data/example/data/gmm-new.txt +10001 -0
  21. data/example/data/gmm-simple.txt +676 -0
  22. data/example/data/gmm.txt +10001 -0
  23. data/example/data/old-gmm.txt +10000 -0
  24. data/example/ex1.rb +20 -0
  25. data/example/ex1.rb~ +20 -0
  26. data/example/ex2.rb +33 -0
  27. data/example/ex2.rb~ +33 -0
  28. data/example/ex3-tmp.rb +23 -0
  29. data/example/ex3-tmp.rb~ +25 -0
  30. data/example/ex3.rb +43 -0
  31. data/example/ex3.rb~ +43 -0
  32. data/example/tools/.2dim.rb.swp +0 -0
  33. data/example/tools/2dim.rb +69 -0
  34. data/example/tools/2dim.rb~ +69 -0
  35. data/example/tools/boxmuller.rb +28 -0
  36. data/example/tools/boxmuller.rb~ +28 -0
  37. data/example/tools/conv_from_yaml.rb +8 -0
  38. data/example/tools/conv_from_yaml_to_csv.rb +8 -0
  39. data/example/tools/conv_to_yaml.rb +17 -0
  40. data/example/tools/ellipsoid.gnuplot +63 -0
  41. data/example/tools/ellipsoid.gnuplot~ +64 -0
  42. data/example/tools/histogram.rb +19 -0
  43. data/example/tools/histogram2d.rb +20 -0
  44. data/example/tools/histogram2d.rb~ +18 -0
  45. data/example/tools/kmeans.rb +34 -0
  46. data/example/tools/mean.rb +19 -0
  47. data/example/tools/table.data +4618 -0
  48. data/example/tools/tmp.txt +69632 -0
  49. data/example/tools/xmeans.R +608 -0
  50. data/example/tools/xmeans.rb +35 -0
  51. data/lib/em_algorithm/.base.rb.swp +0 -0
  52. data/lib/em_algorithm/base.rb +116 -0
  53. data/lib/em_algorithm/base.rb~ +116 -0
  54. data/lib/em_algorithm/convergence/.chi_square.rb.swp +0 -0
  55. data/lib/em_algorithm/convergence/.likelihood.rb.swp +0 -0
  56. data/lib/em_algorithm/convergence/check_method.rb +4 -0
  57. data/lib/em_algorithm/convergence/check_method.rb~ +0 -0
  58. data/lib/em_algorithm/convergence/chi_square.rb +40 -0
  59. data/lib/em_algorithm/convergence/chi_square.rb~ +40 -0
  60. data/lib/em_algorithm/convergence/likelihood.rb +35 -0
  61. data/lib/em_algorithm/convergence/likelihood.rb~ +35 -0
  62. data/lib/em_algorithm/models/.gaussian.rb.swp +0 -0
  63. data/lib/em_algorithm/models/.md_gaussian.rb.swp +0 -0
  64. data/lib/em_algorithm/models/.mixture.rb.swp +0 -0
  65. data/lib/em_algorithm/models/.model.rb.swp +0 -0
  66. data/lib/em_algorithm/models/gaussian.rb +47 -0
  67. data/lib/em_algorithm/models/gaussian.rb~ +47 -0
  68. data/lib/em_algorithm/models/md_gaussian.rb +67 -0
  69. data/lib/em_algorithm/models/md_gaussian.rb~ +67 -0
  70. data/lib/em_algorithm/models/mixture.rb +122 -0
  71. data/lib/em_algorithm/models/mixture.rb~ +122 -0
  72. data/lib/em_algorithm/models/model.rb +19 -0
  73. data/lib/em_algorithm/models/model.rb~ +19 -0
  74. data/lib/ruby-em_algorithm.rb +3 -0
  75. data/lib/ruby-em_algorithm/version.rb +3 -0
  76. data/ruby-em_algorithm.gemspec +21 -0
  77. data/spec/spec_helper.rb +9 -0
  78. metadata +178 -0
@@ -0,0 +1,35 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'yaml'
4
+ require 'rsruby'
5
+ require 'gsl'
6
+
7
+ include GSL
8
+
9
# Usage: xmeans.rb BASENAME
#
# Clusters BASENAME.csv with the x-means implementation in ./xmeans.R (run
# through RSRuby), then loads the same points from BASENAME.txt (YAML) and
# prints, for every cluster, the center reported by R followed by the
# per-cluster statistics computed here.
basename = ARGV[0]
if basename.nil?
  # Without an argument the script would silently look for ".csv"/".txt".
  abort "usage: #{$0} BASENAME (expects BASENAME.csv and BASENAME.txt)"
end

r = RSRuby.instance
# NOTE(review): basename is interpolated into R source without escaping;
# only run this tool with trusted file names.
c = r.eval_R(<<-RCOMMAND)
a <- read.csv('#{basename}.csv')
source('./xmeans.R')
xmeans(a,2,20)
RCOMMAND

data_array = YAML.load_file("#{basename}.txt").map {|v| Vector[v] }
dim = data_array[0].size

# One accumulator per cluster reported by R.
cluster = Array.new(c["size"].size) do
  {"mu_sum" => 0.0, "sigma_sum" => Matrix.alloc(dim, dim)}
end

# c["cluster"] holds one label per datum; labels are used as 1-based indices.
c["cluster"].each_with_index do |num, di|
  datum = data_array[di]
  cluster[num - 1]["mu_sum"] += datum
  # presumably the outer product (column * row) of the datum -- confirm
  # against the GSL vector semantics.
  cluster[num - 1]["sigma_sum"] += datum.trans * datum
end

c["size"].each_with_index do |size, num|
  cluster[num]["mu"] = cluster[num]["mu_sum"] / size
  # NOTE(review): this is the mean of the accumulated products without
  # subtracting the mean, i.e. an uncentered second moment. Confirm whether
  # a covariance matrix was intended.
  cluster[num]["sigma"] = cluster[num]["sigma_sum"] / size
end

# Centers as reported by R.
c["centers"].each_with_index do |cen, ci|
  puts "### cluster #{ci}"
  p cen
end

# Statistics recomputed from the raw data.
cluster.each_with_index do |clu, ci|
  puts "### cluster #{ci}"
  p clu["mu"].to_a
  p clu["sigma"].to_a
end
@@ -0,0 +1,116 @@
1
+ require 'em_algorithm/models/model'
2
+ require 'em_algorithm/models/gaussian'
3
+ require 'em_algorithm/models/md_gaussian'
4
+ require 'em_algorithm/models/mixture'
5
+ require 'em_algorithm/convergence/check_method'
6
+ require 'em_algorithm/convergence/likelihood'
7
+ require 'em_algorithm/convergence/chi_square'
8
+
9
+ module EMAlgorithm
10
+ include Math
11
+ include GSL
12
+
13
+ MAX_ITERATION = 10000
14
+
15
# Drives the EM iteration (E-step followed by M-step) for a model until the
# Likelihood convergence check succeeds or MAX_ITERATION steps have run.
#
# * Model limitation
#   Currently only the Gaussian mixture model is supported.
#   To fit a single Gaussian, use a mixture model that has exactly one
#   Gaussian entry with weight 1.0.
#
# * Input data format
#   Either a probability distribution or a target value distribution can be
#   estimated. To estimate a target value distribution
#   (:value_distribution_estimation => true), every input vector x must carry
#   the target value and its corresponding area size as trailing entries:
#     x[-2]: target value
#     x[-1]: corresponding area size
class Base

  attr_accessor :model, :original_data_array, :data_array, :likelihood, :num_step, :const

  # options (every key is optional):
  #   :model      - model to fit; defaults to a mixture holding one
  #                 Gaussian.new(0.0, 9.0) with weight 1.0
  #   :data_array - input data; defaults to []
  #   :value_distribution_estimation
  #               - when true, the input is expanded with value_to_frequency
  #                 before fitting; defaults to false
  #   :debug      - when true (the default), progress is written to $stderr
  def initialize(options = {})
    opts = {
      :data_array => [],
      :value_distribution_estimation => false,
      :debug => true
    }.merge(options)
    # Build the default model only when the caller did not supply one.
    @model = opts.fetch(:model) do
      Mixture.new(:models => [Gaussian.new(0.0, 9.0)], :weights => [1.0])
    end
    @original_data_array = opts[:data_array]
    @value_distribution_estimation = opts[:value_distribution_estimation]
    @data_array =
      if @value_distribution_estimation
        value_to_frequency(@original_data_array)
      else
        @original_data_array
      end
    @likelihood = Likelihood.new(@data_array)
    @debug = opts[:debug]
    @const = 1.0
  end

  # E-step: calculate @posterior_data_array and hand it to the model.
  def estep
    @model.clear_temp_weight_per_datum!
    @posterior_data_array = @model.calculate_posterior_data_array(@data_array)
    @model.update_temp_weights!(@data_array, @posterior_data_array)
  end

  # M-step: calculate the posterior model parameters.
  def mstep
    $stderr.puts @model.inspect if @debug
    @model.update_parameters!(@data_array)
  end

  # Iterates at most MAX_ITERATION times and returns the model.
  #
  # The convergence check runs before each E/M step. When it succeeds,
  # num_step is set to the current step index and, for value distribution
  # estimation, const is set to distribution_to_value_ratio. If the check
  # never succeeds, num_step and const keep their previous values.
  def run!
    MAX_ITERATION.times do |i|
      $stderr.puts "step#{i}" if @debug
      # check convergence
      @likelihood.calculate(@model)
      $stderr.puts @likelihood.debug_output if @debug
      if @likelihood.converged?
        @num_step = i
        @const = distribution_to_value_ratio if @value_distribution_estimation
        break
      end
      if @debug
        $stderr.puts @model.debug_output
        $stderr.puts ""
      end
      estep
      mstep
    end
    @model
  end

  # Expands value-annotated data into frequency data: each datum is repeated
  # round(target value) times with its two trailing entries removed. A datum
  # left with a single coordinate is stored as that bare coordinate.
  def value_to_frequency(data_array)
    modified_data_array = []
    data_array.each do |x|
      repetitions = x[x.size-2].round
      repetitions.times do
        # Sliced per repetition so every stored datum is a distinct object.
        coordinates = x[0..(x.size-3)]
        coordinates = coordinates.first if coordinates.size == 1
        modified_data_array << coordinates
      end
    end
    modified_data_array
  end

  # The probability distribution integrates to 1.0, so the integral of the
  # target value (sum of value * area size over the original data) is the
  # ratio between the target value distribution and the probability
  # distribution.
  def distribution_to_value_ratio
    integrated_value = 0.0
    @original_data_array.each do |x|
      integrated_value += x[x.size-2] * x[x.size-1]
    end
    integrated_value
  end
end
116
+ end
@@ -0,0 +1,116 @@
1
+ require 'em_algorithm/models/model'
2
+ require 'em_algorithm/models/gaussian'
3
+ require 'em_algorithm/models/md_gaussian'
4
+ require 'em_algorithm/models/mixture'
5
+ require 'em_algorithm/convergence/check_method'
6
+ require 'em_algorithm/convergence/likelihood'
7
+ require 'em_algorithm/convergence/chi_square'
8
+
9
+ module EMAlgorithm
10
+ include Math
11
+ include GSL
12
+
13
+ MAX_ITERATION = 10000
14
+
15
# Drives the EM iteration (E-step followed by M-step) for a model until the
# Likelihood convergence check succeeds or MAX_ITERATION steps have run.
#
# * Model limitation
#   Currently only the Gaussian mixture model is supported.
#   To fit a single Gaussian, use a mixture model that has exactly one
#   Gaussian entry with weight 1.0.
#
# * Input data format
#   Either a probability distribution or a target value distribution can be
#   estimated. To estimate a target value distribution
#   (:value_distribution_estimation => true), every input vector x must carry
#   the target value and its corresponding area size as trailing entries:
#     x[-2]: target value
#     x[-1]: corresponding area size
class Base

  attr_accessor :model, :original_data_array, :data_array, :likelihood, :num_step, :const

  # options (every key is optional):
  #   :model      - model to fit; defaults to a mixture holding one
  #                 Gaussian.new(0.0, 9.0) with weight 1.0
  #   :data_array - input data; defaults to []
  #   :value_distribution_estimation
  #               - when true, the input is expanded with value_to_frequency
  #                 before fitting; defaults to false
  #   :debug      - when true (the default), progress is written to $stderr
  def initialize(options = {})
    opts = {
      :data_array => [],
      :value_distribution_estimation => false,
      :debug => true
    }.merge(options)
    # Build the default model only when the caller did not supply one.
    @model = opts.fetch(:model) do
      Mixture.new(:models => [Gaussian.new(0.0, 9.0)], :weights => [1.0])
    end
    @original_data_array = opts[:data_array]
    @value_distribution_estimation = opts[:value_distribution_estimation]
    @data_array =
      if @value_distribution_estimation
        value_to_frequency(@original_data_array)
      else
        @original_data_array
      end
    @likelihood = Likelihood.new(@data_array)
    @debug = opts[:debug]
    @const = 1.0
  end

  # E-step: calculate @posterior_data_array and hand it to the model.
  def estep
    @model.clear_temp_weight_per_datum!
    @posterior_data_array = @model.calculate_posterior_data_array(@data_array)
    @model.update_temp_weights!(@data_array, @posterior_data_array)
  end

  # M-step: calculate the posterior model parameters.
  def mstep
    $stderr.puts @model.inspect if @debug
    @model.update_parameters!(@data_array)
  end

  # Iterates at most MAX_ITERATION times and returns the model.
  #
  # The convergence check runs before each E/M step. When it succeeds,
  # num_step is set to the current step index and, for value distribution
  # estimation, const is set to distribution_to_value_ratio. If the check
  # never succeeds, num_step and const keep their previous values.
  def run!
    MAX_ITERATION.times do |i|
      $stderr.puts "step#{i}" if @debug
      # check convergence
      @likelihood.calculate(@model)
      $stderr.puts @likelihood.debug_output if @debug
      if @likelihood.converged?
        @num_step = i
        @const = distribution_to_value_ratio if @value_distribution_estimation
        break
      end
      if @debug
        $stderr.puts @model.debug_output
        $stderr.puts ""
      end
      estep
      mstep
    end
    @model
  end

  # Expands value-annotated data into frequency data: each datum is repeated
  # round(target value) times with its two trailing entries removed. A datum
  # left with a single coordinate is stored as that bare coordinate.
  def value_to_frequency(data_array)
    modified_data_array = []
    data_array.each do |x|
      repetitions = x[x.size-2].round
      repetitions.times do
        # Sliced per repetition so every stored datum is a distinct object.
        coordinates = x[0..(x.size-3)]
        coordinates = coordinates.first if coordinates.size == 1
        modified_data_array << coordinates
      end
    end
    modified_data_array
  end

  # The probability distribution integrates to 1.0, so the integral of the
  # target value (sum of value * area size over the original data) is the
  # ratio between the target value distribution and the probability
  # distribution.
  def distribution_to_value_ratio
    integrated_value = 0.0
    @original_data_array.each do |x|
      integrated_value += x[x.size-2] * x[x.size-1]
    end
    integrated_value
  end
end
116
+ end
@@ -0,0 +1,4 @@
1
module EMAlgorithm
  # Common superclass of the convergence checks (e.g. ChiSquare, Likelihood).
  # It defines no behavior of its own.
  class CheckMethod; end
end
@@ -0,0 +1,40 @@
1
module EMAlgorithm
  # Convergence check based on a chi-square statistic between observed
  # values and the values predicted by the model.
  #
  # Each datum x is expected to carry the observed value as its last entry;
  # the preceding entries are passed to model.pdf.
  class ChiSquare < CheckMethod
    # The check succeeds when the latest statistic drops below this value ...
    STAT_THRESHOLD = 0.05
    # ... or when it changed by less than this since the previous step.
    CONV_THRESHOLD = 0.01

    attr_accessor :history

    def initialize(data_array)
      @data_array = data_array
      @history = []
    end

    # Calculates the chi-square statistic for +model+, appends it to the
    # history and returns it. +const+ scales the model's pdf to the observed
    # values. Data whose observed value is <= 1.0 are skipped.
    def calculate(model, const)
      chi_square = 0.0
      @data_array.each do |x|
        value = x[x.size-1]
        # Skip before evaluating the pdf so no work is spent on ignored data.
        next if value <= 1.0
        estimated = const * model.pdf(x[0..(x.size-2)])
        chi_square += (value - estimated)**2 / estimated
      end
      @history << chi_square
      chi_square
    end

    # Most recently calculated statistic (nil before the first calculate).
    def value
      @history.last
    end

    # True when the latest statistic is below STAT_THRESHOLD or differs from
    # the previous one by less than CONV_THRESHOLD. Always false until at
    # least two statistics have been recorded.
    def converged?
      return false if @history.length < 2
      latest, previous = @history[-1], @history[-2]
      (latest < STAT_THRESHOLD) || ((latest - previous).abs < CONV_THRESHOLD)
    end

    # Returns the debug line; like Likelihood#debug_output, printing is left
    # to the caller.
    def debug_output
      "ChiSquare: #{value}"
    end
  end
end
@@ -0,0 +1,40 @@
1
module EMAlgorithm
  # Convergence check based on a chi-square statistic between observed
  # values and the values predicted by the model.
  #
  # Each datum x is expected to carry the observed value as its last entry;
  # the preceding entries are passed to model.pdf.
  class ChiSquare < CheckMethod
    # The check succeeds when the latest statistic drops below this value ...
    STAT_THRESHOLD = 0.05
    # ... or when it changed by less than this since the previous step.
    CONV_THRESHOLD = 0.01

    attr_accessor :history

    def initialize(data_array)
      @data_array = data_array
      @history = []
    end

    # Calculates the chi-square statistic for +model+, appends it to the
    # history and returns it. +const+ scales the model's pdf to the observed
    # values. Data whose observed value is <= 1.0 are skipped.
    def calculate(model, const)
      chi_square = 0.0
      @data_array.each do |x|
        value = x[x.size-1]
        # Skip before evaluating the pdf so no work is spent on ignored data.
        next if value <= 1.0
        estimated = const * model.pdf(x[0..(x.size-2)])
        chi_square += (value - estimated)**2 / estimated
      end
      @history << chi_square
      chi_square
    end

    # Most recently calculated statistic (nil before the first calculate).
    def value
      @history.last
    end

    # True when the latest statistic is below STAT_THRESHOLD or differs from
    # the previous one by less than CONV_THRESHOLD. Always false until at
    # least two statistics have been recorded.
    def converged?
      return false if @history.length < 2
      latest, previous = @history[-1], @history[-2]
      (latest < STAT_THRESHOLD) || ((latest - previous).abs < CONV_THRESHOLD)
    end

    # Returns the debug line; like Likelihood#debug_output, printing is left
    # to the caller.
    def debug_output
      "ChiSquare: #{value}"
    end
  end
end
@@ -0,0 +1,35 @@
1
module EMAlgorithm
  # Convergence check based on the log likelihood of the data under the
  # model: the fit counts as converged once the log likelihood changes by
  # less than THRESHOLD between two consecutive calculations.
  class Likelihood < CheckMethod
    #THRESHOLD = 0.0001
    THRESHOLD = 0.01

    attr_accessor :history

    def initialize(data_array)
      @data_array = data_array
      @history = []
    end

    # Calculates the log likelihood (sum of log(model.pdf(x)) over the data),
    # appends it to the history and returns it.
    def calculate(model)
      # Math.log is called explicitly: including Math in the enclosing module
      # does not make a bare `log` available inside this class.
      total = @data_array.inject(0.0) do |sum, x|
        sum + Math.log(model.pdf(x))
      end
      @history << total
      total
    end

    # Most recently calculated log likelihood (nil before the first calculate).
    def value
      @history.last
    end

    # True when the last two log likelihoods differ by less than THRESHOLD.
    # Always false until at least two values have been recorded.
    def converged?
      return false if @history.length < 2
      (@history[-1] - @history[-2]).abs < THRESHOLD
    end

    # Returns the debug line; printing is left to the caller.
    def debug_output
      "Likelihood: #{value}"
    end
  end
end
@@ -0,0 +1,35 @@
1
module EMAlgorithm
  # Convergence check based on the log likelihood of the data under the
  # model: the fit counts as converged once the log likelihood changes by
  # less than THRESHOLD between two consecutive calculations.
  class Likelihood < CheckMethod
    #THRESHOLD = 0.0001
    THRESHOLD = 0.01

    attr_accessor :history

    def initialize(data_array)
      @data_array = data_array
      @history = []
    end

    # Calculates the log likelihood (sum of log(model.pdf(x)) over the data),
    # appends it to the history and returns it.
    def calculate(model)
      # Math.log is called explicitly: including Math in the enclosing module
      # does not make a bare `log` available inside this class.
      total = @data_array.inject(0.0) do |sum, x|
        sum + Math.log(model.pdf(x))
      end
      @history << total
      total
    end

    # Most recently calculated log likelihood (nil before the first calculate).
    def value
      @history.last
    end

    # True when the last two log likelihoods differ by less than THRESHOLD.
    # Always false until at least two values have been recorded.
    def converged?
      return false if @history.length < 2
      (@history[-1] - @history[-2]).abs < THRESHOLD
    end

    # Returns the debug line; printing is left to the caller.
    def debug_output
      "Likelihood: #{value}"
    end
  end
end