rubst 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,58 @@
1
+ require "rubygems"
2
+ require "set"
3
+ require "narray"
4
+ require "facets"
5
+
6
# A single structural environment together with the per-residue arrays
# (raw counts, probabilities, log-odds and smoothed probabilities) that are
# accumulated for it.
class Environment

  # One-letter residue codes. The position of a code in this list is the
  # index used for that residue in every array held by an Environment.
  AMINO_ACIDS = "ACDEFGHIKLMNPQRSTVWYJ".split("").freeze

  attr_accessor :number,
                :label,
                :freq_array,
                :prob_array,
                :logodd_array,
                :smooth_prob_array

  # number - Identifier of the environment (used by #to_s).
  # label  - String with one character per environment feature.
  def initialize(number, label)
    size = AMINO_ACIDS.size

    @number            = number
    @label             = label
    @freq_array        = NArray.int(size)
    @prob_array        = NArray.float(size)
    @logodd_array      = NArray.float(size)
    @smooth_prob_array = NArray.float(size)
  end

  # Adds +inc+ to the count of residue +a+ (matched case-insensitively).
  #
  # Raises ArgumentError when +a+ is not one of AMINO_ACIDS; previously the
  # failed lookup produced a nil index and an unrelated error from the
  # array itself.
  #
  # Returns the updated count.
  def add_residue_count(a, inc = 1)
    index = AMINO_ACIDS.index(a.to_s.upcase)
    raise ArgumentError, "unknown amino acid: #{a.inspect}" if index.nil?

    @freq_array[index] += inc
  end

  # Returns a Set with one "<position><character>" string for every
  # character of the label, e.g. "AH" => #<Set: {"0A", "1H"}>.
  def label_set
    label.split("").each_with_index.map { |l, i| "#{i}#{l}" }.to_set
  end

  # Returns "<number>-<label>".
  def to_s
    "#{number}-#{label}"
  end
end
38
+
39
# Self-test, executed only when this file is run directly as a script.
if __FILE__ == $0

  require "test/unit"

  # Unit tests for Environment#label_set and Environment#to_s.
  class TestEnvironment < Test::Unit::TestCase

    def setup
      @env = Environment.new(1, "AHaSon")
    end

    def test_label_set
      expected = Set.new(%w[0A 1H 2a 3S 4o 5n])
      assert_equal(expected, @env.label_set)
    end

    def test_to_s
      expected = "1-AHaSon"
      assert_equal(expected, @env.to_s)
    end

  end
end
@@ -0,0 +1,14 @@
1
+ class EnvironmentFeature < Struct.new(:name, :symbols, :labels, :constrained, :silent)
2
+
3
+ def to_s
4
+ values.join(";")
5
+ end
6
+
7
+ def constrained?
8
+ constrained == "T"
9
+ end
10
+
11
+ def silent?
12
+ silent == "T"
13
+ end
14
+ end
@@ -0,0 +1,21 @@
1
+ require "rubygems"
2
+ require "facets"
3
+
4
# Fixed-width text rendering for one-dimensional arrays. Works on any
# receiver whose #to_a yields a flat list of numbers.
module NArrayExtensions

  # Renders the receiver as a two-line table: a header line followed by a
  # single row of values.
  #
  # opts - Hash of options:
  #        :col_header - list of column titles (default: none)
  #        :row_header - title printed in front of the values (default: none)
  #
  # Floats are printed with two decimals, everything else as an integer;
  # each cell is 7 characters wide, the leading title column 3.
  #
  # Returns the table as a String without a trailing newline.
  def pretty_string(opts={})
    # The merged hash must be kept: the original discarded it, so the
    # defaults never applied and a missing :col_header raised on nil.
    opts = { :col_header => nil,
             :row_header => nil }.merge(opts)

    header = Array(opts[:col_header]).inject("%-3s" % "#") do |line, title|
      line + ("%7s" % title)
    end

    values = to_a.inject("%-3s" % opts[:row_header]) do |line, v|
      line + (v.is_a?(Float) ? "%7.2f" % v : "%7d" % v)
    end

    header + "\n" + values
  end
end
20
+
21
+ NArray.send(:include, NArrayExtensions)
@@ -0,0 +1,26 @@
1
+ require "rubygems"
2
+ require "facets"
3
+ require "narray"
4
+
5
# Fixed-width text rendering for two-dimensional arrays. Works on any
# receiver whose #to_a yields a list of rows, each a flat list of numbers.
module NMatrixExtensions

  # Renders the receiver as a table with one header line and one line per
  # row.
  #
  # opts - Hash of options:
  #        :col_header - list of column titles (default: none)
  #        :row_header - list of row titles, looked up by row index
  #                      (default: none)
  #
  # Floats are printed with two decimals, everything else as an integer;
  # each cell is 7 characters wide, the leading title column 3.
  #
  # Returns the table as a String without a trailing newline.
  def pretty_string(opts={})
    # The merged hash must be kept: the original discarded it, so the
    # defaults never applied and missing headers raised on nil.
    opts = { :col_header => nil,
             :row_header => nil }.merge(opts)

    col_header = Array(opts[:col_header])
    row_header = Array(opts[:row_header])

    header = col_header.inject("%-3s" % "#") do |line, title|
      line + ("%7s" % title)
    end

    rows = to_a.each_with_index.map { |row, i|
      row.inject("%-3s" % row_header[i]) { |line, v|
        line + (v.is_a?(Float) ? "%7.2f" % v : "%7d" % v)
      }
    }

    header + "\n" + rows.join("\n")
  end
end
25
+
26
+ NMatrix.send(:include, NMatrixExtensions)
@@ -0,0 +1,6 @@
1
# Put this file's directory at the front of the load path unless it is
# already listed, either as given or in expanded (absolute) form.
lib_dir = File.dirname(__FILE__)
unless $:.include?(lib_dir) || $:.include?(File.expand_path(lib_dir))
  $:.unshift(lib_dir)
end

# Top-level namespace of the rubst gem.
module Rubst
  # Version string of the gem.
  VERSION = '0.0.1'
end
@@ -0,0 +1,536 @@
1
+ require "getoptlong"
2
+ require "rdoc/usage"
3
+ require "logger"
4
+ require "rubygems"
5
+ require "narray"
6
+ require "bio"
7
+ require "set"
8
+ require "facets"
9
+
10
+ require "narray_extensions"
11
+ require "nmatrix_extensions"
12
+ require "enumerable_extensions"
13
+ require "environment_feature"
14
+ require "environment"
15
+
16
+ module Rubst
17
+ class CLI
18
+
19
# Writes Rubst::VERSION, followed by a newline, to standard output.
def self.print_version
  $stdout.puts(Rubst::VERSION)
end
22
+
23
# Prints the command line help text to standard output.
#
# The text lists every planned option; only the ones declared in
# CLI.execute are actually accepted by the option parser.
def self.print_usage
  puts <<-USAGE

rubst [ options ] -f TEM-file
    or
rubst [ options ] -l TEMLIST-file

Available options:

    -h, --help
        show help
    -f, --tem-file FILE
        a tem file
    -l, --tem-list FILE
        a list for tem files
    -o, --outfile FILE
        output filename ("allmat.dat" if not specified)
    --weight INTEGER (PID)
        clustering level (PID) for the BLOSUM-like weighting (not supported yet)
    --noweight
        calculate substitution counts with no weights (default)
    -c, --classdef FILE
        a file for the definition of environments (default: 'classdef.dat')
    -y, --cys INTEGER
        0 for using C and J only for structure
        1 for both structure and sequence (default)
    --output INTEGER
        0 for raw counts (no-smoothing performed)
        1 for probabilities
        2 for log-odds (default)
    --scale INTEGER
        log-odds matrices in 1/n bit units (default 3)
    --sigma DOUBLE
        change the sigma value for smoothing (default 5)
    --add DOUBLE
        add this value to raw counts when deriving log-odds without smoothing
        (default 1/#classes)
    --penv
        use environment-dependent frequencies for log-odds calculation (default false)
    --pidmin DOUBLE
        count substitutions only for pairs with PID equal to or greater than this value (default none)
    --pidmax DOUBLE
        count substitutions only for pairs with PID smaller than this value (default none)
    --analysis
        analyze structural environments (also -D)
    -v, --verbose INTEGER
        0 for WARN level (default)
        1 for INFO level or more severe
        2 for DEBUG level or more severe
    --version
        print version

  USAGE
end
77
+
78
# Entry point of the rubst command line tool.
#
# Parses the command line, reads the environment class definition file and
# the TEM file(s), counts residue substitutions per structural environment,
# smooths the resulting probabilities and prints one matrix per environment
# group plus their average to standard output.
#
# arguments - Kept for interface compatibility but not read: GetoptLong
#             always parses the process-wide ARGV.
#
# Returns nil. Exits with status 0 after --help / --version and with
# status 1 on a command line usage error.
#
# Abbreviations in the source codes
#
# * env: environment
# * tem: (FUGUE) template
# * classdef: (environment) class definition
# * aa: amino acid
# * tot: total
# * rel: relative
# * obs: observation (frequency)
# * mut: mutation
# * mutb: mutability
# * freq: frequency
# * prob: probability
# * opts: options
def self.execute(arguments=[])
  reset_state
  parse_options
  load_env_features
  build_envs

  # Both -l (list of TEM files) and -f (single TEM file) are processed;
  # previously -f was accepted but nothing was computed for it.
  tem_file_names.each { |tem_file| count_substitutions(tem_file) }

  build_freq_matrices
  calculate_mutabilities
  smooth_probabilities
  print_smooth_prob_matrices
end

# Internal: (re)initializes the global state shared by the processing
# steps, including the defaults documented in the usage text.
def self.reset_state
  $logger       = Logger.new(STDOUT)
  $logger.level = Logger::WARN          # documented default for --verbose
  $amino_acids  = "ACDEFGHIKLMNPQRSTVWYJ".split("")
  $tem_list     = nil
  $tem_file     = nil
  $classdef     = 'classdef.dat'        # documented default for --classdef
  $outfile      = 'allmat.dat'          # documented default for --outfile
  $format       = 2                     # documented default for --output
  $cysteine     = false                 # value --cys 1 (documented default) maps to
  $aa_tot_obs   = Hash.new(0)           # counters: a missing residue reads as 0
  $aa_mut_obs   = Hash.new(0)
  $aa_mutb      = {}
  $aa_rel_mutb  = {}
  $aa_rel_freq  = {}
  $tot_aa       = 0
  $sigma        = 5.0
  $smooth_prob  = {}
end

# Internal: parses ARGV into the option globals. Prints the usage text and
# exits with status 1 unless exactly one of --tem-list / --tem-file was
# given and no stray arguments remain.
def self.parse_options
  opts = GetoptLong.new(
    [ '--help',     '-h', GetoptLong::NO_ARGUMENT ],
    [ '--tem-list', '-l', GetoptLong::REQUIRED_ARGUMENT ],
    [ '--tem-file', '-f', GetoptLong::REQUIRED_ARGUMENT ],
    [ '--classdef', '-c', GetoptLong::REQUIRED_ARGUMENT ],
    [ '--output',         GetoptLong::REQUIRED_ARGUMENT ],
    [ '--cys',      '-y', GetoptLong::REQUIRED_ARGUMENT ],
    [ '--outfile',  '-o', GetoptLong::REQUIRED_ARGUMENT ],
    [ '--verbose',  '-v', GetoptLong::REQUIRED_ARGUMENT ],
    [ '--version',        GetoptLong::NO_ARGUMENT ]
  )

  opts.each do |opt, arg|
    case opt
    when '--help'
      print_usage
      exit 0
    when '--version'
      print_version
      exit 0
    when '--tem-list'
      $tem_list = arg
    when '--tem-file'
      $tem_file = arg
    when '--classdef'
      $classdef = arg
    when '--output'
      $format = arg.to_i
    when '--outfile'
      $outfile = arg
    when '--cys'
      # NOTE(review): mapping kept from the original ('--cyc' branch, which
      # could never match); $cysteine is not read anywhere in this file.
      $cysteine = (arg.to_i == 1 ? false : true)
    when '--verbose'
      $logger.level = case arg.to_i
                      when 0 then Logger::WARN
                      when 1 then Logger::INFO
                      when 2 then Logger::DEBUG
                      else Logger::INFO
                      end
    end
  end

  if !ARGV.empty? || (!$tem_list && !$tem_file) || ($tem_list && $tem_file)
    print_usage
    exit 1
  end
end

# Internal: fills $env_features with the built-in "sequence" feature plus
# every non-silent feature found in the class definition file. Lines
# starting with "#" and lines without exactly five ";"-separated fields are
# ignored.
def self.load_env_features
  $env_features = [EnvironmentFeature.new("sequence",
                                          $amino_acids,
                                          $amino_acids,
                                          "F",
                                          "F")]

  IO.foreach($classdef) do |line|
    next if line =~ /^#/

    fields = line.chomp.split(/;/)
    next unless fields.length == 5

    $logger.info ">>> An environment feature, #{line.chomp} detected"
    if fields[-1] == "T"
      $logger.warn "!!! The environment feature, #{line.chomp} silent"
      next
    end
    if fields[-2] == "T"
      $logger.warn "!!! The environment feature, #{line.chomp} constrained"
    end

    $env_features << EnvironmentFeature.new(fields[0],
                                            fields[1].split(""),
                                            fields[2].split(""),
                                            fields[3],
                                            fields[4])
  end
end

# Internal: fills $envs with one Environment per combination of feature
# labels, keyed by the concatenated labels and numbered in generation order.
def self.build_envs
  $envs = {}
  label_lists = $env_features.map { |feature| feature.labels }

  # Array#product with several lists yields flat tuples even when only the
  # "sequence" feature exists; the nested pairwise product used before
  # yielded bare Strings in that case and failed on String#flatten.
  label_lists.first.product(*label_lists[1..-1]).each_with_index do |combo, i|
    label = combo.join
    $envs[label] = Environment.new(i, label)
  end
end

# Internal: returns the TEM file names to process: the non-empty lines of
# the list file when --tem-list was given, otherwise the single --tem-file.
def self.tem_file_names
  return [$tem_file] unless $tem_list

  IO.readlines($tem_list).map { |line| line.chomp }.reject { |name| name.empty? }
end

# Internal: true when +residue+ is "C" and the disulphide entry of sequence
# +key+ carries "F" at +pos+ (such residues are recoded as "J").
# NOTE(review): "F" presumably marks a cysteine without a disulphide bond -
# confirm against the TEM format.
#
# Raises RuntimeError when the TEM file has no disulphide entry for +key+;
# previously this surfaced as a NoMethodError on nil.
def self.free_cysteine?(disulphide, key, pos, residue)
  flags = disulphide[key]
  raise "No disulphide entry found for #{key}" if flags.nil?

  residue == "C" && flags[pos] == "F"
end

# Internal: reads one TEM file.
#
# Returns [alignment, env_labels, disulphide] where env_labels maps each
# sequence id to one environment label per alignment column and disulphide
# maps each sequence id to the characters of its "disulphide" entry.
def self.read_tem_file(tem_file)
  ali        = Bio::Alignment::OriginalAlignment.new
  env_labels = {}
  disulphide = {}
  ff         = Bio::FlatFile.auto(tem_file)

  begin
    ff.each_entry do |pir|
      ali.add_seq(pir.data.gsub("\n", ""), pir.entry_id) if pir.definition == "sequence"
    end

    ali.each_pair do |key, _seq|
      # check disulphide bond environment first!
      ff.rewind
      ff.each_entry do |pir|
        if pir.entry_id == key && pir.definition == "disulphide"
          disulphide[key] = pir.data.gsub("\n", "").split("")
        end
      end

      $env_features.each_with_index do |ec, ei|
        env_labels[key] ||= []

        ff.rewind
        ff.each_entry do |pir|
          next unless pir.entry_id == key && pir.definition == ec.name

          labels = pir.data.gsub("\n", "").split("").each_with_index.map { |sym, pos|
            if sym == "-"
              "-"
            elsif sym == "X" || sym == "x"
              "X"
            elsif ei == 0 # Amino Acid Environment Feature
              free_cysteine?(disulphide, key, pos, sym) ? "J" : sym
            else
              ec.labels[ec.symbols.index(sym)]
            end
          }

          if env_labels[key].empty?
            env_labels[key] = labels
          else
            # append this feature's label to the label built so far
            env_labels[key].each_with_index { |e, i| env_labels[key][i] = e + labels[i] }
          end
        end
      end
    end
  ensure
    ff.close # the handle used to be leaked for every TEM file
  end

  [ali, env_labels, disulphide]
end

# Internal: counts, for every ordered pair of distinct sequences in one TEM
# file, the substitutions source -> target per alignment column. Updates the
# Environment counts in $envs and the totals in $aa_tot_obs / $aa_mut_obs.
def self.count_substitutions(tem_file)
  $logger.debug ">>> Processing #{tem_file} ..."

  ali, env_labels, disulphide = read_tem_file(tem_file)

  ali.each_pair do |key1, seq1|
    ali.each_pair do |key2, seq2|
      next if key1 == key2

      s1 = seq1.split("")
      s2 = seq2.split("")

      s1.each_with_index do |source, pos|
        if env_labels[key1][pos].include?("X")
          $logger.info ">>> Substitutions from #{key1}-#{pos}-#{source} were masked"
          next
        end

        source = source.upcase
        target = s2[pos].upcase

        unless $amino_acids.include?(source)
          $logger.info "!!! #{key1}-#{pos}-#{source} is not standard amino acid"
          next
        end

        unless $amino_acids.include?(target)
          $logger.info "!!! #{key1}-#{pos}-#{target} is not standard amino acid"
          next
        end

        source = "J" if free_cysteine?(disulphide, key1, pos, source)
        target = "J" if free_cysteine?(disulphide, key2, pos, target)

        $envs[env_labels[key1][pos]].add_residue_count(target)

        $aa_tot_obs[source] += 1
        $aa_mut_obs[source] += 1 if source != target

        $logger.debug ">>> Add #{key1}-#{pos}-#{source} -> #{key2}-#{pos}-#{target} substituion for #{env_labels[key1][pos]}"
      end
    end
  end
end

# Internal: builds, for every group of environments that share all labels
# but the first (residue) one, a matrix whose line +ai+ holds the counts of
# the environment starting with amino acid +ai+, and sums these matrices
# into $tot_freq_matrix. $grp_freq_matrix is left holding the last group.
def self.build_freq_matrices
  aa_count = $amino_acids.size
  $tot_freq_matrix = NMatrix.int(aa_count, aa_count)

  env_groups = $envs.values.group_by { |env| env.label[1..-1] }
  env_groups.each_pair do |_label, group|
    $grp_freq_matrix = NMatrix.int(aa_count, aa_count)

    $amino_acids.each_with_index do |aa, ai|
      freq_array = group.find { |e| e.label.start_with?(aa) }.freq_array
      0.upto($grp_freq_matrix.shape[1] - 1) do |j|
        $grp_freq_matrix[ai, j] = freq_array[j]
      end
    end

    $tot_freq_matrix += $grp_freq_matrix
  end
end

# Internal: derives $tot_aa and, for every observed residue, its mutability,
# its mutability scaled by the alanine factor and its relative frequency.
def self.calculate_mutabilities
  # Float division: Infinity or NaN (not an exception) when alanine was
  # never seen mutating.
  ala_factor = 100 * $aa_tot_obs["A"] / $aa_mut_obs["A"].to_f
  $tot_aa    = $aa_tot_obs.values.inject(0) { |sum, freq| sum + freq }

  $aa_tot_obs.each_pair do |res, freq|
    # $aa_mut_obs reads as 0 for a residue that never mutated
    $aa_mutb[res]     = $aa_mut_obs[res] / freq.to_f
    $aa_rel_mutb[res] = $aa_mutb[res] * ala_factor
    $aa_rel_freq[res] = freq / $tot_aa.to_f
  end
end

# Internal: combines several prior distributions into one, weighting each
# prior by (entropy_max + sum(p * log p)) / entropy_max.
# NOTE(review): the priors passed in sum to 100, not 1, so the weight is not
# a true normalized entropy - confirm the intended scale.
def self.weighted_prior_average(priors, aa_count)
  entropy_max = Math.log(aa_count)

  weights = priors.map do |prior|
    # a zero entry contributes nothing (limit of p * log p) instead of NaN
    plogp = prior.to_a.inject(0.0) { |sum, value| value > 0 ? sum + value * Math.log(value) : sum }
    (entropy_max + plogp) / entropy_max
  end
  weight_sum = weights.inject(0.0) { |sum, weight| sum + weight }

  terms = priors.each_with_index.map { |prior, i| prior * weights[i] / weight_sum }
  terms.inject { |sum, term| sum + term }
end

# Internal: fills $smooth_prob. Level 1 is the background distribution;
# level ci + 1 holds, per Set of "<feature index><label>" strings, the
# distribution smoothed for every combination of ci environment features,
# using the level ci results of its sub-combinations as priors.
def self.smooth_probabilities
  aa_count = $amino_acids.size

  #
  # p1 probability
  #
  uniform = 1.0 / aa_count
  omega1  = 1.0 / (1 + $tot_aa.to_f / ($sigma * aa_count.to_f))
  omega2  = 1.0 - omega1
  p1      = NArray.float(aa_count)

  $amino_acids.each_with_index do |aa, i|
    # residues that were never observed have relative frequency 0
    p1[i] = 100.0 * (omega1 * uniform + omega2 * $aa_rel_freq.fetch(aa, 0.0))
  end
  $smooth_prob[1] = p1

  #
  # p2 and above
  #
  feature_labels = $env_features.each_with_index.map { |ef, ei| ef.labels.map { |l| "#{ei}#{l}" } }

  1.upto($env_features.size) do |ci|
    feature_labels.combination(ci) do |c1|
      Enumerable.cart_prod(*c1).each do |labels|
        # "." matches any label; fixed positions come from "<index><label>".
        # Only single-digit feature indices are supported by this encoding.
        pattern = "." * $env_features.size
        labels.each { |label| pattern[label[0, 1].to_i] = label[1, 1] }

        # get environments, frequencies, and probabilities
        envs     = $envs.values.select { |env| env.label.match(pattern.to_re) }
        freq_arr = envs.inject(NArray.int(aa_count)) { |sum, env| sum + env.freq_array }
        freq_sum = freq_arr.sum.to_f

        prob_arr = NArray.float(aa_count)
        0.upto(aa_count - 1) { |i| prob_arr[i] = freq_arr[i] == 0 ? 0 : freq_arr[i] / freq_sum }

        # collect priors
        priors = if ci > 1
                   labels.combination(ci - 1).map { |c2| $smooth_prob[ci][c2.to_set] }
                 else
                   [$smooth_prob[1]]
                 end

        # entropy based weighting priors
        pri_avg = weighted_prior_average(priors, aa_count)

        omega1 = 1.0 / (1 + freq_sum / ($sigma * aa_count.to_f))
        omega2 = 1.0 - omega1

        # smoothing step
        # NOTE(review): pri_avg is on a 0..100 scale while prob_arr is on a
        # 0..1 scale, so the prior weighs 100 times more than omega1
        # suggests; kept as in the original - confirm the intended scale.
        smooth_prob_arr = NArray.float(aa_count)
        0.upto(aa_count - 1) { |i| smooth_prob_arr[i] = 100.0 * (omega1 * pri_avg[i] + omega2 * prob_arr[i]) }

        # normalization step
        smooth_prob_arr_sum = smooth_prob_arr.sum
        0.upto(aa_count - 1) { |i| smooth_prob_arr[i] = 100.0 * (smooth_prob_arr[i] / smooth_prob_arr_sum) }

        # store smoothed probabilities in a hash using a set of environment labels as a key
        $smooth_prob[ci + 1] ||= {}
        $smooth_prob[ci + 1][labels.to_set] = smooth_prob_arr
      end
    end
  end
end

# Internal: assigns every Environment its fully smoothed distribution, then
# prints one matrix per environment group (sorted by group label) followed
# by the average of all group matrices.
def self.print_smooth_prob_matrices
  aa_count = $amino_acids.size

  # summarizing ...
  $envs.values.each { |e| e.smooth_prob_array = $smooth_prob[$env_features.size + 1][e.label_set] }

  tot_smooth_prob_matrix = NMatrix.float(aa_count, aa_count)

  # for each combination of environment features
  env_groups = $envs.values.group_by { |env| env.label[1..-1] }
  env_groups.sort.each do |label, group|
    grp_prob_matrix = NMatrix.float(aa_count, aa_count)

    $amino_acids.each_with_index do |aa, ai|
      smooth_prob_array = group.find { |e| e.label.start_with?(aa) }.smooth_prob_array
      0.upto(aa_count - 1) { |j| grp_prob_matrix[ai, j] = smooth_prob_array[j] }
    end

    tot_smooth_prob_matrix += grp_prob_matrix

    puts ">#{label}"
    puts grp_prob_matrix.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)
  end

  tot_smooth_prob_matrix /= env_groups.size

  # for total
  puts ">Total Probability"
  puts tot_smooth_prob_matrix.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)
end
535
+ end
536
+ end