rubst 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,58 @@
1
+ require "rubygems"
2
+ require "set"
3
+ require "narray"
4
+ require "facets"
5
+
6
# Holds the per-residue counters and derived arrays for one environment,
# identified by a sequence number and a label string.
class Environment

  # The 21 one-letter residue symbols; a symbol's position in this list is
  # its slot in every 21-element array held by an Environment.
  @@amino_acids = "ACDEFGHIKLMNPQRSTVWYJ".split("")

  attr_accessor :number,
                :label,
                :freq_array,
                :prob_array,
                :logodd_array,
                :smooth_prob_array

  # number:: identifier stored as-is and used by #to_s
  # label::  label string; #label_set treats every character as one component
  #
  # All four arrays start as zero-filled 21-element NArrays.
  def initialize(number, label)
    @number            = number
    @label             = label
    @freq_array        = NArray.int(21)
    @prob_array        = NArray.float(21)
    @logodd_array      = NArray.float(21)
    @smooth_prob_array = NArray.float(21)
  end

  # Adds +inc+ to the frequency slot of residue +a+ (case-insensitive).
  #
  # Raises ArgumentError when +a+ is not one of the 21 known symbols; without
  # this check a nil index would be handed to the array.
  def add_residue_count(a, inc = 1)
    slot = @@amino_acids.index(a.upcase)
    raise ArgumentError, "unknown residue symbol: #{a.inspect}" if slot.nil?
    @freq_array[slot] += inc
  end

  # Returns a Set of "<position><character>" strings, one per character of
  # the label (e.g. "AH" => #<Set: {"0A", "1H"}>).
  def label_set
    components = Set.new
    # Core each_with_index instead of the Facets-only map_with_index.
    label.split("").each_with_index { |l, i| components << "#{i}#{l}" }
    components
  end

  # "<number>-<label>"
  def to_s
    "#{number}-#{label}"
  end
end
38
+
39
# Self-test: only runs when this file is executed directly as a script.
if __FILE__ == $0

  require "test/unit"

  # Unit tests for Environment#label_set and Environment#to_s.
  class TestEnvironment < Test::Unit::TestCase

    def setup
      @env = Environment.new(1, "AHaSon")
    end

    # Every label character is paired with its position.
    def test_label_set
      expected = Set.new(["0A", "1H", "2a", "3S", "4o", "5n"])
      assert_equal(expected, @env.label_set)
    end

    # The string form is "<number>-<label>".
    def test_to_s
      expected = "1-AHaSon"
      assert_equal(expected, @env.to_s)
    end

  end
end
@@ -0,0 +1,14 @@
1
# One environment feature: a name, its symbols, its labels, and two "T"/"F"
# flag strings (constrained, silent).
#
# The Struct is assigned to the constant instead of being subclassed inline:
# `class X < Struct.new(...)` creates a fresh anonymous superclass each time
# the file is evaluated, so a second load raises "superclass mismatch".
EnvironmentFeature = Struct.new(:name, :symbols, :labels, :constrained, :silent)

class EnvironmentFeature

  # Serializes the five fields separated by ";".
  #
  # Array-valued fields are collapsed to a single string first; a plain
  # values.join(";") would flatten them and put ";" between every element,
  # making the field boundaries unrecoverable.
  def to_s
    values.map { |v| v.is_a?(Array) ? v.join : v }.join(";")
  end

  # True when the constrained flag is the string "T".
  def constrained?
    constrained == "T"
  end

  # True when the silent flag is the string "T".
  def silent?
    silent == "T"
  end
end
@@ -0,0 +1,21 @@
1
+ require "rubygems"
2
+ require "facets"
3
+
4
# Text-table rendering for flat (one-dimensional) collections; the receiver
# only needs to respond to #to_a.
module NArrayExtensions

  # Renders the receiver as a two-line table: a header line, then one row.
  #
  # opts[:col_header]:: list of column titles, each right-aligned in 7
  #                     columns; nil (the default) yields a header line
  #                     holding only the "#" corner cell
  # opts[:row_header]:: title of the single row, left-aligned in 3 columns
  #
  # Float elements are printed as "%7.2f", everything else as "%7d".
  def pretty_string(opts={})
    # The merged hash must be kept: previously the result of merge! was
    # discarded, so the defaults never applied and a missing :col_header
    # raised NoMethodError on nil.
    opts = { :col_header => nil,
             :row_header => nil }.merge(opts)

    header = Array(opts[:col_header]).inject("%-3s" % "#") { |s, a| s + ("%7s" % a) }

    row = self.to_a.inject("%-3s" % opts[:row_header]) { |s, v|
      if v.is_a? Float
        s + ("%7.2f" % v)
      else
        s + ("%7d" % v)
      end
    }

    header + "\n" + row
  end
end
20
+
21
# Mix the pretty-printing helper into NArray.
NArray.class_eval { include NArrayExtensions }
@@ -0,0 +1,26 @@
1
+ require "rubygems"
2
+ require "facets"
3
+ require "narray"
4
+
5
# Text-table rendering for two-dimensional collections; the receiver only
# needs a #to_a that returns an array of rows.
module NMatrixExtensions

  # Renders the receiver as a table: one header line followed by one line per
  # row of #to_a.
  #
  # opts[:col_header]:: list of column titles, each right-aligned in 7
  #                     columns; nil (the default) yields a header line
  #                     holding only the "#" corner cell
  # opts[:row_header]:: list of row titles indexed by row number, each
  #                     left-aligned in 3 columns; missing titles are blank
  #
  # Float elements are printed as "%7.2f", everything else as "%7d".
  def pretty_string(opts={})
    # Keep the merged hash: the result of merge! used to be thrown away, so a
    # missing header option crashed on nil.
    opts = { :col_header => nil,
             :row_header => nil }.merge(opts)

    row_titles = Array(opts[:row_header])
    header = Array(opts[:col_header]).inject("%-3s" % "#") { |s, a| s + ("%7s" % a) }

    lines = []
    # Core each_with_index instead of the Facets-only map_with_index.
    self.to_a.each_with_index do |a, i|
      lines << a.inject("%-3s" % row_titles[i]) { |s, v|
        if v.is_a? Float
          s + ("%7.2f" % v)
        else
          s + ("%7d" % v)
        end
      }
    end

    header + "\n" + lines.join("\n")
  end
end
25
+
26
# Mix the pretty-printing helper into NMatrix.
NMatrix.class_eval { include NMatrixExtensions }
@@ -0,0 +1,6 @@
1
# Put this file's directory at the front of the load path unless it is
# already there, either verbatim or as an absolute path.
lib_dir = File.dirname(__FILE__)
unless $:.include?(lib_dir) || $:.include?(File.expand_path(lib_dir))
  $:.unshift(lib_dir)
end
3
+
4
# Top-level namespace of the rubst gem.
module Rubst
  # Version string of this release.
  VERSION = "0.0.1"
end
@@ -0,0 +1,536 @@
1
+ require "getoptlong"
2
+ require "rdoc/usage"
3
+ require "logger"
4
+ require "rubygems"
5
+ require "narray"
6
+ require "bio"
7
+ require "set"
8
+ require "facets"
9
+
10
+ require "narray_extensions"
11
+ require "nmatrix_extensions"
12
+ require "enumerable_extensions"
13
+ require "environment_feature"
14
+ require "environment"
15
+
16
module Rubst
  # Command-line front end. CLI.execute parses ARGV, reads the environment
  # class definition and the TEM file(s), counts residue substitutions per
  # environment, smooths the resulting probabilities and prints one matrix per
  # environment combination plus a total matrix to standard output.
  class CLI

    # Prints Rubst::VERSION to standard output.
    def self.print_version
      puts Rubst::VERSION
    end

    # Prints the command-line help text to standard output.
    #
    # NOTE(review): only the options declared in parse_options are accepted;
    # the others listed here (--weight, --noweight, --scale, --sigma, --add,
    # --penv, --pidmin, --pidmax, --analysis) are not declared anywhere in
    # this class.
    def self.print_usage
      puts <<-USAGE

    rubst [ options ] -f TEM-file
        or
    rubst [ options ] -l TEMLIST-file

    Available options:

    -h, --help
        show help
    -f, --tem-file FILE
        a tem file
    -l, --tem-list FILE
        a list for tem files
    -o, --outfile FILE
        output filename ("allmat.dat" if not specified)
    --weight INTEGER (PID)
        clustering level (PID) for the BLOSUM-like weighting (not supported yet)
    --noweight
        calculate substitution counts with no weights (default)
    -c, --classdef FILE
        a file for the definition of environments (default: 'classdef.dat')
    -y, --cys INTEGER
        0 for using C and J only for structure
        1 for both structure and sequence (default)
    --output INTEGER
        0 for raw counts (no-smoothing performed)
        1 for probabilities
        2 for log-odds (default)
    --scale INTEGER
        log-odds matrices in 1/n bit units (default 3)
    --sigma DOUBLE
        change the sigma value for smoothing (default 5)
    --add DOUBLE
        add this value to raw counts when deriving log-odds without smoothing
        (default 1/#classes)
    --penv
        use environment-dependent frequencies for log-odds calculation (default false)
    --pidmin DOUBLE
        count substitutions only for pairs with PID equal to or greater than this value (default none)
    --pidmax DOUBLE
        count substitutions only for pairs with PID smaller than this value (default none)
    --analysis
        analyze structural environments (also -D)
    -v, --verbose INTEGER
        0 for WARN level (default)
        1 for INFO level or more severe
        2 for DEBUG level or more severe
    --version
        print version

      USAGE
    end

    # Entry point of the command-line tool.
    #
    # arguments:: accepted for interface compatibility but not read; options
    #             are always taken from ARGV because GetoptLong parses ARGV.
    #
    # Prints the usage text and exits with status 0 unless exactly one of
    # --tem-file / --tem-list is given and no positional arguments remain.
    # All working state is kept in global variables.
    #
    # Abbreviations in the source code:
    #
    # * env: environment
    # * tem: (FUGUE) template
    # * classdef: (environment) class definition
    # * aa: amino acid
    # * tot: total
    # * rel: relative
    # * obs: observation (frequency)
    # * mut: mutation
    # * mutb: mutability
    # * freq: frequency
    # * prob: probability
    # * opts: options
    def self.execute(arguments=[])
      #
      # Global variables
      #
      $logger         = Logger.new(STDOUT)
      $logger.level   = Logger::WARN          # documented default of --verbose
      $amino_acids    = "ACDEFGHIKLMNPQRSTVWYJ".split("")
      $tem_list       = nil
      $tem_file       = nil
      # Documented defaults are set here: the options take a required
      # argument, so an "arg || default" fallback in the parser never fires.
      $classdef       = 'classdef.dat'
      $outfile        = 'allmat.dat'
      $format         = 2
      # Counters default to 0 so that a residue that was never observed (or
      # never mutated) does not lead to arithmetic on nil further down.
      $aa_tot_obs     = Hash.new(0)
      $aa_mut_obs     = Hash.new(0)
      $aa_mutb        = {}
      $aa_rel_mutb    = {}
      $aa_rel_freq    = Hash.new(0.0)
      $tot_aa         = 0
      $sigma          = 5.0
      $smooth_prob    = {}

      parse_options

      if ARGV.length != 0 || (!$tem_list && !$tem_file) || ($tem_list && $tem_file)
        print_usage
        exit 0
      end

      read_env_features
      build_envs

      # A single TEM file (-f) is processed exactly like a one-entry list;
      # previously -f was accepted but no file was ever read.
      tem_files = if $tem_list
        IO.readlines($tem_list).map { |l| l.chomp }.reject { |l| l.empty? }
      else
        [$tem_file]
      end
      tem_files.each { |tem_file| count_substitutions(tem_file) }

      build_total_freq_matrix
      compute_aa_statistics
      compute_smooth_probs
      print_smooth_prob_matrices
    end

    # Parses ARGV with GetoptLong and stores the results in globals.
    # --help and --version print their text and exit with status 0.
    def self.parse_options
      opts = GetoptLong.new(
        [ '--help',     '-h', GetoptLong::NO_ARGUMENT ],
        [ '--tem-list', '-l', GetoptLong::REQUIRED_ARGUMENT ],
        [ '--tem-file', '-f', GetoptLong::REQUIRED_ARGUMENT ],
        [ '--classdef', '-c', GetoptLong::REQUIRED_ARGUMENT ],
        [ '--output',         GetoptLong::REQUIRED_ARGUMENT ],
        [ '--cys',      '-y', GetoptLong::REQUIRED_ARGUMENT ],
        [ '--outfile',  '-o', GetoptLong::REQUIRED_ARGUMENT ],
        [ '--verbose',  '-v', GetoptLong::REQUIRED_ARGUMENT ],
        [ '--version',        GetoptLong::NO_ARGUMENT ]
      )

      opts.each do |opt, arg|
        case opt
        when '--help'
          print_usage
          exit 0
        when '--tem-list'
          $tem_list = arg
        when '--tem-file'
          $tem_file = arg
        when '--classdef'
          $classdef = arg
        when '--output'
          $format = arg.to_i
        when '--outfile'
          $outfile = arg
        when '--cys'
          # Was spelled '--cyc', which can never match the declared option.
          # NOTE(review): 1 maps to false here; $cysteine is not read anywhere
          # in this class, so the intended polarity cannot be confirmed.
          $cysteine = (arg.to_i != 1)
        when '--verbose'
          $logger.level = case arg.to_i
                          when 0 then Logger::WARN
                          when 1 then Logger::INFO
                          when 2 then Logger::DEBUG
                          else Logger::INFO
                          end
        when '--version'
          print_version
          exit 0
        end
      end
    end

    # Fills $env_features: the built-in "sequence" feature first, then one
    # EnvironmentFeature per five-field ";"-separated line of $classdef.
    # Lines starting with "#" and features whose last field is "T" (silent)
    # are skipped.
    def self.read_env_features
      $env_features = []
      $env_features << EnvironmentFeature.new("sequence",
                                              $amino_acids,
                                              $amino_acids,
                                              "F",
                                              "F")

      IO.foreach($classdef) do |l|
        next if l =~ /^#/
        env_ftr = l.chomp.split(/;/)
        next unless env_ftr.length == 5

        $logger.info ">>> An environment feature, #{l.chomp} detected"
        if env_ftr[-1] == "T"
          $logger.warn "!!! The environment feature, #{l.chomp} silent"
          next
        end
        if env_ftr[-2] == "T"
          $logger.warn "!!! The environment feature, #{l.chomp} constrained"
        end
        $env_features << EnvironmentFeature.new(env_ftr[0],
                                                env_ftr[1].split(""),
                                                env_ftr[2].split(""),
                                                env_ftr[3],
                                                env_ftr[4])
      end
    end

    # Fills $envs with one Environment per element of the cartesian product
    # of all feature labels, keyed by the concatenated label string.
    def self.build_envs
      $envs = {}
      $env_features.map { |ec| ec.labels }.inject { |pro, lb|
        pro.product(lb)
      }.each_with_index { |e, i|
        label = e.flatten.join
        $envs[label] = Environment.new(i, label)
      }
    end

    # Reads one TEM file and adds its substitution counts to $envs,
    # $aa_tot_obs and $aa_mut_obs.
    def self.count_substitutions(tem_file)
      $logger.debug ">>> Processing #{tem_file} ..."

      ff = Bio::FlatFile.auto(tem_file)
      begin
        ali = Bio::Alignment::OriginalAlignment.new
        ff.each_entry do |pir|
          if pir.definition == "sequence"
            ali.add_seq(pir.data.gsub("\n", ""), pir.entry_id)
          end
        end

        env_labels, disulphide = read_env_labels(ff, ali)
        tally_pairs(ali, env_labels, disulphide)
      ensure
        # The flat file used to be left open for every processed TEM file.
        ff.close
      end
    end

    # For every sequence in +ali+ builds the per-position environment label
    # (one character per feature, concatenated in $env_features order) and
    # collects the "disulphide" annotation. Returns [env_labels, disulphide],
    # both keyed by entry id.
    def self.read_env_labels(ff, ali)
      env_labels = {}
      disulphide = {}

      ali.each_pair do |key, _seq|
        # check disulphide bond environment first!
        ff.rewind
        ff.each_entry do |pir|
          if (pir.entry_id == key) && (pir.definition == "disulphide")
            disulphide[key] = pir.data.gsub("\n", "").split("")
          end
        end

        env_labels[key] ||= []

        $env_features.each_with_index do |ec, ei|
          ff.rewind
          ff.each_entry do |pir|
            next unless (pir.entry_id == key) && (pir.definition == ec.name)

            labels = pir.data.gsub("\n", "").split("").each_with_index.map do |sym, pos|
              if sym == "-"
                "-"
              elsif sym == "X" || sym == "x"
                "X"
              elsif ei == 0 # Amino Acid Environment Feature
                (sym == "C" && disulphide_f?(disulphide, key, pos)) ? "J" : sym
              else
                ec.labels[ec.symbols.index(sym)]
              end
            end

            if env_labels[key].empty?
              env_labels[key] = labels
            else
              env_labels[key].each_with_index { |e, i| env_labels[key][i] = e + labels[i] }
            end
          end
        end
      end

      [env_labels, disulphide]
    end

    # True when the disulphide annotation of +key+ holds "F" at +pos+.
    # Returns false (instead of failing on nil) when the entry has no
    # disulphide annotation at all.
    def self.disulphide_f?(disulphide, key, pos)
      flags = disulphide[key]
      !flags.nil? && flags[pos] == "F"
    end

    # Walks every ordered pair of distinct sequences in +ali+ and, position by
    # position, records the target residue in the environment of the source
    # position and updates the total / mutated observation counters.
    def self.tally_pairs(ali, env_labels, disulphide)
      ali.each_pair do |key1, seq1|
        s1 = seq1.split("")

        ali.each_pair do |key2, seq2|
          next if key1 == key2
          s2 = seq2.split("")

          s1.each_with_index do |residue, pos|
            if env_labels[key1][pos].include?("X")
              $logger.info ">>> Substitutions from #{key1}-#{pos}-#{residue} were masked"
              next
            end

            source = residue.upcase
            target = s2[pos].upcase

            if !$amino_acids.include?(source)
              $logger.info "!!! #{key1}-#{pos}-#{source} is not standard amino acid"
              next
            end

            if !$amino_acids.include?(target)
              # The target residue belongs to the second sequence (key2).
              $logger.info "!!! #{key2}-#{pos}-#{target} is not standard amino acid"
              next
            end

            source = "J" if source == "C" && disulphide_f?(disulphide, key1, pos)
            target = "J" if target == "C" && disulphide_f?(disulphide, key2, pos)

            $envs[env_labels[key1][pos]].add_residue_count(target)

            $aa_tot_obs[source] += 1
            $aa_mut_obs[source] += 1 if source != target

            $logger.debug ">>> Add #{key1}-#{pos}-#{source} -> #{key2}-#{pos}-#{target} substituion for #{env_labels[key1][pos]}"
          end
        end
      end
    end

    # Builds $tot_freq_matrix (21x21) by summing, over every combination of
    # non-sequence feature labels, the matrix whose index ai holds the
    # frequency array of the environment whose label starts with amino acid ai.
    def self.build_total_freq_matrix
      $tot_freq_matrix = NMatrix.int(21,21)

      # environments that share everything but the leading residue character
      env_groups = $envs.values.group_by { |env| env.label[1..-1] }
      env_groups.each_pair do |_label, group|
        $grp_freq_matrix = NMatrix.int(21,21)

        $amino_acids.each_with_index do |aa, ai|
          freq_array = group.find { |e| e.label.start_with?(aa) }.freq_array
          0.upto($grp_freq_matrix.shape[1] - 1) do |j|
            $grp_freq_matrix[ai, j] = freq_array[j]
          end
        end

        $tot_freq_matrix += $grp_freq_matrix
      end
    end

    # Derives $tot_aa, $aa_mutb, $aa_rel_mutb and $aa_rel_freq from the
    # observation counters.
    def self.compute_aa_statistics
      # Float result; becomes NaN or Infinity when "A" was never observed or
      # never mutated.
      ala_factor = 100 * $aa_tot_obs["A"] / $aa_mut_obs["A"].to_f
      $tot_aa = $aa_tot_obs.values.inject(0) { |s, v| s + v }

      $aa_tot_obs.each_pair do |res, freq|
        $aa_mutb[res]     = $aa_mut_obs[res] / freq.to_f
        $aa_rel_mutb[res] = $aa_mutb[res] * ala_factor
        $aa_rel_freq[res] = freq / $tot_aa.to_f
      end
    end

    # Fills $smooth_prob: level 1 is a single 21-element array, level ci + 1
    # is a hash keyed by a Set of ci "<feature index><label>" strings. Each
    # level is smoothed against the entropy-weighted arrays of the level below.
    def self.compute_smooth_probs
      #
      # p1 probability
      #
      p1      = NArray.float(21)
      a0      = NArray.float(21).fill(1 / 21.0)
      omega1  = 1.0 / (1 + $tot_aa.to_f / ($sigma * 21.0))
      omega2  = 1.0 - omega1

      0.upto(p1.shape[0] - 1) { |i| p1[i] = 100.0 * (omega1 * a0[i] + omega2 * $aa_rel_freq[$amino_acids[i]]) }
      $smooth_prob[1] = p1

      #
      # p2 and above
      #
      env_labels = $env_features.each_with_index.map { |ef, ei| ef.labels.map { |l| "#{ei}#{l}" } }

      1.upto($env_features.size) do |ci|
        env_labels.combination(ci) do |c1|
          Enumerable.cart_prod(*c1).each do |labels|
            # "." wildcards, with the chosen label placed at each chosen
            # feature's position
            pattern = "." * $env_features.size
            labels.each do |label|
              # NOTE(review): a single-character feature index means at most
              # ten features are supported here.
              pattern[label[0, 1].to_i] = label[1, 1]
            end

            # get environments, frequencies, and probabilities
            matcher    = Regexp.new(pattern)
            envs       = $envs.values.select { |env| env.label.match(matcher) }
            freq_arr   = envs.inject(NArray.int(21)) { |sum, env| sum + env.freq_array }
            freq_total = freq_arr.sum.to_f

            prob_arr = NArray.float(21)
            0.upto(20) { |i| prob_arr[i] = freq_arr[i] == 0 ? 0 : freq_arr[i] / freq_total }

            # collect priors
            priors = []
            if ci > 1
              labels.combination(ci - 1).each { |c2| priors << $smooth_prob[ci][c2.to_set] }
            else
              priors << $smooth_prob[1]
            end

            # entropy based weighting priors
            # NOTE(review): the priors are stored on a 0..100 scale while this
            # formula and prob_arr use 0..1 - confirm the scale is intended.
            entropy_max = Math::log(21)
            entropies = priors.map do |prior|
              (entropy_max + prior.to_a.inject(0.0) { |s, q| s + q * Math::log(q) }) / entropy_max
            end
            entropy_total = entropies.inject(0.0) { |s, e| s + e }
            weighted = priors.each_with_index.map { |prior, i| prior * entropies[i] / entropy_total }
            pri_avg  = weighted.inject { |acc, w| acc + w }

            smooth_prob_arr = NArray.float(21)
            omega1 = 1.0 / (1 + freq_total / ($sigma * 21.0))
            omega2 = 1.0 - omega1

            # smoothing step
            0.upto(20) { |i| smooth_prob_arr[i] = 100.0 * (omega1 * pri_avg[i] + omega2 * prob_arr[i]) }

            # normalization step
            smooth_prob_arr_sum = smooth_prob_arr.sum
            0.upto(20) { |i| smooth_prob_arr[i] = 100.0 * (smooth_prob_arr[i] / smooth_prob_arr_sum) }

            # store smoothed probabilities in a hash using a set of
            # environment labels as a key
            $smooth_prob[ci + 1] ||= {}
            $smooth_prob[ci + 1][labels.to_set] = smooth_prob_arr
          end
        end
      end

      # summarizing ...
      $envs.values.each { |e| e.smooth_prob_array = $smooth_prob[$env_features.size + 1][e.label_set] }
    end

    # Prints one smoothed 21x21 matrix per combination of non-sequence feature
    # labels (sorted by label), followed by their element-wise average.
    def self.print_smooth_prob_matrices
      tot_smooth_prob_matrix = NMatrix.float(21,21)

      # for each combination of environment features
      env_groups = $envs.values.group_by { |env| env.label[1..-1] }
      env_groups.sort_by { |label, _group| label }.each do |label, group|
        grp_prob_matrix = NMatrix.float(21,21)

        $amino_acids.each_with_index do |aa, ai|
          smooth_prob_array = group.find { |e| e.label.start_with?(aa) }.smooth_prob_array
          0.upto(20) { |j| grp_prob_matrix[ai, j] = smooth_prob_array[j] }
        end

        tot_smooth_prob_matrix += grp_prob_matrix

        puts ">#{label}"
        puts grp_prob_matrix.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)
      end

      tot_smooth_prob_matrix /= env_groups.size

      # for total
      puts ">Total Probability"
      puts tot_smooth_prob_matrix.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)
    end

    private_class_method :parse_options,
                         :read_env_features,
                         :build_envs,
                         :count_substitutions,
                         :read_env_labels,
                         :disulphide_f?,
                         :tally_pairs,
                         :build_total_freq_matrix,
                         :compute_aa_statistics,
                         :compute_smooth_probs,
                         :print_smooth_prob_matrices

  end
end