egor 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
data/lib/egor/cli.rb ADDED
@@ -0,0 +1,1063 @@
1
+ require "getoptlong"
2
+ require "logger"
3
+ require "rubygems"
4
+ require "narray"
5
+ require "bio"
6
+ require "set"
7
+ require "facets"
8
+ require "simple_memoize"
9
+
10
+ require "narray_extensions"
11
+ require "nmatrix_extensions"
12
+ require "enumerable_extensions"
13
+ require "math_extensions"
14
+ require "environment_feature"
15
+ require "environment"
16
+
17
+ # This is a module for an actual command line interpreter for Egor
18
+ # ---
19
+ # Copyright (C) 2008-9 Semin Lee
20
+ module Egor
21
+ class CLI
22
+ class << self
23
+
24
+ # :nodoc:
25
# Write the Egor release string (Egor::VERSION) followed by a newline
# to standard output.
def print_version
  $stdout.puts(Egor::VERSION)
end
28
+
29
+ # Print Egor's Usage on the screen
30
+ #
31
+ # :call-seq:
32
+ # Egor::CLI::print_usage
33
+ #
34
def print_usage
  # The text below is the complete help message. The documented defaults
  # mirror the values assigned at the start of +execute+: weighting is
  # performed at the PID 60 clustering level unless --noweight is given.
  puts <<-USAGE
egor: Esst GeneratOR, a program to calculate environment-specific amino acid substitution tables.

Usage:
    egor [ options ] -l TEMLIST-file -c CLASSDEF-file
        or
    egor [ options ] -f TEM-file -c CLASSDEF-file

Options:
    --tem-file (-f) STRING: a tem file
    --tem-list (-l) STRING: a list for tem files
    --classdef (-c) STRING: a file for the definition of environments (default: 'classdef.dat')
    --outfile (-o) STRING: output filename ("allmat.dat" if not specified)
    --weight (-w) INTEGER: clustering level (PID) for the BLOSUM-like weighting (default 60)
    --noweight: calculate substitution counts with no weights
    --smooth (-s) INTEGER:
        0 for partial smoothing (default)
        1 for full smoothing
    --nosmooth: perform no smoothing operation
    --cys (-y) INTEGER: (NOT implemented yet)
        0 for using C and J only for structure
        1 for both structure and sequence (default)
    --output INTEGER:
        0 for raw counts (no-smoothing performed)
        1 for probabilities
        2 for log-odds (default)
    --scale INTEGER: log-odds matrices in 1/n bit units (default 3)
    --sigma DOUBLE: change the sigma value for smoothing (default 5)
    --add DOUBLE: add this value to raw counts when deriving log-odds without smoothing (default 1/#classes)
    --penv: use environment-dependent frequencies for log-odds calculation (default false) (NOT implemented yet)
    --pidmin DOUBLE: count substitutions only for pairs with PID equal to or greater than this value (default none)
    --pidmax DOUBLE: count substitutions only for pairs with PID smaller than this value (default none)
    --verbose (-v) INTEGER
        0 for ERROR level (default)
        1 for WARN or above level
        2 for INFO or above level
        3 for DEBUG or above level
    --version: print version
    --help (-h): show help

  USAGE
end
77
+
78
+ # Calculate PID between two sequences
79
+ #
80
+ # :call-seq:
81
+ # Egor::CLI::calc_pid(seq1, seq2) -> Float
82
+ #
83
# Percentage identity between two aligned sequences given as strings
# that use "-" for gaps.
#
# Columns where both sequences have a gap are ignored. Columns with a
# gap in exactly one sequence count towards the denominator only.
# Returns a Float in 0.0..100.0; when there is no comparable column at
# all (empty input, or every column is a gap in both sequences) the
# result is 0.0 instead of the NaN that 0.0 / 0 would produce.
def calc_pid(seq1, seq2)
  aligned   = 0 # columns with a residue in both sequences
  identical = 0 # aligned columns holding the same symbol
  gapped    = 0 # columns with a gap in exactly one sequence

  seq1.split("").zip(seq2.split("")).each do |res1, res2|
    gap1 = (res1 == "-")
    gap2 = (res2 == "-")

    if !gap1 && !gap2
      aligned += 1
      identical += 1 if res1 == res2
    elsif gap1 != gap2
      gapped += 1
    end
  end

  compared = aligned + gapped
  return 0.0 if compared.zero?

  100.0 * identical.to_f / compared
end
105
+ memoize :calc_pid
106
+
107
+ # :nodoc:
108
+ def execute(arguments=[])
109
+ #
110
+ # Abbreviations in the aa1 codes
111
+ #
112
+ # * env: environment
113
+ # * tem: (FUGUE) template
114
+ # * classdef: (environment) class definition
115
+ # * aa: amino acid
116
+ # * aa: weighted amino acid
117
+ # * tot: total
118
+ # * rel: relative
119
+ # * obs: observation (frequency)
120
+ # * mut: mutation
121
+ # * mutb: mutability
122
+ # * freq: frequency
123
+ # * prob: probability
124
+ # * opts: options
125
+ #
126
+
127
+ # Part 1.
128
+ #
129
+ # Global variables and their default values
130
+ #
131
# Logging goes to standard output; only errors are shown unless the
# --verbose option selects another level.
$logger       = Logger.new(STDOUT)
$logger.level = Logger::ERROR

# The 21 residue symbols used for every array and matrix in this method.
# "J" is substituted for "C" wherever the disulphide annotation is "F".
$amino_acids  = "ACDEFGHIKLMNPQRSTVWYJ".split("")

# Input and output locations (overridable from the command line).
$tem_list     = nil
$tem_file     = nil
$classdef     = "classdef.dat"
$outfile      = "allmat.dat"
$outfh        = nil # file handle for outfile
$output       = 2   # 0: raw counts, 1: probabilities, 2: log-odds

# Accumulators filled while counting substitutions.
$aa_tot_obs   = {}
$aa_mut_obs   = {}
$aa_mutb      = {}
$aa_rel_mutb  = {}
$aa_rel_freq  = {}
$env_aa_obs   = {}
$ali_size     = 0
$tot_aa       = 0

# Weighting, smoothing and scoring parameters.
$sigma        = 5.0
$weight       = 60 # PID threshold used when clustering sequences
$noweight     = false
$smooth       = :partial
$nosmooth     = false
$scale        = 3 # was assigned twice in a row; one assignment is enough
$pidmin       = nil
$pidmax       = nil
$add          = 0
$penv         = false
$heatmap      = false
$smooth_prob  = {}
161
+
162
+ # Part 2.
163
+ #
164
+ # Parsing options
165
+ #
166
# Declare every option the dispatch loop below knows how to handle.
# GetoptLong rejects any option that is not declared here, so the
# documented --nosmooth, --scale, --sigma, --add, --pidmin and --pidmax
# switches must be listed as well or they can never be used.
opts = GetoptLong.new(
  ['--help',     '-h', GetoptLong::NO_ARGUMENT],
  ['--tem-list', '-l', GetoptLong::REQUIRED_ARGUMENT],
  ['--tem-file', '-f', GetoptLong::REQUIRED_ARGUMENT],
  ['--classdef', '-c', GetoptLong::REQUIRED_ARGUMENT],
  ['--smooth',   '-s', GetoptLong::REQUIRED_ARGUMENT],
  ['--nosmooth',       GetoptLong::NO_ARGUMENT],
  ['--weight',   '-w', GetoptLong::REQUIRED_ARGUMENT],
  ['--noweight',       GetoptLong::NO_ARGUMENT],
  ['--heatmap',        GetoptLong::NO_ARGUMENT],
  ['--output',         GetoptLong::REQUIRED_ARGUMENT],
  ['--scale',          GetoptLong::REQUIRED_ARGUMENT],
  ['--sigma',          GetoptLong::REQUIRED_ARGUMENT],
  ['--add',            GetoptLong::REQUIRED_ARGUMENT],
  ['--pidmin',         GetoptLong::REQUIRED_ARGUMENT],
  ['--pidmax',         GetoptLong::REQUIRED_ARGUMENT],
  ['--cys',      '-y', GetoptLong::REQUIRED_ARGUMENT],
  ['--penv',           GetoptLong::NO_ARGUMENT],
  ['--outfile',  '-o', GetoptLong::REQUIRED_ARGUMENT],
  ['--verbose',  '-v', GetoptLong::REQUIRED_ARGUMENT],
  ['--version',        GetoptLong::NO_ARGUMENT]
)
182
+
183
# Copy each parsed option into the corresponding global setting.
# +opt+ is the canonical long option name, +arg+ its argument string.
opts.each do |opt, arg|
  case opt
  when '--help'
    print_usage
    exit 0
  when '--tem-list'
    $tem_list = arg
  when '--tem-file'
    $tem_file = arg
  when '--classdef'
    $classdef = arg
  when '--output'
    $output = arg.to_i
  when '--outfile'
    $outfile = arg
  when '--cys'
    # The option is declared as '--cys'; the former '--cyc' label could
    # never match, so the "not available" notice was silently skipped.
    $logger.error "!!! --cys option is not available yet"
    exit 1
  when '--weight'
    $weight = arg.to_i
  when '--sigma'
    $sigma = arg.to_f
  when '--pidmin'
    $pidmin = arg.to_f
  when '--pidmax'
    $pidmax = arg.to_f
  when '--noweight'
    $noweight = true
  when '--smooth'
    # Must be :partial (not the misspelt :parital) because the smoothing
    # code later compares $smooth against :partial.
    $smooth = (arg.to_i == 1 ? :full : :partial)
  when '--nosmooth'
    $nosmooth = true
  when '--scale'
    $scale = arg.to_f
  when '--add'
    $add = arg.to_f
  when '--penv'
    $logger.error "!!! --penv option is not available yet"
    exit 1
  when '--heatmap'
    $heatmap = true
  when '--verbose'
    # Unknown verbosity values fall back to the ERROR level.
    $logger.level = case arg.to_i
                    when 0 then Logger::ERROR
                    when 1 then Logger::WARN
                    when 2 then Logger::INFO
                    when 3 then Logger::DEBUG
                    else Logger::ERROR
                    end
  when '--version'
    print_version
    exit 0
  end
end
239
+
240
+ # when arguments are nonsense, print usage
241
+ if ((ARGV.length != 0) ||
242
+ (!$tem_list && !$tem_file) ||
243
+ ($tem_list && $tem_file))
244
+ print_usage
245
+ exit 1
246
+ end
247
+
248
+ # Part 3.
249
+ #
250
+ # Reading Environment Class Definition File
251
+ #
252
+
253
+ # an array for storing all environment feature objects
254
+ $env_features = []
255
+
256
+ # aa1 amino acid in a substitution itself is a environment feature
257
+ $env_features << EnvironmentFeature.new("sequence",
258
+ $amino_acids,
259
+ $amino_acids,
260
+ "F",
261
+ "F")
262
+
263
+ # read the environment class definition file and
264
+ # store each feature in the array prepared above
265
+ IO.foreach($classdef) do |line|
266
+ if line.start_with?("#")
267
+ next
268
+ elsif (env_ftr = line.chomp.split(/;/)).length == 5
269
+ $logger.info ">>> An environment feature, #{line.chomp} detected"
270
+ if env_ftr[-1] == "T"
271
+ # skip silenced environment feature
272
+ $logger.warn "!!! The environment feature, #{line.chomp} silent"
273
+ next
274
+ end
275
+ if env_ftr[-2] == "T"
276
+ $logger.warn "!!! The environment feature, #{line.chomp} constrained"
277
+ end
278
+ $env_features << EnvironmentFeature.new(env_ftr[0],
279
+ env_ftr[1].split(""),
280
+ env_ftr[2].split(""),
281
+ env_ftr[3],
282
+ env_ftr[4])
283
+ else
284
+ $logger.error "@@@ #{line} doesn't seem to be a proper format for class definition"
285
+ exit 1
286
+ end
287
+ end
288
+
289
+ # a hash for storing all environment objects
290
+ $envs = {}
291
+
292
+ # generate all possible combinations of environment labels, and
293
+ # create & store every environment object into the hash prepared above with the label as a key
294
+ $env_features.inject([]) { |sum, ec|
295
+ sum << ec.labels
296
+ }.inject { |pro, lb|
297
+ pro.product(lb)
298
+ }.each_with_index { |e, i|
299
+ $envs[e.flatten.join] = Environment.new(i, e.flatten.join)
300
+ }
301
+
302
+ # Part 4.
303
+ #
304
+ # Reading TEM file or TEMLIST list file and counting substitutions
305
+ #
306
+
307
+ # a global file handle for output
308
+ $outfh = File.open($outfile, "w")
309
+
310
+ if $tem_file
311
+ $tem_list = [$tem_file]
312
+ end
313
+
314
+ if $tem_list
315
+ IO.foreach($tem_list) do |tem_file|
316
+ tem_file.chomp!
317
+
318
+ $logger.info ">>> Analysing #{tem_file} ..."
319
+
320
+ ali = Bio::Alignment::OriginalAlignment.new
321
+ ff = Bio::FlatFile.auto(tem_file)
322
+ ff.each_entry do |pir|
323
+ if pir.definition == "sequence"
324
+ ali.add_seq(pir.data.gsub("\n", ""), pir.entry_id)
325
+ end
326
+ end
327
+
328
+ $ali_size += ali.size
329
+ env_labels = {}
330
+ disulphide = {}
331
+
332
+ ali.each_pair do |key, seq|
333
+ # check disulphide bond environment first!
334
+ ff.rewind
335
+ ff.each_entry do |pir|
336
+ if (pir.entry_id == key) && (pir.definition == "disulphide")
337
+ disulphide[key] = pir.data.gsub("\n", "").split("")
338
+ end
339
+ end
340
+
341
+ $env_features.each_with_index do |ec, ei|
342
+ env_labels[key] = [] unless env_labels.has_key?(key)
343
+
344
+ ff.rewind
345
+ ff.each_entry do |pir|
346
+ if (pir.entry_id == key) && (pir.definition == ec.name)
347
+ labels = pir.data.gsub("\n", "").split("").map_with_index do |sym, pos|
348
+ if sym == "-"
349
+ "-"
350
+ elsif sym == "X" || sym == "x"
351
+ "X"
352
+ else
353
+ if ei == 0 # Amino Acid Environment Feature
354
+ ((disulphide[key][pos] == "F") && (sym == "C")) ? "J" : sym
355
+ else
356
+ ec.labels[ec.symbols.index(sym)]
357
+ end
358
+ end
359
+ end
360
+
361
+ if env_labels[key].empty?
362
+ env_labels[key] = labels
363
+ else
364
+ env_labels[key].each_with_index { |e, i| env_labels[key][i] = e + labels[i] }
365
+ end
366
+ end
367
+ end
368
+ end
369
+ end
370
+
371
+ if $noweight
372
+ ali.each_pair do |id1, seq1|
373
+ ali.each_pair do |id2, seq2|
374
+ if id1 != id2
375
+ pid = calc_pid(seq1, seq2)
376
+ s1 = seq1.split("")
377
+ s2 = seq2.split("")
378
+
379
+ # check PID_MIN
380
+ if $pidmin && (pid < $pidmin)
381
+ $logger.info ">>> Skip alignment between #{id1} and #{id2} having PID, #{pid}% less than PID_MIN, #{$pidmin}"
382
+ next
383
+ end
384
+
385
+ # check PID_MAX
386
+ if $pidmax && (pid > $pidmax)
387
+ $logger.info ">>> Skip alignment between #{id1} and #{id2} having PID, #{pid}% greater than PID_MAX, #{$pidmax}"
388
+ next
389
+ end
390
+
391
+ s1.each_with_index do |aa1, pos|
392
+ if env_labels[id1][pos].include?("X")
393
+ $logger.info ">>> Substitutions from #{id1}-#{pos}-#{aa1} were masked"
394
+ next
395
+ end
396
+
397
+ aa1.upcase!
398
+ aa2 = s2[pos].upcase
399
+
400
+ if !$amino_acids.include?(aa1)
401
+ $logger.warn "!!! #{id1}-#{pos}-#{aa1} is not standard amino acid" unless aa1 == "-"
402
+ next
403
+ end
404
+
405
+ if !$amino_acids.include?(aa2)
406
+ $logger.warn "!!! #{id1}-#{pos}-#{aa2} is not standard amino acid" unless aa2 == "-"
407
+ next
408
+ end
409
+
410
+ aa1 = (((disulphide[id1][pos] == "F") && (aa1 == "C")) ? "J" : aa1)
411
+ aa2 = (((disulphide[id2][pos] == "F") && (aa2 == "C")) ? "J" : aa2)
412
+
413
+ $envs[env_labels[id1][pos]].add_residue_count(aa2)
414
+
415
+ grp_label = env_labels[id1][pos][1..-1]
416
+
417
+ if $env_aa_obs.has_key? grp_label
418
+ if $env_aa_obs[grp_label].has_key? aa1
419
+ $env_aa_obs[grp_label][aa1] += 1
420
+ else
421
+ $env_aa_obs[grp_label][aa1] = 1
422
+ end
423
+ else
424
+ $env_aa_obs[grp_label] = Hash.new(0)
425
+ $env_aa_obs[grp_label][aa1] = 1
426
+ end
427
+
428
+ if $aa_tot_obs.has_key? aa1
429
+ $aa_tot_obs[aa1] += 1
430
+ else
431
+ $aa_tot_obs[aa1] = 1
432
+ end
433
+
434
+ if aa1 != aa2
435
+ if $aa_mut_obs.has_key? aa1
436
+ $aa_mut_obs[aa1] += 1
437
+ else
438
+ $aa_mut_obs[aa1] = 1
439
+ end
440
+ end
441
+ $logger.debug "*** Add #{id1}-#{pos}-#{aa1} -> #{id2}-#{pos}-#{aa2} substituion for #{env_labels[id1][pos]}"
442
+ end
443
+ end
444
+ end
445
+ end
446
+ else
447
+ # BLOSUM-like weighting
448
+ clusters = []
449
+ ali.each_pair { |i, s| clusters << [i] }
450
+
451
+ # a loop for single linkage clustering
452
+ begin
453
+ continue = false
454
+ 0.upto(clusters.size - 2) do |i|
455
+ indexes = []
456
+ (i + 1).upto(clusters.size - 1) do |j|
457
+ found = false
458
+ clusters[i].each do |c1|
459
+ clusters[j].each do |c2|
460
+ if calc_pid(ali[c1], ali[c2]) >= $weight
461
+ indexes << j
462
+ found = true
463
+ break
464
+ end
465
+ end
466
+ break if found
467
+ end
468
+ end
469
+
470
+ unless indexes.empty?
471
+ continue = true
472
+ group = clusters[i]
473
+ indexes.each do |k|
474
+ group = group.concat(clusters[k])
475
+ clusters[k] = nil
476
+ end
477
+ clusters[i] = group
478
+ clusters.compact!
479
+ end
480
+ end
481
+ end while(continue)
482
+
483
+ clusters.combination(2).each do |cluster1, cluster2|
484
+ cluster1.each do |id1|
485
+ cluster2.each do |id2|
486
+ seq1 = ali[id1].split("")
487
+ seq2 = ali[id2].split("")
488
+ seq1.each_with_index do |aa1, pos|
489
+ if env_labels[id1][pos].include?("X")
490
+ $logger.debug "*** Substitutions from #{id1}-#{pos}-#{aa1} were masked"
491
+ next
492
+ end
493
+
494
+ aa1.upcase!
495
+ aa2 = seq2[pos].upcase
496
+
497
+ if !$amino_acids.include?(aa1)
498
+ $logger.warn "!!! #{id1}-#{pos}-#{aa1} is not standard amino acid" unless aa1 == "-"
499
+ next
500
+ end
501
+
502
+ if !$amino_acids.include?(aa2)
503
+ $logger.warn "!!! #{id1}-#{pos}-#{aa2} is not standard amino acid" unless aa2 == "-"
504
+ next
505
+ end
506
+
507
+ aa1 = (((disulphide[id1][pos] == "F") && (aa1 == "C")) ? "J" : aa1)
508
+ aa2 = (((disulphide[id2][pos] == "F") && (aa2 == "C")) ? "J" : aa2)
509
+ size1 = cluster1.size
510
+ size2 = cluster2.size
511
+ obs1 = 1.0 / size1
512
+ obs2 = 1.0 / size2
513
+
514
+ $envs[env_labels[id1][pos]].add_residue_count(aa2, 1.0 / (size1 * size2))
515
+ $envs[env_labels[id2][pos]].add_residue_count(aa1, 1.0 / (size1 * size2))
516
+
517
+ grp_label1 = env_labels[id1][pos][1..-1]
518
+ grp_label2 = env_labels[id2][pos][1..-1]
519
+
520
+ if $env_aa_obs.has_key? grp_label1
521
+ if $env_aa_obs[grp_label1].has_key? aa1
522
+ $env_aa_obs[grp_label1][aa1] += obs1
523
+ else
524
+ $env_aa_obs[grp_label1][aa1] = obs1
525
+ end
526
+ else
527
+ $env_aa_obs[grp_label1] = Hash.new(0.0)
528
+ $env_aa_obs[grp_label1][aa1] = obs1
529
+ end
530
+
531
+ if $env_aa_obs.has_key? grp_label2
532
+ if $env_aa_obs[grp_label2].has_key? aa2
533
+ $env_aa_obs[grp_label2][aa2] += obs2
534
+ else
535
+ $env_aa_obs[grp_label2][aa2] = obs2
536
+ end
537
+ else
538
+ $env_aa_obs[grp_label2] = Hash.new(0.0)
539
+ $env_aa_obs[grp_label2][aa2] = obs2
540
+ end
541
+
542
+ if $aa_tot_obs.has_key? aa1
543
+ $aa_tot_obs[aa1] += obs1
544
+ else
545
+ $aa_tot_obs[aa1] = obs1
546
+ end
547
+
548
+ if $aa_tot_obs.has_key? aa2
549
+ $aa_tot_obs[aa2] += obs2
550
+ else
551
+ $aa_tot_obs[aa2] = obs2
552
+ end
553
+
554
+ if aa1 != aa2
555
+ if $aa_mut_obs.has_key? aa1
556
+ $aa_mut_obs[aa1] += obs1
557
+ else
558
+ $aa_mut_obs[aa1] = obs1
559
+ end
560
+ if $aa_mut_obs.has_key? aa2
561
+ $aa_mut_obs[aa2] += obs2
562
+ else
563
+ $aa_mut_obs[aa2] = obs2
564
+ end
565
+ end
566
+
567
+ $logger.debug "*** Add #{id1}-#{pos}-#{aa1} -> #{id2}-#{pos}-#{aa2} substituion for #{env_labels[id1][pos]}"
568
+ $logger.debug "*** Add #{id2}-#{pos}-#{aa2} -> #{id1}-#{pos}-#{aa1} substituion for #{env_labels[id2][pos]}"
569
+ end
570
+ end
571
+ end
572
+ end
573
+ end # if !$nosmooth
574
+ end # IO.foreach($tem_list)
575
+
576
+ # print out default header
577
+ $outfh.puts <<HEADER
578
+ # Environment-specific amino acid substitution matrices
579
+ # Creator: egor version #{Egor::VERSION}
580
+ # Creation Date: #{Time.now.strftime("%d/%m/%Y %H:%M")}
581
+ #
582
+ # Definitions for structural environments:
583
+ # #{$env_features.size - 1} features used
584
+ #
585
+ HEADER
586
+
587
+ $env_features[1..-1].each { |e| $outfh.puts "# #{e}" }
588
+
589
+ $outfh.puts <<HEADER
590
+ #
591
+ # (read in from #{$classdef})
592
+ #
593
+ # Number of alignments: #{$ali_size}
594
+ # (list of .tem files read in from #{$tem_list})
595
+ #
596
+ # Total number of environments: #{Integer($envs.size / $amino_acids.size)}
597
+ #
598
+ # There are #{$amino_acids.size} amino acids considered.
599
+ # #{$amino_acids.join}
600
+ #
601
+ HEADER
602
+
603
+ if $noweight
604
+ $outfh.puts "# Weighting scheme: none"
605
+ else
606
+ $outfh.puts "# Weighting scheme: clustering at PID #{$weight} level"
607
+ end
608
+ $outfh.puts "#"
609
+
610
+ # calculate amino acid frequencies and mutabilities, and
611
+ # print them as default statistics in the header part
612
+ ala_factor = 100.0 * $aa_tot_obs["A"] / $aa_mut_obs["A"].to_f
613
+ $tot_aa = $aa_tot_obs.values.sum
614
+
615
+ $outfh.puts "#"
616
+ $outfh.puts "# Total amino acid frequencies:\n"
617
+ $outfh.puts "# %-3s %9s %9s %5s %8s %8s" % %w[RES MUT_OBS TOT_OBS MUTB REL_MUTB REL_FRQ]
618
+
619
+ $aa_tot_obs.each_pair do |res, freq|
620
+ $aa_mutb[res] = $aa_mut_obs[res] / freq.to_f
621
+ $aa_rel_mutb[res] = $aa_mutb[res] * ala_factor
622
+ $aa_rel_freq[res] = freq / $tot_aa.to_f
623
+ end
624
+
625
+ $amino_acids.each do |res|
626
+ if $noweight
627
+ $outfh.puts "# %-3s %9d %9d %5.2f %8d %8.4f" %
628
+ [res, $aa_mut_obs[res], $aa_tot_obs[res], $aa_mutb[res], $aa_rel_mutb[res], $aa_rel_freq[res]]
629
+ else
630
+ $outfh.puts "# %-3s %9.2f %9.2f %5.2f %8d %8.4f" %
631
+ [res, $aa_mut_obs[res], $aa_tot_obs[res], $aa_mutb[res], $aa_rel_mutb[res], $aa_rel_freq[res]]
632
+ end
633
+ end
634
+ $outfh.puts "#"
635
+
636
+ # calculating probabilities for each environment
637
+ $envs.values.each do |e|
638
+ if e.freq_array.sum != 0
639
+ e.prob_array = 100.0 * e.freq_array / e.freq_array.sum
640
+ end
641
+ end
642
+
643
+ # count raw frequencies
644
+ $tot_freq_matrix = ($noweight ? NMatrix.int(21,21) : NMatrix.float(21,21))
645
+
646
+ # for each combination of environment features
647
+ env_groups = $envs.values.group_by { |env| env.label[1..-1] }
648
+
649
+ env_groups.to_a.sort_by { |env_group|
650
+ # a bit clumsy sorting here...
651
+ env_group[0].split("").map_with_index { |l, i|
652
+ $env_features[i + 1].labels.index(l)
653
+ }
654
+ }.each_with_index do |group, group_no|
655
+ grp_freq_matrix = ($noweight ? NMatrix.int(21,21) : NMatrix.float(21,21))
656
+
657
+ $amino_acids.each_with_index do |aa, ai|
658
+ freq_array = group[1].find { |e| e.label.start_with?(aa) }.freq_array
659
+ 0.upto(20) { |j| grp_freq_matrix[ai, j] = freq_array[j] }
660
+ end
661
+
662
+ $tot_freq_matrix += grp_freq_matrix
663
+
664
+ if $output == 0
665
+ $outfh.puts ">#{group[0]} #{group_no}"
666
+ $outfh.puts grp_freq_matrix.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)
667
+ end
668
+ end
669
+
670
# Raw-count output: write the summed count matrix and stop here.
if $output == 0
  $outfh.puts ">Total"
  $outfh.puts $tot_freq_matrix.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)
  # Close the output file explicitly before leaving, as the probability
  # and log-odds exits of this method do.
  $outfh.close
  exit 0
end
675
+
676
+ # for probability
677
+ if $output == 1
678
+ $outfh.puts <<HEADER
679
+ #
680
+ # Each column (j) represents the probability distribution for the
681
+ # likelihood of acceptance of a mutational event by a residue type j in
682
+ # a particular structural environment (specified after >) leading to
683
+ # any other residue type (i) and sums up to 100.
684
+ #
685
+ HEADER
686
+ end
687
+
688
+ if ($output > 0) && $nosmooth
689
+ # Probability matrices
690
+ tot_prob_matrix = NMatrix.float(21, 21)
691
+
692
+ # for each combination of environment features
693
+ env_groups = $envs.values.group_by { |env| env.label[1..-1] }
694
+ env_groups.to_a.sort_by { |env_group|
695
+ # a bit clumsy sorting here...
696
+ env_group[0].split("").map_with_index { |l, i|
697
+ $env_features[i + 1].labels.index(l)
698
+ }
699
+ }.each_with_index do |group, group_no|
700
+ grp_prob_matrix = NMatrix.float(21,21)
701
+
702
+ $amino_acids.each_with_index do |aa, ai|
703
+ prob_array = group[1].find { |e| e.label.start_with?(aa) }.prob_array
704
+ 0.upto(20) { |j| grp_prob_matrix[ai, j] = prob_array[j] }
705
+ end
706
+
707
+ tot_prob_matrix += grp_prob_matrix
708
+
709
+ if ($output == 1)
710
+ $outfh.puts ">#{group[0]} #{group_no}"
711
+ $outfh.puts grp_prob_matrix.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)
712
+ end
713
+ end
714
+
715
+ if ($output == 1)
716
+ $outfh.puts ">Total"
717
+ $outfh.puts tot_prob_matrix.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)
718
+ $outfh.close
719
+ exit 0
720
+ end
721
+ end
722
+
723
+ # for smoothing...
724
+ if ($output > 0) && !$nosmooth
725
+ #
726
+ # p1 probability
727
+ #
728
+ p1 = NArray.float(21)
729
+ a0 = NArray.float(21).fill(1 / 21.0)
730
+ big_N = $tot_aa.to_f
731
+ small_n = 21.0
732
+ omega1 = 1.0 / (1 + big_N / ($sigma * small_n))
733
+ omega2 = 1.0 - omega1
734
+
735
+ if $smooth == :partial
736
+ # for partial smoothing, p1 probability is not smoothed!
737
+ 0.upto(20) { |i| p1[i] = 100.0 * $aa_rel_freq[$amino_acids[i]] }
738
+ $smooth_prob[1] = p1
739
+ else
740
+ # for full smoothing, p1 probability is smoothed
741
+ 0.upto(20) { |i| p1[i] = 100.0 * (omega1 * a0[i] + omega2 * $aa_rel_freq[$amino_acids[i]]) }
742
+ $smooth_prob[1] = p1
743
+ end
744
+
745
+ #
746
+ # p2 and above
747
+ #
748
+ env_labels = $env_features.map_with_index {|ef, ei| ef.labels.map { |l| "#{ei}#{l}" } }
749
+
750
+ if $smooth == :partial
751
+ $outfh.puts <<HEADER
752
+ # Partial Smoothing:
753
+ #
754
+ # p1(ri) (i.e., amino acid composition) is estimated by summing over
755
+ # each row in all matrices (no smoothing)
756
+ # ^^^^^^^^^^^^
757
+ # p2(ri|Rj) is estimated as:
758
+ # p2(ri|Rj) = omega1 * p1(ri) + omega2 * W2(ri|Rj)
759
+ #
760
+ # p3(ri|Rj,fq) is estimated as:
761
+ # p3(ri|Rj,fq) = omega1 * A2(ri|fq) + omega2 * W3(ri|Rj,fq)
762
+ # where
763
+ # A2(ri|fq) = p2(ri|fq) (fixed fq; partial smoothing)
764
+ #
765
+ # The smoothing procedure is curtailed here and finally
766
+ # p5(ri|Rj,...) is estimated as:
767
+ # p5(ri|Rj,...) = omega1 * A3(ri|Rj,fq) + omega2 * W5(ri|Rj...)
768
+ # where
769
+ # A3(ri|Rj,fq) = sum over fq omega_c * pc3(Rj,fq)
770
+ #
771
+ # Weights (omegas) are calculated as in Topham et al. 1993)
772
+ #
773
+ # sigma value used is: 5.00
774
+ #
775
+ HEADER
776
+ 1.upto($env_features.size) do |ci|
777
+ # for partial smoothing, only P1 ~ P3, and Pn are considered
778
+ next if (ci > 2) && (ci < $env_features.size)
779
+
780
+ env_labels.combination(ci) do |c1|
781
+ Enumerable.cart_prod(*c1).each do |labels|
782
+ pattern = "." * $env_features.size
783
+
784
+ labels.each do |label|
785
+ i = label[0].chr.to_i
786
+ l = label[1].chr
787
+ pattern[i] = l
788
+ end
789
+
790
+ if pattern =~ /^\./
791
+ $logger.debug "*** Skipped environment, #{pattern}, for partial smoothing"
792
+ next
793
+ end
794
+
795
+ # get environmetns, frequencies, and probabilities
796
+ envs = $envs.values.select { |env| env.label.match(pattern.to_re) }
797
+ freq_arr = envs.inject(NArray.float(21)) { |sum, env| sum + env.freq_array }
798
+ prob_arr = NArray.float(21)
799
+ 0.upto(20) { |i| prob_arr[i] = (freq_arr[i] == 0 ? 0 : freq_arr[i] / freq_arr.sum.to_f) }
800
+
801
+ # # assess whether a residue type j is compatible with a particular combination of structural features
802
+ # # corrections for non-zero colum vector phenomenon by switching the smoothing procedure off as below
803
+ # if ci == $env_features.size
804
+ # aa_label = labels.find { |l| l.match(/^0/) }[1].chr
805
+ # sub_pattern = "." * $env_features.size
806
+ # sub_pattern[0] = aa_label
807
+ # sub_freq_sum = 0
808
+ #
809
+ # labels[1..-1].each do |label|
810
+ # next if label.start_with?("0")
811
+ # i = label[0].chr.to_i
812
+ # l = label[1].chr
813
+ # sub_pattern[i] = l
814
+ # sub_envs = $envs.values.select { |env| env.label.match(pattern.to_re) }
815
+ # sub_freq_arr = sub_envs.inject(NArray.float(21)) { |sum, env| sum + env.freq_array }
816
+ # sub_freq_sum += sub_freq_arr.sum
817
+ # end
818
+ #
819
+ # if sub_freq_sum == 0
820
+ # if $smooth_prob.has_key?(ci + 1)
821
+ # $smooth_prob[ci + 1][labels.to_set] = prob_arr
822
+ # else
823
+ # $smooth_prob[ci + 1] = {}
824
+ # $smooth_prob[ci + 1][labels.to_set] = prob_arr
825
+ # end
826
+ # $logger.warn "!!! Smoothing procedure is off for the environment feature combination, #{pattern}"
827
+ # next
828
+ # end
829
+ # end
830
+
831
+ # collect priors if ci > 1
832
+ priors = []
833
+
834
+ if ci == 2
835
+ labels.combination(1).select { |c2| c2[0].start_with?("0") }.each { |c3|
836
+ priors << $smooth_prob[2][c3.to_set]
837
+ }
838
+ elsif ci == $env_features.size
839
+ labels.combination(2).select { |c2| c2[0].start_with?("0") || c2[1].start_with?("0") }.each { |c3|
840
+ priors << $smooth_prob[3][c3.to_set]
841
+ }
842
+ end
843
+
844
+ # entropy based weighting priors
845
+ entropy_max = Math::log(21)
846
+ entropies = priors.map { |prior| -1.0 * prior.to_a.inject(0.0) { |s, p| p == 0.0 ? s - 1 : s + p * Math::log(p) } }
847
+ mod_entropies = entropies.map_with_index { |entropy, i| (entropy_max - entropies[i]) / entropy_max }
848
+ weights = mod_entropies.map { |mod_entropy| mod_entropy / mod_entropies.sum }
849
+ weighted_priors = priors.map_with_index { |prior, i| prior * weights[i] }.sum
850
+
851
+ # smoothing step
852
+ smooth_prob_arr = NArray.float(21)
853
+ big_N = freq_arr.sum.to_f
854
+ small_n = 21.0
855
+ omega1 = 1.0 / (1 + big_N / ($sigma * small_n))
856
+ omega2 = 1.0 - omega1
857
+ 0.upto(20) { |i| smooth_prob_arr[i] = 100.0 * (omega1 * weighted_priors[i] + omega2 * prob_arr[i]) }
858
+
859
+ # normalization step
860
+ smooth_prob_arr_sum = smooth_prob_arr.sum
861
+ 0.upto(20) { |i| smooth_prob_arr[i] = 100.0 * (smooth_prob_arr[i] / smooth_prob_arr_sum) }
862
+
863
+ # store smoothed probabilties in a hash using a set of envrionment labels as a key
864
+ if !$smooth_prob.has_key?(ci + 1)
865
+ $smooth_prob[ci + 1] = {}
866
+ $smooth_prob[ci + 1][labels.to_set] = smooth_prob_arr
867
+ else
868
+ $smooth_prob[ci + 1][labels.to_set] = smooth_prob_arr
869
+ end
870
+ end
871
+ end
872
+ end
873
+ else
874
+ $outfh.puts <<HEADER
875
+ # Full Smoothing:
876
+ #
877
+ # p1(ri) is estimated as:
878
+ # p1(ri) = omega1 * A0 + omega2 * W1(ri)
879
+ #
880
+ # p2(ri|f1q) is estimated as:
881
+ # p2(ri|f1q) = omega1 * p1(ri) + omega2 * W2(ri|fq)
882
+ #
883
+ # (NOTE: f1q is not fixed to be Rj in the full smoothing procedure)
884
+ #
885
+ # p3(ri|f1q,f2q) is estimated as:
886
+ # p3(ri|f1q,f2q) = omega1 * A2(ri|f1q) + omega2 * W3(ri|f1q,f2q)
887
+ # where
888
+ # A2(ri|fq) = p2(ri|fq) (not fixed fq; full smoothing)
889
+ #
890
+ # The smoothing procedure is NOT curtailed here and it goes upto
891
+ #
892
+ # pn(ri|f1q,f2q,...,fn-1q) is estimated as:
893
+ # pn(ri|f1q,f2q,...,fn-1q) = omega1 * An-1(ri|f1q, f2q,...,fn-2q) + omega2 * W5(ri|f1q,f2q,...,fn-1q)
894
+ # where
895
+ # An-1(ri|f1q,f2q,...,fn-2q) = sum over fq omega_c * pcn-1(f1q,f2q,...,fn-2q)
896
+ #
897
+ # Weights (omegas) are calculated as in Topham et al. 1993)
898
+ #
899
+ # sigma value used is: 5.00
900
+ #
901
+ HEADER
902
+ # full smooting
903
+ 1.upto($env_features.size) do |ci|
904
+ env_labels.combination(ci) do |c1|
905
+ Enumerable.cart_prod(*c1).each do |labels|
906
+ pattern = "." * $env_features.size
907
+ labels.each do |label|
908
+ j = label[0].chr.to_i
909
+ l = label[1].chr
910
+ pattern[j] = l
911
+ end
912
+
913
+ # get environmetns, frequencies, and probabilities
914
+ envs = $envs.values.select { |env| env.label.match(pattern.to_re) }
915
+ freq_arr = envs.inject(NArray.float(21)) { |sum, env| sum + env.freq_array }
916
+ prob_arr = NArray.float(21)
917
+ 0.upto(20) { |i| prob_arr[i] = freq_arr[i] == 0 ? 0 : freq_arr[i] / freq_arr.sum.to_f }
918
+
919
+ # collect priors
920
+ priors = []
921
+ if ci > 1
922
+ labels.combination(ci - 1).each { |c2| priors << $smooth_prob[ci][c2.to_set] }
923
+ else
924
+ priors << $smooth_prob[1]
925
+ end
926
+
927
+ # entropy based weighting priors
928
+ entropy_max = Math::log(21)
929
+ entropies = priors.map do |prior|
930
+ (entropy_max + prior.to_a.inject(0.0) { |s, p| s + p * Math::log(p) }) / entropy_max
931
+ end
932
+ weighted_priors = priors.map_with_index { |p, i| p * entropies[i] / entropies.sum }.sum
933
+
934
+ # smoothing step
935
+ smooth_prob_arr = NArray.float(21)
936
+ big_N = freq_arr.sum.to_f
937
+ small_n = 21.0
938
+ omega1 = 1.0 / (1 + big_N / ($sigma * small_n))
939
+ omega2 = 1.0 - omega1
940
+ 0.upto(20) { |i| smooth_prob_arr[i] = 100.0 * (omega1 * weighted_priors[i] + omega2 * prob_arr[i]) }
941
+
942
+ # normalization step
943
+ smooth_prob_arr_sum = smooth_prob_arr.sum
944
+ 0.upto(20) { |i| smooth_prob_arr[i] = 100.0 * (smooth_prob_arr[i] / smooth_prob_arr_sum) }
945
+
946
+ # store smoothed probabilties in a hash using a set of envrionment labels as a key
947
+ if !$smooth_prob.has_key?(ci + 1)
948
+ $smooth_prob[ci + 1] = {}
949
+ $smooth_prob[ci + 1][labels.to_set] = smooth_prob_arr
950
+ else
951
+ $smooth_prob[ci + 1][labels.to_set] = smooth_prob_arr
952
+ end
953
+ end
954
+ end
955
+ end
956
+ end
957
+
958
+ # updating smoothed probability array for each envrionment
959
+ $envs.values.each { |e| e.smooth_prob_array = $smooth_prob[$env_features.size + 1][e.label_set] }
960
+
961
+ # for a total substitution probability matrix
962
+ tot_smooth_prob_matrix = NMatrix.float(21,21)
963
+
964
+ # grouping environments by its environment labels but amino acid label
965
+ env_groups = $envs.values.group_by { |env| env.label[1..-1] }
966
+
967
+ # sorting environments and build 21X21 substitution matrices
968
+ env_groups.to_a.sort_by { |env_group|
969
+ # a bit clumsy sorting here...
970
+ env_group[0].split("").map_with_index { |l, i|
971
+ $env_features[i + 1].labels.index(l)
972
+ }
973
+ }.each_with_index do |group, group_no|
974
+ # calculating 21X21 substitution probability matrix for each envrionment
975
+ grp_prob_matrix = NMatrix.float(21,21)
976
+
977
+ $amino_acids.each_with_index do |aa, ai|
978
+ smooth_prob_array = group[1].find { |e| e.label.start_with?(aa) }.smooth_prob_array
979
+ 0.upto(20) { |j| grp_prob_matrix[ai, j] = smooth_prob_array[j] }
980
+ end
981
+
982
+ tot_smooth_prob_matrix += grp_prob_matrix
983
+
984
+ if $output == 1
985
+ $outfh.puts ">#{group[0]} #{group_no}"
986
+ $outfh.puts grp_prob_matrix.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)
987
+ end
988
+ end
989
+
990
+ tot_smooth_prob_matrix /= env_groups.size
991
+
992
+ if $output == 1
993
+ $outfh.puts ">Total"
994
+ $outfh.puts tot_smooth_prob_matrix.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)
995
+ $outfh.close
996
+ exit 0
997
+ end
998
+
999
+ if $output == 2
1000
+ $outfh.puts <<HEADER
1001
+ #
1002
+ # The probabilities were then divided by the background probabilities
1003
+ # which were derived from the environment-independent amino acid frequencies.
1004
+ # ^^^^^^^^^^^^^^^^^^^^^^^
1005
+ #
1006
+ # Shown here are logarithms of these values multiplied by 3/log(2)
1007
+ # rounded to the nearest integer (log-odds scores in 1/3 bit units).
1008
+ #
1009
+ # For total (composite) matrix, Entropy = XXX bits, Expected score = XXX
1010
+ #
1011
+ HEADER
1012
+
1013
+ # log-add ratio matrices from now on
1014
+ tot_logo_mat = NMatrix.float(21,21)
1015
+ factor = $scale / Math::log(2)
1016
+
1017
+ # grouping environments by its environment labels but amino acid label
1018
+ env_groups = $envs.values.group_by { |env| env.label[1..-1] }
1019
+
1020
+ # sorting environments and build 21X21 substitution matrices
1021
+ env_groups.to_a.sort_by { |env_group|
1022
+ # a bit clumsy sorting here...
1023
+ env_group[0].split("").map_with_index { |l, i|
1024
+ $env_features[i + 1].labels.index(l)
1025
+ }
1026
+ }.each_with_index do |group, group_no|
1027
+ # calculating 21X21 substitution probability matrix for each envrionment
1028
+ grp_label = group[0]
1029
+ grp_envs = group[1]
1030
+ grp_logo_mat = NMatrix.float(21,21)
1031
+
1032
+ $amino_acids.each_with_index do |aa, ai|
1033
+ env = grp_envs.detect { |e| e.label.start_with?(aa) }
1034
+ logo_arr = NArray.float(21)
1035
+
1036
+ env.smooth_prob_array.to_a.each_with_index do |prob, j|
1037
+ paj = 100.0 * $aa_rel_freq[$amino_acids[j]]
1038
+ odds = prob == 0.0 ? 0.000001 / paj : prob / paj
1039
+ logo_arr[j] = factor * Math::log(odds)
1040
+ end
1041
+ 0.upto(20) { |j| grp_logo_mat[ai, j] = logo_arr[j] }
1042
+ end
1043
+
1044
+ tot_logo_mat += grp_logo_mat
1045
+
1046
+ $outfh.puts ">#{grp_label} #{group_no}"
1047
+ $outfh.puts grp_logo_mat.round.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)
1048
+ end
1049
+
1050
+ tot_logo_mat /= env_groups.size
1051
+
1052
+ $outfh.puts ">Total"
1053
+ $outfh.puts tot_logo_mat.round.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)
1054
+ $outfh.close
1055
+ exit 0
1056
+ end
1057
+ end
1058
+ end
1059
+ end
1060
+
1061
+ end # class << self
1062
+ end # class CLI
1063
+ end # module Egor