egor 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/lib/egor/cli.rb ADDED
@@ -0,0 +1,1063 @@
1
+ require "getoptlong"
2
+ require "logger"
3
+ require "rubygems"
4
+ require "narray"
5
+ require "bio"
6
+ require "set"
7
+ require "facets"
8
+ require "simple_memoize"
9
+
10
+ require "narray_extensions"
11
+ require "nmatrix_extensions"
12
+ require "enumerable_extensions"
13
+ require "math_extensions"
14
+ require "environment_feature"
15
+ require "environment"
16
+
17
+ # This is a module for an actual command line interpreter for Egor
18
+ # ---
19
+ # Copyright (C) 2008-9 Semin Lee
20
+ module Egor
21
+ class CLI
22
+ class << self
23
+
24
+ # :nodoc:
25
def print_version
  # Write the gem's version constant, followed by a newline, to standard output.
  $stdout.puts(Egor::VERSION)
end
28
+
29
+ # Print Egor's Usage on the screen
30
+ #
31
+ # :call-seq:
32
+ # Egor::CLI::print_usage
33
+ #
34
def print_usage
  # Print the command-line help text to standard output.
  #
  # The text is kept in sync with the defaults set in +execute+:
  # weighting at PID 60 is the default (--noweight switches it off), and
  # --pidmax only skips pairs whose PID is strictly greater than the limit.
  puts <<-USAGE
egor: Esst GeneratOR, a program to calculate environment-specific amino acid substitution tables.

Usage:
    egor [ options ] -l TEMLIST-file -c CLASSDEF-file
        or
    egor [ options ] -f TEM-file -c CLASSDEF-file

Options:
    --tem-file (-f) STRING: a tem file
    --tem-list (-l) STRING: a list for tem files
    --classdef (-c) STRING: a file for the definition of environments (default: 'classdef.dat')
    --outfile (-o) STRING: output filename ("allmat.dat" if not specified)
    --weight (-w) INTEGER: clustering level (PID) for the BLOSUM-like weighting (default 60)
    --noweight: calculate substitution counts with no weights
    --smooth (-s) INTEGER:
        0 for partial smoothing (default)
        1 for full smoothing
    --nosmooth: perform no smoothing operation
    --cys (-y) INTEGER: (NOT implemented yet)
        0 for using C and J only for structure
        1 for both structure and sequence (default)
    --output INTEGER:
        0 for raw counts (no-smoothing performed)
        1 for probabilities
        2 for log-odds (default)
    --scale INTEGER: log-odds matrices in 1/n bit units (default 3)
    --sigma DOUBLE: change the sigma value for smoothing (default 5)
    --add DOUBLE: add this value to raw counts when deriving log-odds without smoothing (default 1/#classes)
    --penv: use environment-dependent frequencies for log-odds calculation (default false) (NOT implemented yet)
    --pidmin DOUBLE: count substitutions only for pairs with PID equal to or greater than this value (default none)
    --pidmax DOUBLE: count substitutions only for pairs with PID equal to or smaller than this value (default none)
    --verbose (-v) INTEGER
        0 for ERROR level (default)
        1 for WARN or above level
        2 for INFO or above level
        3 for DEBUG or above level
    --version: print version
    --help (-h): show help

  USAGE
end
77
+
78
+ # Calculate PID between two sequences
79
+ #
80
+ # :call-seq:
81
+ # Egor::CLI::calc_pid(seq1, seq2) -> Float
82
+ #
83
def calc_pid(seq1, seq2)
  # Percentage identity (0.0..100.0) between two aligned sequences given as
  # strings of equal length, with "-" marking a gap.
  #
  # Identical, non-gap columns are counted against every column where at
  # least one sequence has a residue; columns where both are gaps are ignored.
  # Returns 0.0 (instead of NaN from a 0/0 division) when there is no
  # comparable column, e.g. for empty or all-gap input.
  aligned = 0
  identical = 0
  gapped = 0

  seq1.split("").zip(seq2.split("")).each do |res1, res2|
    gap1 = (res1 == "-")
    gap2 = (res2 == "-")

    if !gap1 && !gap2
      aligned += 1
      identical += 1 if res1 == res2
    elsif gap1 != gap2
      # exactly one of the two sequences has a gap here
      gapped += 1
    end
  end

  compared = aligned + gapped
  compared.zero? ? 0.0 : 100.0 * identical / compared
end
105
+ memoize :calc_pid
106
+
107
+ # :nodoc:
108
def execute(arguments=[])
  # Run the whole Egor pipeline:
  #
  #   1. set the global defaults,
  #   2. parse the command-line options,
  #   3. read the environment class definition file,
  #   4. read the TEM file(s), count (optionally weighted) substitutions per
  #      environment, and write raw counts, probabilities or log-odds
  #      matrices to the output file.
  #
  # Options are read from ARGV by GetoptLong; the +arguments+ parameter is
  # not consulted in this method. The process is terminated with +exit+ on
  # --help/--version, on invalid input, and after the matrices are written.

  #
  # Abbreviations used in the code
  #
  # * env: environment
  # * tem: (FUGUE) template
  # * classdef: (environment) class definition
  # * aa: amino acid
  # * tot: total
  # * rel: relative
  # * obs: observation (frequency)
  # * mut: mutation
  # * mutb: mutability
  # * freq: frequency
  # * prob: probability
  # * opts: options
  #

  # Part 1.
  #
  # Global variables and their default values
  #
  $logger       = Logger.new(STDOUT)
  $logger.level = Logger::ERROR
  $amino_acids  = "ACDEFGHIKLMNPQRSTVWYJ".split("")
  $tem_list     = nil
  $tem_file     = nil
  $classdef     = "classdef.dat"
  $outfile      = "allmat.dat"
  $outfh        = nil # file handle for outfile
  $output       = 2
  $aa_tot_obs   = {}
  $aa_mut_obs   = {}
  $aa_mutb      = {}
  $aa_rel_mutb  = {}
  $aa_rel_freq  = {}
  $env_aa_obs   = {}
  $ali_size     = 0
  $tot_aa       = 0
  $sigma        = 5.0
  $weight       = 60
  $noweight     = false
  $smooth       = :partial
  $nosmooth     = false
  $scale        = 3
  $pidmin       = nil
  $pidmax       = nil
  $add          = 0
  $penv         = false
  $heatmap      = false
  $smooth_prob  = {}

  # Part 2.
  #
  # Parsing options
  #
  # Every option handled in the case statement below must be declared here,
  # otherwise GetoptLong rejects it as an invalid option.
  opts = GetoptLong.new(
    [ '--help',     '-h', GetoptLong::NO_ARGUMENT ],
    [ '--tem-list', '-l', GetoptLong::REQUIRED_ARGUMENT ],
    [ '--tem-file', '-f', GetoptLong::REQUIRED_ARGUMENT ],
    [ '--classdef', '-c', GetoptLong::REQUIRED_ARGUMENT ],
    [ '--smooth',   '-s', GetoptLong::REQUIRED_ARGUMENT ],
    [ '--nosmooth',       GetoptLong::NO_ARGUMENT ],
    [ '--weight',   '-w', GetoptLong::REQUIRED_ARGUMENT ],
    [ '--noweight',       GetoptLong::NO_ARGUMENT ],
    [ '--heatmap',        GetoptLong::NO_ARGUMENT ],
    [ '--output',         GetoptLong::REQUIRED_ARGUMENT ],
    [ '--cys',      '-y', GetoptLong::REQUIRED_ARGUMENT ],
    [ '--penv',           GetoptLong::NO_ARGUMENT ],
    [ '--scale',          GetoptLong::REQUIRED_ARGUMENT ],
    [ '--sigma',          GetoptLong::REQUIRED_ARGUMENT ],
    [ '--add',            GetoptLong::REQUIRED_ARGUMENT ],
    [ '--pidmin',         GetoptLong::REQUIRED_ARGUMENT ],
    [ '--pidmax',         GetoptLong::REQUIRED_ARGUMENT ],
    [ '--outfile',  '-o', GetoptLong::REQUIRED_ARGUMENT ],
    [ '--verbose',  '-v', GetoptLong::REQUIRED_ARGUMENT ],
    [ '--version',        GetoptLong::NO_ARGUMENT ]
  )

  opts.each do |opt, arg|
    case opt
    when '--help'
      print_usage
      exit 0
    when '--tem-list'
      $tem_list = arg
    when '--tem-file'
      $tem_file = arg
    when '--classdef'
      $classdef = arg
    when '--output'
      $output = arg.to_i
    when '--outfile'
      $outfile = arg
    when '--cys'
      # not implemented: refuse explicitly instead of silently ignoring it
      $logger.error "!!! --cys option is not available yet"
      exit 1
    when '--weight'
      $weight = arg.to_i
    when '--sigma'
      $sigma = arg.to_f
    when '--pidmin'
      $pidmin = arg.to_f
    when '--pidmax'
      $pidmax = arg.to_f
    when '--noweight'
      $noweight = true
    when '--smooth'
      $smooth = (arg.to_i == 1 ? :full : :partial)
    when '--nosmooth'
      $nosmooth = true
    when '--scale'
      $scale = arg.to_f
    when '--add'
      $add = arg.to_f
    when '--penv'
      # not implemented: refuse explicitly instead of silently ignoring it
      $logger.error "!!! --penv option is not available yet"
      exit 1
    when '--heatmap'
      $heatmap = true
    when '--verbose'
      $logger.level = case arg.to_i
                      when 0 then Logger::ERROR
                      when 1 then Logger::WARN
                      when 2 then Logger::INFO
                      when 3 then Logger::DEBUG
                      else Logger::ERROR
                      end
    when '--version'
      print_version
      exit 0
    end
  end

  # when arguments are nonsense, print usage:
  # leftover positional arguments, or not exactly one of -l / -f given
  if ((ARGV.length != 0) ||
      (!$tem_list && !$tem_file) ||
      ($tem_list && $tem_file))
    print_usage
    exit 1
  end

  # Part 3.
  #
  # Reading Environment Class Definition File
  #

  # an array for storing all environment feature objects
  $env_features = []

  # the amino acid in a substitution is itself an environment feature (index 0)
  $env_features << EnvironmentFeature.new("sequence",
                                          $amino_acids,
                                          $amino_acids,
                                          "F",
                                          "F")

  # read the environment class definition file and
  # append each feature to the array prepared above;
  # a definition line has five ';'-separated fields
  IO.foreach($classdef) do |line|
    next if line.start_with?("#")

    env_ftr = line.chomp.split(/;/)

    unless env_ftr.length == 5
      $logger.error "@@@ #{line} doesn't seem to be a proper format for class definition"
      exit 1
    end

    $logger.info ">>> An environment feature, #{line.chomp} detected"

    if env_ftr[-1] == "T"
      # skip silenced environment feature
      $logger.warn "!!! The environment feature, #{line.chomp} silent"
      next
    end

    if env_ftr[-2] == "T"
      $logger.warn "!!! The environment feature, #{line.chomp} constrained"
    end

    $env_features << EnvironmentFeature.new(env_ftr[0],
                                            env_ftr[1].split(""),
                                            env_ftr[2].split(""),
                                            env_ftr[3],
                                            env_ftr[4])
  end

  # a hash for storing all environment objects
  $envs = {}

  # generate all possible combinations of environment labels, and
  # create & store every environment object into the hash prepared above with the label as a key
  $env_features.inject([]) { |sum, ec|
    sum << ec.labels
  }.inject { |pro, lb|
    pro.product(lb)
  }.each_with_index { |e, i|
    $envs[e.flatten.join] = Environment.new(i, e.flatten.join)
  }

  # Part 4.
  #
  # Reading TEM file or TEMLIST list file and counting substitutions
  #

  # a global file handle for output
  $outfh = File.open($outfile, "w")

  # --tem-file names one TEM file directly; --tem-list names a text file
  # holding one TEM file path per line (blank lines are ignored)
  tem_files = if $tem_file
                [$tem_file]
              else
                IO.readlines($tem_list).map { |path| path.chomp }.reject { |path| path.empty? }
              end

  tem_files.each do |tem_file|
    $logger.info ">>> Analysing #{tem_file} ..."

    ali = Bio::Alignment::OriginalAlignment.new
    ff  = Bio::FlatFile.auto(tem_file)
    ff.each_entry do |pir|
      if pir.definition == "sequence"
        ali.add_seq(pir.data.gsub("\n", ""), pir.entry_id)
      end
    end

    $ali_size += ali.size
    env_labels = {}
    disulphide = {}

    ali.each_pair do |key, seq|
      # check disulphide bond environment first!
      ff.rewind
      ff.each_entry do |pir|
        if (pir.entry_id == key) && (pir.definition == "disulphide")
          disulphide[key] = pir.data.gsub("\n", "").split("")
        end
      end

      $env_features.each_with_index do |ec, ei|
        env_labels[key] = [] unless env_labels.has_key?(key)

        ff.rewind
        ff.each_entry do |pir|
          if (pir.entry_id == key) && (pir.definition == ec.name)
            labels = pir.data.gsub("\n", "").split("").map_with_index do |sym, pos|
              if sym == "-"
                "-"
              elsif sym == "X" || sym == "x"
                "X"
              else
                if ei == 0 # Amino Acid Environment Feature
                  # a Cys whose disulphide annotation is "F" is relabelled "J"
                  ((disulphide[key][pos] == "F") && (sym == "C")) ? "J" : sym
                else
                  ec.labels[ec.symbols.index(sym)]
                end
              end
            end

            # per-position label string: one character per feature, appended in feature order
            if env_labels[key].empty?
              env_labels[key] = labels
            else
              env_labels[key].each_with_index { |e, i| env_labels[key][i] = e + labels[i] }
            end
          end
        end
      end
    end

    if $noweight
      ali.each_pair do |id1, seq1|
        ali.each_pair do |id2, seq2|
          if id1 != id2
            pid = calc_pid(seq1, seq2)
            s1  = seq1.split("")
            s2  = seq2.split("")

            # check PID_MIN
            if $pidmin && (pid < $pidmin)
              $logger.info ">>> Skip alignment between #{id1} and #{id2} having PID, #{pid}% less than PID_MIN, #{$pidmin}"
              next
            end

            # check PID_MAX
            if $pidmax && (pid > $pidmax)
              $logger.info ">>> Skip alignment between #{id1} and #{id2} having PID, #{pid}% greater than PID_MAX, #{$pidmax}"
              next
            end

            s1.each_with_index do |aa1, pos|
              if env_labels[id1][pos].include?("X")
                $logger.info ">>> Substitutions from #{id1}-#{pos}-#{aa1} were masked"
                next
              end

              aa1.upcase!
              aa2 = s2[pos].upcase

              if !$amino_acids.include?(aa1)
                $logger.warn "!!! #{id1}-#{pos}-#{aa1} is not standard amino acid" unless aa1 == "-"
                next
              end

              if !$amino_acids.include?(aa2)
                $logger.warn "!!! #{id2}-#{pos}-#{aa2} is not standard amino acid" unless aa2 == "-"
                next
              end

              aa1 = (((disulphide[id1][pos] == "F") && (aa1 == "C")) ? "J" : aa1)
              aa2 = (((disulphide[id2][pos] == "F") && (aa2 == "C")) ? "J" : aa2)

              $envs[env_labels[id1][pos]].add_residue_count(aa2)

              # the label without its leading amino acid character
              grp_label = env_labels[id1][pos][1..-1]

              ($env_aa_obs[grp_label] ||= Hash.new(0))[aa1] += 1
              $aa_tot_obs[aa1] = ($aa_tot_obs[aa1] || 0) + 1
              $aa_mut_obs[aa1] = ($aa_mut_obs[aa1] || 0) + 1 if aa1 != aa2

              $logger.debug "*** Add #{id1}-#{pos}-#{aa1} -> #{id2}-#{pos}-#{aa2} substituion for #{env_labels[id1][pos]}"
            end
          end
        end
      end
    else
      # BLOSUM-like weighting
      clusters = []
      ali.each_pair { |i, s| clusters << [i] }

      # a loop for single linkage clustering:
      # repeat until a full pass merges nothing
      begin
        continue = false
        0.upto(clusters.size - 2) do |i|
          indexes = []
          (i + 1).upto(clusters.size - 1) do |j|
            found = false
            clusters[i].each do |c1|
              clusters[j].each do |c2|
                if calc_pid(ali[c1], ali[c2]) >= $weight
                  indexes << j
                  found = true
                  break
                end
              end
              break if found
            end
          end

          unless indexes.empty?
            continue = true
            group = clusters[i]
            indexes.each do |k|
              group = group.concat(clusters[k])
              clusters[k] = nil
            end
            clusters[i] = group
            clusters.compact!
          end
        end
      end while(continue)

      # count substitutions only between members of different clusters,
      # each observation weighted by the inverse of the cluster size(s)
      clusters.combination(2).each do |cluster1, cluster2|
        cluster1.each do |id1|
          cluster2.each do |id2|
            seq1 = ali[id1].split("")
            seq2 = ali[id2].split("")
            seq1.each_with_index do |aa1, pos|
              if env_labels[id1][pos].include?("X")
                $logger.debug "*** Substitutions from #{id1}-#{pos}-#{aa1} were masked"
                next
              end

              aa1.upcase!
              aa2 = seq2[pos].upcase

              if !$amino_acids.include?(aa1)
                $logger.warn "!!! #{id1}-#{pos}-#{aa1} is not standard amino acid" unless aa1 == "-"
                next
              end

              if !$amino_acids.include?(aa2)
                $logger.warn "!!! #{id2}-#{pos}-#{aa2} is not standard amino acid" unless aa2 == "-"
                next
              end

              aa1   = (((disulphide[id1][pos] == "F") && (aa1 == "C")) ? "J" : aa1)
              aa2   = (((disulphide[id2][pos] == "F") && (aa2 == "C")) ? "J" : aa2)
              size1 = cluster1.size
              size2 = cluster2.size
              obs1  = 1.0 / size1
              obs2  = 1.0 / size2

              $envs[env_labels[id1][pos]].add_residue_count(aa2, 1.0 / (size1 * size2))
              $envs[env_labels[id2][pos]].add_residue_count(aa1, 1.0 / (size1 * size2))

              grp_label1 = env_labels[id1][pos][1..-1]
              grp_label2 = env_labels[id2][pos][1..-1]

              ($env_aa_obs[grp_label1] ||= Hash.new(0.0))[aa1] += obs1
              ($env_aa_obs[grp_label2] ||= Hash.new(0.0))[aa2] += obs2

              $aa_tot_obs[aa1] = ($aa_tot_obs[aa1] || 0) + obs1
              $aa_tot_obs[aa2] = ($aa_tot_obs[aa2] || 0) + obs2

              if aa1 != aa2
                $aa_mut_obs[aa1] = ($aa_mut_obs[aa1] || 0) + obs1
                $aa_mut_obs[aa2] = ($aa_mut_obs[aa2] || 0) + obs2
              end

              $logger.debug "*** Add #{id1}-#{pos}-#{aa1} -> #{id2}-#{pos}-#{aa2} substituion for #{env_labels[id1][pos]}"
              $logger.debug "*** Add #{id2}-#{pos}-#{aa2} -> #{id1}-#{pos}-#{aa1} substituion for #{env_labels[id2][pos]}"
            end
          end
        end
      end
    end # if $noweight
  end # tem_files.each

  # print out default header
  $outfh.puts <<HEADER
# Environment-specific amino acid substitution matrices
# Creator: egor version #{Egor::VERSION}
# Creation Date: #{Time.now.strftime("%d/%m/%Y %H:%M")}
#
# Definitions for structural environments:
# #{$env_features.size - 1} features used
#
HEADER

  $env_features[1..-1].each { |e| $outfh.puts "# #{e}" }

  $outfh.puts <<HEADER
#
# (read in from #{$classdef})
#
# Number of alignments: #{$ali_size}
# (list of .tem files read in from #{$tem_list || $tem_file})
#
# Total number of environments: #{Integer($envs.size / $amino_acids.size)}
#
# There are #{$amino_acids.size} amino acids considered.
# #{$amino_acids.join}
#
HEADER

  if $noweight
    $outfh.puts "# Weighting scheme: none"
  else
    $outfh.puts "# Weighting scheme: clustering at PID #{$weight} level"
  end
  $outfh.puts "#"

  # calculate amino acid frequencies and mutabilities, and
  # print them as default statistics in the header part
  # NOTE(review): the statistics below assume every amino acid (and "A" in
  # particular) was observed and mutated at least once; a nil count raises here.
  ala_factor = 100.0 * $aa_tot_obs["A"] / $aa_mut_obs["A"].to_f
  $tot_aa    = $aa_tot_obs.values.sum

  $outfh.puts "#"
  $outfh.puts "# Total amino acid frequencies:\n"
  $outfh.puts "# %-3s %9s %9s %5s %8s %8s" % %w[RES MUT_OBS TOT_OBS MUTB REL_MUTB REL_FRQ]

  $aa_tot_obs.each_pair do |res, freq|
    $aa_mutb[res]     = $aa_mut_obs[res] / freq.to_f
    $aa_rel_mutb[res] = $aa_mutb[res] * ala_factor
    $aa_rel_freq[res] = freq / $tot_aa.to_f
  end

  # unweighted counts are integers, weighted counts are fractional
  stat_format = ($noweight ? "# %-3s %9d %9d %5.2f %8d %8.4f" : "# %-3s %9.2f %9.2f %5.2f %8d %8.4f")

  $amino_acids.each do |res|
    $outfh.puts stat_format %
      [res, $aa_mut_obs[res], $aa_tot_obs[res], $aa_mutb[res], $aa_rel_mutb[res], $aa_rel_freq[res]]
  end
  $outfh.puts "#"

  # calculating probabilities for each environment
  $envs.values.each do |e|
    if e.freq_array.sum != 0
      e.prob_array = 100.0 * e.freq_array / e.freq_array.sum
    end
  end

  # group environments by their labels minus the amino acid label, and sort
  # the groups by the position of each label within its feature definition;
  # the same ordered list is reused for every kind of output matrix below
  env_groups = $envs.values.group_by { |env| env.label[1..-1] }.to_a.sort_by { |env_group|
    # a bit clumsy sorting here...
    env_group[0].split("").map_with_index { |l, i|
      $env_features[i + 1].labels.index(l)
    }
  }

  # count raw frequencies
  $tot_freq_matrix = ($noweight ? NMatrix.int(21,21) : NMatrix.float(21,21))

  # for each combination of environment features
  env_groups.each_with_index do |group, group_no|
    grp_freq_matrix = ($noweight ? NMatrix.int(21,21) : NMatrix.float(21,21))

    $amino_acids.each_with_index do |aa, ai|
      freq_array = group[1].find { |e| e.label.start_with?(aa) }.freq_array
      0.upto(20) { |j| grp_freq_matrix[ai, j] = freq_array[j] }
    end

    $tot_freq_matrix += grp_freq_matrix

    if $output == 0
      $outfh.puts ">#{group[0]} #{group_no}"
      $outfh.puts grp_freq_matrix.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)
    end
  end

  if $output == 0
    $outfh.puts ">Total"
    $outfh.puts $tot_freq_matrix.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)
    $outfh.close
    exit 0
  end

  # for probability
  if $output == 1
    $outfh.puts <<HEADER
#
# Each column (j) represents the probability distribution for the
# likelihood of acceptance of a mutational event by a residue type j in
# a particular structural environment (specified after >) leading to
# any other residue type (i) and sums up to 100.
#
HEADER
  end

  if ($output > 0) && $nosmooth
    # Probability matrices
    tot_prob_matrix = NMatrix.float(21, 21)

    # for each combination of environment features
    env_groups.each_with_index do |group, group_no|
      grp_prob_matrix = NMatrix.float(21,21)

      $amino_acids.each_with_index do |aa, ai|
        prob_array = group[1].find { |e| e.label.start_with?(aa) }.prob_array
        0.upto(20) { |j| grp_prob_matrix[ai, j] = prob_array[j] }
      end

      tot_prob_matrix += grp_prob_matrix

      if ($output == 1)
        $outfh.puts ">#{group[0]} #{group_no}"
        $outfh.puts grp_prob_matrix.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)
      end
    end

    if ($output == 1)
      $outfh.puts ">Total"
      $outfh.puts tot_prob_matrix.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)
      $outfh.close
      exit 0
    end

    # NOTE(review): with --nosmooth and log-odds output (--output 2) nothing
    # further is written; unsmoothed log-odds (see --add) look unimplemented.
  end

  # for smoothing...
  if ($output > 0) && !$nosmooth
    #
    # p1 probability
    #
    p1      = NArray.float(21)
    a0      = NArray.float(21).fill(1 / 21.0)
    big_N   = $tot_aa.to_f
    small_n = 21.0
    omega1  = 1.0 / (1 + big_N / ($sigma * small_n))
    omega2  = 1.0 - omega1

    if $smooth == :partial
      # for partial smoothing, p1 probability is not smoothed!
      0.upto(20) { |i| p1[i] = 100.0 * $aa_rel_freq[$amino_acids[i]] }
      $smooth_prob[1] = p1
    else
      # for full smoothing, p1 probability is smoothed
      0.upto(20) { |i| p1[i] = 100.0 * (omega1 * a0[i] + omega2 * $aa_rel_freq[$amino_acids[i]]) }
      $smooth_prob[1] = p1
    end

    #
    # p2 and above
    #
    # every label is prefixed with the index of its feature, e.g. "0A"
    env_labels = $env_features.map_with_index {|ef, ei| ef.labels.map { |l| "#{ei}#{l}" } }

    if $smooth == :partial
      $outfh.puts <<HEADER
# Partial Smoothing:
#
# p1(ri) (i.e., amino acid composition) is estimated by summing over
# each row in all matrices (no smoothing)
#                            ^^^^^^^^^^^^
# p2(ri|Rj) is estimated as:
# p2(ri|Rj) = omega1 * p1(ri) + omega2 * W2(ri|Rj)
#
# p3(ri|Rj,fq) is estimated as:
# p3(ri|Rj,fq) = omega1 * A2(ri|fq) + omega2 * W3(ri|Rj,fq)
# where
# A2(ri|fq) = p2(ri|fq) (fixed fq; partial smoothing)
#
# The smoothing procedure is curtailed here and finally
# p5(ri|Rj,...) is estimated as:
# p5(ri|Rj,...) = omega1 * A3(ri|Rj,fq) + omega2 * W5(ri|Rj...)
# where
# A3(ri|Rj,fq) = sum over fq omega_c * pc3(Rj,fq)
#
# Weights (omegas) are calculated as in Topham et al. 1993)
#
# sigma value used is: #{"%.2f" % $sigma}
#
HEADER
      1.upto($env_features.size) do |ci|
        # for partial smoothing, only P1 ~ P3, and Pn are considered
        next if (ci > 2) && (ci < $env_features.size)

        env_labels.combination(ci) do |c1|
          Enumerable.cart_prod(*c1).each do |labels|
            # regexp-like pattern over the environment label: "." for any
            # feature that is not fixed by this combination
            pattern = "." * $env_features.size

            labels.each do |label|
              i = label[0].chr.to_i
              l = label[1].chr
              pattern[i] = l
            end

            # the amino acid (feature 0) must always be fixed
            if pattern =~ /^\./
              $logger.debug "*** Skipped environment, #{pattern}, for partial smoothing"
              next
            end

            # get environments, frequencies, and probabilities
            envs     = $envs.values.select { |env| env.label.match(pattern.to_re) }
            freq_arr = envs.inject(NArray.float(21)) { |sum, env| sum + env.freq_array }
            prob_arr = NArray.float(21)
            0.upto(20) { |i| prob_arr[i] = (freq_arr[i] == 0 ? 0 : freq_arr[i] / freq_arr.sum.to_f) }

            # # assess whether a residue type j is compatible with a particular combination of structural features
            # # corrections for non-zero colum vector phenomenon by switching the smoothing procedure off as below
            # if ci == $env_features.size
            #   aa_label = labels.find { |l| l.match(/^0/) }[1].chr
            #   sub_pattern = "." * $env_features.size
            #   sub_pattern[0] = aa_label
            #   sub_freq_sum = 0
            #
            #   labels[1..-1].each do |label|
            #     next if label.start_with?("0")
            #     i = label[0].chr.to_i
            #     l = label[1].chr
            #     sub_pattern[i] = l
            #     sub_envs = $envs.values.select { |env| env.label.match(pattern.to_re) }
            #     sub_freq_arr = sub_envs.inject(NArray.float(21)) { |sum, env| sum + env.freq_array }
            #     sub_freq_sum += sub_freq_arr.sum
            #   end
            #
            #   if sub_freq_sum == 0
            #     if $smooth_prob.has_key?(ci + 1)
            #       $smooth_prob[ci + 1][labels.to_set] = prob_arr
            #     else
            #       $smooth_prob[ci + 1] = {}
            #       $smooth_prob[ci + 1][labels.to_set] = prob_arr
            #     end
            #     $logger.warn "!!! Smoothing procedure is off for the environment feature combination, #{pattern}"
            #     next
            #   end
            # end

            # collect priors if ci > 1
            # NOTE(review): for ci == 1 no prior is collected here although the
            # header above states p2 is smoothed towards p1 -- confirm intent.
            priors = []

            if ci == 2
              labels.combination(1).select { |c2| c2[0].start_with?("0") }.each { |c3|
                priors << $smooth_prob[2][c3.to_set]
              }
            elsif ci == $env_features.size
              labels.combination(2).select { |c2| c2[0].start_with?("0") || c2[1].start_with?("0") }.each { |c3|
                priors << $smooth_prob[3][c3.to_set]
              }
            end

            # entropy based weighting priors
            entropy_max     = Math::log(21)
            entropies       = priors.map { |prior| -1.0 * prior.to_a.inject(0.0) { |s, p| p == 0.0 ? s - 1 : s + p * Math::log(p) } }
            mod_entropies   = entropies.map_with_index { |entropy, i| (entropy_max - entropies[i]) / entropy_max }
            weights         = mod_entropies.map { |mod_entropy| mod_entropy / mod_entropies.sum }
            weighted_priors = priors.map_with_index { |prior, i| prior * weights[i] }.sum

            # smoothing step
            smooth_prob_arr = NArray.float(21)
            big_N           = freq_arr.sum.to_f
            small_n         = 21.0
            omega1          = 1.0 / (1 + big_N / ($sigma * small_n))
            omega2          = 1.0 - omega1
            0.upto(20) { |i| smooth_prob_arr[i] = 100.0 * (omega1 * weighted_priors[i] + omega2 * prob_arr[i]) }

            # normalization step
            smooth_prob_arr_sum = smooth_prob_arr.sum
            0.upto(20) { |i| smooth_prob_arr[i] = 100.0 * (smooth_prob_arr[i] / smooth_prob_arr_sum) }

            # store smoothed probabilities in a hash using a set of environment labels as a key
            ($smooth_prob[ci + 1] ||= {})[labels.to_set] = smooth_prob_arr
          end
        end
      end
    else
      $outfh.puts <<HEADER
# Full Smoothing:
#
# p1(ri) is estimated as:
# p1(ri) = omega1 * A0 + omega2 * W1(ri)
#
# p2(ri|f1q) is estimated as:
# p2(ri|f1q) = omega1 * p1(ri) + omega2 * W2(ri|fq)
#
# (NOTE: f1q is not fixed to be Rj in the full smoothing procedure)
#
# p3(ri|f1q,f2q) is estimated as:
# p3(ri|f1q,f2q) = omega1 * A2(ri|f1q) + omega2 * W3(ri|f1q,f2q)
# where
# A2(ri|fq) = p2(ri|fq) (not fixed fq; full smoothing)
#
# The smoothing procedure is NOT curtailed here and it goes upto
#
# pn(ri|f1q,f2q,...,fn-1q) is estimated as:
# pn(ri|f1q,f2q,...,fn-1q) = omega1 * An-1(ri|f1q, f2q,...,fn-2q) + omega2 * W5(ri|f1q,f2q,...,fn-1q)
# where
# An-1(ri|f1q,f2q,...,fn-2q) = sum over fq omega_c * pcn-1(f1q,f2q,...,fn-2q)
#
# Weights (omegas) are calculated as in Topham et al. 1993)
#
# sigma value used is: #{"%.2f" % $sigma}
#
HEADER
      # full smoothing
      1.upto($env_features.size) do |ci|
        env_labels.combination(ci) do |c1|
          Enumerable.cart_prod(*c1).each do |labels|
            pattern = "." * $env_features.size
            labels.each do |label|
              j = label[0].chr.to_i
              l = label[1].chr
              pattern[j] = l
            end

            # get environments, frequencies, and probabilities
            envs     = $envs.values.select { |env| env.label.match(pattern.to_re) }
            freq_arr = envs.inject(NArray.float(21)) { |sum, env| sum + env.freq_array }
            prob_arr = NArray.float(21)
            0.upto(20) { |i| prob_arr[i] = freq_arr[i] == 0 ? 0 : freq_arr[i] / freq_arr.sum.to_f }

            # collect priors: the smoothed probabilities of every sub-combination
            # one label shorter, or p1 for single labels
            priors = []
            if ci > 1
              labels.combination(ci - 1).each { |c2| priors << $smooth_prob[ci][c2.to_set] }
            else
              priors << $smooth_prob[1]
            end

            # entropy based weighting priors
            entropy_max = Math::log(21)
            entropies   = priors.map do |prior|
              (entropy_max + prior.to_a.inject(0.0) { |s, p| s + p * Math::log(p) }) / entropy_max
            end
            weighted_priors = priors.map_with_index { |p, i| p * entropies[i] / entropies.sum }.sum

            # smoothing step
            smooth_prob_arr = NArray.float(21)
            big_N           = freq_arr.sum.to_f
            small_n         = 21.0
            omega1          = 1.0 / (1 + big_N / ($sigma * small_n))
            omega2          = 1.0 - omega1
            0.upto(20) { |i| smooth_prob_arr[i] = 100.0 * (omega1 * weighted_priors[i] + omega2 * prob_arr[i]) }

            # normalization step
            smooth_prob_arr_sum = smooth_prob_arr.sum
            0.upto(20) { |i| smooth_prob_arr[i] = 100.0 * (smooth_prob_arr[i] / smooth_prob_arr_sum) }

            # store smoothed probabilities in a hash using a set of environment labels as a key
            ($smooth_prob[ci + 1] ||= {})[labels.to_set] = smooth_prob_arr
          end
        end
      end
    end

    # updating smoothed probability array for each environment
    $envs.values.each { |e| e.smooth_prob_array = $smooth_prob[$env_features.size + 1][e.label_set] }

    # for a total substitution probability matrix
    tot_smooth_prob_matrix = NMatrix.float(21,21)

    # build 21X21 substitution matrices for the sorted environment groups
    env_groups.each_with_index do |group, group_no|
      # calculating 21X21 substitution probability matrix for each environment
      grp_prob_matrix = NMatrix.float(21,21)

      $amino_acids.each_with_index do |aa, ai|
        smooth_prob_array = group[1].find { |e| e.label.start_with?(aa) }.smooth_prob_array
        0.upto(20) { |j| grp_prob_matrix[ai, j] = smooth_prob_array[j] }
      end

      tot_smooth_prob_matrix += grp_prob_matrix

      if $output == 1
        $outfh.puts ">#{group[0]} #{group_no}"
        $outfh.puts grp_prob_matrix.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)
      end
    end

    tot_smooth_prob_matrix /= env_groups.size

    if $output == 1
      $outfh.puts ">Total"
      $outfh.puts tot_smooth_prob_matrix.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)
      $outfh.close
      exit 0
    end

    if $output == 2
      $outfh.puts <<HEADER
#
# The probabilities were then divided by the background probabilities
# which were derived from the environment-independent amino acid frequencies.
#                              ^^^^^^^^^^^^^^^^^^^^^^^
#
# Shown here are logarithms of these values multiplied by 3/log(2)
# rounded to the nearest integer (log-odds scores in 1/3 bit units).
#
# For total (composite) matrix, Entropy = XXX bits, Expected score = XXX
#
HEADER

      # log-odds ratio matrices from now on
      tot_logo_mat = NMatrix.float(21,21)
      factor       = $scale / Math::log(2)

      # build 21X21 log-odds matrices for the sorted environment groups
      env_groups.each_with_index do |group, group_no|
        grp_label    = group[0]
        grp_envs     = group[1]
        grp_logo_mat = NMatrix.float(21,21)

        $amino_acids.each_with_index do |aa, ai|
          env      = grp_envs.detect { |e| e.label.start_with?(aa) }
          logo_arr = NArray.float(21)

          env.smooth_prob_array.to_a.each_with_index do |prob, j|
            paj  = 100.0 * $aa_rel_freq[$amino_acids[j]]
            # a zero probability is replaced by a tiny value so the logarithm stays finite
            odds = prob == 0.0 ? 0.000001 / paj : prob / paj
            logo_arr[j] = factor * Math::log(odds)
          end
          0.upto(20) { |j| grp_logo_mat[ai, j] = logo_arr[j] }
        end

        tot_logo_mat += grp_logo_mat

        $outfh.puts ">#{grp_label} #{group_no}"
        $outfh.puts grp_logo_mat.round.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)
      end

      tot_logo_mat /= env_groups.size

      $outfh.puts ">Total"
      $outfh.puts tot_logo_mat.round.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)
      $outfh.close
      exit 0
    end
  end
end
1060
+
1061
+ end # class << self
1062
+ end # class CLI
1063
+ end # module Egor