ulla 0.9.3

Sign up to get free protection for your applications and to get access to all the features.
Files changed (91) hide show
  1. data/.git/COMMIT_EDITMSG +13 -0
  2. data/.git/HEAD +1 -0
  3. data/.git/config +11 -0
  4. data/.git/description +1 -0
  5. data/.git/hooks/applypatch-msg.sample +15 -0
  6. data/.git/hooks/commit-msg.sample +24 -0
  7. data/.git/hooks/post-commit.sample +8 -0
  8. data/.git/hooks/post-receive.sample +15 -0
  9. data/.git/hooks/post-update.sample +8 -0
  10. data/.git/hooks/pre-applypatch.sample +14 -0
  11. data/.git/hooks/pre-commit.sample +18 -0
  12. data/.git/hooks/pre-rebase.sample +169 -0
  13. data/.git/hooks/prepare-commit-msg.sample +36 -0
  14. data/.git/hooks/update.sample +107 -0
  15. data/.git/index +0 -0
  16. data/.git/info/exclude +6 -0
  17. data/.git/logs/HEAD +3 -0
  18. data/.git/logs/refs/heads/master +3 -0
  19. data/.git/logs/refs/remotes/origin/HEAD +1 -0
  20. data/.git/objects/06/9494e479f28b5751fb135b8e55e8fef3d3a02e +0 -0
  21. data/.git/objects/22/0df784191ad94983ca1d943e49fe482c9d1069 +0 -0
  22. data/.git/objects/3b/b6f2b7f563175a13a0ccd723aab761552f448b +0 -0
  23. data/.git/objects/41/f48aefb4d7a6a87eb423eaae77ae1e8a58dd6c +0 -0
  24. data/.git/objects/44/d1f1782e3ea1d9fd2f9054784b53e8e810a8ca +0 -0
  25. data/.git/objects/4f/364c2eac29f5c7fcbf06419c4f58074cd32ace +2 -0
  26. data/.git/objects/57/1326145a7a4b3e58f3d3008ba343135f213b05 +4 -0
  27. data/.git/objects/6c/4f0844f62b7345f0651b0fb2829a8f157469fb +3 -0
  28. data/.git/objects/73/8dc79450de050f12d48a32602f2ddbe6807029 +0 -0
  29. data/.git/objects/7b/4acb3aee6616d80e295ee21fe8bb7ee93ebe96 +2 -0
  30. data/.git/objects/9e/0a9235b0d70a8029098070007fb414cb52504e +2 -0
  31. data/.git/objects/9e/bfcad2906aac4a23a7c9689a47b76723f5d152 +0 -0
  32. data/.git/objects/a6/578c95f2f474303464b572e9dac716432472b2 +0 -0
  33. data/.git/objects/a8/65ef5700ff04601c6fc40fa5ede3cc25534723 +0 -0
  34. data/.git/objects/aa/285cb176668c5e49c54c6e1d3cc27bd47fd4f4 +0 -0
  35. data/.git/objects/b8/e3828a1082137c4aa4595386bdfb73e3c75b9d +0 -0
  36. data/.git/objects/c2/fb6afc000952b56354fe195682645000d2aea2 +0 -0
  37. data/.git/objects/c4/a0553ca0e3c4628e688ecb5e3304a8a8ac0c28 +2 -0
  38. data/.git/objects/c8/d49f83c4a32cff2d87dd4aa5f83eb7aac3a753 +5 -0
  39. data/.git/objects/ca/c25e8049075ed4bff993705acb4750b2b62ba9 +0 -0
  40. data/.git/objects/d2/ff2e939339eb3fb776e064c258e71dfa1cf396 +0 -0
  41. data/.git/objects/d7/cedf9e2a8ff35b5d7dafdc0f20daed9c65ce44 +0 -0
  42. data/.git/objects/e2/e81af59e3a6c4aa8daac62add62860ae776ba4 +0 -0
  43. data/.git/objects/e5/7c47d183ce5dda1a944c7ee1c19c8a0c4bb278 +0 -0
  44. data/.git/objects/eb/f4a4e1e50bb30731597f776e56b0ccb0c9959f +0 -0
  45. data/.git/objects/f6/39d6f6cf883fde4b9052012919c1df3288c7da +0 -0
  46. data/.git/objects/f8/2346f308f49053df108b7c31ac3089e8b4b4ac +0 -0
  47. data/.git/objects/fb/4b193bb1cbe9041d2f00176f6caa6acfb1fc12 +0 -0
  48. data/.git/objects/pack/pack-aebf617a0b8e016433238d2f21f542bc5b21bd15.idx +0 -0
  49. data/.git/objects/pack/pack-aebf617a0b8e016433238d2f21f542bc5b21bd15.pack +0 -0
  50. data/.git/packed-refs +8 -0
  51. data/.git/refs/heads/master +1 -0
  52. data/.git/refs/remotes/origin/HEAD +1 -0
  53. data/.gitignore +8 -0
  54. data/History.txt +42 -0
  55. data/Manifest.txt +90 -0
  56. data/PostInstall.txt +5 -0
  57. data/README.rdoc +259 -0
  58. data/Rakefile +32 -0
  59. data/bin/ulla +10 -0
  60. data/config/website.yml +2 -0
  61. data/config/website.yml.sample +2 -0
  62. data/lib/math_extensions.rb +7 -0
  63. data/lib/narray_extensions.rb +22 -0
  64. data/lib/nmatrix_extensions.rb +245 -0
  65. data/lib/string_extensions.rb +17 -0
  66. data/lib/ulla/cli.rb +1742 -0
  67. data/lib/ulla/environment.rb +34 -0
  68. data/lib/ulla/environment_class_hash.rb +20 -0
  69. data/lib/ulla/environment_feature.rb +26 -0
  70. data/lib/ulla/environment_feature_array.rb +12 -0
  71. data/lib/ulla/heatmap_array.rb +111 -0
  72. data/lib/ulla.rb +6 -0
  73. data/script/console +10 -0
  74. data/script/destroy +14 -0
  75. data/script/generate +14 -0
  76. data/script/txt2html +71 -0
  77. data/test/test_helper.rb +2 -0
  78. data/test/test_math_extensions.rb +11 -0
  79. data/test/test_narray_extensions.rb +14 -0
  80. data/test/test_nmatrix_extensions.rb +16 -0
  81. data/test/test_string_extensions.rb +11 -0
  82. data/test/test_ulla.rb +11 -0
  83. data/test/ulla/test_cli.rb +9 -0
  84. data/test/ulla/test_environment_class_hash.rb +25 -0
  85. data/test/ulla/test_environment_feature.rb +29 -0
  86. data/website/index.html +16 -0
  87. data/website/index.txt +217 -0
  88. data/website/javascripts/rounded_corners_lite.inc.js +285 -0
  89. data/website/stylesheets/screen.css +158 -0
  90. data/website/template.html.erb +57 -0
  91. metadata +215 -0
data/lib/ulla/cli.rb ADDED
@@ -0,0 +1,1742 @@
1
+ require 'rubygems'
2
+ require 'getoptlong'
3
+ require 'logger'
4
+ require 'narray'
5
+ require 'bio'
6
+ require 'set'
7
+ require 'facets'
8
+
9
+ require 'math_extensions'
10
+ require 'string_extensions'
11
+ require 'narray_extensions'
12
+ require 'nmatrix_extensions'
13
+
14
+ require 'ulla/environment'
15
+ require 'ulla/environment_class_hash'
16
+ require 'ulla/environment_feature'
17
+ require 'ulla/environment_feature_array'
18
+ require 'ulla/heatmap_array'
19
+
20
+ # This is a module for an actual command line interpreter for Ulla
21
+ # ---
22
+ # Copyright (C) 2008-9 Semin Lee
23
+ module Ulla
24
+ class CLI
25
+ class << self
26
+
27
# :nodoc:
# Write the value of the VERSION constant, followed by a newline, to
# standard output.
def print_version
  $stdout.puts(VERSION)
end
31
+
32
# Print Ulla's usage message on the screen (standard output).
#
# The message shows the two invocation forms (template list or single
# template file) followed by the command line options and their defaults.
# The defaults quoted here are meant to mirror the initial values assigned
# in +execute+ (e.g. the heat map stem default is 'heatmaps').
#
# :call-seq:
#   Ulla::CLI::print_usage
#
def print_usage
  # The heredoc body is emitted verbatim, so its text starts at column 0;
  # only the closing USAGE marker may be indented (<<- form).
  puts <<-USAGE
ulla: a program to calculate environment-specific amino acid substitution tables.

Usage:
    ulla [ options ] -l TEMLIST-file -c CLASSDEF-file
        or
    ulla [ options ] -f TEM-file -c CLASSDEF-file

Options:
    --tem-file (-f) FILE: a tem file
    --tem-list (-l) FILE: a list for tem files
    --classdef (-c) FILE: a file for the definition of environments (default: 'classdef.dat')
    --outfile (-o) FILE: output filename (default 'allmat.dat')
    --weight (-w) INTEGER: clustering level (PID) for the BLOSUM-like weighting (default: 60)
    --noweight: calculate substitution counts with no weights
    --smooth (-s) INTEGER:
        0 for partial smoothing (default)
        1 for full smoothing
    --p1smooth: perform smoothing for p1 probability calculation when partial smoothing
    --nosmooth: perform no smoothing operation
    --cys (-y) INTEGER:
        0 for using C and J only for structure (default)
        1 for both structure and sequence
        2 for using only C for both (must be set when you have no 'disulphide' or 'disulfide' annotation in templates)
    --output INTEGER:
        0 for raw counts (no smoothing performed)
        1 for probabilities
        2 for log-odds (default)
    --noroundoff: do not round off log odds ratio
    --scale INTEGER: log-odds matrices in 1/n bit units (default 3)
    --sigma DOUBLE: change the sigma value for smoothing (default 5.0)
    --autosigma: automatically adjust the sigma value for smoothing
    --add DOUBLE: add this value to raw counts when deriving log-odds without smoothing (default 1/#classes)
    --pidmin DOUBLE: count substitutions only for pairs with PID equal to or greater than this value (default none)
    --pidmax DOUBLE: count substitutions only for pairs with PID smaller than this value (default none)
    --heatmap INTEGER:
        0 create a heat map file for each substitution table
        1 create one big file containing all heat maps from substitution tables
        2 do both 0 and 1
    --heatmap-format INTEGER:
        0 for Portable Network Graphics (PNG) Format (default)
        1 for Graphics Interchange Format (GIF)
        2 for Joint Photographic Experts Group (JPEG) Format
        3 for Microsoft Windows bitmap (BMP) Format
        4 for Portable Document Format (PDF)
    --heatmap-columns INTEGER: number of tables to print in a row when --heatmap 1 or 2 set (default: sqrt(no. of tables))
    --heatmap-stem STRING: stem for a file name when --heatmap 1 or 2 set (default: 'heatmaps')
    --heatmap-values: print values in the cells when generating heat maps
    --verbose (-v) INTEGER
        0 for ERROR level
        1 for WARN or above level (default)
        2 for INFO or above level
        3 for DEBUG or above level
    --version: print version
    --help (-h): show help

  USAGE
end
96
+
97
# Calculate the percentage identity (PID) between two aligned sequences.
#
# The sequences are compared column by column ('-' marks a gap):
# * a column where neither side is a gap counts as aligned, and as
#   identical when both characters are equal;
# * a column where exactly one side is a gap counts as a gap column;
# * a column where both sides are gaps is ignored.
#
# PID = 100 * identical / (aligned + gap columns).
#
# Columns are built from seq1's positions; if seq2 is shorter, the
# missing positions are paired with nil, which never equals '-'.
#
# When there is no comparable column at all (empty input or gap/gap
# columns only) 0.0 is returned instead of the NaN that a 0/0 float
# division would produce.
#
# :call-seq:
#   Ulla::CLI::calculate_pid(seq1, seq2) -> Float
#
def calculate_pid(seq1, seq2)
  align = 0 # no. of columns without a gap on either side
  ident = 0 # no. of aligned columns holding identical characters
  intgp = 0 # no. of columns with a gap on exactly one side

  seq1.split('').zip(seq2.split('')).each do |aa1, aa2|
    gap1 = (aa1 == '-')
    gap2 = (aa2 == '-')

    if !gap1 && !gap2
      align += 1
      ident += 1 if aa1 == aa2
    elsif gap1 != gap2
      intgp += 1
    end
  end

  compared = align + intgp
  # guard against 0.0 / 0, which would silently yield NaN
  return 0.0 if compared == 0

  100.0 * ident / compared
end
124
+
125
+ # :nodoc:
126
+ def execute(arguments=[])
127
+ #
128
+ # * Abbreviations in the codes
129
+ #
130
+ # env: environment
131
+ # tem: (FUGUE) template
132
+ # classdef: (envlironment) class definition
133
+ # aa: amino acid
134
+ # aa: weighted amino acid
135
+ # tot: total
136
+ # rel: relative
137
+ # jnt: joint
138
+ # cnt: count
139
+ # mut: mutation
140
+ # mutb: mutability
141
+ # freq: frequency
142
+ # prob: probability
143
+ # logo: log odds ratio
144
+ # opts: options
145
+ # fh: file handle
146
+ # ff: flat file
147
+ # ali: alignment
148
+ # mat: matrix
149
+ # arr: array
150
+
151
+
152
+ # Part 1.
153
+ #
154
+ # Global variables and their default values
155
+ #
156
+
157
+ $logger = Logger.new(STDOUT)
158
+ $logger.level = Logger::WARN
159
+
160
+ # default set of 21 amino acids including J (Cysteine, the free thiol form)
161
+ $amino_acids = 'ACDEFGHIKLMNPQRSTVWYJ'.split('')
162
+ $tem_list = nil
163
+ $tem_file = nil
164
+ $classdef = 'classdef.dat'
165
+ $outfile = 'allmat.dat'
166
+ $outfh = nil # file hanfle for outfile
167
+ $output = 2 # default: log odds matrix
168
+ $ali_size = 0
169
+ $tot_aa = 0
170
+ $sigma = 5.0
171
+ $autosigma = false
172
+ $weight = 60
173
+ $noweight = false
174
+ $smooth = :partial
175
+ $nosmooth = false
176
+ $noroundoff = false
177
+ $p1smooth = false
178
+ $scale = 3
179
+ $pidmin = nil
180
+ $pidmax = nil
181
+ $scale = 3
182
+ $add = nil
183
+ $cys = 0
184
+ $targetenv = false
185
+ $penv = false
186
+ $heatmap = nil
187
+ $heatmapcol = nil
188
+ $heatmapformat = 'png'
189
+ $heatmapstem = 'heatmaps'
190
+ $heatmapvalues = false
191
+ $rvg_width = 550
192
+ $rvg_height = 650
193
+ $canvas_width = 550
194
+ $canvas_height = 650
195
+ $cell_width = 20
196
+ $cell_height = 20
197
+
198
+ $aa_tot_cnt = Hash.new(0)
199
+ $aa_mut_cnt = Hash.new(0)
200
+ $aa_mutb = {}
201
+ $aa_rel_mutb = {}
202
+ $aa_tot_freq = {}
203
+ $aa_env_cnt = Hash.new(0)
204
+ $smooth_prob = {}
205
+ $tot_cnt_mat = nil
206
+ $tot_prob_mat = nil
207
+ $tot_logo_mat = nil
208
+ $tot_smooth_prob = {}
209
+
210
+ # minimum ratio of amino acid count to sigma value
211
+ $min_cnt_sigma_ratio = 500.0
212
+
213
+ #
214
+ # Part 1 END
215
+ #
216
+
217
+ # Part 2.
218
+ #
219
+ # Parsing options
220
+ #
221
+
222
+ opts = GetoptLong.new(
223
+ [ '--help', '-h', GetoptLong::NO_ARGUMENT ],
224
+ [ '--tem-list', '-l', GetoptLong::REQUIRED_ARGUMENT ],
225
+ [ '--tem-file', '-f', GetoptLong::REQUIRED_ARGUMENT ],
226
+ [ '--classdef', '-c', GetoptLong::REQUIRED_ARGUMENT ],
227
+ [ '--smooth', '-s', GetoptLong::REQUIRED_ARGUMENT ],
228
+ [ '--nosmooth', GetoptLong::NO_ARGUMENT ],
229
+ [ '--p1smooth', GetoptLong::NO_ARGUMENT ],
230
+ [ '--weight', '-w', GetoptLong::REQUIRED_ARGUMENT ],
231
+ [ '--noweight', GetoptLong::NO_ARGUMENT ],
232
+ [ '--noroundoff', GetoptLong::NO_ARGUMENT ],
233
+ [ '--sigma', GetoptLong::REQUIRED_ARGUMENT ],
234
+ [ '--autosigma', GetoptLong::NO_ARGUMENT ],
235
+ [ '--heatmap', GetoptLong::REQUIRED_ARGUMENT ],
236
+ [ '--heatmap-stem', GetoptLong::REQUIRED_ARGUMENT ],
237
+ [ '--heatmap-format', GetoptLong::REQUIRED_ARGUMENT ],
238
+ [ '--heatmap-columns',GetoptLong::REQUIRED_ARGUMENT ],
239
+ [ '--heatmap-values', GetoptLong::NO_ARGUMENT ],
240
+ [ '--output', GetoptLong::REQUIRED_ARGUMENT ],
241
+ [ '--targetenv','-t', GetoptLong::REQUIRED_ARGUMENT ],
242
+ [ '--cys', '-y', GetoptLong::REQUIRED_ARGUMENT ],
243
+ [ '--penv', GetoptLong::NO_ARGUMENT ],
244
+ [ '--outfile', '-o', GetoptLong::REQUIRED_ARGUMENT ],
245
+ [ '--verbose', '-v', GetoptLong::REQUIRED_ARGUMENT ],
246
+ [ '--version', GetoptLong::NO_ARGUMENT ]
247
+ )
248
+
249
+ begin
250
+ opts.each do |opt, arg|
251
+ case opt
252
+ when '--help'
253
+ print_usage
254
+ exit 0
255
+ when '--tem-list'
256
+ $tem_list = arg
257
+ when '--tem-file'
258
+ $tem_file = arg
259
+ when '--classdef'
260
+ $classdef = arg
261
+ when '--output'
262
+ $output = arg.to_i
263
+ when '--outfile'
264
+ $outfile = arg
265
+ when '--cys'
266
+ $cys = arg.to_i
267
+ when '--targetenv'
268
+ $targetenv = (arg.to_i == 1) ? true : false
269
+ when '--weight'
270
+ $weight = arg.to_i
271
+ when '--sigma'
272
+ $sigma = arg.to_f
273
+ when '--autosigma'
274
+ $autosigma = true
275
+ when '--pidmin'
276
+ $pidmin = arg.to_f
277
+ when '--pidmax'
278
+ $pidmax = arg.to_f
279
+ when '--noweight'
280
+ $noweight = true
281
+ when '--noroundoff'
282
+ $noroundoff = true
283
+ when '--smooth'
284
+ $smooth = (arg.to_i == 1) ? :full : :partial
285
+ when '--nosmooth'
286
+ $nosmooth = true
287
+ when '--p1smooth'
288
+ $p1smooth = true
289
+ when '--scale'
290
+ $scale = arg.to_f
291
+ when '--add'
292
+ $add = arg.to_f
293
+ when '--penv'
294
+ warn "--penv option is not supported."
295
+ exit 1
296
+ $penv = true
297
+ when '--heatmap'
298
+ $heatmap = case arg.to_i
299
+ when (0..2) then arg.to_i
300
+ else
301
+ warn "--heatmap #{arg.to_i} is not allowed."
302
+ exit1
303
+ end
304
+ when '--heatmap-columns'
305
+ $heatmapcol = arg.to_i
306
+ when '--heatmap-stem'
307
+ $heatmapstem = arg.to_s
308
+ when '--heatmap-format'
309
+ $heatmapformat = case arg.to_i
310
+ when 0 then 'png'
311
+ when 1 then 'gif'
312
+ when 2 then 'jpg'
313
+ when 3 then 'bmp'
314
+ when 4 then 'pdf'
315
+ else
316
+ warn "--heatmap-format #{arg.to_i} is not supported."
317
+ exit 1
318
+ end
319
+ when '--heatmap-values'
320
+ $heatmapvalues = true
321
+ when '--verbose'
322
+ $logger.level = case arg.to_i
323
+ when 0 then Logger::ERROR
324
+ when 1 then Logger::WARN
325
+ when 2 then Logger::INFO
326
+ when 3 then Logger::DEBUG
327
+ else
328
+ warn "--verbose (-v) #{arg.to_i} is not supported."
329
+ exit 1
330
+ end
331
+ when '--version'
332
+ print_version
333
+ exit 0
334
+ end
335
+ end
336
+ rescue
337
+ # invalid option
338
+ exit 1
339
+ end
340
+
341
+ # when arguments are nonsense, print usage
342
+ if ((ARGV.length != 0) ||
343
+ (!$tem_list && !$tem_file) ||
344
+ ($tem_list && $tem_file))
345
+ print_usage
346
+ exit 1
347
+ end
348
+
349
+ # warn if any input file is missing
350
+ if $tem_list && !File.exist?($tem_list)
351
+ warn "Cannot find template list file, #{$tem_list}"
352
+ exit 1
353
+ end
354
+
355
+ if $tem_file && !File.exist?($tem_file)
356
+ warn "Cannot find template file, #{$tem_file}"
357
+ exit 1
358
+ end
359
+
360
+ if $classdef && !File.exist?($classdef)
361
+ warn "Cannot find environment class definition file, #{$classdef}"
362
+ exit 1
363
+ end
364
+
365
+ #
366
+ # Part 2 END
367
+ #
368
+
369
+
370
+ # Part 3.
371
+ #
372
+ # Reading Environment Class Definition File
373
+ #
374
+
375
+ # check --cys option and modify amino_acids set if necessary
376
+ if $cys == 2
377
+ $amino_acids = 'ACDEFGHIKLMNPQRSTVWY'.split('')
378
+ end
379
+
380
+ # create an EnvironmentFeatureList object for storing all environment
381
+ # features
382
+ $env_features = EnvironmentFeatureArray.new
383
+
384
+ # an array for storing indexes of constrained environment features
385
+ $cst_features = []
386
+
387
+ # add substituted amino acid (aa1) in a substitution to the environment
388
+ # feature list
389
+ $env_features << EnvironmentFeature.new('sequence',
390
+ $amino_acids,
391
+ $amino_acids,
392
+ 'F',
393
+ 'F')
394
+
395
+ # read environment class definiton file and store them into
396
+ # the hash prepared above
397
+ env_index = 1
398
+
399
+ IO.foreach($classdef) do |line|
400
+ line.chomp!
401
+ if line.start_with?('#')
402
+ next
403
+ elsif (env_ftr = line.chomp.split(/;/)).length == 5
404
+ $logger.info "An environment feature, #{line} detected."
405
+ if env_ftr[-1] == 'T'
406
+ # skip silenced environment feature
407
+ $logger.warn "The environment feature, #{line} silent."
408
+ next
409
+ end
410
+ if env_ftr[-2] == 'T'
411
+ $cst_features << env_index
412
+ $logger.warn "The environment feature, #{line} constrained."
413
+ end
414
+ $env_features << EnvironmentFeature.new(env_ftr[0],
415
+ env_ftr[1].split(''),
416
+ env_ftr[2].split(''),
417
+ env_ftr[3],
418
+ env_ftr[4])
419
+ env_index += 1
420
+ else
421
+ $logger.error "\"#{line}\" doesn't seem to be a proper format for" +
422
+ "a environment class definition."
423
+ exit 1
424
+ end
425
+ end
426
+
427
+ # a hash for storing all environment classes
428
+ $env_classes = EnvironmentClassHash.new
429
+
430
+ # generate all possible combinations of environment labels, and store
431
+ # every environment class into the hash prepared above with the label
432
+ # as a key
433
+ $env_features.label_combinations.each_with_index { |e, i|
434
+ $env_classes[e.flatten.join] = Environment.new(i,
435
+ e.flatten.join,
436
+ $amino_acids)
437
+ }
438
+
439
+ #
440
+ # Part 3 END
441
+ #
442
+
443
+
444
+ # Part 4.
445
+ #
446
+ # Reading TEM file or TEMLIST list file and couting substitutions
447
+ #
448
+
449
+ # a global file handle for output
450
+ $outfh = File.open($outfile, 'w')
451
+
452
+ if $tem_file
453
+ $tem_list_io = StringIO.new($tem_file)
454
+ end
455
+
456
+ if $tem_list
457
+ $tem_list_io = File.open($tem_list)
458
+ end
459
+
460
+ $tem_list_io.each_line do |tem_file|
461
+ tem_file.chomp!
462
+
463
+ ali = Bio::Alignment::OriginalAlignment.new
464
+ ff = Bio::FlatFile.auto(tem_file)
465
+
466
+ ff.each_entry do |pir|
467
+ if (pir.definition == 'sequence') || (pir.definition == 'structure')
468
+ ali.add_seq(pir.data.remove_internal_spaces, pir.entry_id)
469
+ end
470
+ end
471
+
472
+ if ali.size < 2
473
+ $logger.warn "Skipped #{tem_file} which has only one unique entry."
474
+ next
475
+ end
476
+
477
+ $ali_size += 1
478
+ env_labels = {}
479
+ disulphide = {}
480
+
481
+ ali.each_pair do |key, seq|
482
+ # check disulphide bond environment first!
483
+ ff.rewind
484
+ ff.each_entry do |pir|
485
+ if ((pir.entry_id == key) &&
486
+ ((pir.definition == "disulphide") ||
487
+ (pir.definition == "disulfide")))
488
+ disulphide[key] = pir.data.remove_internal_spaces.split('')
489
+ end
490
+ end
491
+
492
+ $env_features.each_with_index do |ec, ei|
493
+ env_labels[key] = [] unless env_labels.has_key?(key)
494
+
495
+ ff.rewind
496
+ ff.each_entry do |pir|
497
+ if (pir.entry_id == key) && (pir.definition == ec.name)
498
+ labels = pir.data.remove_internal_spaces.split('').map_with_index do |sym, pos|
499
+ if sym == '-'
500
+ '-'
501
+ elsif sym == 'X' || sym == 'x'
502
+ 'X'
503
+ else
504
+ if ei == 0 # Amino Acid Environment Feature
505
+ (disulphide.has_key?(key) &&
506
+ (disulphide[key][pos] == 'F') &&
507
+ (sym == 'C')) ? 'J' : sym
508
+ else
509
+ ec.labels[ec.symbols.index(sym)]
510
+ end
511
+ end
512
+ end
513
+
514
+ if env_labels[key].empty?
515
+ env_labels[key] = labels
516
+ else
517
+ env_labels[key].each_with_index { |e, i|
518
+ env_labels[key][i] = e + labels[i]
519
+ }
520
+ end
521
+ end
522
+ end
523
+ end
524
+ end
525
+
526
+ if $noweight
527
+ ali.each_pair do |id1, seq1|
528
+ ali.each_pair do |id2, seq2|
529
+ if id1 != id2
530
+ pid = calculate_pid(seq1, seq2)
531
+ s1 = seq1.split('')
532
+ s2 = seq2.split('')
533
+
534
+ # check PID_MIN
535
+ if $pidmin && (pid < $pidmin)
536
+ $logger.info "Skip alignment between #{id1} and #{id2} " +
537
+ "having PID, #{pid}% less than PID_MIN, #{$pidmin}."
538
+ next
539
+ end
540
+
541
+ # check PID_MAX
542
+ if $pidmax && (pid > $pidmax)
543
+ $logger.info "Skip alignment between #{id1} and #{id2} " +
544
+ "having PID, #{pid}% greater than PID_MAX, #{$pidmax}."
545
+ next
546
+ end
547
+
548
+ s1.each_with_index do |aa1, pos|
549
+ aa1.upcase!
550
+ aa2 = s2[pos].upcase
551
+
552
+ if env_labels[id1][pos].include?('X')
553
+ $logger.info "Substitutions from #{id1}-#{pos}-#{aa1} were masked."
554
+ next
555
+ end
556
+
557
+ if env_labels[id2][pos].include?('X')
558
+ $logger.info "Substitutions to #{id2}-#{pos}-#{aa2} were masked."
559
+ next
560
+ end
561
+
562
+ unless $amino_acids.include?(aa1)
563
+ $logger.warn "#{id1}-#{pos}-#{aa1} is not a standard amino acid." unless aa1 == "-"
564
+ next
565
+ end
566
+
567
+ unless $amino_acids.include?(aa2)
568
+ $logger.warn "#{id1}-#{pos}-#{aa2} is not a standard amino acid." unless aa2 == "-"
569
+ next
570
+ end
571
+
572
+ aa1 = (disulphide.has_key?(id1) && (disulphide[id1][pos] == 'F') && (aa1 == 'C')) ? 'J' : aa1
573
+ aa2 = (disulphide.has_key?(id2) && (disulphide[id2][pos] == 'F') && (aa2 == 'C')) ? 'J' : aa2
574
+
575
+ if $cst_features.empty?
576
+ $env_classes[env_labels[id1][pos]].increase_residue_count(aa2)
577
+ elsif (env_labels[id1][pos].split('').values_at(*$cst_features) == env_labels[id2][pos].split('').values_at(*$cst_features))
578
+ $env_classes[env_labels[id1][pos]].increase_residue_count(aa2)
579
+ else
580
+ $logger.debug "Skipped #{id1}-#{pos}-#{aa1} and #{id2}-#{pos}-#{aa2}, they have different symbols for constrained environment features each other."
581
+ next
582
+ end
583
+
584
+ grp_label = env_labels[id1][pos][1..-1]
585
+
586
+ if $aa_env_cnt.has_key? grp_label
587
+ if $aa_env_cnt[grp_label].has_key? aa1
588
+ $aa_env_cnt[grp_label][aa1] += 1
589
+ else
590
+ $aa_env_cnt[grp_label][aa1] = 1
591
+ end
592
+ else
593
+ $aa_env_cnt[grp_label] = Hash.new(0)
594
+ $aa_env_cnt[grp_label][aa1] = 1
595
+ end
596
+
597
+ if $aa_tot_cnt.has_key? aa1
598
+ $aa_tot_cnt[aa1] += 1
599
+ else
600
+ $aa_tot_cnt[aa1] = 1
601
+ end
602
+
603
+ if aa1 != aa2
604
+ if $aa_mut_cnt.has_key? aa1
605
+ $aa_mut_cnt[aa1] += 1
606
+ else
607
+ $aa_mut_cnt[aa1] = 1
608
+ end
609
+ end
610
+ $logger.debug "#{id1}-#{pos}-#{aa1} -> #{id2}-#{pos}-#{aa2} substitution count (1) was added to the environments class, #{env_labels[id1][pos]}."
611
+ end
612
+ end
613
+ end
614
+ end
615
+ else
616
+ # BLOSUM-like weighting
617
+ clusters = []
618
+ ali.each_pair { |i, s| clusters << [i] }
619
+
620
+ # a loop for single linkage clustering
621
+ begin
622
+ continue = false
623
+ 0.upto(clusters.size - 2) do |i|
624
+ indexes = []
625
+ (i + 1).upto(clusters.size - 1) do |j|
626
+ found = false
627
+ clusters[i].each do |c1|
628
+ clusters[j].each do |c2|
629
+ if calculate_pid(ali[c1], ali[c2]) >= $weight
630
+ indexes << j
631
+ found = true
632
+ break
633
+ end
634
+ end
635
+ break if found
636
+ end
637
+ end
638
+
639
+ unless indexes.empty?
640
+ continue = true
641
+ group = clusters[i]
642
+ indexes.each do |k|
643
+ group = group.concat(clusters[k])
644
+ clusters[k] = nil
645
+ end
646
+ clusters[i] = group
647
+ clusters.compact!
648
+ end
649
+ end
650
+ end while(continue)
651
+
652
+ if clusters.size < 2
653
+ $logger.debug "Skipped #{tem_file} which has only one cluster at the #{$weight} PID level."
654
+ next
655
+ end
656
+
657
+ clusters.combination(2).each do |cluster1, cluster2|
658
+ cluster1.each do |id1|
659
+ cluster2.each do |id2|
660
+ seq1 = ali[id1].split('')
661
+ seq2 = ali[id2].split('')
662
+
663
+ seq1.each_with_index do |aa1, pos|
664
+ aa1.upcase!
665
+ aa2 = seq2[pos].upcase rescue next # should fix this in a sane way!
666
+
667
+ if env_labels[id1][pos].include?('X')
668
+ $logger.debug "All substitutions from #{id1}-#{pos}-#{aa1} are masked."
669
+ next
670
+ end
671
+
672
+ if env_labels[id2][pos].include?('X')
673
+ $logger.debug "All substitutions to #{id2}-#{pos}-#{aa2} are masked."
674
+ next
675
+ end
676
+
677
+ unless $amino_acids.include?(aa1)
678
+ $logger.warn "#{id1}-#{pos}-#{aa1} is not standard amino acid." unless aa1 == "-"
679
+ next
680
+ end
681
+
682
+ unless $amino_acids.include?(aa2)
683
+ $logger.warn "#{id2}-#{pos}-#{aa2} is not standard amino acid." unless aa2 == "-"
684
+ next
685
+ end
686
+
687
+ aa1 = (disulphide.has_key?(id1) && (disulphide[id1][pos] == 'F') && (aa1 == 'C')) ? 'J' : aa1
688
+ aa2 = (disulphide.has_key?(id2) && (disulphide[id2][pos] == 'F') && (aa2 == 'C')) ? 'J' : aa2
689
+ cnt1 = 1.0 / cluster1.size
690
+ cnt2 = 1.0 / cluster2.size
691
+ jnt_cnt = cnt1 * cnt2
692
+
693
+ if $cst_features.empty?
694
+ $env_classes[env_labels[id1][pos]].increase_residue_count(aa2, jnt_cnt)
695
+ $env_classes[env_labels[id2][pos]].increase_residue_count(aa1, jnt_cnt)
696
+ elsif (env_labels[id1][pos].split('').values_at(*$cst_features) == env_labels[id2][pos].split('').values_at(*$cst_features))
697
+ $env_classes[env_labels[id1][pos]].increase_residue_count(aa2, jnt_cnt)
698
+ $env_classes[env_labels[id2][pos]].increase_residue_count(aa1, jnt_cnt)
699
+ else
700
+ $logger.debug "#{id1}-#{pos}-#{aa1} and #{id2}-#{pos}-#{aa2} have different symbols for constrained environment features each other."
701
+ next
702
+ end
703
+
704
+ grp_label1 = env_labels[id1][pos][1..-1]
705
+ grp_label2 = env_labels[id2][pos][1..-1]
706
+
707
+ if $aa_env_cnt.has_key? grp_label1
708
+ if $aa_env_cnt[grp_label1].has_key? aa1
709
+ $aa_env_cnt[grp_label1][aa1] += cnt1
710
+ else
711
+ $aa_env_cnt[grp_label1][aa1] = cnt1
712
+ end
713
+ else
714
+ $aa_env_cnt[grp_label1] = Hash.new(0.0)
715
+ $aa_env_cnt[grp_label1][aa1] = cnt1
716
+ end
717
+
718
+ if $aa_env_cnt.has_key? grp_label2
719
+ if $aa_env_cnt[grp_label2].has_key? aa2
720
+ $aa_env_cnt[grp_label2][aa2] += cnt2
721
+ else
722
+ $aa_env_cnt[grp_label2][aa2] = cnt2
723
+ end
724
+ else
725
+ $aa_env_cnt[grp_label2] = Hash.new(0.0)
726
+ $aa_env_cnt[grp_label2][aa2] = cnt2
727
+ end
728
+
729
+ if $aa_tot_cnt.has_key? aa1
730
+ $aa_tot_cnt[aa1] += cnt1
731
+ else
732
+ $aa_tot_cnt[aa1] = cnt1
733
+ end
734
+
735
+ if $aa_tot_cnt.has_key? aa2
736
+ $aa_tot_cnt[aa2] += cnt2
737
+ else
738
+ $aa_tot_cnt[aa2] = cnt2
739
+ end
740
+
741
+ if aa1 != aa2
742
+ if $aa_mut_cnt.has_key? aa1
743
+ $aa_mut_cnt[aa1] += cnt1
744
+ else
745
+ $aa_mut_cnt[aa1] = cnt1
746
+ end
747
+ if $aa_mut_cnt.has_key? aa2
748
+ $aa_mut_cnt[aa2] += cnt2
749
+ else
750
+ $aa_mut_cnt[aa2] = cnt2
751
+ end
752
+ end
753
+
754
+ $logger.debug "#{id1}-#{pos}-#{aa1} -> #{id2}-#{pos}-#{aa2} substitution count (#{"%.2f" % jnt_cnt}) was added to the environments class, #{env_labels[id1][pos]}."
755
+ $logger.debug "#{id2}-#{pos}-#{aa2} -> #{id1}-#{pos}-#{aa1} substitution count (#{"%.2f" % jnt_cnt}) was added to the environments class, #{env_labels[id2][pos]}."
756
+ end
757
+ end
758
+ end
759
+ end
760
+ end
761
+ $logger.info "Analysing #{tem_file} done."
762
+ end
763
+
764
+ # print out default header
765
+ $outfh.puts <<HEADER
766
+ # Environment-specific amino acid substitution matrices
767
+ # Creator: ulla version #{VERSION}
768
+ # Creation Date: #{Time.now.strftime("%d/%m/%Y %H:%M")}
769
+ #
770
+ # Definitions for structural environments:
771
+ # #{$env_features.size - 1} features used
772
+ #
773
+ HEADER
774
+
775
+ $env_features[1..-1].each { |e| $outfh.puts "# #{e}" }
776
+
777
+ $outfh.puts <<HEADER
778
+ # (read in from #{$classdef})
779
+ #
780
+ # Number of alignments: #{$ali_size}
781
+ # (list of .tem files read in from #{$tem_list})
782
+ #
783
+ # Total number of environments: #{Integer($env_classes.size / $amino_acids.size)}
784
+ #
785
+ # There are #{$amino_acids.size} amino acids considered.
786
+ # #{$amino_acids.join}
787
+ #
788
+ HEADER
789
+
790
+ if $amino_acids.include? 'J'
791
+ $outfh.puts <<HEADER
792
+ # C: Cystine (the disulfide-bonded form)
793
+ # J: Cysteine (the free thiol form)
794
+ #
795
+ HEADER
796
+ end
797
+
798
+ if $noweight
799
+ $outfh.puts '# Weighting scheme: none'
800
+ else
801
+ $outfh.puts "# Weighting scheme: clustering at PID #{$weight} level"
802
+ end
803
+
804
+ # calculate amino acid frequencies and mutabilities, and
805
+ # print them as default statistics in the header part
806
+ ala_factor = if $aa_tot_cnt['A'] == 0
807
+ 0.0
808
+ elsif $aa_mut_cnt['A'] == 0
809
+ 0.0
810
+ else
811
+ 100.0 * $aa_tot_cnt['A'] / $aa_mut_cnt['A'].to_f
812
+ end
813
+ $tot_aa = $aa_tot_cnt.values.sum
814
+
815
+ $outfh.puts '#'
816
+ $outfh.puts "# Total amino acid frequencies:\n"
817
+ $outfh.puts "# %-3s %9s %9s %5s %8s %8s" % %w[RES TOT_OBS MUT_OBS MUTB REL_MUTB REL_FREQ]
818
+
819
+ min_cnt = -1
820
+ min_sigma = nil
821
+
822
+ $amino_acids.each do |res|
823
+ if ($aa_tot_cnt[res] / $sigma) < $min_cnt_sigma_ratio
824
+ if min_cnt < 0
825
+ min_cnt = $aa_tot_cnt[res]
826
+ min_sigma = min_cnt / $min_cnt_sigma_ratio
827
+ elsif (min_cnt > 0) && (min_cnt > $aa_tot_cnt[res])
828
+ min_cnt = $aa_tot_cnt[res]
829
+ min_sigma = min_cnt / $min_cnt_sigma_ratio
830
+ end
831
+
832
+ $logger.warn "The current sigma value, #{$sigma} seems to be too big for the total count (#{"%.2f" % $aa_tot_cnt[res]}) of amino acid, #{res}."
833
+ end
834
+
835
+ $aa_mutb[res] = ($aa_tot_cnt[res] == 0) ? 1.0 : ($aa_mut_cnt[res] / $aa_tot_cnt[res].to_f)
836
+ $aa_rel_mutb[res] = $aa_mutb[res] * ala_factor
837
+ $aa_tot_freq[res] = ($aa_tot_cnt[res] == 0) ? 0.0 : ($aa_tot_cnt[res] / $tot_aa.to_f)
838
+ end
839
+
840
+ $amino_acids.each do |res|
841
+ if $noweight
842
+ $outfh.puts '# %-3s %9d %9d %5.2f %8d %8.4f' %
843
+ [res, $aa_tot_cnt[res], $aa_mut_cnt[res], $aa_mutb[res], $aa_rel_mutb[res], $aa_tot_freq[res]]
844
+ else
845
+ $outfh.puts '# %-3s %9.2f %9.2f %5.2f %8d %8.4f' %
846
+ [res, $aa_tot_cnt[res], $aa_mut_cnt[res], $aa_mutb[res], $aa_rel_mutb[res], $aa_tot_freq[res]]
847
+ end
848
+ end
849
+
850
+ if min_cnt > -1
851
+ $logger.warn "We recommend you to use a sigma value equal to or smaller than #{min_sigma}."
852
+ if $autosigma
853
+ $logger.warn "The sigma value has been changed from #{$sigma} to #{min_sigma}."
854
+ $sigma = min_sigma
855
+ end
856
+ end
857
+
858
+ $outfh.puts '#'
859
+ $outfh.puts '# RES: Amino acid one letter code'
860
+ $outfh.puts '# TOT_OBS: Total count of incidence'
861
+ $outfh.puts '# MUT_OBS: Total count of mutation'
862
+ $outfh.puts '# MUTB: Mutability (MUT_OBS / TOT_OBS)'
863
+ $outfh.puts '# REL_MUTB: Relative mutability (ALA = 100)'
864
+ $outfh.puts '# REL_FREQ: Relative frequency'
865
+ $outfh.puts '#'
866
+
867
+ #
868
+ # Part 4. END
869
+ #
870
+
871
+
872
+ # Part 5.
873
+ #
874
+ # Generating substitution frequency matrices
875
+ #
876
+
877
+ # calculating probabilities for each environment
878
+ $env_classes.values.each do |e|
879
+ if e.freq_array.sum != 0
880
+ e.prob_array = 100.0 * e.freq_array / e.freq_array.sum
881
+ end
882
+ end
883
+
884
+ # count raw frequencies
885
+ $tot_cnt_mat = NMatrix.send($noweight ? 'int' : 'float', $amino_acids.size, $amino_acids.size)
886
+ group_matrices = []
887
+
888
+ # for each combination of environment features
889
+ $env_classes.groups_sorted_by_residue_labels.each_with_index do |group, group_no|
890
+ grp_cnt_mat = NMatrix.send($noweight ? 'int' : 'float', $amino_acids.size, $amino_acids.size)
891
+
892
+ $amino_acids.each_with_index do |aa, aj|
893
+ freq_array = group[1].find { |e| e.label.start_with?(aa) }.freq_array
894
+ 0.upto($amino_acids.size - 1) { |i| grp_cnt_mat[aj, i] = freq_array[i] }
895
+ end
896
+
897
+ $tot_cnt_mat += grp_cnt_mat
898
+ group_matrices << [group[0], grp_cnt_mat]
899
+ end
900
+
901
+ $logger.info "Counting substitutions done."
902
+
903
+ if $output == 0
904
+ heatmaps = HeatmapArray.new if $heatmap == 1 or $heatmap == 2
905
+ grp_max_val = group_matrices.map { |l, m, n| m }.map { |m| m.max }.max
906
+ $heatmapcol ||= Math::sqrt(group_matrices.size).round
907
+
908
+ group_matrices.each_with_index do |(grp_label, grp_cnt_mat), grp_no|
909
+ # for a matrix file
910
+ stem = "#{grp_no}. #{grp_label}"
911
+ $outfh.puts ">#{grp_label} #{grp_no}"
912
+ $outfh.puts grp_cnt_mat.pretty_string(:col_header => $amino_acids,
913
+ :row_header => $amino_acids)
914
+
915
+ # for a heat map
916
+ if $heatmap == 0 or $heatmap == 2
917
+ grp_cnt_mat.heatmap(:col_header => $amino_acids,
918
+ :row_header => $amino_acids,
919
+ :rvg_width => $rvg_width,
920
+ :rvg_height => $rvg_height,
921
+ :canvas_width => $canvas_width,
922
+ :canvas_height => $canvas_height,
923
+ :max_val => grp_max_val.ceil,
924
+ :min_val => 0,
925
+ :print_value => $heatmapvalues,
926
+ :title => stem).write("#{stem}.#{$heatmapformat}")
927
+
928
+ $logger.info "Generating a heat map for #{stem} table done."
929
+ end
930
+
931
+ if $heatmap == 1 or $heatmap == 2
932
+ title_font_size = $rvg_width * $heatmapcol / 80.0
933
+ heatmaps << grp_cnt_mat.heatmap(:col_header => $amino_acids,
934
+ :row_header => $amino_acids,
935
+ :rvg_width => $rvg_width,
936
+ :rvg_height => $rvg_height - 50,
937
+ :canvas_width => $canvas_width,
938
+ :canvas_height => $canvas_height - 50,
939
+ :max_val => grp_max_val.ceil,
940
+ :min_val => 0,
941
+ :print_value => $heatmapvalues,
942
+ :print_gradient => false,
943
+ :title => stem,
944
+ :title_font_size => $rvg_width * $heatmapcol / 100.0)
945
+ end
946
+ end
947
+
948
+ if $heatmap == 1 or $heatmap == 2
949
+ file = "#{$heatmapstem}.#{$heatmapformat}"
950
+ heatmaps.heatmap(:columns => $heatmapcol,
951
+ :rvg_width => $rvg_width,
952
+ :max_val => grp_max_val.ceil,
953
+ :min_val => 0).write(file)
954
+
955
+ $logger.info "Generating heat maps in a file, #{file} done."
956
+ end
957
+
958
+ # total
959
+ $outfh.puts '>Total'
960
+ $outfh.puts $tot_cnt_mat.pretty_string(:col_header => $amino_acids,
961
+ :row_header => $amino_acids)
962
+
963
+ if $heatmap == 0 or $heatmap == 2
964
+ stem = "#{group_matrices.size}. TOTAL"
965
+ heatmap = $tot_cnt_mat.heatmap(:col_header => $amino_acids,
966
+ :row_header => $amino_acids,
967
+ :rvg_width => $rvg_width,
968
+ :rvg_height => $rvg_height,
969
+ :canvas_width => $canvas_width,
970
+ :canvas_height => $canvas_height,
971
+ :max_val => $tot_cnt_mat.max.ceil,
972
+ :min_val => 0,
973
+ :print_value => $heatmapvalues,
974
+ :title => stem).write("#{stem}.#{$heatmapformat}")
975
+
976
+ $logger.info "Generating a heat map for #{stem} table done."
977
+ end
978
+ exit 0
979
+ end
980
+
981
+ #
982
+ # Part 5. END
983
+ #
984
+
985
+
986
+ # Part 6.
987
+ #
988
+ # Calculating substitution probability tables
989
+ #
990
+
991
+ if $output == 1
992
+ $outfh.puts <<HEADER
993
+ #
994
+ # Each column (j) represents the probability distribution for the
995
+ # likelihood of acceptance of a mutational event by a residue type j in
996
+ # a particular structural environment (specified after >) leading to
997
+ # any other residue type (i) and sums up to 100.
998
+ #
999
+ HEADER
1000
+ end
1001
+
1002
+ # when nosmoothing !!!
1003
+ if ($output > 0) && $nosmooth
1004
+ # reinitialize $tot_cnt_mat for pseudocounts
1005
+ $tot_cnt_mat = NMatrix.float($amino_acids.size, $amino_acids.size)
1006
+
1007
+ # for each combination of environment features
1008
+ pseudo_cnt = $add || (1.0 / $env_classes.group_size)
1009
+
1010
+ # add pseudo counts for each frequency vector
1011
+ $env_classes.values.each { |e| e.freq_array += pseudo_cnt }
1012
+
1013
+ # re-calculate probability vector for each environment class
1014
+ $env_classes.values.each { |e| e.prob_array = 100.0 * e.freq_array / e.freq_array.sum }
1015
+
1016
+ group_matrices = []
1017
+
1018
+ $env_classes.groups_sorted_by_residue_labels.each_with_index do |group, group_no|
1019
+ grp_cnt_mat = NMatrix.float($amino_acids.size, $amino_acids.size)
1020
+ grp_prob_mat = NMatrix.float($amino_acids.size, $amino_acids.size)
1021
+
1022
+ $amino_acids.each_with_index do |aa, aj|
1023
+ env_class = group[1].find { |e| e.label.start_with?(aa) }
1024
+ 0.upto($amino_acids.size - 1) { |i| grp_cnt_mat[aj, i] = env_class.freq_array[i] }
1025
+ 0.upto($amino_acids.size - 1) { |i| grp_prob_mat[aj, i] = env_class.prob_array[i] }
1026
+ end
1027
+
1028
+ $tot_cnt_mat += grp_cnt_mat
1029
+ group_matrices << [group[0], grp_prob_mat]
1030
+ end
1031
+
1032
+ if $output == 1
1033
+ heatmaps = HeatmapArray.new if $heatmap == 1 or $heatmap == 2
1034
+ grp_max_val = group_matrices.map { |l, m, n| m }.map { |m| m.max }.max || 100
1035
+ $heatmapcol ||= Math::sqrt(group_matrices.size).round
1036
+
1037
+ group_matrices.each_with_index do |(grp_label, grp_prob_mat), grp_no|
1038
+ # for a matrix file
1039
+ stem = "#{grp_no}. #{grp_label}"
1040
+ $outfh.puts ">#{grp_label} #{grp_no}"
1041
+ $outfh.puts grp_prob_mat.pretty_string(:col_header => $amino_acids,
1042
+ :row_header => $amino_acids)
1043
+
1044
+
1045
+ # for a heat map
1046
+ if $heatmap == 0 or $heatmap == 2
1047
+ grp_prob_mat.heatmap(:col_header => $amino_acids,
1048
+ :row_header => $amino_acids,
1049
+ :rvg_width => $rvg_width,
1050
+ :rvg_height => $rvg_height,
1051
+ :canvas_width => $canvas_width,
1052
+ :canvas_height => $canvas_height,
1053
+ :max_val => grp_max_val.ceil,
1054
+ :min_val => 0,
1055
+ :print_value => $heatmapvalues,
1056
+ :title => stem).write("#{stem}.#{$heatmapformat}")
1057
+
1058
+ $logger.info "Generating a heat map for #{stem} table done."
1059
+ end
1060
+
1061
+ if $heatmap == 1 or $heatmap == 2
1062
+ title_font_size = $rvg_width * $heatmapcol / 80.0
1063
+ heatmaps << grp_prob_mat.heatmap(:col_header => $amino_acids,
1064
+ :row_header => $amino_acids,
1065
+ :rvg_width => $rvg_width,
1066
+ :rvg_height => $rvg_height - 50,
1067
+ :canvas_width => $canvas_width,
1068
+ :canvas_height => $canvas_height - 50,
1069
+ :max_val => grp_max_val.ceil,
1070
+ :min_val => 0,
1071
+ :print_value => $heatmapvalues,
1072
+ :print_gradient => false,
1073
+ :title => stem,
1074
+ :title_font_size => title_font_size)
1075
+ end
1076
+ end
1077
+
1078
+ # for heat maps in a single file
1079
+ if $heatmap == 1 or $heatmap == 2
1080
+ file = "#{$heatmapstem}.#{$heatmapformat}"
1081
+ heatmaps.heatmap(:columns => $heatmapcol,
1082
+ :rvg_width => $rvg_width,
1083
+ :max_val => grp_max_val.ceil,
1084
+ :min_val => 0).write(file)
1085
+
1086
+ $logger.info "Generating heat maps in a file, #{file} done."
1087
+ end
1088
+ end
1089
+
1090
+ $tot_prob_mat = NMatrix.float($amino_acids.size, $amino_acids.size)
1091
+
1092
+ 0.upto($amino_acids.size - 1) do |aj|
1093
+ col_sum = (0..$amino_acids.size - 1).inject(0) { |s, i| s + $tot_cnt_mat[aj, i] }
1094
+ 0.upto($amino_acids.size - 1) { |i| $tot_prob_mat[aj, i] = 100.0 * $tot_cnt_mat[aj, i] / col_sum }
1095
+ end
1096
+
1097
+ if $output == 1
1098
+ $outfh.puts '>Total'
1099
+ $outfh.puts $tot_prob_mat.pretty_string(:col_header => $amino_acids,
1100
+ :row_header => $amino_acids)
1101
+ $outfh.close
1102
+
1103
+ # for a heat map
1104
+ if $heatmap == 0 or $heatmap == 2
1105
+ stem = "#{group_matrices.size}. TOTAL"
1106
+ $tot_prob_mat.heatmap(:col_header => $amino_acids,
1107
+ :row_header => $amino_acids,
1108
+ :rvg_width => $rvg_width,
1109
+ :rvg_height => $rvg_height,
1110
+ :canvas_width => $canvas_width,
1111
+ :canvas_height => $canvas_height,
1112
+ :max_val => $tot_prob_mat.max.ceil,
1113
+ :min_val => 0,
1114
+ :print_value => $heatmapvalues,
1115
+ :title => stem).write("#{stem}.#{$heatmapformat}")
1116
+
1117
+ $logger.info "Generating a heat map for #{stem} table done."
1118
+ end
1119
+ exit 0
1120
+ end
1121
+
1122
+ $logger.info 'Calculating substitution probabilities (no smoothing) done.'
1123
+ end
1124
+
1125
+ # when smoothing!!!
1126
+ if ($output > 0) && !$nosmooth
1127
+ #
1128
+ # p1 probabilities
1129
+ #
1130
+ p1 = NArray.float($amino_acids.size)
1131
+ a0 = NArray.float($amino_acids.size).fill(1.0 / $amino_acids.size)
1132
+ big_N = $tot_aa.to_f
1133
+ small_n = $amino_acids.size.to_f
1134
+ omega1 = 1.0 / (1 + big_N / ($sigma * small_n))
1135
+ omega2 = 1.0 - omega1
1136
+
1137
+ if ($smooth == :full) || $p1smooth
1138
+ # smoothing p1 probabilities for the partial smoothing procedure if --p1smooth on or, if it is full smoothing
1139
+ 0.upto($amino_acids.size - 1) { |i| p1[i] = 100.0 * (omega1 * a0[i] + omega2 * $aa_tot_freq[$amino_acids[i]]) }
1140
+ $smooth_prob[1] = p1
1141
+ elsif ($smooth == :partial)
1142
+ # no smoothing for p1 probabilities just as Kenji's subst
1143
+ # in this case, p1 probabilities were taken from the amino acid frequencies of your data set
1144
+ 0.upto($amino_acids.size - 1) { |i| p1[i] = 100.0 * $aa_tot_freq[$amino_acids[i]] }
1145
+ $smooth_prob[1] = p1
1146
+ end
1147
+
1148
+ #
1149
+ # p2 and above
1150
+ #
1151
+ env_labels = $env_features.map_with_index { |ef, ei| ef.labels.map { |l| "#{ei}#{l}" } }
1152
+
1153
+ if $smooth == :partial
1154
+ $outfh.puts <<HEADER
1155
+ #
1156
+ # Partial Smoothing:
1157
+ #
1158
+ HEADER
1159
+ if $p1smooth
1160
+ $outfh.puts <<HEADER
1161
+ # p1(ri) (i.e., amino acid composition) is estimated by summing over
1162
+ # each row in all matrices and smoothing them with A0 (a uniform distribution)
1163
+ # ^^^^^^^^^
1164
+ HEADER
1165
+ else
1166
+ $outfh.puts <<HEADER
1167
+ # p1(ri) (i.e., amino acid composition) is estimated by summing over
1168
+ # each row in all matrices without smoothing
1169
+ # ^^^^^^^^^^^^^^^^^
1170
+ HEADER
1171
+ end
1172
+
1173
+ $outfh.puts <<HEADER
1174
+ # p2(ri|Rj) is estimated as:
1175
+ # p2(ri|Rj) = omega1 * p1(ri) + omega2 * W2(ri|Rj)
1176
+ #
1177
+ # p3(ri|Rj,fq) is estimated as:
1178
+ # p3(ri|Rj,fq) = omega1 * A2(ri|fq) + omega2 * W3(ri|Rj,fq)
1179
+ # where
1180
+ # A2(ri|fq) = p2(ri|fq) (fixed fq to be Rj; partial smoothing)
1181
+ #
1182
+ # The smoothing procedure is curtailed here and finally
1183
+ # ^^^^^^^^^
1184
+ # p5(ri|Rj,...) is estimated as:
1185
+ # p5(ri|Rj,...) = omega1 * A3(ri|Rj,fq) + omega2 * W5(ri|Rj...)
1186
+ # where
1187
+ # A3(ri|Rj,fq) = sum over fq omega_c * pc3(Rj,fq)
1188
+ #
1189
+ # Weights (omegas) are calculated as in Topham et al. (1993)
1190
+ #
1191
+ # sigma value used is: #{$sigma}
1192
+ #
1193
+ HEADER
1194
+ 1.upto($env_features.size) do |ci|
1195
+ # for partial smoothing, only P1 ~ P3, and Pn are considered
1196
+ if (ci > 2) && (ci < $env_features.size)
1197
+ $logger.debug "Skipped the level #{ci + 1} probabilities, due to partial smoothing."
1198
+ next
1199
+ end
1200
+
1201
+ env_labels.combination(ci) do |c1|
1202
+ c1[0].product(*c1[1..-1]).each do |labels|
1203
+ pattern = '.' * $env_features.size
1204
+
1205
+ labels.each do |label|
1206
+ i = label[0].chr.to_i
1207
+ l = label[1].chr
1208
+ pattern[i] = l
1209
+ end
1210
+
1211
+ if pattern =~ /^\./
1212
+ $logger.debug "Skipped the environment class, #{pattern}, due to partial smoothing."
1213
+ next
1214
+ end
1215
+
1216
+ # get environments matching the pattern created above
1217
+ # and calculate amino acid frequencies and their probabilities for all the environments
1218
+ envs = $env_classes.values.select { |env| env.label.match(pattern.to_re) }
1219
+ freq_arr = envs.inject(NArray.float($amino_acids.size)) { |sum, env| sum + env.freq_array }
1220
+ prob_arr = NArray.float($amino_acids.size)
1221
+ 0.upto($amino_acids.size - 1) { |i| prob_arr[i] = ((freq_arr[i] == 0) ? 0 : (freq_arr[i] / freq_arr.sum.to_f)) }
1222
+
1223
+ # # assess whether a residue type j is compatible with a particular combination of structural features
1224
+ # # corrections for non-zero column vector phenomenon by switching the smoothing procedure off as below
1225
+ # if ci == $env_features.size
1226
+ # aa_label = labels.find { |l| l.match(/^0/) }[1].chr
1227
+ # sub_pattern = '.' * $env_features.size
1228
+ # sub_pattern[0] = aa_label
1229
+ # sub_freq_sum = 0
1230
+ #
1231
+ # labels[1..-1].each do |label|
1232
+ # next if label.start_with?('0')
1233
+ # i = label[0].chr.to_i
1234
+ # l = label[1].chr
1235
+ # sub_pattern[i] = l
1236
+ # sub_envs = $env_classes.values.select { |env| env.label.match(pattern.to_re) }
1237
+ # sub_freq_arr = sub_envs.inject(NArray.float($amino_acids.size)) { |sum, env| sum + env.freq_array }
1238
+ # sub_freq_sum += sub_freq_arr.sum
1239
+ # end
1240
+ #
1241
+ # if sub_freq_sum == 0
1242
+ # if $smooth_prob.has_key?(ci + 1)
1243
+ # $smooth_prob[ci + 1][labels.to_set] = prob_arr
1244
+ # else
1245
+ # $smooth_prob[ci + 1] = {}
1246
+ # $smooth_prob[ci + 1][labels.to_set] = prob_arr
1247
+ # end
1248
+ # $logger.warn "Smoothing procedure is off for the environment feature combination, #{pattern}"
1249
+ # next
1250
+ # end
1251
+ # end
1252
+
1253
+ # collect priors
1254
+ priors = []
1255
+
1256
+ if ci == 1
1257
+ priors << $smooth_prob[1]
1258
+ elsif ci == 2
1259
+ labels.combination(1).select { |c2| c2[0].start_with?('0') }.each { |c3|
1260
+ priors << $smooth_prob[2][c3.to_set]
1261
+ }
1262
+ elsif ci == $env_features.size
1263
+ labels.combination(2).select { |c2| c2[0].start_with?('0') || c2[1].start_with?('0') }.each { |c3|
1264
+ priors << $smooth_prob[3][c3.to_set]
1265
+ }
1266
+ end
1267
+
1268
+ # entropy based prior weighting step
1269
+ entropy_max = Math::log($amino_acids.size)
1270
+ entropies = priors.map { |prior| -1.0 * prior.to_a.inject(0.0) { |s, p|
1271
+ begin
1272
+ p == 0.0 ? s - 1 : s + p * Math::log(p)
1273
+ rescue
1274
+ #puts "P: #{p}"
1275
+ end
1276
+ } }
1277
+ mod_entropies = entropies.map_with_index { |entropy, i| (entropy_max - entropies[i]) / entropy_max }
1278
+ weights = mod_entropies.map { |mod_entropy| mod_entropy / mod_entropies.sum }
1279
+ weighted_priors = priors.map_with_index { |prior, i| prior * weights[i] }.sum
1280
+
1281
+ # smoothing step
1282
+ smooth_prob_arr = NArray.float($amino_acids.size)
1283
+ big_N = freq_arr.sum.to_f
1284
+ small_n = $amino_acids.size.to_f
1285
+ omega1 = 1.0 / (1 + big_N / ($sigma * small_n))
1286
+ omega2 = 1.0 - omega1
1287
+ 0.upto($amino_acids.size - 1) { |i| smooth_prob_arr[i] = 100.0 * (omega1 * weighted_priors[i] + omega2 * prob_arr[i]) }
1288
+
1289
+ # normalization step
1290
+ smooth_prob_arr_sum = smooth_prob_arr.sum
1291
+ 0.upto($amino_acids.size - 1) { |i| smooth_prob_arr[i] = 100.0 * (smooth_prob_arr[i] / smooth_prob_arr_sum) }
1292
+
1293
+ # store smoothed probabilities in a hash using a set of environment labels as a key
1294
+ if $smooth_prob.has_key?(ci + 1)
1295
+ $smooth_prob[ci + 1][labels.to_set] = smooth_prob_arr
1296
+ else
1297
+ $smooth_prob[ci + 1] = {}
1298
+ $smooth_prob[ci + 1][labels.to_set] = smooth_prob_arr
1299
+ end
1300
+ end
1301
+ end
1302
+ end
1303
+ $logger.info 'Calculating substitution probabilities (partial smoothing) done.'
1304
+ else
1305
+ $outfh.puts <<HEADER
1306
+ #
1307
+ # Full Smoothing:
1308
+ #
1309
+ # p1(ri) is estimated as:
1310
+ # p1(ri) = omega1 * A0 + omega2 * W1(ri)
1311
+ #
1312
+ # p2(ri|f1q) is estimated as:
1313
+ # p2(ri|f1q) = omega1 * p1(ri) + omega2 * W2(ri|fq)
1314
+ #
1315
+ # (NOTE: f1q is not fixed to be Rj in the full smoothing procedure)
1316
+ #
1317
+ # p3(ri|f1q,f2q) is estimated as:
1318
+ # p3(ri|f1q,f2q) = omega1 * A2(ri|f1q) + omega2 * W3(ri|f1q,f2q)
1319
+ # where
1320
+ # A2(ri|fq) = p2(ri|fq) (not fixed fq; full smoothing)
1321
+ #
1322
+ # The smoothing procedure is NOT curtailed here and it goes upto
1323
+ # ^^^^^^^^^^^^^
1324
+ #
1325
+ # pn(ri|f1q,f2q,...,fn-1q) is estimated as:
1326
+ # pn(ri|f1q,f2q,...,fn-1q) = omega1 * An-1(ri|f1q, f2q,...,fn-2q) + omega2 * Wn(ri|f1q,f2q,...,fn-1q)
1327
+ # where
1328
+ # An-1(ri|f1q,f2q,...,fn-2q) = sum over fq omega_c * pcn-1(f1q,f2q,...,fn-2q)
1329
+ #
1330
+ # Weights (omegas) are calculated as in Topham et al. (1993)
1331
+ #
1332
+ # sigma value used is: #{$sigma}
1333
+ #
1334
+ HEADER
1335
+ # full smoothing
1336
+ 1.upto($env_features.size) do |ci|
1337
+ env_labels.combination(ci) do |c1|
1338
+ c1[0].product(*c1[1..-1]).each do |labels|
1339
+ pattern = '.' * $env_features.size
1340
+ labels.each do |label|
1341
+ j = label[0].chr.to_i
1342
+ l = label[1].chr
1343
+ pattern[j] = l
1344
+ end
1345
+
1346
+ # get environments, frequencies, and probabilities
1347
+ envs = $env_classes.values.select { |env| env.label.match(pattern.to_re) }
1348
+ freq_arr = envs.inject(NArray.float($amino_acids.size)) { |sum, env| sum + env.freq_array }
1349
+ prob_arr = NArray.float($amino_acids.size)
1350
+ 0.upto($amino_acids.size - 1) { |i| prob_arr[i] = freq_arr[i] == 0 ? 0 : freq_arr[i] / freq_arr.sum.to_f }
1351
+
1352
+ # collect priors
1353
+ priors = []
1354
+ if ci > 1
1355
+ labels.combination(ci - 1).each { |c2| priors << $smooth_prob[ci][c2.to_set] }
1356
+ else
1357
+ priors << $smooth_prob[1]
1358
+ end
1359
+
1360
+ # entropy based weighting priors
1361
+ entropy_max = Math::log($amino_acids.size)
1362
+ entropies = priors.map do |prior|
1363
+ (entropy_max + prior.to_a.inject(0.0) { |s, p| s + p * Math::log(p) }) / entropy_max
1364
+ end
1365
+ weighted_priors = priors.map_with_index { |p, i| p * entropies[i] / entropies.sum }.sum
1366
+
1367
+ # smoothing step
1368
+ smooth_prob_arr = NArray.float($amino_acids.size)
1369
+ big_N = freq_arr.sum.to_f
1370
+ small_n = $amino_acids.size.to_f
1371
+ omega1 = 1.0 / (1 + big_N / ($sigma * small_n))
1372
+ omega2 = 1.0 - omega1
1373
+ 0.upto($amino_acids.size - 1) { |i| smooth_prob_arr[i] = 100.0 * (omega1 * weighted_priors[i] + omega2 * prob_arr[i]) }
1374
+
1375
+ # normalization step
1376
+ smooth_prob_arr_sum = smooth_prob_arr.sum
1377
+ 0.upto($amino_acids.size - 1) { |i| smooth_prob_arr[i] = 100.0 * (smooth_prob_arr[i] / smooth_prob_arr_sum) }
1378
+
1379
+ # store smoothed probabilities in a hash using a set of environment labels as a key
1380
+ if $smooth_prob.has_key?(ci + 1)
1381
+ $smooth_prob[ci + 1][labels.to_set] = smooth_prob_arr
1382
+ else
1383
+ $smooth_prob[ci + 1] = {}
1384
+ $smooth_prob[ci + 1][labels.to_set] = smooth_prob_arr
1385
+ end
1386
+ end
1387
+ end
1388
+ end
1389
+ $logger.info 'Calculating substitution probabilities (full smoothing) done.'
1390
+ end
1391
+
1392
+ # updating smoothed probability array for each environment
1393
+ $env_classes.values.each do |env|
1394
+ env.smooth_prob_array = $smooth_prob[$env_features.size + 1][env.label_set]
1395
+ end
1396
+
1397
+ # sorting environments and build 21X21 substitution matrices
1398
+ group_matrices = []
1399
+
1400
+ $env_classes.groups_sorted_by_residue_labels.each do |group|
1401
+ # calculating 21X21 substitution probability matrix for each environment
1402
+ grp_prob_mat = NMatrix.float($amino_acids.size, $amino_acids.size)
1403
+
1404
+ $amino_acids.each_with_index do |aa, ai|
1405
+ smooth_prob_arr = group[1].find { |e| e.label.start_with?(aa) }.smooth_prob_array
1406
+ 0.upto($amino_acids.size - 1) { |j| grp_prob_mat[ai, j] = smooth_prob_arr[j] }
1407
+ end
1408
+
1409
+ group_matrices << [group[0], grp_prob_mat]
1410
+ end
1411
+
1412
+ if $output == 1
1413
+ heatmaps = HeatmapArray.new if $heatmap == 1 or $heatmap == 2
1414
+ grp_max_val = group_matrices.map { |l, m, n| m }.map { |m| m.max }.max || 100
1415
+ $heatmapcol ||= Math::sqrt(group_matrices.size).round
1416
+
1417
+ group_matrices.each_with_index do |(grp_label, grp_prob_mat), grp_no|
1418
+ # for a matrix file
1419
+ stem = "#{grp_no}. #{grp_label}"
1420
+ $outfh.puts ">#{grp_label} #{grp_no}"
1421
+ $outfh.puts grp_prob_mat.pretty_string(:col_header => $amino_acids,
1422
+ :row_header => $amino_acids)
1423
+
1424
+ # for heat map generation
1425
+ if $heatmap == 0 or $heatmap == 2
1426
+ grp_prob_mat.heatmap(:col_header => $amino_acids,
1427
+ :row_header => $amino_acids,
1428
+ :rvg_width => $rvg_width,
1429
+ :rvg_height => $rvg_height,
1430
+ :canvas_width => $canvas_width,
1431
+ :canvas_height => $canvas_height,
1432
+ :max_val => grp_max_val.ceil,
1433
+ :min_val => 0,
1434
+ :print_value => $heatmapvalues,
1435
+ :title => stem).write("#{stem}.#{$heatmapformat}")
1436
+
1437
+ $logger.info "Generating a heat map for #{stem} table done."
1438
+ end
1439
+
1440
+ if $heatmap == 1 or $heatmap == 2
1441
+ title_font_size = $rvg_width * $heatmapcol / 80.0
1442
+ heatmaps << grp_prob_mat.heatmap(:col_header => $amino_acids,
1443
+ :row_header => $amino_acids,
1444
+ :rvg_width => $rvg_width,
1445
+ :rvg_height => $rvg_height - 50,
1446
+ :canvas_width => $canvas_width,
1447
+ :canvas_height => $canvas_height - 50,
1448
+ :max_val => grp_max_val.ceil,
1449
+ :min_val => 0,
1450
+ :print_value => $heatmapvalues,
1451
+ :print_gradient => false,
1452
+ :title => stem,
1453
+ :title_font_size => title_font_size)
1454
+ end
1455
+ end
1456
+
1457
+ # for heat maps in a single file
1458
+ if $heatmap == 1 or $heatmap == 2
1459
+ file = "#{$heatmapstem}.#{$heatmapformat}"
1460
+ heatmaps.heatmap(:columns => $heatmapcol,
1461
+ :rvg_width => $rvg_width,
1462
+ :max_val => grp_max_val.ceil,
1463
+ :min_val => 0).write(file)
1464
+
1465
+ $logger.info "Generating heat maps in a file, #{file} done."
1466
+ end
1467
+ end
1468
+
1469
+ # for a total substitution probability matrix
1470
+ $tot_prob_mat = NMatrix.float($amino_acids.size, $amino_acids.size)
1471
+
1472
+ $amino_acids.each_with_index do |aa, aj|
1473
+ 0.upto($amino_acids.size - 1) do |ai|
1474
+ $tot_prob_mat[aj, ai] = $smooth_prob[2][["0#{aa}"].to_set][ai]
1475
+ end
1476
+ end
1477
+
1478
+ if $output == 1
1479
+ $outfh.puts '>Total'
1480
+ $outfh.puts $tot_prob_mat.pretty_string(:col_header => $amino_acids,
1481
+ :row_header => $amino_acids)
1482
+ $outfh.close
1483
+
1484
+ # for a heat map
1485
+ if $heatmap == 0 or $heatmap == 2
1486
+ stem = "#{group_matrices.size}. TOTAL"
1487
+ $tot_prob_mat.heatmap(:col_header => $amino_acids,
1488
+ :row_header => $amino_acids,
1489
+ :rvg_width => $rvg_width,
1490
+ :rvg_height => $rvg_height,
1491
+ :canvas_width => $canvas_width,
1492
+ :canvas_height => $canvas_height,
1493
+ :max_val => $tot_prob_mat.max.ceil,
1494
+ :min_val => 0,
1495
+ :print_value => $heatmapvalues,
1496
+ :title => stem).write("#{stem}.#{$heatmapformat}")
1497
+
1498
+ $logger.info "Generating a heat map for #{stem} table done."
1499
+ end
1500
+ exit 0
1501
+ end
1502
+ end
1503
+
1504
+ #
1505
+ # Part 6. END
1506
+ #
1507
+
1508
+
1509
+ # Part 7.
1510
+ #
1511
+ # Calculating log odds ratio scoring matrices
1512
+ #
1513
+ if $output == 2
1514
+ $outfh.puts <<HEADER
1515
+ #
1516
+ # The probabilities were then divided by the background probabilities
1517
+ HEADER
1518
+ if $penv
1519
+ $outfh.puts <<HEADER
1520
+ # which were derived from the environment-dependent amino acid frequencies.
1521
+ # ^^^^^^^^^^^^^^^^^^^^^
1522
+ HEADER
1523
+ else
1524
+ $outfh.puts <<HEADER
1525
+ # which were derived from the environment-independent amino acid frequencies.
1526
+ # ^^^^^^^^^^^^^^^^^^^^^^^
1527
+ HEADER
1528
+ end
1529
+
1530
+ grp_logo_mats = []
1531
+ factor = $scale / Math::log(2)
1532
+
1533
+ $env_classes.groups_sorted_by_residue_labels.each_with_index do |group, group_no|
1534
+ # calculating substitution probability matrix for each environment
1535
+ grp_label = group[0]
1536
+ grp_envs = group[1]
1537
+ grp_logo_mat = $cys == 0 ?
1538
+ NMatrix.float($amino_acids.size, $amino_acids.size + 1) :
1539
+ NMatrix.float($amino_acids.size, $amino_acids.size)
1540
+
1541
+ $amino_acids.each_with_index do |aa, aj|
1542
+ env = grp_envs.detect { |e| e.label.start_with?(aa) }
1543
+ env.logo_array = $cys == 0 ?
1544
+ NArray.float($amino_acids.size + 1) :
1545
+ NArray.float($amino_acids.size)
1546
+
1547
+ env.send($nosmooth ? 'prob_array' : 'smooth_prob_array').to_a.each_with_index do |prob, ai|
1548
+ pai = 100.0 * $aa_tot_freq[$amino_acids[ai]]
1549
+ odds = prob / pai
1550
+ env.logo_array[ai] = factor * Math::log(odds)
1551
+ grp_logo_mat[aj, ai] = env.logo_array[ai]
1552
+ end
1553
+
1554
+ # adding log odds ratio for 'U' (J or C) when --cyc is 0
1555
+ if $cys == 0
1556
+ pai = 100.0 * ($aa_tot_freq['C'] + $aa_tot_freq['J'])
1557
+ prob = env.send($nosmooth ? 'prob_array' : 'smooth_prob_array')[$amino_acids.index('C')] +
1558
+ env.send($nosmooth ? 'prob_array' : 'smooth_prob_array')[$amino_acids.index('J')]
1559
+ odds = prob / pai
1560
+ env.logo_array[$amino_acids.size] = factor * Math::log(odds)
1561
+ grp_logo_mat[aj, $amino_acids.size] = env.logo_array[$amino_acids.size]
1562
+ end
1563
+ end
1564
+
1565
+ grp_logo_mats << [grp_label, grp_logo_mat]
1566
+ end
1567
+
1568
+ $tot_logo_mat = $cys == 0 ?
1569
+ NMatrix.float($amino_acids.size, $amino_acids.size + 1) :
1570
+ NMatrix.float($amino_acids.size, $amino_acids.size)
1571
+
1572
+ $amino_acids.each_with_index do |aa1, aj|
1573
+ $amino_acids.each_with_index do |aa2, ai|
1574
+ prob = $tot_prob_mat[aj, ai]
1575
+ pai = 100.0 * $aa_tot_freq[$amino_acids[ai]]
1576
+ odds = prob / pai
1577
+ $tot_logo_mat[aj, ai] = factor * Math::log(odds)
1578
+ end
1579
+
1580
+ # adding log odds ratio for 'U' (J or C) when --cyc is 0
1581
+ if $cys == 0
1582
+ pai = 100.0 * ($aa_tot_freq['C'] + $aa_tot_freq['J'])
1583
+ prob = $tot_prob_mat[aj, $amino_acids.index('C')] + $tot_prob_mat[aj, $amino_acids.index('J')]
1584
+ odds = prob / pai
1585
+ $tot_logo_mat[aj, $amino_acids.size] = factor * Math::log(odds)
1586
+ end
1587
+ end
1588
+
1589
+
1590
+ # calculating relative entropy for each amino acid pair H and
1591
+ # the expected score E in bit units
1592
+ tot_E = 0.0
1593
+ tot_H = 0.0
1594
+
1595
+ 0.upto($tot_logo_mat.shape[0] - 1) do |j|
1596
+ 0.upto($tot_logo_mat.shape[0] - 1) do |i| # it's deliberately '0' not '1'
1597
+ if j != i
1598
+ tot_E += $tot_logo_mat[j, i] * $aa_tot_freq[$amino_acids[j]] * $aa_tot_freq[$amino_acids[i]] / 2.0
1599
+ tot_H += $tot_logo_mat[j, i] * $tot_prob_mat[j, i] / 2.0 / 10000.0
1600
+ else
1601
+ tot_E += $tot_logo_mat[j, i] * $aa_tot_freq[$amino_acids[i]] * $aa_tot_freq[$amino_acids[i]]
1602
+ tot_H += $tot_logo_mat[j, i] * $tot_prob_mat[j, i] / 10000.0
1603
+ end
1604
+ end
1605
+ end
1606
+
1607
+ $outfh.puts <<HEADER
1608
+ #
1609
+ # Shown here are logarithms of these values multiplied by #{$scale}/log(2)
1610
+ HEADER
1611
+ unless $noroundoff
1612
+ $outfh.puts <<HEADER
1613
+ # rounded to the nearest integer (log-odds scores in 1/#{$scale} bit units).
1614
+ HEADER
1615
+ end
1616
+
1617
+ $outfh.puts <<HEADER
1618
+ # For total (composite) matrix, Entropy = #{"%5.4f" % tot_H} bits, Expected score = #{"%5.4f" % tot_E}
1619
+ #
1620
+ HEADER
1621
+
1622
+ grp_max_val = grp_logo_mats.map { |l, m| m }.map { |m| m.max }.max
1623
+ grp_min_val = grp_logo_mats.map { |l, m| m }.map { |m| m.min }.min
1624
+ abs_max_val = [grp_max_val.abs, grp_min_val.abs].max
1625
+ row_header = $cys ? $amino_acids + %w[U] : $amino_acids
1626
+ heatmaps = HeatmapArray.new if $heatmap == 1 or $heatmap == 2
1627
+ $heatmapcol ||= Math::sqrt(grp_logo_mats.size).round
1628
+
1629
+ grp_logo_mats.each_with_index do |arr, grp_no|
1630
+ grp_label = arr[0]
1631
+ grp_logo_mat = arr[1]
1632
+ stem = "#{grp_no}. #{grp_label}"
1633
+
1634
+ unless $noroundoff
1635
+ grp_logo_mat = grp_logo_mat.round
1636
+ end
1637
+
1638
+ # for a matrix file
1639
+ $outfh.puts ">#{grp_label} #{grp_no}"
1640
+ $outfh.puts grp_logo_mat.pretty_string(:col_header => $amino_acids,
1641
+ :row_header => row_header)
1642
+ # for a heat map
1643
+ if $heatmap == 0 or $heatmap == 2
1644
+ grp_logo_mat.heatmap(:col_header => $amino_acids,
1645
+ :row_header => row_header,
1646
+ :rvg_width => $rvg_width,
1647
+ :rvg_height => $rvg_height,
1648
+ :canvas_width => $canvas_width,
1649
+ :canvas_height => $canvas_height,
1650
+ :gradient_beg_color => '#0000FF',
1651
+ :gradient_mid_color => '#FFFFFF',
1652
+ :gradient_end_color => '#FF0000',
1653
+ :max_val => abs_max_val.ceil,
1654
+ :mid_val => 0,
1655
+ :min_val => -1 * abs_max_val.ceil,
1656
+ :print_value => $heatmapvalues,
1657
+ :title => stem).write("#{stem}.#{$heatmapformat}")
1658
+
1659
+ $logger.info "Generating a heat map for #{stem} table done."
1660
+ end
1661
+
1662
+ if $heatmap == 1 or $heatmap == 2
1663
+ title_font_size = $rvg_width * $heatmapcol / 80.0
1664
+ heatmaps << grp_logo_mat.heatmap(:col_header => $amino_acids,
1665
+ :row_header => row_header,
1666
+ :rvg_width => $rvg_width,
1667
+ :rvg_height => $rvg_height - 50,
1668
+ :canvas_width => $canvas_width,
1669
+ :canvas_height => $canvas_height - 50,
1670
+ :gradient_beg_color => '#0000FF',
1671
+ :gradient_mid_color => '#FFFFFF',
1672
+ :gradient_end_color => '#FF0000',
1673
+ :max_val => abs_max_val.ceil,
1674
+ :mid_val => 0,
1675
+ :min_val => -1 * abs_max_val.ceil,
1676
+ :print_value => $heatmapvalues,
1677
+ :print_gradient => false,
1678
+ :title => stem,
1679
+ :title_font_size => title_font_size)
1680
+ end
1681
+ end
1682
+
1683
+ # for heat maps in a single file
1684
+ if $heatmap == 1 or $heatmap == 2
1685
+ file = "#{$heatmapstem}.#{$heatmapformat}"
1686
+ heatmaps.heatmap(:columns => $heatmapcol,
1687
+ :rvg_width => $rvg_width,
1688
+ :gradient_beg_color => '#0000FF',
1689
+ :gradient_mid_color => '#FFFFFF',
1690
+ :gradient_end_color => '#FF0000',
1691
+ :max_val => abs_max_val.ceil,
1692
+ :mid_val => 0,
1693
+ :min_val => -1 * abs_max_val.ceil).write(file)
1694
+
1695
+ $logger.info "Generating heat maps in a file, #{file} done."
1696
+ end
1697
+
1698
+ # for a matrix file
1699
+ unless $noroundoff
1700
+ $tot_logo_mat = $tot_logo_mat.round
1701
+ end
1702
+
1703
+ $outfh.puts ">Total #{grp_logo_mats.size}"
1704
+ $outfh.puts $tot_logo_mat.pretty_string(:col_header => $amino_acids,
1705
+ :row_header => row_header)
1706
+
1707
+ # for a heat map
1708
+ if $heatmap == 0 or $heatmap == 2
1709
+ stem = "#{group_matrices.size}. TOTAL"
1710
+ tot_abs_max_val = [$tot_logo_mat.max.abs, $tot_logo_mat.min.abs].max
1711
+ $tot_logo_mat.heatmap(:col_header => $amino_acids,
1712
+ :row_header => row_header,
1713
+ :rvg_width => $rvg_width,
1714
+ :rvg_height => $rvg_height,
1715
+ :canvas_width => $canvas_width,
1716
+ :canvas_height => $canvas_height,
1717
+ :gradient_beg_color => '#0000FF',
1718
+ :gradient_mid_color => '#FFFFFF',
1719
+ :gradient_end_color => '#FF0000',
1720
+ :max_val => tot_abs_max_val.ceil,
1721
+ :mid_val => 0,
1722
+ :min_val => -1 * tot_abs_max_val.ceil,
1723
+ :print_value => $heatmapvalues,
1724
+ :title => stem).write("#{stem}.#{$heatmapformat}")
1725
+
1726
+ $logger.info "Generating a heat map for #{stem} table done."
1727
+ end
1728
+
1729
+ $logger.info "Calculating log odds ratios done."
1730
+ end
1731
+
1732
+ #
1733
+ # Part 7. END
1734
+ #
1735
+
1736
+ $outfh.close
1737
+ exit 0
1738
+ end
1739
+ end
1740
+
1741
+ end # class CLI
1742
+ end # module Ulla