ulla 0.9.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (91) hide show
  1. data/.git/COMMIT_EDITMSG +13 -0
  2. data/.git/HEAD +1 -0
  3. data/.git/config +11 -0
  4. data/.git/description +1 -0
  5. data/.git/hooks/applypatch-msg.sample +15 -0
  6. data/.git/hooks/commit-msg.sample +24 -0
  7. data/.git/hooks/post-commit.sample +8 -0
  8. data/.git/hooks/post-receive.sample +15 -0
  9. data/.git/hooks/post-update.sample +8 -0
  10. data/.git/hooks/pre-applypatch.sample +14 -0
  11. data/.git/hooks/pre-commit.sample +18 -0
  12. data/.git/hooks/pre-rebase.sample +169 -0
  13. data/.git/hooks/prepare-commit-msg.sample +36 -0
  14. data/.git/hooks/update.sample +107 -0
  15. data/.git/index +0 -0
  16. data/.git/info/exclude +6 -0
  17. data/.git/logs/HEAD +3 -0
  18. data/.git/logs/refs/heads/master +3 -0
  19. data/.git/logs/refs/remotes/origin/HEAD +1 -0
  20. data/.git/objects/06/9494e479f28b5751fb135b8e55e8fef3d3a02e +0 -0
  21. data/.git/objects/22/0df784191ad94983ca1d943e49fe482c9d1069 +0 -0
  22. data/.git/objects/3b/b6f2b7f563175a13a0ccd723aab761552f448b +0 -0
  23. data/.git/objects/41/f48aefb4d7a6a87eb423eaae77ae1e8a58dd6c +0 -0
  24. data/.git/objects/44/d1f1782e3ea1d9fd2f9054784b53e8e810a8ca +0 -0
  25. data/.git/objects/4f/364c2eac29f5c7fcbf06419c4f58074cd32ace +2 -0
  26. data/.git/objects/57/1326145a7a4b3e58f3d3008ba343135f213b05 +4 -0
  27. data/.git/objects/6c/4f0844f62b7345f0651b0fb2829a8f157469fb +3 -0
  28. data/.git/objects/73/8dc79450de050f12d48a32602f2ddbe6807029 +0 -0
  29. data/.git/objects/7b/4acb3aee6616d80e295ee21fe8bb7ee93ebe96 +2 -0
  30. data/.git/objects/9e/0a9235b0d70a8029098070007fb414cb52504e +2 -0
  31. data/.git/objects/9e/bfcad2906aac4a23a7c9689a47b76723f5d152 +0 -0
  32. data/.git/objects/a6/578c95f2f474303464b572e9dac716432472b2 +0 -0
  33. data/.git/objects/a8/65ef5700ff04601c6fc40fa5ede3cc25534723 +0 -0
  34. data/.git/objects/aa/285cb176668c5e49c54c6e1d3cc27bd47fd4f4 +0 -0
  35. data/.git/objects/b8/e3828a1082137c4aa4595386bdfb73e3c75b9d +0 -0
  36. data/.git/objects/c2/fb6afc000952b56354fe195682645000d2aea2 +0 -0
  37. data/.git/objects/c4/a0553ca0e3c4628e688ecb5e3304a8a8ac0c28 +2 -0
  38. data/.git/objects/c8/d49f83c4a32cff2d87dd4aa5f83eb7aac3a753 +5 -0
  39. data/.git/objects/ca/c25e8049075ed4bff993705acb4750b2b62ba9 +0 -0
  40. data/.git/objects/d2/ff2e939339eb3fb776e064c258e71dfa1cf396 +0 -0
  41. data/.git/objects/d7/cedf9e2a8ff35b5d7dafdc0f20daed9c65ce44 +0 -0
  42. data/.git/objects/e2/e81af59e3a6c4aa8daac62add62860ae776ba4 +0 -0
  43. data/.git/objects/e5/7c47d183ce5dda1a944c7ee1c19c8a0c4bb278 +0 -0
  44. data/.git/objects/eb/f4a4e1e50bb30731597f776e56b0ccb0c9959f +0 -0
  45. data/.git/objects/f6/39d6f6cf883fde4b9052012919c1df3288c7da +0 -0
  46. data/.git/objects/f8/2346f308f49053df108b7c31ac3089e8b4b4ac +0 -0
  47. data/.git/objects/fb/4b193bb1cbe9041d2f00176f6caa6acfb1fc12 +0 -0
  48. data/.git/objects/pack/pack-aebf617a0b8e016433238d2f21f542bc5b21bd15.idx +0 -0
  49. data/.git/objects/pack/pack-aebf617a0b8e016433238d2f21f542bc5b21bd15.pack +0 -0
  50. data/.git/packed-refs +8 -0
  51. data/.git/refs/heads/master +1 -0
  52. data/.git/refs/remotes/origin/HEAD +1 -0
  53. data/.gitignore +8 -0
  54. data/History.txt +42 -0
  55. data/Manifest.txt +90 -0
  56. data/PostInstall.txt +5 -0
  57. data/README.rdoc +259 -0
  58. data/Rakefile +32 -0
  59. data/bin/ulla +10 -0
  60. data/config/website.yml +2 -0
  61. data/config/website.yml.sample +2 -0
  62. data/lib/math_extensions.rb +7 -0
  63. data/lib/narray_extensions.rb +22 -0
  64. data/lib/nmatrix_extensions.rb +245 -0
  65. data/lib/string_extensions.rb +17 -0
  66. data/lib/ulla/cli.rb +1742 -0
  67. data/lib/ulla/environment.rb +34 -0
  68. data/lib/ulla/environment_class_hash.rb +20 -0
  69. data/lib/ulla/environment_feature.rb +26 -0
  70. data/lib/ulla/environment_feature_array.rb +12 -0
  71. data/lib/ulla/heatmap_array.rb +111 -0
  72. data/lib/ulla.rb +6 -0
  73. data/script/console +10 -0
  74. data/script/destroy +14 -0
  75. data/script/generate +14 -0
  76. data/script/txt2html +71 -0
  77. data/test/test_helper.rb +2 -0
  78. data/test/test_math_extensions.rb +11 -0
  79. data/test/test_narray_extensions.rb +14 -0
  80. data/test/test_nmatrix_extensions.rb +16 -0
  81. data/test/test_string_extensions.rb +11 -0
  82. data/test/test_ulla.rb +11 -0
  83. data/test/ulla/test_cli.rb +9 -0
  84. data/test/ulla/test_environment_class_hash.rb +25 -0
  85. data/test/ulla/test_environment_feature.rb +29 -0
  86. data/website/index.html +16 -0
  87. data/website/index.txt +217 -0
  88. data/website/javascripts/rounded_corners_lite.inc.js +285 -0
  89. data/website/stylesheets/screen.css +158 -0
  90. data/website/template.html.erb +57 -0
  91. metadata +215 -0
data/lib/ulla/cli.rb ADDED
@@ -0,0 +1,1742 @@
1
+ require 'rubygems'
2
+ require 'getoptlong'
3
+ require 'logger'
4
+ require 'narray'
5
+ require 'bio'
6
+ require 'set'
7
+ require 'facets'
8
+
9
+ require 'math_extensions'
10
+ require 'string_extensions'
11
+ require 'narray_extensions'
12
+ require 'nmatrix_extensions'
13
+
14
+ require 'ulla/environment'
15
+ require 'ulla/environment_class_hash'
16
+ require 'ulla/environment_feature'
17
+ require 'ulla/environment_feature_array'
18
+ require 'ulla/heatmap_array'
19
+
20
+ # This is a module for an actual command line interpreter for Ulla
21
+ # ---
22
+ # Copyright (C) 2008-9 Semin Lee
23
+ module Ulla
24
+ class CLI
25
+ class << self
26
+
27
+ # :nodoc:
28
# Writes the value of the VERSION constant, followed by a newline, to
# standard output.
#
# NOTE(review): VERSION is not defined in the visible part of this file;
# presumably it is provided by the enclosing Ulla module -- confirm.
def print_version
  $stdout.puts(VERSION)
end
31
+
32
+ # Print Ulla's Usage on the screen
33
+ #
34
+ # :call-seq:
35
+ # Ulla::CLI::print_usage
36
+ #
37
# Writes the command-line help text (synopsis plus one entry per option)
# to standard output.
#
# The text is kept in step with the defaults assigned in +execute+: the
# default stem for combined heat map files is 'heatmaps'.
def print_usage
  puts <<-USAGE
ulla: a program to calculate environment-specific amino acid substitution tables.

Usage:
    ulla [ options ] -l TEMLIST-file -c CLASSDEF-file
        or
    ulla [ options ] -f TEM-file -c CLASSDEF-file

Options:
    --tem-file (-f) FILE: a tem file
    --tem-list (-l) FILE: a list for tem files
    --classdef (-c) FILE: a file for the definition of environments (default: 'classdef.dat')
    --outfile (-o) FILE: output filename (default 'allmat.dat')
    --weight (-w) INTEGER: clustering level (PID) for the BLOSUM-like weighting (default: 60)
    --noweight: calculate substitution counts with no weights
    --smooth (-s) INTEGER:
        0 for partial smoothing (default)
        1 for full smoothing
    --p1smooth: perform smoothing for p1 probability calculation when partial smoothing
    --nosmooth: perform no smoothing operation
    --cys (-y) INTEGER:
        0 for using C and J only for structure (default)
        1 for both structure and sequence
        2 for using only C for both (must be set when you have no 'disulphide' or 'disulfide' annotation in templates)
    --output INTEGER:
        0 for raw counts (no smoothing performed)
        1 for probabilities
        2 for log-odds (default)
    --noroundoff: do not round off log odds ratio
    --scale INTEGER: log-odds matrices in 1/n bit units (default 3)
    --sigma DOUBLE: change the sigma value for smoothing (default 5.0)
    --autosigma: automatically adjust the sigma value for smoothing
    --add DOUBLE: add this value to raw counts when deriving log-odds without smoothing (default 1/#classes)
    --pidmin DOUBLE: count substitutions only for pairs with PID equal to or greater than this value (default none)
    --pidmax DOUBLE: count substitutions only for pairs with PID smaller than this value (default none)
    --heatmap INTEGER:
        0 create a heat map file for each substitution table
        1 create one big file containing all heat maps from substitution tables
        2 do both 0 and 1
    --heatmap-format INTEGER:
        0 for Portable Network Graphics (PNG) Format (default)
        1 for Graphics Interchange Format (GIF)
        2 for Joint Photographic Experts Group (JPEG) Format
        3 for Microsoft Windows bitmap (BMP) Format
        4 for Portable Document Format (PDF)
    --heatmap-columns INTEGER: number of tables to print in a row when --heatmap 1 or 2 set (default: sqrt(no. of tables))
    --heatmap-stem STRING: stem for a file name when --heatmap 1 or 2 set (default: 'heatmaps')
    --heatmap-values: print values in the cells when generating heat maps
    --verbose (-v) INTEGER
        0 for ERROR level
        1 for WARN or above level (default)
        2 for INFO or above level
        3 for DEBUG or above level
    --version: print version
    --help (-h): show help

  USAGE
end
96
+
97
+ # Calculate PID between two sequences
98
+ #
99
+ # :call-seq:
100
+ # Ulla::CLI::calculate_pid(seq1, seq2) -> Float
101
+ #
102
# Computes the PID of two aligned sequences: the number of identical
# columns as a percentage of all columns in which at least one of the two
# sequences has a residue.
#
# seq1, seq2:: aligned sequences responding to +split+, using '-' as the
#              gap symbol; they are compared column by column,
#              case-sensitively.
#
# Columns are paired with Array#zip, so only the first seq1.length columns
# are examined, and a missing seq2 position is treated as a non-identical,
# non-gap symbol.
#
# Returns a Float between 0.0 and 100.0. Returns 0.0 (rather than NaN) when
# no column contains a residue, e.g. for empty input or gap-only columns.
def calculate_pid(seq1, seq2)
  ident = 0 # no. of columns where both residues are present and identical
  total = 0 # no. of columns that are not gap/gap

  seq1.split('').zip(seq2.split('')).each do |aa1, aa2|
    # columns where both sequences have a gap carry no information
    next if (aa1 == '-') && (aa2 == '-')

    total += 1
    ident += 1 if (aa1 != '-') && (aa1 == aa2)
  end

  # avoid 0.0 / 0, which would yield NaN
  return 0.0 if total.zero?

  100.0 * ident / total
end
124
+
125
+ # :nodoc:
126
+ def execute(arguments=[])
127
+ #
128
+ # * Abbreviations in the codes
129
+ #
130
+ # env: environment
131
+ # tem: (FUGUE) template
132
+ # classdef: (environment) class definition
133
+ # aa: amino acid
134
+ # aa: weighted amino acid
135
+ # tot: total
136
+ # rel: relative
137
+ # jnt: joint
138
+ # cnt: count
139
+ # mut: mutation
140
+ # mutb: mutability
141
+ # freq: frequency
142
+ # prob: probability
143
+ # logo: log odds ratio
144
+ # opts: options
145
+ # fh: file handle
146
+ # ff: flat file
147
+ # ali: alignment
148
+ # mat: matrix
149
+ # arr: array
150
+
151
+
152
+ # Part 1.
153
+ #
154
+ # Global variables and their default values
155
+ #
156
+
157
+ $logger = Logger.new(STDOUT)
158
+ $logger.level = Logger::WARN
159
+
160
+ # default set of 21 amino acids including J (Cysteine, the free thiol form)
161
+ $amino_acids = 'ACDEFGHIKLMNPQRSTVWYJ'.split('')
162
+ $tem_list = nil
163
+ $tem_file = nil
164
+ $classdef = 'classdef.dat'
165
+ $outfile = 'allmat.dat'
166
+ $outfh = nil # file hanfle for outfile
167
+ $output = 2 # default: log odds matrix
168
+ $ali_size = 0
169
+ $tot_aa = 0
170
+ $sigma = 5.0
171
+ $autosigma = false
172
+ $weight = 60
173
+ $noweight = false
174
+ $smooth = :partial
175
+ $nosmooth = false
176
+ $noroundoff = false
177
+ $p1smooth = false
178
+ $scale = 3
179
+ $pidmin = nil
180
+ $pidmax = nil
181
+ $scale = 3
182
+ $add = nil
183
+ $cys = 0
184
+ $targetenv = false
185
+ $penv = false
186
+ $heatmap = nil
187
+ $heatmapcol = nil
188
+ $heatmapformat = 'png'
189
+ $heatmapstem = 'heatmaps'
190
+ $heatmapvalues = false
191
+ $rvg_width = 550
192
+ $rvg_height = 650
193
+ $canvas_width = 550
194
+ $canvas_height = 650
195
+ $cell_width = 20
196
+ $cell_height = 20
197
+
198
+ $aa_tot_cnt = Hash.new(0)
199
+ $aa_mut_cnt = Hash.new(0)
200
+ $aa_mutb = {}
201
+ $aa_rel_mutb = {}
202
+ $aa_tot_freq = {}
203
+ $aa_env_cnt = Hash.new(0)
204
+ $smooth_prob = {}
205
+ $tot_cnt_mat = nil
206
+ $tot_prob_mat = nil
207
+ $tot_logo_mat = nil
208
+ $tot_smooth_prob = {}
209
+
210
+ # minimum ratio of amino acid count to sigma value
211
+ $min_cnt_sigma_ratio = 500.0
212
+
213
+ #
214
+ # Part 1 END
215
+ #
216
+
217
+ # Part 2.
218
+ #
219
+ # Parsing options
220
+ #
221
+
222
+ opts = GetoptLong.new(
223
+ [ '--help', '-h', GetoptLong::NO_ARGUMENT ],
224
+ [ '--tem-list', '-l', GetoptLong::REQUIRED_ARGUMENT ],
225
+ [ '--tem-file', '-f', GetoptLong::REQUIRED_ARGUMENT ],
226
+ [ '--classdef', '-c', GetoptLong::REQUIRED_ARGUMENT ],
227
+ [ '--smooth', '-s', GetoptLong::REQUIRED_ARGUMENT ],
228
+ [ '--nosmooth', GetoptLong::NO_ARGUMENT ],
229
+ [ '--p1smooth', GetoptLong::NO_ARGUMENT ],
230
+ [ '--weight', '-w', GetoptLong::REQUIRED_ARGUMENT ],
231
+ [ '--noweight', GetoptLong::NO_ARGUMENT ],
232
+ [ '--noroundoff', GetoptLong::NO_ARGUMENT ],
233
+ [ '--sigma', GetoptLong::REQUIRED_ARGUMENT ],
234
+ [ '--autosigma', GetoptLong::NO_ARGUMENT ],
235
+ [ '--heatmap', GetoptLong::REQUIRED_ARGUMENT ],
236
+ [ '--heatmap-stem', GetoptLong::REQUIRED_ARGUMENT ],
237
+ [ '--heatmap-format', GetoptLong::REQUIRED_ARGUMENT ],
238
+ [ '--heatmap-columns',GetoptLong::REQUIRED_ARGUMENT ],
239
+ [ '--heatmap-values', GetoptLong::NO_ARGUMENT ],
240
+ [ '--output', GetoptLong::REQUIRED_ARGUMENT ],
241
+ [ '--targetenv','-t', GetoptLong::REQUIRED_ARGUMENT ],
242
+ [ '--cys', '-y', GetoptLong::REQUIRED_ARGUMENT ],
243
+ [ '--penv', GetoptLong::NO_ARGUMENT ],
244
+ [ '--outfile', '-o', GetoptLong::REQUIRED_ARGUMENT ],
245
+ [ '--verbose', '-v', GetoptLong::REQUIRED_ARGUMENT ],
246
+ [ '--version', GetoptLong::NO_ARGUMENT ]
247
+ )
248
+
249
+ begin
250
+ opts.each do |opt, arg|
251
+ case opt
252
+ when '--help'
253
+ print_usage
254
+ exit 0
255
+ when '--tem-list'
256
+ $tem_list = arg
257
+ when '--tem-file'
258
+ $tem_file = arg
259
+ when '--classdef'
260
+ $classdef = arg
261
+ when '--output'
262
+ $output = arg.to_i
263
+ when '--outfile'
264
+ $outfile = arg
265
+ when '--cys'
266
+ $cys = arg.to_i
267
+ when '--targetenv'
268
+ $targetenv = (arg.to_i == 1) ? true : false
269
+ when '--weight'
270
+ $weight = arg.to_i
271
+ when '--sigma'
272
+ $sigma = arg.to_f
273
+ when '--autosigma'
274
+ $autosigma = true
275
+ when '--pidmin'
276
+ $pidmin = arg.to_f
277
+ when '--pidmax'
278
+ $pidmax = arg.to_f
279
+ when '--noweight'
280
+ $noweight = true
281
+ when '--noroundoff'
282
+ $noroundoff = true
283
+ when '--smooth'
284
+ $smooth = (arg.to_i == 1) ? :full : :partial
285
+ when '--nosmooth'
286
+ $nosmooth = true
287
+ when '--p1smooth'
288
+ $p1smooth = true
289
+ when '--scale'
290
+ $scale = arg.to_f
291
+ when '--add'
292
+ $add = arg.to_f
293
+ when '--penv'
294
+ warn "--penv option is not supported."
295
+ exit 1
296
+ $penv = true
297
+ when '--heatmap'
298
+ $heatmap = case arg.to_i
299
+ when (0..2) then arg.to_i
300
+ else
301
+ warn "--heatmap #{arg.to_i} is not allowed."
302
+ exit1
303
+ end
304
+ when '--heatmap-columns'
305
+ $heatmapcol = arg.to_i
306
+ when '--heatmap-stem'
307
+ $heatmapstem = arg.to_s
308
+ when '--heatmap-format'
309
+ $heatmapformat = case arg.to_i
310
+ when 0 then 'png'
311
+ when 1 then 'gif'
312
+ when 2 then 'jpg'
313
+ when 3 then 'bmp'
314
+ when 4 then 'pdf'
315
+ else
316
+ warn "--heatmap-format #{arg.to_i} is not supported."
317
+ exit 1
318
+ end
319
+ when '--heatmap-values'
320
+ $heatmapvalues = true
321
+ when '--verbose'
322
+ $logger.level = case arg.to_i
323
+ when 0 then Logger::ERROR
324
+ when 1 then Logger::WARN
325
+ when 2 then Logger::INFO
326
+ when 3 then Logger::DEBUG
327
+ else
328
+ warn "--verbose (-v) #{arg.to_i} is not supported."
329
+ exit 1
330
+ end
331
+ when '--version'
332
+ print_version
333
+ exit 0
334
+ end
335
+ end
336
+ rescue
337
+ # invalid option
338
+ exit 1
339
+ end
340
+
341
+ # when arguments are nonsense, print usage
342
+ if ((ARGV.length != 0) ||
343
+ (!$tem_list && !$tem_file) ||
344
+ ($tem_list && $tem_file))
345
+ print_usage
346
+ exit 1
347
+ end
348
+
349
+ # warn if any input file is missing
350
+ if $tem_list && !File.exist?($tem_list)
351
+ warn "Cannot find template list file, #{$tem_list}"
352
+ exit 1
353
+ end
354
+
355
+ if $tem_file && !File.exist?($tem_file)
356
+ warn "Cannot find template file, #{$tem_file}"
357
+ exit 1
358
+ end
359
+
360
+ if $classdef && !File.exist?($classdef)
361
+ warn "Cannot find environment class definition file, #{$classdef}"
362
+ exit 1
363
+ end
364
+
365
+ #
366
+ # Part 2 END
367
+ #
368
+
369
+
370
+ # Part 3.
371
+ #
372
+ # Reading Environment Class Definition File
373
+ #
374
+
375
+ # check --cys option and modify amino_acids set if necessary
376
+ if $cys == 2
377
+ $amino_acids = 'ACDEFGHIKLMNPQRSTVWY'.split('')
378
+ end
379
+
380
+ # create an EnvironmentFeatureList object for storing all environment
381
+ # features
382
+ $env_features = EnvironmentFeatureArray.new
383
+
384
+ # an array for storing indexes of constrained environment features
385
+ $cst_features = []
386
+
387
+ # add substituted amino acid (aa1) in a substitution to the environment
388
+ # feature list
389
+ $env_features << EnvironmentFeature.new('sequence',
390
+ $amino_acids,
391
+ $amino_acids,
392
+ 'F',
393
+ 'F')
394
+
395
+ # read environment class definition file and store them into
396
+ # the hash prepared above
397
+ env_index = 1
398
+
399
+ IO.foreach($classdef) do |line|
400
+ line.chomp!
401
+ if line.start_with?('#')
402
+ next
403
+ elsif (env_ftr = line.chomp.split(/;/)).length == 5
404
+ $logger.info "An environment feature, #{line} detected."
405
+ if env_ftr[-1] == 'T'
406
+ # skip silenced environment feature
407
+ $logger.warn "The environment feature, #{line} silent."
408
+ next
409
+ end
410
+ if env_ftr[-2] == 'T'
411
+ $cst_features << env_index
412
+ $logger.warn "The environment feature, #{line} constrained."
413
+ end
414
+ $env_features << EnvironmentFeature.new(env_ftr[0],
415
+ env_ftr[1].split(''),
416
+ env_ftr[2].split(''),
417
+ env_ftr[3],
418
+ env_ftr[4])
419
+ env_index += 1
420
+ else
421
+ $logger.error "\"#{line}\" doesn't seem to be a proper format for" +
422
+ "a environment class definition."
423
+ exit 1
424
+ end
425
+ end
426
+
427
+ # a hash for storing all environment classes
428
+ $env_classes = EnvironmentClassHash.new
429
+
430
+ # generate all possible combinations of environment labels, and store
431
+ # every environment class into the hash prepared above with the label
432
+ # as a key
433
+ $env_features.label_combinations.each_with_index { |e, i|
434
+ $env_classes[e.flatten.join] = Environment.new(i,
435
+ e.flatten.join,
436
+ $amino_acids)
437
+ }
438
+
439
+ #
440
+ # Part 3 END
441
+ #
442
+
443
+
444
+ # Part 4.
445
+ #
446
+ # Reading TEM file or TEMLIST list file and counting substitutions
447
+ #
448
+
449
+ # a global file handle for output
450
+ $outfh = File.open($outfile, 'w')
451
+
452
+ if $tem_file
453
+ $tem_list_io = StringIO.new($tem_file)
454
+ end
455
+
456
+ if $tem_list
457
+ $tem_list_io = File.open($tem_list)
458
+ end
459
+
460
+ $tem_list_io.each_line do |tem_file|
461
+ tem_file.chomp!
462
+
463
+ ali = Bio::Alignment::OriginalAlignment.new
464
+ ff = Bio::FlatFile.auto(tem_file)
465
+
466
+ ff.each_entry do |pir|
467
+ if (pir.definition == 'sequence') || (pir.definition == 'structure')
468
+ ali.add_seq(pir.data.remove_internal_spaces, pir.entry_id)
469
+ end
470
+ end
471
+
472
+ if ali.size < 2
473
+ $logger.warn "Skipped #{tem_file} which has only one unique entry."
474
+ next
475
+ end
476
+
477
+ $ali_size += 1
478
+ env_labels = {}
479
+ disulphide = {}
480
+
481
+ ali.each_pair do |key, seq|
482
+ # check disulphide bond environment first!
483
+ ff.rewind
484
+ ff.each_entry do |pir|
485
+ if ((pir.entry_id == key) &&
486
+ ((pir.definition == "disulphide") ||
487
+ (pir.definition == "disulfide")))
488
+ disulphide[key] = pir.data.remove_internal_spaces.split('')
489
+ end
490
+ end
491
+
492
+ $env_features.each_with_index do |ec, ei|
493
+ env_labels[key] = [] unless env_labels.has_key?(key)
494
+
495
+ ff.rewind
496
+ ff.each_entry do |pir|
497
+ if (pir.entry_id == key) && (pir.definition == ec.name)
498
+ labels = pir.data.remove_internal_spaces.split('').map_with_index do |sym, pos|
499
+ if sym == '-'
500
+ '-'
501
+ elsif sym == 'X' || sym == 'x'
502
+ 'X'
503
+ else
504
+ if ei == 0 # Amino Acid Environment Feature
505
+ (disulphide.has_key?(key) &&
506
+ (disulphide[key][pos] == 'F') &&
507
+ (sym == 'C')) ? 'J' : sym
508
+ else
509
+ ec.labels[ec.symbols.index(sym)]
510
+ end
511
+ end
512
+ end
513
+
514
+ if env_labels[key].empty?
515
+ env_labels[key] = labels
516
+ else
517
+ env_labels[key].each_with_index { |e, i|
518
+ env_labels[key][i] = e + labels[i]
519
+ }
520
+ end
521
+ end
522
+ end
523
+ end
524
+ end
525
+
526
+ if $noweight
527
+ ali.each_pair do |id1, seq1|
528
+ ali.each_pair do |id2, seq2|
529
+ if id1 != id2
530
+ pid = calculate_pid(seq1, seq2)
531
+ s1 = seq1.split('')
532
+ s2 = seq2.split('')
533
+
534
+ # check PID_MIN
535
+ if $pidmin && (pid < $pidmin)
536
+ $logger.info "Skip alignment between #{id1} and #{id2} " +
537
+ "having PID, #{pid}% less than PID_MIN, #{$pidmin}."
538
+ next
539
+ end
540
+
541
+ # check PID_MAX
542
+ if $pidmax && (pid > $pidmax)
543
+ $logger.info "Skip alignment between #{id1} and #{id2} " +
544
+ "having PID, #{pid}% greater than PID_MAX, #{$pidmax}."
545
+ next
546
+ end
547
+
548
+ s1.each_with_index do |aa1, pos|
549
+ aa1.upcase!
550
+ aa2 = s2[pos].upcase
551
+
552
+ if env_labels[id1][pos].include?('X')
553
+ $logger.info "Substitutions from #{id1}-#{pos}-#{aa1} were masked."
554
+ next
555
+ end
556
+
557
+ if env_labels[id2][pos].include?('X')
558
+ $logger.info "Substitutions to #{id2}-#{pos}-#{aa2} were masked."
559
+ next
560
+ end
561
+
562
+ unless $amino_acids.include?(aa1)
563
+ $logger.warn "#{id1}-#{pos}-#{aa1} is not a standard amino acid." unless aa1 == "-"
564
+ next
565
+ end
566
+
567
+ unless $amino_acids.include?(aa2)
568
+ $logger.warn "#{id1}-#{pos}-#{aa2} is not a standard amino acid." unless aa2 == "-"
569
+ next
570
+ end
571
+
572
+ aa1 = (disulphide.has_key?(id1) && (disulphide[id1][pos] == 'F') && (aa1 == 'C')) ? 'J' : aa1
573
+ aa2 = (disulphide.has_key?(id2) && (disulphide[id2][pos] == 'F') && (aa2 == 'C')) ? 'J' : aa2
574
+
575
+ if $cst_features.empty?
576
+ $env_classes[env_labels[id1][pos]].increase_residue_count(aa2)
577
+ elsif (env_labels[id1][pos].split('').values_at(*$cst_features) == env_labels[id2][pos].split('').values_at(*$cst_features))
578
+ $env_classes[env_labels[id1][pos]].increase_residue_count(aa2)
579
+ else
580
+ $logger.debug "Skipped #{id1}-#{pos}-#{aa1} and #{id2}-#{pos}-#{aa2}, they have different symbols for constrained environment features each other."
581
+ next
582
+ end
583
+
584
+ grp_label = env_labels[id1][pos][1..-1]
585
+
586
+ if $aa_env_cnt.has_key? grp_label
587
+ if $aa_env_cnt[grp_label].has_key? aa1
588
+ $aa_env_cnt[grp_label][aa1] += 1
589
+ else
590
+ $aa_env_cnt[grp_label][aa1] = 1
591
+ end
592
+ else
593
+ $aa_env_cnt[grp_label] = Hash.new(0)
594
+ $aa_env_cnt[grp_label][aa1] = 1
595
+ end
596
+
597
+ if $aa_tot_cnt.has_key? aa1
598
+ $aa_tot_cnt[aa1] += 1
599
+ else
600
+ $aa_tot_cnt[aa1] = 1
601
+ end
602
+
603
+ if aa1 != aa2
604
+ if $aa_mut_cnt.has_key? aa1
605
+ $aa_mut_cnt[aa1] += 1
606
+ else
607
+ $aa_mut_cnt[aa1] = 1
608
+ end
609
+ end
610
+ $logger.debug "#{id1}-#{pos}-#{aa1} -> #{id2}-#{pos}-#{aa2} substitution count (1) was added to the environments class, #{env_labels[id1][pos]}."
611
+ end
612
+ end
613
+ end
614
+ end
615
+ else
616
+ # BLOSUM-like weighting
617
+ clusters = []
618
+ ali.each_pair { |i, s| clusters << [i] }
619
+
620
+ # a loop for single linkage clustering
621
+ begin
622
+ continue = false
623
+ 0.upto(clusters.size - 2) do |i|
624
+ indexes = []
625
+ (i + 1).upto(clusters.size - 1) do |j|
626
+ found = false
627
+ clusters[i].each do |c1|
628
+ clusters[j].each do |c2|
629
+ if calculate_pid(ali[c1], ali[c2]) >= $weight
630
+ indexes << j
631
+ found = true
632
+ break
633
+ end
634
+ end
635
+ break if found
636
+ end
637
+ end
638
+
639
+ unless indexes.empty?
640
+ continue = true
641
+ group = clusters[i]
642
+ indexes.each do |k|
643
+ group = group.concat(clusters[k])
644
+ clusters[k] = nil
645
+ end
646
+ clusters[i] = group
647
+ clusters.compact!
648
+ end
649
+ end
650
+ end while(continue)
651
+
652
+ if clusters.size < 2
653
+ $logger.debug "Skipped #{tem_file} which has only one cluster at the #{$weight} PID level."
654
+ next
655
+ end
656
+
657
+ clusters.combination(2).each do |cluster1, cluster2|
658
+ cluster1.each do |id1|
659
+ cluster2.each do |id2|
660
+ seq1 = ali[id1].split('')
661
+ seq2 = ali[id2].split('')
662
+
663
+ seq1.each_with_index do |aa1, pos|
664
+ aa1.upcase!
665
+ aa2 = seq2[pos].upcase rescue next # should fix this in a sane way!
666
+
667
+ if env_labels[id1][pos].include?('X')
668
+ $logger.debug "All substitutions from #{id1}-#{pos}-#{aa1} are masked."
669
+ next
670
+ end
671
+
672
+ if env_labels[id2][pos].include?('X')
673
+ $logger.debug "All substitutions to #{id2}-#{pos}-#{aa2} are masked."
674
+ next
675
+ end
676
+
677
+ unless $amino_acids.include?(aa1)
678
+ $logger.warn "#{id1}-#{pos}-#{aa1} is not standard amino acid." unless aa1 == "-"
679
+ next
680
+ end
681
+
682
+ unless $amino_acids.include?(aa2)
683
+ $logger.warn "#{id2}-#{pos}-#{aa2} is not standard amino acid." unless aa2 == "-"
684
+ next
685
+ end
686
+
687
+ aa1 = (disulphide.has_key?(id1) && (disulphide[id1][pos] == 'F') && (aa1 == 'C')) ? 'J' : aa1
688
+ aa2 = (disulphide.has_key?(id2) && (disulphide[id2][pos] == 'F') && (aa2 == 'C')) ? 'J' : aa2
689
+ cnt1 = 1.0 / cluster1.size
690
+ cnt2 = 1.0 / cluster2.size
691
+ jnt_cnt = cnt1 * cnt2
692
+
693
+ if $cst_features.empty?
694
+ $env_classes[env_labels[id1][pos]].increase_residue_count(aa2, jnt_cnt)
695
+ $env_classes[env_labels[id2][pos]].increase_residue_count(aa1, jnt_cnt)
696
+ elsif (env_labels[id1][pos].split('').values_at(*$cst_features) == env_labels[id2][pos].split('').values_at(*$cst_features))
697
+ $env_classes[env_labels[id1][pos]].increase_residue_count(aa2, jnt_cnt)
698
+ $env_classes[env_labels[id2][pos]].increase_residue_count(aa1, jnt_cnt)
699
+ else
700
+ $logger.debug "#{id1}-#{pos}-#{aa1} and #{id2}-#{pos}-#{aa2} have different symbols for constrained environment features each other."
701
+ next
702
+ end
703
+
704
+ grp_label1 = env_labels[id1][pos][1..-1]
705
+ grp_label2 = env_labels[id2][pos][1..-1]
706
+
707
+ if $aa_env_cnt.has_key? grp_label1
708
+ if $aa_env_cnt[grp_label1].has_key? aa1
709
+ $aa_env_cnt[grp_label1][aa1] += cnt1
710
+ else
711
+ $aa_env_cnt[grp_label1][aa1] = cnt1
712
+ end
713
+ else
714
+ $aa_env_cnt[grp_label1] = Hash.new(0.0)
715
+ $aa_env_cnt[grp_label1][aa1] = cnt1
716
+ end
717
+
718
+ if $aa_env_cnt.has_key? grp_label2
719
+ if $aa_env_cnt[grp_label2].has_key? aa2
720
+ $aa_env_cnt[grp_label2][aa2] += cnt2
721
+ else
722
+ $aa_env_cnt[grp_label2][aa2] = cnt2
723
+ end
724
+ else
725
+ $aa_env_cnt[grp_label2] = Hash.new(0.0)
726
+ $aa_env_cnt[grp_label2][aa2] = cnt2
727
+ end
728
+
729
+ if $aa_tot_cnt.has_key? aa1
730
+ $aa_tot_cnt[aa1] += cnt1
731
+ else
732
+ $aa_tot_cnt[aa1] = cnt1
733
+ end
734
+
735
+ if $aa_tot_cnt.has_key? aa2
736
+ $aa_tot_cnt[aa2] += cnt2
737
+ else
738
+ $aa_tot_cnt[aa2] = cnt2
739
+ end
740
+
741
+ if aa1 != aa2
742
+ if $aa_mut_cnt.has_key? aa1
743
+ $aa_mut_cnt[aa1] += cnt1
744
+ else
745
+ $aa_mut_cnt[aa1] = cnt1
746
+ end
747
+ if $aa_mut_cnt.has_key? aa2
748
+ $aa_mut_cnt[aa2] += cnt2
749
+ else
750
+ $aa_mut_cnt[aa2] = cnt2
751
+ end
752
+ end
753
+
754
+ $logger.debug "#{id1}-#{pos}-#{aa1} -> #{id2}-#{pos}-#{aa2} substitution count (#{"%.2f" % jnt_cnt}) was added to the environments class, #{env_labels[id1][pos]}."
755
+ $logger.debug "#{id2}-#{pos}-#{aa2} -> #{id1}-#{pos}-#{aa1} substitution count (#{"%.2f" % jnt_cnt}) was added to the environments class, #{env_labels[id2][pos]}."
756
+ end
757
+ end
758
+ end
759
+ end
760
+ end
761
+ $logger.info "Analysing #{tem_file} done."
762
+ end
763
+
764
+ # print out default header
765
+ $outfh.puts <<HEADER
766
+ # Environment-specific amino acid substitution matrices
767
+ # Creator: ulla version #{VERSION}
768
+ # Creation Date: #{Time.now.strftime("%d/%m/%Y %H:%M")}
769
+ #
770
+ # Definitions for structural environments:
771
+ # #{$env_features.size - 1} features used
772
+ #
773
+ HEADER
774
+
775
+ $env_features[1..-1].each { |e| $outfh.puts "# #{e}" }
776
+
777
+ $outfh.puts <<HEADER
778
+ # (read in from #{$classdef})
779
+ #
780
+ # Number of alignments: #{$ali_size}
781
+ # (list of .tem files read in from #{$tem_list})
782
+ #
783
+ # Total number of environments: #{Integer($env_classes.size / $amino_acids.size)}
784
+ #
785
+ # There are #{$amino_acids.size} amino acids considered.
786
+ # #{$amino_acids.join}
787
+ #
788
+ HEADER
789
+
790
+ if $amino_acids.include? 'J'
791
+ $outfh.puts <<HEADER
792
+ # C: Cystine (the disulfide-bonded form)
793
+ # J: Cysteine (the free thiol form)
794
+ #
795
+ HEADER
796
+ end
797
+
798
+ if $noweight
799
+ $outfh.puts '# Weighting scheme: none'
800
+ else
801
+ $outfh.puts "# Weighting scheme: clustering at PID #{$weight} level"
802
+ end
803
+
804
+ # calculate amino acid frequencies and mutabilities, and
805
+ # print them as default statistics in the header part
806
+ ala_factor = if $aa_tot_cnt['A'] == 0
807
+ 0.0
808
+ elsif $aa_mut_cnt['A'] == 0
809
+ 0.0
810
+ else
811
+ 100.0 * $aa_tot_cnt['A'] / $aa_mut_cnt['A'].to_f
812
+ end
813
+ $tot_aa = $aa_tot_cnt.values.sum
814
+
815
+ $outfh.puts '#'
816
+ $outfh.puts "# Total amino acid frequencies:\n"
817
+ $outfh.puts "# %-3s %9s %9s %5s %8s %8s" % %w[RES TOT_OBS MUT_OBS MUTB REL_MUTB REL_FREQ]
818
+
819
+ min_cnt = -1
820
+ min_sigma = nil
821
+
822
+ $amino_acids.each do |res|
823
+ if ($aa_tot_cnt[res] / $sigma) < $min_cnt_sigma_ratio
824
+ if min_cnt < 0
825
+ min_cnt = $aa_tot_cnt[res]
826
+ min_sigma = min_cnt / $min_cnt_sigma_ratio
827
+ elsif (min_cnt > 0) && (min_cnt > $aa_tot_cnt[res])
828
+ min_cnt = $aa_tot_cnt[res]
829
+ min_sigma = min_cnt / $min_cnt_sigma_ratio
830
+ end
831
+
832
+ $logger.warn "The current sigma value, #{$sigma} seems to be too big for the total count (#{"%.2f" % $aa_tot_cnt[res]}) of amino acid, #{res}."
833
+ end
834
+
835
+ $aa_mutb[res] = ($aa_tot_cnt[res] == 0) ? 1.0 : ($aa_mut_cnt[res] / $aa_tot_cnt[res].to_f)
836
+ $aa_rel_mutb[res] = $aa_mutb[res] * ala_factor
837
+ $aa_tot_freq[res] = ($aa_tot_cnt[res] == 0) ? 0.0 : ($aa_tot_cnt[res] / $tot_aa.to_f)
838
+ end
839
+
840
+ $amino_acids.each do |res|
841
+ if $noweight
842
+ $outfh.puts '# %-3s %9d %9d %5.2f %8d %8.4f' %
843
+ [res, $aa_tot_cnt[res], $aa_mut_cnt[res], $aa_mutb[res], $aa_rel_mutb[res], $aa_tot_freq[res]]
844
+ else
845
+ $outfh.puts '# %-3s %9.2f %9.2f %5.2f %8d %8.4f' %
846
+ [res, $aa_tot_cnt[res], $aa_mut_cnt[res], $aa_mutb[res], $aa_rel_mutb[res], $aa_tot_freq[res]]
847
+ end
848
+ end
849
+
850
+ if min_cnt > -1
851
+ $logger.warn "We recommend you to use a sigma value equal to or smaller than #{min_sigma}."
852
+ if $autosigma
853
+ $logger.warn "The sigma value has been changed from #{$sigma} to #{min_sigma}."
854
+ $sigma = min_sigma
855
+ end
856
+ end
857
+
858
+ $outfh.puts '#'
859
+ $outfh.puts '# RES: Amino acid one letter code'
860
+ $outfh.puts '# TOT_OBS: Total count of incidence'
861
+ $outfh.puts '# MUT_OBS: Total count of mutation'
862
+ $outfh.puts '# MUTB: Mutability (MUT_OBS / TOT_OBS)'
863
+ $outfh.puts '# REL_MUTB: Relative mutability (ALA = 100)'
864
+ $outfh.puts '# REL_FREQ: Relative frequency'
865
+ $outfh.puts '#'
866
+
867
+ #
868
+ # Part 4. END
869
+ #
870
+
871
+
872
+ # Part 5.
873
+ #
874
+ # Generating substitution frequency matrices
875
+ #
876
+
877
+ # calculating probabilities for each environment
878
+ $env_classes.values.each do |e|
879
+ if e.freq_array.sum != 0
880
+ e.prob_array = 100.0 * e.freq_array / e.freq_array.sum
881
+ end
882
+ end
883
+
884
+ # count raw frequencies
885
+ $tot_cnt_mat = NMatrix.send($noweight ? 'int' : 'float', $amino_acids.size, $amino_acids.size)
886
+ group_matrices = []
887
+
888
+ # for each combination of environment features
889
+ $env_classes.groups_sorted_by_residue_labels.each_with_index do |group, group_no|
890
+ grp_cnt_mat = NMatrix.send($noweight ? 'int' : 'float', $amino_acids.size, $amino_acids.size)
891
+
892
+ $amino_acids.each_with_index do |aa, aj|
893
+ freq_array = group[1].find { |e| e.label.start_with?(aa) }.freq_array
894
+ 0.upto($amino_acids.size - 1) { |i| grp_cnt_mat[aj, i] = freq_array[i] }
895
+ end
896
+
897
+ $tot_cnt_mat += grp_cnt_mat
898
+ group_matrices << [group[0], grp_cnt_mat]
899
+ end
900
+
901
+ $logger.info "Counting substitutions done."
902
+
903
+ if $output == 0
904
+ heatmaps = HeatmapArray.new if $heatmap == 1 or $heatmap == 2
905
+ grp_max_val = group_matrices.map { |l, m, n| m }.map { |m| m.max }.max
906
+ $heatmapcol ||= Math::sqrt(group_matrices.size).round
907
+
908
+ group_matrices.each_with_index do |(grp_label, grp_cnt_mat), grp_no|
909
+ # for a matrix file
910
+ stem = "#{grp_no}. #{grp_label}"
911
+ $outfh.puts ">#{grp_label} #{grp_no}"
912
+ $outfh.puts grp_cnt_mat.pretty_string(:col_header => $amino_acids,
913
+ :row_header => $amino_acids)
914
+
915
+ # for a heat map
916
+ if $heatmap == 0 or $heatmap == 2
917
+ grp_cnt_mat.heatmap(:col_header => $amino_acids,
918
+ :row_header => $amino_acids,
919
+ :rvg_width => $rvg_width,
920
+ :rvg_height => $rvg_height,
921
+ :canvas_width => $canvas_width,
922
+ :canvas_height => $canvas_height,
923
+ :max_val => grp_max_val.ceil,
924
+ :min_val => 0,
925
+ :print_value => $heatmapvalues,
926
+ :title => stem).write("#{stem}.#{$heatmapformat}")
927
+
928
+ $logger.info "Generating a heat map for #{stem} table done."
929
+ end
930
+
931
+ if $heatmap == 1 or $heatmap == 2
932
+ title_font_size = $rvg_width * $heatmapcol / 80.0
933
+ heatmaps << grp_cnt_mat.heatmap(:col_header => $amino_acids,
934
+ :row_header => $amino_acids,
935
+ :rvg_width => $rvg_width,
936
+ :rvg_height => $rvg_height - 50,
937
+ :canvas_width => $canvas_width,
938
+ :canvas_height => $canvas_height - 50,
939
+ :max_val => grp_max_val.ceil,
940
+ :min_val => 0,
941
+ :print_value => $heatmapvalues,
942
+ :print_gradient => false,
943
+ :title => stem,
944
+ :title_font_size => $rvg_width * $heatmapcol / 100.0)
945
+ end
946
+ end
947
+
948
+ if $heatmap == 1 or $heatmap == 2
949
+ file = "#{$heatmapstem}.#{$heatmapformat}"
950
+ heatmaps.heatmap(:columns => $heatmapcol,
951
+ :rvg_width => $rvg_width,
952
+ :max_val => grp_max_val.ceil,
953
+ :min_val => 0).write(file)
954
+
955
+ $logger.info "Generating heat maps in a file, #{file} done."
956
+ end
957
+
958
+ # total
959
+ $outfh.puts '>Total'
960
+ $outfh.puts $tot_cnt_mat.pretty_string(:col_header => $amino_acids,
961
+ :row_header => $amino_acids)
962
+
963
+ if $heatmap == 0 or $heatmap == 2
964
+ stem = "#{group_matrices.size}. TOTAL"
965
+ heatmap = $tot_cnt_mat.heatmap(:col_header => $amino_acids,
966
+ :row_header => $amino_acids,
967
+ :rvg_width => $rvg_width,
968
+ :rvg_height => $rvg_height,
969
+ :canvas_width => $canvas_width,
970
+ :canvas_height => $canvas_height,
971
+ :max_val => $tot_cnt_mat.max.ceil,
972
+ :min_val => 0,
973
+ :print_value => $heatmapvalues,
974
+ :title => stem).write("#{stem}.#{$heatmapformat}")
975
+
976
+ $logger.info "Generating a heat map for #{stem} table done."
977
+ end
978
+ exit 0
979
+ end
980
+
981
+ #
982
+ # Part 5. END
983
+ #
984
+
985
+
986
+ # Part 6.
987
+ #
988
+ # Calculating substitution probability tables
989
+ #
990
+
991
+ if $output == 1
992
+ $outfh.puts <<HEADER
993
+ #
994
+ # Each column (j) represents the probability distribution for the
995
+ # likelihood of acceptance of a mutational event by a residue type j in
996
+ # a particular structural environment (specified after >) leading to
997
+ # any other residue type (i) and sums up to 100.
998
+ #
999
+ HEADER
1000
+ end
1001
+
1002
+ # when nosmoothing !!!
1003
+ if ($output > 0) && $nosmooth
1004
+ # reinitialize $tot_cnt_mat for pseudocounts
1005
+ $tot_cnt_mat = NMatrix.float($amino_acids.size, $amino_acids.size)
1006
+
1007
+ # for each combination of environment features
1008
+ pseudo_cnt = $add || (1.0 / $env_classes.group_size)
1009
+
1010
+ # add pseudo counts for each frequency vector
1011
+ $env_classes.values.each { |e| e.freq_array += pseudo_cnt }
1012
+
1013
+ # re-calculate probability vector for each environment class
1014
+ $env_classes.values.each { |e| e.prob_array = 100.0 * e.freq_array / e.freq_array.sum }
1015
+
1016
+ group_matrices = []
1017
+
1018
+ $env_classes.groups_sorted_by_residue_labels.each_with_index do |group, group_no|
1019
+ grp_cnt_mat = NMatrix.float($amino_acids.size, $amino_acids.size)
1020
+ grp_prob_mat = NMatrix.float($amino_acids.size, $amino_acids.size)
1021
+
1022
+ $amino_acids.each_with_index do |aa, aj|
1023
+ env_class = group[1].find { |e| e.label.start_with?(aa) }
1024
+ 0.upto($amino_acids.size - 1) { |i| grp_cnt_mat[aj, i] = env_class.freq_array[i] }
1025
+ 0.upto($amino_acids.size - 1) { |i| grp_prob_mat[aj, i] = env_class.prob_array[i] }
1026
+ end
1027
+
1028
+ $tot_cnt_mat += grp_cnt_mat
1029
+ group_matrices << [group[0], grp_prob_mat]
1030
+ end
1031
+
1032
+ if $output == 1
1033
+ heatmaps = HeatmapArray.new if $heatmap == 1 or $heatmap == 2
1034
+ grp_max_val = group_matrices.map { |l, m, n| m }.map { |m| m.max }.max || 100
1035
+ $heatmapcol ||= Math::sqrt(group_matrices.size).round
1036
+
1037
+ group_matrices.each_with_index do |(grp_label, grp_prob_mat), grp_no|
1038
+ # for a matrix file
1039
+ stem = "#{grp_no}. #{grp_label}"
1040
+ $outfh.puts ">#{grp_label} #{grp_no}"
1041
+ $outfh.puts grp_prob_mat.pretty_string(:col_header => $amino_acids,
1042
+ :row_header => $amino_acids)
1043
+
1044
+
1045
+ # for a heat map
1046
+ if $heatmap == 0 or $heatmap == 2
1047
+ grp_prob_mat.heatmap(:col_header => $amino_acids,
1048
+ :row_header => $amino_acids,
1049
+ :rvg_width => $rvg_width,
1050
+ :rvg_height => $rvg_height,
1051
+ :canvas_width => $canvas_width,
1052
+ :canvas_height => $canvas_height,
1053
+ :max_val => grp_max_val.ceil,
1054
+ :min_val => 0,
1055
+ :print_value => $heatmapvalues,
1056
+ :title => stem).write("#{stem}.#{$heatmapformat}")
1057
+
1058
+ $logger.info "Generating a heat map for #{stem} table done."
1059
+ end
1060
+
1061
+ if $heatmap == 1 or $heatmap == 2
1062
+ title_font_size = $rvg_width * $heatmapcol / 80.0
1063
+ heatmaps << grp_prob_mat.heatmap(:col_header => $amino_acids,
1064
+ :row_header => $amino_acids,
1065
+ :rvg_width => $rvg_width,
1066
+ :rvg_height => $rvg_height - 50,
1067
+ :canvas_width => $canvas_width,
1068
+ :canvas_height => $canvas_height - 50,
1069
+ :max_val => grp_max_val.ceil,
1070
+ :min_val => 0,
1071
+ :print_value => $heatmapvalues,
1072
+ :print_gradient => false,
1073
+ :title => stem,
1074
+ :title_font_size => title_font_size)
1075
+ end
1076
+ end
1077
+
1078
+ # for heat maps in a single file
1079
+ if $heatmap == 1 or $heatmap == 2
1080
+ file = "#{$heatmapstem}.#{$heatmapformat}"
1081
+ heatmaps.heatmap(:columns => $heatmapcol,
1082
+ :rvg_width => $rvg_width,
1083
+ :max_val => grp_max_val.ceil,
1084
+ :min_val => 0).write(file)
1085
+
1086
+ $logger.info "Generating heat maps in a file, #{file} done."
1087
+ end
1088
+ end
1089
+
1090
+ $tot_prob_mat = NMatrix.float($amino_acids.size, $amino_acids.size)
1091
+
1092
+ 0.upto($amino_acids.size - 1) do |aj|
1093
+ col_sum = (0..$amino_acids.size - 1).inject(0) { |s, i| s + $tot_cnt_mat[aj, i] }
1094
+ 0.upto($amino_acids.size - 1) { |i| $tot_prob_mat[aj, i] = 100.0 * $tot_cnt_mat[aj, i] / col_sum }
1095
+ end
1096
+
1097
+ if $output == 1
1098
+ $outfh.puts '>Total'
1099
+ $outfh.puts $tot_prob_mat.pretty_string(:col_header => $amino_acids,
1100
+ :row_header => $amino_acids)
1101
+ $outfh.close
1102
+
1103
+ # for a heat map
1104
+ if $heatmap == 0 or $heatmap == 2
1105
+ stem = "#{group_matrices.size}. TOTAL"
1106
+ $tot_prob_mat.heatmap(:col_header => $amino_acids,
1107
+ :row_header => $amino_acids,
1108
+ :rvg_width => $rvg_width,
1109
+ :rvg_height => $rvg_height,
1110
+ :canvas_width => $canvas_width,
1111
+ :canvas_height => $canvas_height,
1112
+ :max_val => $tot_prob_mat.max.ceil,
1113
+ :min_val => 0,
1114
+ :print_value => $heatmapvalues,
1115
+ :title => stem).write("#{stem}.#{$heatmapformat}")
1116
+
1117
+ $logger.info "Generating a heat map for #{stem} table done."
1118
+ end
1119
+ exit 0
1120
+ end
1121
+
1122
+ $logger.info 'Calculating substitution probabilities (no smoothing) done.'
1123
+ end
1124
+
1125
+ # when smoothing!!!
1126
+ if ($output > 0) && !$nosmooth
1127
+ #
1128
+ # p1 probabilities
1129
+ #
1130
+ p1 = NArray.float($amino_acids.size)
1131
+ a0 = NArray.float($amino_acids.size).fill(1.0 / $amino_acids.size)
1132
+ big_N = $tot_aa.to_f
1133
+ small_n = $amino_acids.size.to_f
1134
+ omega1 = 1.0 / (1 + big_N / ($sigma * small_n))
1135
+ omega2 = 1.0 - omega1
1136
+
1137
+ if ($smooth == :full) || $p1smooth
1138
+ # smoothing p1 probabilities for the partial smoothing procedure if --p1smooth on or, if it is full smoothing
1139
+ 0.upto($amino_acids.size - 1) { |i| p1[i] = 100.0 * (omega1 * a0[i] + omega2 * $aa_tot_freq[$amino_acids[i]]) }
1140
+ $smooth_prob[1] = p1
1141
+ elsif ($smooth == :partial)
1142
+ # no smoothing for p1 probabilities just as Kenji's subst
1143
+ # in this case, p1 probabilities were taken from the amino acid frequencies of your data set
1144
+ 0.upto($amino_acids.size - 1) { |i| p1[i] = 100.0 * $aa_tot_freq[$amino_acids[i]] }
1145
+ $smooth_prob[1] = p1
1146
+ end
1147
+
1148
+ #
1149
+ # p2 and above
1150
+ #
1151
+ env_labels = $env_features.map_with_index { |ef, ei| ef.labels.map { |l| "#{ei}#{l}" } }
1152
+
1153
+ if $smooth == :partial
1154
+ $outfh.puts <<HEADER
1155
+ #
1156
+ # Partial Smoothing:
1157
+ #
1158
+ HEADER
1159
+ if $p1smooth
1160
+ $outfh.puts <<HEADER
1161
+ # p1(ri) (i.e., amino acid composition) is estimated by summing over
1162
+ # each row in all matrices and smoothing them with A0 (a uniform distribution)
1163
+ # ^^^^^^^^^
1164
+ HEADER
1165
+ else
1166
+ $outfh.puts <<HEADER
1167
+ # p1(ri) (i.e., amino acid composition) is estimated by summing over
1168
+ # each row in all matrices without smoothing
1169
+ # ^^^^^^^^^^^^^^^^^
1170
+ HEADER
1171
+ end
1172
+
1173
+ $outfh.puts <<HEADER
1174
+ # p2(ri|Rj) is estimated as:
1175
+ # p2(ri|Rj) = omega1 * p1(ri) + omega2 * W2(ri|Rj)
1176
+ #
1177
+ # p3(ri|Rj,fq) is estimated as:
1178
+ # p3(ri|Rj,fq) = omega1 * A2(ri|fq) + omega2 * W3(ri|Rj,fq)
1179
+ # where
1180
+ # A2(ri|fq) = p2(ri|fq) (fixed fq to be Rj; partial smoothing)
1181
+ #
1182
+ # The smoothing procedure is curtailed here and finally
1183
+ # ^^^^^^^^^
1184
+ # p5(ri|Rj,...) is estimated as:
1185
+ # p5(ri|Rj,...) = omega1 * A3(ri|Rj,fq) + omega2 * W5(ri|Rj...)
1186
+ # where
1187
+ # A3(ri|Rj,fq) = sum over fq omega_c * pc3(Rj,fq)
1188
+ #
1189
+ # Weights (omegas) are calculated as in Topham et al. (1993)
1190
+ #
1191
+ # sigma value used is: #{$sigma}
1192
+ #
1193
+ HEADER
1194
+ 1.upto($env_features.size) do |ci|
1195
+ # for partial smoothing, only P1 ~ P3, and Pn are considered
1196
+ if (ci > 2) && (ci < $env_features.size)
1197
+ $logger.debug "Skipped the level #{ci + 1} probabilities, due to partial smoothing."
1198
+ next
1199
+ end
1200
+
1201
+ env_labels.combination(ci) do |c1|
1202
+ c1[0].product(*c1[1..-1]).each do |labels|
1203
+ pattern = '.' * $env_features.size
1204
+
1205
+ labels.each do |label|
1206
+ i = label[0].chr.to_i
1207
+ l = label[1].chr
1208
+ pattern[i] = l
1209
+ end
1210
+
1211
+ if pattern =~ /^\./
1212
+ $logger.debug "Skipped the environment class, #{pattern}, due to partial smoothing."
1213
+ next
1214
+ end
1215
+
1216
+ # get environments matching the pattern created above
1217
+ # and calculate amino acid frequencies and their probabilities for all the environments
1218
+ envs = $env_classes.values.select { |env| env.label.match(pattern.to_re) }
1219
+ freq_arr = envs.inject(NArray.float($amino_acids.size)) { |sum, env| sum + env.freq_array }
1220
+ prob_arr = NArray.float($amino_acids.size)
1221
+ 0.upto($amino_acids.size - 1) { |i| prob_arr[i] = ((freq_arr[i] == 0) ? 0 : (freq_arr[i] / freq_arr.sum.to_f)) }
1222
+
1223
+ # # assess whether a residue type j is compatible with a particular combination of structural features
1224
+ # # corrections for non-zero column vector phenomenon by switching the smoothing procedure off as below
1225
+ # if ci == $env_features.size
1226
+ # aa_label = labels.find { |l| l.match(/^0/) }[1].chr
1227
+ # sub_pattern = '.' * $env_features.size
1228
+ # sub_pattern[0] = aa_label
1229
+ # sub_freq_sum = 0
1230
+ #
1231
+ # labels[1..-1].each do |label|
1232
+ # next if label.start_with?('0')
1233
+ # i = label[0].chr.to_i
1234
+ # l = label[1].chr
1235
+ # sub_pattern[i] = l
1236
+ # sub_envs = $env_classes.values.select { |env| env.label.match(pattern.to_re) }
1237
+ # sub_freq_arr = sub_envs.inject(NArray.float($amino_acids.size)) { |sum, env| sum + env.freq_array }
1238
+ # sub_freq_sum += sub_freq_arr.sum
1239
+ # end
1240
+ #
1241
+ # if sub_freq_sum == 0
1242
+ # if $smooth_prob.has_key?(ci + 1)
1243
+ # $smooth_prob[ci + 1][labels.to_set] = prob_arr
1244
+ # else
1245
+ # $smooth_prob[ci + 1] = {}
1246
+ # $smooth_prob[ci + 1][labels.to_set] = prob_arr
1247
+ # end
1248
+ # $logger.warn "Smoothing procedure is off for the environment feature combination, #{pattern}"
1249
+ # next
1250
+ # end
1251
+ # end
1252
+
1253
+ # collect priors
1254
+ priors = []
1255
+
1256
+ if ci == 1
1257
+ priors << $smooth_prob[1]
1258
+ elsif ci == 2
1259
+ labels.combination(1).select { |c2| c2[0].start_with?('0') }.each { |c3|
1260
+ priors << $smooth_prob[2][c3.to_set]
1261
+ }
1262
+ elsif ci == $env_features.size
1263
+ labels.combination(2).select { |c2| c2[0].start_with?('0') || c2[1].start_with?('0') }.each { |c3|
1264
+ priors << $smooth_prob[3][c3.to_set]
1265
+ }
1266
+ end
1267
+
1268
+ # entropy based prior weighting step
1269
+ entropy_max = Math::log($amino_acids.size)
1270
+ entropies = priors.map { |prior| -1.0 * prior.to_a.inject(0.0) { |s, p|
1271
+ begin
1272
+ p == 0.0 ? s - 1 : s + p * Math::log(p)
1273
+ rescue
1274
+ #puts "P: #{p}"
1275
+ end
1276
+ } }
1277
+ mod_entropies = entropies.map_with_index { |entropy, i| (entropy_max - entropies[i]) / entropy_max }
1278
+ weights = mod_entropies.map { |mod_entropy| mod_entropy / mod_entropies.sum }
1279
+ weighted_priors = priors.map_with_index { |prior, i| prior * weights[i] }.sum
1280
+
1281
+ # smoothing step
1282
+ smooth_prob_arr = NArray.float($amino_acids.size)
1283
+ big_N = freq_arr.sum.to_f
1284
+ small_n = $amino_acids.size.to_f
1285
+ omega1 = 1.0 / (1 + big_N / ($sigma * small_n))
1286
+ omega2 = 1.0 - omega1
1287
+ 0.upto($amino_acids.size - 1) { |i| smooth_prob_arr[i] = 100.0 * (omega1 * weighted_priors[i] + omega2 * prob_arr[i]) }
1288
+
1289
+ # normalization step
1290
+ smooth_prob_arr_sum = smooth_prob_arr.sum
1291
+ 0.upto($amino_acids.size - 1) { |i| smooth_prob_arr[i] = 100.0 * (smooth_prob_arr[i] / smooth_prob_arr_sum) }
1292
+
1293
+ # store smoothed probabilities in a hash using a set of environment labels as a key
1294
+ if $smooth_prob.has_key?(ci + 1)
1295
+ $smooth_prob[ci + 1][labels.to_set] = smooth_prob_arr
1296
+ else
1297
+ $smooth_prob[ci + 1] = {}
1298
+ $smooth_prob[ci + 1][labels.to_set] = smooth_prob_arr
1299
+ end
1300
+ end
1301
+ end
1302
+ end
1303
+ $logger.info 'Calculating substitution probabilities (partial smoothing) done.'
1304
+ else
1305
+ $outfh.puts <<HEADER
1306
+ #
1307
+ # Full Smoothing:
1308
+ #
1309
+ # p1(ri) is estimated as:
1310
+ # p1(ri) = omega1 * A0 + omega2 * W1(ri)
1311
+ #
1312
+ # p2(ri|f1q) is estimated as:
1313
+ # p2(ri|f1q) = omega1 * p1(ri) + omega2 * W2(ri|fq)
1314
+ #
1315
+ # (NOTE: f1q is not fixed to be Rj in the full smoothing procedure)
1316
+ #
1317
+ # p3(ri|f1q,f2q) is estimated as:
1318
+ # p3(ri|f1q,f2q) = omega1 * A2(ri|f1q) + omega2 * W3(ri|f1q,f2q)
1319
+ # where
1320
+ # A2(ri|fq) = p2(ri|fq) (not fixed fq; full smoothing)
1321
+ #
1322
+ # The smoothing procedure is NOT curtailed here and it goes upto
1323
+ # ^^^^^^^^^^^^^
1324
+ #
1325
+ # pn(ri|f1q,f2q,...,fn-1q) is estimated as:
1326
+ # pn(ri|f1q,f2q,...,fn-1q) = omega1 * An-1(ri|f1q, f2q,...,fn-2q) + omega2 * Wn(ri|f1q,f2q,...,fn-1q)
1327
+ # where
1328
+ # An-1(ri|f1q,f2q,...,fn-2q) = sum over fq omega_c * pcn-1(f1q,f2q,...,fn-2q)
1329
+ #
1330
+ # Weights (omegas) are calculated as in Topham et al. (1993)
1331
+ #
1332
+ # sigma value used is: #{$sigma}
1333
+ #
1334
+ HEADER
1335
+ # full smoothing
1336
+ 1.upto($env_features.size) do |ci|
1337
+ env_labels.combination(ci) do |c1|
1338
+ c1[0].product(*c1[1..-1]).each do |labels|
1339
+ pattern = '.' * $env_features.size
1340
+ labels.each do |label|
1341
+ j = label[0].chr.to_i
1342
+ l = label[1].chr
1343
+ pattern[j] = l
1344
+ end
1345
+
1346
+ # get environments, frequencies, and probabilities
1347
+ envs = $env_classes.values.select { |env| env.label.match(pattern.to_re) }
1348
+ freq_arr = envs.inject(NArray.float($amino_acids.size)) { |sum, env| sum + env.freq_array }
1349
+ prob_arr = NArray.float($amino_acids.size)
1350
+ 0.upto($amino_acids.size - 1) { |i| prob_arr[i] = freq_arr[i] == 0 ? 0 : freq_arr[i] / freq_arr.sum.to_f }
1351
+
1352
+ # collect priors
1353
+ priors = []
1354
+ if ci > 1
1355
+ labels.combination(ci - 1).each { |c2| priors << $smooth_prob[ci][c2.to_set] }
1356
+ else
1357
+ priors << $smooth_prob[1]
1358
+ end
1359
+
1360
+ # entropy based weighting priors
1361
+ entropy_max = Math::log($amino_acids.size)
1362
+ entropies = priors.map do |prior|
1363
+ (entropy_max + prior.to_a.inject(0.0) { |s, p| s + p * Math::log(p) }) / entropy_max
1364
+ end
1365
+ weighted_priors = priors.map_with_index { |p, i| p * entropies[i] / entropies.sum }.sum
1366
+
1367
+ # smoothing step
1368
+ smooth_prob_arr = NArray.float($amino_acids.size)
1369
+ big_N = freq_arr.sum.to_f
1370
+ small_n = $amino_acids.size.to_f
1371
+ omega1 = 1.0 / (1 + big_N / ($sigma * small_n))
1372
+ omega2 = 1.0 - omega1
1373
+ 0.upto($amino_acids.size - 1) { |i| smooth_prob_arr[i] = 100.0 * (omega1 * weighted_priors[i] + omega2 * prob_arr[i]) }
1374
+
1375
+ # normalization step
1376
+ smooth_prob_arr_sum = smooth_prob_arr.sum
1377
+ 0.upto($amino_acids.size - 1) { |i| smooth_prob_arr[i] = 100.0 * (smooth_prob_arr[i] / smooth_prob_arr_sum) }
1378
+
1379
+ # store smoothed probabilities in a hash using a set of environment labels as a key
1380
+ if $smooth_prob.has_key?(ci + 1)
1381
+ $smooth_prob[ci + 1][labels.to_set] = smooth_prob_arr
1382
+ else
1383
+ $smooth_prob[ci + 1] = {}
1384
+ $smooth_prob[ci + 1][labels.to_set] = smooth_prob_arr
1385
+ end
1386
+ end
1387
+ end
1388
+ end
1389
+ $logger.info 'Calculating substitution probabilities (full smoothing) done.'
1390
+ end
1391
+
1392
+ # updating smoothed probability array for each environment
1393
+ $env_classes.values.each do |env|
1394
+ env.smooth_prob_array = $smooth_prob[$env_features.size + 1][env.label_set]
1395
+ end
1396
+
1397
+ # sorting environments and build 21X21 substitution matrices
1398
+ group_matrices = []
1399
+
1400
+ $env_classes.groups_sorted_by_residue_labels.each do |group|
1401
+ # calculating 21X21 substitution probability matrix for each environment
1402
+ grp_prob_mat = NMatrix.float($amino_acids.size, $amino_acids.size)
1403
+
1404
+ $amino_acids.each_with_index do |aa, ai|
1405
+ smooth_prob_arr = group[1].find { |e| e.label.start_with?(aa) }.smooth_prob_array
1406
+ 0.upto($amino_acids.size - 1) { |j| grp_prob_mat[ai, j] = smooth_prob_arr[j] }
1407
+ end
1408
+
1409
+ group_matrices << [group[0], grp_prob_mat]
1410
+ end
1411
+
1412
+ if $output == 1
1413
+ heatmaps = HeatmapArray.new if $heatmap == 1 or $heatmap == 2
1414
+ grp_max_val = group_matrices.map { |l, m, n| m }.map { |m| m.max }.max || 100
1415
+ $heatmapcol ||= Math::sqrt(group_matrices.size).round
1416
+
1417
+ group_matrices.each_with_index do |(grp_label, grp_prob_mat), grp_no|
1418
+ # for a matrix file
1419
+ stem = "#{grp_no}. #{grp_label}"
1420
+ $outfh.puts ">#{grp_label} #{grp_no}"
1421
+ $outfh.puts grp_prob_mat.pretty_string(:col_header => $amino_acids,
1422
+ :row_header => $amino_acids)
1423
+
1424
+ # for heat map generation
1425
+ if $heatmap == 0 or $heatmap == 2
1426
+ grp_prob_mat.heatmap(:col_header => $amino_acids,
1427
+ :row_header => $amino_acids,
1428
+ :rvg_width => $rvg_width,
1429
+ :rvg_height => $rvg_height,
1430
+ :canvas_width => $canvas_width,
1431
+ :canvas_height => $canvas_height,
1432
+ :max_val => grp_max_val.ceil,
1433
+ :min_val => 0,
1434
+ :print_value => $heatmapvalues,
1435
+ :title => stem).write("#{stem}.#{$heatmapformat}")
1436
+
1437
+ $logger.info "Generating a heat map for #{stem} table done."
1438
+ end
1439
+
1440
+ if $heatmap == 1 or $heatmap == 2
1441
+ title_font_size = $rvg_width * $heatmapcol / 80.0
1442
+ heatmaps << grp_prob_mat.heatmap(:col_header => $amino_acids,
1443
+ :row_header => $amino_acids,
1444
+ :rvg_width => $rvg_width,
1445
+ :rvg_height => $rvg_height - 50,
1446
+ :canvas_width => $canvas_width,
1447
+ :canvas_height => $canvas_height - 50,
1448
+ :max_val => grp_max_val.ceil,
1449
+ :min_val => 0,
1450
+ :print_value => $heatmapvalues,
1451
+ :print_gradient => false,
1452
+ :title => stem,
1453
+ :title_font_size => title_font_size)
1454
+ end
1455
+ end
1456
+
1457
+ # for heat maps in a single file
1458
+ if $heatmap == 1 or $heatmap == 2
1459
+ file = "#{$heatmapstem}.#{$heatmapformat}"
1460
+ heatmaps.heatmap(:columns => $heatmapcol,
1461
+ :rvg_width => $rvg_width,
1462
+ :max_val => grp_max_val.ceil,
1463
+ :min_val => 0).write(file)
1464
+
1465
+ $logger.info "Generating heat maps in a file, #{file} done."
1466
+ end
1467
+ end
1468
+
1469
+ # for a total substitution probability matrix
1470
+ $tot_prob_mat = NMatrix.float($amino_acids.size, $amino_acids.size)
1471
+
1472
+ $amino_acids.each_with_index do |aa, aj|
1473
+ 0.upto($amino_acids.size - 1) do |ai|
1474
+ $tot_prob_mat[aj, ai] = $smooth_prob[2][["0#{aa}"].to_set][ai]
1475
+ end
1476
+ end
1477
+
1478
+ if $output == 1
1479
+ $outfh.puts '>Total'
1480
+ $outfh.puts $tot_prob_mat.pretty_string(:col_header => $amino_acids,
1481
+ :row_header => $amino_acids)
1482
+ $outfh.close
1483
+
1484
+ # for a heat map
1485
+ if $heatmap == 0 or $heatmap == 2
1486
+ stem = "#{group_matrices.size}. TOTAL"
1487
+ $tot_prob_mat.heatmap(:col_header => $amino_acids,
1488
+ :row_header => $amino_acids,
1489
+ :rvg_width => $rvg_width,
1490
+ :rvg_height => $rvg_height,
1491
+ :canvas_width => $canvas_width,
1492
+ :canvas_height => $canvas_height,
1493
+ :max_val => $tot_prob_mat.max.ceil,
1494
+ :min_val => 0,
1495
+ :print_value => $heatmapvalues,
1496
+ :title => stem).write("#{stem}.#{$heatmapformat}")
1497
+
1498
+ $logger.info "Generating a heat map for #{stem} table done."
1499
+ end
1500
+ exit 0
1501
+ end
1502
+ end
1503
+
1504
+ #
1505
+ # Part 6. END
1506
+ #
1507
+
1508
+
1509
+ # Part 7.
1510
+ #
1511
+ # Calculating log odds ratio scoring matrices
1512
+ #
1513
+ if $output == 2
1514
+ $outfh.puts <<HEADER
1515
+ #
1516
+ # The probabilities were then divided by the background probabilities
1517
+ HEADER
1518
+ if $penv
1519
+ $outfh.puts <<HEADER
1520
+ # which were derived from the environment-dependent amino acid frequencies.
1521
+ # ^^^^^^^^^^^^^^^^^^^^^
1522
+ HEADER
1523
+ else
1524
+ $outfh.puts <<HEADER
1525
+ # which were derived from the environment-independent amino acid frequencies.
1526
+ # ^^^^^^^^^^^^^^^^^^^^^^^
1527
+ HEADER
1528
+ end
1529
+
1530
+ grp_logo_mats = []
1531
+ factor = $scale / Math::log(2)
1532
+
1533
+ $env_classes.groups_sorted_by_residue_labels.each_with_index do |group, group_no|
1534
+ # calculating substitution probability matrix for each environment
1535
+ grp_label = group[0]
1536
+ grp_envs = group[1]
1537
+ grp_logo_mat = $cys == 0 ?
1538
+ NMatrix.float($amino_acids.size, $amino_acids.size + 1) :
1539
+ NMatrix.float($amino_acids.size, $amino_acids.size)
1540
+
1541
+ $amino_acids.each_with_index do |aa, aj|
1542
+ env = grp_envs.detect { |e| e.label.start_with?(aa) }
1543
+ env.logo_array = $cys == 0 ?
1544
+ NArray.float($amino_acids.size + 1) :
1545
+ NArray.float($amino_acids.size)
1546
+
1547
+ env.send($nosmooth ? 'prob_array' : 'smooth_prob_array').to_a.each_with_index do |prob, ai|
1548
+ pai = 100.0 * $aa_tot_freq[$amino_acids[ai]]
1549
+ odds = prob / pai
1550
+ env.logo_array[ai] = factor * Math::log(odds)
1551
+ grp_logo_mat[aj, ai] = env.logo_array[ai]
1552
+ end
1553
+
1554
+ # adding log odds ratio for 'U' (J or C) when --cys is 0
1555
+ if $cys == 0
1556
+ pai = 100.0 * ($aa_tot_freq['C'] + $aa_tot_freq['J'])
1557
+ prob = env.send($nosmooth ? 'prob_array' : 'smooth_prob_array')[$amino_acids.index('C')] +
1558
+ env.send($nosmooth ? 'prob_array' : 'smooth_prob_array')[$amino_acids.index('J')]
1559
+ odds = prob / pai
1560
+ env.logo_array[$amino_acids.size] = factor * Math::log(odds)
1561
+ grp_logo_mat[aj, $amino_acids.size] = env.logo_array[$amino_acids.size]
1562
+ end
1563
+ end
1564
+
1565
+ grp_logo_mats << [grp_label, grp_logo_mat]
1566
+ end
1567
+
1568
+ $tot_logo_mat = $cys == 0 ?
1569
+ NMatrix.float($amino_acids.size, $amino_acids.size + 1) :
1570
+ NMatrix.float($amino_acids.size, $amino_acids.size)
1571
+
1572
+ $amino_acids.each_with_index do |aa1, aj|
1573
+ $amino_acids.each_with_index do |aa2, ai|
1574
+ prob = $tot_prob_mat[aj, ai]
1575
+ pai = 100.0 * $aa_tot_freq[$amino_acids[ai]]
1576
+ odds = prob / pai
1577
+ $tot_logo_mat[aj, ai] = factor * Math::log(odds)
1578
+ end
1579
+
1580
+ # adding log odds ratio for 'U' (J or C) when --cys is 0
1581
+ if $cys == 0
1582
+ pai = 100.0 * ($aa_tot_freq['C'] + $aa_tot_freq['J'])
1583
+ prob = $tot_prob_mat[aj, $amino_acids.index('C')] + $tot_prob_mat[aj, $amino_acids.index('J')]
1584
+ odds = prob / pai
1585
+ $tot_logo_mat[aj, $amino_acids.size] = factor * Math::log(odds)
1586
+ end
1587
+ end
1588
+
1589
+
1590
+ # calculating relative entropy for each amino acid pair H and
1591
+ # the expected score E in bit units
1592
+ tot_E = 0.0
1593
+ tot_H = 0.0
1594
+
1595
+ 0.upto($tot_logo_mat.shape[0] - 1) do |j|
1596
+ 0.upto($tot_logo_mat.shape[0] - 1) do |i| # it's deliberately '0' not '1'
1597
+ if j != i
1598
+ tot_E += $tot_logo_mat[j, i] * $aa_tot_freq[$amino_acids[j]] * $aa_tot_freq[$amino_acids[i]] / 2.0
1599
+ tot_H += $tot_logo_mat[j, i] * $tot_prob_mat[j, i] / 2.0 / 10000.0
1600
+ else
1601
+ tot_E += $tot_logo_mat[j, i] * $aa_tot_freq[$amino_acids[i]] * $aa_tot_freq[$amino_acids[i]]
1602
+ tot_H += $tot_logo_mat[j, i] * $tot_prob_mat[j, i] / 10000.0
1603
+ end
1604
+ end
1605
+ end
1606
+
1607
+ $outfh.puts <<HEADER
1608
+ #
1609
+ # Shown here are logarithms of these values multiplied by #{$scale}/log(2)
1610
+ HEADER
1611
+ unless $noroundoff
1612
+ $outfh.puts <<HEADER
1613
+ # rounded to the nearest integer (log-odds scores in 1/#{$scale} bit units).
1614
+ HEADER
1615
+ end
1616
+
1617
+ $outfh.puts <<HEADER
1618
+ # For total (composite) matrix, Entropy = #{"%5.4f" % tot_H} bits, Expected score = #{"%5.4f" % tot_E}
1619
+ #
1620
+ HEADER
1621
+
1622
+ grp_max_val = grp_logo_mats.map { |l, m| m }.map { |m| m.max }.max
1623
+ grp_min_val = grp_logo_mats.map { |l, m| m }.map { |m| m.min }.min
1624
+ abs_max_val = [grp_max_val.abs, grp_min_val.abs].max
1625
+ row_header = $cys ? $amino_acids + %w[U] : $amino_acids
1626
+ heatmaps = HeatmapArray.new if $heatmap == 1 or $heatmap == 2
1627
+ $heatmapcol ||= Math::sqrt(grp_logo_mats.size).round
1628
+
1629
+ grp_logo_mats.each_with_index do |arr, grp_no|
1630
+ grp_label = arr[0]
1631
+ grp_logo_mat = arr[1]
1632
+ stem = "#{grp_no}. #{grp_label}"
1633
+
1634
+ unless $noroundoff
1635
+ grp_logo_mat = grp_logo_mat.round
1636
+ end
1637
+
1638
+ # for a matrix file
1639
+ $outfh.puts ">#{grp_label} #{grp_no}"
1640
+ $outfh.puts grp_logo_mat.pretty_string(:col_header => $amino_acids,
1641
+ :row_header => row_header)
1642
+ # for a heat map
1643
+ if $heatmap == 0 or $heatmap == 2
1644
+ grp_logo_mat.heatmap(:col_header => $amino_acids,
1645
+ :row_header => row_header,
1646
+ :rvg_width => $rvg_width,
1647
+ :rvg_height => $rvg_height,
1648
+ :canvas_width => $canvas_width,
1649
+ :canvas_height => $canvas_height,
1650
+ :gradient_beg_color => '#0000FF',
1651
+ :gradient_mid_color => '#FFFFFF',
1652
+ :gradient_end_color => '#FF0000',
1653
+ :max_val => abs_max_val.ceil,
1654
+ :mid_val => 0,
1655
+ :min_val => -1 * abs_max_val.ceil,
1656
+ :print_value => $heatmapvalues,
1657
+ :title => stem).write("#{stem}.#{$heatmapformat}")
1658
+
1659
+ $logger.info "Generating a heat map for #{stem} table done."
1660
+ end
1661
+
1662
+ if $heatmap == 1 or $heatmap == 2
1663
+ title_font_size = $rvg_width * $heatmapcol / 80.0
1664
+ heatmaps << grp_logo_mat.heatmap(:col_header => $amino_acids,
1665
+ :row_header => row_header,
1666
+ :rvg_width => $rvg_width,
1667
+ :rvg_height => $rvg_height - 50,
1668
+ :canvas_width => $canvas_width,
1669
+ :canvas_height => $canvas_height - 50,
1670
+ :gradient_beg_color => '#0000FF',
1671
+ :gradient_mid_color => '#FFFFFF',
1672
+ :gradient_end_color => '#FF0000',
1673
+ :max_val => abs_max_val.ceil,
1674
+ :mid_val => 0,
1675
+ :min_val => -1 * abs_max_val.ceil,
1676
+ :print_value => $heatmapvalues,
1677
+ :print_gradient => false,
1678
+ :title => stem,
1679
+ :title_font_size => title_font_size)
1680
+ end
1681
+ end
1682
+
1683
+ # for heat maps in a single file
1684
+ if $heatmap == 1 or $heatmap == 2
1685
+ file = "#{$heatmapstem}.#{$heatmapformat}"
1686
+ heatmaps.heatmap(:columns => $heatmapcol,
1687
+ :rvg_width => $rvg_width,
1688
+ :gradient_beg_color => '#0000FF',
1689
+ :gradient_mid_color => '#FFFFFF',
1690
+ :gradient_end_color => '#FF0000',
1691
+ :max_val => abs_max_val.ceil,
1692
+ :mid_val => 0,
1693
+ :min_val => -1 * abs_max_val.ceil).write(file)
1694
+
1695
+ $logger.info "Generating heat maps in a file, #{file} done."
1696
+ end
1697
+
1698
+ # for a matrix file
1699
+ unless $noroundoff
1700
+ $tot_logo_mat = $tot_logo_mat.round
1701
+ end
1702
+
1703
+ $outfh.puts ">Total #{grp_logo_mats.size}"
1704
+ $outfh.puts $tot_logo_mat.pretty_string(:col_header => $amino_acids,
1705
+ :row_header => row_header)
1706
+
1707
+ # for a heat map
1708
+ if $heatmap == 0 or $heatmap == 2
1709
+ stem = "#{group_matrices.size}. TOTAL"
1710
+ tot_abs_max_val = [$tot_logo_mat.max.abs, $tot_logo_mat.min.abs].max
1711
+ $tot_logo_mat.heatmap(:col_header => $amino_acids,
1712
+ :row_header => row_header,
1713
+ :rvg_width => $rvg_width,
1714
+ :rvg_height => $rvg_height,
1715
+ :canvas_width => $canvas_width,
1716
+ :canvas_height => $canvas_height,
1717
+ :gradient_beg_color => '#0000FF',
1718
+ :gradient_mid_color => '#FFFFFF',
1719
+ :gradient_end_color => '#FF0000',
1720
+ :max_val => tot_abs_max_val.ceil,
1721
+ :mid_val => 0,
1722
+ :min_val => -1 * tot_abs_max_val.ceil,
1723
+ :print_value => $heatmapvalues,
1724
+ :title => stem).write("#{stem}.#{$heatmapformat}")
1725
+
1726
+ $logger.info "Generating a heat map for #{stem} table done."
1727
+ end
1728
+
1729
+ $logger.info "Calculating log odds ratios done."
1730
+ end
1731
+
1732
+ #
1733
+ # Part 7. END
1734
+ #
1735
+
1736
+ $outfh.close
1737
+ exit 0
1738
+ end
1739
+ end
1740
+
1741
+ end # class CLI
1742
+ end # module Ulla