egor 0.0.4 → 0.0.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/lib/egor/cli.rb CHANGED
@@ -1,18 +1,20 @@
1
- require "getoptlong"
2
- require "logger"
3
- require "rubygems"
4
- require "narray"
5
- require "bio"
6
- require "set"
7
- require "facets"
8
- require "simple_memoize"
9
-
10
- require "narray_extensions"
11
- require "nmatrix_extensions"
12
- require "enumerable_extensions"
13
- require "math_extensions"
14
- require "environment_feature"
15
- require "environment"
1
+ require 'rubygems'
2
+ require 'getoptlong'
3
+ require 'logger'
4
+ require 'narray'
5
+ require 'bio'
6
+ require 'set'
7
+ require 'facets'
8
+ require 'simple_memoize'
9
+
10
+ require 'narray_extensions'
11
+ require 'nmatrix_extensions'
12
+ require 'enumerable_extensions'
13
+ require 'math_extensions'
14
+ require 'environment'
15
+ require 'environment_class_hash'
16
+ require 'environment_feature'
17
+ require 'environment_feature_array'
16
18
 
17
19
  # This is a module for an actual command line interpreter for Egor
18
20
  # ---
@@ -45,29 +47,32 @@ Options:
45
47
  --tem-list (-l) FILE: a list for tem files
46
48
  --classdef (-c) FILE: a file for the defintion of environments (default: 'classdef.dat')
47
49
  --outfile (-o) FILE: output filename (default 'allmat.dat')
48
- --weight (-w) INTEGER: clustering level (PID) for the BLOSUM-like weighting
49
- --noweight: calculate substitution counts with no weights (default)
50
+ --weight (-w) INTEGER: clustering level (PID) for the BLOSUM-like weighting (default: 60)
51
+ --noweight: calculate substitution counts with no weights
50
52
  --smooth (-s) INTEGER:
51
53
  0 for partial smoothing (default)
52
54
  1 for full smoothing
55
+ --p1smooth: perform smoothing for p1 probability calculation when partial smoothing
53
56
  --nosmooth: perform no smoothing operation
54
57
  --cys (-y) INTEGER:
55
58
  0 for using C and J only for structure (default)
56
59
  1 for both structure and sequence
57
- 2 for using only C for both (should be set having no 'disulphide bonds' environment feature)
60
+ 2 for using only C for both (must be set when you have no 'disulphide' or 'disulfide' annotation in templates)
58
61
  --output INTEGER:
59
- 0 for raw counts (no-smoothing performed)
62
+ 0 for raw counts (no smoothing performed)
60
63
  1 for probabilities
61
64
  2 for log-odds (default)
65
+ --noround: do not round off log odds ratio
62
66
  --scale INTEGER: log-odds matrices in 1/n bit units (default 3)
63
- --sigma DOUBLE: change the sigma value for smoothing (default 5)
67
+ --sigma DOUBLE: change the sigma value for smoothing (default 5.0)
68
+ --autosigma: automatically adjust the sigma value for smoothing
64
69
  --add DOUBLE: add this value to raw counts when deriving log-odds without smoothing (default 1/#classes)
65
70
  --penv: use environment-dependent frequencies for log-odds calculation (default false) (NOT implemented yet!!!)
66
71
  --pidmin DOUBLE: count substitutions only for pairs with PID equal to or greater than this value (default none)
67
72
  --pidmax DOUBLE: count substitutions only for pairs with PID smaller than this value (default none)
68
73
  --verbose (-v) INTEGER
69
- 0 for ERROR level (default)
70
- 1 for WARN or above level
74
+ 0 for ERROR level
75
+ 1 for WARN or above level (default)
71
76
  2 for INFO or above level
72
77
  3 for DEBUG or above level
73
78
  --version: print version
@@ -79,72 +84,85 @@ Options:
79
84
  # Calculate PID between two sequences
80
85
  #
81
86
  # :call-seq:
82
- # Egor::CLI::calc_pid(seq1, seq2) -> Float
87
+ # Egor::CLI::calculate_pid(seq1, seq2) -> Float
83
88
  #
84
- def calc_pid(seq1, seq2)
85
- s1 = seq1.split("")
86
- s2 = seq2.split("")
89
+ def calculate_pid(seq1, seq2)
90
+ s1 = seq1.split('')
91
+ s2 = seq2.split('')
87
92
  cols = s1.zip(s2)
88
93
  align = 0
89
94
  ident = 0
90
95
  intgp = 0
91
96
 
92
97
  cols.each do |col|
93
- if (col[0] != "-") && (col[1] != "-")
98
+ if (col[0] != '-') && (col[1] != '-')
94
99
  align += 1
95
100
  if col[0] == col[1]
96
101
  ident += 1
97
102
  end
98
- elsif (((col[0] == "-") && (col[1] != "-")) ||
99
- ((col[0] != "-") && (col[1] == "-")))
103
+ elsif (((col[0] == '-') && (col[1] != '-')) || ((col[0] != '-') && (col[1] == '-')))
100
104
  intgp += 1
101
105
  end
102
106
  end
103
107
 
104
108
  pid = 100.0 * ident.to_f / (align + intgp)
105
109
  end
106
- memoize :calc_pid
110
+ memoize :calculate_pid
107
111
 
108
112
  # :nodoc:
109
113
  def execute(arguments=[])
110
114
  #
111
- # Abbreviations in the aa1 codes
112
- #
113
- # * env: environment
114
- # * tem: (FUGUE) template
115
- # * classdef: (envlironment) class definition
116
- # * aa: amino acid
117
- # * aa: weighted amino acid
118
- # * tot: total
119
- # * rel: relative
120
- # * obs: observation (frequency)
121
- # * mut: mutation
122
- # * mutb: mutability
123
- # * freq: frequency
124
- # * prob: probability
125
- # * opts: options
115
+ # * Abbreviations in the codes
126
116
  #
117
+ # env: environment
118
+ # tem: (FUGUE) template
119
+ # classdef: (environment) class definition
120
+ # aa: amino acid
121
+ # aa: weighted amino acid
122
+ # tot: total
123
+ # rel: relative
124
+ # obs: observation
125
+ # cnt: count
126
+ # mut: mutation
127
+ # mutb: mutability
128
+ # freq: frequency
129
+ # prob: probability
130
+ # logo: log odds ratio
131
+ # opts: options
132
+ # fh: file handle
133
+ # ff: flat file
134
+ # ali: alignment
135
+ # mat: matrix
136
+ # arr: array
137
+
127
138
 
128
139
  # Part 1.
129
140
  #
130
141
  # Global variables and their default values
131
142
  #
143
+
132
144
  $logger = Logger.new(STDOUT)
133
- $logger.level = Logger::ERROR
134
- $amino_acids = "ACDEFGHIKLMNPQRSTVWYJ".split("")
145
+ $logger.level = Logger::WARN
146
+
147
+ # default set of 21 amino acids including J (Cysteine, the free thiol form)
148
+ $amino_acids = 'ACDEFGHIKLMNPQRSTVWYJ'.split('')
149
+
135
150
  $tem_list = nil
136
151
  $tem_file = nil
137
- $classdef = "classdef.dat"
138
- $outfile = "allmat.dat"
152
+ $classdef = 'classdef.dat'
153
+ $outfile = 'allmat.dat'
139
154
  $outfh = nil # file handle for outfile
140
- $output = 2
155
+ $output = 2 # default: log odds matrix
141
156
  $ali_size = 0
142
157
  $tot_aa = 0
143
158
  $sigma = 5.0
159
+ $autosigma = false
144
160
  $weight = 60
145
161
  $noweight = false
146
162
  $smooth = :partial
147
163
  $nosmooth = false
164
+ $noround = false
165
+ $p1smooth = false
148
166
  $scale = 3
149
167
  $pidmin = nil
150
168
  $pidmax = nil
@@ -153,16 +171,21 @@ Options:
153
171
  $cys = 0
154
172
  $penv = false
155
173
 
156
- $aa_tot_obs = Hash.new(0)
157
- $aa_mut_obs = Hash.new(0)
174
+ $aa_tot_cnt = Hash.new(0)
175
+ $aa_mut_cnt = Hash.new(0)
158
176
  $aa_mutb = {}
159
177
  $aa_rel_mutb = {}
160
- $aa_rel_freq = {}
161
- $env_aa_obs = Hash.new(0)
178
+ $aa_tot_freq = {}
179
+ $aa_env_cnt = Hash.new(0)
162
180
  $smooth_prob = {}
163
- $tot_freq_mat = nil
181
+ $tot_cnt_mat = nil
164
182
  $tot_prob_mat = nil
165
183
  $tot_logo_mat = nil
184
+ $tot_smooth_prob = {}
185
+
186
+ # minimum ratio of amino acid count to sigma value
187
+ $min_obs_sigma_ratio = 500.0
188
+
166
189
  #
167
190
  # Part 1 END
168
191
  #
@@ -171,6 +194,7 @@ Options:
171
194
  #
172
195
  # Parsing options
173
196
  #
197
+
174
198
  opts = GetoptLong.new(
175
199
  [ '--help', '-h', GetoptLong::NO_ARGUMENT ],
176
200
  [ '--tem-list', '-l', GetoptLong::REQUIRED_ARGUMENT ],
@@ -178,9 +202,13 @@ Options:
178
202
  [ '--classdef', '-c', GetoptLong::REQUIRED_ARGUMENT ],
179
203
  [ '--smooth', '-s', GetoptLong::REQUIRED_ARGUMENT ],
180
204
  [ '--nosmooth', GetoptLong::NO_ARGUMENT ],
205
+ [ '--p1smooth', GetoptLong::NO_ARGUMENT ],
181
206
  [ '--weight', '-w', GetoptLong::REQUIRED_ARGUMENT ],
182
207
  [ '--noweight', GetoptLong::NO_ARGUMENT ],
183
- [ '--heatmap', GetoptLong::NO_ARGUMENT ],
208
+ [ '--noround', GetoptLong::NO_ARGUMENT ],
209
+ [ '--sigma', GetoptLong::REQUIRED_ARGUMENT ],
210
+ [ '--autosigma', GetoptLong::NO_ARGUMENT ],
211
+ #[ '--heatmap', GetoptLong::NO_ARGUMENT ],
184
212
  [ '--output', GetoptLong::REQUIRED_ARGUMENT ],
185
213
  [ '--cys', '-y', GetoptLong::REQUIRED_ARGUMENT ],
186
214
  [ '--penv', GetoptLong::NO_ARGUMENT ],
@@ -189,70 +217,95 @@ Options:
189
217
  [ '--version', GetoptLong::NO_ARGUMENT ]
190
218
  )
191
219
 
192
- opts.each do |opt, arg|
193
- case opt
194
- when '--help'
195
- print_usage
196
- exit 0
197
- when '--tem-list'
198
- $tem_list = arg
199
- when '--tem-file'
200
- $tem_file = arg
201
- when '--classdef'
202
- $classdef = arg
203
- when '--output'
204
- $output = arg.to_i
205
- when '--outfile'
206
- $outfile = arg
207
- when '--cys'
208
- $cys = arg.to_i
209
- when '--weight'
210
- $weight = arg.to_i
211
- when '--sigma'
212
- $sigma = arg.to_f
213
- when '--pidmin'
214
- $pidmin = arg.to_f
215
- when '--pidmax'
216
- $pidmax = arg.to_f
217
- when '--noweight'
218
- $noweight = true
219
- when '--smooth'
220
- $smooth = (arg.to_i == 1 ? :full : :partial)
221
- when '--nosmooth'
222
- $nosmooth = true
223
- when '--scale'
224
- $scale = arg.to_f
225
- when '--add'
226
- $logger.error "!!! --add option is not supported yet"
227
- exit 1
228
- $add = arg.to_f
229
- when '--penv'
230
- $logger.error "!!! --penv option is not supported yet"
231
- exit 1
232
- $penv = true
233
- when '--heatmap'
234
- $heatmap = true
235
- when '--verbose'
236
- $logger.level = case arg.to_i
237
- when 0 then Logger::ERROR
238
- when 1 then Logger::WARN
239
- when 2 then Logger::INFO
240
- when 3 then Logger::DEBUG
241
- else Logger::ERROR
242
- end
243
- when '--version'
244
- print_version
245
- exit 0
220
+ begin
221
+ opts.each do |opt, arg|
222
+ case opt
223
+ when '--help'
224
+ print_usage
225
+ exit 0
226
+ when '--tem-list'
227
+ $tem_list = arg
228
+ when '--tem-file'
229
+ $tem_file = arg
230
+ when '--classdef'
231
+ $classdef = arg
232
+ when '--output'
233
+ $output = arg.to_i
234
+ when '--outfile'
235
+ $outfile = arg
236
+ when '--cys'
237
+ $cys = arg.to_i
238
+ when '--weight'
239
+ $weight = arg.to_i
240
+ when '--sigma'
241
+ $sigma = arg.to_f
242
+ when '--autosigma'
243
+ $autosigma = true
244
+ when '--pidmin'
245
+ $pidmin = arg.to_f
246
+ when '--pidmax'
247
+ $pidmax = arg.to_f
248
+ when '--noweight'
249
+ $noweight = true
250
+ when '--noround'
251
+ $noround = true
252
+ when '--smooth'
253
+ $smooth = (arg.to_i == 1) ? :full : :partial
254
+ when '--nosmooth'
255
+ $nosmooth = true
256
+ when '--p1smooth'
257
+ $p1smooth = true
258
+ when '--scale'
259
+ $scale = arg.to_f
260
+ when '--add'
261
+ $add = arg.to_f
262
+ when '--penv'
263
+ warn "--penv option is not supported yet."
264
+ exit 1
265
+ $penv = true
266
+ # when '--heatmap'
267
+ # $heatmap = true
268
+ when '--verbose'
269
+ $logger.level = case arg.to_i
270
+ when 0 then Logger::ERROR
271
+ when 1 then Logger::WARN
272
+ when 2 then Logger::INFO
273
+ when 3 then Logger::DEBUG
274
+ else Logger::WARN
275
+ end
276
+ when '--version'
277
+ print_version
278
+ exit 0
279
+ end
246
280
  end
281
+ rescue
282
+ # invalid option
283
+ exit 1
247
284
  end
248
285
 
249
286
  # when arguments are nonsense, print usage
250
- if ((ARGV.length != 0) ||
251
- (!$tem_list && !$tem_file) ||
252
- ($tem_list && $tem_file))
287
+ if ((ARGV.length != 0) || (!$tem_list && !$tem_file) || ($tem_list && $tem_file))
253
288
  print_usage
254
289
  exit 1
255
290
  end
291
+
292
+ # abort with a warning if any input file is missing
293
+ if $tem_list && !File.exist?($tem_list)
294
+ warn "Cannot find template list file, #{$tem_list}"
295
+ exit 1
296
+ end
297
+
298
+ if $tem_file && !File.exist?($tem_file)
299
+ warn "Cannot find template file, #{$tem_file}"
300
+ exit 1
301
+ end
302
+
303
+ if $classdef && !File.exist?($classdef)
304
+ warn "Cannot find environment class definition file, #{$classdef}"
305
+ exit 1
306
+ end
307
+
308
+
256
309
  #
257
310
  # Part 2 END
258
311
  #
@@ -263,76 +316,68 @@ Options:
263
316
  # Reading Environment Class Definition File
264
317
  #
265
318
 
266
- # set amino_acids
267
- $amino_acids = "ACDEFGHIKLMNPQRSTVWY".split("") if $cys == 2
319
+ $logger.info "Egor START."
268
320
 
269
- # an array for storing all environment feature objects
270
- $env_features = []
321
+ # check --cys option and modify amino_acids set if necessary
322
+ if $cys == 2
323
+ $amino_acids = 'ACDEFGHIKLMNPQRSTVWY'.split('')
324
+ end
325
+
326
+ # create an EnvironmentFeatureArray object for storing all environment features
327
+ $env_features = EnvironmentFeatureArray.new
271
328
 
272
329
  # an array for storing indexes of constrained environment features
273
330
  $cst_features = []
274
331
 
275
- # aa1 amino acid in a substitution itself is a environment feature
276
- $env_features << EnvironmentFeature.new("sequence",
277
- $amino_acids,
278
- $amino_acids,
279
- "F",
280
- "F")
332
+ # add substituted amino acid (aa1) in a substitution to the environment feature list
333
+ $env_features << EnvironmentFeature.new('sequence', $amino_acids, $amino_acids, 'F', 'F')
281
334
 
282
- # read environment class definiton file and
283
- # store them into the hash prepared above
335
+ # read the environment class definition file and store its features into the feature array prepared above
284
336
  env_index = 1
285
337
 
286
338
  IO.foreach($classdef) do |line|
287
339
  line.chomp!
288
- if line.start_with?("#")
340
+ if line.start_with?('#')
289
341
  next
290
342
  elsif (env_ftr = line.chomp.split(/;/)).length == 5
291
- $logger.info ">>> An environment feature, #{line} detected"
292
- if env_ftr[-1] == "T"
343
+ $logger.info "An environment feature, #{line} detected."
344
+ if env_ftr[-1] == 'T'
293
345
  # skip silenced environment feature
294
- $logger.warn "!!! The environment feature, #{line} silent"
346
+ $logger.warn "The environment feature, #{line} silent."
295
347
  next
296
348
  end
297
- if env_ftr[-2] == "T"
349
+ if env_ftr[-2] == 'T'
298
350
  $cst_features << env_index
299
- $logger.warn "!!! The environment feature, #{line} constrained"
351
+ $logger.warn "The environment feature, #{line} constrained."
300
352
  end
301
- $env_features << EnvironmentFeature.new(env_ftr[0],
302
- env_ftr[1].split(""),
303
- env_ftr[2].split(""),
304
- env_ftr[3],
305
- env_ftr[4])
353
+ $env_features << EnvironmentFeature.new(env_ftr[0], env_ftr[1].split(''), env_ftr[2].split(''), env_ftr[3], env_ftr[4])
306
354
  env_index += 1
307
355
  else
308
- $logger.error "@@@ #{line} doesn't seem to be a proper format for class definition"
356
+ $logger.error "\"#{line}\" doesn't seem to be a proper format for a environment class definition."
309
357
  exit 1
310
358
  end
311
359
  end
312
360
 
313
- # a hash for storing all environment objects
314
- $envs = {}
315
-
316
- # generate all possible combinations of environment labels, and
317
- # create & store every environment object into the hash prepared above with the label as a key
318
- $env_features.inject([]) { |sum, ec|
319
- sum << ec.labels
320
- }.inject { |pro, lb|
321
- pro.product(lb)
322
- }.each_with_index { |e, i|
323
- $envs[e.flatten.join] = Environment.new(i, e.flatten.join, $amino_acids)
361
+ # a hash for storing all environment classes
362
+ $env_classes = EnvironmentClassHash.new
363
+
364
+ # generate all possible combinations of environment labels, and store every environment class into the hash prepared above with the label as a key
365
+ $env_features.label_combinations.each_with_index { |e, i|
366
+ $env_classes[e.flatten.join] = Environment.new(i, e.flatten.join, $amino_acids)
324
367
  }
368
+
325
369
  #
326
370
  # Part 3 END
327
371
  #
328
372
 
373
+
329
374
  # Part 4.
330
375
  #
331
376
  # Reading TEM file or TEMLIST list file and couting substitutions
332
377
  #
333
378
 
334
379
  # a global file handle for output
335
- $outfh = File.open($outfile, "w")
380
+ $outfh = File.open($outfile, 'w')
336
381
 
337
382
  if $tem_file
338
383
  $tem_list_io = StringIO.new($tem_file)
@@ -345,18 +390,19 @@ Options:
345
390
  $tem_list_io.each_line do |tem_file|
346
391
  tem_file.chomp!
347
392
 
348
- $logger.info ">>> Analysing #{tem_file} ..."
393
+ $logger.info "Analysing #{tem_file} ..."
349
394
 
350
395
  ali = Bio::Alignment::OriginalAlignment.new
351
396
  ff = Bio::FlatFile.auto(tem_file)
397
+
352
398
  ff.each_entry do |pir|
353
- if pir.definition == "sequence"
354
- ali.add_seq(pir.data.gsub("\n", ""), pir.entry_id)
399
+ if (pir.definition == 'sequence') || (pir.definition == 'structure')
400
+ ali.add_seq(pir.data.gsub("\n", ''), pir.entry_id)
355
401
  end
356
402
  end
357
403
 
358
404
  if ali.size < 2
359
- $logger.warn "!!! Skipped #{tem_file}, there is only one 'sequence' entry"
405
+ $logger.warn "Skipped #{tem_file}, there is only one unique entry."
360
406
  next
361
407
  end
362
408
 
@@ -368,8 +414,8 @@ Options:
368
414
  # check disulphide bond environment first!
369
415
  ff.rewind
370
416
  ff.each_entry do |pir|
371
- if (pir.entry_id == key) && (pir.definition == "disulphide")
372
- disulphide[key] = pir.data.gsub("\n", "").split("")
417
+ if (pir.entry_id == key) && ((pir.definition == "disulphide") || (pir.definition == "disulfide"))
418
+ disulphide[key] = pir.data.gsub("\n", '').split('')
373
419
  end
374
420
  end
375
421
 
@@ -379,14 +425,14 @@ Options:
379
425
  ff.rewind
380
426
  ff.each_entry do |pir|
381
427
  if (pir.entry_id == key) && (pir.definition == ec.name)
382
- labels = pir.data.gsub("\n", "").split("").map_with_index do |sym, pos|
383
- if sym == "-"
384
- "-"
385
- elsif sym == "X" || sym == "x"
386
- "X"
428
+ labels = pir.data.gsub("\n", '').split('').map_with_index do |sym, pos|
429
+ if sym == '-'
430
+ '-'
431
+ elsif sym == 'X' || sym == 'x'
432
+ 'X'
387
433
  else
388
434
  if ei == 0 # Amino Acid Environment Feature
389
- (( disulphide.has_key?(key) and disulphide[key][pos] == "F") && (sym == "C")) ? "J" : sym
435
+ (disulphide.has_key?(key) && (disulphide[key][pos] == 'F') && (sym == 'C')) ? 'J' : sym
390
436
  else
391
437
  ec.labels[ec.symbols.index(sym)]
392
438
  end
@@ -407,19 +453,19 @@ Options:
407
453
  ali.each_pair do |id1, seq1|
408
454
  ali.each_pair do |id2, seq2|
409
455
  if id1 != id2
410
- pid = calc_pid(seq1, seq2)
411
- s1 = seq1.split("")
412
- s2 = seq2.split("")
456
+ pid = calculate_pid(seq1, seq2)
457
+ s1 = seq1.split('')
458
+ s2 = seq2.split('')
413
459
 
414
460
  # check PID_MIN
415
461
  if $pidmin && (pid < $pidmin)
416
- $logger.info ">>> Skip alignment between #{id1} and #{id2} having PID, #{pid}% less than PID_MIN, #{$pidmin}"
462
+ $logger.info "Skip alignment between #{id1} and #{id2} having PID, #{pid}% less than PID_MIN, #{$pidmin}."
417
463
  next
418
464
  end
419
465
 
420
466
  # check PID_MAX
421
467
  if $pidmax && (pid > $pidmax)
422
- $logger.info ">>> Skip alignment between #{id1} and #{id2} having PID, #{pid}% greater than PID_MAX, #{$pidmax}"
468
+ $logger.info "Skip alignment between #{id1} and #{id2} having PID, #{pid}% greater than PID_MAX, #{$pidmax}."
423
469
  next
424
470
  end
425
471
 
@@ -427,65 +473,65 @@ Options:
427
473
  aa1.upcase!
428
474
  aa2 = s2[pos].upcase
429
475
 
430
- if env_labels[id1][pos].include?("X")
431
- $logger.info ">>> Substitutions from #{id1}-#{pos}-#{aa1} were masked"
476
+ if env_labels[id1][pos].include?('X')
477
+ $logger.info "Substitutions from #{id1}-#{pos}-#{aa1} were masked."
432
478
  next
433
479
  end
434
480
 
435
- if env_labels[id2][pos].include?("X")
436
- $logger.info ">>> Substitutions to #{id2}-#{pos}-#{aa2} were masked"
481
+ if env_labels[id2][pos].include?('X')
482
+ $logger.info "Substitutions to #{id2}-#{pos}-#{aa2} were masked."
437
483
  next
438
484
  end
439
485
 
440
- if !$amino_acids.include?(aa1)
441
- $logger.warn "!!! #{id1}-#{pos}-#{aa1} is not a standard amino acid" unless aa1 == "-"
486
+ unless $amino_acids.include?(aa1)
487
+ $logger.warn "#{id1}-#{pos}-#{aa1} is not a standard amino acid." unless aa1 == "-"
442
488
  next
443
489
  end
444
490
 
445
- if !$amino_acids.include?(aa2)
446
- $logger.warn "!!! #{id1}-#{pos}-#{aa2} is not a standard amino acid" unless aa2 == "-"
491
+ unless $amino_acids.include?(aa2)
492
+ $logger.warn "#{id1}-#{pos}-#{aa2} is not a standard amino acid." unless aa2 == "-"
447
493
  next
448
494
  end
449
495
 
450
- aa1 = (((disulphide.has_key?(id1) and disulphide[id1][pos] == "F") && (aa1 == "C")) ? "J" : aa1)
451
- aa2 = (((disulphide.has_key?(id2) and disulphide[id2][pos] == "F") && (aa2 == "C")) ? "J" : aa2)
496
+ aa1 = (disulphide.has_key?(id1) && (disulphide[id1][pos] == 'F') && (aa1 == 'C')) ? 'J' : aa1
497
+ aa2 = (disulphide.has_key?(id2) && (disulphide[id2][pos] == 'F') && (aa2 == 'C')) ? 'J' : aa2
452
498
 
453
499
  if $cst_features.empty?
454
- $envs[env_labels[id1][pos]].increase_residue_count(aa2)
455
- elsif (env_labels[id1][pos].split("").values_at(*$cst_features) == env_labels[id2][pos].split("").values_at(*$cst_features))
456
- $envs[env_labels[id1][pos]].increase_residue_count(aa2)
500
+ $env_classes[env_labels[id1][pos]].increase_residue_count(aa2)
501
+ elsif (env_labels[id1][pos].split('').values_at(*$cst_features) == env_labels[id2][pos].split('').values_at(*$cst_features))
502
+ $env_classes[env_labels[id1][pos]].increase_residue_count(aa2)
457
503
  else
458
- $logger.debug "*** #{id1}-#{pos}-#{aa1} and #{id2}-#{pos}-#{aa2} have different symbols for constrained environment features each other"
504
+ $logger.debug "Skipped #{id1}-#{pos}-#{aa1} and #{id2}-#{pos}-#{aa2}, they have different symbols for constrained environment features each other."
459
505
  next
460
506
  end
461
507
 
462
508
  grp_label = env_labels[id1][pos][1..-1]
463
509
 
464
- if $env_aa_obs.has_key? grp_label
465
- if $env_aa_obs[grp_label].has_key? aa1
466
- $env_aa_obs[grp_label][aa1] += 1
510
+ if $aa_env_cnt.has_key? grp_label
511
+ if $aa_env_cnt[grp_label].has_key? aa1
512
+ $aa_env_cnt[grp_label][aa1] += 1
467
513
  else
468
- $env_aa_obs[grp_label][aa1] = 1
514
+ $aa_env_cnt[grp_label][aa1] = 1
469
515
  end
470
516
  else
471
- $env_aa_obs[grp_label] = Hash.new(0)
472
- $env_aa_obs[grp_label][aa1] = 1
517
+ $aa_env_cnt[grp_label] = Hash.new(0)
518
+ $aa_env_cnt[grp_label][aa1] = 1
473
519
  end
474
520
 
475
- if $aa_tot_obs.has_key? aa1
476
- $aa_tot_obs[aa1] += 1
521
+ if $aa_tot_cnt.has_key? aa1
522
+ $aa_tot_cnt[aa1] += 1
477
523
  else
478
- $aa_tot_obs[aa1] = 1
524
+ $aa_tot_cnt[aa1] = 1
479
525
  end
480
526
 
481
527
  if aa1 != aa2
482
- if $aa_mut_obs.has_key? aa1
483
- $aa_mut_obs[aa1] += 1
528
+ if $aa_mut_cnt.has_key? aa1
529
+ $aa_mut_cnt[aa1] += 1
484
530
  else
485
- $aa_mut_obs[aa1] = 1
531
+ $aa_mut_cnt[aa1] = 1
486
532
  end
487
533
  end
488
- $logger.debug "*** Add #{id1}-#{pos}-#{aa1} -> #{id2}-#{pos}-#{aa2} substitution for #{env_labels[id1][pos]}"
534
+ $logger.debug "#{id1}-#{pos}-#{aa1} -> #{id2}-#{pos}-#{aa2} substitution count (1) was added to the environments class, #{env_labels[id1][pos]}."
489
535
  end
490
536
  end
491
537
  end
@@ -504,7 +550,7 @@ Options:
504
550
  found = false
505
551
  clusters[i].each do |c1|
506
552
  clusters[j].each do |c2|
507
- if calc_pid(ali[c1], ali[c2]) >= $weight
553
+ if calculate_pid(ali[c1], ali[c2]) >= $weight
508
554
  indexes << j
509
555
  found = true
510
556
  break
@@ -527,106 +573,110 @@ Options:
527
573
  end
528
574
  end while(continue)
529
575
 
576
+ if clusters.size < 2
577
+ $logger.debug "Skipped #{tem_file} because there is only one cluster at the #{$weight} PID level."
578
+ next
579
+ end
580
+
530
581
  clusters.combination(2).each do |cluster1, cluster2|
531
582
  cluster1.each do |id1|
532
583
  cluster2.each do |id2|
533
- seq1 = ali[id1].split("")
534
- seq2 = ali[id2].split("")
584
+ seq1 = ali[id1].split('')
585
+ seq2 = ali[id2].split('')
535
586
 
536
587
  seq1.each_with_index do |aa1, pos|
537
588
  aa1.upcase!
538
- aa2 = seq2[pos].upcase rescue next # should fix this in sane way!
589
+ aa2 = seq2[pos].upcase rescue next # should fix this in a sane way!
539
590
 
540
- if env_labels[id1][pos].include?("X")
541
- $logger.debug "*** Substitutions from #{id1}-#{pos}-#{aa1} were masked"
591
+ if env_labels[id1][pos].include?('X')
592
+ $logger.debug "All substitutions from #{id1}-#{pos}-#{aa1} are masked."
542
593
  next
543
594
  end
544
595
 
545
- if env_labels[id2][pos].include?("X")
546
- $logger.debug "*** Substitutions to #{id2}-#{pos}-#{aa2} were masked"
596
+ if env_labels[id2][pos].include?('X')
597
+ $logger.debug "All substitutions to #{id2}-#{pos}-#{aa2} are masked."
547
598
  next
548
599
  end
549
600
 
550
- if !$amino_acids.include?(aa1)
551
- $logger.warn "!!! #{id1}-#{pos}-#{aa1} is not standard amino acid" unless aa1 == "-"
601
+ unless $amino_acids.include?(aa1)
602
+ $logger.warn "#{id1}-#{pos}-#{aa1} is not standard amino acid." unless aa1 == "-"
552
603
  next
553
604
  end
554
605
 
555
- if !$amino_acids.include?(aa2)
556
- $logger.warn "!!! #{id1}-#{pos}-#{aa2} is not standard amino acid" unless aa2 == "-"
606
+ unless $amino_acids.include?(aa2)
607
+ $logger.warn "#{id2}-#{pos}-#{aa2} is not standard amino acid." unless aa2 == "-"
557
608
  next
558
609
  end
559
610
 
560
- aa1 = (((disulphide.has_key?(id1) and disulphide[id1][pos] == "F") && (aa1 == "C")) ? "J" : aa1)
561
- aa2 = (((disulphide.has_key?(id2) and disulphide[id2][pos] == "F") && (aa2 == "C")) ? "J" : aa2)
562
- size1 = cluster1.size
563
- size2 = cluster2.size
564
- obs1 = 1.0 / size1
565
- obs2 = 1.0 / size2
611
+ aa1 = (disulphide.has_key?(id1) && (disulphide[id1][pos] == 'F') && (aa1 == 'C')) ? 'J' : aa1
612
+ aa2 = (disulphide.has_key?(id2) && (disulphide[id2][pos] == 'F') && (aa2 == 'C')) ? 'J' : aa2
613
+ obs1 = 1.0 / cluster1.size
614
+ obs2 = 1.0 / cluster2.size
615
+ obs_cnt = obs1 * obs2
566
616
 
567
617
  if $cst_features.empty?
568
- $envs[env_labels[id1][pos]].increase_residue_count(aa2, 1.0 / (size1 * size2))
569
- $envs[env_labels[id2][pos]].increase_residue_count(aa1, 1.0 / (size1 * size2))
570
- elsif (env_labels[id1][pos].split("").values_at(*$cst_features) == env_labels[id2][pos].split("").values_at(*$cst_features))
571
- $envs[env_labels[id1][pos]].increase_residue_count(aa2, 1.0 / (size1 * size2))
572
- $envs[env_labels[id2][pos]].increase_residue_count(aa1, 1.0 / (size1 * size2))
618
+ $env_classes[env_labels[id1][pos]].increase_residue_count(aa2, obs_cnt)
619
+ $env_classes[env_labels[id2][pos]].increase_residue_count(aa1, obs_cnt)
620
+ elsif (env_labels[id1][pos].split('').values_at(*$cst_features) == env_labels[id2][pos].split('').values_at(*$cst_features))
621
+ $env_classes[env_labels[id1][pos]].increase_residue_count(aa2, obs_cnt)
622
+ $env_classes[env_labels[id2][pos]].increase_residue_count(aa1, obs_cnt)
573
623
  else
574
- $logger.debug "*** #{id1}-#{pos}-#{aa1} and #{id2}-#{pos}-#{aa2} have different symbols for constrained environment features each other"
624
+ $logger.debug "#{id1}-#{pos}-#{aa1} and #{id2}-#{pos}-#{aa2} have different symbols for constrained environment features each other."
575
625
  next
576
626
  end
577
627
 
578
628
  grp_label1 = env_labels[id1][pos][1..-1]
579
629
  grp_label2 = env_labels[id2][pos][1..-1]
580
630
 
581
- if $env_aa_obs.has_key? grp_label1
582
- if $env_aa_obs[grp_label1].has_key? aa1
583
- $env_aa_obs[grp_label1][aa1] += obs1
631
+ if $aa_env_cnt.has_key? grp_label1
632
+ if $aa_env_cnt[grp_label1].has_key? aa1
633
+ $aa_env_cnt[grp_label1][aa1] += obs1
584
634
  else
585
- $env_aa_obs[grp_label1][aa1] = obs1
635
+ $aa_env_cnt[grp_label1][aa1] = obs1
586
636
  end
587
637
  else
588
- $env_aa_obs[grp_label1] = Hash.new(0.0)
589
- $env_aa_obs[grp_label1][aa1] = obs1
638
+ $aa_env_cnt[grp_label1] = Hash.new(0.0)
639
+ $aa_env_cnt[grp_label1][aa1] = obs1
590
640
  end
591
641
 
592
- if $env_aa_obs.has_key? grp_label2
593
- if $env_aa_obs[grp_label2].has_key? aa2
594
- $env_aa_obs[grp_label2][aa2] += obs2
642
+ if $aa_env_cnt.has_key? grp_label2
643
+ if $aa_env_cnt[grp_label2].has_key? aa2
644
+ $aa_env_cnt[grp_label2][aa2] += obs2
595
645
  else
596
- $env_aa_obs[grp_label2][aa2] = obs2
646
+ $aa_env_cnt[grp_label2][aa2] = obs2
597
647
  end
598
648
  else
599
- $env_aa_obs[grp_label2] = Hash.new(0.0)
600
- $env_aa_obs[grp_label2][aa2] = obs2
649
+ $aa_env_cnt[grp_label2] = Hash.new(0.0)
650
+ $aa_env_cnt[grp_label2][aa2] = obs2
601
651
  end
602
652
 
603
- if $aa_tot_obs.has_key? aa1
604
- $aa_tot_obs[aa1] += obs1
653
+ if $aa_tot_cnt.has_key? aa1
654
+ $aa_tot_cnt[aa1] += obs1
605
655
  else
606
- $aa_tot_obs[aa1] = obs1
656
+ $aa_tot_cnt[aa1] = obs1
607
657
  end
608
658
 
609
- if $aa_tot_obs.has_key? aa2
610
- $aa_tot_obs[aa2] += obs2
659
+ if $aa_tot_cnt.has_key? aa2
660
+ $aa_tot_cnt[aa2] += obs2
611
661
  else
612
- $aa_tot_obs[aa2] = obs2
662
+ $aa_tot_cnt[aa2] = obs2
613
663
  end
614
664
 
615
665
  if aa1 != aa2
616
- if $aa_mut_obs.has_key? aa1
617
- $aa_mut_obs[aa1] += obs1
666
+ if $aa_mut_cnt.has_key? aa1
667
+ $aa_mut_cnt[aa1] += obs1
618
668
  else
619
- $aa_mut_obs[aa1] = obs1
669
+ $aa_mut_cnt[aa1] = obs1
620
670
  end
621
- if $aa_mut_obs.has_key? aa2
622
- $aa_mut_obs[aa2] += obs2
671
+ if $aa_mut_cnt.has_key? aa2
672
+ $aa_mut_cnt[aa2] += obs2
623
673
  else
624
- $aa_mut_obs[aa2] = obs2
674
+ $aa_mut_cnt[aa2] = obs2
625
675
  end
626
676
  end
627
677
 
628
- $logger.debug "*** Add #{id1}-#{pos}-#{aa1} -> #{id2}-#{pos}-#{aa2} substitution for #{env_labels[id1][pos]}"
629
- $logger.debug "*** Add #{id2}-#{pos}-#{aa2} -> #{id1}-#{pos}-#{aa1} substitution for #{env_labels[id2][pos]}"
678
+ $logger.debug "#{id1}-#{pos}-#{aa1} -> #{id2}-#{pos}-#{aa2} substitution count (#{"%.2f" % obs_cnt}) was added to the environments class, #{env_labels[id1][pos]}."
679
+ $logger.debug "#{id2}-#{pos}-#{aa2} -> #{id1}-#{pos}-#{aa1} substitution count (#{"%.2f" % obs_cnt}) was added to the environments class, #{env_labels[id2][pos]}."
630
680
  end
631
681
  end
632
682
  end
@@ -636,7 +686,6 @@ Options:
636
686
 
637
687
  # print out default header
638
688
  $outfh.puts <<HEADER
639
- #
640
689
  # Environment-specific amino acid substitution matrices
641
690
  # Creator: egor version #{Egor::VERSION}
642
691
  # Creation Date: #{Time.now.strftime("%d/%m/%Y %H:%M")}
@@ -649,55 +698,94 @@ HEADER
649
698
  $env_features[1..-1].each { |e| $outfh.puts "# #{e}" }
650
699
 
651
700
  $outfh.puts <<HEADER
652
- #
653
701
  # (read in from #{$classdef})
654
702
  #
655
703
  # Number of alignments: #{$ali_size}
656
704
  # (list of .tem files read in from #{$tem_list})
657
705
  #
658
- # Total number of environments: #{Integer($envs.size / $amino_acids.size)}
706
+ # Total number of environments: #{Integer($env_classes.size / $amino_acids.size)}
659
707
  #
660
708
  # There are #{$amino_acids.size} amino acids considered.
661
709
  # #{$amino_acids.join}
662
710
  #
663
711
  HEADER
664
712
 
713
+ if $amino_acids.include? 'J'
714
+ $outfh.puts <<HEADER
715
+ # C: Cystine (the disulfide-bonded form)
716
+ # J: Cysteine (the free thiol form)
717
+ #
718
+ HEADER
719
+ end
720
+
665
721
  if $noweight
666
- $outfh.puts "# Weighting scheme: none"
722
+ $outfh.puts '# Weighting scheme: none'
667
723
  else
668
724
  $outfh.puts "# Weighting scheme: clustering at PID #{$weight} level"
669
725
  end
670
726
 
671
727
  # calculate amino acid frequencies and mutabilities, and
672
728
  # print them as default statistics in the header part
673
- ala_factor = if $aa_tot_obs["A"] == 0
729
+ ala_factor = if $aa_tot_cnt['A'] == 0
674
730
  0.0
675
- elsif $aa_mut_obs["A"] == 0
731
+ elsif $aa_mut_cnt['A'] == 0
676
732
  0.0
677
733
  else
678
- 100.0 * $aa_tot_obs["A"] / $aa_mut_obs["A"].to_f
734
+ 100.0 * $aa_tot_cnt['A'] / $aa_mut_cnt['A'].to_f
679
735
  end
680
- $tot_aa = $aa_tot_obs.values.sum
736
+ $tot_aa = $aa_tot_cnt.values.sum
681
737
 
682
- $outfh.puts "#"
738
+ $outfh.puts '#'
683
739
  $outfh.puts "# Total amino acid frequencies:\n"
684
- $outfh.puts "# %-3s %9s %9s %5s %8s %8s" % %w[RES TOT_OBS MUT_OBS MUTB REL_MUTB REL_FRQ]
740
+ $outfh.puts "# %-3s %9s %9s %5s %8s %8s" % %w[RES TOT_OBS MUT_OBS MUTB REL_MUTB REL_FREQ]
741
+
742
+ min_obs = -1
743
+ min_sigma = nil
685
744
 
686
745
  $amino_acids.each do |res|
687
- $aa_mutb[res] = $aa_tot_obs[res] == 0 ? 1.0 : $aa_mut_obs[res] / $aa_tot_obs[res].to_f
746
+ if ($aa_tot_cnt[res] / $sigma) < $min_obs_sigma_ratio
747
+ if min_obs < 0
748
+ min_obs = $aa_tot_cnt[res]
749
+ min_sigma = min_obs / $min_obs_sigma_ratio
750
+ elsif (min_obs > 0) && (min_obs > $aa_tot_cnt[res])
751
+ min_obs = $aa_tot_cnt[res]
752
+ min_sigma = min_obs / $min_obs_sigma_ratio
753
+ end
754
+
755
+ $logger.warn "The current sigma value, #{$sigma} seems to be too big for the total observation (#{"%.2f" % $aa_tot_cnt[res]}) of amino acid, #{res}."
756
+ end
757
+
758
+ $aa_mutb[res] = ($aa_tot_cnt[res] == 0) ? 1.0 : ($aa_mut_cnt[res] / $aa_tot_cnt[res].to_f)
688
759
  $aa_rel_mutb[res] = $aa_mutb[res] * ala_factor
689
- $aa_rel_freq[res] = $aa_tot_obs[res] == 0 ? 0.0 : $aa_tot_obs[res] / $tot_aa.to_f
760
+ $aa_tot_freq[res] = ($aa_tot_cnt[res] == 0) ? 0.0 : ($aa_tot_cnt[res] / $tot_aa.to_f)
690
761
  end
691
762
 
692
763
  $amino_acids.each do |res|
693
764
  if $noweight
694
- $outfh.puts "# %-3s %9d %9d %5.2f %8d %8.4f" %
695
- [res, $aa_tot_obs[res], $aa_mut_obs[res], $aa_mutb[res], $aa_rel_mutb[res], $aa_rel_freq[res]]
765
+ $outfh.puts '# %-3s %9d %9d %5.2f %8d %8.4f' %
766
+ [res, $aa_tot_cnt[res], $aa_mut_cnt[res], $aa_mutb[res], $aa_rel_mutb[res], $aa_tot_freq[res]]
696
767
  else
697
- $outfh.puts "# %-3s %9.2f %9.2f %5.2f %8d %8.4f" %
698
- [res, $aa_tot_obs[res], $aa_mut_obs[res], $aa_mutb[res], $aa_rel_mutb[res], $aa_rel_freq[res]]
768
+ $outfh.puts '# %-3s %9.2f %9.2f %5.2f %8d %8.4f' %
769
+ [res, $aa_tot_cnt[res], $aa_mut_cnt[res], $aa_mutb[res], $aa_rel_mutb[res], $aa_tot_freq[res]]
770
+ end
771
+ end
772
+
773
+ if min_obs > -1
774
+ $logger.warn "We recommend you to use a sigma value equal to or smaller than #{min_sigma}."
775
+ if $autosigma
776
+ $logger.warn "The sigma value has been changed from #{$sigma} to #{min_sigma}."
777
+ $sigma = min_sigma
699
778
  end
700
779
  end
780
+
781
+ $outfh.puts '#'
782
+ $outfh.puts '# RES: Amino acid one letter code'
783
+ $outfh.puts '# TOT_OBS: Total observations of incidence'
784
+ $outfh.puts '# MUT_OBS: Total observations of mutation'
785
+ $outfh.puts '# MUTB: Mutability (MUT_OBS / TOT_OBS)'
786
+ $outfh.puts '# REL_MUTB: Relative mutability (ALA=100)'
787
+ $outfh.puts '# REL_FREQ: Relative frequency'
788
+ $outfh.puts '#'
701
789
  #
702
790
  # Part 4. END
703
791
  #
@@ -705,48 +793,45 @@ HEADER
705
793
 
706
794
  # Part 5.
707
795
  #
708
- # Calculating substitution frequency tables
796
+ # Generating substitution frequency matrices
709
797
  #
710
798
 
711
799
  # calculating probabilities for each environment
712
- $envs.values.each do |e|
800
+ $env_classes.values.each do |e|
713
801
  if e.freq_array.sum != 0
714
802
  e.prob_array = 100.0 * e.freq_array / e.freq_array.sum
715
803
  end
716
804
  end
717
805
 
718
806
  # count raw frequencies
719
- $tot_freq_mat = ($noweight ? NMatrix.int($amino_acids.size,$amino_acids.size) : NMatrix.float($amino_acids.size,$amino_acids.size))
807
+ $tot_cnt_mat = NMatrix.send($noweight ? 'int' : 'float', $amino_acids.size, $amino_acids.size)
720
808
 
721
809
  # for each combination of environment features
722
- env_groups = $envs.values.group_by { |env| env.label[1..-1] }
723
-
724
- env_groups.to_a.sort_by { |env_group|
725
- # a bit clumsy sorting here...
726
- env_group[0].split("").map_with_index { |l, i|
727
- $env_features[i + 1].labels.index(l)
728
- }
729
- }.each_with_index do |group, group_no|
730
- grp_freq_mat = ($noweight ? NMatrix.int($amino_acids.size,$amino_acids.size) : NMatrix.float($amino_acids.size,$amino_acids.size))
810
+ $env_classes.groups_sorted_by_residue_labels.each_with_index do |group, group_no|
811
+ grp_cnt_mat = NMatrix.send($noweight ? 'int' : 'float', $amino_acids.size, $amino_acids.size)
731
812
 
732
- $amino_acids.each_with_index do |aa, ai|
813
+ $amino_acids.each_with_index do |aa, aj|
733
814
  freq_array = group[1].find { |e| e.label.start_with?(aa) }.freq_array
734
- 0.upto($amino_acids.size - 1) { |j| grp_freq_mat[ai, j] = freq_array[j] }
815
+ 0.upto($amino_acids.size - 1) { |i| grp_cnt_mat[aj, i] = freq_array[i] }
735
816
  end
736
817
 
737
- $tot_freq_mat += grp_freq_mat
818
+ $tot_cnt_mat += grp_cnt_mat
738
819
 
739
820
  if $output == 0
740
821
  $outfh.puts ">#{group[0]} #{group_no}"
741
- $outfh.puts grp_freq_mat.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)
822
+ $outfh.puts grp_cnt_mat.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)
742
823
  end
743
824
  end
744
825
 
745
826
  if $output == 0
746
- $outfh.puts ">Total"
747
- $outfh.puts $tot_freq_mat.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)
827
+ $outfh.puts '>Total'
828
+ $outfh.puts $tot_cnt_mat.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)
829
+ $logger.info 'Egor END.'
748
830
  exit 0
749
831
  end
832
+
833
+ $logger.info "Counting substitutions is done."
834
+
750
835
  #
751
836
  # Part 5. END
752
837
  #
@@ -770,25 +855,29 @@ HEADER
770
855
 
771
856
  # when nosmoothing !!!
772
857
  if ($output > 0) && $nosmooth
773
- # Probability matrices
774
- $tot_prob_mat = NMatrix.float($amino_acids.size, $amino_acids.size)
858
+ # reinitialize $tot_cnt_mat for pseudocounts
859
+ $tot_cnt_mat = NMatrix.float($amino_acids.size, $amino_acids.size)
775
860
 
776
861
  # for each combination of environment features
777
- env_groups = $envs.values.group_by { |env| env.label[1..-1] }
778
- env_groups.to_a.sort_by { |env_group|
779
- # a bit clumsy sorting here...
780
- env_group[0].split("").map_with_index { |l, i|
781
- $env_features[i + 1].labels.index(l)
782
- }
783
- }.each_with_index do |group, group_no|
784
- grp_prob_mat = NMatrix.float($amino_acids.size,$amino_acids.size)
862
+ pseudo_cnt = $add || (1.0 / $env_classes.group_size)
785
863
 
786
- $amino_acids.each_with_index do |aa, ai|
787
- prob_array = group[1].find { |e| e.label.start_with?(aa) }.prob_array
788
- 0.upto($amino_acids.size - 1) { |j| grp_prob_mat[ai, j] = prob_array[j] }
864
+ # add pseudo counts for each frequency vector
865
+ $env_classes.values.each { |e| e.freq_array += pseudo_cnt }
866
+
867
+ # re-calculate probability vector for each environment class
868
+ $env_classes.values.each { |e| e.prob_array = 100.0 * e.freq_array / e.freq_array.sum }
869
+
870
+ $env_classes.groups_sorted_by_residue_labels.each_with_index do |group, group_no|
871
+ grp_cnt_mat = NMatrix.float($amino_acids.size, $amino_acids.size)
872
+ grp_prob_mat = NMatrix.float($amino_acids.size, $amino_acids.size)
873
+
874
+ $amino_acids.each_with_index do |aa, aj|
875
+ env_class = group[1].find { |e| e.label.start_with?(aa) }
876
+ 0.upto($amino_acids.size - 1) { |i| grp_cnt_mat[aj, i] = env_class.freq_array[i] }
877
+ 0.upto($amino_acids.size - 1) { |i| grp_prob_mat[aj, i] = env_class.prob_array[i] }
789
878
  end
790
879
 
791
- $tot_prob_mat += grp_prob_mat
880
+ $tot_cnt_mat += grp_cnt_mat
792
881
 
793
882
  if ($output == 1)
794
883
  $outfh.puts ">#{group[0]} #{group_no}"
@@ -796,10 +885,20 @@ HEADER
796
885
  end
797
886
  end
798
887
 
888
+ $tot_prob_mat = NMatrix.float($amino_acids.size, $amino_acids.size)
889
+
890
+ 0.upto($amino_acids.size - 1) do |aj|
891
+ col_sum = (0..$amino_acids.size - 1).inject(0) { |s, i| s + $tot_cnt_mat[aj, i] }
892
+ 0.upto($amino_acids.size - 1) { |i| $tot_prob_mat[aj, i] = 100.0 * $tot_cnt_mat[aj, i] / col_sum }
893
+ end
894
+
895
+ $logger.info 'Calculating substitution probabilities is done (no smoothing)'
896
+
799
897
  if ($output == 1)
800
- $outfh.puts ">Total"
898
+ $outfh.puts '>Total'
801
899
  $outfh.puts $tot_prob_mat.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)
802
900
  $outfh.close
901
+ $logger.info 'Egor END.'
803
902
  exit 0
804
903
  end
805
904
  end
@@ -807,7 +906,7 @@ HEADER
807
906
  # when smoothing!!!
808
907
  if ($output > 0) && !$nosmooth
809
908
  #
810
- # p1 probability
909
+ # p1 probabilities
811
910
  #
812
911
  p1 = NArray.float($amino_acids.size)
813
912
  a0 = NArray.float($amino_acids.size).fill(1.0 / $amino_acids.size)
@@ -816,55 +915,73 @@ HEADER
816
915
  omega1 = 1.0 / (1 + big_N / ($sigma * small_n))
817
916
  omega2 = 1.0 - omega1
818
917
 
819
- if $smooth == :partial
820
- # for partial smoothing, p1 probability is not smoothed!
821
- 0.upto($amino_acids.size - 1) { |i| p1[i] = 100.0 * $aa_rel_freq[$amino_acids[i]] }
918
+ if ($smooth == :full) || $p1smooth
919
+ # smoothing p1 probabilities for the partial smoothing procedure if --p1smooth on or, if it is full smoothing
920
+ 0.upto($amino_acids.size - 1) { |i| p1[i] = 100.0 * (omega1 * a0[i] + omega2 * $aa_tot_freq[$amino_acids[i]]) }
822
921
  $smooth_prob[1] = p1
823
- else
824
- # for full smoothing, p1 probability is smoothed
825
- 0.upto($amino_acids.size - 1) { |i| p1[i] = 100.0 * (omega1 * a0[i] + omega2 * $aa_rel_freq[$amino_acids[i]]) }
922
+ elsif ($smooth == :partial)
923
+ # no smoothing for p1 probabilities just as Kenji's subst
924
+ # in this case, p1 probabilities were taken from the amino acid frequencies of your data set
925
+ 0.upto($amino_acids.size - 1) { |i| p1[i] = 100.0 * $aa_tot_freq[$amino_acids[i]] }
826
926
  $smooth_prob[1] = p1
827
927
  end
828
928
 
829
929
  #
830
930
  # p2 and above
831
931
  #
832
- env_labels = $env_features.map_with_index {|ef, ei| ef.labels.map { |l| "#{ei}#{l}" } }
932
+ env_labels = $env_features.map_with_index { |ef, ei| ef.labels.map { |l| "#{ei}#{l}" } }
833
933
 
834
934
  if $smooth == :partial
835
935
  $outfh.puts <<HEADER
836
936
  #
837
937
  # Partial Smoothing:
838
938
  #
939
+ HEADER
940
+ if $p1smooth
941
+ $outfh.puts <<HEADER
839
942
  # p1(ri) (i.e., amino acid composition) is estimated by summing over
840
- # each row in all matrices (no smoothing)
841
- # ^^^^^^^^^^^^
943
+ # each row in all matrices and smoothing them with A0 (a uniform distribution)
944
+ # ^^^^^^^^^
945
+ HEADER
946
+ else
947
+ $outfh.puts <<HEADER
948
+ # p1(ri) (i.e., amino acid composition) is estimated by summing over
949
+ # each row in all matrices without smoothing
950
+ # ^^^^^^^^^^^^^^^^^
951
+ HEADER
952
+ end
953
+
954
+ $outfh.puts <<HEADER
842
955
  # p2(ri|Rj) is estimated as:
843
956
  # p2(ri|Rj) = omega1 * p1(ri) + omega2 * W2(ri|Rj)
844
957
  #
845
958
  # p3(ri|Rj,fq) is estimated as:
846
959
  # p3(ri|Rj,fq) = omega1 * A2(ri|fq) + omega2 * W3(ri|Rj,fq)
847
960
  # where
848
- # A2(ri|fq) = p2(ri|fq) (fixed fq; partial smoothing)
961
+ # A2(ri|fq) = p2(ri|fq) (fixed fq to be Rj; partial smoothing)
849
962
  #
850
963
  # The smoothing procedure is curtailed here and finally
964
+ # ^^^^^^^^^
851
965
  # p5(ri|Rj,...) is estimated as:
852
966
  # p5(ri|Rj,...) = omega1 * A3(ri|Rj,fq) + omega2 * W5(ri|Rj...)
853
967
  # where
854
968
  # A3(ri|Rj,fq) = sum over fq omega_c * pc3(Rj,fq)
855
969
  #
856
- # Weights (omegas) are calculated as in Topham et al. 1993)
970
+ # Weights (omegas) are calculated as in Topham et al. (1993)
857
971
  #
858
- # sigma value used is: 5.00
972
+ # sigma value used is: #{$sigma}
859
973
  #
860
974
  HEADER
861
975
  1.upto($env_features.size) do |ci|
862
976
  # for partial smoothing, only P1 ~ P3, and Pn are considered
863
- next if (ci > 2) && (ci < $env_features.size)
977
+ if (ci > 2) && (ci < $env_features.size)
978
+ $logger.debug "Skipped the level #{ci + 1} probabilities, due to partial smoothing."
979
+ next
980
+ end
864
981
 
865
982
  env_labels.combination(ci) do |c1|
866
983
  Enumerable.cart_prod(*c1).each do |labels|
867
- pattern = "." * $env_features.size
984
+ pattern = '.' * $env_features.size
868
985
 
869
986
  labels.each do |label|
870
987
  i = label[0].chr.to_i
@@ -873,30 +990,31 @@ HEADER
873
990
  end
874
991
 
875
992
  if pattern =~ /^\./
876
- $logger.debug "*** Skipped environment, #{pattern}, for partial smoothing"
993
+ $logger.debug "Skipped the environment class, #{pattern}, due to partial smoothing."
877
994
  next
878
995
  end
879
996
 
880
- # get environmetns, frequencies, and probabilities
881
- envs = $envs.values.select { |env| env.label.match(pattern.to_re) }
997
+ # get environments matching the pattern created above
998
+ # and calculate amino acid frequencies and their probabilities for all the environments
999
+ envs = $env_classes.values.select { |env| env.label.match(pattern.to_re) }
882
1000
  freq_arr = envs.inject(NArray.float($amino_acids.size)) { |sum, env| sum + env.freq_array }
883
1001
  prob_arr = NArray.float($amino_acids.size)
884
- 0.upto($amino_acids.size - 1) { |i| prob_arr[i] = (freq_arr[i] == 0 ? 0 : freq_arr[i] / freq_arr.sum.to_f) }
1002
+ 0.upto($amino_acids.size - 1) { |i| prob_arr[i] = ((freq_arr[i] == 0) ? 0 : (freq_arr[i] / freq_arr.sum.to_f)) }
885
1003
 
886
1004
  # # assess whether a residue type j is compatible with a particular combination of structural features
887
1005
  # # corrections for non-zero column vector phenomenon by switching the smoothing procedure off as below
888
1006
  # if ci == $env_features.size
889
1007
  # aa_label = labels.find { |l| l.match(/^0/) }[1].chr
890
- # sub_pattern = "." * $env_features.size
1008
+ # sub_pattern = '.' * $env_features.size
891
1009
  # sub_pattern[0] = aa_label
892
1010
  # sub_freq_sum = 0
893
1011
  #
894
1012
  # labels[1..-1].each do |label|
895
- # next if label.start_with?("0")
1013
+ # next if label.start_with?('0')
896
1014
  # i = label[0].chr.to_i
897
1015
  # l = label[1].chr
898
1016
  # sub_pattern[i] = l
899
- # sub_envs = $envs.values.select { |env| env.label.match(pattern.to_re) }
1017
+ # sub_envs = $env_classes.values.select { |env| env.label.match(pattern.to_re) }
900
1018
  # sub_freq_arr = sub_envs.inject(NArray.float($amino_acids.size)) { |sum, env| sum + env.freq_array }
901
1019
  # sub_freq_sum += sub_freq_arr.sum
902
1020
  # end
@@ -908,25 +1026,27 @@ HEADER
908
1026
  # $smooth_prob[ci + 1] = {}
909
1027
  # $smooth_prob[ci + 1][labels.to_set] = prob_arr
910
1028
  # end
911
- # $logger.warn "!!! Smoothing procedure is off for the environment feature combination, #{pattern}"
1029
+ # $logger.warn "Smoothing procedure is off for the environment feature combination, #{pattern}"
912
1030
  # next
913
1031
  # end
914
1032
  # end
915
1033
 
916
- # collect priors if ci > 1
917
- priors = []
1034
+ # collect priors
1035
+ priors = []
918
1036
 
919
- if ci == 2
920
- labels.combination(1).select { |c2| c2[0].start_with?("0") }.each { |c3|
1037
+ if ci == 1
1038
+ priors << $smooth_prob[1]
1039
+ elsif ci == 2
1040
+ labels.combination(1).select { |c2| c2[0].start_with?('0') }.each { |c3|
921
1041
  priors << $smooth_prob[2][c3.to_set]
922
1042
  }
923
1043
  elsif ci == $env_features.size
924
- labels.combination(2).select { |c2| c2[0].start_with?("0") || c2[1].start_with?("0") }.each { |c3|
1044
+ labels.combination(2).select { |c2| c2[0].start_with?('0') || c2[1].start_with?('0') }.each { |c3|
925
1045
  priors << $smooth_prob[3][c3.to_set]
926
1046
  }
927
1047
  end
928
1048
 
929
- # entropy based weighting priors
1049
+ # entropy based prior weighting step
930
1050
  entropy_max = Math::log($amino_acids.size)
931
1051
  entropies = priors.map { |prior| -1.0 * prior.to_a.inject(0.0) { |s, p|
932
1052
  begin
@@ -952,15 +1072,16 @@ HEADER
952
1072
  0.upto($amino_acids.size - 1) { |i| smooth_prob_arr[i] = 100.0 * (smooth_prob_arr[i] / smooth_prob_arr_sum) }
953
1073
 
954
1074
  # store smoothed probabilities in a hash using a set of environment labels as a key
955
- if !$smooth_prob.has_key?(ci + 1)
956
- $smooth_prob[ci + 1] = {}
1075
+ if $smooth_prob.has_key?(ci + 1)
957
1076
  $smooth_prob[ci + 1][labels.to_set] = smooth_prob_arr
958
1077
  else
1078
+ $smooth_prob[ci + 1] = {}
959
1079
  $smooth_prob[ci + 1][labels.to_set] = smooth_prob_arr
960
1080
  end
961
1081
  end
962
1082
  end
963
1083
  end
1084
+ $logger.info 'Calculating substitution probabilities is done (partial smoothing).'
964
1085
  else
965
1086
  $outfh.puts <<HEADER
966
1087
  #
@@ -980,22 +1101,23 @@ HEADER
980
1101
  # A2(ri|fq) = p2(ri|fq) (not fixed fq; full smoothing)
981
1102
  #
982
1103
  # The smoothing procedure is NOT curtailed here and it goes upto
1104
+ # ^^^^^^^^^^^^^
983
1105
  #
984
1106
  # pn(ri|f1q,f2q,...,fn-1q) is estimated as:
985
- # pn(ri|f1q,f2q,...,fn-1q) = omega1 * An-1(ri|f1q, f2q,...,fn-2q) + omega2 * W5(ri|f1q,f2q,...,fn-1q)
1107
+ # pn(ri|f1q,f2q,...,fn-1q) = omega1 * An-1(ri|f1q, f2q,...,fn-2q) + omega2 * Wn(ri|f1q,f2q,...,fn-1q)
986
1108
  # where
987
1109
  # An-1(ri|f1q,f2q,...,fn-2q) = sum over fq omega_c * pcn-1(f1q,f2q,...,fn-2q)
988
1110
  #
989
- # Weights (omegas) are calculated as in Topham et al. 1993)
1111
+ # Weights (omegas) are calculated as in Topham et al. (1993)
990
1112
  #
991
- # sigma value used is: 5.00
1113
+ # sigma value used is: #{$sigma}
992
1114
  #
993
1115
  HEADER
994
1116
  # full smoothing
995
1117
  1.upto($env_features.size) do |ci|
996
1118
  env_labels.combination(ci) do |c1|
997
1119
  Enumerable.cart_prod(*c1).each do |labels|
998
- pattern = "." * $env_features.size
1120
+ pattern = '.' * $env_features.size
999
1121
  labels.each do |label|
1000
1122
  j = label[0].chr.to_i
1001
1123
  l = label[1].chr
@@ -1003,7 +1125,7 @@ HEADER
1003
1125
  end
1004
1126
 
1005
1127
  # get environments, frequencies, and probabilities
1006
- envs = $envs.values.select { |env| env.label.match(pattern.to_re) }
1128
+ envs = $env_classes.values.select { |env| env.label.match(pattern.to_re) }
1007
1129
  freq_arr = envs.inject(NArray.float($amino_acids.size)) { |sum, env| sum + env.freq_array }
1008
1130
  prob_arr = NArray.float($amino_acids.size)
1009
1131
  0.upto($amino_acids.size - 1) { |i| prob_arr[i] = freq_arr[i] == 0 ? 0 : freq_arr[i] / freq_arr.sum.to_f }
@@ -1036,58 +1158,57 @@ HEADER
1036
1158
  0.upto($amino_acids.size - 1) { |i| smooth_prob_arr[i] = 100.0 * (smooth_prob_arr[i] / smooth_prob_arr_sum) }
1037
1159
 
1038
1160
  # store smoothed probabilities in a hash using a set of environment labels as a key
1039
- if !$smooth_prob.has_key?(ci + 1)
1040
- $smooth_prob[ci + 1] = {}
1161
+ if $smooth_prob.has_key?(ci + 1)
1041
1162
  $smooth_prob[ci + 1][labels.to_set] = smooth_prob_arr
1042
1163
  else
1164
+ $smooth_prob[ci + 1] = {}
1043
1165
  $smooth_prob[ci + 1][labels.to_set] = smooth_prob_arr
1044
1166
  end
1045
1167
  end
1046
1168
  end
1047
1169
  end
1170
+ $logger.info 'Calculating substitution probabilities is done (full smoothing).'
1048
1171
  end
1049
1172
 
1050
1173
  # updating smoothed probability array for each environment
1051
- $envs.values.each { |e| e.smooth_prob_array = $smooth_prob[$env_features.size + 1][e.label_set] }
1052
-
1053
- # for a total substitution probability matrix
1054
- $tot_prob_mat = NMatrix.float($amino_acids.size,$amino_acids.size)
1055
-
1056
- # grouping environments by its environment labels but amino acid label
1057
- env_groups = $envs.values.group_by { |env| env.label[1..-1] }
1174
+ $env_classes.values.each do |env|
1175
+ env.smooth_prob_array = $smooth_prob[$env_features.size + 1][env.label_set]
1176
+ end
1058
1177
 
1059
1178
  # sorting environments and build 21X21 substitution matrices
1060
- env_groups.to_a.sort_by { |env_group|
1061
- # a bit clumsy sorting here...
1062
- env_group[0].split("").map_with_index { |l, i|
1063
- $env_features[i + 1].labels.index(l)
1064
- }
1065
- }.each_with_index do |group, group_no|
1179
+ $env_classes.groups_sorted_by_residue_labels.each_with_index do |group, group_no|
1066
1180
  # calculating 21X21 substitution probability matrix for each environment
1067
- grp_prob_mat = NMatrix.float($amino_acids.size,$amino_acids.size)
1181
+ grp_prob_mat = NMatrix.float($amino_acids.size, $amino_acids.size)
1068
1182
 
1069
1183
  $amino_acids.each_with_index do |aa, ai|
1070
- smooth_prob_array = group[1].find { |e| e.label.start_with?(aa) }.smooth_prob_array
1071
- 0.upto($amino_acids.size - 1) { |j| grp_prob_mat[ai, j] = smooth_prob_array[j] }
1184
+ smooth_prob_arr = group[1].find { |e| e.label.start_with?(aa) }.smooth_prob_array
1185
+ 0.upto($amino_acids.size - 1) { |j| grp_prob_mat[ai, j] = smooth_prob_arr[j] }
1072
1186
  end
1073
1187
 
1074
- $tot_prob_mat += grp_prob_mat
1075
-
1076
1188
  if $output == 1
1077
1189
  $outfh.puts ">#{group[0]} #{group_no}"
1078
1190
  $outfh.puts grp_prob_mat.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)
1079
1191
  end
1080
1192
  end
1081
1193
 
1082
- $tot_prob_mat /= env_groups.size
1194
+ # for a total substitution probability matrix
1195
+ $tot_prob_mat = NMatrix.float($amino_acids.size, $amino_acids.size)
1196
+
1197
+ $amino_acids.each_with_index do |aa, aj|
1198
+ 0.upto($amino_acids.size - 1) do |ai|
1199
+ $tot_prob_mat[aj, ai] = $smooth_prob[2][["0#{aa}"].to_set][ai]
1200
+ end
1201
+ end
1083
1202
 
1084
1203
  if $output == 1
1085
- $outfh.puts ">Total"
1204
+ $outfh.puts '>Total'
1086
1205
  $outfh.puts $tot_prob_mat.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)
1087
1206
  $outfh.close
1207
+ $logger.info 'Egor END.'
1088
1208
  exit 0
1089
1209
  end
1090
1210
  end
1211
+
1091
1212
  #
1092
1213
  # Part 6. END
1093
1214
  #
@@ -1104,79 +1225,88 @@ HEADER
1104
1225
  HEADER
1105
1226
  if $penv
1106
1227
  $outfh.puts <<HEADER
1107
- # which were derived from the environment-independent amino acid frequencies.
1108
- # ^^^^^^^^^^^^^^^^^^^^^^^
1228
+ # which were derived from the environment-dependent amino acid frequencies.
1229
+ # ^^^^^^^^^^^^^^^^^^^^^
1109
1230
  HEADER
1110
1231
  else
1111
1232
  $outfh.puts <<HEADER
1112
- # which were derived from the environment-dependent amino acid frequencies.
1113
- # ^^^^^^^^^^^^^^^^^^^^^
1233
+ # which were derived from the environment-independent amino acid frequencies.
1234
+ # ^^^^^^^^^^^^^^^^^^^^^^^
1114
1235
  HEADER
1115
1236
  end
1116
1237
 
1117
- $tot_logo_mat = $cys == 0 ? NMatrix.float($amino_acids.size, $amino_acids.size + 1) : NMatrix.float($amino_acids.size, $amino_acids.size)
1118
1238
  grp_logo_mats = []
1119
1239
  factor = $scale / Math::log(2)
1120
1240
 
1121
- # grouping environments by its environment labels but amino acid label
1122
- env_groups = $envs.values.group_by { |env| env.label[1..-1] }
1123
-
1124
- # sorting environments and build 21X21 substitution matrices
1125
- env_groups.to_a.sort_by { |env_group|
1126
- # a bit clumsy sorting here...
1127
- env_group[0].split("").map_with_index { |l, i|
1128
- $env_features[i + 1].labels.index(l)
1129
- }
1130
- }.each_with_index do |group, group_no|
1241
+ $env_classes.groups_sorted_by_residue_labels.each_with_index do |group, group_no|
1131
1242
  # calculating substitution probability matrix for each environment
1132
1243
  grp_label = group[0]
1133
1244
  grp_envs = group[1]
1134
1245
  grp_logo_mat = $cys == 0 ? NMatrix.float($amino_acids.size, $amino_acids.size + 1) : NMatrix.float($amino_acids.size, $amino_acids.size)
1135
1246
 
1136
- $amino_acids.each_with_index do |aa, ai|
1137
- env = grp_envs.detect { |e| e.label.start_with?(aa) }
1138
- logo_arr = $cys == 0 ? NArray.float($amino_acids.size + 1) : NArray.float($amino_acids.size)
1139
-
1140
- env.send($nosmooth ? "prob_array" : "smooth_prob_array").to_a.each_with_index do |prob, j|
1141
- paj = 100.0 * $aa_rel_freq[$amino_acids[j]]
1142
- odds = prob == 0.0 ? 0.000001 / paj : prob / paj
1143
- logo_arr[j] = factor * Math::log(odds)
1247
+ $amino_acids.each_with_index do |aa, aj|
1248
+ env = grp_envs.detect { |e| e.label.start_with?(aa) }
1249
+ #paj = env.send($nosmooth ? 'prob_array' : 'smooth_prob_array').sum / $tot_cnt_mat.sum
1250
+ env.logo_array = $cys == 0 ? NArray.float($amino_acids.size + 1) : NArray.float($amino_acids.size)
1251
+
1252
+ env.send($nosmooth ? 'prob_array' : 'smooth_prob_array').to_a.each_with_index do |prob, ai|
1253
+ pai = 100.0 * $aa_tot_freq[$amino_acids[ai]]
1254
+ #odds = prob == 0.0 ? 0.000001 / pai : prob / pai
1255
+ odds = prob / pai
1256
+ env.logo_array[ai] = factor * Math::log(odds)
1257
+ grp_logo_mat[aj, ai] = env.logo_array[ai]
1144
1258
  end
1145
1259
 
1146
- 0.upto($amino_acids.size - 1) { |j| grp_logo_mat[ai, j] = logo_arr[j] }
1147
-
1148
- # adding log odds ratio for "U" (J or C) when --cyc is 0
1260
+ # adding log odds ratio for 'U' (J or C) when --cys is 0
1149
1261
  if $cys == 0
1150
- paj = 100.0 * ($aa_rel_freq["C"] + $aa_rel_freq["J"])
1151
- prob = env.send($nosmooth ? "prob_array" : "smooth_prob_array")[$amino_acids.index("C")] +
1152
- env.send($nosmooth ? "prob_array" : "smooth_prob_array")[$amino_acids.index("J")]
1153
- odds = prob == 0.0 ? 0.000001 / paj : prob / paj
1154
- logo_arr[logo_arr.size - 1] = factor * Math::log(odds)
1155
- grp_logo_mat[ai, logo_arr.size - 1] = logo_arr[logo_arr.size - 1]
1262
+ pai = 100.0 * ($aa_tot_freq['C'] + $aa_tot_freq['J'])
1263
+ prob = env.send($nosmooth ? 'prob_array' : 'smooth_prob_array')[$amino_acids.index('C')] +
1264
+ env.send($nosmooth ? 'prob_array' : 'smooth_prob_array')[$amino_acids.index('J')]
1265
+ #odds = prob == 0.0 ? 0.000001 / pai : prob / pai
1266
+ odds = prob / pai
1267
+ env.logo_array[$amino_acids.size] = factor * Math::log(odds)
1268
+ grp_logo_mat[aj, $amino_acids.size] = env.logo_array[$amino_acids.size]
1156
1269
  end
1157
1270
  end
1158
1271
 
1159
- $tot_logo_mat += grp_logo_mat
1160
1272
  grp_logo_mats << [grp_label, grp_logo_mat]
1161
1273
  end
1162
1274
 
1163
- $tot_logo_mat /= env_groups.size
1275
+ $tot_logo_mat = $cys == 0 ? NMatrix.float($amino_acids.size, $amino_acids.size + 1) : NMatrix.float($amino_acids.size, $amino_acids.size)
1276
+
1277
+ $amino_acids.each_with_index do |aa1, aj|
1278
+ $amino_acids.each_with_index do |aa2, ai|
1279
+ prob = $tot_prob_mat[aj, ai]
1280
+ pai = 100.0 * $aa_tot_freq[$amino_acids[ai]]
1281
+ #odds = prob == 0.0 ? 0.000001 / pai : prob / pai
1282
+ odds = prob / pai
1283
+ $tot_logo_mat[aj, ai] = factor * Math::log(odds)
1284
+ end
1285
+
1286
+ # adding log odds ratio for 'U' (J or C) when --cys is 0
1287
+ if $cys == 0
1288
+ pai = 100.0 * ($aa_tot_freq['C'] + $aa_tot_freq['J'])
1289
+ prob = $tot_prob_mat[aj, $amino_acids.index('C')] + $tot_prob_mat[aj, $amino_acids.index('J')]
1290
+ #odds = prob == 0.0 ? 0.000001 / pai : prob / pai
1291
+ odds = prob / pai
1292
+ $tot_logo_mat[aj, $amino_acids.size] = factor * Math::log(odds)
1293
+ end
1294
+ end
1295
+
1164
1296
 
1165
1297
  # calculating relative entropy for each amino acid pair H and
1166
1298
  # the expected score E in bit units
1167
- #
1168
- # I'm a bit suspicious about this part...
1169
1299
  tot_E = 0.0
1170
1300
  tot_H = 0.0
1171
1301
 
1172
- 0.upto($tot_logo_mat.shape[0] - 1) do |i|
1173
- 0.upto($tot_logo_mat.shape[0] - 1) do |j|
1174
- if i != j
1175
- tot_E += $tot_logo_mat[i, j] * $aa_rel_freq[$amino_acids[i]] * $aa_rel_freq[$amino_acids[j]] / 2.0
1176
- tot_H += $tot_logo_mat[i, j] * $tot_prob_mat[i, j] / 2.0 / 10000.0
1302
+ 0.upto($tot_logo_mat.shape[0] - 1) do |j|
1303
+ 0.upto($tot_logo_mat.shape[0] - 1) do |i| # it's deliberately '0' not '1'
1304
+ if j != i
1305
+ tot_E += $tot_logo_mat[j, i] * $aa_tot_freq[$amino_acids[j]] * $aa_tot_freq[$amino_acids[i]] / 2.0
1306
+ tot_H += $tot_logo_mat[j, i] * $tot_prob_mat[j, i] / 2.0 / 10000.0
1177
1307
  else
1178
- tot_E += $tot_logo_mat[i, j] * $aa_rel_freq[$amino_acids[i]] * $aa_rel_freq[$amino_acids[j]]
1179
- tot_H += $tot_logo_mat[i, j] * $tot_prob_mat[i, j] / 10000.0
1308
+ tot_E += $tot_logo_mat[j, i] * $aa_tot_freq[$amino_acids[i]] * $aa_tot_freq[$amino_acids[i]]
1309
+ tot_H += $tot_logo_mat[j, i] * $tot_prob_mat[j, i] / 10000.0
1180
1310
  end
1181
1311
  end
1182
1312
  end
@@ -1184,8 +1314,14 @@ HEADER
1184
1314
  $outfh.puts <<HEADER
1185
1315
  #
1186
1316
  # Shown here are logarithms of these values multiplied by #{$scale}/log(2)
1187
- # rounded to the nearest integer (log-odds scores in 1/3 bit units).
1188
- #
1317
+ HEADER
1318
+ unless $noround
1319
+ $outfh.puts <<HEADER
1320
+ # rounded to the nearest integer (log-odds scores in 1/#{$scale} bit units).
1321
+ HEADER
1322
+ end
1323
+
1324
+ $outfh.puts <<HEADER
1189
1325
  # For total (composite) matrix, Entropy = #{"%5.4f" % tot_H} bits, Expected score = #{"%5.4f" % tot_E}
1190
1326
  #
1191
1327
  HEADER
@@ -1194,24 +1330,40 @@ HEADER
1194
1330
  grp_label = arr[0]
1195
1331
  grp_logo_mat = arr[1]
1196
1332
 
1333
+ unless $noround
1334
+ grp_logo_mat = grp_logo_mat.round
1335
+ end
1336
+
1197
1337
  $outfh.puts ">#{grp_label} #{grp_no}"
1198
1338
  if $cys
1199
- $outfh.puts grp_logo_mat.round.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids + %w[U])
1339
+ $outfh.puts grp_logo_mat.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids + %w[U])
1200
1340
  else
1201
- $outfh.puts grp_logo_mat.round.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)
1341
+ $outfh.puts grp_logo_mat.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)
1202
1342
  end
1203
1343
  end
1204
1344
 
1205
1345
  $outfh.puts ">Total #{grp_logo_mats.size}"
1206
1346
 
1347
+ unless $noround
1348
+ $tot_logo_mat = $tot_logo_mat.round
1349
+ end
1350
+
1207
1351
  if $cys == 0
1208
- $outfh.puts $tot_logo_mat.round.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids + %w[U])
1352
+ $outfh.puts $tot_logo_mat.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids + %w[U])
1209
1353
  else
1210
- $outfh.puts $tot_logo_mat.round.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)
1354
+ $outfh.puts $tot_logo_mat.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)
1211
1355
  end
1212
- $outfh.close
1213
- exit 0
1356
+
1357
+ $logger.info "Calculating log odds ratio is done."
1358
+
1359
+ #
1360
+ # Part 7. END
1361
+ #
1214
1362
  end
1363
+
1364
+ $outfh.close
1365
+ $logger.info "Egor END."
1366
+ exit 0
1215
1367
  end
1216
1368
  end
1217
1369