egor 0.0.2 → 0.0.3

Sign up to get free protection for your applications and to get access to all the features.
data/History.txt CHANGED
@@ -1,3 +1,9 @@
1
+ == 0.0.3 2008-12-09
2
+
3
+ * 2 major enhancements:
4
+ * Added an option '--cys (-j) 2' that does not distinguish J from C, so the 'disulphide bond' environment feature is no longer a prerequisite
5
+ * Masking works for target amino acid, too
6
+
1
7
  == 0.0.2 2008-11-13
2
8
 
3
9
  * 2 major enhancements:
data/README.rdoc CHANGED
@@ -2,10 +2,12 @@
2
2
 
3
3
  * http://egor.rubyforge.org
4
4
 
5
+
5
6
  == DESCRIPTION:
6
7
 
7
8
  egor: Esst GeneratOR, a program for calculating environment-specific substitution tables
8
9
 
10
+
9
11
  == FEATURES/PROBLEMS:
10
12
 
11
13
  * No more segmentation fault
@@ -14,34 +16,42 @@ egor: Esst GeneratOR, a program for calculating environment-specific substitutio
14
16
  * Full smoothing supported
15
17
  * In theory, infinite number of environment features can be handled
16
18
 
19
+
20
+ == INSTALL:
21
+
22
+ $ sudo gem install egor
23
+
24
+
17
25
  == BASIC USAGE:
18
26
 
19
27
  $ egor -l TEMLIST-file -c classdef.dat
20
28
  or
21
29
  $ egor -f TEM-file -c classdef.dat
22
30
 
31
+
23
32
  == OPTIONS:
24
- --tem-file (-f) STRING: a tem file
25
- --tem-list (-l) STRING: a list for tem files
26
- --classdef (-c) STRING: a file for the defintion of environments (default: 'classdef.dat')
27
- --outfile (-o) STRING: output filename ("allmat.dat" if not specified)
28
- --weight (-w) INTEGER: clustering level (PID) for the BLOSUM-like weighting (not supported yet)
33
+ --tem-file (-f) FILE: a tem file
34
+ --tem-list (-l) FILE: a list for tem files
35
+ --classdef (-c) FILE: a file for the definition of environments (default: 'classdef.dat')
36
+ --outfile (-o) FILE: output filename (default 'allmat.dat')
37
+ --weight (-w) INTEGER: clustering level (PID) for the BLOSUM-like weighting
29
38
  --noweight: calculate substitution counts with no weights (default)
30
39
  --smooth (-s) INTEGER:
31
40
  0 for partial smoothing (default)
32
41
  1 for full smoothing
33
42
  --nosmooth: perform no smoothing operation
34
- --cys (-y) INTEGER: (!!!not implemented yet!!!)
35
- 0 for using C and J only for structure
36
- 1 for both structure and sequence (default)
43
+ --cys (-y) INTEGER:
44
+ 0 for using C and J only for structure (default)
45
+ 1 for both structure and sequence
46
+ 2 for using only C for both
37
47
  --output INTEGER:
38
48
  0 for raw counts (no-smoothing performed)
39
49
  1 for probabilities
40
50
  2 for log-odds (default)
41
51
  --scale INTEGER: log-odds matrices in 1/n bit units (default 3)
42
52
  --sigma DOUBLE: change the sigma value for smoothing (default 5)
43
- --add DOUBLE: add this value to raw counts when deriving log-odds without smoothing (default 1/#classes)
44
- --penv: use environment-dependent frequencies for log-odds calculation (default false) (!!!not implemented yet!!!)
53
+ --add DOUBLE: add this value to raw counts when deriving log-odds without smoothing (default 1/=classes)
54
+ --penv: use environment-dependent frequencies for log-odds calculation (default false) (NOT implemented yet!!!)
45
55
  --pidmin DOUBLE: count substitutions only for pairs with PID equal to or greater than this value (default none)
46
56
  --pidmax DOUBLE: count substitutions only for pairs with PID smaller than this value (default none)
47
57
  --verbose (-v) INTEGER
@@ -52,17 +62,19 @@ egor: Esst GeneratOR, a program for calculating environment-specific substitutio
52
62
  --version: print version
53
63
  --help (-h): show help
54
64
 
65
+
55
66
  == REQUIREMENTS:
56
67
 
57
68
  * ruby 1.8.6 or above (http://www.ruby-lang.org)
58
69
  * rubygems 1.2.0 or above (http://rubyforge.org/projects/rubygems/)
70
+
71
+ The following RubyGems will be installed automatically if you have rubygems installed on your machine
72
+
59
73
  * narray (http://narray.rubyforge.org/)
60
74
  * facets (http://facets.rubyforge.org/)
61
75
  * bio (http://bioruby.open-bio.org/)
76
+ * simple_memoize (http://github.com/JackDanger/simple_memoize/tree/master)
62
77
 
63
- == INSTALL:
64
-
65
- $ sudo gem install egor
66
78
 
67
79
  == LICENSE:
68
80
 
data/egor.gemspec CHANGED
@@ -2,11 +2,12 @@
2
2
 
3
3
  Gem::Specification.new do |s|
4
4
  s.name = %q{egor}
5
- s.version = "0.0.1"
5
+ s.version = "0.0.3"
6
6
 
7
7
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
8
8
  s.authors = ["Semin Lee"]
9
- s.date = %q{2008-11-10}
9
+ s.cert_chain = ["/Users/semin/.gem/gem-public_cert.pem"]
10
+ s.date = %q{2008-12-09}
10
11
  s.default_executable = %q{egor}
11
12
  s.description = %q{egor: Esst GeneratOR, a program for calculating environment-specific substitution tables}
12
13
  s.email = ["seminlee@gmail.com"]
@@ -20,8 +21,9 @@ Gem::Specification.new do |s|
20
21
  s.require_paths = ["lib"]
21
22
  s.rubyforge_project = %q{egor}
22
23
  s.rubygems_version = %q{1.3.1}
24
+ s.signing_key = %q{/Users/semin/.gem/gem-private_key.pem}
23
25
  s.summary = %q{egor: Esst GeneratOR, a program for calculating environment-specific substitution tables}
24
- s.test_files = ["test/test_helper.rb", "test/test_egor.rb", "test/test_enumerable_extensions.rb", "test/test_environment_feature.rb", "test/test_nmatrix_extensions.rb", "test/test_egor_cli.rb"]
26
+ s.test_files = ["test/test_egor.rb", "test/test_egor_cli.rb", "test/test_enumerable_extensions.rb", "test/test_environment_feature.rb", "test/test_helper.rb", "test/test_nmatrix_extensions.rb"]
25
27
 
26
28
  if s.respond_to? :specification_version then
27
29
  current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
@@ -32,14 +34,14 @@ Gem::Specification.new do |s|
32
34
  s.add_runtime_dependency(%q<bio>, [">= 1.2.1"])
33
35
  s.add_runtime_dependency(%q<facets>, [">= 2.4.5"])
34
36
  s.add_runtime_dependency(%q<simple_memoize>, [">= 1.0.0"])
35
- s.add_development_dependency(%q<newgem>, [">= 1.0.7"])
37
+ s.add_development_dependency(%q<newgem>, [">= 1.1.0"])
36
38
  s.add_development_dependency(%q<hoe>, [">= 1.8.0"])
37
39
  else
38
40
  s.add_dependency(%q<narray>, [">= 0.5.9.5"])
39
41
  s.add_dependency(%q<bio>, [">= 1.2.1"])
40
42
  s.add_dependency(%q<facets>, [">= 2.4.5"])
41
43
  s.add_dependency(%q<simple_memoize>, [">= 1.0.0"])
42
- s.add_dependency(%q<newgem>, [">= 1.0.7"])
44
+ s.add_dependency(%q<newgem>, [">= 1.1.0"])
43
45
  s.add_dependency(%q<hoe>, [">= 1.8.0"])
44
46
  end
45
47
  else
@@ -47,7 +49,7 @@ Gem::Specification.new do |s|
47
49
  s.add_dependency(%q<bio>, [">= 1.2.1"])
48
50
  s.add_dependency(%q<facets>, [">= 2.4.5"])
49
51
  s.add_dependency(%q<simple_memoize>, [">= 1.0.0"])
50
- s.add_dependency(%q<newgem>, [">= 1.0.7"])
52
+ s.add_dependency(%q<newgem>, [">= 1.1.0"])
51
53
  s.add_dependency(%q<hoe>, [">= 1.8.0"])
52
54
  end
53
55
  end
data/lib/egor/cli.rb CHANGED
@@ -44,7 +44,7 @@ Options:
44
44
  --tem-file (-f) FILE: a tem file
45
45
  --tem-list (-l) FILE: a list for tem files
46
46
  --classdef (-c) FILE: a file for the defintion of environments (default: 'classdef.dat')
47
- --outfile (-o) FILE: output filename ("allmat.dat" if not specified)
47
+ --outfile (-o) FILE: output filename (default 'allmat.dat')
48
48
  --weight (-w) INTEGER: clustering level (PID) for the BLOSUM-like weighting
49
49
  --noweight: calculate substitution counts with no weights (default)
50
50
  --smooth (-s) INTEGER:
@@ -54,6 +54,7 @@ Options:
54
54
  --cys (-y) INTEGER:
55
55
  0 for using C and J only for structure (default)
56
56
  1 for both structure and sequence
57
+ 2 for using only C for both
57
58
  --output INTEGER:
58
59
  0 for raw counts (no-smoothing performed)
59
60
  1 for probabilities
@@ -152,12 +153,12 @@ Options:
152
153
  $cys = 0
153
154
  $penv = false
154
155
 
155
- $aa_tot_obs = {}
156
- $aa_mut_obs = {}
156
+ $aa_tot_obs = Hash.new(0)
157
+ $aa_mut_obs = Hash.new(0)
157
158
  $aa_mutb = {}
158
159
  $aa_rel_mutb = {}
159
160
  $aa_rel_freq = {}
160
- $env_aa_obs = {}
161
+ $env_aa_obs = Hash.new(0)
161
162
  $smooth_prob = {}
162
163
  $tot_freq_mat = nil
163
164
  $tot_prob_mat = nil
@@ -200,7 +201,7 @@ Options:
200
201
  when '--outfile'
201
202
  $outfile = arg
202
203
  when '--cys'
203
- $cys = (arg.to_i == 1 ? false : true)
204
+ $cys = arg.to_i
204
205
  when '--weight'
205
206
  $weight = arg.to_i
206
207
  when '--sigma'
@@ -255,10 +256,12 @@ Options:
255
256
  # Reading Environment Class Definition File
256
257
  #
257
258
 
259
+ # set amino_acids
260
+ $amino_acids = "ACDEFGHIKLMNPQRSTVWY".split("") if $cys == 2
261
+
258
262
  # an array for storing all environment feature objects
259
263
  $env_features = []
260
264
 
261
-
262
265
  # an array for storing indexes of constrained environment features
263
266
  $cst_features = []
264
267
 
@@ -310,7 +313,7 @@ Options:
310
313
  }.inject { |pro, lb|
311
314
  pro.product(lb)
312
315
  }.each_with_index { |e, i|
313
- $envs[e.flatten.join] = Environment.new(i, e.flatten.join)
316
+ $envs[e.flatten.join] = Environment.new(i, e.flatten.join, $amino_acids)
314
317
  }
315
318
 
316
319
  # Part 4.
@@ -322,291 +325,308 @@ Options:
322
325
  $outfh = File.open($outfile, "w")
323
326
 
324
327
  if $tem_file
325
- $tem_list = [$tem_file]
328
+ $tem_list_io = StringIO.new($tem_file)
326
329
  end
327
330
 
328
331
  if $tem_list
329
- IO.foreach($tem_list) do |tem_file|
330
- tem_file.chomp!
332
+ $tem_list_io = File.open($tem_list)
333
+ end
334
+
335
+ $tem_list_io.each_line do |tem_file|
336
+ tem_file.chomp!
337
+
338
+ $logger.info ">>> Analysing #{tem_file} ..."
339
+
340
+ ali = Bio::Alignment::OriginalAlignment.new
341
+ ff = Bio::FlatFile.auto(tem_file)
342
+ ff.each_entry do |pir|
343
+ if pir.definition == "sequence"
344
+ ali.add_seq(pir.data.gsub("\n", ""), pir.entry_id)
345
+ end
346
+ end
347
+
348
+ if ali.size < 2
349
+ $logger.warn "!!! Skipped #{tem_file}, there is only one 'sequence' entry"
350
+ next
351
+ end
331
352
 
332
- $logger.info ">>> Analysing #{tem_file} ..."
353
+ $ali_size += 1
354
+ env_labels = {}
355
+ disulphide = {}
333
356
 
334
- ali = Bio::Alignment::OriginalAlignment.new
335
- ff = Bio::FlatFile.auto(tem_file)
357
+ ali.each_pair do |key, seq|
358
+ # check disulphide bond environment first!
359
+ ff.rewind
336
360
  ff.each_entry do |pir|
337
- if pir.definition == "sequence"
338
- ali.add_seq(pir.data.gsub("\n", ""), pir.entry_id)
361
+ if (pir.entry_id == key) && (pir.definition == "disulphide")
362
+ disulphide[key] = pir.data.gsub("\n", "").split("")
339
363
  end
340
364
  end
341
365
 
342
- $ali_size += 1
343
- env_labels = {}
344
- disulphide = {}
366
+ $env_features.each_with_index do |ec, ei|
367
+ env_labels[key] = [] unless env_labels.has_key?(key)
345
368
 
346
- ali.each_pair do |key, seq|
347
- # check disulphide bond environment first!
348
369
  ff.rewind
349
370
  ff.each_entry do |pir|
350
- if (pir.entry_id == key) && (pir.definition == "disulphide")
351
- disulphide[key] = pir.data.gsub("\n", "").split("")
352
- end
353
- end
354
-
355
- $env_features.each_with_index do |ec, ei|
356
- env_labels[key] = [] unless env_labels.has_key?(key)
357
-
358
- ff.rewind
359
- ff.each_entry do |pir|
360
- if (pir.entry_id == key) && (pir.definition == ec.name)
361
- labels = pir.data.gsub("\n", "").split("").map_with_index do |sym, pos|
362
- if sym == "-"
363
- "-"
364
- elsif sym == "X" || sym == "x"
365
- "X"
371
+ if (pir.entry_id == key) && (pir.definition == ec.name)
372
+ labels = pir.data.gsub("\n", "").split("").map_with_index do |sym, pos|
373
+ if sym == "-"
374
+ "-"
375
+ elsif sym == "X" || sym == "x"
376
+ "X"
377
+ else
378
+ if ei == 0 # Amino Acid Environment Feature
379
+ (( disulphide.has_key?(key) and disulphide[key][pos] == "F") && (sym == "C")) ? "J" : sym
366
380
  else
367
- if ei == 0 # Amino Acid Environment Feature
368
- ((disulphide[key][pos] == "F") && (sym == "C")) ? "J" : sym
369
- else
370
- ec.labels[ec.symbols.index(sym)]
371
- end
381
+ ec.labels[ec.symbols.index(sym)]
372
382
  end
373
383
  end
384
+ end
374
385
 
375
- if env_labels[key].empty?
376
- env_labels[key] = labels
377
- else
378
- env_labels[key].each_with_index { |e, i| env_labels[key][i] = e + labels[i] }
379
- end
386
+ if env_labels[key].empty?
387
+ env_labels[key] = labels
388
+ else
389
+ env_labels[key].each_with_index { |e, i| env_labels[key][i] = e + labels[i] }
380
390
  end
381
391
  end
382
392
  end
383
393
  end
394
+ end
395
+
396
+ if $noweight
397
+ ali.each_pair do |id1, seq1|
398
+ ali.each_pair do |id2, seq2|
399
+ if id1 != id2
400
+ pid = calc_pid(seq1, seq2)
401
+ s1 = seq1.split("")
402
+ s2 = seq2.split("")
403
+
404
+ # check PID_MIN
405
+ if $pidmin && (pid < $pidmin)
406
+ $logger.info ">>> Skip alignment between #{id1} and #{id2} having PID, #{pid}% less than PID_MIN, #{$pidmin}"
407
+ next
408
+ end
384
409
 
385
- if $noweight
386
- ali.each_pair do |id1, seq1|
387
- ali.each_pair do |id2, seq2|
388
- if id1 != id2
389
- pid = calc_pid(seq1, seq2)
390
- s1 = seq1.split("")
391
- s2 = seq2.split("")
392
-
393
- # check PID_MIN
394
- if $pidmin && (pid < $pidmin)
395
- $logger.info ">>> Skip alignment between #{id1} and #{id2} having PID, #{pid}% less than PID_MIN, #{$pidmin}"
410
+ # check PID_MAX
411
+ if $pidmax && (pid > $pidmax)
412
+ $logger.info ">>> Skip alignment between #{id1} and #{id2} having PID, #{pid}% greater than PID_MAX, #{$pidmax}"
413
+ next
414
+ end
415
+
416
+ s1.each_with_index do |aa1, pos|
417
+ aa1.upcase!
418
+ aa2 = s2[pos].upcase
419
+
420
+ if env_labels[id1][pos].include?("X")
421
+ $logger.info ">>> Substitutions from #{id1}-#{pos}-#{aa1} were masked"
396
422
  next
397
423
  end
398
424
 
399
- # check PID_MAX
400
- if $pidmax && (pid > $pidmax)
401
- $logger.info ">>> Skip alignment between #{id1} and #{id2} having PID, #{pid}% greater than PID_MAX, #{$pidmax}"
425
+ if env_labels[id2][pos].include?("X")
426
+ $logger.info ">>> Substitutions to #{id2}-#{pos}-#{aa2} were masked"
402
427
  next
403
428
  end
404
429
 
405
- s1.each_with_index do |aa1, pos|
406
- if env_labels[id1][pos].include?("X")
407
- $logger.info ">>> Substitutions from #{id1}-#{pos}-#{aa1} were masked"
408
- next
409
- end
410
-
411
- aa1.upcase!
412
- aa2 = s2[pos].upcase
413
-
414
- if !$amino_acids.include?(aa1)
415
- $logger.warn "!!! #{id1}-#{pos}-#{aa1} is not a standard amino acid" unless aa1 == "-"
416
- next
417
- end
430
+ if !$amino_acids.include?(aa1)
431
+ $logger.warn "!!! #{id1}-#{pos}-#{aa1} is not a standard amino acid" unless aa1 == "-"
432
+ next
433
+ end
418
434
 
419
- if !$amino_acids.include?(aa2)
420
- $logger.warn "!!! #{id1}-#{pos}-#{aa2} is not a standard amino acid" unless aa2 == "-"
421
- next
422
- end
435
+ if !$amino_acids.include?(aa2)
436
+ $logger.warn "!!! #{id1}-#{pos}-#{aa2} is not a standard amino acid" unless aa2 == "-"
437
+ next
438
+ end
423
439
 
424
- aa1 = (((disulphide[id1][pos] == "F") && (aa1 == "C")) ? "J" : aa1)
425
- aa2 = (((disulphide[id2][pos] == "F") && (aa2 == "C")) ? "J" : aa2)
440
+ aa1 = (((disulphide.has_key?(id1) and disulphide[id1][pos] == "F") && (aa1 == "C")) ? "J" : aa1)
441
+ aa2 = (((disulphide.has_key?(id2) and disulphide[id2][pos] == "F") && (aa2 == "C")) ? "J" : aa2)
426
442
 
427
- if $cst_features.empty?
428
- $envs[env_labels[id1][pos]].increase_residue_count(aa2)
429
- elsif (env_labels[id1][pos].split("").values_at(*$cst_features) ==
430
- env_labels[id2][pos].split("").values_at(*$cst_features))
431
- $envs[env_labels[id1][pos]].increase_residue_count(aa2)
432
- else
433
- $logger.debug "*** #{id1}-#{pos}-#{aa1} and #{id2}-#{pos}-#{aa2} have different symbols for constrained environment features each other"
434
- next
435
- end
443
+ if $cst_features.empty?
444
+ $envs[env_labels[id1][pos]].increase_residue_count(aa2)
445
+ elsif (env_labels[id1][pos].split("").values_at(*$cst_features) == env_labels[id2][pos].split("").values_at(*$cst_features))
446
+ $envs[env_labels[id1][pos]].increase_residue_count(aa2)
447
+ else
448
+ $logger.debug "*** #{id1}-#{pos}-#{aa1} and #{id2}-#{pos}-#{aa2} have different symbols for constrained environment features each other"
449
+ next
450
+ end
436
451
 
437
- grp_label = env_labels[id1][pos][1..-1]
452
+ grp_label = env_labels[id1][pos][1..-1]
438
453
 
439
- if $env_aa_obs.has_key? grp_label
440
- if $env_aa_obs[grp_label].has_key? aa1
441
- $env_aa_obs[grp_label][aa1] += 1
442
- else
443
- $env_aa_obs[grp_label][aa1] = 1
444
- end
454
+ if $env_aa_obs.has_key? grp_label
455
+ if $env_aa_obs[grp_label].has_key? aa1
456
+ $env_aa_obs[grp_label][aa1] += 1
445
457
  else
446
- $env_aa_obs[grp_label] = Hash.new(0)
447
458
  $env_aa_obs[grp_label][aa1] = 1
448
459
  end
460
+ else
461
+ $env_aa_obs[grp_label] = Hash.new(0)
462
+ $env_aa_obs[grp_label][aa1] = 1
463
+ end
449
464
 
450
- if $aa_tot_obs.has_key? aa1
451
- $aa_tot_obs[aa1] += 1
452
- else
453
- $aa_tot_obs[aa1] = 1
454
- end
465
+ if $aa_tot_obs.has_key? aa1
466
+ $aa_tot_obs[aa1] += 1
467
+ else
468
+ $aa_tot_obs[aa1] = 1
469
+ end
455
470
 
456
- if aa1 != aa2
457
- if $aa_mut_obs.has_key? aa1
458
- $aa_mut_obs[aa1] += 1
459
- else
460
- $aa_mut_obs[aa1] = 1
461
- end
471
+ if aa1 != aa2
472
+ if $aa_mut_obs.has_key? aa1
473
+ $aa_mut_obs[aa1] += 1
474
+ else
475
+ $aa_mut_obs[aa1] = 1
462
476
  end
463
- $logger.debug "*** Add #{id1}-#{pos}-#{aa1} -> #{id2}-#{pos}-#{aa2} substituion for #{env_labels[id1][pos]}"
464
477
  end
478
+ $logger.debug "*** Add #{id1}-#{pos}-#{aa1} -> #{id2}-#{pos}-#{aa2} substitution for #{env_labels[id1][pos]}"
465
479
  end
466
480
  end
467
481
  end
468
- else
469
- # BLOSUM-like weighting
470
- clusters = []
471
- ali.each_pair { |i, s| clusters << [i] }
472
-
473
- # a loop for single linkage clustering
474
- begin
475
- continue = false
476
- 0.upto(clusters.size - 2) do |i|
477
- indexes = []
478
- (i + 1).upto(clusters.size - 1) do |j|
479
- found = false
480
- clusters[i].each do |c1|
481
- clusters[j].each do |c2|
482
- if calc_pid(ali[c1], ali[c2]) >= $weight
483
- indexes << j
484
- found = true
485
- break
486
- end
482
+ end
483
+ else
484
+ # BLOSUM-like weighting
485
+ clusters = []
486
+ ali.each_pair { |i, s| clusters << [i] }
487
+
488
+ # a loop for single linkage clustering
489
+ begin
490
+ continue = false
491
+ 0.upto(clusters.size - 2) do |i|
492
+ indexes = []
493
+ (i + 1).upto(clusters.size - 1) do |j|
494
+ found = false
495
+ clusters[i].each do |c1|
496
+ clusters[j].each do |c2|
497
+ if calc_pid(ali[c1], ali[c2]) >= $weight
498
+ indexes << j
499
+ found = true
500
+ break
487
501
  end
488
- break if found
489
502
  end
503
+ break if found
490
504
  end
505
+ end
491
506
 
492
- unless indexes.empty?
493
- continue = true
494
- group = clusters[i]
495
- indexes.each do |k|
496
- group = group.concat(clusters[k])
497
- clusters[k] = nil
498
- end
499
- clusters[i] = group
500
- clusters.compact!
507
+ unless indexes.empty?
508
+ continue = true
509
+ group = clusters[i]
510
+ indexes.each do |k|
511
+ group = group.concat(clusters[k])
512
+ clusters[k] = nil
501
513
  end
514
+ clusters[i] = group
515
+ clusters.compact!
502
516
  end
503
- end while(continue)
504
-
505
- clusters.combination(2).each do |cluster1, cluster2|
506
- cluster1.each do |id1|
507
- cluster2.each do |id2|
508
- seq1 = ali[id1].split("")
509
- seq2 = ali[id2].split("")
510
-
511
- seq1.each_with_index do |aa1, pos|
512
- if env_labels[id1][pos].include?("X")
513
- $logger.debug "*** Substitutions from #{id1}-#{pos}-#{aa1} were masked"
514
- next
515
- end
517
+ end
518
+ end while(continue)
516
519
 
517
- aa1.upcase!
518
- aa2 = seq2[pos].upcase
520
+ clusters.combination(2).each do |cluster1, cluster2|
521
+ cluster1.each do |id1|
522
+ cluster2.each do |id2|
523
+ seq1 = ali[id1].split("")
524
+ seq2 = ali[id2].split("")
519
525
 
520
- if !$amino_acids.include?(aa1)
521
- $logger.warn "!!! #{id1}-#{pos}-#{aa1} is not standard amino acid" unless aa1 == "-"
522
- next
523
- end
526
+ seq1.each_with_index do |aa1, pos|
527
+ aa1.upcase!
528
+ aa2 = seq2[pos].upcase rescue next # should fix this in sane way!
524
529
 
525
- if !$amino_acids.include?(aa2)
526
- $logger.warn "!!! #{id1}-#{pos}-#{aa2} is not standard amino acid" unless aa2 == "-"
527
- next
528
- end
530
+ if env_labels[id1][pos].include?("X")
531
+ $logger.debug "*** Substitutions from #{id1}-#{pos}-#{aa1} were masked"
532
+ next
533
+ end
529
534
 
530
- aa1 = (((disulphide[id1][pos] == "F") && (aa1 == "C")) ? "J" : aa1)
531
- aa2 = (((disulphide[id2][pos] == "F") && (aa2 == "C")) ? "J" : aa2)
532
- size1 = cluster1.size
533
- size2 = cluster2.size
534
- obs1 = 1.0 / size1
535
- obs2 = 1.0 / size2
536
-
537
- if $cst_features.empty?
538
- $envs[env_labels[id1][pos]].increase_residue_count(aa2, 1.0 / (size1 * size2))
539
- $envs[env_labels[id2][pos]].increase_residue_count(aa1, 1.0 / (size1 * size2))
540
- elsif (env_labels[id1][pos].split("").values_at(*$cst_features) ==
541
- env_labels[id2][pos].split("").values_at(*$cst_features))
542
- $envs[env_labels[id1][pos]].increase_residue_count(aa2, 1.0 / (size1 * size2))
543
- $envs[env_labels[id2][pos]].increase_residue_count(aa1, 1.0 / (size1 * size2))
544
- else
545
- $logger.debug "*** #{id1}-#{pos}-#{aa1} and #{id2}-#{pos}-#{aa2} have different symbols for constrained environment features each other"
546
- next
547
- end
535
+ if env_labels[id2][pos].include?("X")
536
+ $logger.debug "*** Substitutions to #{id2}-#{pos}-#{aa2} were masked"
537
+ next
538
+ end
539
+
540
+ if !$amino_acids.include?(aa1)
541
+ $logger.warn "!!! #{id1}-#{pos}-#{aa1} is not standard amino acid" unless aa1 == "-"
542
+ next
543
+ end
544
+
545
+ if !$amino_acids.include?(aa2)
546
+ $logger.warn "!!! #{id1}-#{pos}-#{aa2} is not standard amino acid" unless aa2 == "-"
547
+ next
548
+ end
549
+
550
+ aa1 = (((disulphide.has_key?(id1) and disulphide[id1][pos] == "F") && (aa1 == "C")) ? "J" : aa1)
551
+ aa2 = (((disulphide.has_key?(id2) and disulphide[id2][pos] == "F") && (aa2 == "C")) ? "J" : aa2)
552
+ size1 = cluster1.size
553
+ size2 = cluster2.size
554
+ obs1 = 1.0 / size1
555
+ obs2 = 1.0 / size2
556
+
557
+ if $cst_features.empty?
558
+ $envs[env_labels[id1][pos]].increase_residue_count(aa2, 1.0 / (size1 * size2))
559
+ $envs[env_labels[id2][pos]].increase_residue_count(aa1, 1.0 / (size1 * size2))
560
+ elsif (env_labels[id1][pos].split("").values_at(*$cst_features) == env_labels[id2][pos].split("").values_at(*$cst_features))
561
+ $envs[env_labels[id1][pos]].increase_residue_count(aa2, 1.0 / (size1 * size2))
562
+ $envs[env_labels[id2][pos]].increase_residue_count(aa1, 1.0 / (size1 * size2))
563
+ else
564
+ $logger.debug "*** #{id1}-#{pos}-#{aa1} and #{id2}-#{pos}-#{aa2} have different symbols for constrained environment features each other"
565
+ next
566
+ end
548
567
 
549
- grp_label1 = env_labels[id1][pos][1..-1]
550
- grp_label2 = env_labels[id2][pos][1..-1]
568
+ grp_label1 = env_labels[id1][pos][1..-1]
569
+ grp_label2 = env_labels[id2][pos][1..-1]
551
570
 
552
- if $env_aa_obs.has_key? grp_label1
553
- if $env_aa_obs[grp_label1].has_key? aa1
554
- $env_aa_obs[grp_label1][aa1] += obs1
555
- else
556
- $env_aa_obs[grp_label1][aa1] = obs1
557
- end
571
+ if $env_aa_obs.has_key? grp_label1
572
+ if $env_aa_obs[grp_label1].has_key? aa1
573
+ $env_aa_obs[grp_label1][aa1] += obs1
558
574
  else
559
- $env_aa_obs[grp_label1] = Hash.new(0.0)
560
575
  $env_aa_obs[grp_label1][aa1] = obs1
561
576
  end
577
+ else
578
+ $env_aa_obs[grp_label1] = Hash.new(0.0)
579
+ $env_aa_obs[grp_label1][aa1] = obs1
580
+ end
562
581
 
563
- if $env_aa_obs.has_key? grp_label2
564
- if $env_aa_obs[grp_label2].has_key? aa2
565
- $env_aa_obs[grp_label2][aa2] += obs2
566
- else
567
- $env_aa_obs[grp_label2][aa2] = obs2
568
- end
582
+ if $env_aa_obs.has_key? grp_label2
583
+ if $env_aa_obs[grp_label2].has_key? aa2
584
+ $env_aa_obs[grp_label2][aa2] += obs2
569
585
  else
570
- $env_aa_obs[grp_label2] = Hash.new(0.0)
571
586
  $env_aa_obs[grp_label2][aa2] = obs2
572
587
  end
588
+ else
589
+ $env_aa_obs[grp_label2] = Hash.new(0.0)
590
+ $env_aa_obs[grp_label2][aa2] = obs2
591
+ end
573
592
 
574
- if $aa_tot_obs.has_key? aa1
575
- $aa_tot_obs[aa1] += obs1
576
- else
577
- $aa_tot_obs[aa1] = obs1
578
- end
593
+ if $aa_tot_obs.has_key? aa1
594
+ $aa_tot_obs[aa1] += obs1
595
+ else
596
+ $aa_tot_obs[aa1] = obs1
597
+ end
579
598
 
580
- if $aa_tot_obs.has_key? aa2
581
- $aa_tot_obs[aa2] += obs2
599
+ if $aa_tot_obs.has_key? aa2
600
+ $aa_tot_obs[aa2] += obs2
601
+ else
602
+ $aa_tot_obs[aa2] = obs2
603
+ end
604
+
605
+ if aa1 != aa2
606
+ if $aa_mut_obs.has_key? aa1
607
+ $aa_mut_obs[aa1] += obs1
582
608
  else
583
- $aa_tot_obs[aa2] = obs2
609
+ $aa_mut_obs[aa1] = obs1
584
610
  end
585
-
586
- if aa1 != aa2
587
- if $aa_mut_obs.has_key? aa1
588
- $aa_mut_obs[aa1] += obs1
589
- else
590
- $aa_mut_obs[aa1] = obs1
591
- end
592
- if $aa_mut_obs.has_key? aa2
593
- $aa_mut_obs[aa2] += obs2
594
- else
595
- $aa_mut_obs[aa2] = obs2
596
- end
611
+ if $aa_mut_obs.has_key? aa2
612
+ $aa_mut_obs[aa2] += obs2
613
+ else
614
+ $aa_mut_obs[aa2] = obs2
597
615
  end
598
-
599
- $logger.debug "*** Add #{id1}-#{pos}-#{aa1} -> #{id2}-#{pos}-#{aa2} substituion for #{env_labels[id1][pos]}"
600
- $logger.debug "*** Add #{id2}-#{pos}-#{aa2} -> #{id1}-#{pos}-#{aa1} substituion for #{env_labels[id2][pos]}"
601
616
  end
617
+
618
+ $logger.debug "*** Add #{id1}-#{pos}-#{aa1} -> #{id2}-#{pos}-#{aa2} substitution for #{env_labels[id1][pos]}"
619
+ $logger.debug "*** Add #{id2}-#{pos}-#{aa2} -> #{id1}-#{pos}-#{aa1} substitution for #{env_labels[id2][pos]}"
602
620
  end
603
621
  end
604
622
  end
605
- end # if !$nosmooth
606
- end # IO.foreach($tem_list)
623
+ end
624
+ end # if !$nosmooth
625
+ end
607
626
 
608
- # print out default header
609
- $outfh.puts <<HEADER
627
+ # print out default header
628
+ $outfh.puts <<HEADER
629
+ #
610
630
  # Environment-specific amino acid substitution matrices
611
631
  # Creator: egor version #{Egor::VERSION}
612
632
  # Creation Date: #{Time.now.strftime("%d/%m/%Y %H:%M")}
@@ -616,9 +636,9 @@ Options:
616
636
  #
617
637
  HEADER
618
638
 
619
- $env_features[1..-1].each { |e| $outfh.puts "# #{e}" }
639
+ $env_features[1..-1].each { |e| $outfh.puts "# #{e}" }
620
640
 
621
- $outfh.puts <<HEADER
641
+ $outfh.puts <<HEADER
622
642
  #
623
643
  # (read in from #{$classdef})
624
644
  #
@@ -632,164 +652,164 @@ HEADER
632
652
  #
633
653
  HEADER
634
654
 
655
+ if $noweight
656
+ $outfh.puts "# Weighting scheme: none"
657
+ else
658
+ $outfh.puts "# Weighting scheme: clustering at PID #{$weight} level"
659
+ end
660
+
661
+ # calculate amino acid frequencies and mutabilities, and
662
+ # print them as default statistics in the header part
663
+ ala_factor = 100.0 * $aa_tot_obs["A"] / $aa_mut_obs["A"].to_f
664
+ $tot_aa = $aa_tot_obs.values.sum
665
+
666
+ $outfh.puts "#"
667
+ $outfh.puts "# Total amino acid frequencies:\n"
668
+ $outfh.puts "# %-3s %9s %9s %5s %8s %8s" % %w[RES TOT_OBS MUT_OBS MUTB REL_MUTB REL_FRQ]
669
+
670
+ $amino_acids.each do |res|
671
+ $aa_mutb[res] = $aa_mut_obs[res] / $aa_tot_obs[res].to_f
672
+ $aa_rel_mutb[res] = $aa_mutb[res] * ala_factor
673
+ $aa_rel_freq[res] = $aa_tot_obs[res] / $tot_aa.to_f
674
+ end
675
+
676
+ $amino_acids.each do |res|
635
677
  if $noweight
636
- $outfh.puts "# Weighting scheme: none"
678
+ $outfh.puts "# %-3s %9d %9d %5.2f %8d %8.4f" %
679
+ [res, $aa_tot_obs[res], $aa_mut_obs[res], $aa_mutb[res], $aa_rel_mutb[res], $aa_rel_freq[res]]
637
680
  else
638
- $outfh.puts "# Weighting scheme: clustering at PID #{$weight} level"
681
+ $outfh.puts "# %-3s %9.2f %9.2f %5.2f %8d %8.4f" %
682
+ [res, $aa_tot_obs[res], $aa_mut_obs[res], $aa_mutb[res], $aa_rel_mutb[res], $aa_rel_freq[res]]
639
683
  end
684
+ end
640
685
 
641
- # calculate amino acid frequencies and mutabilities, and
642
- # print them as default statistics in the header part
643
- ala_factor = 100.0 * $aa_tot_obs["A"] / $aa_mut_obs["A"].to_f
644
- $tot_aa = $aa_tot_obs.values.sum
645
686
 
646
- $outfh.puts "#"
647
- $outfh.puts "# Total amino acid frequencies:\n"
648
- $outfh.puts "# %-3s %9s %9s %5s %8s %8s" % %w[RES TOT_OBS MUT_OBS MUTB REL_MUTB REL_FRQ]
687
+ # Part 5.
688
+ #
689
+ # Calculating substitution frequency tables
690
+ #
649
691
 
650
- $aa_tot_obs.each_pair do |res, freq|
651
- $aa_mutb[res] = $aa_mut_obs[res] / freq.to_f
652
- $aa_rel_mutb[res] = $aa_mutb[res] * ala_factor
653
- $aa_rel_freq[res] = freq / $tot_aa.to_f
692
+ # calculating probabilities for each environment
693
+ $envs.values.each do |e|
694
+ if e.freq_array.sum != 0
695
+ e.prob_array = 100.0 * e.freq_array / e.freq_array.sum
654
696
  end
697
+ end
655
698
 
656
- $amino_acids.each do |res|
657
- if $noweight
658
- $outfh.puts "# %-3s %9d %9d %5.2f %8d %8.4f" %
659
- [res, $aa_tot_obs[res], $aa_mut_obs[res], $aa_mutb[res], $aa_rel_mutb[res], $aa_rel_freq[res]]
660
- else
661
- $outfh.puts "# %-3s %9.2f %9.2f %5.2f %8d %8.4f" %
662
- [res, $aa_tot_obs[res], $aa_mut_obs[res], $aa_mutb[res], $aa_rel_mutb[res], $aa_rel_freq[res]]
663
- end
664
- end
699
+ # count raw frequencies
700
+ $tot_freq_mat = ($noweight ? NMatrix.int($amino_acids.size,$amino_acids.size) : NMatrix.float($amino_acids.size,$amino_acids.size))
665
701
 
702
+ # for each combination of environment features
703
+ env_groups = $envs.values.group_by { |env| env.label[1..-1] }
666
704
 
667
- # Part 5.
668
- #
669
- # Calculating substitution frequency tables
670
- #
705
+ env_groups.to_a.sort_by { |env_group|
706
+ # a bit clumsy sorting here...
707
+ env_group[0].split("").map_with_index { |l, i|
708
+ $env_features[i + 1].labels.index(l)
709
+ }
710
+ }.each_with_index do |group, group_no|
711
+ grp_freq_mat = ($noweight ? NMatrix.int($amino_acids.size,$amino_acids.size) : NMatrix.float($amino_acids.size,$amino_acids.size))
671
712
 
672
- # calculating probabilities for each environment
673
- $envs.values.each do |e|
674
- if e.freq_array.sum != 0
675
- e.prob_array = 100.0 * e.freq_array / e.freq_array.sum
676
- end
713
+ $amino_acids.each_with_index do |aa, ai|
714
+ freq_array = group[1].find { |e| e.label.start_with?(aa) }.freq_array
715
+ 0.upto($amino_acids.size - 1) { |j| grp_freq_mat[ai, j] = freq_array[j] }
716
+ end
717
+
718
+ $tot_freq_mat += grp_freq_mat
719
+
720
+ if $output == 0
721
+ $outfh.puts ">#{group[0]} #{group_no}"
722
+ $outfh.puts grp_freq_mat.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)
677
723
  end
724
+ end
725
+
726
+ if $output == 0
727
+ $outfh.puts ">Total"
728
+ $outfh.puts $tot_freq_mat.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)
729
+ exit 0
730
+ end
731
+
678
732
 
679
- # count raw frequencies
680
- $tot_freq_mat = ($noweight ? NMatrix.int(21,21) : NMatrix.float(21,21))
733
+ # Part 6.
734
+ #
735
+ # Calculating substitution probability tables
736
+ #
737
+
738
+ if $output == 1
739
+ $outfh.puts <<HEADER
740
+ #
741
+ # Each column (j) represents the probability distribution for the
742
+ # likelihood of acceptance of a mutational event by a residue type j in
743
+ # a particular structural environment (specified after >) leading to
744
+ # any other residue type (i) and sums up to 100.
745
+ #
746
+ HEADER
747
+ end
748
+
749
+ if ($output > 0) && $nosmooth
750
+ # Probability matrices
751
+ $tot_prob_mat = NMatrix.float($amino_acids.size, $amino_acids.size)
681
752
 
682
753
  # for each combination of environment features
683
754
  env_groups = $envs.values.group_by { |env| env.label[1..-1] }
684
-
685
755
  env_groups.to_a.sort_by { |env_group|
686
756
  # a bit clumsy sorting here...
687
757
  env_group[0].split("").map_with_index { |l, i|
688
758
  $env_features[i + 1].labels.index(l)
689
759
  }
690
760
  }.each_with_index do |group, group_no|
691
- grp_freq_mat = ($noweight ? NMatrix.int(21,21) : NMatrix.float(21,21))
761
+ grp_prob_mat = NMatrix.float($amino_acids.size,$amino_acids.size)
692
762
 
693
763
  $amino_acids.each_with_index do |aa, ai|
694
- freq_array = group[1].find { |e| e.label.start_with?(aa) }.freq_array
695
- 0.upto(20) { |j| grp_freq_mat[ai, j] = freq_array[j] }
764
+ prob_array = group[1].find { |e| e.label.start_with?(aa) }.prob_array
765
+ 0.upto($amino_acids.size - 1) { |j| grp_prob_mat[ai, j] = prob_array[j] }
696
766
  end
697
767
 
698
- $tot_freq_mat += grp_freq_mat
768
+ $tot_prob_mat += grp_prob_mat
699
769
 
700
- if $output == 0
770
+ if ($output == 1)
701
771
  $outfh.puts ">#{group[0]} #{group_no}"
702
- $outfh.puts grp_freq_mat.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)
772
+ $outfh.puts grp_prob_mat.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)
703
773
  end
704
774
  end
705
775
 
706
- if $output == 0
776
+ if ($output == 1)
707
777
  $outfh.puts ">Total"
708
- $outfh.puts $tot_freq_mat.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)
778
+ $outfh.puts $tot_prob_mat.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)
779
+ $outfh.close
709
780
  exit 0
710
781
  end
782
+ end
711
783
 
712
-
713
- # Part 6.
784
+ # for smoothing...
785
+ if ($output > 0) && !$nosmooth
714
786
  #
715
- # Calculating substitution probability tables
787
+ # p1 probability
716
788
  #
717
-
718
- if $output == 1
719
- $outfh.puts <<HEADER
720
- #
721
- # Each column (j) represents the probability distribution for the
722
- # likelihood of acceptance of a mutational event by a residue type j in
723
- # a particular structural environment (specified after >) leading to
724
- # any other residue type (i) and sums up to 100.
725
- #
726
- HEADER
727
- end
728
-
729
- if ($output > 0) && $nosmooth
730
- # Probability matrices
731
- $tot_prob_mat = NMatrix.float(21, 21)
732
-
733
- # for each combination of environment features
734
- env_groups = $envs.values.group_by { |env| env.label[1..-1] }
735
- env_groups.to_a.sort_by { |env_group|
736
- # a bit clumsy sorting here...
737
- env_group[0].split("").map_with_index { |l, i|
738
- $env_features[i + 1].labels.index(l)
739
- }
740
- }.each_with_index do |group, group_no|
741
- grp_prob_mat = NMatrix.float(21,21)
742
-
743
- $amino_acids.each_with_index do |aa, ai|
744
- prob_array = group[1].find { |e| e.label.start_with?(aa) }.prob_array
745
- 0.upto(20) { |j| grp_prob_mat[ai, j] = prob_array[j] }
746
- end
747
-
748
- $tot_prob_mat += grp_prob_mat
749
-
750
- if ($output == 1)
751
- $outfh.puts ">#{group[0]} #{group_no}"
752
- $outfh.puts grp_prob_mat.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)
753
- end
754
- end
755
-
756
- if ($output == 1)
757
- $outfh.puts ">Total"
758
- $outfh.puts $tot_prob_mat.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)
759
- $outfh.close
760
- exit 0
761
- end
789
+ p1 = NArray.float($amino_acids.size)
790
+ a0 = NArray.float($amino_acids.size).fill(1.0 / $amino_acids.size)
791
+ big_N = $tot_aa.to_f
792
+ small_n = $amino_acids.size.to_f
793
+ omega1 = 1.0 / (1 + big_N / ($sigma * small_n))
794
+ omega2 = 1.0 - omega1
795
+
796
+ if $smooth == :partial
797
+ # for partial smoothing, p1 probability is not smoothed!
798
+ 0.upto($amino_acids.size - 1) { |i| p1[i] = 100.0 * $aa_rel_freq[$amino_acids[i]] }
799
+ $smooth_prob[1] = p1
800
+ else
801
+ # for full smoothing, p1 probability is smoothed
802
+ 0.upto($amino_acids.size - 1) { |i| p1[i] = 100.0 * (omega1 * a0[i] + omega2 * $aa_rel_freq[$amino_acids[i]]) }
803
+ $smooth_prob[1] = p1
762
804
  end
763
805
 
764
- # for smoothing...
765
- if ($output > 0) && !$nosmooth
766
- #
767
- # p1 probability
768
- #
769
- p1 = NArray.float(21)
770
- a0 = NArray.float(21).fill(1 / 21.0)
771
- big_N = $tot_aa.to_f
772
- small_n = 21.0
773
- omega1 = 1.0 / (1 + big_N / ($sigma * small_n))
774
- omega2 = 1.0 - omega1
775
-
776
- if $smooth == :partial
777
- # for partial smoothing, p1 probability is not smoothed!
778
- 0.upto(20) { |i| p1[i] = 100.0 * $aa_rel_freq[$amino_acids[i]] }
779
- $smooth_prob[1] = p1
780
- else
781
- # for full smoothing, p1 probability is smoothed
782
- 0.upto(20) { |i| p1[i] = 100.0 * (omega1 * a0[i] + omega2 * $aa_rel_freq[$amino_acids[i]]) }
783
- $smooth_prob[1] = p1
784
- end
785
-
786
- #
787
- # p2 and above
788
- #
789
- env_labels = $env_features.map_with_index {|ef, ei| ef.labels.map { |l| "#{ei}#{l}" } }
806
+ #
807
+ # p2 and above
808
+ #
809
+ env_labels = $env_features.map_with_index {|ef, ei| ef.labels.map { |l| "#{ei}#{l}" } }
790
810
 
791
- if $smooth == :partial
792
- $outfh.puts <<HEADER
811
+ if $smooth == :partial
812
+ $outfh.puts <<HEADER
793
813
  #
794
814
  # Partial Smoothing:
795
815
  #
@@ -813,106 +833,107 @@ HEADER
813
833
  # Weights (omegas) are calculated as in Topham et al. 1993)
814
834
  #
815
835
  # sigma value used is: 5.00
836
+ #
816
837
  HEADER
817
- 1.upto($env_features.size) do |ci|
818
- # for partial smoothing, only P1 ~ P3, and Pn are considered
819
- next if (ci > 2) && (ci < $env_features.size)
820
-
821
- env_labels.combination(ci) do |c1|
822
- Enumerable.cart_prod(*c1).each do |labels|
823
- pattern = "." * $env_features.size
824
-
825
- labels.each do |label|
826
- i = label[0].chr.to_i
827
- l = label[1].chr
828
- pattern[i] = l
829
- end
838
+ 1.upto($env_features.size) do |ci|
839
+ # for partial smoothing, only P1 ~ P3, and Pn are considered
840
+ next if (ci > 2) && (ci < $env_features.size)
841
+
842
+ env_labels.combination(ci) do |c1|
843
+ Enumerable.cart_prod(*c1).each do |labels|
844
+ pattern = "." * $env_features.size
845
+
846
+ labels.each do |label|
847
+ i = label[0].chr.to_i
848
+ l = label[1].chr
849
+ pattern[i] = l
850
+ end
830
851
 
831
- if pattern =~ /^\./
832
- $logger.debug "*** Skipped environment, #{pattern}, for partial smoothing"
833
- next
834
- end
852
+ if pattern =~ /^\./
853
+ $logger.debug "*** Skipped environment, #{pattern}, for partial smoothing"
854
+ next
855
+ end
835
856
 
836
- # get environmetns, frequencies, and probabilities
837
- envs = $envs.values.select { |env| env.label.match(pattern.to_re) }
838
- freq_arr = envs.inject(NArray.float(21)) { |sum, env| sum + env.freq_array }
839
- prob_arr = NArray.float(21)
840
- 0.upto(20) { |i| prob_arr[i] = (freq_arr[i] == 0 ? 0 : freq_arr[i] / freq_arr.sum.to_f) }
841
-
842
- # # assess whether a residue type j is compatible with a particular combination of structural features
843
- # # corrections for non-zero colum vector phenomenon by switching the smoothing procedure off as below
844
- # if ci == $env_features.size
845
- # aa_label = labels.find { |l| l.match(/^0/) }[1].chr
846
- # sub_pattern = "." * $env_features.size
847
- # sub_pattern[0] = aa_label
848
- # sub_freq_sum = 0
849
- #
850
- # labels[1..-1].each do |label|
851
- # next if label.start_with?("0")
852
- # i = label[0].chr.to_i
853
- # l = label[1].chr
854
- # sub_pattern[i] = l
855
- # sub_envs = $envs.values.select { |env| env.label.match(pattern.to_re) }
856
- # sub_freq_arr = sub_envs.inject(NArray.float(21)) { |sum, env| sum + env.freq_array }
857
- # sub_freq_sum += sub_freq_arr.sum
858
- # end
859
- #
860
- # if sub_freq_sum == 0
861
- # if $smooth_prob.has_key?(ci + 1)
862
- # $smooth_prob[ci + 1][labels.to_set] = prob_arr
863
- # else
864
- # $smooth_prob[ci + 1] = {}
865
- # $smooth_prob[ci + 1][labels.to_set] = prob_arr
866
- # end
867
- # $logger.warn "!!! Smoothing procedure is off for the environment feature combination, #{pattern}"
868
- # next
869
- # end
870
- # end
871
-
872
- # collect priors if ci > 1
873
- priors = []
874
-
875
- if ci == 2
876
- labels.combination(1).select { |c2| c2[0].start_with?("0") }.each { |c3|
877
- priors << $smooth_prob[2][c3.to_set]
878
- }
879
- elsif ci == $env_features.size
880
- labels.combination(2).select { |c2| c2[0].start_with?("0") || c2[1].start_with?("0") }.each { |c3|
881
- priors << $smooth_prob[3][c3.to_set]
882
- }
883
- end
857
+ # get environmetns, frequencies, and probabilities
858
+ envs = $envs.values.select { |env| env.label.match(pattern.to_re) }
859
+ freq_arr = envs.inject(NArray.float($amino_acids.size)) { |sum, env| sum + env.freq_array }
860
+ prob_arr = NArray.float($amino_acids.size)
861
+ 0.upto($amino_acids.size - 1) { |i| prob_arr[i] = (freq_arr[i] == 0 ? 0 : freq_arr[i] / freq_arr.sum.to_f) }
862
+
863
+ # # assess whether a residue type j is compatible with a particular combination of structural features
864
+ # # corrections for non-zero colum vector phenomenon by switching the smoothing procedure off as below
865
+ # if ci == $env_features.size
866
+ # aa_label = labels.find { |l| l.match(/^0/) }[1].chr
867
+ # sub_pattern = "." * $env_features.size
868
+ # sub_pattern[0] = aa_label
869
+ # sub_freq_sum = 0
870
+ #
871
+ # labels[1..-1].each do |label|
872
+ # next if label.start_with?("0")
873
+ # i = label[0].chr.to_i
874
+ # l = label[1].chr
875
+ # sub_pattern[i] = l
876
+ # sub_envs = $envs.values.select { |env| env.label.match(pattern.to_re) }
877
+ # sub_freq_arr = sub_envs.inject(NArray.float($amino_acids.size)) { |sum, env| sum + env.freq_array }
878
+ # sub_freq_sum += sub_freq_arr.sum
879
+ # end
880
+ #
881
+ # if sub_freq_sum == 0
882
+ # if $smooth_prob.has_key?(ci + 1)
883
+ # $smooth_prob[ci + 1][labels.to_set] = prob_arr
884
+ # else
885
+ # $smooth_prob[ci + 1] = {}
886
+ # $smooth_prob[ci + 1][labels.to_set] = prob_arr
887
+ # end
888
+ # $logger.warn "!!! Smoothing procedure is off for the environment feature combination, #{pattern}"
889
+ # next
890
+ # end
891
+ # end
892
+
893
+ # collect priors if ci > 1
894
+ priors = []
895
+
896
+ if ci == 2
897
+ labels.combination(1).select { |c2| c2[0].start_with?("0") }.each { |c3|
898
+ priors << $smooth_prob[2][c3.to_set]
899
+ }
900
+ elsif ci == $env_features.size
901
+ labels.combination(2).select { |c2| c2[0].start_with?("0") || c2[1].start_with?("0") }.each { |c3|
902
+ priors << $smooth_prob[3][c3.to_set]
903
+ }
904
+ end
884
905
 
885
- # entropy based weighting priors
886
- entropy_max = Math::log(21)
887
- entropies = priors.map { |prior| -1.0 * prior.to_a.inject(0.0) { |s, p| p == 0.0 ? s - 1 : s + p * Math::log(p) } }
888
- mod_entropies = entropies.map_with_index { |entropy, i| (entropy_max - entropies[i]) / entropy_max }
889
- weights = mod_entropies.map { |mod_entropy| mod_entropy / mod_entropies.sum }
890
- weighted_priors = priors.map_with_index { |prior, i| prior * weights[i] }.sum
891
-
892
- # smoothing step
893
- smooth_prob_arr = NArray.float(21)
894
- big_N = freq_arr.sum.to_f
895
- small_n = 21.0
896
- omega1 = 1.0 / (1 + big_N / ($sigma * small_n))
897
- omega2 = 1.0 - omega1
898
- 0.upto(20) { |i| smooth_prob_arr[i] = 100.0 * (omega1 * weighted_priors[i] + omega2 * prob_arr[i]) }
899
-
900
- # normalization step
901
- smooth_prob_arr_sum = smooth_prob_arr.sum
902
- 0.upto(20) { |i| smooth_prob_arr[i] = 100.0 * (smooth_prob_arr[i] / smooth_prob_arr_sum) }
903
-
904
- # store smoothed probabilties in a hash using a set of envrionment labels as a key
905
- if !$smooth_prob.has_key?(ci + 1)
906
- $smooth_prob[ci + 1] = {}
907
- $smooth_prob[ci + 1][labels.to_set] = smooth_prob_arr
908
- else
909
- $smooth_prob[ci + 1][labels.to_set] = smooth_prob_arr
910
- end
906
+ # entropy based weighting priors
907
+ entropy_max = Math::log($amino_acids.size)
908
+ entropies = priors.map { |prior| -1.0 * prior.to_a.inject(0.0) { |s, p| p == 0.0 ? s - 1 : s + p * Math::log(p) } }
909
+ mod_entropies = entropies.map_with_index { |entropy, i| (entropy_max - entropies[i]) / entropy_max }
910
+ weights = mod_entropies.map { |mod_entropy| mod_entropy / mod_entropies.sum }
911
+ weighted_priors = priors.map_with_index { |prior, i| prior * weights[i] }.sum
912
+
913
+ # smoothing step
914
+ smooth_prob_arr = NArray.float($amino_acids.size)
915
+ big_N = freq_arr.sum.to_f
916
+ small_n = $amino_acids.size.to_f
917
+ omega1 = 1.0 / (1 + big_N / ($sigma * small_n))
918
+ omega2 = 1.0 - omega1
919
+ 0.upto($amino_acids.size - 1) { |i| smooth_prob_arr[i] = 100.0 * (omega1 * weighted_priors[i] + omega2 * prob_arr[i]) }
920
+
921
+ # normalization step
922
+ smooth_prob_arr_sum = smooth_prob_arr.sum
923
+ 0.upto($amino_acids.size - 1) { |i| smooth_prob_arr[i] = 100.0 * (smooth_prob_arr[i] / smooth_prob_arr_sum) }
924
+
925
+ # store smoothed probabilties in a hash using a set of envrionment labels as a key
926
+ if !$smooth_prob.has_key?(ci + 1)
927
+ $smooth_prob[ci + 1] = {}
928
+ $smooth_prob[ci + 1][labels.to_set] = smooth_prob_arr
929
+ else
930
+ $smooth_prob[ci + 1][labels.to_set] = smooth_prob_arr
911
931
  end
912
932
  end
913
933
  end
914
- else
915
- $outfh.puts <<HEADER
934
+ end
935
+ else
936
+ $outfh.puts <<HEADER
916
937
  #
917
938
  # Full Smoothing:
918
939
  #
@@ -939,193 +960,194 @@ HEADER
939
960
  # Weights (omegas) are calculated as in Topham et al. 1993)
940
961
  #
941
962
  # sigma value used is: 5.00
963
+ #
942
964
  HEADER
943
- # full smooting
944
- 1.upto($env_features.size) do |ci|
945
- env_labels.combination(ci) do |c1|
946
- Enumerable.cart_prod(*c1).each do |labels|
947
- pattern = "." * $env_features.size
948
- labels.each do |label|
949
- j = label[0].chr.to_i
950
- l = label[1].chr
951
- pattern[j] = l
952
- end
965
+ # full smooting
966
+ 1.upto($env_features.size) do |ci|
967
+ env_labels.combination(ci) do |c1|
968
+ Enumerable.cart_prod(*c1).each do |labels|
969
+ pattern = "." * $env_features.size
970
+ labels.each do |label|
971
+ j = label[0].chr.to_i
972
+ l = label[1].chr
973
+ pattern[j] = l
974
+ end
953
975
 
954
- # get environmetns, frequencies, and probabilities
955
- envs = $envs.values.select { |env| env.label.match(pattern.to_re) }
956
- freq_arr = envs.inject(NArray.float(21)) { |sum, env| sum + env.freq_array }
957
- prob_arr = NArray.float(21)
958
- 0.upto(20) { |i| prob_arr[i] = freq_arr[i] == 0 ? 0 : freq_arr[i] / freq_arr.sum.to_f }
976
+ # get environmetns, frequencies, and probabilities
977
+ envs = $envs.values.select { |env| env.label.match(pattern.to_re) }
978
+ freq_arr = envs.inject(NArray.float($amino_acids.size)) { |sum, env| sum + env.freq_array }
979
+ prob_arr = NArray.float($amino_acids.size)
980
+ 0.upto($amino_acids.size - 1) { |i| prob_arr[i] = freq_arr[i] == 0 ? 0 : freq_arr[i] / freq_arr.sum.to_f }
959
981
 
960
- # collect priors
961
- priors = []
962
- if ci > 1
963
- labels.combination(ci - 1).each { |c2| priors << $smooth_prob[ci][c2.to_set] }
964
- else
965
- priors << $smooth_prob[1]
966
- end
982
+ # collect priors
983
+ priors = []
984
+ if ci > 1
985
+ labels.combination(ci - 1).each { |c2| priors << $smooth_prob[ci][c2.to_set] }
986
+ else
987
+ priors << $smooth_prob[1]
988
+ end
967
989
 
968
- # entropy based weighting priors
969
- entropy_max = Math::log(21)
970
- entropies = priors.map do |prior|
971
- (entropy_max + prior.to_a.inject(0.0) { |s, p| s + p * Math::log(p) }) / entropy_max
972
- end
973
- weighted_priors = priors.map_with_index { |p, i| p * entropies[i] / entropies.sum }.sum
974
-
975
- # smoothing step
976
- smooth_prob_arr = NArray.float(21)
977
- big_N = freq_arr.sum.to_f
978
- small_n = 21.0
979
- omega1 = 1.0 / (1 + big_N / ($sigma * small_n))
980
- omega2 = 1.0 - omega1
981
- 0.upto(20) { |i| smooth_prob_arr[i] = 100.0 * (omega1 * weighted_priors[i] + omega2 * prob_arr[i]) }
982
-
983
- # normalization step
984
- smooth_prob_arr_sum = smooth_prob_arr.sum
985
- 0.upto(20) { |i| smooth_prob_arr[i] = 100.0 * (smooth_prob_arr[i] / smooth_prob_arr_sum) }
986
-
987
- # store smoothed probabilties in a hash using a set of envrionment labels as a key
988
- if !$smooth_prob.has_key?(ci + 1)
989
- $smooth_prob[ci + 1] = {}
990
- $smooth_prob[ci + 1][labels.to_set] = smooth_prob_arr
991
- else
992
- $smooth_prob[ci + 1][labels.to_set] = smooth_prob_arr
993
- end
990
+ # entropy based weighting priors
991
+ entropy_max = Math::log($amino_acids.size)
992
+ entropies = priors.map do |prior|
993
+ (entropy_max + prior.to_a.inject(0.0) { |s, p| s + p * Math::log(p) }) / entropy_max
994
+ end
995
+ weighted_priors = priors.map_with_index { |p, i| p * entropies[i] / entropies.sum }.sum
996
+
997
+ # smoothing step
998
+ smooth_prob_arr = NArray.float($amino_acids.size)
999
+ big_N = freq_arr.sum.to_f
1000
+ small_n = $amino_acids.size.to_f
1001
+ omega1 = 1.0 / (1 + big_N / ($sigma * small_n))
1002
+ omega2 = 1.0 - omega1
1003
+ 0.upto($amino_acids.size - 1) { |i| smooth_prob_arr[i] = 100.0 * (omega1 * weighted_priors[i] + omega2 * prob_arr[i]) }
1004
+
1005
+ # normalization step
1006
+ smooth_prob_arr_sum = smooth_prob_arr.sum
1007
+ 0.upto($amino_acids.size - 1) { |i| smooth_prob_arr[i] = 100.0 * (smooth_prob_arr[i] / smooth_prob_arr_sum) }
1008
+
1009
+ # store smoothed probabilties in a hash using a set of envrionment labels as a key
1010
+ if !$smooth_prob.has_key?(ci + 1)
1011
+ $smooth_prob[ci + 1] = {}
1012
+ $smooth_prob[ci + 1][labels.to_set] = smooth_prob_arr
1013
+ else
1014
+ $smooth_prob[ci + 1][labels.to_set] = smooth_prob_arr
994
1015
  end
995
1016
  end
996
1017
  end
997
1018
  end
1019
+ end
998
1020
 
999
- # updating smoothed probability array for each envrionment
1000
- $envs.values.each { |e| e.smooth_prob_array = $smooth_prob[$env_features.size + 1][e.label_set] }
1001
-
1002
- # for a total substitution probability matrix
1003
- $tot_prob_mat = NMatrix.float(21,21)
1004
-
1005
- # grouping environments by its environment labels but amino acid label
1006
- env_groups = $envs.values.group_by { |env| env.label[1..-1] }
1021
+ # updating smoothed probability array for each envrionment
1022
+ $envs.values.each { |e| e.smooth_prob_array = $smooth_prob[$env_features.size + 1][e.label_set] }
1007
1023
 
1008
- # sorting environments and build 21X21 substitution matrices
1009
- env_groups.to_a.sort_by { |env_group|
1010
- # a bit clumsy sorting here...
1011
- env_group[0].split("").map_with_index { |l, i|
1012
- $env_features[i + 1].labels.index(l)
1013
- }
1014
- }.each_with_index do |group, group_no|
1015
- # calculating 21X21 substitution probability matrix for each envrionment
1016
- grp_prob_mat = NMatrix.float(21,21)
1024
+ # for a total substitution probability matrix
1025
+ $tot_prob_mat = NMatrix.float($amino_acids.size,$amino_acids.size)
1017
1026
 
1018
- $amino_acids.each_with_index do |aa, ai|
1019
- smooth_prob_array = group[1].find { |e| e.label.start_with?(aa) }.smooth_prob_array
1020
- 0.upto(20) { |j| grp_prob_mat[ai, j] = smooth_prob_array[j] }
1021
- end
1027
+ # grouping environments by its environment labels but amino acid label
1028
+ env_groups = $envs.values.group_by { |env| env.label[1..-1] }
1022
1029
 
1023
- $tot_prob_mat += grp_prob_mat
1030
+ # sorting environments and build 21X21 substitution matrices
1031
+ env_groups.to_a.sort_by { |env_group|
1032
+ # a bit clumsy sorting here...
1033
+ env_group[0].split("").map_with_index { |l, i|
1034
+ $env_features[i + 1].labels.index(l)
1035
+ }
1036
+ }.each_with_index do |group, group_no|
1037
+ # calculating 21X21 substitution probability matrix for each envrionment
1038
+ grp_prob_mat = NMatrix.float($amino_acids.size,$amino_acids.size)
1024
1039
 
1025
- if $output == 1
1026
- $outfh.puts ">#{group[0]} #{group_no}"
1027
- $outfh.puts grp_prob_mat.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)
1028
- end
1040
+ $amino_acids.each_with_index do |aa, ai|
1041
+ smooth_prob_array = group[1].find { |e| e.label.start_with?(aa) }.smooth_prob_array
1042
+ 0.upto($amino_acids.size - 1) { |j| grp_prob_mat[ai, j] = smooth_prob_array[j] }
1029
1043
  end
1030
1044
 
1031
- $tot_prob_mat /= env_groups.size
1045
+ $tot_prob_mat += grp_prob_mat
1032
1046
 
1033
1047
  if $output == 1
1034
- $outfh.puts ">Total"
1035
- $outfh.puts $tot_prob_mat.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)
1036
- $outfh.close
1037
- exit 0
1048
+ $outfh.puts ">#{group[0]} #{group_no}"
1049
+ $outfh.puts grp_prob_mat.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)
1038
1050
  end
1051
+ end
1039
1052
 
1053
+ $tot_prob_mat /= env_groups.size
1040
1054
 
1041
- # Part 7.
1042
- #
1043
- # Calculating log-add ratio scoring matrices
1044
- #
1045
- if $output == 2
1046
- $outfh.puts <<HEADER
1055
+ if $output == 1
1056
+ $outfh.puts ">Total"
1057
+ $outfh.puts $tot_prob_mat.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)
1058
+ $outfh.close
1059
+ exit 0
1060
+ end
1061
+
1062
+
1063
+ # Part 7.
1064
+ #
1065
+ # Calculating log odds ratio scoring matrices
1066
+ #
1067
+ if $output == 2
1068
+ $outfh.puts <<HEADER
1047
1069
  #
1048
1070
  # The probabilities were then divided by the background probabilities
1049
1071
  HEADER
1050
- if $penv
1051
- $outfh.puts <<HEADER
1072
+ if $penv
1073
+ $outfh.puts <<HEADER
1052
1074
  # which were derived from the environment-independent amino acid frequencies.
1053
1075
  # ^^^^^^^^^^^^^^^^^^^^^^^
1054
1076
  HEADER
1055
- else
1056
- $outfh.puts <<HEADER
1077
+ else
1078
+ $outfh.puts <<HEADER
1057
1079
  # which were derived from the environment-dependent amino acid frequencies.
1058
1080
  # ^^^^^^^^^^^^^^^^^^^^^
1059
1081
  HEADER
1060
- end
1082
+ end
1061
1083
 
1062
- $tot_logo_mat = $cys ? NMatrix.float(21,22) : NMatrix.float(21,21)
1063
- grp_logo_mats = []
1064
- factor = $scale / Math::log(2)
1065
-
1066
- # grouping environments by its environment labels but amino acid label
1067
- env_groups = $envs.values.group_by { |env| env.label[1..-1] }
1068
-
1069
- # sorting environments and build 21X21 substitution matrices
1070
- env_groups.to_a.sort_by { |env_group|
1071
- # a bit clumsy sorting here...
1072
- env_group[0].split("").map_with_index { |l, i|
1073
- $env_features[i + 1].labels.index(l)
1074
- }
1075
- }.each_with_index do |group, group_no|
1076
- # calculating 21X21 substitution probability matrix for each envrionment
1077
- grp_label = group[0]
1078
- grp_envs = group[1]
1079
- grp_logo_mat = $cys ? NMatrix.float(21, 22) : NMatrix.float(21,21)
1080
-
1081
- $amino_acids.each_with_index do |aa, ai|
1082
- env = grp_envs.detect { |e| e.label.start_with?(aa) }
1083
- logo_arr = $cys ? NArray.float(22) : NArray.float(21)
1084
-
1085
- env.smooth_prob_array.to_a.each_with_index do |prob, j|
1086
- paj = 100.0 * $aa_rel_freq[$amino_acids[j]]
1087
- odds = prob == 0.0 ? 0.000001 / paj : prob / paj
1088
- logo_arr[j] = factor * Math::log(odds)
1089
- end
1084
+ $tot_logo_mat = $cys == 0 ? NMatrix.float($amino_acids.size, $amino_acids.size + 1) : NMatrix.float($amino_acids.size, $amino_acids.size)
1085
+ grp_logo_mats = []
1086
+ factor = $scale / Math::log(2)
1090
1087
 
1091
- 0.upto(20) { |j| grp_logo_mat[ai, j] = logo_arr[j] }
1088
+ # grouping environments by its environment labels but amino acid label
1089
+ env_groups = $envs.values.group_by { |env| env.label[1..-1] }
1092
1090
 
1093
- # adding log odds ratio for "U" (J or C) when --cyc is ON
1094
- if $cys
1095
- paj = 100.0 * ($aa_rel_freq["C"] + $aa_rel_freq["J"])
1096
- prob = env.smooth_prob_array[$amino_acids.index("C")] + env.smooth_prob_array[$amino_acids.index("J")]
1097
- odds = prob == 0.0 ? 0.000001 / paj : prob / paj
1098
- logo_arr[logo_arr.size - 1] = factor * Math::log(odds)
1099
- grp_logo_mat[ai, logo_arr.size - 1] = logo_arr[logo_arr.size - 1]
1100
- end
1091
+ # sorting environments and build 21X21 substitution matrices
1092
+ env_groups.to_a.sort_by { |env_group|
1093
+ # a bit clumsy sorting here...
1094
+ env_group[0].split("").map_with_index { |l, i|
1095
+ $env_features[i + 1].labels.index(l)
1096
+ }
1097
+ }.each_with_index do |group, group_no|
1098
+ # calculating 21X21 substitution probability matrix for each envrionment
1099
+ grp_label = group[0]
1100
+ grp_envs = group[1]
1101
+ grp_logo_mat = $cys == 0 ? NMatrix.float($amino_acids.size, $amino_acids.size + 1) : NMatrix.float($amino_acids.size, $amino_acids.size)
1102
+
1103
+ $amino_acids.each_with_index do |aa, ai|
1104
+ env = grp_envs.detect { |e| e.label.start_with?(aa) }
1105
+ logo_arr = $cys == 0 ? NArray.float($amino_acids.size + 1) : NArray.float($amino_acids.size)
1106
+
1107
+ env.smooth_prob_array.to_a.each_with_index do |prob, j|
1108
+ paj = 100.0 * $aa_rel_freq[$amino_acids[j]]
1109
+ odds = prob == 0.0 ? 0.000001 / paj : prob / paj
1110
+ logo_arr[j] = factor * Math::log(odds)
1101
1111
  end
1102
1112
 
1103
- $tot_logo_mat += grp_logo_mat
1104
- grp_logo_mats << [grp_label, grp_logo_mat]
1113
+ 0.upto($amino_acids.size - 1) { |j| grp_logo_mat[ai, j] = logo_arr[j] }
1114
+
1115
+ # adding log odds ratio for "U" (J or C) when --cyc is 0
1116
+ if $cys == 0
1117
+ paj = 100.0 * ($aa_rel_freq["C"] + $aa_rel_freq["J"])
1118
+ prob = env.smooth_prob_array[$amino_acids.index("C")] + env.smooth_prob_array[$amino_acids.index("J")]
1119
+ odds = prob == 0.0 ? 0.000001 / paj : prob / paj
1120
+ logo_arr[logo_arr.size - 1] = factor * Math::log(odds)
1121
+ grp_logo_mat[ai, logo_arr.size - 1] = logo_arr[logo_arr.size - 1]
1122
+ end
1105
1123
  end
1106
1124
 
1107
- $tot_logo_mat /= env_groups.size
1125
+ $tot_logo_mat += grp_logo_mat
1126
+ grp_logo_mats << [grp_label, grp_logo_mat]
1127
+ end
1108
1128
 
1109
- # calculating relative entropy for each amino acid pair H and
1110
- # the expected score E in bit units
1111
- #
1112
- # I'm a bit suspicious about this part...
1113
- tot_E = 0.0
1114
- tot_H = 0.0
1129
+ $tot_logo_mat /= env_groups.size
1115
1130
 
1116
- 0.upto($tot_logo_mat.shape[0] - 1) do |i|
1117
- 0.upto($tot_logo_mat.shape[0] - 1) do |j|
1118
- if i != j
1119
- tot_E += $tot_logo_mat[i, j] * $aa_rel_freq[$amino_acids[i]] * $aa_rel_freq[$amino_acids[j]] / 2.0
1120
- tot_H += $tot_logo_mat[i, j] * $tot_prob_mat[i, j] / 2.0 / 10000.0
1121
- else
1122
- tot_E += $tot_logo_mat[i, j] * $aa_rel_freq[$amino_acids[i]] * $aa_rel_freq[$amino_acids[j]]
1123
- tot_H += $tot_logo_mat[i, j] * $tot_prob_mat[i, j] / 10000.0
1124
- end
1131
+ # calculating relative entropy for each amino acid pair H and
1132
+ # the expected score E in bit units
1133
+ #
1134
+ # I'm a bit suspicious about this part...
1135
+ tot_E = 0.0
1136
+ tot_H = 0.0
1137
+
1138
+ 0.upto($tot_logo_mat.shape[0] - 1) do |i|
1139
+ 0.upto($tot_logo_mat.shape[0] - 1) do |j|
1140
+ if i != j
1141
+ tot_E += $tot_logo_mat[i, j] * $aa_rel_freq[$amino_acids[i]] * $aa_rel_freq[$amino_acids[j]] / 2.0
1142
+ tot_H += $tot_logo_mat[i, j] * $tot_prob_mat[i, j] / 2.0 / 10000.0
1143
+ else
1144
+ tot_E += $tot_logo_mat[i, j] * $aa_rel_freq[$amino_acids[i]] * $aa_rel_freq[$amino_acids[j]]
1145
+ tot_H += $tot_logo_mat[i, j] * $tot_prob_mat[i, j] / 10000.0
1125
1146
  end
1126
1147
  end
1148
+ end
1127
1149
 
1128
- $outfh.puts <<HEADER
1150
+ $outfh.puts <<HEADER
1129
1151
  #
1130
1152
  # Shown here are logarithms of these values multiplied by #{$scale}/log(2)
1131
1153
  # rounded to the nearest integer (log-odds scores in 1/3 bit units).
@@ -1134,27 +1156,27 @@ HEADER
1134
1156
  #
1135
1157
  HEADER
1136
1158
 
1137
- grp_logo_mats.each_with_index do |arr, grp_no|
1138
- grp_label = arr[0]
1139
- grp_logo_mat = arr[1]
1159
+ grp_logo_mats.each_with_index do |arr, grp_no|
1160
+ grp_label = arr[0]
1161
+ grp_logo_mat = arr[1]
1140
1162
 
1141
- $outfh.puts ">#{grp_label} #{grp_no}"
1142
- if $cys
1143
- $outfh.puts grp_logo_mat.round.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids + %w[U])
1144
- else
1145
- $outfh.puts grp_logo_mat.round.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)
1146
- end
1147
- end
1148
-
1149
- $outfh.puts ">Total #{grp_logo_mats.size}"
1163
+ $outfh.puts ">#{grp_label} #{grp_no}"
1150
1164
  if $cys
1151
- $outfh.puts $tot_logo_mat.round.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids + %w[U])
1165
+ $outfh.puts grp_logo_mat.round.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids + %w[U])
1152
1166
  else
1153
- $outfh.puts $tot_logo_mat.round.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)
1167
+ $outfh.puts grp_logo_mat.round.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)
1154
1168
  end
1155
- $outfh.close
1156
- exit 0
1157
1169
  end
1170
+
1171
+ $outfh.puts ">Total #{grp_logo_mats.size}"
1172
+
1173
+ if $cys == 0
1174
+ $outfh.puts $tot_logo_mat.round.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids + %w[U])
1175
+ else
1176
+ $outfh.puts $tot_logo_mat.round.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)
1177
+ end
1178
+ $outfh.close
1179
+ exit 0
1158
1180
  end
1159
1181
  end
1160
1182
  end