egor 0.0.2 → 0.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/History.txt CHANGED
@@ -1,3 +1,9 @@
1
+ == 0.0.3 2008-12-09
2
+
3
+ * 2 major enhancements:
4
+ * Added an option '--cys (-y) 2' that does not distinguish J from C, so the 'disulphide bond' environment feature is no longer a prerequisite
5
+ * Masking works for target amino acid, too
6
+
1
7
  == 0.0.2 2008-11-13
2
8
 
3
9
  * 2 major enhancement:
data/README.rdoc CHANGED
@@ -2,10 +2,12 @@
2
2
 
3
3
  * http://egor.rubyforge.org
4
4
 
5
+
5
6
  == DESCRIPTION:
6
7
 
7
8
  egor: Esst GeneratOR, a program for calculating environment-specific substitution tables
8
9
 
10
+
9
11
  == FEATURES/PROBLEMS:
10
12
 
11
13
  * No more segmentation fault
@@ -14,34 +16,42 @@ egor: Esst GeneratOR, a program for calculating environment-specific substitutio
14
16
  * Full smoothing supported
15
17
  * In theory, infinite number of environment features can be handled
16
18
 
19
+
20
+ == INSTALL:
21
+
22
+ $ sudo gem install egor
23
+
24
+
17
25
  == BASIC USAGE:
18
26
 
19
27
  $ egor -l TEMLIST-file -c classdef.dat
20
28
  or
21
29
  $ egor -f TEM-file -c classdef.dat
22
30
 
31
+
23
32
  == OPTIONS:
24
- --tem-file (-f) STRING: a tem file
25
- --tem-list (-l) STRING: a list for tem files
26
- --classdef (-c) STRING: a file for the defintion of environments (default: 'classdef.dat')
27
- --outfile (-o) STRING: output filename ("allmat.dat" if not specified)
28
- --weight (-w) INTEGER: clustering level (PID) for the BLOSUM-like weighting (not supported yet)
33
+ --tem-file (-f) FILE: a tem file
34
+ --tem-list (-l) FILE: a list for tem files
35
+ --classdef (-c) FILE: a file for the definition of environments (default: 'classdef.dat')
36
+ --outfile (-o) FILE: output filename (default 'allmat.dat')
37
+ --weight (-w) INTEGER: clustering level (PID) for the BLOSUM-like weighting
29
38
  --noweight: calculate substitution counts with no weights (default)
30
39
  --smooth (-s) INTEGER:
31
40
  0 for partial smoothing (default)
32
41
  1 for full smoothing
33
42
  --nosmooth: perform no smoothing operation
34
- --cys (-y) INTEGER: (!!!not implemented yet!!!)
35
- 0 for using C and J only for structure
36
- 1 for both structure and sequence (default)
43
+ --cys (-y) INTEGER:
44
+ 0 for using C and J only for structure (default)
45
+ 1 for both structure and sequence
46
+ 2 for using only C for both
37
47
  --output INTEGER:
38
48
  0 for raw counts (no-smoothing performed)
39
49
  1 for probabilities
40
50
  2 for log-odds (default)
41
51
  --scale INTEGER: log-odds matrices in 1/n bit units (default 3)
42
52
  --sigma DOUBLE: change the sigma value for smoothing (default 5)
43
- --add DOUBLE: add this value to raw counts when deriving log-odds without smoothing (default 1/#classes)
44
- --penv: use environment-dependent frequencies for log-odds calculation (default false) (!!!not implemented yet!!!)
53
+ --add DOUBLE: add this value to raw counts when deriving log-odds without smoothing (default 1/#classes)
54
+ --penv: use environment-dependent frequencies for log-odds calculation (default false) (NOT implemented yet!!!)
45
55
  --pidmin DOUBLE: count substitutions only for pairs with PID equal to or greater than this value (default none)
46
56
  --pidmax DOUBLE: count substitutions only for pairs with PID smaller than this value (default none)
47
57
  --verbose (-v) INTEGER
@@ -52,17 +62,19 @@ egor: Esst GeneratOR, a program for calculating environment-specific substitutio
52
62
  --version: print version
53
63
  --help (-h): show help
54
64
 
65
+
55
66
  == REQUIREMENTS:
56
67
 
57
68
  * ruby 1.8.6 or above (http://www.ruby-lang.org)
58
69
  * rubygems 1.2.0 or above (http://rubyforge.org/projects/rubygems/)
70
+
71
+ The following RubyGems will be installed automatically if you have rubygems installed on your machine
72
+
59
73
  * narray (http://narray.rubyforge.org/)
60
74
  * facets (http://facets.rubyforge.org/)
61
75
  * bio (http://bioruby.open-bio.org/)
76
+ * simple_memoize (http://github.com/JackDanger/simple_memoize/tree/master)
62
77
 
63
- == INSTALL:
64
-
65
- $ sudo gem install egor
66
78
 
67
79
  == LICENSE:
68
80
 
data/egor.gemspec CHANGED
@@ -2,11 +2,12 @@
2
2
 
3
3
  Gem::Specification.new do |s|
4
4
  s.name = %q{egor}
5
- s.version = "0.0.1"
5
+ s.version = "0.0.3"
6
6
 
7
7
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
8
8
  s.authors = ["Semin Lee"]
9
- s.date = %q{2008-11-10}
9
+ s.cert_chain = ["/Users/semin/.gem/gem-public_cert.pem"]
10
+ s.date = %q{2008-12-09}
10
11
  s.default_executable = %q{egor}
11
12
  s.description = %q{egor: Esst GeneratOR, a program for calculating environment-specific substitution tables}
12
13
  s.email = ["seminlee@gmail.com"]
@@ -20,8 +21,9 @@ Gem::Specification.new do |s|
20
21
  s.require_paths = ["lib"]
21
22
  s.rubyforge_project = %q{egor}
22
23
  s.rubygems_version = %q{1.3.1}
24
+ s.signing_key = %q{/Users/semin/.gem/gem-private_key.pem}
23
25
  s.summary = %q{egor: Esst GeneratOR, a program for calculating environment-specific substitution tables}
24
- s.test_files = ["test/test_helper.rb", "test/test_egor.rb", "test/test_enumerable_extensions.rb", "test/test_environment_feature.rb", "test/test_nmatrix_extensions.rb", "test/test_egor_cli.rb"]
26
+ s.test_files = ["test/test_egor.rb", "test/test_egor_cli.rb", "test/test_enumerable_extensions.rb", "test/test_environment_feature.rb", "test/test_helper.rb", "test/test_nmatrix_extensions.rb"]
25
27
 
26
28
  if s.respond_to? :specification_version then
27
29
  current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
@@ -32,14 +34,14 @@ Gem::Specification.new do |s|
32
34
  s.add_runtime_dependency(%q<bio>, [">= 1.2.1"])
33
35
  s.add_runtime_dependency(%q<facets>, [">= 2.4.5"])
34
36
  s.add_runtime_dependency(%q<simple_memoize>, [">= 1.0.0"])
35
- s.add_development_dependency(%q<newgem>, [">= 1.0.7"])
37
+ s.add_development_dependency(%q<newgem>, [">= 1.1.0"])
36
38
  s.add_development_dependency(%q<hoe>, [">= 1.8.0"])
37
39
  else
38
40
  s.add_dependency(%q<narray>, [">= 0.5.9.5"])
39
41
  s.add_dependency(%q<bio>, [">= 1.2.1"])
40
42
  s.add_dependency(%q<facets>, [">= 2.4.5"])
41
43
  s.add_dependency(%q<simple_memoize>, [">= 1.0.0"])
42
- s.add_dependency(%q<newgem>, [">= 1.0.7"])
44
+ s.add_dependency(%q<newgem>, [">= 1.1.0"])
43
45
  s.add_dependency(%q<hoe>, [">= 1.8.0"])
44
46
  end
45
47
  else
@@ -47,7 +49,7 @@ Gem::Specification.new do |s|
47
49
  s.add_dependency(%q<bio>, [">= 1.2.1"])
48
50
  s.add_dependency(%q<facets>, [">= 2.4.5"])
49
51
  s.add_dependency(%q<simple_memoize>, [">= 1.0.0"])
50
- s.add_dependency(%q<newgem>, [">= 1.0.7"])
52
+ s.add_dependency(%q<newgem>, [">= 1.1.0"])
51
53
  s.add_dependency(%q<hoe>, [">= 1.8.0"])
52
54
  end
53
55
  end
data/lib/egor/cli.rb CHANGED
@@ -44,7 +44,7 @@ Options:
44
44
  --tem-file (-f) FILE: a tem file
45
45
  --tem-list (-l) FILE: a list for tem files
46
46
  --classdef (-c) FILE: a file for the defintion of environments (default: 'classdef.dat')
47
- --outfile (-o) FILE: output filename ("allmat.dat" if not specified)
47
+ --outfile (-o) FILE: output filename (default 'allmat.dat')
48
48
  --weight (-w) INTEGER: clustering level (PID) for the BLOSUM-like weighting
49
49
  --noweight: calculate substitution counts with no weights (default)
50
50
  --smooth (-s) INTEGER:
@@ -54,6 +54,7 @@ Options:
54
54
  --cys (-y) INTEGER:
55
55
  0 for using C and J only for structure (default)
56
56
  1 for both structure and sequence
57
+ 2 for using only C for both
57
58
  --output INTEGER:
58
59
  0 for raw counts (no-smoothing performed)
59
60
  1 for probabilities
@@ -152,12 +153,12 @@ Options:
152
153
  $cys = 0
153
154
  $penv = false
154
155
 
155
- $aa_tot_obs = {}
156
- $aa_mut_obs = {}
156
+ $aa_tot_obs = Hash.new(0)
157
+ $aa_mut_obs = Hash.new(0)
157
158
  $aa_mutb = {}
158
159
  $aa_rel_mutb = {}
159
160
  $aa_rel_freq = {}
160
- $env_aa_obs = {}
161
+ $env_aa_obs = Hash.new(0)
161
162
  $smooth_prob = {}
162
163
  $tot_freq_mat = nil
163
164
  $tot_prob_mat = nil
@@ -200,7 +201,7 @@ Options:
200
201
  when '--outfile'
201
202
  $outfile = arg
202
203
  when '--cys'
203
- $cys = (arg.to_i == 1 ? false : true)
204
+ $cys = arg.to_i
204
205
  when '--weight'
205
206
  $weight = arg.to_i
206
207
  when '--sigma'
@@ -255,10 +256,12 @@ Options:
255
256
  # Reading Environment Class Definition File
256
257
  #
257
258
 
259
+ # set amino_acids
260
+ $amino_acids = "ACDEFGHIKLMNPQRSTVWY".split("") if $cys == 2
261
+
258
262
  # an array for storing all environment feature objects
259
263
  $env_features = []
260
264
 
261
-
262
265
  # an array for storing indexes of constrained environment features
263
266
  $cst_features = []
264
267
 
@@ -310,7 +313,7 @@ Options:
310
313
  }.inject { |pro, lb|
311
314
  pro.product(lb)
312
315
  }.each_with_index { |e, i|
313
- $envs[e.flatten.join] = Environment.new(i, e.flatten.join)
316
+ $envs[e.flatten.join] = Environment.new(i, e.flatten.join, $amino_acids)
314
317
  }
315
318
 
316
319
  # Part 4.
@@ -322,291 +325,308 @@ Options:
322
325
  $outfh = File.open($outfile, "w")
323
326
 
324
327
  if $tem_file
325
- $tem_list = [$tem_file]
328
+ $tem_list_io = StringIO.new($tem_file)
326
329
  end
327
330
 
328
331
  if $tem_list
329
- IO.foreach($tem_list) do |tem_file|
330
- tem_file.chomp!
332
+ $tem_list_io = File.open($tem_list)
333
+ end
334
+
335
+ $tem_list_io.each_line do |tem_file|
336
+ tem_file.chomp!
337
+
338
+ $logger.info ">>> Analysing #{tem_file} ..."
339
+
340
+ ali = Bio::Alignment::OriginalAlignment.new
341
+ ff = Bio::FlatFile.auto(tem_file)
342
+ ff.each_entry do |pir|
343
+ if pir.definition == "sequence"
344
+ ali.add_seq(pir.data.gsub("\n", ""), pir.entry_id)
345
+ end
346
+ end
347
+
348
+ if ali.size < 2
349
+ $logger.warn "!!! Skipped #{tem_file}, there is only one 'sequence' entry"
350
+ next
351
+ end
331
352
 
332
- $logger.info ">>> Analysing #{tem_file} ..."
353
+ $ali_size += 1
354
+ env_labels = {}
355
+ disulphide = {}
333
356
 
334
- ali = Bio::Alignment::OriginalAlignment.new
335
- ff = Bio::FlatFile.auto(tem_file)
357
+ ali.each_pair do |key, seq|
358
+ # check disulphide bond environment first!
359
+ ff.rewind
336
360
  ff.each_entry do |pir|
337
- if pir.definition == "sequence"
338
- ali.add_seq(pir.data.gsub("\n", ""), pir.entry_id)
361
+ if (pir.entry_id == key) && (pir.definition == "disulphide")
362
+ disulphide[key] = pir.data.gsub("\n", "").split("")
339
363
  end
340
364
  end
341
365
 
342
- $ali_size += 1
343
- env_labels = {}
344
- disulphide = {}
366
+ $env_features.each_with_index do |ec, ei|
367
+ env_labels[key] = [] unless env_labels.has_key?(key)
345
368
 
346
- ali.each_pair do |key, seq|
347
- # check disulphide bond environment first!
348
369
  ff.rewind
349
370
  ff.each_entry do |pir|
350
- if (pir.entry_id == key) && (pir.definition == "disulphide")
351
- disulphide[key] = pir.data.gsub("\n", "").split("")
352
- end
353
- end
354
-
355
- $env_features.each_with_index do |ec, ei|
356
- env_labels[key] = [] unless env_labels.has_key?(key)
357
-
358
- ff.rewind
359
- ff.each_entry do |pir|
360
- if (pir.entry_id == key) && (pir.definition == ec.name)
361
- labels = pir.data.gsub("\n", "").split("").map_with_index do |sym, pos|
362
- if sym == "-"
363
- "-"
364
- elsif sym == "X" || sym == "x"
365
- "X"
371
+ if (pir.entry_id == key) && (pir.definition == ec.name)
372
+ labels = pir.data.gsub("\n", "").split("").map_with_index do |sym, pos|
373
+ if sym == "-"
374
+ "-"
375
+ elsif sym == "X" || sym == "x"
376
+ "X"
377
+ else
378
+ if ei == 0 # Amino Acid Environment Feature
379
+ (( disulphide.has_key?(key) and disulphide[key][pos] == "F") && (sym == "C")) ? "J" : sym
366
380
  else
367
- if ei == 0 # Amino Acid Environment Feature
368
- ((disulphide[key][pos] == "F") && (sym == "C")) ? "J" : sym
369
- else
370
- ec.labels[ec.symbols.index(sym)]
371
- end
381
+ ec.labels[ec.symbols.index(sym)]
372
382
  end
373
383
  end
384
+ end
374
385
 
375
- if env_labels[key].empty?
376
- env_labels[key] = labels
377
- else
378
- env_labels[key].each_with_index { |e, i| env_labels[key][i] = e + labels[i] }
379
- end
386
+ if env_labels[key].empty?
387
+ env_labels[key] = labels
388
+ else
389
+ env_labels[key].each_with_index { |e, i| env_labels[key][i] = e + labels[i] }
380
390
  end
381
391
  end
382
392
  end
383
393
  end
394
+ end
395
+
396
+ if $noweight
397
+ ali.each_pair do |id1, seq1|
398
+ ali.each_pair do |id2, seq2|
399
+ if id1 != id2
400
+ pid = calc_pid(seq1, seq2)
401
+ s1 = seq1.split("")
402
+ s2 = seq2.split("")
403
+
404
+ # check PID_MIN
405
+ if $pidmin && (pid < $pidmin)
406
+ $logger.info ">>> Skip alignment between #{id1} and #{id2} having PID, #{pid}% less than PID_MIN, #{$pidmin}"
407
+ next
408
+ end
384
409
 
385
- if $noweight
386
- ali.each_pair do |id1, seq1|
387
- ali.each_pair do |id2, seq2|
388
- if id1 != id2
389
- pid = calc_pid(seq1, seq2)
390
- s1 = seq1.split("")
391
- s2 = seq2.split("")
392
-
393
- # check PID_MIN
394
- if $pidmin && (pid < $pidmin)
395
- $logger.info ">>> Skip alignment between #{id1} and #{id2} having PID, #{pid}% less than PID_MIN, #{$pidmin}"
410
+ # check PID_MAX
411
+ if $pidmax && (pid > $pidmax)
412
+ $logger.info ">>> Skip alignment between #{id1} and #{id2} having PID, #{pid}% greater than PID_MAX, #{$pidmax}"
413
+ next
414
+ end
415
+
416
+ s1.each_with_index do |aa1, pos|
417
+ aa1.upcase!
418
+ aa2 = s2[pos].upcase
419
+
420
+ if env_labels[id1][pos].include?("X")
421
+ $logger.info ">>> Substitutions from #{id1}-#{pos}-#{aa1} were masked"
396
422
  next
397
423
  end
398
424
 
399
- # check PID_MAX
400
- if $pidmax && (pid > $pidmax)
401
- $logger.info ">>> Skip alignment between #{id1} and #{id2} having PID, #{pid}% greater than PID_MAX, #{$pidmax}"
425
+ if env_labels[id2][pos].include?("X")
426
+ $logger.info ">>> Substitutions to #{id2}-#{pos}-#{aa2} were masked"
402
427
  next
403
428
  end
404
429
 
405
- s1.each_with_index do |aa1, pos|
406
- if env_labels[id1][pos].include?("X")
407
- $logger.info ">>> Substitutions from #{id1}-#{pos}-#{aa1} were masked"
408
- next
409
- end
410
-
411
- aa1.upcase!
412
- aa2 = s2[pos].upcase
413
-
414
- if !$amino_acids.include?(aa1)
415
- $logger.warn "!!! #{id1}-#{pos}-#{aa1} is not a standard amino acid" unless aa1 == "-"
416
- next
417
- end
430
+ if !$amino_acids.include?(aa1)
431
+ $logger.warn "!!! #{id1}-#{pos}-#{aa1} is not a standard amino acid" unless aa1 == "-"
432
+ next
433
+ end
418
434
 
419
- if !$amino_acids.include?(aa2)
420
- $logger.warn "!!! #{id1}-#{pos}-#{aa2} is not a standard amino acid" unless aa2 == "-"
421
- next
422
- end
435
+ if !$amino_acids.include?(aa2)
436
+ $logger.warn "!!! #{id1}-#{pos}-#{aa2} is not a standard amino acid" unless aa2 == "-"
437
+ next
438
+ end
423
439
 
424
- aa1 = (((disulphide[id1][pos] == "F") && (aa1 == "C")) ? "J" : aa1)
425
- aa2 = (((disulphide[id2][pos] == "F") && (aa2 == "C")) ? "J" : aa2)
440
+ aa1 = (((disulphide.has_key?(id1) and disulphide[id1][pos] == "F") && (aa1 == "C")) ? "J" : aa1)
441
+ aa2 = (((disulphide.has_key?(id2) and disulphide[id2][pos] == "F") && (aa2 == "C")) ? "J" : aa2)
426
442
 
427
- if $cst_features.empty?
428
- $envs[env_labels[id1][pos]].increase_residue_count(aa2)
429
- elsif (env_labels[id1][pos].split("").values_at(*$cst_features) ==
430
- env_labels[id2][pos].split("").values_at(*$cst_features))
431
- $envs[env_labels[id1][pos]].increase_residue_count(aa2)
432
- else
433
- $logger.debug "*** #{id1}-#{pos}-#{aa1} and #{id2}-#{pos}-#{aa2} have different symbols for constrained environment features each other"
434
- next
435
- end
443
+ if $cst_features.empty?
444
+ $envs[env_labels[id1][pos]].increase_residue_count(aa2)
445
+ elsif (env_labels[id1][pos].split("").values_at(*$cst_features) == env_labels[id2][pos].split("").values_at(*$cst_features))
446
+ $envs[env_labels[id1][pos]].increase_residue_count(aa2)
447
+ else
448
+ $logger.debug "*** #{id1}-#{pos}-#{aa1} and #{id2}-#{pos}-#{aa2} have different symbols for constrained environment features each other"
449
+ next
450
+ end
436
451
 
437
- grp_label = env_labels[id1][pos][1..-1]
452
+ grp_label = env_labels[id1][pos][1..-1]
438
453
 
439
- if $env_aa_obs.has_key? grp_label
440
- if $env_aa_obs[grp_label].has_key? aa1
441
- $env_aa_obs[grp_label][aa1] += 1
442
- else
443
- $env_aa_obs[grp_label][aa1] = 1
444
- end
454
+ if $env_aa_obs.has_key? grp_label
455
+ if $env_aa_obs[grp_label].has_key? aa1
456
+ $env_aa_obs[grp_label][aa1] += 1
445
457
  else
446
- $env_aa_obs[grp_label] = Hash.new(0)
447
458
  $env_aa_obs[grp_label][aa1] = 1
448
459
  end
460
+ else
461
+ $env_aa_obs[grp_label] = Hash.new(0)
462
+ $env_aa_obs[grp_label][aa1] = 1
463
+ end
449
464
 
450
- if $aa_tot_obs.has_key? aa1
451
- $aa_tot_obs[aa1] += 1
452
- else
453
- $aa_tot_obs[aa1] = 1
454
- end
465
+ if $aa_tot_obs.has_key? aa1
466
+ $aa_tot_obs[aa1] += 1
467
+ else
468
+ $aa_tot_obs[aa1] = 1
469
+ end
455
470
 
456
- if aa1 != aa2
457
- if $aa_mut_obs.has_key? aa1
458
- $aa_mut_obs[aa1] += 1
459
- else
460
- $aa_mut_obs[aa1] = 1
461
- end
471
+ if aa1 != aa2
472
+ if $aa_mut_obs.has_key? aa1
473
+ $aa_mut_obs[aa1] += 1
474
+ else
475
+ $aa_mut_obs[aa1] = 1
462
476
  end
463
- $logger.debug "*** Add #{id1}-#{pos}-#{aa1} -> #{id2}-#{pos}-#{aa2} substituion for #{env_labels[id1][pos]}"
464
477
  end
478
+ $logger.debug "*** Add #{id1}-#{pos}-#{aa1} -> #{id2}-#{pos}-#{aa2} substitution for #{env_labels[id1][pos]}"
465
479
  end
466
480
  end
467
481
  end
468
- else
469
- # BLOSUM-like weighting
470
- clusters = []
471
- ali.each_pair { |i, s| clusters << [i] }
472
-
473
- # a loop for single linkage clustering
474
- begin
475
- continue = false
476
- 0.upto(clusters.size - 2) do |i|
477
- indexes = []
478
- (i + 1).upto(clusters.size - 1) do |j|
479
- found = false
480
- clusters[i].each do |c1|
481
- clusters[j].each do |c2|
482
- if calc_pid(ali[c1], ali[c2]) >= $weight
483
- indexes << j
484
- found = true
485
- break
486
- end
482
+ end
483
+ else
484
+ # BLOSUM-like weighting
485
+ clusters = []
486
+ ali.each_pair { |i, s| clusters << [i] }
487
+
488
+ # a loop for single linkage clustering
489
+ begin
490
+ continue = false
491
+ 0.upto(clusters.size - 2) do |i|
492
+ indexes = []
493
+ (i + 1).upto(clusters.size - 1) do |j|
494
+ found = false
495
+ clusters[i].each do |c1|
496
+ clusters[j].each do |c2|
497
+ if calc_pid(ali[c1], ali[c2]) >= $weight
498
+ indexes << j
499
+ found = true
500
+ break
487
501
  end
488
- break if found
489
502
  end
503
+ break if found
490
504
  end
505
+ end
491
506
 
492
- unless indexes.empty?
493
- continue = true
494
- group = clusters[i]
495
- indexes.each do |k|
496
- group = group.concat(clusters[k])
497
- clusters[k] = nil
498
- end
499
- clusters[i] = group
500
- clusters.compact!
507
+ unless indexes.empty?
508
+ continue = true
509
+ group = clusters[i]
510
+ indexes.each do |k|
511
+ group = group.concat(clusters[k])
512
+ clusters[k] = nil
501
513
  end
514
+ clusters[i] = group
515
+ clusters.compact!
502
516
  end
503
- end while(continue)
504
-
505
- clusters.combination(2).each do |cluster1, cluster2|
506
- cluster1.each do |id1|
507
- cluster2.each do |id2|
508
- seq1 = ali[id1].split("")
509
- seq2 = ali[id2].split("")
510
-
511
- seq1.each_with_index do |aa1, pos|
512
- if env_labels[id1][pos].include?("X")
513
- $logger.debug "*** Substitutions from #{id1}-#{pos}-#{aa1} were masked"
514
- next
515
- end
517
+ end
518
+ end while(continue)
516
519
 
517
- aa1.upcase!
518
- aa2 = seq2[pos].upcase
520
+ clusters.combination(2).each do |cluster1, cluster2|
521
+ cluster1.each do |id1|
522
+ cluster2.each do |id2|
523
+ seq1 = ali[id1].split("")
524
+ seq2 = ali[id2].split("")
519
525
 
520
- if !$amino_acids.include?(aa1)
521
- $logger.warn "!!! #{id1}-#{pos}-#{aa1} is not standard amino acid" unless aa1 == "-"
522
- next
523
- end
526
+ seq1.each_with_index do |aa1, pos|
527
+ aa1.upcase!
528
+ aa2 = seq2[pos].upcase rescue next # should fix this in sane way!
524
529
 
525
- if !$amino_acids.include?(aa2)
526
- $logger.warn "!!! #{id1}-#{pos}-#{aa2} is not standard amino acid" unless aa2 == "-"
527
- next
528
- end
530
+ if env_labels[id1][pos].include?("X")
531
+ $logger.debug "*** Substitutions from #{id1}-#{pos}-#{aa1} were masked"
532
+ next
533
+ end
529
534
 
530
- aa1 = (((disulphide[id1][pos] == "F") && (aa1 == "C")) ? "J" : aa1)
531
- aa2 = (((disulphide[id2][pos] == "F") && (aa2 == "C")) ? "J" : aa2)
532
- size1 = cluster1.size
533
- size2 = cluster2.size
534
- obs1 = 1.0 / size1
535
- obs2 = 1.0 / size2
536
-
537
- if $cst_features.empty?
538
- $envs[env_labels[id1][pos]].increase_residue_count(aa2, 1.0 / (size1 * size2))
539
- $envs[env_labels[id2][pos]].increase_residue_count(aa1, 1.0 / (size1 * size2))
540
- elsif (env_labels[id1][pos].split("").values_at(*$cst_features) ==
541
- env_labels[id2][pos].split("").values_at(*$cst_features))
542
- $envs[env_labels[id1][pos]].increase_residue_count(aa2, 1.0 / (size1 * size2))
543
- $envs[env_labels[id2][pos]].increase_residue_count(aa1, 1.0 / (size1 * size2))
544
- else
545
- $logger.debug "*** #{id1}-#{pos}-#{aa1} and #{id2}-#{pos}-#{aa2} have different symbols for constrained environment features each other"
546
- next
547
- end
535
+ if env_labels[id2][pos].include?("X")
536
+ $logger.debug "*** Substitutions to #{id2}-#{pos}-#{aa2} were masked"
537
+ next
538
+ end
539
+
540
+ if !$amino_acids.include?(aa1)
541
+ $logger.warn "!!! #{id1}-#{pos}-#{aa1} is not standard amino acid" unless aa1 == "-"
542
+ next
543
+ end
544
+
545
+ if !$amino_acids.include?(aa2)
546
+ $logger.warn "!!! #{id1}-#{pos}-#{aa2} is not standard amino acid" unless aa2 == "-"
547
+ next
548
+ end
549
+
550
+ aa1 = (((disulphide.has_key?(id1) and disulphide[id1][pos] == "F") && (aa1 == "C")) ? "J" : aa1)
551
+ aa2 = (((disulphide.has_key?(id2) and disulphide[id2][pos] == "F") && (aa2 == "C")) ? "J" : aa2)
552
+ size1 = cluster1.size
553
+ size2 = cluster2.size
554
+ obs1 = 1.0 / size1
555
+ obs2 = 1.0 / size2
556
+
557
+ if $cst_features.empty?
558
+ $envs[env_labels[id1][pos]].increase_residue_count(aa2, 1.0 / (size1 * size2))
559
+ $envs[env_labels[id2][pos]].increase_residue_count(aa1, 1.0 / (size1 * size2))
560
+ elsif (env_labels[id1][pos].split("").values_at(*$cst_features) == env_labels[id2][pos].split("").values_at(*$cst_features))
561
+ $envs[env_labels[id1][pos]].increase_residue_count(aa2, 1.0 / (size1 * size2))
562
+ $envs[env_labels[id2][pos]].increase_residue_count(aa1, 1.0 / (size1 * size2))
563
+ else
564
+ $logger.debug "*** #{id1}-#{pos}-#{aa1} and #{id2}-#{pos}-#{aa2} have different symbols for constrained environment features each other"
565
+ next
566
+ end
548
567
 
549
- grp_label1 = env_labels[id1][pos][1..-1]
550
- grp_label2 = env_labels[id2][pos][1..-1]
568
+ grp_label1 = env_labels[id1][pos][1..-1]
569
+ grp_label2 = env_labels[id2][pos][1..-1]
551
570
 
552
- if $env_aa_obs.has_key? grp_label1
553
- if $env_aa_obs[grp_label1].has_key? aa1
554
- $env_aa_obs[grp_label1][aa1] += obs1
555
- else
556
- $env_aa_obs[grp_label1][aa1] = obs1
557
- end
571
+ if $env_aa_obs.has_key? grp_label1
572
+ if $env_aa_obs[grp_label1].has_key? aa1
573
+ $env_aa_obs[grp_label1][aa1] += obs1
558
574
  else
559
- $env_aa_obs[grp_label1] = Hash.new(0.0)
560
575
  $env_aa_obs[grp_label1][aa1] = obs1
561
576
  end
577
+ else
578
+ $env_aa_obs[grp_label1] = Hash.new(0.0)
579
+ $env_aa_obs[grp_label1][aa1] = obs1
580
+ end
562
581
 
563
- if $env_aa_obs.has_key? grp_label2
564
- if $env_aa_obs[grp_label2].has_key? aa2
565
- $env_aa_obs[grp_label2][aa2] += obs2
566
- else
567
- $env_aa_obs[grp_label2][aa2] = obs2
568
- end
582
+ if $env_aa_obs.has_key? grp_label2
583
+ if $env_aa_obs[grp_label2].has_key? aa2
584
+ $env_aa_obs[grp_label2][aa2] += obs2
569
585
  else
570
- $env_aa_obs[grp_label2] = Hash.new(0.0)
571
586
  $env_aa_obs[grp_label2][aa2] = obs2
572
587
  end
588
+ else
589
+ $env_aa_obs[grp_label2] = Hash.new(0.0)
590
+ $env_aa_obs[grp_label2][aa2] = obs2
591
+ end
573
592
 
574
- if $aa_tot_obs.has_key? aa1
575
- $aa_tot_obs[aa1] += obs1
576
- else
577
- $aa_tot_obs[aa1] = obs1
578
- end
593
+ if $aa_tot_obs.has_key? aa1
594
+ $aa_tot_obs[aa1] += obs1
595
+ else
596
+ $aa_tot_obs[aa1] = obs1
597
+ end
579
598
 
580
- if $aa_tot_obs.has_key? aa2
581
- $aa_tot_obs[aa2] += obs2
599
+ if $aa_tot_obs.has_key? aa2
600
+ $aa_tot_obs[aa2] += obs2
601
+ else
602
+ $aa_tot_obs[aa2] = obs2
603
+ end
604
+
605
+ if aa1 != aa2
606
+ if $aa_mut_obs.has_key? aa1
607
+ $aa_mut_obs[aa1] += obs1
582
608
  else
583
- $aa_tot_obs[aa2] = obs2
609
+ $aa_mut_obs[aa1] = obs1
584
610
  end
585
-
586
- if aa1 != aa2
587
- if $aa_mut_obs.has_key? aa1
588
- $aa_mut_obs[aa1] += obs1
589
- else
590
- $aa_mut_obs[aa1] = obs1
591
- end
592
- if $aa_mut_obs.has_key? aa2
593
- $aa_mut_obs[aa2] += obs2
594
- else
595
- $aa_mut_obs[aa2] = obs2
596
- end
611
+ if $aa_mut_obs.has_key? aa2
612
+ $aa_mut_obs[aa2] += obs2
613
+ else
614
+ $aa_mut_obs[aa2] = obs2
597
615
  end
598
-
599
- $logger.debug "*** Add #{id1}-#{pos}-#{aa1} -> #{id2}-#{pos}-#{aa2} substituion for #{env_labels[id1][pos]}"
600
- $logger.debug "*** Add #{id2}-#{pos}-#{aa2} -> #{id1}-#{pos}-#{aa1} substituion for #{env_labels[id2][pos]}"
601
616
  end
617
+
618
+ $logger.debug "*** Add #{id1}-#{pos}-#{aa1} -> #{id2}-#{pos}-#{aa2} substitution for #{env_labels[id1][pos]}"
619
+ $logger.debug "*** Add #{id2}-#{pos}-#{aa2} -> #{id1}-#{pos}-#{aa1} substitution for #{env_labels[id2][pos]}"
602
620
  end
603
621
  end
604
622
  end
605
- end # if !$nosmooth
606
- end # IO.foreach($tem_list)
623
+ end
624
+ end # if !$nosmooth
625
+ end
607
626
 
608
- # print out default header
609
- $outfh.puts <<HEADER
627
+ # print out default header
628
+ $outfh.puts <<HEADER
629
+ #
610
630
  # Environment-specific amino acid substitution matrices
611
631
  # Creator: egor version #{Egor::VERSION}
612
632
  # Creation Date: #{Time.now.strftime("%d/%m/%Y %H:%M")}
@@ -616,9 +636,9 @@ Options:
616
636
  #
617
637
  HEADER
618
638
 
619
- $env_features[1..-1].each { |e| $outfh.puts "# #{e}" }
639
+ $env_features[1..-1].each { |e| $outfh.puts "# #{e}" }
620
640
 
621
- $outfh.puts <<HEADER
641
+ $outfh.puts <<HEADER
622
642
  #
623
643
  # (read in from #{$classdef})
624
644
  #
@@ -632,164 +652,164 @@ HEADER
632
652
  #
633
653
  HEADER
634
654
 
655
+ if $noweight
656
+ $outfh.puts "# Weighting scheme: none"
657
+ else
658
+ $outfh.puts "# Weighting scheme: clustering at PID #{$weight} level"
659
+ end
660
+
661
+ # calculate amino acid frequencies and mutabilities, and
662
+ # print them as default statistics in the header part
663
+ ala_factor = 100.0 * $aa_tot_obs["A"] / $aa_mut_obs["A"].to_f
664
+ $tot_aa = $aa_tot_obs.values.sum
665
+
666
+ $outfh.puts "#"
667
+ $outfh.puts "# Total amino acid frequencies:\n"
668
+ $outfh.puts "# %-3s %9s %9s %5s %8s %8s" % %w[RES TOT_OBS MUT_OBS MUTB REL_MUTB REL_FRQ]
669
+
670
+ $amino_acids.each do |res|
671
+ $aa_mutb[res] = $aa_mut_obs[res] / $aa_tot_obs[res].to_f
672
+ $aa_rel_mutb[res] = $aa_mutb[res] * ala_factor
673
+ $aa_rel_freq[res] = $aa_tot_obs[res] / $tot_aa.to_f
674
+ end
675
+
676
+ $amino_acids.each do |res|
635
677
  if $noweight
636
- $outfh.puts "# Weighting scheme: none"
678
+ $outfh.puts "# %-3s %9d %9d %5.2f %8d %8.4f" %
679
+ [res, $aa_tot_obs[res], $aa_mut_obs[res], $aa_mutb[res], $aa_rel_mutb[res], $aa_rel_freq[res]]
637
680
  else
638
- $outfh.puts "# Weighting scheme: clustering at PID #{$weight} level"
681
+ $outfh.puts "# %-3s %9.2f %9.2f %5.2f %8d %8.4f" %
682
+ [res, $aa_tot_obs[res], $aa_mut_obs[res], $aa_mutb[res], $aa_rel_mutb[res], $aa_rel_freq[res]]
639
683
  end
684
+ end
640
685
 
641
- # calculate amino acid frequencies and mutabilities, and
642
- # print them as default statistics in the header part
643
- ala_factor = 100.0 * $aa_tot_obs["A"] / $aa_mut_obs["A"].to_f
644
- $tot_aa = $aa_tot_obs.values.sum
645
686
 
646
- $outfh.puts "#"
647
- $outfh.puts "# Total amino acid frequencies:\n"
648
- $outfh.puts "# %-3s %9s %9s %5s %8s %8s" % %w[RES TOT_OBS MUT_OBS MUTB REL_MUTB REL_FRQ]
687
+ # Part 5.
688
+ #
689
+ # Calculating substitution frequency tables
690
+ #
649
691
 
650
- $aa_tot_obs.each_pair do |res, freq|
651
- $aa_mutb[res] = $aa_mut_obs[res] / freq.to_f
652
- $aa_rel_mutb[res] = $aa_mutb[res] * ala_factor
653
- $aa_rel_freq[res] = freq / $tot_aa.to_f
692
+ # calculating probabilities for each environment
693
+ $envs.values.each do |e|
694
+ if e.freq_array.sum != 0
695
+ e.prob_array = 100.0 * e.freq_array / e.freq_array.sum
654
696
  end
697
+ end
655
698
 
656
- $amino_acids.each do |res|
657
- if $noweight
658
- $outfh.puts "# %-3s %9d %9d %5.2f %8d %8.4f" %
659
- [res, $aa_tot_obs[res], $aa_mut_obs[res], $aa_mutb[res], $aa_rel_mutb[res], $aa_rel_freq[res]]
660
- else
661
- $outfh.puts "# %-3s %9.2f %9.2f %5.2f %8d %8.4f" %
662
- [res, $aa_tot_obs[res], $aa_mut_obs[res], $aa_mutb[res], $aa_rel_mutb[res], $aa_rel_freq[res]]
663
- end
664
- end
699
+ # count raw frequencies
700
+ $tot_freq_mat = ($noweight ? NMatrix.int($amino_acids.size,$amino_acids.size) : NMatrix.float($amino_acids.size,$amino_acids.size))
665
701
 
702
+ # for each combination of environment features
703
+ env_groups = $envs.values.group_by { |env| env.label[1..-1] }
666
704
 
667
- # Part 5.
668
- #
669
- # Calculating substitution frequency tables
670
- #
705
+ env_groups.to_a.sort_by { |env_group|
706
+ # a bit clumsy sorting here...
707
+ env_group[0].split("").map_with_index { |l, i|
708
+ $env_features[i + 1].labels.index(l)
709
+ }
710
+ }.each_with_index do |group, group_no|
711
+ grp_freq_mat = ($noweight ? NMatrix.int($amino_acids.size,$amino_acids.size) : NMatrix.float($amino_acids.size,$amino_acids.size))
671
712
 
672
- # calculating probabilities for each environment
673
- $envs.values.each do |e|
674
- if e.freq_array.sum != 0
675
- e.prob_array = 100.0 * e.freq_array / e.freq_array.sum
676
- end
713
+ $amino_acids.each_with_index do |aa, ai|
714
+ freq_array = group[1].find { |e| e.label.start_with?(aa) }.freq_array
715
+ 0.upto($amino_acids.size - 1) { |j| grp_freq_mat[ai, j] = freq_array[j] }
716
+ end
717
+
718
+ $tot_freq_mat += grp_freq_mat
719
+
720
+ if $output == 0
721
+ $outfh.puts ">#{group[0]} #{group_no}"
722
+ $outfh.puts grp_freq_mat.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)
677
723
  end
724
+ end
725
+
726
+ if $output == 0
727
+ $outfh.puts ">Total"
728
+ $outfh.puts $tot_freq_mat.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)
729
+ exit 0
730
+ end
731
+
678
732
 
679
- # count raw frequencies
680
- $tot_freq_mat = ($noweight ? NMatrix.int(21,21) : NMatrix.float(21,21))
733
+ # Part 6.
734
+ #
735
+ # Calculating substitution probability tables
736
+ #
737
+
738
+ if $output == 1
739
+ $outfh.puts <<HEADER
740
+ #
741
+ # Each column (j) represents the probability distribution for the
742
+ # likelihood of acceptance of a mutational event by a residue type j in
743
+ # a particular structural environment (specified after >) leading to
744
+ # any other residue type (i) and sums up to 100.
745
+ #
746
+ HEADER
747
+ end
748
+
749
+ if ($output > 0) && $nosmooth
750
+ # Probability matrices
751
+ $tot_prob_mat = NMatrix.float($amino_acids.size, $amino_acids.size)
681
752
 
682
753
  # for each combination of environment features
683
754
  env_groups = $envs.values.group_by { |env| env.label[1..-1] }
684
-
685
755
  env_groups.to_a.sort_by { |env_group|
686
756
  # a bit clumsy sorting here...
687
757
  env_group[0].split("").map_with_index { |l, i|
688
758
  $env_features[i + 1].labels.index(l)
689
759
  }
690
760
  }.each_with_index do |group, group_no|
691
- grp_freq_mat = ($noweight ? NMatrix.int(21,21) : NMatrix.float(21,21))
761
+ grp_prob_mat = NMatrix.float($amino_acids.size,$amino_acids.size)
692
762
 
693
763
  $amino_acids.each_with_index do |aa, ai|
694
- freq_array = group[1].find { |e| e.label.start_with?(aa) }.freq_array
695
- 0.upto(20) { |j| grp_freq_mat[ai, j] = freq_array[j] }
764
+ prob_array = group[1].find { |e| e.label.start_with?(aa) }.prob_array
765
+ 0.upto($amino_acids.size - 1) { |j| grp_prob_mat[ai, j] = prob_array[j] }
696
766
  end
697
767
 
698
- $tot_freq_mat += grp_freq_mat
768
+ $tot_prob_mat += grp_prob_mat
699
769
 
700
- if $output == 0
770
+ if ($output == 1)
701
771
  $outfh.puts ">#{group[0]} #{group_no}"
702
- $outfh.puts grp_freq_mat.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)
772
+ $outfh.puts grp_prob_mat.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)
703
773
  end
704
774
  end
705
775
 
706
- if $output == 0
776
+ if ($output == 1)
707
777
  $outfh.puts ">Total"
708
- $outfh.puts $tot_freq_mat.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)
778
+ $outfh.puts $tot_prob_mat.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)
779
+ $outfh.close
709
780
  exit 0
710
781
  end
782
+ end
711
783
 
712
-
713
- # Part 6.
784
+ # for smoothing...
785
+ if ($output > 0) && !$nosmooth
714
786
  #
715
- # Calculating substitution probability tables
787
+ # p1 probability
716
788
  #
717
-
718
- if $output == 1
719
- $outfh.puts <<HEADER
720
- #
721
- # Each column (j) represents the probability distribution for the
722
- # likelihood of acceptance of a mutational event by a residue type j in
723
- # a particular structural environment (specified after >) leading to
724
- # any other residue type (i) and sums up to 100.
725
- #
726
- HEADER
727
- end
728
-
729
- if ($output > 0) && $nosmooth
730
- # Probability matrices
731
- $tot_prob_mat = NMatrix.float(21, 21)
732
-
733
- # for each combination of environment features
734
- env_groups = $envs.values.group_by { |env| env.label[1..-1] }
735
- env_groups.to_a.sort_by { |env_group|
736
- # a bit clumsy sorting here...
737
- env_group[0].split("").map_with_index { |l, i|
738
- $env_features[i + 1].labels.index(l)
739
- }
740
- }.each_with_index do |group, group_no|
741
- grp_prob_mat = NMatrix.float(21,21)
742
-
743
- $amino_acids.each_with_index do |aa, ai|
744
- prob_array = group[1].find { |e| e.label.start_with?(aa) }.prob_array
745
- 0.upto(20) { |j| grp_prob_mat[ai, j] = prob_array[j] }
746
- end
747
-
748
- $tot_prob_mat += grp_prob_mat
749
-
750
- if ($output == 1)
751
- $outfh.puts ">#{group[0]} #{group_no}"
752
- $outfh.puts grp_prob_mat.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)
753
- end
754
- end
755
-
756
- if ($output == 1)
757
- $outfh.puts ">Total"
758
- $outfh.puts $tot_prob_mat.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)
759
- $outfh.close
760
- exit 0
761
- end
789
+ p1 = NArray.float($amino_acids.size)
790
+ a0 = NArray.float($amino_acids.size).fill(1.0 / $amino_acids.size)
791
+ big_N = $tot_aa.to_f
792
+ small_n = $amino_acids.size.to_f
793
+ omega1 = 1.0 / (1 + big_N / ($sigma * small_n))
794
+ omega2 = 1.0 - omega1
795
+
796
+ if $smooth == :partial
797
+ # for partial smoothing, p1 probability is not smoothed!
798
+ 0.upto($amino_acids.size - 1) { |i| p1[i] = 100.0 * $aa_rel_freq[$amino_acids[i]] }
799
+ $smooth_prob[1] = p1
800
+ else
801
+ # for full smoothing, p1 probability is smoothed
802
+ 0.upto($amino_acids.size - 1) { |i| p1[i] = 100.0 * (omega1 * a0[i] + omega2 * $aa_rel_freq[$amino_acids[i]]) }
803
+ $smooth_prob[1] = p1
762
804
  end
763
805
 
764
- # for smoothing...
765
- if ($output > 0) && !$nosmooth
766
- #
767
- # p1 probability
768
- #
769
- p1 = NArray.float(21)
770
- a0 = NArray.float(21).fill(1 / 21.0)
771
- big_N = $tot_aa.to_f
772
- small_n = 21.0
773
- omega1 = 1.0 / (1 + big_N / ($sigma * small_n))
774
- omega2 = 1.0 - omega1
775
-
776
- if $smooth == :partial
777
- # for partial smoothing, p1 probability is not smoothed!
778
- 0.upto(20) { |i| p1[i] = 100.0 * $aa_rel_freq[$amino_acids[i]] }
779
- $smooth_prob[1] = p1
780
- else
781
- # for full smoothing, p1 probability is smoothed
782
- 0.upto(20) { |i| p1[i] = 100.0 * (omega1 * a0[i] + omega2 * $aa_rel_freq[$amino_acids[i]]) }
783
- $smooth_prob[1] = p1
784
- end
785
-
786
- #
787
- # p2 and above
788
- #
789
- env_labels = $env_features.map_with_index {|ef, ei| ef.labels.map { |l| "#{ei}#{l}" } }
806
+ #
807
+ # p2 and above
808
+ #
809
+ env_labels = $env_features.map_with_index {|ef, ei| ef.labels.map { |l| "#{ei}#{l}" } }
790
810
 
791
- if $smooth == :partial
792
- $outfh.puts <<HEADER
811
+ if $smooth == :partial
812
+ $outfh.puts <<HEADER
793
813
  #
794
814
  # Partial Smoothing:
795
815
  #
@@ -813,106 +833,107 @@ HEADER
813
833
  # Weights (omegas) are calculated as in Topham et al. 1993)
814
834
  #
815
835
  # sigma value used is: 5.00
836
+ #
816
837
  HEADER
817
- 1.upto($env_features.size) do |ci|
818
- # for partial smoothing, only P1 ~ P3, and Pn are considered
819
- next if (ci > 2) && (ci < $env_features.size)
820
-
821
- env_labels.combination(ci) do |c1|
822
- Enumerable.cart_prod(*c1).each do |labels|
823
- pattern = "." * $env_features.size
824
-
825
- labels.each do |label|
826
- i = label[0].chr.to_i
827
- l = label[1].chr
828
- pattern[i] = l
829
- end
838
+ 1.upto($env_features.size) do |ci|
839
+ # for partial smoothing, only P1 ~ P3, and Pn are considered
840
+ next if (ci > 2) && (ci < $env_features.size)
841
+
842
+ env_labels.combination(ci) do |c1|
843
+ Enumerable.cart_prod(*c1).each do |labels|
844
+ pattern = "." * $env_features.size
845
+
846
+ labels.each do |label|
847
+ i = label[0].chr.to_i
848
+ l = label[1].chr
849
+ pattern[i] = l
850
+ end
830
851
 
831
- if pattern =~ /^\./
832
- $logger.debug "*** Skipped environment, #{pattern}, for partial smoothing"
833
- next
834
- end
852
+ if pattern =~ /^\./
853
+ $logger.debug "*** Skipped environment, #{pattern}, for partial smoothing"
854
+ next
855
+ end
835
856
 
836
- # get environmetns, frequencies, and probabilities
837
- envs = $envs.values.select { |env| env.label.match(pattern.to_re) }
838
- freq_arr = envs.inject(NArray.float(21)) { |sum, env| sum + env.freq_array }
839
- prob_arr = NArray.float(21)
840
- 0.upto(20) { |i| prob_arr[i] = (freq_arr[i] == 0 ? 0 : freq_arr[i] / freq_arr.sum.to_f) }
841
-
842
- # # assess whether a residue type j is compatible with a particular combination of structural features
843
- # # corrections for non-zero colum vector phenomenon by switching the smoothing procedure off as below
844
- # if ci == $env_features.size
845
- # aa_label = labels.find { |l| l.match(/^0/) }[1].chr
846
- # sub_pattern = "." * $env_features.size
847
- # sub_pattern[0] = aa_label
848
- # sub_freq_sum = 0
849
- #
850
- # labels[1..-1].each do |label|
851
- # next if label.start_with?("0")
852
- # i = label[0].chr.to_i
853
- # l = label[1].chr
854
- # sub_pattern[i] = l
855
- # sub_envs = $envs.values.select { |env| env.label.match(pattern.to_re) }
856
- # sub_freq_arr = sub_envs.inject(NArray.float(21)) { |sum, env| sum + env.freq_array }
857
- # sub_freq_sum += sub_freq_arr.sum
858
- # end
859
- #
860
- # if sub_freq_sum == 0
861
- # if $smooth_prob.has_key?(ci + 1)
862
- # $smooth_prob[ci + 1][labels.to_set] = prob_arr
863
- # else
864
- # $smooth_prob[ci + 1] = {}
865
- # $smooth_prob[ci + 1][labels.to_set] = prob_arr
866
- # end
867
- # $logger.warn "!!! Smoothing procedure is off for the environment feature combination, #{pattern}"
868
- # next
869
- # end
870
- # end
871
-
872
- # collect priors if ci > 1
873
- priors = []
874
-
875
- if ci == 2
876
- labels.combination(1).select { |c2| c2[0].start_with?("0") }.each { |c3|
877
- priors << $smooth_prob[2][c3.to_set]
878
- }
879
- elsif ci == $env_features.size
880
- labels.combination(2).select { |c2| c2[0].start_with?("0") || c2[1].start_with?("0") }.each { |c3|
881
- priors << $smooth_prob[3][c3.to_set]
882
- }
883
- end
857
+ # get environments, frequencies, and probabilities
858
+ envs = $envs.values.select { |env| env.label.match(pattern.to_re) }
859
+ freq_arr = envs.inject(NArray.float($amino_acids.size)) { |sum, env| sum + env.freq_array }
860
+ prob_arr = NArray.float($amino_acids.size)
861
+ 0.upto($amino_acids.size - 1) { |i| prob_arr[i] = (freq_arr[i] == 0 ? 0 : freq_arr[i] / freq_arr.sum.to_f) }
862
+
863
+ # # assess whether a residue type j is compatible with a particular combination of structural features
864
+ # # corrections for non-zero colum vector phenomenon by switching the smoothing procedure off as below
865
+ # if ci == $env_features.size
866
+ # aa_label = labels.find { |l| l.match(/^0/) }[1].chr
867
+ # sub_pattern = "." * $env_features.size
868
+ # sub_pattern[0] = aa_label
869
+ # sub_freq_sum = 0
870
+ #
871
+ # labels[1..-1].each do |label|
872
+ # next if label.start_with?("0")
873
+ # i = label[0].chr.to_i
874
+ # l = label[1].chr
875
+ # sub_pattern[i] = l
876
+ # sub_envs = $envs.values.select { |env| env.label.match(pattern.to_re) }
877
+ # sub_freq_arr = sub_envs.inject(NArray.float($amino_acids.size)) { |sum, env| sum + env.freq_array }
878
+ # sub_freq_sum += sub_freq_arr.sum
879
+ # end
880
+ #
881
+ # if sub_freq_sum == 0
882
+ # if $smooth_prob.has_key?(ci + 1)
883
+ # $smooth_prob[ci + 1][labels.to_set] = prob_arr
884
+ # else
885
+ # $smooth_prob[ci + 1] = {}
886
+ # $smooth_prob[ci + 1][labels.to_set] = prob_arr
887
+ # end
888
+ # $logger.warn "!!! Smoothing procedure is off for the environment feature combination, #{pattern}"
889
+ # next
890
+ # end
891
+ # end
892
+
893
+ # collect priors if ci > 1
894
+ priors = []
895
+
896
+ if ci == 2
897
+ labels.combination(1).select { |c2| c2[0].start_with?("0") }.each { |c3|
898
+ priors << $smooth_prob[2][c3.to_set]
899
+ }
900
+ elsif ci == $env_features.size
901
+ labels.combination(2).select { |c2| c2[0].start_with?("0") || c2[1].start_with?("0") }.each { |c3|
902
+ priors << $smooth_prob[3][c3.to_set]
903
+ }
904
+ end
884
905
 
885
- # entropy based weighting priors
886
- entropy_max = Math::log(21)
887
- entropies = priors.map { |prior| -1.0 * prior.to_a.inject(0.0) { |s, p| p == 0.0 ? s - 1 : s + p * Math::log(p) } }
888
- mod_entropies = entropies.map_with_index { |entropy, i| (entropy_max - entropies[i]) / entropy_max }
889
- weights = mod_entropies.map { |mod_entropy| mod_entropy / mod_entropies.sum }
890
- weighted_priors = priors.map_with_index { |prior, i| prior * weights[i] }.sum
891
-
892
- # smoothing step
893
- smooth_prob_arr = NArray.float(21)
894
- big_N = freq_arr.sum.to_f
895
- small_n = 21.0
896
- omega1 = 1.0 / (1 + big_N / ($sigma * small_n))
897
- omega2 = 1.0 - omega1
898
- 0.upto(20) { |i| smooth_prob_arr[i] = 100.0 * (omega1 * weighted_priors[i] + omega2 * prob_arr[i]) }
899
-
900
- # normalization step
901
- smooth_prob_arr_sum = smooth_prob_arr.sum
902
- 0.upto(20) { |i| smooth_prob_arr[i] = 100.0 * (smooth_prob_arr[i] / smooth_prob_arr_sum) }
903
-
904
- # store smoothed probabilties in a hash using a set of envrionment labels as a key
905
- if !$smooth_prob.has_key?(ci + 1)
906
- $smooth_prob[ci + 1] = {}
907
- $smooth_prob[ci + 1][labels.to_set] = smooth_prob_arr
908
- else
909
- $smooth_prob[ci + 1][labels.to_set] = smooth_prob_arr
910
- end
906
+ # entropy based weighting priors
907
+ entropy_max = Math::log($amino_acids.size)
908
+ entropies = priors.map { |prior| -1.0 * prior.to_a.inject(0.0) { |s, p| p == 0.0 ? s - 1 : s + p * Math::log(p) } }
909
+ mod_entropies = entropies.map_with_index { |entropy, i| (entropy_max - entropies[i]) / entropy_max }
910
+ weights = mod_entropies.map { |mod_entropy| mod_entropy / mod_entropies.sum }
911
+ weighted_priors = priors.map_with_index { |prior, i| prior * weights[i] }.sum
912
+
913
+ # smoothing step
914
+ smooth_prob_arr = NArray.float($amino_acids.size)
915
+ big_N = freq_arr.sum.to_f
916
+ small_n = $amino_acids.size.to_f
917
+ omega1 = 1.0 / (1 + big_N / ($sigma * small_n))
918
+ omega2 = 1.0 - omega1
919
+ 0.upto($amino_acids.size - 1) { |i| smooth_prob_arr[i] = 100.0 * (omega1 * weighted_priors[i] + omega2 * prob_arr[i]) }
920
+
921
+ # normalization step
922
+ smooth_prob_arr_sum = smooth_prob_arr.sum
923
+ 0.upto($amino_acids.size - 1) { |i| smooth_prob_arr[i] = 100.0 * (smooth_prob_arr[i] / smooth_prob_arr_sum) }
924
+
925
+ # store smoothed probabilties in a hash using a set of envrionment labels as a key
926
+ if !$smooth_prob.has_key?(ci + 1)
927
+ $smooth_prob[ci + 1] = {}
928
+ $smooth_prob[ci + 1][labels.to_set] = smooth_prob_arr
929
+ else
930
+ $smooth_prob[ci + 1][labels.to_set] = smooth_prob_arr
911
931
  end
912
932
  end
913
933
  end
914
- else
915
- $outfh.puts <<HEADER
934
+ end
935
+ else
936
+ $outfh.puts <<HEADER
916
937
  #
917
938
  # Full Smoothing:
918
939
  #
@@ -939,193 +960,194 @@ HEADER
939
960
  # Weights (omegas) are calculated as in Topham et al. 1993)
940
961
  #
941
962
  # sigma value used is: 5.00
963
+ #
942
964
  HEADER
943
- # full smooting
944
- 1.upto($env_features.size) do |ci|
945
- env_labels.combination(ci) do |c1|
946
- Enumerable.cart_prod(*c1).each do |labels|
947
- pattern = "." * $env_features.size
948
- labels.each do |label|
949
- j = label[0].chr.to_i
950
- l = label[1].chr
951
- pattern[j] = l
952
- end
965
+ # full smoothing
966
+ 1.upto($env_features.size) do |ci|
967
+ env_labels.combination(ci) do |c1|
968
+ Enumerable.cart_prod(*c1).each do |labels|
969
+ pattern = "." * $env_features.size
970
+ labels.each do |label|
971
+ j = label[0].chr.to_i
972
+ l = label[1].chr
973
+ pattern[j] = l
974
+ end
953
975
 
954
- # get environmetns, frequencies, and probabilities
955
- envs = $envs.values.select { |env| env.label.match(pattern.to_re) }
956
- freq_arr = envs.inject(NArray.float(21)) { |sum, env| sum + env.freq_array }
957
- prob_arr = NArray.float(21)
958
- 0.upto(20) { |i| prob_arr[i] = freq_arr[i] == 0 ? 0 : freq_arr[i] / freq_arr.sum.to_f }
976
+ # get environments, frequencies, and probabilities
977
+ envs = $envs.values.select { |env| env.label.match(pattern.to_re) }
978
+ freq_arr = envs.inject(NArray.float($amino_acids.size)) { |sum, env| sum + env.freq_array }
979
+ prob_arr = NArray.float($amino_acids.size)
980
+ 0.upto($amino_acids.size - 1) { |i| prob_arr[i] = freq_arr[i] == 0 ? 0 : freq_arr[i] / freq_arr.sum.to_f }
959
981
 
960
- # collect priors
961
- priors = []
962
- if ci > 1
963
- labels.combination(ci - 1).each { |c2| priors << $smooth_prob[ci][c2.to_set] }
964
- else
965
- priors << $smooth_prob[1]
966
- end
982
+ # collect priors
983
+ priors = []
984
+ if ci > 1
985
+ labels.combination(ci - 1).each { |c2| priors << $smooth_prob[ci][c2.to_set] }
986
+ else
987
+ priors << $smooth_prob[1]
988
+ end
967
989
 
968
- # entropy based weighting priors
969
- entropy_max = Math::log(21)
970
- entropies = priors.map do |prior|
971
- (entropy_max + prior.to_a.inject(0.0) { |s, p| s + p * Math::log(p) }) / entropy_max
972
- end
973
- weighted_priors = priors.map_with_index { |p, i| p * entropies[i] / entropies.sum }.sum
974
-
975
- # smoothing step
976
- smooth_prob_arr = NArray.float(21)
977
- big_N = freq_arr.sum.to_f
978
- small_n = 21.0
979
- omega1 = 1.0 / (1 + big_N / ($sigma * small_n))
980
- omega2 = 1.0 - omega1
981
- 0.upto(20) { |i| smooth_prob_arr[i] = 100.0 * (omega1 * weighted_priors[i] + omega2 * prob_arr[i]) }
982
-
983
- # normalization step
984
- smooth_prob_arr_sum = smooth_prob_arr.sum
985
- 0.upto(20) { |i| smooth_prob_arr[i] = 100.0 * (smooth_prob_arr[i] / smooth_prob_arr_sum) }
986
-
987
- # store smoothed probabilties in a hash using a set of envrionment labels as a key
988
- if !$smooth_prob.has_key?(ci + 1)
989
- $smooth_prob[ci + 1] = {}
990
- $smooth_prob[ci + 1][labels.to_set] = smooth_prob_arr
991
- else
992
- $smooth_prob[ci + 1][labels.to_set] = smooth_prob_arr
993
- end
990
+ # entropy based weighting priors
991
+ entropy_max = Math::log($amino_acids.size)
992
+ entropies = priors.map do |prior|
993
+ (entropy_max + prior.to_a.inject(0.0) { |s, p| s + p * Math::log(p) }) / entropy_max
994
+ end
995
+ weighted_priors = priors.map_with_index { |p, i| p * entropies[i] / entropies.sum }.sum
996
+
997
+ # smoothing step
998
+ smooth_prob_arr = NArray.float($amino_acids.size)
999
+ big_N = freq_arr.sum.to_f
1000
+ small_n = $amino_acids.size.to_f
1001
+ omega1 = 1.0 / (1 + big_N / ($sigma * small_n))
1002
+ omega2 = 1.0 - omega1
1003
+ 0.upto($amino_acids.size - 1) { |i| smooth_prob_arr[i] = 100.0 * (omega1 * weighted_priors[i] + omega2 * prob_arr[i]) }
1004
+
1005
+ # normalization step
1006
+ smooth_prob_arr_sum = smooth_prob_arr.sum
1007
+ 0.upto($amino_acids.size - 1) { |i| smooth_prob_arr[i] = 100.0 * (smooth_prob_arr[i] / smooth_prob_arr_sum) }
1008
+
1009
+ # store smoothed probabilties in a hash using a set of envrionment labels as a key
1010
+ if !$smooth_prob.has_key?(ci + 1)
1011
+ $smooth_prob[ci + 1] = {}
1012
+ $smooth_prob[ci + 1][labels.to_set] = smooth_prob_arr
1013
+ else
1014
+ $smooth_prob[ci + 1][labels.to_set] = smooth_prob_arr
994
1015
  end
995
1016
  end
996
1017
  end
997
1018
  end
1019
+ end
998
1020
 
999
- # updating smoothed probability array for each envrionment
1000
- $envs.values.each { |e| e.smooth_prob_array = $smooth_prob[$env_features.size + 1][e.label_set] }
1001
-
1002
- # for a total substitution probability matrix
1003
- $tot_prob_mat = NMatrix.float(21,21)
1004
-
1005
- # grouping environments by its environment labels but amino acid label
1006
- env_groups = $envs.values.group_by { |env| env.label[1..-1] }
1021
+ # updating smoothed probability array for each environment
1022
+ $envs.values.each { |e| e.smooth_prob_array = $smooth_prob[$env_features.size + 1][e.label_set] }
1007
1023
 
1008
- # sorting environments and build 21X21 substitution matrices
1009
- env_groups.to_a.sort_by { |env_group|
1010
- # a bit clumsy sorting here...
1011
- env_group[0].split("").map_with_index { |l, i|
1012
- $env_features[i + 1].labels.index(l)
1013
- }
1014
- }.each_with_index do |group, group_no|
1015
- # calculating 21X21 substitution probability matrix for each envrionment
1016
- grp_prob_mat = NMatrix.float(21,21)
1024
+ # for a total substitution probability matrix
1025
+ $tot_prob_mat = NMatrix.float($amino_acids.size,$amino_acids.size)
1017
1026
 
1018
- $amino_acids.each_with_index do |aa, ai|
1019
- smooth_prob_array = group[1].find { |e| e.label.start_with?(aa) }.smooth_prob_array
1020
- 0.upto(20) { |j| grp_prob_mat[ai, j] = smooth_prob_array[j] }
1021
- end
1027
+ # grouping environments by their environment labels, ignoring the leading amino acid label
1028
+ env_groups = $envs.values.group_by { |env| env.label[1..-1] }
1022
1029
 
1023
- $tot_prob_mat += grp_prob_mat
1030
+ # sorting environments and building NxN substitution matrices (N = $amino_acids.size)
1031
+ env_groups.to_a.sort_by { |env_group|
1032
+ # a bit clumsy sorting here...
1033
+ env_group[0].split("").map_with_index { |l, i|
1034
+ $env_features[i + 1].labels.index(l)
1035
+ }
1036
+ }.each_with_index do |group, group_no|
1037
+ # calculating NxN substitution probability matrix (N = $amino_acids.size) for each environment
1038
+ grp_prob_mat = NMatrix.float($amino_acids.size,$amino_acids.size)
1024
1039
 
1025
- if $output == 1
1026
- $outfh.puts ">#{group[0]} #{group_no}"
1027
- $outfh.puts grp_prob_mat.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)
1028
- end
1040
+ $amino_acids.each_with_index do |aa, ai|
1041
+ smooth_prob_array = group[1].find { |e| e.label.start_with?(aa) }.smooth_prob_array
1042
+ 0.upto($amino_acids.size - 1) { |j| grp_prob_mat[ai, j] = smooth_prob_array[j] }
1029
1043
  end
1030
1044
 
1031
- $tot_prob_mat /= env_groups.size
1045
+ $tot_prob_mat += grp_prob_mat
1032
1046
 
1033
1047
  if $output == 1
1034
- $outfh.puts ">Total"
1035
- $outfh.puts $tot_prob_mat.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)
1036
- $outfh.close
1037
- exit 0
1048
+ $outfh.puts ">#{group[0]} #{group_no}"
1049
+ $outfh.puts grp_prob_mat.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)
1038
1050
  end
1051
+ end
1039
1052
 
1053
+ $tot_prob_mat /= env_groups.size
1040
1054
 
1041
- # Part 7.
1042
- #
1043
- # Calculating log-add ratio scoring matrices
1044
- #
1045
- if $output == 2
1046
- $outfh.puts <<HEADER
1055
+ if $output == 1
1056
+ $outfh.puts ">Total"
1057
+ $outfh.puts $tot_prob_mat.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)
1058
+ $outfh.close
1059
+ exit 0
1060
+ end
1061
+
1062
+
1063
+ # Part 7.
1064
+ #
1065
+ # Calculating log odds ratio scoring matrices
1066
+ #
1067
+ if $output == 2
1068
+ $outfh.puts <<HEADER
1047
1069
  #
1048
1070
  # The probabilities were then divided by the background probabilities
1049
1071
  HEADER
1050
- if $penv
1051
- $outfh.puts <<HEADER
1072
+ if $penv
1073
+ $outfh.puts <<HEADER
1052
1074
  # which were derived from the environment-independent amino acid frequencies.
1053
1075
  # ^^^^^^^^^^^^^^^^^^^^^^^
1054
1076
  HEADER
1055
- else
1056
- $outfh.puts <<HEADER
1077
+ else
1078
+ $outfh.puts <<HEADER
1057
1079
  # which were derived from the environment-dependent amino acid frequencies.
1058
1080
  # ^^^^^^^^^^^^^^^^^^^^^
1059
1081
  HEADER
1060
- end
1082
+ end
1061
1083
 
1062
- $tot_logo_mat = $cys ? NMatrix.float(21,22) : NMatrix.float(21,21)
1063
- grp_logo_mats = []
1064
- factor = $scale / Math::log(2)
1065
-
1066
- # grouping environments by its environment labels but amino acid label
1067
- env_groups = $envs.values.group_by { |env| env.label[1..-1] }
1068
-
1069
- # sorting environments and build 21X21 substitution matrices
1070
- env_groups.to_a.sort_by { |env_group|
1071
- # a bit clumsy sorting here...
1072
- env_group[0].split("").map_with_index { |l, i|
1073
- $env_features[i + 1].labels.index(l)
1074
- }
1075
- }.each_with_index do |group, group_no|
1076
- # calculating 21X21 substitution probability matrix for each envrionment
1077
- grp_label = group[0]
1078
- grp_envs = group[1]
1079
- grp_logo_mat = $cys ? NMatrix.float(21, 22) : NMatrix.float(21,21)
1080
-
1081
- $amino_acids.each_with_index do |aa, ai|
1082
- env = grp_envs.detect { |e| e.label.start_with?(aa) }
1083
- logo_arr = $cys ? NArray.float(22) : NArray.float(21)
1084
-
1085
- env.smooth_prob_array.to_a.each_with_index do |prob, j|
1086
- paj = 100.0 * $aa_rel_freq[$amino_acids[j]]
1087
- odds = prob == 0.0 ? 0.000001 / paj : prob / paj
1088
- logo_arr[j] = factor * Math::log(odds)
1089
- end
1084
+ $tot_logo_mat = $cys == 0 ? NMatrix.float($amino_acids.size, $amino_acids.size + 1) : NMatrix.float($amino_acids.size, $amino_acids.size)
1085
+ grp_logo_mats = []
1086
+ factor = $scale / Math::log(2)
1090
1087
 
1091
- 0.upto(20) { |j| grp_logo_mat[ai, j] = logo_arr[j] }
1088
+ # grouping environments by their environment labels, ignoring the leading amino acid label
1089
+ env_groups = $envs.values.group_by { |env| env.label[1..-1] }
1092
1090
 
1093
- # adding log odds ratio for "U" (J or C) when --cyc is ON
1094
- if $cys
1095
- paj = 100.0 * ($aa_rel_freq["C"] + $aa_rel_freq["J"])
1096
- prob = env.smooth_prob_array[$amino_acids.index("C")] + env.smooth_prob_array[$amino_acids.index("J")]
1097
- odds = prob == 0.0 ? 0.000001 / paj : prob / paj
1098
- logo_arr[logo_arr.size - 1] = factor * Math::log(odds)
1099
- grp_logo_mat[ai, logo_arr.size - 1] = logo_arr[logo_arr.size - 1]
1100
- end
1091
+ # sorting environments and building log odds ratio matrices sized by $amino_acids.size
1092
+ env_groups.to_a.sort_by { |env_group|
1093
+ # a bit clumsy sorting here...
1094
+ env_group[0].split("").map_with_index { |l, i|
1095
+ $env_features[i + 1].labels.index(l)
1096
+ }
1097
+ }.each_with_index do |group, group_no|
1098
+ # calculating log odds ratio matrix for each environment (one extra "U" entry when $cys == 0)
1099
+ grp_label = group[0]
1100
+ grp_envs = group[1]
1101
+ grp_logo_mat = $cys == 0 ? NMatrix.float($amino_acids.size, $amino_acids.size + 1) : NMatrix.float($amino_acids.size, $amino_acids.size)
1102
+
1103
+ $amino_acids.each_with_index do |aa, ai|
1104
+ env = grp_envs.detect { |e| e.label.start_with?(aa) }
1105
+ logo_arr = $cys == 0 ? NArray.float($amino_acids.size + 1) : NArray.float($amino_acids.size)
1106
+
1107
+ env.smooth_prob_array.to_a.each_with_index do |prob, j|
1108
+ paj = 100.0 * $aa_rel_freq[$amino_acids[j]]
1109
+ odds = prob == 0.0 ? 0.000001 / paj : prob / paj
1110
+ logo_arr[j] = factor * Math::log(odds)
1101
1111
  end
1102
1112
 
1103
- $tot_logo_mat += grp_logo_mat
1104
- grp_logo_mats << [grp_label, grp_logo_mat]
1113
+ 0.upto($amino_acids.size - 1) { |j| grp_logo_mat[ai, j] = logo_arr[j] }
1114
+
1115
+ # adding log odds ratio for "U" (J or C) when --cys is 0
1116
+ if $cys == 0
1117
+ paj = 100.0 * ($aa_rel_freq["C"] + $aa_rel_freq["J"])
1118
+ prob = env.smooth_prob_array[$amino_acids.index("C")] + env.smooth_prob_array[$amino_acids.index("J")]
1119
+ odds = prob == 0.0 ? 0.000001 / paj : prob / paj
1120
+ logo_arr[logo_arr.size - 1] = factor * Math::log(odds)
1121
+ grp_logo_mat[ai, logo_arr.size - 1] = logo_arr[logo_arr.size - 1]
1122
+ end
1105
1123
  end
1106
1124
 
1107
- $tot_logo_mat /= env_groups.size
1125
+ $tot_logo_mat += grp_logo_mat
1126
+ grp_logo_mats << [grp_label, grp_logo_mat]
1127
+ end
1108
1128
 
1109
- # calculating relative entropy for each amino acid pair H and
1110
- # the expected score E in bit units
1111
- #
1112
- # I'm a bit suspicious about this part...
1113
- tot_E = 0.0
1114
- tot_H = 0.0
1129
+ $tot_logo_mat /= env_groups.size
1115
1130
 
1116
- 0.upto($tot_logo_mat.shape[0] - 1) do |i|
1117
- 0.upto($tot_logo_mat.shape[0] - 1) do |j|
1118
- if i != j
1119
- tot_E += $tot_logo_mat[i, j] * $aa_rel_freq[$amino_acids[i]] * $aa_rel_freq[$amino_acids[j]] / 2.0
1120
- tot_H += $tot_logo_mat[i, j] * $tot_prob_mat[i, j] / 2.0 / 10000.0
1121
- else
1122
- tot_E += $tot_logo_mat[i, j] * $aa_rel_freq[$amino_acids[i]] * $aa_rel_freq[$amino_acids[j]]
1123
- tot_H += $tot_logo_mat[i, j] * $tot_prob_mat[i, j] / 10000.0
1124
- end
1131
+ # calculating relative entropy for each amino acid pair H and
1132
+ # the expected score E in bit units
1133
+ #
1134
+ # I'm a bit suspicious about this part...
1135
+ tot_E = 0.0
1136
+ tot_H = 0.0
1137
+
1138
+ 0.upto($tot_logo_mat.shape[0] - 1) do |i|
1139
+ 0.upto($tot_logo_mat.shape[0] - 1) do |j|
1140
+ if i != j
1141
+ tot_E += $tot_logo_mat[i, j] * $aa_rel_freq[$amino_acids[i]] * $aa_rel_freq[$amino_acids[j]] / 2.0
1142
+ tot_H += $tot_logo_mat[i, j] * $tot_prob_mat[i, j] / 2.0 / 10000.0
1143
+ else
1144
+ tot_E += $tot_logo_mat[i, j] * $aa_rel_freq[$amino_acids[i]] * $aa_rel_freq[$amino_acids[j]]
1145
+ tot_H += $tot_logo_mat[i, j] * $tot_prob_mat[i, j] / 10000.0
1125
1146
  end
1126
1147
  end
1148
+ end
1127
1149
 
1128
- $outfh.puts <<HEADER
1150
+ $outfh.puts <<HEADER
1129
1151
  #
1130
1152
  # Shown here are logarithms of these values multiplied by #{$scale}/log(2)
1131
1153
  # rounded to the nearest integer (log-odds scores in 1/3 bit units).
@@ -1134,27 +1156,27 @@ HEADER
1134
1156
  #
1135
1157
  HEADER
1136
1158
 
1137
- grp_logo_mats.each_with_index do |arr, grp_no|
1138
- grp_label = arr[0]
1139
- grp_logo_mat = arr[1]
1159
+ grp_logo_mats.each_with_index do |arr, grp_no|
1160
+ grp_label = arr[0]
1161
+ grp_logo_mat = arr[1]
1140
1162
 
1141
- $outfh.puts ">#{grp_label} #{grp_no}"
1142
- if $cys
1143
- $outfh.puts grp_logo_mat.round.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids + %w[U])
1144
- else
1145
- $outfh.puts grp_logo_mat.round.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)
1146
- end
1147
- end
1148
-
1149
- $outfh.puts ">Total #{grp_logo_mats.size}"
1163
+ $outfh.puts ">#{grp_label} #{grp_no}"
1150
1164
  if $cys
1151
- $outfh.puts $tot_logo_mat.round.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids + %w[U])
1165
+ $outfh.puts grp_logo_mat.round.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids + %w[U])
1152
1166
  else
1153
- $outfh.puts $tot_logo_mat.round.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)
1167
+ $outfh.puts grp_logo_mat.round.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)
1154
1168
  end
1155
- $outfh.close
1156
- exit 0
1157
1169
  end
1170
+
1171
+ $outfh.puts ">Total #{grp_logo_mats.size}"
1172
+
1173
+ if $cys == 0
1174
+ $outfh.puts $tot_logo_mat.round.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids + %w[U])
1175
+ else
1176
+ $outfh.puts $tot_logo_mat.round.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)
1177
+ end
1178
+ $outfh.close
1179
+ exit 0
1158
1180
  end
1159
1181
  end
1160
1182
  end