egor 0.0.2 → 0.0.3
Sign up to get free protection for your applications and to get access to all the features.
- data/History.txt +6 -0
- data/README.rdoc +25 -13
- data/egor.gemspec +8 -6
- data/lib/egor/cli.rb +618 -596
- data/lib/egor.rb +1 -1
- data/lib/environment.rb +9 -9
- data/website/index.html +5 -5
- data/website/index.txt +8 -6
- data.tar.gz.sig +0 -0
- metadata +26 -5
- metadata.gz.sig +1 -0
data/History.txt
CHANGED
@@ -1,3 +1,9 @@
|
|
1
|
+
== 0.0.3 2008-12-09
|
2
|
+
|
3
|
+
* 2 major enhancements:
|
4
|
+
* Added an option '--cys (-y) 2' that does not distinguish J from C, so the 'disulphide bond' environment feature is no longer a prerequisite
|
5
|
+
* Masking works for target amino acid, too
|
6
|
+
|
1
7
|
== 0.0.2 2008-11-13
|
2
8
|
|
3
9
|
* 2 major enhancements:
|
data/README.rdoc
CHANGED
@@ -2,10 +2,12 @@
|
|
2
2
|
|
3
3
|
* http://egor.rubyforge.org
|
4
4
|
|
5
|
+
|
5
6
|
== DESCRIPTION:
|
6
7
|
|
7
8
|
egor: Esst GeneratOR, a program for calculating environment-specific substitution tables
|
8
9
|
|
10
|
+
|
9
11
|
== FEATURES/PROBLEMS:
|
10
12
|
|
11
13
|
* No more segmentation fault
|
@@ -14,34 +16,42 @@ egor: Esst GeneratOR, a program for calculating environment-specific substitutio
|
|
14
16
|
* Full smoothing supported
|
15
17
|
* In theory, infinite number of environment features can be handled
|
16
18
|
|
19
|
+
|
20
|
+
== INSTALL:
|
21
|
+
|
22
|
+
$ sudo gem install egor
|
23
|
+
|
24
|
+
|
17
25
|
== BASIC USAGE:
|
18
26
|
|
19
27
|
$ egor -l TEMLIST-file -c classdef.dat
|
20
28
|
or
|
21
29
|
$ egor -f TEM-file -c classdef.dat
|
22
30
|
|
31
|
+
|
23
32
|
== OPTIONS:
|
24
|
-
--tem-file (-f)
|
25
|
-
--tem-list (-l)
|
26
|
-
--classdef (-c)
|
27
|
-
--outfile (-o)
|
28
|
-
--weight (-w) INTEGER: clustering level (PID) for the BLOSUM-like weighting
|
33
|
+
--tem-file (-f) FILE: a tem file
|
34
|
+
--tem-list (-l) FILE: a list for tem files
|
35
|
+
--classdef (-c) FILE: a file for the definition of environments (default: 'classdef.dat')
|
36
|
+
--outfile (-o) FILE: output filename (default 'allmat.dat')
|
37
|
+
--weight (-w) INTEGER: clustering level (PID) for the BLOSUM-like weighting
|
29
38
|
--noweight: calculate substitution counts with no weights (default)
|
30
39
|
--smooth (-s) INTEGER:
|
31
40
|
0 for partial smoothing (default)
|
32
41
|
1 for full smoothing
|
33
42
|
--nosmooth: perform no smoothing operation
|
34
|
-
--cys (-y) INTEGER:
|
35
|
-
0 for using C and J only for structure
|
36
|
-
1 for both structure and sequence
|
43
|
+
--cys (-y) INTEGER:
|
44
|
+
0 for using C and J only for structure (default)
|
45
|
+
1 for both structure and sequence
|
46
|
+
2 for using only C for both
|
37
47
|
--output INTEGER:
|
38
48
|
0 for raw counts (no-smoothing performed)
|
39
49
|
1 for probabilities
|
40
50
|
2 for log-odds (default)
|
41
51
|
--scale INTEGER: log-odds matrices in 1/n bit units (default 3)
|
42
52
|
--sigma DOUBLE: change the sigma value for smoothing (default 5)
|
43
|
-
--add DOUBLE: add this value to raw counts when deriving log-odds without smoothing (default 1
|
44
|
-
--penv: use environment-dependent frequencies for log-odds calculation (default false) (
|
53
|
+
--add DOUBLE: add this value to raw counts when deriving log-odds without smoothing (default 1/=classes)
|
54
|
+
--penv: use environment-dependent frequencies for log-odds calculation (default false) (NOT implemented yet!!!)
|
45
55
|
--pidmin DOUBLE: count substitutions only for pairs with PID equal to or greater than this value (default none)
|
46
56
|
--pidmax DOUBLE: count substitutions only for pairs with PID smaller than this value (default none)
|
47
57
|
--verbose (-v) INTEGER
|
@@ -52,17 +62,19 @@ egor: Esst GeneratOR, a program for calculating environment-specific substitutio
|
|
52
62
|
--version: print version
|
53
63
|
--help (-h): show help
|
54
64
|
|
65
|
+
|
55
66
|
== REQUIREMENTS:
|
56
67
|
|
57
68
|
* ruby 1.8.6 or above (http://www.ruby-lang.org)
|
58
69
|
* rubygems 1.2.0 or above (http://rubyforge.org/projects/rubygems/)
|
70
|
+
|
71
|
+
The following RubyGems will be installed automatically if you have rubygems installed on your machine
|
72
|
+
|
59
73
|
* narray (http://narray.rubyforge.org/)
|
60
74
|
* facets (http://facets.rubyforge.org/)
|
61
75
|
* bio (http://bioruby.open-bio.org/)
|
76
|
+
* simple_memoize (http://github.com/JackDanger/simple_memoize/tree/master)
|
62
77
|
|
63
|
-
== INSTALL:
|
64
|
-
|
65
|
-
$ sudo gem install egor
|
66
78
|
|
67
79
|
== LICENSE:
|
68
80
|
|
data/egor.gemspec
CHANGED
@@ -2,11 +2,12 @@
|
|
2
2
|
|
3
3
|
Gem::Specification.new do |s|
|
4
4
|
s.name = %q{egor}
|
5
|
-
s.version = "0.0.
|
5
|
+
s.version = "0.0.3"
|
6
6
|
|
7
7
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
8
8
|
s.authors = ["Semin Lee"]
|
9
|
-
s.
|
9
|
+
s.cert_chain = ["/Users/semin/.gem/gem-public_cert.pem"]
|
10
|
+
s.date = %q{2008-12-09}
|
10
11
|
s.default_executable = %q{egor}
|
11
12
|
s.description = %q{egor: Esst GeneratOR, a program for calculating environment-specific substitution tables}
|
12
13
|
s.email = ["seminlee@gmail.com"]
|
@@ -20,8 +21,9 @@ Gem::Specification.new do |s|
|
|
20
21
|
s.require_paths = ["lib"]
|
21
22
|
s.rubyforge_project = %q{egor}
|
22
23
|
s.rubygems_version = %q{1.3.1}
|
24
|
+
s.signing_key = %q{/Users/semin/.gem/gem-private_key.pem}
|
23
25
|
s.summary = %q{egor: Esst GeneratOR, a program for calculating environment-specific substitution tables}
|
24
|
-
s.test_files = ["test/
|
26
|
+
s.test_files = ["test/test_egor.rb", "test/test_egor_cli.rb", "test/test_enumerable_extensions.rb", "test/test_environment_feature.rb", "test/test_helper.rb", "test/test_nmatrix_extensions.rb"]
|
25
27
|
|
26
28
|
if s.respond_to? :specification_version then
|
27
29
|
current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
|
@@ -32,14 +34,14 @@ Gem::Specification.new do |s|
|
|
32
34
|
s.add_runtime_dependency(%q<bio>, [">= 1.2.1"])
|
33
35
|
s.add_runtime_dependency(%q<facets>, [">= 2.4.5"])
|
34
36
|
s.add_runtime_dependency(%q<simple_memoize>, [">= 1.0.0"])
|
35
|
-
s.add_development_dependency(%q<newgem>, [">= 1.0
|
37
|
+
s.add_development_dependency(%q<newgem>, [">= 1.1.0"])
|
36
38
|
s.add_development_dependency(%q<hoe>, [">= 1.8.0"])
|
37
39
|
else
|
38
40
|
s.add_dependency(%q<narray>, [">= 0.5.9.5"])
|
39
41
|
s.add_dependency(%q<bio>, [">= 1.2.1"])
|
40
42
|
s.add_dependency(%q<facets>, [">= 2.4.5"])
|
41
43
|
s.add_dependency(%q<simple_memoize>, [">= 1.0.0"])
|
42
|
-
s.add_dependency(%q<newgem>, [">= 1.0
|
44
|
+
s.add_dependency(%q<newgem>, [">= 1.1.0"])
|
43
45
|
s.add_dependency(%q<hoe>, [">= 1.8.0"])
|
44
46
|
end
|
45
47
|
else
|
@@ -47,7 +49,7 @@ Gem::Specification.new do |s|
|
|
47
49
|
s.add_dependency(%q<bio>, [">= 1.2.1"])
|
48
50
|
s.add_dependency(%q<facets>, [">= 2.4.5"])
|
49
51
|
s.add_dependency(%q<simple_memoize>, [">= 1.0.0"])
|
50
|
-
s.add_dependency(%q<newgem>, [">= 1.0
|
52
|
+
s.add_dependency(%q<newgem>, [">= 1.1.0"])
|
51
53
|
s.add_dependency(%q<hoe>, [">= 1.8.0"])
|
52
54
|
end
|
53
55
|
end
|
data/lib/egor/cli.rb
CHANGED
@@ -44,7 +44,7 @@ Options:
|
|
44
44
|
--tem-file (-f) FILE: a tem file
|
45
45
|
--tem-list (-l) FILE: a list for tem files
|
46
46
|
--classdef (-c) FILE: a file for the defintion of environments (default: 'classdef.dat')
|
47
|
-
--outfile (-o) FILE: output filename (
|
47
|
+
--outfile (-o) FILE: output filename (default 'allmat.dat')
|
48
48
|
--weight (-w) INTEGER: clustering level (PID) for the BLOSUM-like weighting
|
49
49
|
--noweight: calculate substitution counts with no weights (default)
|
50
50
|
--smooth (-s) INTEGER:
|
@@ -54,6 +54,7 @@ Options:
|
|
54
54
|
--cys (-y) INTEGER:
|
55
55
|
0 for using C and J only for structure (default)
|
56
56
|
1 for both structure and sequence
|
57
|
+
2 for using only C for both
|
57
58
|
--output INTEGER:
|
58
59
|
0 for raw counts (no-smoothing performed)
|
59
60
|
1 for probabilities
|
@@ -152,12 +153,12 @@ Options:
|
|
152
153
|
$cys = 0
|
153
154
|
$penv = false
|
154
155
|
|
155
|
-
$aa_tot_obs =
|
156
|
-
$aa_mut_obs =
|
156
|
+
$aa_tot_obs = Hash.new(0)
|
157
|
+
$aa_mut_obs = Hash.new(0)
|
157
158
|
$aa_mutb = {}
|
158
159
|
$aa_rel_mutb = {}
|
159
160
|
$aa_rel_freq = {}
|
160
|
-
$env_aa_obs =
|
161
|
+
$env_aa_obs = Hash.new(0)
|
161
162
|
$smooth_prob = {}
|
162
163
|
$tot_freq_mat = nil
|
163
164
|
$tot_prob_mat = nil
|
@@ -200,7 +201,7 @@ Options:
|
|
200
201
|
when '--outfile'
|
201
202
|
$outfile = arg
|
202
203
|
when '--cys'
|
203
|
-
$cys =
|
204
|
+
$cys = arg.to_i
|
204
205
|
when '--weight'
|
205
206
|
$weight = arg.to_i
|
206
207
|
when '--sigma'
|
@@ -255,10 +256,12 @@ Options:
|
|
255
256
|
# Reading Environment Class Definition File
|
256
257
|
#
|
257
258
|
|
259
|
+
# set amino_acids
|
260
|
+
$amino_acids = "ACDEFGHIKLMNPQRSTVWY".split("") if $cys == 2
|
261
|
+
|
258
262
|
# an array for storing all environment feature objects
|
259
263
|
$env_features = []
|
260
264
|
|
261
|
-
|
262
265
|
# an array for storing indexes of constrained environment features
|
263
266
|
$cst_features = []
|
264
267
|
|
@@ -310,7 +313,7 @@ Options:
|
|
310
313
|
}.inject { |pro, lb|
|
311
314
|
pro.product(lb)
|
312
315
|
}.each_with_index { |e, i|
|
313
|
-
$envs[e.flatten.join] = Environment.new(i, e.flatten.join)
|
316
|
+
$envs[e.flatten.join] = Environment.new(i, e.flatten.join, $amino_acids)
|
314
317
|
}
|
315
318
|
|
316
319
|
# Part 4.
|
@@ -322,291 +325,308 @@ Options:
|
|
322
325
|
$outfh = File.open($outfile, "w")
|
323
326
|
|
324
327
|
if $tem_file
|
325
|
-
$
|
328
|
+
$tem_list_io = StringIO.new($tem_file)
|
326
329
|
end
|
327
330
|
|
328
331
|
if $tem_list
|
329
|
-
|
330
|
-
|
332
|
+
$tem_list_io = File.open($tem_list)
|
333
|
+
end
|
334
|
+
|
335
|
+
$tem_list_io.each_line do |tem_file|
|
336
|
+
tem_file.chomp!
|
337
|
+
|
338
|
+
$logger.info ">>> Analysing #{tem_file} ..."
|
339
|
+
|
340
|
+
ali = Bio::Alignment::OriginalAlignment.new
|
341
|
+
ff = Bio::FlatFile.auto(tem_file)
|
342
|
+
ff.each_entry do |pir|
|
343
|
+
if pir.definition == "sequence"
|
344
|
+
ali.add_seq(pir.data.gsub("\n", ""), pir.entry_id)
|
345
|
+
end
|
346
|
+
end
|
347
|
+
|
348
|
+
if ali.size < 2
|
349
|
+
$logger.warn "!!! Skipped #{tem_file}, there is only one 'sequence' entry"
|
350
|
+
next
|
351
|
+
end
|
331
352
|
|
332
|
-
|
353
|
+
$ali_size += 1
|
354
|
+
env_labels = {}
|
355
|
+
disulphide = {}
|
333
356
|
|
334
|
-
|
335
|
-
|
357
|
+
ali.each_pair do |key, seq|
|
358
|
+
# check disulphide bond environment first!
|
359
|
+
ff.rewind
|
336
360
|
ff.each_entry do |pir|
|
337
|
-
if pir.definition == "
|
338
|
-
|
361
|
+
if (pir.entry_id == key) && (pir.definition == "disulphide")
|
362
|
+
disulphide[key] = pir.data.gsub("\n", "").split("")
|
339
363
|
end
|
340
364
|
end
|
341
365
|
|
342
|
-
$
|
343
|
-
|
344
|
-
disulphide = {}
|
366
|
+
$env_features.each_with_index do |ec, ei|
|
367
|
+
env_labels[key] = [] unless env_labels.has_key?(key)
|
345
368
|
|
346
|
-
ali.each_pair do |key, seq|
|
347
|
-
# check disulphide bond environment first!
|
348
369
|
ff.rewind
|
349
370
|
ff.each_entry do |pir|
|
350
|
-
if (pir.entry_id == key) && (pir.definition ==
|
351
|
-
|
352
|
-
|
353
|
-
|
354
|
-
|
355
|
-
|
356
|
-
|
357
|
-
|
358
|
-
|
359
|
-
ff.each_entry do |pir|
|
360
|
-
if (pir.entry_id == key) && (pir.definition == ec.name)
|
361
|
-
labels = pir.data.gsub("\n", "").split("").map_with_index do |sym, pos|
|
362
|
-
if sym == "-"
|
363
|
-
"-"
|
364
|
-
elsif sym == "X" || sym == "x"
|
365
|
-
"X"
|
371
|
+
if (pir.entry_id == key) && (pir.definition == ec.name)
|
372
|
+
labels = pir.data.gsub("\n", "").split("").map_with_index do |sym, pos|
|
373
|
+
if sym == "-"
|
374
|
+
"-"
|
375
|
+
elsif sym == "X" || sym == "x"
|
376
|
+
"X"
|
377
|
+
else
|
378
|
+
if ei == 0 # Amino Acid Environment Feature
|
379
|
+
(( disulphide.has_key?(key) and disulphide[key][pos] == "F") && (sym == "C")) ? "J" : sym
|
366
380
|
else
|
367
|
-
|
368
|
-
((disulphide[key][pos] == "F") && (sym == "C")) ? "J" : sym
|
369
|
-
else
|
370
|
-
ec.labels[ec.symbols.index(sym)]
|
371
|
-
end
|
381
|
+
ec.labels[ec.symbols.index(sym)]
|
372
382
|
end
|
373
383
|
end
|
384
|
+
end
|
374
385
|
|
375
|
-
|
376
|
-
|
377
|
-
|
378
|
-
|
379
|
-
end
|
386
|
+
if env_labels[key].empty?
|
387
|
+
env_labels[key] = labels
|
388
|
+
else
|
389
|
+
env_labels[key].each_with_index { |e, i| env_labels[key][i] = e + labels[i] }
|
380
390
|
end
|
381
391
|
end
|
382
392
|
end
|
383
393
|
end
|
394
|
+
end
|
395
|
+
|
396
|
+
if $noweight
|
397
|
+
ali.each_pair do |id1, seq1|
|
398
|
+
ali.each_pair do |id2, seq2|
|
399
|
+
if id1 != id2
|
400
|
+
pid = calc_pid(seq1, seq2)
|
401
|
+
s1 = seq1.split("")
|
402
|
+
s2 = seq2.split("")
|
403
|
+
|
404
|
+
# check PID_MIN
|
405
|
+
if $pidmin && (pid < $pidmin)
|
406
|
+
$logger.info ">>> Skip alignment between #{id1} and #{id2} having PID, #{pid}% less than PID_MIN, #{$pidmin}"
|
407
|
+
next
|
408
|
+
end
|
384
409
|
|
385
|
-
|
386
|
-
|
387
|
-
|
388
|
-
|
389
|
-
|
390
|
-
|
391
|
-
|
392
|
-
|
393
|
-
|
394
|
-
|
395
|
-
|
410
|
+
# check PID_MAX
|
411
|
+
if $pidmax && (pid > $pidmax)
|
412
|
+
$logger.info ">>> Skip alignment between #{id1} and #{id2} having PID, #{pid}% greater than PID_MAX, #{$pidmax}"
|
413
|
+
next
|
414
|
+
end
|
415
|
+
|
416
|
+
s1.each_with_index do |aa1, pos|
|
417
|
+
aa1.upcase!
|
418
|
+
aa2 = s2[pos].upcase
|
419
|
+
|
420
|
+
if env_labels[id1][pos].include?("X")
|
421
|
+
$logger.info ">>> Substitutions from #{id1}-#{pos}-#{aa1} were masked"
|
396
422
|
next
|
397
423
|
end
|
398
424
|
|
399
|
-
|
400
|
-
|
401
|
-
$logger.info ">>> Skip alignment between #{id1} and #{id2} having PID, #{pid}% greater than PID_MAX, #{$pidmax}"
|
425
|
+
if env_labels[id2][pos].include?("X")
|
426
|
+
$logger.info ">>> Substitutions to #{id2}-#{pos}-#{aa2} were masked"
|
402
427
|
next
|
403
428
|
end
|
404
429
|
|
405
|
-
|
406
|
-
|
407
|
-
|
408
|
-
|
409
|
-
end
|
410
|
-
|
411
|
-
aa1.upcase!
|
412
|
-
aa2 = s2[pos].upcase
|
413
|
-
|
414
|
-
if !$amino_acids.include?(aa1)
|
415
|
-
$logger.warn "!!! #{id1}-#{pos}-#{aa1} is not a standard amino acid" unless aa1 == "-"
|
416
|
-
next
|
417
|
-
end
|
430
|
+
if !$amino_acids.include?(aa1)
|
431
|
+
$logger.warn "!!! #{id1}-#{pos}-#{aa1} is not a standard amino acid" unless aa1 == "-"
|
432
|
+
next
|
433
|
+
end
|
418
434
|
|
419
|
-
|
420
|
-
|
421
|
-
|
422
|
-
|
435
|
+
if !$amino_acids.include?(aa2)
|
436
|
+
$logger.warn "!!! #{id1}-#{pos}-#{aa2} is not a standard amino acid" unless aa2 == "-"
|
437
|
+
next
|
438
|
+
end
|
423
439
|
|
424
|
-
|
425
|
-
|
440
|
+
aa1 = (((disulphide.has_key?(id1) and disulphide[id1][pos] == "F") && (aa1 == "C")) ? "J" : aa1)
|
441
|
+
aa2 = (((disulphide.has_key?(id2) and disulphide[id2][pos] == "F") && (aa2 == "C")) ? "J" : aa2)
|
426
442
|
|
427
|
-
|
428
|
-
|
429
|
-
|
430
|
-
|
431
|
-
|
432
|
-
|
433
|
-
|
434
|
-
|
435
|
-
end
|
443
|
+
if $cst_features.empty?
|
444
|
+
$envs[env_labels[id1][pos]].increase_residue_count(aa2)
|
445
|
+
elsif (env_labels[id1][pos].split("").values_at(*$cst_features) == env_labels[id2][pos].split("").values_at(*$cst_features))
|
446
|
+
$envs[env_labels[id1][pos]].increase_residue_count(aa2)
|
447
|
+
else
|
448
|
+
$logger.debug "*** #{id1}-#{pos}-#{aa1} and #{id2}-#{pos}-#{aa2} have different symbols for constrained environment features each other"
|
449
|
+
next
|
450
|
+
end
|
436
451
|
|
437
|
-
|
452
|
+
grp_label = env_labels[id1][pos][1..-1]
|
438
453
|
|
439
|
-
|
440
|
-
|
441
|
-
|
442
|
-
else
|
443
|
-
$env_aa_obs[grp_label][aa1] = 1
|
444
|
-
end
|
454
|
+
if $env_aa_obs.has_key? grp_label
|
455
|
+
if $env_aa_obs[grp_label].has_key? aa1
|
456
|
+
$env_aa_obs[grp_label][aa1] += 1
|
445
457
|
else
|
446
|
-
$env_aa_obs[grp_label] = Hash.new(0)
|
447
458
|
$env_aa_obs[grp_label][aa1] = 1
|
448
459
|
end
|
460
|
+
else
|
461
|
+
$env_aa_obs[grp_label] = Hash.new(0)
|
462
|
+
$env_aa_obs[grp_label][aa1] = 1
|
463
|
+
end
|
449
464
|
|
450
|
-
|
451
|
-
|
452
|
-
|
453
|
-
|
454
|
-
|
465
|
+
if $aa_tot_obs.has_key? aa1
|
466
|
+
$aa_tot_obs[aa1] += 1
|
467
|
+
else
|
468
|
+
$aa_tot_obs[aa1] = 1
|
469
|
+
end
|
455
470
|
|
456
|
-
|
457
|
-
|
458
|
-
|
459
|
-
|
460
|
-
|
461
|
-
end
|
471
|
+
if aa1 != aa2
|
472
|
+
if $aa_mut_obs.has_key? aa1
|
473
|
+
$aa_mut_obs[aa1] += 1
|
474
|
+
else
|
475
|
+
$aa_mut_obs[aa1] = 1
|
462
476
|
end
|
463
|
-
$logger.debug "*** Add #{id1}-#{pos}-#{aa1} -> #{id2}-#{pos}-#{aa2} substituion for #{env_labels[id1][pos]}"
|
464
477
|
end
|
478
|
+
$logger.debug "*** Add #{id1}-#{pos}-#{aa1} -> #{id2}-#{pos}-#{aa2} substitution for #{env_labels[id1][pos]}"
|
465
479
|
end
|
466
480
|
end
|
467
481
|
end
|
468
|
-
|
469
|
-
|
470
|
-
|
471
|
-
|
472
|
-
|
473
|
-
|
474
|
-
|
475
|
-
|
476
|
-
|
477
|
-
|
478
|
-
|
479
|
-
|
480
|
-
|
481
|
-
|
482
|
-
|
483
|
-
|
484
|
-
|
485
|
-
|
486
|
-
|
482
|
+
end
|
483
|
+
else
|
484
|
+
# BLOSUM-like weighting
|
485
|
+
clusters = []
|
486
|
+
ali.each_pair { |i, s| clusters << [i] }
|
487
|
+
|
488
|
+
# a loop for single linkage clustering
|
489
|
+
begin
|
490
|
+
continue = false
|
491
|
+
0.upto(clusters.size - 2) do |i|
|
492
|
+
indexes = []
|
493
|
+
(i + 1).upto(clusters.size - 1) do |j|
|
494
|
+
found = false
|
495
|
+
clusters[i].each do |c1|
|
496
|
+
clusters[j].each do |c2|
|
497
|
+
if calc_pid(ali[c1], ali[c2]) >= $weight
|
498
|
+
indexes << j
|
499
|
+
found = true
|
500
|
+
break
|
487
501
|
end
|
488
|
-
break if found
|
489
502
|
end
|
503
|
+
break if found
|
490
504
|
end
|
505
|
+
end
|
491
506
|
|
492
|
-
|
493
|
-
|
494
|
-
|
495
|
-
|
496
|
-
|
497
|
-
|
498
|
-
end
|
499
|
-
clusters[i] = group
|
500
|
-
clusters.compact!
|
507
|
+
unless indexes.empty?
|
508
|
+
continue = true
|
509
|
+
group = clusters[i]
|
510
|
+
indexes.each do |k|
|
511
|
+
group = group.concat(clusters[k])
|
512
|
+
clusters[k] = nil
|
501
513
|
end
|
514
|
+
clusters[i] = group
|
515
|
+
clusters.compact!
|
502
516
|
end
|
503
|
-
end
|
504
|
-
|
505
|
-
clusters.combination(2).each do |cluster1, cluster2|
|
506
|
-
cluster1.each do |id1|
|
507
|
-
cluster2.each do |id2|
|
508
|
-
seq1 = ali[id1].split("")
|
509
|
-
seq2 = ali[id2].split("")
|
510
|
-
|
511
|
-
seq1.each_with_index do |aa1, pos|
|
512
|
-
if env_labels[id1][pos].include?("X")
|
513
|
-
$logger.debug "*** Substitutions from #{id1}-#{pos}-#{aa1} were masked"
|
514
|
-
next
|
515
|
-
end
|
517
|
+
end
|
518
|
+
end while(continue)
|
516
519
|
|
517
|
-
|
518
|
-
|
520
|
+
clusters.combination(2).each do |cluster1, cluster2|
|
521
|
+
cluster1.each do |id1|
|
522
|
+
cluster2.each do |id2|
|
523
|
+
seq1 = ali[id1].split("")
|
524
|
+
seq2 = ali[id2].split("")
|
519
525
|
|
520
|
-
|
521
|
-
|
522
|
-
|
523
|
-
end
|
526
|
+
seq1.each_with_index do |aa1, pos|
|
527
|
+
aa1.upcase!
|
528
|
+
aa2 = seq2[pos].upcase rescue next # should fix this in sane way!
|
524
529
|
|
525
|
-
|
526
|
-
|
527
|
-
|
528
|
-
|
530
|
+
if env_labels[id1][pos].include?("X")
|
531
|
+
$logger.debug "*** Substitutions from #{id1}-#{pos}-#{aa1} were masked"
|
532
|
+
next
|
533
|
+
end
|
529
534
|
|
530
|
-
|
531
|
-
|
532
|
-
|
533
|
-
|
534
|
-
|
535
|
-
|
536
|
-
|
537
|
-
|
538
|
-
|
539
|
-
|
540
|
-
|
541
|
-
|
542
|
-
|
543
|
-
|
544
|
-
|
545
|
-
|
546
|
-
|
547
|
-
|
535
|
+
if env_labels[id2][pos].include?("X")
|
536
|
+
$logger.debug "*** Substitutions to #{id2}-#{pos}-#{aa2} were masked"
|
537
|
+
next
|
538
|
+
end
|
539
|
+
|
540
|
+
if !$amino_acids.include?(aa1)
|
541
|
+
$logger.warn "!!! #{id1}-#{pos}-#{aa1} is not standard amino acid" unless aa1 == "-"
|
542
|
+
next
|
543
|
+
end
|
544
|
+
|
545
|
+
if !$amino_acids.include?(aa2)
|
546
|
+
$logger.warn "!!! #{id1}-#{pos}-#{aa2} is not standard amino acid" unless aa2 == "-"
|
547
|
+
next
|
548
|
+
end
|
549
|
+
|
550
|
+
aa1 = (((disulphide.has_key?(id1) and disulphide[id1][pos] == "F") && (aa1 == "C")) ? "J" : aa1)
|
551
|
+
aa2 = (((disulphide.has_key?(id2) and disulphide[id2][pos] == "F") && (aa2 == "C")) ? "J" : aa2)
|
552
|
+
size1 = cluster1.size
|
553
|
+
size2 = cluster2.size
|
554
|
+
obs1 = 1.0 / size1
|
555
|
+
obs2 = 1.0 / size2
|
556
|
+
|
557
|
+
if $cst_features.empty?
|
558
|
+
$envs[env_labels[id1][pos]].increase_residue_count(aa2, 1.0 / (size1 * size2))
|
559
|
+
$envs[env_labels[id2][pos]].increase_residue_count(aa1, 1.0 / (size1 * size2))
|
560
|
+
elsif (env_labels[id1][pos].split("").values_at(*$cst_features) == env_labels[id2][pos].split("").values_at(*$cst_features))
|
561
|
+
$envs[env_labels[id1][pos]].increase_residue_count(aa2, 1.0 / (size1 * size2))
|
562
|
+
$envs[env_labels[id2][pos]].increase_residue_count(aa1, 1.0 / (size1 * size2))
|
563
|
+
else
|
564
|
+
$logger.debug "*** #{id1}-#{pos}-#{aa1} and #{id2}-#{pos}-#{aa2} have different symbols for constrained environment features each other"
|
565
|
+
next
|
566
|
+
end
|
548
567
|
|
549
|
-
|
550
|
-
|
568
|
+
grp_label1 = env_labels[id1][pos][1..-1]
|
569
|
+
grp_label2 = env_labels[id2][pos][1..-1]
|
551
570
|
|
552
|
-
|
553
|
-
|
554
|
-
|
555
|
-
else
|
556
|
-
$env_aa_obs[grp_label1][aa1] = obs1
|
557
|
-
end
|
571
|
+
if $env_aa_obs.has_key? grp_label1
|
572
|
+
if $env_aa_obs[grp_label1].has_key? aa1
|
573
|
+
$env_aa_obs[grp_label1][aa1] += obs1
|
558
574
|
else
|
559
|
-
$env_aa_obs[grp_label1] = Hash.new(0.0)
|
560
575
|
$env_aa_obs[grp_label1][aa1] = obs1
|
561
576
|
end
|
577
|
+
else
|
578
|
+
$env_aa_obs[grp_label1] = Hash.new(0.0)
|
579
|
+
$env_aa_obs[grp_label1][aa1] = obs1
|
580
|
+
end
|
562
581
|
|
563
|
-
|
564
|
-
|
565
|
-
|
566
|
-
else
|
567
|
-
$env_aa_obs[grp_label2][aa2] = obs2
|
568
|
-
end
|
582
|
+
if $env_aa_obs.has_key? grp_label2
|
583
|
+
if $env_aa_obs[grp_label2].has_key? aa2
|
584
|
+
$env_aa_obs[grp_label2][aa2] += obs2
|
569
585
|
else
|
570
|
-
$env_aa_obs[grp_label2] = Hash.new(0.0)
|
571
586
|
$env_aa_obs[grp_label2][aa2] = obs2
|
572
587
|
end
|
588
|
+
else
|
589
|
+
$env_aa_obs[grp_label2] = Hash.new(0.0)
|
590
|
+
$env_aa_obs[grp_label2][aa2] = obs2
|
591
|
+
end
|
573
592
|
|
574
|
-
|
575
|
-
|
576
|
-
|
577
|
-
|
578
|
-
|
593
|
+
if $aa_tot_obs.has_key? aa1
|
594
|
+
$aa_tot_obs[aa1] += obs1
|
595
|
+
else
|
596
|
+
$aa_tot_obs[aa1] = obs1
|
597
|
+
end
|
579
598
|
|
580
|
-
|
581
|
-
|
599
|
+
if $aa_tot_obs.has_key? aa2
|
600
|
+
$aa_tot_obs[aa2] += obs2
|
601
|
+
else
|
602
|
+
$aa_tot_obs[aa2] = obs2
|
603
|
+
end
|
604
|
+
|
605
|
+
if aa1 != aa2
|
606
|
+
if $aa_mut_obs.has_key? aa1
|
607
|
+
$aa_mut_obs[aa1] += obs1
|
582
608
|
else
|
583
|
-
$
|
609
|
+
$aa_mut_obs[aa1] = obs1
|
584
610
|
end
|
585
|
-
|
586
|
-
|
587
|
-
|
588
|
-
|
589
|
-
else
|
590
|
-
$aa_mut_obs[aa1] = obs1
|
591
|
-
end
|
592
|
-
if $aa_mut_obs.has_key? aa2
|
593
|
-
$aa_mut_obs[aa2] += obs2
|
594
|
-
else
|
595
|
-
$aa_mut_obs[aa2] = obs2
|
596
|
-
end
|
611
|
+
if $aa_mut_obs.has_key? aa2
|
612
|
+
$aa_mut_obs[aa2] += obs2
|
613
|
+
else
|
614
|
+
$aa_mut_obs[aa2] = obs2
|
597
615
|
end
|
598
|
-
|
599
|
-
$logger.debug "*** Add #{id1}-#{pos}-#{aa1} -> #{id2}-#{pos}-#{aa2} substituion for #{env_labels[id1][pos]}"
|
600
|
-
$logger.debug "*** Add #{id2}-#{pos}-#{aa2} -> #{id1}-#{pos}-#{aa1} substituion for #{env_labels[id2][pos]}"
|
601
616
|
end
|
617
|
+
|
618
|
+
$logger.debug "*** Add #{id1}-#{pos}-#{aa1} -> #{id2}-#{pos}-#{aa2} substitution for #{env_labels[id1][pos]}"
|
619
|
+
$logger.debug "*** Add #{id2}-#{pos}-#{aa2} -> #{id1}-#{pos}-#{aa1} substitution for #{env_labels[id2][pos]}"
|
602
620
|
end
|
603
621
|
end
|
604
622
|
end
|
605
|
-
end
|
606
|
-
end #
|
623
|
+
end
|
624
|
+
end # if !$nosmooth
|
625
|
+
end
|
607
626
|
|
608
|
-
|
609
|
-
|
627
|
+
# print out default header
|
628
|
+
$outfh.puts <<HEADER
|
629
|
+
#
|
610
630
|
# Environment-specific amino acid substitution matrices
|
611
631
|
# Creator: egor version #{Egor::VERSION}
|
612
632
|
# Creation Date: #{Time.now.strftime("%d/%m/%Y %H:%M")}
|
@@ -616,9 +636,9 @@ Options:
|
|
616
636
|
#
|
617
637
|
HEADER
|
618
638
|
|
619
|
-
|
639
|
+
$env_features[1..-1].each { |e| $outfh.puts "# #{e}" }
|
620
640
|
|
621
|
-
|
641
|
+
$outfh.puts <<HEADER
|
622
642
|
#
|
623
643
|
# (read in from #{$classdef})
|
624
644
|
#
|
@@ -632,164 +652,164 @@ HEADER
|
|
632
652
|
#
|
633
653
|
HEADER
|
634
654
|
|
655
|
+
if $noweight
|
656
|
+
$outfh.puts "# Weighting scheme: none"
|
657
|
+
else
|
658
|
+
$outfh.puts "# Weighting scheme: clustering at PID #{$weight} level"
|
659
|
+
end
|
660
|
+
|
661
|
+
# calculate amino acid frequencies and mutabilities, and
|
662
|
+
# print them as default statistics in the header part
|
663
|
+
ala_factor = 100.0 * $aa_tot_obs["A"] / $aa_mut_obs["A"].to_f
|
664
|
+
$tot_aa = $aa_tot_obs.values.sum
|
665
|
+
|
666
|
+
$outfh.puts "#"
|
667
|
+
$outfh.puts "# Total amino acid frequencies:\n"
|
668
|
+
$outfh.puts "# %-3s %9s %9s %5s %8s %8s" % %w[RES TOT_OBS MUT_OBS MUTB REL_MUTB REL_FRQ]
|
669
|
+
|
670
|
+
$amino_acids.each do |res|
|
671
|
+
$aa_mutb[res] = $aa_mut_obs[res] / $aa_tot_obs[res].to_f
|
672
|
+
$aa_rel_mutb[res] = $aa_mutb[res] * ala_factor
|
673
|
+
$aa_rel_freq[res] = $aa_tot_obs[res] / $tot_aa.to_f
|
674
|
+
end
|
675
|
+
|
676
|
+
$amino_acids.each do |res|
|
635
677
|
if $noweight
|
636
|
-
$outfh.puts "#
|
678
|
+
$outfh.puts "# %-3s %9d %9d %5.2f %8d %8.4f" %
|
679
|
+
[res, $aa_tot_obs[res], $aa_mut_obs[res], $aa_mutb[res], $aa_rel_mutb[res], $aa_rel_freq[res]]
|
637
680
|
else
|
638
|
-
$outfh.puts "#
|
681
|
+
$outfh.puts "# %-3s %9.2f %9.2f %5.2f %8d %8.4f" %
|
682
|
+
[res, $aa_tot_obs[res], $aa_mut_obs[res], $aa_mutb[res], $aa_rel_mutb[res], $aa_rel_freq[res]]
|
639
683
|
end
|
684
|
+
end
|
640
685
|
|
641
|
-
# calculate amino acid frequencies and mutabilities, and
|
642
|
-
# print them as default statistics in the header part
|
643
|
-
ala_factor = 100.0 * $aa_tot_obs["A"] / $aa_mut_obs["A"].to_f
|
644
|
-
$tot_aa = $aa_tot_obs.values.sum
|
645
686
|
|
646
|
-
|
647
|
-
|
648
|
-
|
687
|
+
# Part 5.
|
688
|
+
#
|
689
|
+
# Calculating substitution frequency tables
|
690
|
+
#
|
649
691
|
|
650
|
-
|
651
|
-
|
652
|
-
|
653
|
-
|
692
|
+
# calculating probabilities for each environment
|
693
|
+
$envs.values.each do |e|
|
694
|
+
if e.freq_array.sum != 0
|
695
|
+
e.prob_array = 100.0 * e.freq_array / e.freq_array.sum
|
654
696
|
end
|
697
|
+
end
|
655
698
|
|
656
|
-
|
657
|
-
|
658
|
-
$outfh.puts "# %-3s %9d %9d %5.2f %8d %8.4f" %
|
659
|
-
[res, $aa_tot_obs[res], $aa_mut_obs[res], $aa_mutb[res], $aa_rel_mutb[res], $aa_rel_freq[res]]
|
660
|
-
else
|
661
|
-
$outfh.puts "# %-3s %9.2f %9.2f %5.2f %8d %8.4f" %
|
662
|
-
[res, $aa_tot_obs[res], $aa_mut_obs[res], $aa_mutb[res], $aa_rel_mutb[res], $aa_rel_freq[res]]
|
663
|
-
end
|
664
|
-
end
|
699
|
+
# count raw frequencies
|
700
|
+
$tot_freq_mat = ($noweight ? NMatrix.int($amino_acids.size,$amino_acids.size) : NMatrix.float($amino_acids.size,$amino_acids.size))
|
665
701
|
|
702
|
+
# for each combination of environment features
|
703
|
+
env_groups = $envs.values.group_by { |env| env.label[1..-1] }
|
666
704
|
|
667
|
-
|
668
|
-
#
|
669
|
-
|
670
|
-
|
705
|
+
env_groups.to_a.sort_by { |env_group|
|
706
|
+
# a bit clumsy sorting here...
|
707
|
+
env_group[0].split("").map_with_index { |l, i|
|
708
|
+
$env_features[i + 1].labels.index(l)
|
709
|
+
}
|
710
|
+
}.each_with_index do |group, group_no|
|
711
|
+
grp_freq_mat = ($noweight ? NMatrix.int($amino_acids.size,$amino_acids.size) : NMatrix.float($amino_acids.size,$amino_acids.size))
|
671
712
|
|
672
|
-
|
673
|
-
|
674
|
-
|
675
|
-
|
676
|
-
|
713
|
+
$amino_acids.each_with_index do |aa, ai|
|
714
|
+
freq_array = group[1].find { |e| e.label.start_with?(aa) }.freq_array
|
715
|
+
0.upto($amino_acids.size - 1) { |j| grp_freq_mat[ai, j] = freq_array[j] }
|
716
|
+
end
|
717
|
+
|
718
|
+
$tot_freq_mat += grp_freq_mat
|
719
|
+
|
720
|
+
if $output == 0
|
721
|
+
$outfh.puts ">#{group[0]} #{group_no}"
|
722
|
+
$outfh.puts grp_freq_mat.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)
|
677
723
|
end
|
724
|
+
end
|
725
|
+
|
726
|
+
if $output == 0
|
727
|
+
$outfh.puts ">Total"
|
728
|
+
$outfh.puts $tot_freq_mat.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)
|
729
|
+
exit 0
|
730
|
+
end
|
731
|
+
|
678
732
|
|
679
|
-
|
680
|
-
|
733
|
+
# Part 6.
|
734
|
+
#
|
735
|
+
# Calculating substitution probability tables
|
736
|
+
#
|
737
|
+
|
738
|
+
if $output == 1
|
739
|
+
$outfh.puts <<HEADER
|
740
|
+
#
|
741
|
+
# Each column (j) represents the probability distribution for the
|
742
|
+
# likelihood of acceptance of a mutational event by a residue type j in
|
743
|
+
# a particular structural environment (specified after >) leading to
|
744
|
+
# any other residue type (i) and sums up to 100.
|
745
|
+
#
|
746
|
+
HEADER
|
747
|
+
end
|
748
|
+
|
749
|
+
if ($output > 0) && $nosmooth
|
750
|
+
# Probability matrices
|
751
|
+
$tot_prob_mat = NMatrix.float($amino_acids.size, $amino_acids.size)
|
681
752
|
|
682
753
|
# for each combination of environment features
|
683
754
|
env_groups = $envs.values.group_by { |env| env.label[1..-1] }
|
684
|
-
|
685
755
|
env_groups.to_a.sort_by { |env_group|
|
686
756
|
# a bit clumsy sorting here...
|
687
757
|
env_group[0].split("").map_with_index { |l, i|
|
688
758
|
$env_features[i + 1].labels.index(l)
|
689
759
|
}
|
690
760
|
}.each_with_index do |group, group_no|
|
691
|
-
|
761
|
+
grp_prob_mat = NMatrix.float($amino_acids.size,$amino_acids.size)
|
692
762
|
|
693
763
|
$amino_acids.each_with_index do |aa, ai|
|
694
|
-
|
695
|
-
0.upto(
|
764
|
+
prob_array = group[1].find { |e| e.label.start_with?(aa) }.prob_array
|
765
|
+
0.upto($amino_acids.size - 1) { |j| grp_prob_mat[ai, j] = prob_array[j] }
|
696
766
|
end
|
697
767
|
|
698
|
-
$
|
768
|
+
$tot_prob_mat += grp_prob_mat
|
699
769
|
|
700
|
-
if $output ==
|
770
|
+
if ($output == 1)
|
701
771
|
$outfh.puts ">#{group[0]} #{group_no}"
|
702
|
-
$outfh.puts
|
772
|
+
$outfh.puts grp_prob_mat.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)
|
703
773
|
end
|
704
774
|
end
|
705
775
|
|
706
|
-
if $output ==
|
776
|
+
if ($output == 1)
|
707
777
|
$outfh.puts ">Total"
|
708
|
-
$outfh.puts $
|
778
|
+
$outfh.puts $tot_prob_mat.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)
|
779
|
+
$outfh.close
|
709
780
|
exit 0
|
710
781
|
end
|
782
|
+
end
|
711
783
|
|
712
|
-
|
713
|
-
|
784
|
+
# for smoothing...
|
785
|
+
if ($output > 0) && !$nosmooth
|
714
786
|
#
|
715
|
-
#
|
787
|
+
# p1 probability
|
716
788
|
#
|
717
|
-
|
718
|
-
|
719
|
-
|
720
|
-
|
721
|
-
|
722
|
-
|
723
|
-
|
724
|
-
|
725
|
-
#
|
726
|
-
|
727
|
-
|
728
|
-
|
729
|
-
|
730
|
-
|
731
|
-
$
|
732
|
-
|
733
|
-
# for each combination of environment features
|
734
|
-
env_groups = $envs.values.group_by { |env| env.label[1..-1] }
|
735
|
-
env_groups.to_a.sort_by { |env_group|
|
736
|
-
# a bit clumsy sorting here...
|
737
|
-
env_group[0].split("").map_with_index { |l, i|
|
738
|
-
$env_features[i + 1].labels.index(l)
|
739
|
-
}
|
740
|
-
}.each_with_index do |group, group_no|
|
741
|
-
grp_prob_mat = NMatrix.float(21,21)
|
742
|
-
|
743
|
-
$amino_acids.each_with_index do |aa, ai|
|
744
|
-
prob_array = group[1].find { |e| e.label.start_with?(aa) }.prob_array
|
745
|
-
0.upto(20) { |j| grp_prob_mat[ai, j] = prob_array[j] }
|
746
|
-
end
|
747
|
-
|
748
|
-
$tot_prob_mat += grp_prob_mat
|
749
|
-
|
750
|
-
if ($output == 1)
|
751
|
-
$outfh.puts ">#{group[0]} #{group_no}"
|
752
|
-
$outfh.puts grp_prob_mat.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)
|
753
|
-
end
|
754
|
-
end
|
755
|
-
|
756
|
-
if ($output == 1)
|
757
|
-
$outfh.puts ">Total"
|
758
|
-
$outfh.puts $tot_prob_mat.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)
|
759
|
-
$outfh.close
|
760
|
-
exit 0
|
761
|
-
end
|
789
|
+
p1 = NArray.float($amino_acids.size)
|
790
|
+
a0 = NArray.float($amino_acids.size).fill(1.0 / $amino_acids.size)
|
791
|
+
big_N = $tot_aa.to_f
|
792
|
+
small_n = $amino_acids.size.to_f
|
793
|
+
omega1 = 1.0 / (1 + big_N / ($sigma * small_n))
|
794
|
+
omega2 = 1.0 - omega1
|
795
|
+
|
796
|
+
if $smooth == :partial
|
797
|
+
# for partial smoothing, p1 probability is not smoothed!
|
798
|
+
0.upto($amino_acids.size - 1) { |i| p1[i] = 100.0 * $aa_rel_freq[$amino_acids[i]] }
|
799
|
+
$smooth_prob[1] = p1
|
800
|
+
else
|
801
|
+
# for full smoothing, p1 probability is smoothed
|
802
|
+
0.upto($amino_acids.size - 1) { |i| p1[i] = 100.0 * (omega1 * a0[i] + omega2 * $aa_rel_freq[$amino_acids[i]]) }
|
803
|
+
$smooth_prob[1] = p1
|
762
804
|
end
|
763
805
|
|
764
|
-
#
|
765
|
-
|
766
|
-
|
767
|
-
|
768
|
-
#
|
769
|
-
p1 = NArray.float(21)
|
770
|
-
a0 = NArray.float(21).fill(1 / 21.0)
|
771
|
-
big_N = $tot_aa.to_f
|
772
|
-
small_n = 21.0
|
773
|
-
omega1 = 1.0 / (1 + big_N / ($sigma * small_n))
|
774
|
-
omega2 = 1.0 - omega1
|
775
|
-
|
776
|
-
if $smooth == :partial
|
777
|
-
# for partial smoothing, p1 probability is not smoothed!
|
778
|
-
0.upto(20) { |i| p1[i] = 100.0 * $aa_rel_freq[$amino_acids[i]] }
|
779
|
-
$smooth_prob[1] = p1
|
780
|
-
else
|
781
|
-
# for full smoothing, p1 probability is smoothed
|
782
|
-
0.upto(20) { |i| p1[i] = 100.0 * (omega1 * a0[i] + omega2 * $aa_rel_freq[$amino_acids[i]]) }
|
783
|
-
$smooth_prob[1] = p1
|
784
|
-
end
|
785
|
-
|
786
|
-
#
|
787
|
-
# p2 and above
|
788
|
-
#
|
789
|
-
env_labels = $env_features.map_with_index {|ef, ei| ef.labels.map { |l| "#{ei}#{l}" } }
|
806
|
+
#
|
807
|
+
# p2 and above
|
808
|
+
#
|
809
|
+
env_labels = $env_features.map_with_index {|ef, ei| ef.labels.map { |l| "#{ei}#{l}" } }
|
790
810
|
|
791
|
-
|
792
|
-
|
811
|
+
if $smooth == :partial
|
812
|
+
$outfh.puts <<HEADER
|
793
813
|
#
|
794
814
|
# Partial Smoothing:
|
795
815
|
#
|
@@ -813,106 +833,107 @@ HEADER
|
|
813
833
|
# Weights (omegas) are calculated as in Topham et al. 1993)
|
814
834
|
#
|
815
835
|
# sigma value used is: 5.00
|
836
|
+
#
|
816
837
|
HEADER
|
817
|
-
|
818
|
-
|
819
|
-
|
820
|
-
|
821
|
-
|
822
|
-
|
823
|
-
|
824
|
-
|
825
|
-
|
826
|
-
|
827
|
-
|
828
|
-
|
829
|
-
|
838
|
+
1.upto($env_features.size) do |ci|
|
839
|
+
# for partial smoothing, only P1 ~ P3, and Pn are considered
|
840
|
+
next if (ci > 2) && (ci < $env_features.size)
|
841
|
+
|
842
|
+
env_labels.combination(ci) do |c1|
|
843
|
+
Enumerable.cart_prod(*c1).each do |labels|
|
844
|
+
pattern = "." * $env_features.size
|
845
|
+
|
846
|
+
labels.each do |label|
|
847
|
+
i = label[0].chr.to_i
|
848
|
+
l = label[1].chr
|
849
|
+
pattern[i] = l
|
850
|
+
end
|
830
851
|
|
831
|
-
|
832
|
-
|
833
|
-
|
834
|
-
|
852
|
+
if pattern =~ /^\./
|
853
|
+
$logger.debug "*** Skipped environment, #{pattern}, for partial smoothing"
|
854
|
+
next
|
855
|
+
end
|
835
856
|
|
836
|
-
|
837
|
-
|
838
|
-
|
839
|
-
|
840
|
-
|
841
|
-
|
842
|
-
|
843
|
-
|
844
|
-
|
845
|
-
|
846
|
-
|
847
|
-
|
848
|
-
|
849
|
-
|
850
|
-
|
851
|
-
|
852
|
-
|
853
|
-
|
854
|
-
|
855
|
-
|
856
|
-
|
857
|
-
|
858
|
-
|
859
|
-
|
860
|
-
|
861
|
-
|
862
|
-
|
863
|
-
|
864
|
-
|
865
|
-
|
866
|
-
|
867
|
-
|
868
|
-
|
869
|
-
|
870
|
-
|
871
|
-
|
872
|
-
|
873
|
-
|
874
|
-
|
875
|
-
|
876
|
-
|
877
|
-
|
878
|
-
|
879
|
-
|
880
|
-
|
881
|
-
|
882
|
-
|
883
|
-
|
857
|
+
# get environments, frequencies, and probabilities
|
858
|
+
envs = $envs.values.select { |env| env.label.match(pattern.to_re) }
|
859
|
+
freq_arr = envs.inject(NArray.float($amino_acids.size)) { |sum, env| sum + env.freq_array }
|
860
|
+
prob_arr = NArray.float($amino_acids.size)
|
861
|
+
0.upto($amino_acids.size - 1) { |i| prob_arr[i] = (freq_arr[i] == 0 ? 0 : freq_arr[i] / freq_arr.sum.to_f) }
|
862
|
+
|
863
|
+
# # assess whether a residue type j is compatible with a particular combination of structural features
|
864
|
+
# # corrections for non-zero column vector phenomenon by switching the smoothing procedure off as below
|
865
|
+
# if ci == $env_features.size
|
866
|
+
# aa_label = labels.find { |l| l.match(/^0/) }[1].chr
|
867
|
+
# sub_pattern = "." * $env_features.size
|
868
|
+
# sub_pattern[0] = aa_label
|
869
|
+
# sub_freq_sum = 0
|
870
|
+
#
|
871
|
+
# labels[1..-1].each do |label|
|
872
|
+
# next if label.start_with?("0")
|
873
|
+
# i = label[0].chr.to_i
|
874
|
+
# l = label[1].chr
|
875
|
+
# sub_pattern[i] = l
|
876
|
+
# sub_envs = $envs.values.select { |env| env.label.match(pattern.to_re) }
|
877
|
+
# sub_freq_arr = sub_envs.inject(NArray.float($amino_acids.size)) { |sum, env| sum + env.freq_array }
|
878
|
+
# sub_freq_sum += sub_freq_arr.sum
|
879
|
+
# end
|
880
|
+
#
|
881
|
+
# if sub_freq_sum == 0
|
882
|
+
# if $smooth_prob.has_key?(ci + 1)
|
883
|
+
# $smooth_prob[ci + 1][labels.to_set] = prob_arr
|
884
|
+
# else
|
885
|
+
# $smooth_prob[ci + 1] = {}
|
886
|
+
# $smooth_prob[ci + 1][labels.to_set] = prob_arr
|
887
|
+
# end
|
888
|
+
# $logger.warn "!!! Smoothing procedure is off for the environment feature combination, #{pattern}"
|
889
|
+
# next
|
890
|
+
# end
|
891
|
+
# end
|
892
|
+
|
893
|
+
# collect priors if ci > 1
|
894
|
+
priors = []
|
895
|
+
|
896
|
+
if ci == 2
|
897
|
+
labels.combination(1).select { |c2| c2[0].start_with?("0") }.each { |c3|
|
898
|
+
priors << $smooth_prob[2][c3.to_set]
|
899
|
+
}
|
900
|
+
elsif ci == $env_features.size
|
901
|
+
labels.combination(2).select { |c2| c2[0].start_with?("0") || c2[1].start_with?("0") }.each { |c3|
|
902
|
+
priors << $smooth_prob[3][c3.to_set]
|
903
|
+
}
|
904
|
+
end
|
884
905
|
|
885
|
-
|
886
|
-
|
887
|
-
|
888
|
-
|
889
|
-
|
890
|
-
|
891
|
-
|
892
|
-
|
893
|
-
|
894
|
-
|
895
|
-
|
896
|
-
|
897
|
-
|
898
|
-
|
899
|
-
|
900
|
-
|
901
|
-
|
902
|
-
|
903
|
-
|
904
|
-
|
905
|
-
|
906
|
-
|
907
|
-
|
908
|
-
|
909
|
-
|
910
|
-
end
|
906
|
+
# entropy based weighting priors
|
907
|
+
entropy_max = Math::log($amino_acids.size)
|
908
|
+
entropies = priors.map { |prior| -1.0 * prior.to_a.inject(0.0) { |s, p| p == 0.0 ? s - 1 : s + p * Math::log(p) } }
|
909
|
+
mod_entropies = entropies.map_with_index { |entropy, i| (entropy_max - entropies[i]) / entropy_max }
|
910
|
+
weights = mod_entropies.map { |mod_entropy| mod_entropy / mod_entropies.sum }
|
911
|
+
weighted_priors = priors.map_with_index { |prior, i| prior * weights[i] }.sum
|
912
|
+
|
913
|
+
# smoothing step
|
914
|
+
smooth_prob_arr = NArray.float($amino_acids.size)
|
915
|
+
big_N = freq_arr.sum.to_f
|
916
|
+
small_n = $amino_acids.size.to_f
|
917
|
+
omega1 = 1.0 / (1 + big_N / ($sigma * small_n))
|
918
|
+
omega2 = 1.0 - omega1
|
919
|
+
0.upto($amino_acids.size - 1) { |i| smooth_prob_arr[i] = 100.0 * (omega1 * weighted_priors[i] + omega2 * prob_arr[i]) }
|
920
|
+
|
921
|
+
# normalization step
|
922
|
+
smooth_prob_arr_sum = smooth_prob_arr.sum
|
923
|
+
0.upto($amino_acids.size - 1) { |i| smooth_prob_arr[i] = 100.0 * (smooth_prob_arr[i] / smooth_prob_arr_sum) }
|
924
|
+
|
925
|
+
# store smoothed probabilities in a hash using a set of environment labels as a key
|
926
|
+
if !$smooth_prob.has_key?(ci + 1)
|
927
|
+
$smooth_prob[ci + 1] = {}
|
928
|
+
$smooth_prob[ci + 1][labels.to_set] = smooth_prob_arr
|
929
|
+
else
|
930
|
+
$smooth_prob[ci + 1][labels.to_set] = smooth_prob_arr
|
911
931
|
end
|
912
932
|
end
|
913
933
|
end
|
914
|
-
|
915
|
-
|
934
|
+
end
|
935
|
+
else
|
936
|
+
$outfh.puts <<HEADER
|
916
937
|
#
|
917
938
|
# Full Smoothing:
|
918
939
|
#
|
@@ -939,193 +960,194 @@ HEADER
|
|
939
960
|
# Weights (omegas) are calculated as in Topham et al. 1993)
|
940
961
|
#
|
941
962
|
# sigma value used is: 5.00
|
963
|
+
#
|
942
964
|
HEADER
|
943
|
-
|
944
|
-
|
945
|
-
|
946
|
-
|
947
|
-
|
948
|
-
|
949
|
-
|
950
|
-
|
951
|
-
|
952
|
-
|
965
|
+
# full smoothing
|
966
|
+
1.upto($env_features.size) do |ci|
|
967
|
+
env_labels.combination(ci) do |c1|
|
968
|
+
Enumerable.cart_prod(*c1).each do |labels|
|
969
|
+
pattern = "." * $env_features.size
|
970
|
+
labels.each do |label|
|
971
|
+
j = label[0].chr.to_i
|
972
|
+
l = label[1].chr
|
973
|
+
pattern[j] = l
|
974
|
+
end
|
953
975
|
|
954
|
-
|
955
|
-
|
956
|
-
|
957
|
-
|
958
|
-
|
976
|
+
# get environments, frequencies, and probabilities
|
977
|
+
envs = $envs.values.select { |env| env.label.match(pattern.to_re) }
|
978
|
+
freq_arr = envs.inject(NArray.float($amino_acids.size)) { |sum, env| sum + env.freq_array }
|
979
|
+
prob_arr = NArray.float($amino_acids.size)
|
980
|
+
0.upto($amino_acids.size - 1) { |i| prob_arr[i] = freq_arr[i] == 0 ? 0 : freq_arr[i] / freq_arr.sum.to_f }
|
959
981
|
|
960
|
-
|
961
|
-
|
962
|
-
|
963
|
-
|
964
|
-
|
965
|
-
|
966
|
-
|
982
|
+
# collect priors
|
983
|
+
priors = []
|
984
|
+
if ci > 1
|
985
|
+
labels.combination(ci - 1).each { |c2| priors << $smooth_prob[ci][c2.to_set] }
|
986
|
+
else
|
987
|
+
priors << $smooth_prob[1]
|
988
|
+
end
|
967
989
|
|
968
|
-
|
969
|
-
|
970
|
-
|
971
|
-
|
972
|
-
|
973
|
-
|
974
|
-
|
975
|
-
|
976
|
-
|
977
|
-
|
978
|
-
|
979
|
-
|
980
|
-
|
981
|
-
|
982
|
-
|
983
|
-
|
984
|
-
|
985
|
-
|
986
|
-
|
987
|
-
|
988
|
-
|
989
|
-
|
990
|
-
|
991
|
-
|
992
|
-
|
993
|
-
end
|
990
|
+
# entropy based weighting priors
|
991
|
+
entropy_max = Math::log($amino_acids.size)
|
992
|
+
entropies = priors.map do |prior|
|
993
|
+
(entropy_max + prior.to_a.inject(0.0) { |s, p| s + p * Math::log(p) }) / entropy_max
|
994
|
+
end
|
995
|
+
weighted_priors = priors.map_with_index { |p, i| p * entropies[i] / entropies.sum }.sum
|
996
|
+
|
997
|
+
# smoothing step
|
998
|
+
smooth_prob_arr = NArray.float($amino_acids.size)
|
999
|
+
big_N = freq_arr.sum.to_f
|
1000
|
+
small_n = $amino_acids.size.to_f
|
1001
|
+
omega1 = 1.0 / (1 + big_N / ($sigma * small_n))
|
1002
|
+
omega2 = 1.0 - omega1
|
1003
|
+
0.upto($amino_acids.size - 1) { |i| smooth_prob_arr[i] = 100.0 * (omega1 * weighted_priors[i] + omega2 * prob_arr[i]) }
|
1004
|
+
|
1005
|
+
# normalization step
|
1006
|
+
smooth_prob_arr_sum = smooth_prob_arr.sum
|
1007
|
+
0.upto($amino_acids.size - 1) { |i| smooth_prob_arr[i] = 100.0 * (smooth_prob_arr[i] / smooth_prob_arr_sum) }
|
1008
|
+
|
1009
|
+
# store smoothed probabilities in a hash using a set of environment labels as a key
|
1010
|
+
if !$smooth_prob.has_key?(ci + 1)
|
1011
|
+
$smooth_prob[ci + 1] = {}
|
1012
|
+
$smooth_prob[ci + 1][labels.to_set] = smooth_prob_arr
|
1013
|
+
else
|
1014
|
+
$smooth_prob[ci + 1][labels.to_set] = smooth_prob_arr
|
994
1015
|
end
|
995
1016
|
end
|
996
1017
|
end
|
997
1018
|
end
|
1019
|
+
end
|
998
1020
|
|
999
|
-
|
1000
|
-
|
1001
|
-
|
1002
|
-
# for a total substitution probability matrix
|
1003
|
-
$tot_prob_mat = NMatrix.float(21,21)
|
1004
|
-
|
1005
|
-
# grouping environments by its environment labels but amino acid label
|
1006
|
-
env_groups = $envs.values.group_by { |env| env.label[1..-1] }
|
1021
|
+
# updating smoothed probability array for each environment
|
1022
|
+
$envs.values.each { |e| e.smooth_prob_array = $smooth_prob[$env_features.size + 1][e.label_set] }
|
1007
1023
|
|
1008
|
-
|
1009
|
-
|
1010
|
-
# a bit clumsy sorting here...
|
1011
|
-
env_group[0].split("").map_with_index { |l, i|
|
1012
|
-
$env_features[i + 1].labels.index(l)
|
1013
|
-
}
|
1014
|
-
}.each_with_index do |group, group_no|
|
1015
|
-
# calculating 21X21 substitution probability matrix for each envrionment
|
1016
|
-
grp_prob_mat = NMatrix.float(21,21)
|
1024
|
+
# for a total substitution probability matrix
|
1025
|
+
$tot_prob_mat = NMatrix.float($amino_acids.size,$amino_acids.size)
|
1017
1026
|
|
1018
|
-
|
1019
|
-
|
1020
|
-
0.upto(20) { |j| grp_prob_mat[ai, j] = smooth_prob_array[j] }
|
1021
|
-
end
|
1027
|
+
# grouping environments by their environment labels, excluding the leading amino acid label
|
1028
|
+
env_groups = $envs.values.group_by { |env| env.label[1..-1] }
|
1022
1029
|
|
1023
|
-
|
1030
|
+
# sorting environment groups and building substitution matrices ($amino_acids.size x $amino_acids.size)
|
1031
|
+
env_groups.to_a.sort_by { |env_group|
|
1032
|
+
# a bit clumsy sorting here...
|
1033
|
+
env_group[0].split("").map_with_index { |l, i|
|
1034
|
+
$env_features[i + 1].labels.index(l)
|
1035
|
+
}
|
1036
|
+
}.each_with_index do |group, group_no|
|
1037
|
+
# calculating substitution probability matrix ($amino_acids.size x $amino_acids.size) for each environment
|
1038
|
+
grp_prob_mat = NMatrix.float($amino_acids.size,$amino_acids.size)
|
1024
1039
|
|
1025
|
-
|
1026
|
-
|
1027
|
-
|
1028
|
-
end
|
1040
|
+
$amino_acids.each_with_index do |aa, ai|
|
1041
|
+
smooth_prob_array = group[1].find { |e| e.label.start_with?(aa) }.smooth_prob_array
|
1042
|
+
0.upto($amino_acids.size - 1) { |j| grp_prob_mat[ai, j] = smooth_prob_array[j] }
|
1029
1043
|
end
|
1030
1044
|
|
1031
|
-
$tot_prob_mat
|
1045
|
+
$tot_prob_mat += grp_prob_mat
|
1032
1046
|
|
1033
1047
|
if $output == 1
|
1034
|
-
$outfh.puts "
|
1035
|
-
$outfh.puts
|
1036
|
-
$outfh.close
|
1037
|
-
exit 0
|
1048
|
+
$outfh.puts ">#{group[0]} #{group_no}"
|
1049
|
+
$outfh.puts grp_prob_mat.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)
|
1038
1050
|
end
|
1051
|
+
end
|
1039
1052
|
|
1053
|
+
$tot_prob_mat /= env_groups.size
|
1040
1054
|
|
1041
|
-
|
1042
|
-
|
1043
|
-
|
1044
|
-
|
1045
|
-
|
1046
|
-
|
1055
|
+
if $output == 1
|
1056
|
+
$outfh.puts ">Total"
|
1057
|
+
$outfh.puts $tot_prob_mat.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)
|
1058
|
+
$outfh.close
|
1059
|
+
exit 0
|
1060
|
+
end
|
1061
|
+
|
1062
|
+
|
1063
|
+
# Part 7.
|
1064
|
+
#
|
1065
|
+
# Calculating log odds ratio scoring matrices
|
1066
|
+
#
|
1067
|
+
if $output == 2
|
1068
|
+
$outfh.puts <<HEADER
|
1047
1069
|
#
|
1048
1070
|
# The probabilities were then divided by the background probabilities
|
1049
1071
|
HEADER
|
1050
|
-
|
1051
|
-
|
1072
|
+
if $penv
|
1073
|
+
$outfh.puts <<HEADER
|
1052
1074
|
# which were derived from the environment-independent amino acid frequencies.
|
1053
1075
|
# ^^^^^^^^^^^^^^^^^^^^^^^
|
1054
1076
|
HEADER
|
1055
|
-
|
1056
|
-
|
1077
|
+
else
|
1078
|
+
$outfh.puts <<HEADER
|
1057
1079
|
# which were derived from the environment-dependent amino acid frequencies.
|
1058
1080
|
# ^^^^^^^^^^^^^^^^^^^^^
|
1059
1081
|
HEADER
|
1060
|
-
|
1082
|
+
end
|
1061
1083
|
|
1062
|
-
|
1063
|
-
|
1064
|
-
|
1065
|
-
|
1066
|
-
# grouping environments by its environment labels but amino acid label
|
1067
|
-
env_groups = $envs.values.group_by { |env| env.label[1..-1] }
|
1068
|
-
|
1069
|
-
# sorting environments and build 21X21 substitution matrices
|
1070
|
-
env_groups.to_a.sort_by { |env_group|
|
1071
|
-
# a bit clumsy sorting here...
|
1072
|
-
env_group[0].split("").map_with_index { |l, i|
|
1073
|
-
$env_features[i + 1].labels.index(l)
|
1074
|
-
}
|
1075
|
-
}.each_with_index do |group, group_no|
|
1076
|
-
# calculating 21X21 substitution probability matrix for each envrionment
|
1077
|
-
grp_label = group[0]
|
1078
|
-
grp_envs = group[1]
|
1079
|
-
grp_logo_mat = $cys ? NMatrix.float(21, 22) : NMatrix.float(21,21)
|
1080
|
-
|
1081
|
-
$amino_acids.each_with_index do |aa, ai|
|
1082
|
-
env = grp_envs.detect { |e| e.label.start_with?(aa) }
|
1083
|
-
logo_arr = $cys ? NArray.float(22) : NArray.float(21)
|
1084
|
-
|
1085
|
-
env.smooth_prob_array.to_a.each_with_index do |prob, j|
|
1086
|
-
paj = 100.0 * $aa_rel_freq[$amino_acids[j]]
|
1087
|
-
odds = prob == 0.0 ? 0.000001 / paj : prob / paj
|
1088
|
-
logo_arr[j] = factor * Math::log(odds)
|
1089
|
-
end
|
1084
|
+
$tot_logo_mat = $cys == 0 ? NMatrix.float($amino_acids.size, $amino_acids.size + 1) : NMatrix.float($amino_acids.size, $amino_acids.size)
|
1085
|
+
grp_logo_mats = []
|
1086
|
+
factor = $scale / Math::log(2)
|
1090
1087
|
|
1091
|
-
|
1088
|
+
# grouping environments by their environment labels, excluding the leading amino acid label
|
1089
|
+
env_groups = $envs.values.group_by { |env| env.label[1..-1] }
|
1092
1090
|
|
1093
|
-
|
1094
|
-
|
1095
|
-
|
1096
|
-
|
1097
|
-
|
1098
|
-
|
1099
|
-
|
1100
|
-
|
1091
|
+
# sorting environment groups and building their log odds ratio matrices
|
1092
|
+
env_groups.to_a.sort_by { |env_group|
|
1093
|
+
# a bit clumsy sorting here...
|
1094
|
+
env_group[0].split("").map_with_index { |l, i|
|
1095
|
+
$env_features[i + 1].labels.index(l)
|
1096
|
+
}
|
1097
|
+
}.each_with_index do |group, group_no|
|
1098
|
+
# calculating log odds ratio matrix for each environment
|
1099
|
+
grp_label = group[0]
|
1100
|
+
grp_envs = group[1]
|
1101
|
+
grp_logo_mat = $cys == 0 ? NMatrix.float($amino_acids.size, $amino_acids.size + 1) : NMatrix.float($amino_acids.size, $amino_acids.size)
|
1102
|
+
|
1103
|
+
$amino_acids.each_with_index do |aa, ai|
|
1104
|
+
env = grp_envs.detect { |e| e.label.start_with?(aa) }
|
1105
|
+
logo_arr = $cys == 0 ? NArray.float($amino_acids.size + 1) : NArray.float($amino_acids.size)
|
1106
|
+
|
1107
|
+
env.smooth_prob_array.to_a.each_with_index do |prob, j|
|
1108
|
+
paj = 100.0 * $aa_rel_freq[$amino_acids[j]]
|
1109
|
+
odds = prob == 0.0 ? 0.000001 / paj : prob / paj
|
1110
|
+
logo_arr[j] = factor * Math::log(odds)
|
1101
1111
|
end
|
1102
1112
|
|
1103
|
-
$
|
1104
|
-
|
1113
|
+
0.upto($amino_acids.size - 1) { |j| grp_logo_mat[ai, j] = logo_arr[j] }
|
1114
|
+
|
1115
|
+
# adding log odds ratio for "U" (J or C) when --cys is 0
|
1116
|
+
if $cys == 0
|
1117
|
+
paj = 100.0 * ($aa_rel_freq["C"] + $aa_rel_freq["J"])
|
1118
|
+
prob = env.smooth_prob_array[$amino_acids.index("C")] + env.smooth_prob_array[$amino_acids.index("J")]
|
1119
|
+
odds = prob == 0.0 ? 0.000001 / paj : prob / paj
|
1120
|
+
logo_arr[logo_arr.size - 1] = factor * Math::log(odds)
|
1121
|
+
grp_logo_mat[ai, logo_arr.size - 1] = logo_arr[logo_arr.size - 1]
|
1122
|
+
end
|
1105
1123
|
end
|
1106
1124
|
|
1107
|
-
$tot_logo_mat
|
1125
|
+
$tot_logo_mat += grp_logo_mat
|
1126
|
+
grp_logo_mats << [grp_label, grp_logo_mat]
|
1127
|
+
end
|
1108
1128
|
|
1109
|
-
|
1110
|
-
# the expected score E in bit units
|
1111
|
-
#
|
1112
|
-
# I'm a bit suspicious about this part...
|
1113
|
-
tot_E = 0.0
|
1114
|
-
tot_H = 0.0
|
1129
|
+
$tot_logo_mat /= env_groups.size
|
1115
1130
|
|
1116
|
-
|
1117
|
-
|
1118
|
-
|
1119
|
-
|
1120
|
-
|
1121
|
-
|
1122
|
-
|
1123
|
-
|
1124
|
-
|
1131
|
+
# calculating relative entropy for each amino acid pair H and
|
1132
|
+
# the expected score E in bit units
|
1133
|
+
#
|
1134
|
+
# I'm a bit suspicious about this part...
|
1135
|
+
tot_E = 0.0
|
1136
|
+
tot_H = 0.0
|
1137
|
+
|
1138
|
+
0.upto($tot_logo_mat.shape[0] - 1) do |i|
|
1139
|
+
0.upto($tot_logo_mat.shape[0] - 1) do |j|
|
1140
|
+
if i != j
|
1141
|
+
tot_E += $tot_logo_mat[i, j] * $aa_rel_freq[$amino_acids[i]] * $aa_rel_freq[$amino_acids[j]] / 2.0
|
1142
|
+
tot_H += $tot_logo_mat[i, j] * $tot_prob_mat[i, j] / 2.0 / 10000.0
|
1143
|
+
else
|
1144
|
+
tot_E += $tot_logo_mat[i, j] * $aa_rel_freq[$amino_acids[i]] * $aa_rel_freq[$amino_acids[j]]
|
1145
|
+
tot_H += $tot_logo_mat[i, j] * $tot_prob_mat[i, j] / 10000.0
|
1125
1146
|
end
|
1126
1147
|
end
|
1148
|
+
end
|
1127
1149
|
|
1128
|
-
|
1150
|
+
$outfh.puts <<HEADER
|
1129
1151
|
#
|
1130
1152
|
# Shown here are logarithms of these values multiplied by #{$scale}/log(2)
|
1131
1153
|
# rounded to the nearest integer (log-odds scores in 1/3 bit units).
|
@@ -1134,27 +1156,27 @@ HEADER
|
|
1134
1156
|
#
|
1135
1157
|
HEADER
|
1136
1158
|
|
1137
|
-
|
1138
|
-
|
1139
|
-
|
1159
|
+
grp_logo_mats.each_with_index do |arr, grp_no|
|
1160
|
+
grp_label = arr[0]
|
1161
|
+
grp_logo_mat = arr[1]
|
1140
1162
|
|
1141
|
-
|
1142
|
-
if $cys
|
1143
|
-
$outfh.puts grp_logo_mat.round.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids + %w[U])
|
1144
|
-
else
|
1145
|
-
$outfh.puts grp_logo_mat.round.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)
|
1146
|
-
end
|
1147
|
-
end
|
1148
|
-
|
1149
|
-
$outfh.puts ">Total #{grp_logo_mats.size}"
|
1163
|
+
$outfh.puts ">#{grp_label} #{grp_no}"
|
1150
1164
|
if $cys
|
1151
|
-
$outfh.puts
|
1165
|
+
$outfh.puts grp_logo_mat.round.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids + %w[U])
|
1152
1166
|
else
|
1153
|
-
$outfh.puts
|
1167
|
+
$outfh.puts grp_logo_mat.round.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)
|
1154
1168
|
end
|
1155
|
-
$outfh.close
|
1156
|
-
exit 0
|
1157
1169
|
end
|
1170
|
+
|
1171
|
+
$outfh.puts ">Total #{grp_logo_mats.size}"
|
1172
|
+
|
1173
|
+
if $cys == 0
|
1174
|
+
$outfh.puts $tot_logo_mat.round.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids + %w[U])
|
1175
|
+
else
|
1176
|
+
$outfh.puts $tot_logo_mat.round.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)
|
1177
|
+
end
|
1178
|
+
$outfh.close
|
1179
|
+
exit 0
|
1158
1180
|
end
|
1159
1181
|
end
|
1160
1182
|
end
|