egor 0.0.2 → 0.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/History.txt +6 -0
- data/README.rdoc +25 -13
- data/egor.gemspec +8 -6
- data/lib/egor/cli.rb +618 -596
- data/lib/egor.rb +1 -1
- data/lib/environment.rb +9 -9
- data/website/index.html +5 -5
- data/website/index.txt +8 -6
- data.tar.gz.sig +0 -0
- metadata +26 -5
- metadata.gz.sig +1 -0
data/History.txt
CHANGED
@@ -1,3 +1,9 @@
|
|
1
|
+
== 0.0.3 2008-12-09
|
2
|
+
|
3
|
+
* 2 major enhancements:
|
4
|
+
* Added the option '--cys (-y) 2', which does not distinguish J from C, so the 'disulphide bond' environment feature is no longer a prerequisite
|
5
|
+
* Masking now also works for the target amino acid
|
6
|
+
|
1
7
|
== 0.0.2 2008-11-13
|
2
8
|
|
3
9
|
* 2 major enhancements:
|
data/README.rdoc
CHANGED
@@ -2,10 +2,12 @@
|
|
2
2
|
|
3
3
|
* http://egor.rubyforge.org
|
4
4
|
|
5
|
+
|
5
6
|
== DESCRIPTION:
|
6
7
|
|
7
8
|
egor: Esst GeneratOR, a program for calculating environment-specific substitution tables
|
8
9
|
|
10
|
+
|
9
11
|
== FEATURES/PROBLEMS:
|
10
12
|
|
11
13
|
* No more segmentation fault
|
@@ -14,34 +16,42 @@ egor: Esst GeneratOR, a program for calculating environment-specific substitutio
|
|
14
16
|
* Full smoothing supported
|
15
17
|
* In theory, infinite number of environment features can be handled
|
16
18
|
|
19
|
+
|
20
|
+
== INSTALL:
|
21
|
+
|
22
|
+
$ sudo gem install egor
|
23
|
+
|
24
|
+
|
17
25
|
== BASIC USAGE:
|
18
26
|
|
19
27
|
$ egor -l TEMLIST-file -c classdef.dat
|
20
28
|
or
|
21
29
|
$ egor -f TEM-file -c classdef.dat
|
22
30
|
|
31
|
+
|
23
32
|
== OPTIONS:
|
24
|
-
--tem-file (-f)
|
25
|
-
--tem-list (-l)
|
26
|
-
--classdef (-c)
|
27
|
-
--outfile (-o)
|
28
|
-
--weight (-w) INTEGER: clustering level (PID) for the BLOSUM-like weighting
|
33
|
+
--tem-file (-f) FILE: a tem file
|
34
|
+
--tem-list (-l) FILE: a list for tem files
|
35
|
+
--classdef (-c) FILE: a file for the definition of environments (default: 'classdef.dat')
|
36
|
+
--outfile (-o) FILE: output filename (default 'allmat.dat')
|
37
|
+
--weight (-w) INTEGER: clustering level (PID) for the BLOSUM-like weighting
|
29
38
|
--noweight: calculate substitution counts with no weights (default)
|
30
39
|
--smooth (-s) INTEGER:
|
31
40
|
0 for partial smoothing (default)
|
32
41
|
1 for full smoothing
|
33
42
|
--nosmooth: perform no smoothing operation
|
34
|
-
--cys (-y) INTEGER:
|
35
|
-
0 for using C and J only for structure
|
36
|
-
1 for both structure and sequence
|
43
|
+
--cys (-y) INTEGER:
|
44
|
+
0 for using C and J only for structure (default)
|
45
|
+
1 for both structure and sequence
|
46
|
+
2 for using only C for both
|
37
47
|
--output INTEGER:
|
38
48
|
0 for raw counts (no-smoothing performed)
|
39
49
|
1 for probabilities
|
40
50
|
2 for log-odds (default)
|
41
51
|
--scale INTEGER: log-odds matrices in 1/n bit units (default 3)
|
42
52
|
--sigma DOUBLE: change the sigma value for smoothing (default 5)
|
43
|
-
--add DOUBLE: add this value to raw counts when deriving log-odds without smoothing (default 1
|
44
|
-
--penv: use environment-dependent frequencies for log-odds calculation (default false) (
|
53
|
+
--add DOUBLE: add this value to raw counts when deriving log-odds without smoothing (default 1/=classes)
|
54
|
+
--penv: use environment-dependent frequencies for log-odds calculation (default false) (NOT implemented yet!!!)
|
45
55
|
--pidmin DOUBLE: count substitutions only for pairs with PID equal to or greater than this value (default none)
|
46
56
|
--pidmax DOUBLE: count substitutions only for pairs with PID smaller than this value (default none)
|
47
57
|
--verbose (-v) INTEGER
|
@@ -52,17 +62,19 @@ egor: Esst GeneratOR, a program for calculating environment-specific substitutio
|
|
52
62
|
--version: print version
|
53
63
|
--help (-h): show help
|
54
64
|
|
65
|
+
|
55
66
|
== REQUIREMENTS:
|
56
67
|
|
57
68
|
* ruby 1.8.6 or above (http://www.ruby-lang.org)
|
58
69
|
* rubygems 1.2.0 or above (http://rubyforge.org/projects/rubygems/)
|
70
|
+
|
71
|
+
The following RubyGems will be installed automatically if you have RubyGems installed on your machine
|
72
|
+
|
59
73
|
* narray (http://narray.rubyforge.org/)
|
60
74
|
* facets (http://facets.rubyforge.org/)
|
61
75
|
* bio (http://bioruby.open-bio.org/)
|
76
|
+
* simple_memoize (http://github.com/JackDanger/simple_memoize/tree/master)
|
62
77
|
|
63
|
-
== INSTALL:
|
64
|
-
|
65
|
-
$ sudo gem install egor
|
66
78
|
|
67
79
|
== LICENSE:
|
68
80
|
|
data/egor.gemspec
CHANGED
@@ -2,11 +2,12 @@
|
|
2
2
|
|
3
3
|
Gem::Specification.new do |s|
|
4
4
|
s.name = %q{egor}
|
5
|
-
s.version = "0.0.
|
5
|
+
s.version = "0.0.3"
|
6
6
|
|
7
7
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
8
8
|
s.authors = ["Semin Lee"]
|
9
|
-
s.
|
9
|
+
s.cert_chain = ["/Users/semin/.gem/gem-public_cert.pem"]
|
10
|
+
s.date = %q{2008-12-09}
|
10
11
|
s.default_executable = %q{egor}
|
11
12
|
s.description = %q{egor: Esst GeneratOR, a program for calculating environment-specific substitution tables}
|
12
13
|
s.email = ["seminlee@gmail.com"]
|
@@ -20,8 +21,9 @@ Gem::Specification.new do |s|
|
|
20
21
|
s.require_paths = ["lib"]
|
21
22
|
s.rubyforge_project = %q{egor}
|
22
23
|
s.rubygems_version = %q{1.3.1}
|
24
|
+
s.signing_key = %q{/Users/semin/.gem/gem-private_key.pem}
|
23
25
|
s.summary = %q{egor: Esst GeneratOR, a program for calculating environment-specific substitution tables}
|
24
|
-
s.test_files = ["test/
|
26
|
+
s.test_files = ["test/test_egor.rb", "test/test_egor_cli.rb", "test/test_enumerable_extensions.rb", "test/test_environment_feature.rb", "test/test_helper.rb", "test/test_nmatrix_extensions.rb"]
|
25
27
|
|
26
28
|
if s.respond_to? :specification_version then
|
27
29
|
current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
|
@@ -32,14 +34,14 @@ Gem::Specification.new do |s|
|
|
32
34
|
s.add_runtime_dependency(%q<bio>, [">= 1.2.1"])
|
33
35
|
s.add_runtime_dependency(%q<facets>, [">= 2.4.5"])
|
34
36
|
s.add_runtime_dependency(%q<simple_memoize>, [">= 1.0.0"])
|
35
|
-
s.add_development_dependency(%q<newgem>, [">= 1.0
|
37
|
+
s.add_development_dependency(%q<newgem>, [">= 1.1.0"])
|
36
38
|
s.add_development_dependency(%q<hoe>, [">= 1.8.0"])
|
37
39
|
else
|
38
40
|
s.add_dependency(%q<narray>, [">= 0.5.9.5"])
|
39
41
|
s.add_dependency(%q<bio>, [">= 1.2.1"])
|
40
42
|
s.add_dependency(%q<facets>, [">= 2.4.5"])
|
41
43
|
s.add_dependency(%q<simple_memoize>, [">= 1.0.0"])
|
42
|
-
s.add_dependency(%q<newgem>, [">= 1.0
|
44
|
+
s.add_dependency(%q<newgem>, [">= 1.1.0"])
|
43
45
|
s.add_dependency(%q<hoe>, [">= 1.8.0"])
|
44
46
|
end
|
45
47
|
else
|
@@ -47,7 +49,7 @@ Gem::Specification.new do |s|
|
|
47
49
|
s.add_dependency(%q<bio>, [">= 1.2.1"])
|
48
50
|
s.add_dependency(%q<facets>, [">= 2.4.5"])
|
49
51
|
s.add_dependency(%q<simple_memoize>, [">= 1.0.0"])
|
50
|
-
s.add_dependency(%q<newgem>, [">= 1.0
|
52
|
+
s.add_dependency(%q<newgem>, [">= 1.1.0"])
|
51
53
|
s.add_dependency(%q<hoe>, [">= 1.8.0"])
|
52
54
|
end
|
53
55
|
end
|
data/lib/egor/cli.rb
CHANGED
@@ -44,7 +44,7 @@ Options:
|
|
44
44
|
--tem-file (-f) FILE: a tem file
|
45
45
|
--tem-list (-l) FILE: a list for tem files
|
46
46
|
--classdef (-c) FILE: a file for the defintion of environments (default: 'classdef.dat')
|
47
|
-
--outfile (-o) FILE: output filename (
|
47
|
+
--outfile (-o) FILE: output filename (default 'allmat.dat')
|
48
48
|
--weight (-w) INTEGER: clustering level (PID) for the BLOSUM-like weighting
|
49
49
|
--noweight: calculate substitution counts with no weights (default)
|
50
50
|
--smooth (-s) INTEGER:
|
@@ -54,6 +54,7 @@ Options:
|
|
54
54
|
--cys (-y) INTEGER:
|
55
55
|
0 for using C and J only for structure (default)
|
56
56
|
1 for both structure and sequence
|
57
|
+
2 for using only C for both
|
57
58
|
--output INTEGER:
|
58
59
|
0 for raw counts (no-smoothing performed)
|
59
60
|
1 for probabilities
|
@@ -152,12 +153,12 @@ Options:
|
|
152
153
|
$cys = 0
|
153
154
|
$penv = false
|
154
155
|
|
155
|
-
$aa_tot_obs =
|
156
|
-
$aa_mut_obs =
|
156
|
+
$aa_tot_obs = Hash.new(0)
|
157
|
+
$aa_mut_obs = Hash.new(0)
|
157
158
|
$aa_mutb = {}
|
158
159
|
$aa_rel_mutb = {}
|
159
160
|
$aa_rel_freq = {}
|
160
|
-
$env_aa_obs =
|
161
|
+
$env_aa_obs = Hash.new(0)
|
161
162
|
$smooth_prob = {}
|
162
163
|
$tot_freq_mat = nil
|
163
164
|
$tot_prob_mat = nil
|
@@ -200,7 +201,7 @@ Options:
|
|
200
201
|
when '--outfile'
|
201
202
|
$outfile = arg
|
202
203
|
when '--cys'
|
203
|
-
$cys =
|
204
|
+
$cys = arg.to_i
|
204
205
|
when '--weight'
|
205
206
|
$weight = arg.to_i
|
206
207
|
when '--sigma'
|
@@ -255,10 +256,12 @@ Options:
|
|
255
256
|
# Reading Environment Class Definition File
|
256
257
|
#
|
257
258
|
|
259
|
+
# set amino_acids
|
260
|
+
$amino_acids = "ACDEFGHIKLMNPQRSTVWY".split("") if $cys == 2
|
261
|
+
|
258
262
|
# an array for storing all environment feature objects
|
259
263
|
$env_features = []
|
260
264
|
|
261
|
-
|
262
265
|
# an array for storing indexes of constrained environment features
|
263
266
|
$cst_features = []
|
264
267
|
|
@@ -310,7 +313,7 @@ Options:
|
|
310
313
|
}.inject { |pro, lb|
|
311
314
|
pro.product(lb)
|
312
315
|
}.each_with_index { |e, i|
|
313
|
-
$envs[e.flatten.join] = Environment.new(i, e.flatten.join)
|
316
|
+
$envs[e.flatten.join] = Environment.new(i, e.flatten.join, $amino_acids)
|
314
317
|
}
|
315
318
|
|
316
319
|
# Part 4.
|
@@ -322,291 +325,308 @@ Options:
|
|
322
325
|
$outfh = File.open($outfile, "w")
|
323
326
|
|
324
327
|
if $tem_file
|
325
|
-
$
|
328
|
+
$tem_list_io = StringIO.new($tem_file)
|
326
329
|
end
|
327
330
|
|
328
331
|
if $tem_list
|
329
|
-
|
330
|
-
|
332
|
+
$tem_list_io = File.open($tem_list)
|
333
|
+
end
|
334
|
+
|
335
|
+
$tem_list_io.each_line do |tem_file|
|
336
|
+
tem_file.chomp!
|
337
|
+
|
338
|
+
$logger.info ">>> Analysing #{tem_file} ..."
|
339
|
+
|
340
|
+
ali = Bio::Alignment::OriginalAlignment.new
|
341
|
+
ff = Bio::FlatFile.auto(tem_file)
|
342
|
+
ff.each_entry do |pir|
|
343
|
+
if pir.definition == "sequence"
|
344
|
+
ali.add_seq(pir.data.gsub("\n", ""), pir.entry_id)
|
345
|
+
end
|
346
|
+
end
|
347
|
+
|
348
|
+
if ali.size < 2
|
349
|
+
$logger.warn "!!! Skipped #{tem_file}, there is only one 'sequence' entry"
|
350
|
+
next
|
351
|
+
end
|
331
352
|
|
332
|
-
|
353
|
+
$ali_size += 1
|
354
|
+
env_labels = {}
|
355
|
+
disulphide = {}
|
333
356
|
|
334
|
-
|
335
|
-
|
357
|
+
ali.each_pair do |key, seq|
|
358
|
+
# check disulphide bond environment first!
|
359
|
+
ff.rewind
|
336
360
|
ff.each_entry do |pir|
|
337
|
-
if pir.definition == "
|
338
|
-
|
361
|
+
if (pir.entry_id == key) && (pir.definition == "disulphide")
|
362
|
+
disulphide[key] = pir.data.gsub("\n", "").split("")
|
339
363
|
end
|
340
364
|
end
|
341
365
|
|
342
|
-
$
|
343
|
-
|
344
|
-
disulphide = {}
|
366
|
+
$env_features.each_with_index do |ec, ei|
|
367
|
+
env_labels[key] = [] unless env_labels.has_key?(key)
|
345
368
|
|
346
|
-
ali.each_pair do |key, seq|
|
347
|
-
# check disulphide bond environment first!
|
348
369
|
ff.rewind
|
349
370
|
ff.each_entry do |pir|
|
350
|
-
if (pir.entry_id == key) && (pir.definition ==
|
351
|
-
|
352
|
-
|
353
|
-
|
354
|
-
|
355
|
-
|
356
|
-
|
357
|
-
|
358
|
-
|
359
|
-
ff.each_entry do |pir|
|
360
|
-
if (pir.entry_id == key) && (pir.definition == ec.name)
|
361
|
-
labels = pir.data.gsub("\n", "").split("").map_with_index do |sym, pos|
|
362
|
-
if sym == "-"
|
363
|
-
"-"
|
364
|
-
elsif sym == "X" || sym == "x"
|
365
|
-
"X"
|
371
|
+
if (pir.entry_id == key) && (pir.definition == ec.name)
|
372
|
+
labels = pir.data.gsub("\n", "").split("").map_with_index do |sym, pos|
|
373
|
+
if sym == "-"
|
374
|
+
"-"
|
375
|
+
elsif sym == "X" || sym == "x"
|
376
|
+
"X"
|
377
|
+
else
|
378
|
+
if ei == 0 # Amino Acid Environment Feature
|
379
|
+
(( disulphide.has_key?(key) and disulphide[key][pos] == "F") && (sym == "C")) ? "J" : sym
|
366
380
|
else
|
367
|
-
|
368
|
-
((disulphide[key][pos] == "F") && (sym == "C")) ? "J" : sym
|
369
|
-
else
|
370
|
-
ec.labels[ec.symbols.index(sym)]
|
371
|
-
end
|
381
|
+
ec.labels[ec.symbols.index(sym)]
|
372
382
|
end
|
373
383
|
end
|
384
|
+
end
|
374
385
|
|
375
|
-
|
376
|
-
|
377
|
-
|
378
|
-
|
379
|
-
end
|
386
|
+
if env_labels[key].empty?
|
387
|
+
env_labels[key] = labels
|
388
|
+
else
|
389
|
+
env_labels[key].each_with_index { |e, i| env_labels[key][i] = e + labels[i] }
|
380
390
|
end
|
381
391
|
end
|
382
392
|
end
|
383
393
|
end
|
394
|
+
end
|
395
|
+
|
396
|
+
if $noweight
|
397
|
+
ali.each_pair do |id1, seq1|
|
398
|
+
ali.each_pair do |id2, seq2|
|
399
|
+
if id1 != id2
|
400
|
+
pid = calc_pid(seq1, seq2)
|
401
|
+
s1 = seq1.split("")
|
402
|
+
s2 = seq2.split("")
|
403
|
+
|
404
|
+
# check PID_MIN
|
405
|
+
if $pidmin && (pid < $pidmin)
|
406
|
+
$logger.info ">>> Skip alignment between #{id1} and #{id2} having PID, #{pid}% less than PID_MIN, #{$pidmin}"
|
407
|
+
next
|
408
|
+
end
|
384
409
|
|
385
|
-
|
386
|
-
|
387
|
-
|
388
|
-
|
389
|
-
|
390
|
-
|
391
|
-
|
392
|
-
|
393
|
-
|
394
|
-
|
395
|
-
|
410
|
+
# check PID_MAX
|
411
|
+
if $pidmax && (pid > $pidmax)
|
412
|
+
$logger.info ">>> Skip alignment between #{id1} and #{id2} having PID, #{pid}% greater than PID_MAX, #{$pidmax}"
|
413
|
+
next
|
414
|
+
end
|
415
|
+
|
416
|
+
s1.each_with_index do |aa1, pos|
|
417
|
+
aa1.upcase!
|
418
|
+
aa2 = s2[pos].upcase
|
419
|
+
|
420
|
+
if env_labels[id1][pos].include?("X")
|
421
|
+
$logger.info ">>> Substitutions from #{id1}-#{pos}-#{aa1} were masked"
|
396
422
|
next
|
397
423
|
end
|
398
424
|
|
399
|
-
|
400
|
-
|
401
|
-
$logger.info ">>> Skip alignment between #{id1} and #{id2} having PID, #{pid}% greater than PID_MAX, #{$pidmax}"
|
425
|
+
if env_labels[id2][pos].include?("X")
|
426
|
+
$logger.info ">>> Substitutions to #{id2}-#{pos}-#{aa2} were masked"
|
402
427
|
next
|
403
428
|
end
|
404
429
|
|
405
|
-
|
406
|
-
|
407
|
-
|
408
|
-
|
409
|
-
end
|
410
|
-
|
411
|
-
aa1.upcase!
|
412
|
-
aa2 = s2[pos].upcase
|
413
|
-
|
414
|
-
if !$amino_acids.include?(aa1)
|
415
|
-
$logger.warn "!!! #{id1}-#{pos}-#{aa1} is not a standard amino acid" unless aa1 == "-"
|
416
|
-
next
|
417
|
-
end
|
430
|
+
if !$amino_acids.include?(aa1)
|
431
|
+
$logger.warn "!!! #{id1}-#{pos}-#{aa1} is not a standard amino acid" unless aa1 == "-"
|
432
|
+
next
|
433
|
+
end
|
418
434
|
|
419
|
-
|
420
|
-
|
421
|
-
|
422
|
-
|
435
|
+
if !$amino_acids.include?(aa2)
|
436
|
+
$logger.warn "!!! #{id1}-#{pos}-#{aa2} is not a standard amino acid" unless aa2 == "-"
|
437
|
+
next
|
438
|
+
end
|
423
439
|
|
424
|
-
|
425
|
-
|
440
|
+
aa1 = (((disulphide.has_key?(id1) and disulphide[id1][pos] == "F") && (aa1 == "C")) ? "J" : aa1)
|
441
|
+
aa2 = (((disulphide.has_key?(id2) and disulphide[id2][pos] == "F") && (aa2 == "C")) ? "J" : aa2)
|
426
442
|
|
427
|
-
|
428
|
-
|
429
|
-
|
430
|
-
|
431
|
-
|
432
|
-
|
433
|
-
|
434
|
-
|
435
|
-
end
|
443
|
+
if $cst_features.empty?
|
444
|
+
$envs[env_labels[id1][pos]].increase_residue_count(aa2)
|
445
|
+
elsif (env_labels[id1][pos].split("").values_at(*$cst_features) == env_labels[id2][pos].split("").values_at(*$cst_features))
|
446
|
+
$envs[env_labels[id1][pos]].increase_residue_count(aa2)
|
447
|
+
else
|
448
|
+
$logger.debug "*** #{id1}-#{pos}-#{aa1} and #{id2}-#{pos}-#{aa2} have different symbols for constrained environment features each other"
|
449
|
+
next
|
450
|
+
end
|
436
451
|
|
437
|
-
|
452
|
+
grp_label = env_labels[id1][pos][1..-1]
|
438
453
|
|
439
|
-
|
440
|
-
|
441
|
-
|
442
|
-
else
|
443
|
-
$env_aa_obs[grp_label][aa1] = 1
|
444
|
-
end
|
454
|
+
if $env_aa_obs.has_key? grp_label
|
455
|
+
if $env_aa_obs[grp_label].has_key? aa1
|
456
|
+
$env_aa_obs[grp_label][aa1] += 1
|
445
457
|
else
|
446
|
-
$env_aa_obs[grp_label] = Hash.new(0)
|
447
458
|
$env_aa_obs[grp_label][aa1] = 1
|
448
459
|
end
|
460
|
+
else
|
461
|
+
$env_aa_obs[grp_label] = Hash.new(0)
|
462
|
+
$env_aa_obs[grp_label][aa1] = 1
|
463
|
+
end
|
449
464
|
|
450
|
-
|
451
|
-
|
452
|
-
|
453
|
-
|
454
|
-
|
465
|
+
if $aa_tot_obs.has_key? aa1
|
466
|
+
$aa_tot_obs[aa1] += 1
|
467
|
+
else
|
468
|
+
$aa_tot_obs[aa1] = 1
|
469
|
+
end
|
455
470
|
|
456
|
-
|
457
|
-
|
458
|
-
|
459
|
-
|
460
|
-
|
461
|
-
end
|
471
|
+
if aa1 != aa2
|
472
|
+
if $aa_mut_obs.has_key? aa1
|
473
|
+
$aa_mut_obs[aa1] += 1
|
474
|
+
else
|
475
|
+
$aa_mut_obs[aa1] = 1
|
462
476
|
end
|
463
|
-
$logger.debug "*** Add #{id1}-#{pos}-#{aa1} -> #{id2}-#{pos}-#{aa2} substituion for #{env_labels[id1][pos]}"
|
464
477
|
end
|
478
|
+
$logger.debug "*** Add #{id1}-#{pos}-#{aa1} -> #{id2}-#{pos}-#{aa2} substitution for #{env_labels[id1][pos]}"
|
465
479
|
end
|
466
480
|
end
|
467
481
|
end
|
468
|
-
|
469
|
-
|
470
|
-
|
471
|
-
|
472
|
-
|
473
|
-
|
474
|
-
|
475
|
-
|
476
|
-
|
477
|
-
|
478
|
-
|
479
|
-
|
480
|
-
|
481
|
-
|
482
|
-
|
483
|
-
|
484
|
-
|
485
|
-
|
486
|
-
|
482
|
+
end
|
483
|
+
else
|
484
|
+
# BLOSUM-like weighting
|
485
|
+
clusters = []
|
486
|
+
ali.each_pair { |i, s| clusters << [i] }
|
487
|
+
|
488
|
+
# a loop for single linkage clustering
|
489
|
+
begin
|
490
|
+
continue = false
|
491
|
+
0.upto(clusters.size - 2) do |i|
|
492
|
+
indexes = []
|
493
|
+
(i + 1).upto(clusters.size - 1) do |j|
|
494
|
+
found = false
|
495
|
+
clusters[i].each do |c1|
|
496
|
+
clusters[j].each do |c2|
|
497
|
+
if calc_pid(ali[c1], ali[c2]) >= $weight
|
498
|
+
indexes << j
|
499
|
+
found = true
|
500
|
+
break
|
487
501
|
end
|
488
|
-
break if found
|
489
502
|
end
|
503
|
+
break if found
|
490
504
|
end
|
505
|
+
end
|
491
506
|
|
492
|
-
|
493
|
-
|
494
|
-
|
495
|
-
|
496
|
-
|
497
|
-
|
498
|
-
end
|
499
|
-
clusters[i] = group
|
500
|
-
clusters.compact!
|
507
|
+
unless indexes.empty?
|
508
|
+
continue = true
|
509
|
+
group = clusters[i]
|
510
|
+
indexes.each do |k|
|
511
|
+
group = group.concat(clusters[k])
|
512
|
+
clusters[k] = nil
|
501
513
|
end
|
514
|
+
clusters[i] = group
|
515
|
+
clusters.compact!
|
502
516
|
end
|
503
|
-
end
|
504
|
-
|
505
|
-
clusters.combination(2).each do |cluster1, cluster2|
|
506
|
-
cluster1.each do |id1|
|
507
|
-
cluster2.each do |id2|
|
508
|
-
seq1 = ali[id1].split("")
|
509
|
-
seq2 = ali[id2].split("")
|
510
|
-
|
511
|
-
seq1.each_with_index do |aa1, pos|
|
512
|
-
if env_labels[id1][pos].include?("X")
|
513
|
-
$logger.debug "*** Substitutions from #{id1}-#{pos}-#{aa1} were masked"
|
514
|
-
next
|
515
|
-
end
|
517
|
+
end
|
518
|
+
end while(continue)
|
516
519
|
|
517
|
-
|
518
|
-
|
520
|
+
clusters.combination(2).each do |cluster1, cluster2|
|
521
|
+
cluster1.each do |id1|
|
522
|
+
cluster2.each do |id2|
|
523
|
+
seq1 = ali[id1].split("")
|
524
|
+
seq2 = ali[id2].split("")
|
519
525
|
|
520
|
-
|
521
|
-
|
522
|
-
|
523
|
-
end
|
526
|
+
seq1.each_with_index do |aa1, pos|
|
527
|
+
aa1.upcase!
|
528
|
+
aa2 = seq2[pos].upcase rescue next # should fix this in sane way!
|
524
529
|
|
525
|
-
|
526
|
-
|
527
|
-
|
528
|
-
|
530
|
+
if env_labels[id1][pos].include?("X")
|
531
|
+
$logger.debug "*** Substitutions from #{id1}-#{pos}-#{aa1} were masked"
|
532
|
+
next
|
533
|
+
end
|
529
534
|
|
530
|
-
|
531
|
-
|
532
|
-
|
533
|
-
|
534
|
-
|
535
|
-
|
536
|
-
|
537
|
-
|
538
|
-
|
539
|
-
|
540
|
-
|
541
|
-
|
542
|
-
|
543
|
-
|
544
|
-
|
545
|
-
|
546
|
-
|
547
|
-
|
535
|
+
if env_labels[id2][pos].include?("X")
|
536
|
+
$logger.debug "*** Substitutions to #{id2}-#{pos}-#{aa2} were masked"
|
537
|
+
next
|
538
|
+
end
|
539
|
+
|
540
|
+
if !$amino_acids.include?(aa1)
|
541
|
+
$logger.warn "!!! #{id1}-#{pos}-#{aa1} is not standard amino acid" unless aa1 == "-"
|
542
|
+
next
|
543
|
+
end
|
544
|
+
|
545
|
+
if !$amino_acids.include?(aa2)
|
546
|
+
$logger.warn "!!! #{id1}-#{pos}-#{aa2} is not standard amino acid" unless aa2 == "-"
|
547
|
+
next
|
548
|
+
end
|
549
|
+
|
550
|
+
aa1 = (((disulphide.has_key?(id1) and disulphide[id1][pos] == "F") && (aa1 == "C")) ? "J" : aa1)
|
551
|
+
aa2 = (((disulphide.has_key?(id2) and disulphide[id2][pos] == "F") && (aa2 == "C")) ? "J" : aa2)
|
552
|
+
size1 = cluster1.size
|
553
|
+
size2 = cluster2.size
|
554
|
+
obs1 = 1.0 / size1
|
555
|
+
obs2 = 1.0 / size2
|
556
|
+
|
557
|
+
if $cst_features.empty?
|
558
|
+
$envs[env_labels[id1][pos]].increase_residue_count(aa2, 1.0 / (size1 * size2))
|
559
|
+
$envs[env_labels[id2][pos]].increase_residue_count(aa1, 1.0 / (size1 * size2))
|
560
|
+
elsif (env_labels[id1][pos].split("").values_at(*$cst_features) == env_labels[id2][pos].split("").values_at(*$cst_features))
|
561
|
+
$envs[env_labels[id1][pos]].increase_residue_count(aa2, 1.0 / (size1 * size2))
|
562
|
+
$envs[env_labels[id2][pos]].increase_residue_count(aa1, 1.0 / (size1 * size2))
|
563
|
+
else
|
564
|
+
$logger.debug "*** #{id1}-#{pos}-#{aa1} and #{id2}-#{pos}-#{aa2} have different symbols for constrained environment features each other"
|
565
|
+
next
|
566
|
+
end
|
548
567
|
|
549
|
-
|
550
|
-
|
568
|
+
grp_label1 = env_labels[id1][pos][1..-1]
|
569
|
+
grp_label2 = env_labels[id2][pos][1..-1]
|
551
570
|
|
552
|
-
|
553
|
-
|
554
|
-
|
555
|
-
else
|
556
|
-
$env_aa_obs[grp_label1][aa1] = obs1
|
557
|
-
end
|
571
|
+
if $env_aa_obs.has_key? grp_label1
|
572
|
+
if $env_aa_obs[grp_label1].has_key? aa1
|
573
|
+
$env_aa_obs[grp_label1][aa1] += obs1
|
558
574
|
else
|
559
|
-
$env_aa_obs[grp_label1] = Hash.new(0.0)
|
560
575
|
$env_aa_obs[grp_label1][aa1] = obs1
|
561
576
|
end
|
577
|
+
else
|
578
|
+
$env_aa_obs[grp_label1] = Hash.new(0.0)
|
579
|
+
$env_aa_obs[grp_label1][aa1] = obs1
|
580
|
+
end
|
562
581
|
|
563
|
-
|
564
|
-
|
565
|
-
|
566
|
-
else
|
567
|
-
$env_aa_obs[grp_label2][aa2] = obs2
|
568
|
-
end
|
582
|
+
if $env_aa_obs.has_key? grp_label2
|
583
|
+
if $env_aa_obs[grp_label2].has_key? aa2
|
584
|
+
$env_aa_obs[grp_label2][aa2] += obs2
|
569
585
|
else
|
570
|
-
$env_aa_obs[grp_label2] = Hash.new(0.0)
|
571
586
|
$env_aa_obs[grp_label2][aa2] = obs2
|
572
587
|
end
|
588
|
+
else
|
589
|
+
$env_aa_obs[grp_label2] = Hash.new(0.0)
|
590
|
+
$env_aa_obs[grp_label2][aa2] = obs2
|
591
|
+
end
|
573
592
|
|
574
|
-
|
575
|
-
|
576
|
-
|
577
|
-
|
578
|
-
|
593
|
+
if $aa_tot_obs.has_key? aa1
|
594
|
+
$aa_tot_obs[aa1] += obs1
|
595
|
+
else
|
596
|
+
$aa_tot_obs[aa1] = obs1
|
597
|
+
end
|
579
598
|
|
580
|
-
|
581
|
-
|
599
|
+
if $aa_tot_obs.has_key? aa2
|
600
|
+
$aa_tot_obs[aa2] += obs2
|
601
|
+
else
|
602
|
+
$aa_tot_obs[aa2] = obs2
|
603
|
+
end
|
604
|
+
|
605
|
+
if aa1 != aa2
|
606
|
+
if $aa_mut_obs.has_key? aa1
|
607
|
+
$aa_mut_obs[aa1] += obs1
|
582
608
|
else
|
583
|
-
$
|
609
|
+
$aa_mut_obs[aa1] = obs1
|
584
610
|
end
|
585
|
-
|
586
|
-
|
587
|
-
|
588
|
-
|
589
|
-
else
|
590
|
-
$aa_mut_obs[aa1] = obs1
|
591
|
-
end
|
592
|
-
if $aa_mut_obs.has_key? aa2
|
593
|
-
$aa_mut_obs[aa2] += obs2
|
594
|
-
else
|
595
|
-
$aa_mut_obs[aa2] = obs2
|
596
|
-
end
|
611
|
+
if $aa_mut_obs.has_key? aa2
|
612
|
+
$aa_mut_obs[aa2] += obs2
|
613
|
+
else
|
614
|
+
$aa_mut_obs[aa2] = obs2
|
597
615
|
end
|
598
|
-
|
599
|
-
$logger.debug "*** Add #{id1}-#{pos}-#{aa1} -> #{id2}-#{pos}-#{aa2} substituion for #{env_labels[id1][pos]}"
|
600
|
-
$logger.debug "*** Add #{id2}-#{pos}-#{aa2} -> #{id1}-#{pos}-#{aa1} substituion for #{env_labels[id2][pos]}"
|
601
616
|
end
|
617
|
+
|
618
|
+
$logger.debug "*** Add #{id1}-#{pos}-#{aa1} -> #{id2}-#{pos}-#{aa2} substitution for #{env_labels[id1][pos]}"
|
619
|
+
$logger.debug "*** Add #{id2}-#{pos}-#{aa2} -> #{id1}-#{pos}-#{aa1} substitution for #{env_labels[id2][pos]}"
|
602
620
|
end
|
603
621
|
end
|
604
622
|
end
|
605
|
-
end
|
606
|
-
end #
|
623
|
+
end
|
624
|
+
end # if !$nosmooth
|
625
|
+
end
|
607
626
|
|
608
|
-
|
609
|
-
|
627
|
+
# print out default header
|
628
|
+
$outfh.puts <<HEADER
|
629
|
+
#
|
610
630
|
# Environment-specific amino acid substitution matrices
|
611
631
|
# Creator: egor version #{Egor::VERSION}
|
612
632
|
# Creation Date: #{Time.now.strftime("%d/%m/%Y %H:%M")}
|
@@ -616,9 +636,9 @@ Options:
|
|
616
636
|
#
|
617
637
|
HEADER
|
618
638
|
|
619
|
-
|
639
|
+
$env_features[1..-1].each { |e| $outfh.puts "# #{e}" }
|
620
640
|
|
621
|
-
|
641
|
+
$outfh.puts <<HEADER
|
622
642
|
#
|
623
643
|
# (read in from #{$classdef})
|
624
644
|
#
|
@@ -632,164 +652,164 @@ HEADER
|
|
632
652
|
#
|
633
653
|
HEADER
|
634
654
|
|
655
|
+
if $noweight
|
656
|
+
$outfh.puts "# Weighting scheme: none"
|
657
|
+
else
|
658
|
+
$outfh.puts "# Weighting scheme: clustering at PID #{$weight} level"
|
659
|
+
end
|
660
|
+
|
661
|
+
# calculate amino acid frequencies and mutabilities, and
|
662
|
+
# print them as default statistics in the header part
|
663
|
+
ala_factor = 100.0 * $aa_tot_obs["A"] / $aa_mut_obs["A"].to_f
|
664
|
+
$tot_aa = $aa_tot_obs.values.sum
|
665
|
+
|
666
|
+
$outfh.puts "#"
|
667
|
+
$outfh.puts "# Total amino acid frequencies:\n"
|
668
|
+
$outfh.puts "# %-3s %9s %9s %5s %8s %8s" % %w[RES TOT_OBS MUT_OBS MUTB REL_MUTB REL_FRQ]
|
669
|
+
|
670
|
+
$amino_acids.each do |res|
|
671
|
+
$aa_mutb[res] = $aa_mut_obs[res] / $aa_tot_obs[res].to_f
|
672
|
+
$aa_rel_mutb[res] = $aa_mutb[res] * ala_factor
|
673
|
+
$aa_rel_freq[res] = $aa_tot_obs[res] / $tot_aa.to_f
|
674
|
+
end
|
675
|
+
|
676
|
+
$amino_acids.each do |res|
|
635
677
|
if $noweight
|
636
|
-
$outfh.puts "#
|
678
|
+
$outfh.puts "# %-3s %9d %9d %5.2f %8d %8.4f" %
|
679
|
+
[res, $aa_tot_obs[res], $aa_mut_obs[res], $aa_mutb[res], $aa_rel_mutb[res], $aa_rel_freq[res]]
|
637
680
|
else
|
638
|
-
$outfh.puts "#
|
681
|
+
$outfh.puts "# %-3s %9.2f %9.2f %5.2f %8d %8.4f" %
|
682
|
+
[res, $aa_tot_obs[res], $aa_mut_obs[res], $aa_mutb[res], $aa_rel_mutb[res], $aa_rel_freq[res]]
|
639
683
|
end
|
684
|
+
end
|
640
685
|
|
641
|
-
# calculate amino acid frequencies and mutabilities, and
|
642
|
-
# print them as default statistics in the header part
|
643
|
-
ala_factor = 100.0 * $aa_tot_obs["A"] / $aa_mut_obs["A"].to_f
|
644
|
-
$tot_aa = $aa_tot_obs.values.sum
|
645
686
|
|
646
|
-
|
647
|
-
|
648
|
-
|
687
|
+
# Part 5.
|
688
|
+
#
|
689
|
+
# Calculating substitution frequency tables
|
690
|
+
#
|
649
691
|
|
650
|
-
|
651
|
-
|
652
|
-
|
653
|
-
|
692
|
+
# calculating probabilities for each environment
|
693
|
+
$envs.values.each do |e|
|
694
|
+
if e.freq_array.sum != 0
|
695
|
+
e.prob_array = 100.0 * e.freq_array / e.freq_array.sum
|
654
696
|
end
|
697
|
+
end
|
655
698
|
|
656
|
-
|
657
|
-
|
658
|
-
$outfh.puts "# %-3s %9d %9d %5.2f %8d %8.4f" %
|
659
|
-
[res, $aa_tot_obs[res], $aa_mut_obs[res], $aa_mutb[res], $aa_rel_mutb[res], $aa_rel_freq[res]]
|
660
|
-
else
|
661
|
-
$outfh.puts "# %-3s %9.2f %9.2f %5.2f %8d %8.4f" %
|
662
|
-
[res, $aa_tot_obs[res], $aa_mut_obs[res], $aa_mutb[res], $aa_rel_mutb[res], $aa_rel_freq[res]]
|
663
|
-
end
|
664
|
-
end
|
699
|
+
# count raw frequencies
|
700
|
+
$tot_freq_mat = ($noweight ? NMatrix.int($amino_acids.size,$amino_acids.size) : NMatrix.float($amino_acids.size,$amino_acids.size))
|
665
701
|
|
702
|
+
# for each combination of environment features
|
703
|
+
env_groups = $envs.values.group_by { |env| env.label[1..-1] }
|
666
704
|
|
667
|
-
|
668
|
-
#
|
669
|
-
|
670
|
-
|
705
|
+
env_groups.to_a.sort_by { |env_group|
|
706
|
+
# a bit clumsy sorting here...
|
707
|
+
env_group[0].split("").map_with_index { |l, i|
|
708
|
+
$env_features[i + 1].labels.index(l)
|
709
|
+
}
|
710
|
+
}.each_with_index do |group, group_no|
|
711
|
+
grp_freq_mat = ($noweight ? NMatrix.int($amino_acids.size,$amino_acids.size) : NMatrix.float($amino_acids.size,$amino_acids.size))
|
671
712
|
|
672
|
-
|
673
|
-
|
674
|
-
|
675
|
-
|
676
|
-
|
713
|
+
$amino_acids.each_with_index do |aa, ai|
|
714
|
+
freq_array = group[1].find { |e| e.label.start_with?(aa) }.freq_array
|
715
|
+
0.upto($amino_acids.size - 1) { |j| grp_freq_mat[ai, j] = freq_array[j] }
|
716
|
+
end
|
717
|
+
|
718
|
+
$tot_freq_mat += grp_freq_mat
|
719
|
+
|
720
|
+
if $output == 0
|
721
|
+
$outfh.puts ">#{group[0]} #{group_no}"
|
722
|
+
$outfh.puts grp_freq_mat.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)
|
677
723
|
end
|
724
|
+
end
|
725
|
+
|
726
|
+
if $output == 0
|
727
|
+
$outfh.puts ">Total"
|
728
|
+
$outfh.puts $tot_freq_mat.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)
|
729
|
+
exit 0
|
730
|
+
end
|
731
|
+
|
678
732
|
|
679
|
-
|
680
|
-
|
733
|
+
# Part 6.
|
734
|
+
#
|
735
|
+
# Calculating substitution probability tables
|
736
|
+
#
|
737
|
+
|
738
|
+
if $output == 1
|
739
|
+
$outfh.puts <<HEADER
|
740
|
+
#
|
741
|
+
# Each column (j) represents the probability distribution for the
|
742
|
+
# likelihood of acceptance of a mutational event by a residue type j in
|
743
|
+
# a particular structural environment (specified after >) leading to
|
744
|
+
# any other residue type (i) and sums up to 100.
|
745
|
+
#
|
746
|
+
HEADER
|
747
|
+
end
|
748
|
+
|
749
|
+
if ($output > 0) && $nosmooth
|
750
|
+
# Probability matrices
|
751
|
+
$tot_prob_mat = NMatrix.float($amino_acids.size, $amino_acids.size)
|
681
752
|
|
682
753
|
# for each combination of environment features
|
683
754
|
env_groups = $envs.values.group_by { |env| env.label[1..-1] }
|
684
|
-
|
685
755
|
env_groups.to_a.sort_by { |env_group|
|
686
756
|
# a bit clumsy sorting here...
|
687
757
|
env_group[0].split("").map_with_index { |l, i|
|
688
758
|
$env_features[i + 1].labels.index(l)
|
689
759
|
}
|
690
760
|
}.each_with_index do |group, group_no|
|
691
|
-
|
761
|
+
grp_prob_mat = NMatrix.float($amino_acids.size,$amino_acids.size)
|
692
762
|
|
693
763
|
$amino_acids.each_with_index do |aa, ai|
|
694
|
-
|
695
|
-
0.upto(
|
764
|
+
prob_array = group[1].find { |e| e.label.start_with?(aa) }.prob_array
|
765
|
+
0.upto($amino_acids.size - 1) { |j| grp_prob_mat[ai, j] = prob_array[j] }
|
696
766
|
end
|
697
767
|
|
698
|
-
$
|
768
|
+
$tot_prob_mat += grp_prob_mat
|
699
769
|
|
700
|
-
if $output ==
|
770
|
+
if ($output == 1)
|
701
771
|
$outfh.puts ">#{group[0]} #{group_no}"
|
702
|
-
$outfh.puts
|
772
|
+
$outfh.puts grp_prob_mat.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)
|
703
773
|
end
|
704
774
|
end
|
705
775
|
|
706
|
-
if $output ==
|
776
|
+
if ($output == 1)
|
707
777
|
$outfh.puts ">Total"
|
708
|
-
$outfh.puts $
|
778
|
+
$outfh.puts $tot_prob_mat.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)
|
779
|
+
$outfh.close
|
709
780
|
exit 0
|
710
781
|
end
|
782
|
+
end
|
711
783
|
|
712
|
-
|
713
|
-
|
784
|
+
# for smoothing...
|
785
|
+
if ($output > 0) && !$nosmooth
|
714
786
|
#
|
715
|
-
#
|
787
|
+
# p1 probability
|
716
788
|
#
|
717
|
-
|
718
|
-
|
719
|
-
|
720
|
-
|
721
|
-
|
722
|
-
|
723
|
-
|
724
|
-
|
725
|
-
#
|
726
|
-
|
727
|
-
|
728
|
-
|
729
|
-
|
730
|
-
|
731
|
-
$
|
732
|
-
|
733
|
-
# for each combination of environment features
|
734
|
-
env_groups = $envs.values.group_by { |env| env.label[1..-1] }
|
735
|
-
env_groups.to_a.sort_by { |env_group|
|
736
|
-
# a bit clumsy sorting here...
|
737
|
-
env_group[0].split("").map_with_index { |l, i|
|
738
|
-
$env_features[i + 1].labels.index(l)
|
739
|
-
}
|
740
|
-
}.each_with_index do |group, group_no|
|
741
|
-
grp_prob_mat = NMatrix.float(21,21)
|
742
|
-
|
743
|
-
$amino_acids.each_with_index do |aa, ai|
|
744
|
-
prob_array = group[1].find { |e| e.label.start_with?(aa) }.prob_array
|
745
|
-
0.upto(20) { |j| grp_prob_mat[ai, j] = prob_array[j] }
|
746
|
-
end
|
747
|
-
|
748
|
-
$tot_prob_mat += grp_prob_mat
|
749
|
-
|
750
|
-
if ($output == 1)
|
751
|
-
$outfh.puts ">#{group[0]} #{group_no}"
|
752
|
-
$outfh.puts grp_prob_mat.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)
|
753
|
-
end
|
754
|
-
end
|
755
|
-
|
756
|
-
if ($output == 1)
|
757
|
-
$outfh.puts ">Total"
|
758
|
-
$outfh.puts $tot_prob_mat.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)
|
759
|
-
$outfh.close
|
760
|
-
exit 0
|
761
|
-
end
|
789
|
+
p1 = NArray.float($amino_acids.size)
|
790
|
+
a0 = NArray.float($amino_acids.size).fill(1.0 / $amino_acids.size)
|
791
|
+
big_N = $tot_aa.to_f
|
792
|
+
small_n = $amino_acids.size.to_f
|
793
|
+
omega1 = 1.0 / (1 + big_N / ($sigma * small_n))
|
794
|
+
omega2 = 1.0 - omega1
|
795
|
+
|
796
|
+
if $smooth == :partial
|
797
|
+
# for partial smoothing, p1 probability is not smoothed!
|
798
|
+
0.upto($amino_acids.size - 1) { |i| p1[i] = 100.0 * $aa_rel_freq[$amino_acids[i]] }
|
799
|
+
$smooth_prob[1] = p1
|
800
|
+
else
|
801
|
+
# for full smoothing, p1 probability is smoothed
|
802
|
+
0.upto($amino_acids.size - 1) { |i| p1[i] = 100.0 * (omega1 * a0[i] + omega2 * $aa_rel_freq[$amino_acids[i]]) }
|
803
|
+
$smooth_prob[1] = p1
|
762
804
|
end
|
763
805
|
|
764
|
-
#
|
765
|
-
|
766
|
-
|
767
|
-
|
768
|
-
#
|
769
|
-
p1 = NArray.float(21)
|
770
|
-
a0 = NArray.float(21).fill(1 / 21.0)
|
771
|
-
big_N = $tot_aa.to_f
|
772
|
-
small_n = 21.0
|
773
|
-
omega1 = 1.0 / (1 + big_N / ($sigma * small_n))
|
774
|
-
omega2 = 1.0 - omega1
|
775
|
-
|
776
|
-
if $smooth == :partial
|
777
|
-
# for partial smoothing, p1 probability is not smoothed!
|
778
|
-
0.upto(20) { |i| p1[i] = 100.0 * $aa_rel_freq[$amino_acids[i]] }
|
779
|
-
$smooth_prob[1] = p1
|
780
|
-
else
|
781
|
-
# for full smoothing, p1 probability is smoothed
|
782
|
-
0.upto(20) { |i| p1[i] = 100.0 * (omega1 * a0[i] + omega2 * $aa_rel_freq[$amino_acids[i]]) }
|
783
|
-
$smooth_prob[1] = p1
|
784
|
-
end
|
785
|
-
|
786
|
-
#
|
787
|
-
# p2 and above
|
788
|
-
#
|
789
|
-
env_labels = $env_features.map_with_index {|ef, ei| ef.labels.map { |l| "#{ei}#{l}" } }
|
806
|
+
#
|
807
|
+
# p2 and above
|
808
|
+
#
|
809
|
+
env_labels = $env_features.map_with_index {|ef, ei| ef.labels.map { |l| "#{ei}#{l}" } }
|
790
810
|
|
791
|
-
|
792
|
-
|
811
|
+
if $smooth == :partial
|
812
|
+
$outfh.puts <<HEADER
|
793
813
|
#
|
794
814
|
# Partial Smoothing:
|
795
815
|
#
|
@@ -813,106 +833,107 @@ HEADER
|
|
813
833
|
# Weights (omegas) are calculated as in Topham et al. 1993)
|
814
834
|
#
|
815
835
|
# sigma value used is: 5.00
|
836
|
+
#
|
816
837
|
HEADER
|
817
|
-
|
818
|
-
|
819
|
-
|
820
|
-
|
821
|
-
|
822
|
-
|
823
|
-
|
824
|
-
|
825
|
-
|
826
|
-
|
827
|
-
|
828
|
-
|
829
|
-
|
838
|
+
1.upto($env_features.size) do |ci|
|
839
|
+
# for partial smoothing, only P1 ~ P3, and Pn are considered
|
840
|
+
next if (ci > 2) && (ci < $env_features.size)
|
841
|
+
|
842
|
+
env_labels.combination(ci) do |c1|
|
843
|
+
Enumerable.cart_prod(*c1).each do |labels|
|
844
|
+
pattern = "." * $env_features.size
|
845
|
+
|
846
|
+
labels.each do |label|
|
847
|
+
i = label[0].chr.to_i
|
848
|
+
l = label[1].chr
|
849
|
+
pattern[i] = l
|
850
|
+
end
|
830
851
|
|
831
|
-
|
832
|
-
|
833
|
-
|
834
|
-
|
852
|
+
if pattern =~ /^\./
|
853
|
+
$logger.debug "*** Skipped environment, #{pattern}, for partial smoothing"
|
854
|
+
next
|
855
|
+
end
|
835
856
|
|
836
|
-
|
837
|
-
|
838
|
-
|
839
|
-
|
840
|
-
|
841
|
-
|
842
|
-
|
843
|
-
|
844
|
-
|
845
|
-
|
846
|
-
|
847
|
-
|
848
|
-
|
849
|
-
|
850
|
-
|
851
|
-
|
852
|
-
|
853
|
-
|
854
|
-
|
855
|
-
|
856
|
-
|
857
|
-
|
858
|
-
|
859
|
-
|
860
|
-
|
861
|
-
|
862
|
-
|
863
|
-
|
864
|
-
|
865
|
-
|
866
|
-
|
867
|
-
|
868
|
-
|
869
|
-
|
870
|
-
|
871
|
-
|
872
|
-
|
873
|
-
|
874
|
-
|
875
|
-
|
876
|
-
|
877
|
-
|
878
|
-
|
879
|
-
|
880
|
-
|
881
|
-
|
882
|
-
|
883
|
-
|
857
|
+
# get environmetns, frequencies, and probabilities
|
858
|
+
envs = $envs.values.select { |env| env.label.match(pattern.to_re) }
|
859
|
+
freq_arr = envs.inject(NArray.float($amino_acids.size)) { |sum, env| sum + env.freq_array }
|
860
|
+
prob_arr = NArray.float($amino_acids.size)
|
861
|
+
0.upto($amino_acids.size - 1) { |i| prob_arr[i] = (freq_arr[i] == 0 ? 0 : freq_arr[i] / freq_arr.sum.to_f) }
|
862
|
+
|
863
|
+
# # assess whether a residue type j is compatible with a particular combination of structural features
|
864
|
+
# # corrections for non-zero colum vector phenomenon by switching the smoothing procedure off as below
|
865
|
+
# if ci == $env_features.size
|
866
|
+
# aa_label = labels.find { |l| l.match(/^0/) }[1].chr
|
867
|
+
# sub_pattern = "." * $env_features.size
|
868
|
+
# sub_pattern[0] = aa_label
|
869
|
+
# sub_freq_sum = 0
|
870
|
+
#
|
871
|
+
# labels[1..-1].each do |label|
|
872
|
+
# next if label.start_with?("0")
|
873
|
+
# i = label[0].chr.to_i
|
874
|
+
# l = label[1].chr
|
875
|
+
# sub_pattern[i] = l
|
876
|
+
# sub_envs = $envs.values.select { |env| env.label.match(pattern.to_re) }
|
877
|
+
# sub_freq_arr = sub_envs.inject(NArray.float($amino_acids.size)) { |sum, env| sum + env.freq_array }
|
878
|
+
# sub_freq_sum += sub_freq_arr.sum
|
879
|
+
# end
|
880
|
+
#
|
881
|
+
# if sub_freq_sum == 0
|
882
|
+
# if $smooth_prob.has_key?(ci + 1)
|
883
|
+
# $smooth_prob[ci + 1][labels.to_set] = prob_arr
|
884
|
+
# else
|
885
|
+
# $smooth_prob[ci + 1] = {}
|
886
|
+
# $smooth_prob[ci + 1][labels.to_set] = prob_arr
|
887
|
+
# end
|
888
|
+
# $logger.warn "!!! Smoothing procedure is off for the environment feature combination, #{pattern}"
|
889
|
+
# next
|
890
|
+
# end
|
891
|
+
# end
|
892
|
+
|
893
|
+
# collect priors if ci > 1
|
894
|
+
priors = []
|
895
|
+
|
896
|
+
if ci == 2
|
897
|
+
labels.combination(1).select { |c2| c2[0].start_with?("0") }.each { |c3|
|
898
|
+
priors << $smooth_prob[2][c3.to_set]
|
899
|
+
}
|
900
|
+
elsif ci == $env_features.size
|
901
|
+
labels.combination(2).select { |c2| c2[0].start_with?("0") || c2[1].start_with?("0") }.each { |c3|
|
902
|
+
priors << $smooth_prob[3][c3.to_set]
|
903
|
+
}
|
904
|
+
end
|
884
905
|
|
885
|
-
|
886
|
-
|
887
|
-
|
888
|
-
|
889
|
-
|
890
|
-
|
891
|
-
|
892
|
-
|
893
|
-
|
894
|
-
|
895
|
-
|
896
|
-
|
897
|
-
|
898
|
-
|
899
|
-
|
900
|
-
|
901
|
-
|
902
|
-
|
903
|
-
|
904
|
-
|
905
|
-
|
906
|
-
|
907
|
-
|
908
|
-
|
909
|
-
|
910
|
-
end
|
906
|
+
# entropy based weighting priors
|
907
|
+
entropy_max = Math::log($amino_acids.size)
|
908
|
+
entropies = priors.map { |prior| -1.0 * prior.to_a.inject(0.0) { |s, p| p == 0.0 ? s - 1 : s + p * Math::log(p) } }
|
909
|
+
mod_entropies = entropies.map_with_index { |entropy, i| (entropy_max - entropies[i]) / entropy_max }
|
910
|
+
weights = mod_entropies.map { |mod_entropy| mod_entropy / mod_entropies.sum }
|
911
|
+
weighted_priors = priors.map_with_index { |prior, i| prior * weights[i] }.sum
|
912
|
+
|
913
|
+
# smoothing step
|
914
|
+
smooth_prob_arr = NArray.float($amino_acids.size)
|
915
|
+
big_N = freq_arr.sum.to_f
|
916
|
+
small_n = $amino_acids.size.to_f
|
917
|
+
omega1 = 1.0 / (1 + big_N / ($sigma * small_n))
|
918
|
+
omega2 = 1.0 - omega1
|
919
|
+
0.upto($amino_acids.size - 1) { |i| smooth_prob_arr[i] = 100.0 * (omega1 * weighted_priors[i] + omega2 * prob_arr[i]) }
|
920
|
+
|
921
|
+
# normalization step
|
922
|
+
smooth_prob_arr_sum = smooth_prob_arr.sum
|
923
|
+
0.upto($amino_acids.size - 1) { |i| smooth_prob_arr[i] = 100.0 * (smooth_prob_arr[i] / smooth_prob_arr_sum) }
|
924
|
+
|
925
|
+
# store smoothed probabilties in a hash using a set of envrionment labels as a key
|
926
|
+
if !$smooth_prob.has_key?(ci + 1)
|
927
|
+
$smooth_prob[ci + 1] = {}
|
928
|
+
$smooth_prob[ci + 1][labels.to_set] = smooth_prob_arr
|
929
|
+
else
|
930
|
+
$smooth_prob[ci + 1][labels.to_set] = smooth_prob_arr
|
911
931
|
end
|
912
932
|
end
|
913
933
|
end
|
914
|
-
|
915
|
-
|
934
|
+
end
|
935
|
+
else
|
936
|
+
$outfh.puts <<HEADER
|
916
937
|
#
|
917
938
|
# Full Smoothing:
|
918
939
|
#
|
@@ -939,193 +960,194 @@ HEADER
|
|
939
960
|
# Weights (omegas) are calculated as in Topham et al. 1993)
|
940
961
|
#
|
941
962
|
# sigma value used is: 5.00
|
963
|
+
#
|
942
964
|
HEADER
|
943
|
-
|
944
|
-
|
945
|
-
|
946
|
-
|
947
|
-
|
948
|
-
|
949
|
-
|
950
|
-
|
951
|
-
|
952
|
-
|
965
|
+
# full smooting
|
966
|
+
1.upto($env_features.size) do |ci|
|
967
|
+
env_labels.combination(ci) do |c1|
|
968
|
+
Enumerable.cart_prod(*c1).each do |labels|
|
969
|
+
pattern = "." * $env_features.size
|
970
|
+
labels.each do |label|
|
971
|
+
j = label[0].chr.to_i
|
972
|
+
l = label[1].chr
|
973
|
+
pattern[j] = l
|
974
|
+
end
|
953
975
|
|
954
|
-
|
955
|
-
|
956
|
-
|
957
|
-
|
958
|
-
|
976
|
+
# get environmetns, frequencies, and probabilities
|
977
|
+
envs = $envs.values.select { |env| env.label.match(pattern.to_re) }
|
978
|
+
freq_arr = envs.inject(NArray.float($amino_acids.size)) { |sum, env| sum + env.freq_array }
|
979
|
+
prob_arr = NArray.float($amino_acids.size)
|
980
|
+
0.upto($amino_acids.size - 1) { |i| prob_arr[i] = freq_arr[i] == 0 ? 0 : freq_arr[i] / freq_arr.sum.to_f }
|
959
981
|
|
960
|
-
|
961
|
-
|
962
|
-
|
963
|
-
|
964
|
-
|
965
|
-
|
966
|
-
|
982
|
+
# collect priors
|
983
|
+
priors = []
|
984
|
+
if ci > 1
|
985
|
+
labels.combination(ci - 1).each { |c2| priors << $smooth_prob[ci][c2.to_set] }
|
986
|
+
else
|
987
|
+
priors << $smooth_prob[1]
|
988
|
+
end
|
967
989
|
|
968
|
-
|
969
|
-
|
970
|
-
|
971
|
-
|
972
|
-
|
973
|
-
|
974
|
-
|
975
|
-
|
976
|
-
|
977
|
-
|
978
|
-
|
979
|
-
|
980
|
-
|
981
|
-
|
982
|
-
|
983
|
-
|
984
|
-
|
985
|
-
|
986
|
-
|
987
|
-
|
988
|
-
|
989
|
-
|
990
|
-
|
991
|
-
|
992
|
-
|
993
|
-
end
|
990
|
+
# entropy based weighting priors
|
991
|
+
entropy_max = Math::log($amino_acids.size)
|
992
|
+
entropies = priors.map do |prior|
|
993
|
+
(entropy_max + prior.to_a.inject(0.0) { |s, p| s + p * Math::log(p) }) / entropy_max
|
994
|
+
end
|
995
|
+
weighted_priors = priors.map_with_index { |p, i| p * entropies[i] / entropies.sum }.sum
|
996
|
+
|
997
|
+
# smoothing step
|
998
|
+
smooth_prob_arr = NArray.float($amino_acids.size)
|
999
|
+
big_N = freq_arr.sum.to_f
|
1000
|
+
small_n = $amino_acids.size.to_f
|
1001
|
+
omega1 = 1.0 / (1 + big_N / ($sigma * small_n))
|
1002
|
+
omega2 = 1.0 - omega1
|
1003
|
+
0.upto($amino_acids.size - 1) { |i| smooth_prob_arr[i] = 100.0 * (omega1 * weighted_priors[i] + omega2 * prob_arr[i]) }
|
1004
|
+
|
1005
|
+
# normalization step
|
1006
|
+
smooth_prob_arr_sum = smooth_prob_arr.sum
|
1007
|
+
0.upto($amino_acids.size - 1) { |i| smooth_prob_arr[i] = 100.0 * (smooth_prob_arr[i] / smooth_prob_arr_sum) }
|
1008
|
+
|
1009
|
+
# store smoothed probabilties in a hash using a set of envrionment labels as a key
|
1010
|
+
if !$smooth_prob.has_key?(ci + 1)
|
1011
|
+
$smooth_prob[ci + 1] = {}
|
1012
|
+
$smooth_prob[ci + 1][labels.to_set] = smooth_prob_arr
|
1013
|
+
else
|
1014
|
+
$smooth_prob[ci + 1][labels.to_set] = smooth_prob_arr
|
994
1015
|
end
|
995
1016
|
end
|
996
1017
|
end
|
997
1018
|
end
|
1019
|
+
end
|
998
1020
|
|
999
|
-
|
1000
|
-
|
1001
|
-
|
1002
|
-
# for a total substitution probability matrix
|
1003
|
-
$tot_prob_mat = NMatrix.float(21,21)
|
1004
|
-
|
1005
|
-
# grouping environments by its environment labels but amino acid label
|
1006
|
-
env_groups = $envs.values.group_by { |env| env.label[1..-1] }
|
1021
|
+
# updating smoothed probability array for each envrionment
|
1022
|
+
$envs.values.each { |e| e.smooth_prob_array = $smooth_prob[$env_features.size + 1][e.label_set] }
|
1007
1023
|
|
1008
|
-
|
1009
|
-
|
1010
|
-
# a bit clumsy sorting here...
|
1011
|
-
env_group[0].split("").map_with_index { |l, i|
|
1012
|
-
$env_features[i + 1].labels.index(l)
|
1013
|
-
}
|
1014
|
-
}.each_with_index do |group, group_no|
|
1015
|
-
# calculating 21X21 substitution probability matrix for each envrionment
|
1016
|
-
grp_prob_mat = NMatrix.float(21,21)
|
1024
|
+
# for a total substitution probability matrix
|
1025
|
+
$tot_prob_mat = NMatrix.float($amino_acids.size,$amino_acids.size)
|
1017
1026
|
|
1018
|
-
|
1019
|
-
|
1020
|
-
0.upto(20) { |j| grp_prob_mat[ai, j] = smooth_prob_array[j] }
|
1021
|
-
end
|
1027
|
+
# grouping environments by its environment labels but amino acid label
|
1028
|
+
env_groups = $envs.values.group_by { |env| env.label[1..-1] }
|
1022
1029
|
|
1023
|
-
|
1030
|
+
# sorting environments and build 21X21 substitution matrices
|
1031
|
+
env_groups.to_a.sort_by { |env_group|
|
1032
|
+
# a bit clumsy sorting here...
|
1033
|
+
env_group[0].split("").map_with_index { |l, i|
|
1034
|
+
$env_features[i + 1].labels.index(l)
|
1035
|
+
}
|
1036
|
+
}.each_with_index do |group, group_no|
|
1037
|
+
# calculating 21X21 substitution probability matrix for each envrionment
|
1038
|
+
grp_prob_mat = NMatrix.float($amino_acids.size,$amino_acids.size)
|
1024
1039
|
|
1025
|
-
|
1026
|
-
|
1027
|
-
|
1028
|
-
end
|
1040
|
+
$amino_acids.each_with_index do |aa, ai|
|
1041
|
+
smooth_prob_array = group[1].find { |e| e.label.start_with?(aa) }.smooth_prob_array
|
1042
|
+
0.upto($amino_acids.size - 1) { |j| grp_prob_mat[ai, j] = smooth_prob_array[j] }
|
1029
1043
|
end
|
1030
1044
|
|
1031
|
-
$tot_prob_mat
|
1045
|
+
$tot_prob_mat += grp_prob_mat
|
1032
1046
|
|
1033
1047
|
if $output == 1
|
1034
|
-
$outfh.puts "
|
1035
|
-
$outfh.puts
|
1036
|
-
$outfh.close
|
1037
|
-
exit 0
|
1048
|
+
$outfh.puts ">#{group[0]} #{group_no}"
|
1049
|
+
$outfh.puts grp_prob_mat.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)
|
1038
1050
|
end
|
1051
|
+
end
|
1039
1052
|
|
1053
|
+
$tot_prob_mat /= env_groups.size
|
1040
1054
|
|
1041
|
-
|
1042
|
-
|
1043
|
-
|
1044
|
-
|
1045
|
-
|
1046
|
-
|
1055
|
+
if $output == 1
|
1056
|
+
$outfh.puts ">Total"
|
1057
|
+
$outfh.puts $tot_prob_mat.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)
|
1058
|
+
$outfh.close
|
1059
|
+
exit 0
|
1060
|
+
end
|
1061
|
+
|
1062
|
+
|
1063
|
+
# Part 7.
|
1064
|
+
#
|
1065
|
+
# Calculating log odds ratio scoring matrices
|
1066
|
+
#
|
1067
|
+
if $output == 2
|
1068
|
+
$outfh.puts <<HEADER
|
1047
1069
|
#
|
1048
1070
|
# The probabilities were then divided by the background probabilities
|
1049
1071
|
HEADER
|
1050
|
-
|
1051
|
-
|
1072
|
+
if $penv
|
1073
|
+
$outfh.puts <<HEADER
|
1052
1074
|
# which were derived from the environment-independent amino acid frequencies.
|
1053
1075
|
# ^^^^^^^^^^^^^^^^^^^^^^^
|
1054
1076
|
HEADER
|
1055
|
-
|
1056
|
-
|
1077
|
+
else
|
1078
|
+
$outfh.puts <<HEADER
|
1057
1079
|
# which were derived from the environment-dependent amino acid frequencies.
|
1058
1080
|
# ^^^^^^^^^^^^^^^^^^^^^
|
1059
1081
|
HEADER
|
1060
|
-
|
1082
|
+
end
|
1061
1083
|
|
1062
|
-
|
1063
|
-
|
1064
|
-
|
1065
|
-
|
1066
|
-
# grouping environments by its environment labels but amino acid label
|
1067
|
-
env_groups = $envs.values.group_by { |env| env.label[1..-1] }
|
1068
|
-
|
1069
|
-
# sorting environments and build 21X21 substitution matrices
|
1070
|
-
env_groups.to_a.sort_by { |env_group|
|
1071
|
-
# a bit clumsy sorting here...
|
1072
|
-
env_group[0].split("").map_with_index { |l, i|
|
1073
|
-
$env_features[i + 1].labels.index(l)
|
1074
|
-
}
|
1075
|
-
}.each_with_index do |group, group_no|
|
1076
|
-
# calculating 21X21 substitution probability matrix for each envrionment
|
1077
|
-
grp_label = group[0]
|
1078
|
-
grp_envs = group[1]
|
1079
|
-
grp_logo_mat = $cys ? NMatrix.float(21, 22) : NMatrix.float(21,21)
|
1080
|
-
|
1081
|
-
$amino_acids.each_with_index do |aa, ai|
|
1082
|
-
env = grp_envs.detect { |e| e.label.start_with?(aa) }
|
1083
|
-
logo_arr = $cys ? NArray.float(22) : NArray.float(21)
|
1084
|
-
|
1085
|
-
env.smooth_prob_array.to_a.each_with_index do |prob, j|
|
1086
|
-
paj = 100.0 * $aa_rel_freq[$amino_acids[j]]
|
1087
|
-
odds = prob == 0.0 ? 0.000001 / paj : prob / paj
|
1088
|
-
logo_arr[j] = factor * Math::log(odds)
|
1089
|
-
end
|
1084
|
+
$tot_logo_mat = $cys == 0 ? NMatrix.float($amino_acids.size, $amino_acids.size + 1) : NMatrix.float($amino_acids.size, $amino_acids.size)
|
1085
|
+
grp_logo_mats = []
|
1086
|
+
factor = $scale / Math::log(2)
|
1090
1087
|
|
1091
|
-
|
1088
|
+
# grouping environments by its environment labels but amino acid label
|
1089
|
+
env_groups = $envs.values.group_by { |env| env.label[1..-1] }
|
1092
1090
|
|
1093
|
-
|
1094
|
-
|
1095
|
-
|
1096
|
-
|
1097
|
-
|
1098
|
-
|
1099
|
-
|
1100
|
-
|
1091
|
+
# sorting environments and build 21X21 substitution matrices
|
1092
|
+
env_groups.to_a.sort_by { |env_group|
|
1093
|
+
# a bit clumsy sorting here...
|
1094
|
+
env_group[0].split("").map_with_index { |l, i|
|
1095
|
+
$env_features[i + 1].labels.index(l)
|
1096
|
+
}
|
1097
|
+
}.each_with_index do |group, group_no|
|
1098
|
+
# calculating 21X21 substitution probability matrix for each envrionment
|
1099
|
+
grp_label = group[0]
|
1100
|
+
grp_envs = group[1]
|
1101
|
+
grp_logo_mat = $cys == 0 ? NMatrix.float($amino_acids.size, $amino_acids.size + 1) : NMatrix.float($amino_acids.size, $amino_acids.size)
|
1102
|
+
|
1103
|
+
$amino_acids.each_with_index do |aa, ai|
|
1104
|
+
env = grp_envs.detect { |e| e.label.start_with?(aa) }
|
1105
|
+
logo_arr = $cys == 0 ? NArray.float($amino_acids.size + 1) : NArray.float($amino_acids.size)
|
1106
|
+
|
1107
|
+
env.smooth_prob_array.to_a.each_with_index do |prob, j|
|
1108
|
+
paj = 100.0 * $aa_rel_freq[$amino_acids[j]]
|
1109
|
+
odds = prob == 0.0 ? 0.000001 / paj : prob / paj
|
1110
|
+
logo_arr[j] = factor * Math::log(odds)
|
1101
1111
|
end
|
1102
1112
|
|
1103
|
-
$
|
1104
|
-
|
1113
|
+
0.upto($amino_acids.size - 1) { |j| grp_logo_mat[ai, j] = logo_arr[j] }
|
1114
|
+
|
1115
|
+
# adding log odds ratio for "U" (J or C) when --cyc is 0
|
1116
|
+
if $cys == 0
|
1117
|
+
paj = 100.0 * ($aa_rel_freq["C"] + $aa_rel_freq["J"])
|
1118
|
+
prob = env.smooth_prob_array[$amino_acids.index("C")] + env.smooth_prob_array[$amino_acids.index("J")]
|
1119
|
+
odds = prob == 0.0 ? 0.000001 / paj : prob / paj
|
1120
|
+
logo_arr[logo_arr.size - 1] = factor * Math::log(odds)
|
1121
|
+
grp_logo_mat[ai, logo_arr.size - 1] = logo_arr[logo_arr.size - 1]
|
1122
|
+
end
|
1105
1123
|
end
|
1106
1124
|
|
1107
|
-
$tot_logo_mat
|
1125
|
+
$tot_logo_mat += grp_logo_mat
|
1126
|
+
grp_logo_mats << [grp_label, grp_logo_mat]
|
1127
|
+
end
|
1108
1128
|
|
1109
|
-
|
1110
|
-
# the expected score E in bit units
|
1111
|
-
#
|
1112
|
-
# I'm a bit suspicious about this part...
|
1113
|
-
tot_E = 0.0
|
1114
|
-
tot_H = 0.0
|
1129
|
+
$tot_logo_mat /= env_groups.size
|
1115
1130
|
|
1116
|
-
|
1117
|
-
|
1118
|
-
|
1119
|
-
|
1120
|
-
|
1121
|
-
|
1122
|
-
|
1123
|
-
|
1124
|
-
|
1131
|
+
# calculating relative entropy for each amino acid pair H and
|
1132
|
+
# the expected score E in bit units
|
1133
|
+
#
|
1134
|
+
# I'm a bit suspicious about this part...
|
1135
|
+
tot_E = 0.0
|
1136
|
+
tot_H = 0.0
|
1137
|
+
|
1138
|
+
0.upto($tot_logo_mat.shape[0] - 1) do |i|
|
1139
|
+
0.upto($tot_logo_mat.shape[0] - 1) do |j|
|
1140
|
+
if i != j
|
1141
|
+
tot_E += $tot_logo_mat[i, j] * $aa_rel_freq[$amino_acids[i]] * $aa_rel_freq[$amino_acids[j]] / 2.0
|
1142
|
+
tot_H += $tot_logo_mat[i, j] * $tot_prob_mat[i, j] / 2.0 / 10000.0
|
1143
|
+
else
|
1144
|
+
tot_E += $tot_logo_mat[i, j] * $aa_rel_freq[$amino_acids[i]] * $aa_rel_freq[$amino_acids[j]]
|
1145
|
+
tot_H += $tot_logo_mat[i, j] * $tot_prob_mat[i, j] / 10000.0
|
1125
1146
|
end
|
1126
1147
|
end
|
1148
|
+
end
|
1127
1149
|
|
1128
|
-
|
1150
|
+
$outfh.puts <<HEADER
|
1129
1151
|
#
|
1130
1152
|
# Shown here are logarithms of these values multiplied by #{$scale}/log(2)
|
1131
1153
|
# rounded to the nearest integer (log-odds scores in 1/3 bit units).
|
@@ -1134,27 +1156,27 @@ HEADER
|
|
1134
1156
|
#
|
1135
1157
|
HEADER
|
1136
1158
|
|
1137
|
-
|
1138
|
-
|
1139
|
-
|
1159
|
+
grp_logo_mats.each_with_index do |arr, grp_no|
|
1160
|
+
grp_label = arr[0]
|
1161
|
+
grp_logo_mat = arr[1]
|
1140
1162
|
|
1141
|
-
|
1142
|
-
if $cys
|
1143
|
-
$outfh.puts grp_logo_mat.round.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids + %w[U])
|
1144
|
-
else
|
1145
|
-
$outfh.puts grp_logo_mat.round.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)
|
1146
|
-
end
|
1147
|
-
end
|
1148
|
-
|
1149
|
-
$outfh.puts ">Total #{grp_logo_mats.size}"
|
1163
|
+
$outfh.puts ">#{grp_label} #{grp_no}"
|
1150
1164
|
if $cys
|
1151
|
-
$outfh.puts
|
1165
|
+
$outfh.puts grp_logo_mat.round.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids + %w[U])
|
1152
1166
|
else
|
1153
|
-
$outfh.puts
|
1167
|
+
$outfh.puts grp_logo_mat.round.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)
|
1154
1168
|
end
|
1155
|
-
$outfh.close
|
1156
|
-
exit 0
|
1157
1169
|
end
|
1170
|
+
|
1171
|
+
$outfh.puts ">Total #{grp_logo_mats.size}"
|
1172
|
+
|
1173
|
+
if $cys == 0
|
1174
|
+
$outfh.puts $tot_logo_mat.round.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids + %w[U])
|
1175
|
+
else
|
1176
|
+
$outfh.puts $tot_logo_mat.round.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)
|
1177
|
+
end
|
1178
|
+
$outfh.close
|
1179
|
+
exit 0
|
1158
1180
|
end
|
1159
1181
|
end
|
1160
1182
|
end
|