ulla 0.9.9.1 → 0.9.9.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/History.txt +1 -1
- data/Manifest.txt +4 -0
- data/PostInstall.txt +1 -1
- data/README.rdoc +6 -7
- data/Rakefile +2 -2
- data/lib/narray_extensions.rb +0 -3
- data/lib/nmatrix_extensions.rb +0 -11
- data/lib/ulla.rb +45 -1
- data/lib/ulla/cli.rb +192 -89
- data/lib/ulla/esst.rb +32 -0
- data/lib/ulla/essts.rb +84 -0
- data/lib/ulla/heatmap_array.rb +0 -12
- data/lib/ulla/joy_tem.rb +63 -0
- data/lib/ulla/sequence.rb +7 -0
- data/script/txt2html +5 -5
- data/ulla.gemspec +3 -3
- data/website/index.html +286 -19
- data/website/stylesheets/screen.css +80 -81
- metadata +109 -71
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 40c1f0642b5169fc2e4e54c12c2e7b88e06a9f7f
|
4
|
+
data.tar.gz: 4ac3ec4031a177629ae72cf5de1803e3749bbf66
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 050a3eb74dc176396b8ef9d1f10bee85ce227e9694245776d2c0b2de92595ed919697fd79176e25c0ebeb203f1a5cc689f15f3ad6580609c8d0839b944627550
|
7
|
+
data.tar.gz: d05f5de53ed2ac6552af88153c932f9947d47db5dfdcbe9ef09547f54a924ff8eff1fa8cee2dbde4101f40064e5f0299208977cfff80ce239b930d9b3bcc33a4
|
data/History.txt
CHANGED
data/Manifest.txt
CHANGED
@@ -18,6 +18,10 @@ lib/ulla/environment_class_hash.rb
|
|
18
18
|
lib/ulla/environment_feature.rb
|
19
19
|
lib/ulla/environment_feature_array.rb
|
20
20
|
lib/ulla/heatmap_array.rb
|
21
|
+
lib/ulla/esst.rb
|
22
|
+
lib/ulla/essts.rb
|
23
|
+
lib/ulla/joy_tem.rb
|
24
|
+
lib/ulla/sequence.rb
|
21
25
|
script/console
|
22
26
|
script/destroy
|
23
27
|
script/generate
|
data/PostInstall.txt
CHANGED
data/README.rdoc
CHANGED
@@ -25,7 +25,6 @@ Following RubyGems will be automatically installed if you have rubygems installe
|
|
25
25
|
|
26
26
|
* narray (http://narray.rubyforge.org)
|
27
27
|
* bio (http://bioruby.open-bio.org)
|
28
|
-
* Active Support (http://as.rubyonrails.org)
|
29
28
|
* RMagick (http://rmagick.rubyforge.org)
|
30
29
|
|
31
30
|
|
@@ -36,7 +35,7 @@ Following RubyGems will be automatically installed if you have rubygems installe
|
|
36
35
|
|
37
36
|
== Basic Usage
|
38
37
|
|
39
|
-
It's pretty much the same as Kenji's subst (http://
|
38
|
+
It's pretty much the same as Kenji's subst (http://mordred.bioc.cam.ac.uk/~kenji/subst/), so in most cases, you can swap 'subst' with 'ulla'.
|
40
39
|
|
41
40
|
~user $ ulla -l TEMLIST-file -c classdef.dat
|
42
41
|
or
|
@@ -98,7 +97,7 @@ It's pretty much the same as Kenji's subst (http://www-cryst.bioc.cam.ac.uk/~ken
|
|
98
97
|
|
99
98
|
== Usage
|
100
99
|
|
101
|
-
1. Prepare an environmental class definition file. For more details, please check this notes (http://
|
100
|
+
1. Prepare an environmental class definition file. For more details, please check these notes (http://mordred.bioc.cam.ac.uk/~kenji/subst/NOTES). You can download a sample environmental class definition file from http://mordred.bioc.cam.ac.uk/~kenji/subst/classdef.dat
|
102
101
|
|
103
102
|
~user $ cat classdef.dat
|
104
103
|
#
|
@@ -108,7 +107,7 @@ It's pretty much the same as Kenji's subst (http://www-cryst.bioc.cam.ac.uk/~ken
|
|
108
107
|
secondary structure and phi angle;HEPC;HEPC;T;F
|
109
108
|
solvent accessibility;TF;Aa;F;F
|
110
109
|
|
111
|
-
2. Prepare structural alignments and their annotations of above environmental classes in PIR format. You can download sample alignments from http://
|
110
|
+
2. Prepare structural alignments and their annotations of above environmental classes in PIR format. You can download sample alignments from http://mordred.bioc.cam.ac.uk/~kenji/subst/alltem-allmask.tar.gz or from http://www-cryst.bioc.cam.ac.uk/ESST/
|
112
111
|
|
113
112
|
~user $ cat sample1.tem
|
114
113
|
>P1;1mnma
|
@@ -167,7 +166,7 @@ It's pretty much the same as Kenji's subst (http://www-cryst.bioc.cam.ac.uk/~ken
|
|
167
166
|
|
168
167
|
9. In case positions are masked with the character 'X' in any environmental features, all mutations from/to the position will be excluded from substitution counts.
|
169
168
|
|
170
|
-
10. Then, it will produce a file containing all the matrices, which will look like the one below. For more details, please check this notes (http://
|
169
|
+
10. Then, it will produce a file containing all the matrices, which will look like the one below. For more details, please check these notes (http://mordred.bioc.cam.ac.uk/~kenji/subst/NOTES).
|
171
170
|
|
172
171
|
# Environment-specific amino acid substitution matrices
|
173
172
|
# Creator: ulla version 0.0.5
|
@@ -226,7 +225,7 @@ It's pretty much the same as Kenji's subst (http://www-cryst.bioc.cam.ac.uk/~ken
|
|
226
225
|
|
227
226
|
which will look like this,
|
228
227
|
|
229
|
-
http://
|
228
|
+
http://mordred.bioc.cam.ac.uk/~semin/images/0.HA.png
|
230
229
|
|
231
230
|
12. To generate one big figure, 'myheatmaps.gif' containing all the heat maps (4 maps in a row),
|
232
231
|
|
@@ -234,7 +233,7 @@ It's pretty much the same as Kenji's subst (http://www-cryst.bioc.cam.ac.uk/~ken
|
|
234
233
|
|
235
234
|
which will look like this,
|
236
235
|
|
237
|
-
http://
|
236
|
+
http://mordred.bioc.cam.ac.uk/~semin/images/myheatmaps.gif
|
238
237
|
|
239
238
|
== Repository
|
240
239
|
|
data/Rakefile
CHANGED
@@ -2,6 +2,7 @@ require 'rubygems'
|
|
2
2
|
gem 'hoe', '>= 2.1.0'
|
3
3
|
require 'hoe'
|
4
4
|
require 'fileutils'
|
5
|
+
require './lib/ulla.rb'
|
5
6
|
|
6
7
|
Hoe.plugin :newgem
|
7
8
|
# Hoe.plugin :website
|
@@ -12,7 +13,6 @@ Hoe.plugin :newgem
|
|
12
13
|
$hoe = Hoe.spec 'ulla' do
|
13
14
|
self.developer 'Semin Lee', 'seminlee@gmail.com'
|
14
15
|
self.post_install_message = 'PostInstall.txt' # TODO remove if post-install message not required
|
15
|
-
self.rubyforge_name = self.name # TODO this is default value
|
16
16
|
self.extra_deps = [
|
17
17
|
['narray', '>= 0.5.9.5'],
|
18
18
|
['bio', '>= 1.2.1'],
|
@@ -21,4 +21,4 @@ $hoe = Hoe.spec 'ulla' do
|
|
21
21
|
end
|
22
22
|
|
23
23
|
require 'newgem/tasks'
|
24
|
-
Dir['tasks
|
24
|
+
Dir['tasks/*.rake'].each { |t| load t }
|
data/lib/narray_extensions.rb
CHANGED
data/lib/nmatrix_extensions.rb
CHANGED
@@ -1,14 +1,3 @@
|
|
1
|
-
require 'rubygems'
|
2
|
-
require 'narray'
|
3
|
-
|
4
|
-
begin
|
5
|
-
require 'rvg/rvg'
|
6
|
-
include Magick
|
7
|
-
rescue Exception => e
|
8
|
-
$logger.warn "#{e.to_s.chomp} For this reason, heat maps cannot be generated."
|
9
|
-
$no_rmagick = true
|
10
|
-
end
|
11
|
-
|
12
1
|
module NMatrixExtensions
|
13
2
|
|
14
3
|
def pretty_string(options={})
|
data/lib/ulla.rb
CHANGED
@@ -1,6 +1,50 @@
|
|
1
1
|
# Entry point of the ulla library: puts this directory on the load path,
# loads the third-party and project-local dependencies, sets up the global
# $logger and defines Ulla::VERSION.
$:.unshift(File.dirname(__FILE__)) unless
  $:.include?(File.dirname(__FILE__)) || $:.include?(File.expand_path(File.dirname(__FILE__)))

# Each library is required exactly once.
require 'rubygems'
require 'bio'
require 'set'
require 'logger'
require 'narray'
require 'inline'
require 'stringio'
require 'pathname'
require 'getoptlong'
require 'fork_manager'
require 'facets/enumerable'

# The logger must exist before the optional RMagick load below, because the
# rescue clause reports the failure through $logger.
$logger = Logger.new(STDOUT)
$logger.level = Logger::WARN

# RMagick is optional: without it, only heat map generation is unavailable.
begin
  require 'rvg/rvg'
  include Magick
rescue LoadError, StandardError => e
  # LoadError is not a StandardError, so it is listed explicitly; signals and
  # SystemExit are deliberately no longer swallowed.
  $logger.warn "#{e.to_s.chomp} For this reason, heat maps cannot be generated."
  $no_rmagick = true
end

require_relative 'math_extensions'
require_relative 'array_extensions'
require_relative 'string_extensions'
require_relative 'narray_extensions'
require_relative 'nmatrix_extensions'

require_relative 'ulla/esst'
require_relative 'ulla/essts'
require_relative 'ulla/joy_tem'
require_relative 'ulla/sequence'
require_relative 'ulla/heatmap_array'
require_relative 'ulla/environment'
require_relative 'ulla/environment_class_hash'
require_relative 'ulla/environment_feature'
require_relative 'ulla/environment_feature_array'

module Ulla
  VERSION = '0.9.9.2'
end
|
data/lib/ulla/cli.rb
CHANGED
@@ -1,15 +1,86 @@
|
|
1
|
-
require 'rubygems'
|
2
|
-
require 'getoptlong'
|
3
|
-
require 'logger'
|
4
|
-
require 'narray'
|
5
|
-
require 'bio'
|
6
|
-
require 'set'
|
7
|
-
|
8
1
|
# This is a module for an actual command line interpreter for Ulla
|
9
2
|
# ---
|
10
3
|
# Copyright (C) 2008-9 Semin Lee
|
11
4
|
module Ulla
|
12
5
|
class CLI
|
6
|
+
|
7
|
+
# Calculate the percentage identity (PID) between two aligned sequences
# (pure-Ruby implementation).
#
# Both sequences are cut into columns of +unit+ non-whitespace characters.
# A column where neither side is a gap counts as aligned, and as identical
# when both sides are equal. A column where exactly one side is a gap counts
# as an internal gap. A column where both sides are gaps is ignored.
# The gap column is ($gap || '-') repeated +unit+ times.
#
# If the two sequences split into a different number of columns, the problem
# is logged through $logger and the process exits with status 1.
# If no column is counted at all, the division yields Float::NAN.
#
# :call-seq:
#   Ulla::CLI::calculate_pid_rb(seq1, seq2, unit) -> Float
#
def self.calculate_pid_rb(seq1, seq2, unit)
  column = /\S{#{unit}}/
  aas1   = seq1.scan(column)
  aas2   = seq2.scan(column)
  gap    = ($gap || '-') * unit

  if aas1.size != aas2.size
    $logger.error "Cannot calculate PID between unaligned sequences"
    # Logger#error takes a single message, so each sequence is logged on its own
    $logger.error seq1
    $logger.error seq2
    exit 1
  end

  align = 0 # no. of aligned columns
  ident = 0 # no. of identical columns
  intgp = 0 # no. of internal gaps

  aas1.zip(aas2) do |aa1, aa2|
    gap1 = (aa1 == gap)
    gap2 = (aa2 == gap)

    if !gap1 && !gap2
      align += 1
      ident += 1 if aa1 == aa2
    elsif gap1 != gap2
      # exactly one side is a gap
      intgp += 1
    end
  end

  100.0 * ident / (align + intgp)
end
|
40
|
+
|
41
|
+
|
42
|
+
# Native implementation of the PID calculation, defined through the
# inline(:C) builder as the singleton method
# calculate_pid_cpp(seq1, seq2, unit) -> Float.
#
# It follows the same counting rules as calculate_pid_rb: the gap column is
# ($gap || "-") repeated +unit+ times, columns with no gap are aligned (and
# identical when equal), columns with exactly one gap are internal gaps.
# An ArgumentError is raised when the sequences split into a different
# number of columns, instead of reading past the end of the shorter array.
inline(:C) do |builder|
  builder.add_compile_flags '-x c++', '-lstdc++'
  builder.c_singleton %q{
    static VALUE calculate_pid_cpp(VALUE seq1, VALUE seq2, VALUE unit) {
      VALUE re    = rb_str_plus(rb_str_plus(rb_str_new2("\\\\S{"), rb_funcall(unit, rb_intern("to_s"), 0)), rb_str_new2("}"));
      VALUE rx    = rb_reg_new_str(re, 0);
      VALUE aas1  = rb_funcall(seq1, rb_intern("scan"), 1, rx);
      VALUE aas2  = rb_funcall(seq2, rb_intern("scan"), 1, rx);
      // same gap column as the Ruby version: ($gap || "-") * unit
      VALUE gchr  = rb_gv_get("$gap");
      VALUE gap   = rb_funcall(NIL_P(gchr) ? rb_str_new2("-") : gchr, rb_intern("*"), 1, unit);
      long len1   = RARRAY_LEN(aas1);
      long len2   = RARRAY_LEN(aas2);
      double align = 0.0;
      double ident = 0.0;
      double intgp = 0.0;

      if (len1 != len2) {
        rb_raise(rb_eArgError, "Cannot calculate PID between unaligned sequences");
      }

      for (long i = 0; i < len1; i++) {
        VALUE aa1 = rb_ary_entry(aas1, i);
        VALUE aa2 = rb_ary_entry(aas2, i);
        int gap1  = (rb_str_equal(aa1, gap) == Qtrue);
        int gap2  = (rb_str_equal(aa2, gap) == Qtrue);

        if (!gap1 && !gap2) {
          align += 1.0;
          if (rb_str_equal(aa1, aa2) == Qtrue) {
            ident += 1.0;
          }
        } else if (gap1 != gap2) {
          intgp += 1.0;
        }
      }
      return DBL2NUM(100.0 * ident / (align + intgp));
    }
  }
end
|
75
|
+
|
76
|
+
# Calculate the PID between two aligned sequences.
#
# The compiled calculate_pid_cpp is tried first. Whenever it raises a
# StandardError (for example because the native method is not available),
# the pure-Ruby calculate_pid_rb is used instead with the same arguments.
#
# :call-seq:
#   Ulla::CLI::calculate_pid(seq1, seq2, unit) -> Float
#
def self.calculate_pid(seq1, seq2, unit)
  calculate_pid_cpp(seq1, seq2, unit)
rescue
  calculate_pid_rb(seq1, seq2, unit)
end
|
83
|
+
|
13
84
|
class << self
|
14
85
|
|
15
86
|
# :nodoc:
|
@@ -38,7 +109,7 @@ Options:
|
|
38
109
|
--tem-file (-f) FILE: a tem file
|
39
110
|
--tem-list (-l) FILE: a list for tem files
|
40
111
|
--classdef (-c) FILE: a file for the defintion of environmental class
|
41
|
-
if no definition file provided, --cys (-y) 2 and --nosmooth options
|
112
|
+
if no definition file provided, --cys (-y) 2 and --nosmooth options applied
|
42
113
|
--outfile (-o) FILE: output filename (default 'allmat.dat')
|
43
114
|
--weight (-w) INTEGER: clustering level (PID) for the BLOSUM-like weighting (default: 60)
|
44
115
|
--noweight: calculate substitution counts with no weights
|
@@ -58,13 +129,13 @@ Options:
|
|
58
129
|
0 for raw counts (no smoothing performed)
|
59
130
|
1 for probabilities
|
60
131
|
2 for log-odds (default)
|
61
|
-
--noroundoff: do not round off log
|
62
|
-
--scale INTEGER: log-odds matrices in 1/n bit units (default 3)
|
63
|
-
--sigma DOUBLE: change the sigma value for smoothing (default 5.0)
|
132
|
+
--noroundoff: do not round off log-odds ratio
|
133
|
+
--scale INTEGER: log-odds matrices in 1/n bit units (default: 3)
|
134
|
+
--sigma DOUBLE: change the sigma value for smoothing (default: 5.0)
|
64
135
|
--autosigma: automatically adjust the sigma value for smoothing
|
65
|
-
--add DOUBLE: add this value to raw counts when deriving log-odds without smoothing (default 0)
|
66
|
-
--pidmin DOUBLE: count substitutions only for pairs with PID equal to or greater than this value
|
67
|
-
--pidmax DOUBLE: count substitutions only for pairs with PID smaller than this value
|
136
|
+
--add DOUBLE: add this value to raw counts when deriving log-odds without smoothing (default: 0)
|
137
|
+
--pidmin DOUBLE: count substitutions only for pairs with PID equal to or greater than this value
|
138
|
+
--pidmax DOUBLE: count substitutions only for pairs with PID smaller than this value
|
68
139
|
--heatmap INTEGER:
|
69
140
|
0 create a heat map file for each substitution table
|
70
141
|
1 create one big file containing all heat maps from substitution tables
|
@@ -91,35 +162,6 @@ Options:
|
|
91
162
|
puts (verbose ? usage + options : usage)
|
92
163
|
end
|
93
164
|
|
94
|
-
# Calculate PID between two sequences
|
95
|
-
#
|
96
|
-
# :call-seq:
|
97
|
-
# Ulla::CLI::calculate_pid(seq1, seq2) -> Float
|
98
|
-
#
|
99
|
-
def calculate_pid(seq1, seq2, unit)
|
100
|
-
aas1 = seq1.scan(/\w{#{unit}}/)
|
101
|
-
aas2 = seq2.scan(/\w{#{unit}}/)
|
102
|
-
cols = aas1.zip(aas2)
|
103
|
-
gap = ($gap || '-') * unit
|
104
|
-
align = 0 # no. of aligned columns
|
105
|
-
ident = 0 # no. of identical columns
|
106
|
-
intgp = 0 # no. of internal gaps
|
107
|
-
|
108
|
-
cols.each do |col|
|
109
|
-
if (col[0] != gap) && (col[1] != gap)
|
110
|
-
align += 1
|
111
|
-
if col[0] == col[1]
|
112
|
-
ident += 1
|
113
|
-
end
|
114
|
-
elsif (((col[0] == gap) && (col[1] != gap)) ||
|
115
|
-
((col[0] != gap) && (col[1] == gap)))
|
116
|
-
intgp += 1
|
117
|
-
end
|
118
|
-
end
|
119
|
-
|
120
|
-
pid = 100.0 * ident.to_f / (align + intgp)
|
121
|
-
end
|
122
|
-
|
123
165
|
# :nodoc:
|
124
166
|
def execute(arguments=[])
|
125
167
|
#
|
@@ -152,9 +194,6 @@ Options:
|
|
152
194
|
# Global variables and their default values
|
153
195
|
#
|
154
196
|
|
155
|
-
$logger = Logger.new(STDOUT)
|
156
|
-
$logger.level = Logger::WARN
|
157
|
-
|
158
197
|
# default set of 21 amino acids including J (Cysteine, the free thiol form)
|
159
198
|
$amino_acids = 'ACDEFGHIKLMNPQRSTVWYJ'.split('')
|
160
199
|
$gap = '-'
|
@@ -179,7 +218,6 @@ Options:
|
|
179
218
|
$scale = 3
|
180
219
|
$pidmin = nil
|
181
220
|
$pidmax = nil
|
182
|
-
$scale = 3
|
183
221
|
$add = nil
|
184
222
|
$cys = 0
|
185
223
|
$targetenv = false
|
@@ -233,6 +271,9 @@ Options:
|
|
233
271
|
[ '--noroundoff', GetoptLong::NO_ARGUMENT ],
|
234
272
|
[ '--sigma', GetoptLong::REQUIRED_ARGUMENT ],
|
235
273
|
[ '--autosigma', GetoptLong::NO_ARGUMENT ],
|
274
|
+
[ '--scale', GetoptLong::REQUIRED_ARGUMENT ],
|
275
|
+
[ '--pidmax', GetoptLong::REQUIRED_ARGUMENT ],
|
276
|
+
[ '--pidmin', GetoptLong::REQUIRED_ARGUMENT ],
|
236
277
|
[ '--add', GetoptLong::REQUIRED_ARGUMENT ],
|
237
278
|
[ '--heatmap', GetoptLong::REQUIRED_ARGUMENT ],
|
238
279
|
[ '--heatmap-stem', GetoptLong::REQUIRED_ARGUMENT ],
|
@@ -297,7 +338,7 @@ Options:
|
|
297
338
|
when '--penv'
|
298
339
|
warn "--penv option is not supported."
|
299
340
|
exit 1
|
300
|
-
|
341
|
+
#$penv = true
|
301
342
|
when '--heatmap'
|
302
343
|
$heatmap = case arg.to_i
|
303
344
|
when (0..2) then arg.to_i
|
@@ -365,19 +406,6 @@ Options:
|
|
365
406
|
warn "Cannot find environment class definition file, #{$classdef}"
|
366
407
|
exit 1
|
367
408
|
end
|
368
|
-
|
369
|
-
require 'math_extensions'
|
370
|
-
require 'array_extensions'
|
371
|
-
require 'string_extensions'
|
372
|
-
require 'narray_extensions'
|
373
|
-
require 'nmatrix_extensions'
|
374
|
-
|
375
|
-
require 'ulla/environment'
|
376
|
-
require 'ulla/environment_class_hash'
|
377
|
-
require 'ulla/environment_feature'
|
378
|
-
require 'ulla/environment_feature_array'
|
379
|
-
require 'ulla/heatmap_array'
|
380
|
-
|
381
409
|
#
|
382
410
|
# Part 2 END
|
383
411
|
#
|
@@ -425,15 +453,18 @@ Options:
|
|
425
453
|
next
|
426
454
|
elsif (env_ftr = line.split(/;/)).length == 5
|
427
455
|
$logger.info "An environment feature, #{line} detected."
|
456
|
+
|
428
457
|
if env_ftr[-1] == 'T'
|
429
458
|
# skip silenced environment feature
|
430
459
|
$logger.warn "The environment feature, #{line} silent."
|
431
460
|
next
|
432
461
|
end
|
462
|
+
|
433
463
|
if env_ftr[-2] == 'T'
|
434
464
|
$cst_features << env_index
|
435
465
|
$logger.warn "The environment feature, #{line} constrained."
|
436
466
|
end
|
467
|
+
|
437
468
|
$env_features << EnvironmentFeature.new(env_ftr[0],
|
438
469
|
env_ftr[1].split(''),
|
439
470
|
env_ftr[2].split(''),
|
@@ -571,7 +602,7 @@ Options:
|
|
571
602
|
seq2 = seq2.split('').each_with_index.map { |aa, pos| aa == $gap ? $ext_gap : env_labels[id2][pos] }.join
|
572
603
|
end
|
573
604
|
|
574
|
-
pid =
|
605
|
+
pid = calculate_pid_cpp(seq1, seq2, $col_size)
|
575
606
|
s1 = seq1.scan(/\S{#{$col_size}}/)
|
576
607
|
s2 = seq2.scan(/\S{#{$col_size}}/)
|
577
608
|
|
@@ -610,8 +641,10 @@ Options:
|
|
610
641
|
next
|
611
642
|
end
|
612
643
|
|
613
|
-
aa1 = (disulphide.has_key?(id1) && (disulphide[id1][pos] == 'F') && (aa1[0].chr == 'C') && ($cys != 2)) ? 'J' + aa1[1..-1] : aa1
|
614
|
-
aa2 = (disulphide.has_key?(id2) && (disulphide[id2][pos] == 'F') && (aa2[0].chr == 'C') && ($cys != 2)) ? 'J' + aa2[1..-1] : aa2
|
644
|
+
#aa1 = (disulphide.has_key?(id1) && (disulphide[id1][pos] == 'F') && (aa1[0].chr == 'C') && ($cys != 2)) ? 'J' + aa1[1..-1] : aa1
|
645
|
+
#aa2 = (disulphide.has_key?(id2) && (disulphide[id2][pos] == 'F') && (aa2[0].chr == 'C') && ($cys != 2)) ? 'J' + aa2[1..-1] : aa2
|
646
|
+
aa1 = (aa1[0].chr == 'C' && (!disulphide.has_key?(id1) || disulphide[id1][pos] == 'F') && $cys != 2) ? 'J' + aa1[1..-1] : aa1
|
647
|
+
aa2 = (aa2[0].chr == 'C' && (!disulphide.has_key?(id2) || disulphide[id2][pos] == 'F') && $cys != 2) ? 'J' + aa2[1..-1] : aa2
|
615
648
|
env_label = $environment == 1 ? aa1 + '-' + aa2[1..-1] : env_labels[id1][pos]
|
616
649
|
|
617
650
|
if $cst_features.empty?
|
@@ -648,7 +681,7 @@ Options:
|
|
648
681
|
ali = ext_ali
|
649
682
|
end
|
650
683
|
|
651
|
-
#
|
684
|
+
# loop for single linkage clustering
|
652
685
|
begin
|
653
686
|
continue = false
|
654
687
|
0.upto(clusters.size - 2) do |i|
|
@@ -657,7 +690,7 @@ Options:
|
|
657
690
|
found = false
|
658
691
|
clusters[i].each do |c1|
|
659
692
|
clusters[j].each do |c2|
|
660
|
-
if
|
693
|
+
if calculate_pid_cpp(ali[c1], ali[c2], $col_size) >= $weight
|
661
694
|
indexes << j
|
662
695
|
found = true
|
663
696
|
break
|
@@ -694,12 +727,12 @@ Options:
|
|
694
727
|
seq1.each_with_index do |aa1, pos|
|
695
728
|
aa2 = seq2[pos]
|
696
729
|
|
697
|
-
if env_labels[id1][pos].include?('X')
|
730
|
+
if env_labels.has_key?(id1) && env_labels[id1][pos].include?('X')
|
698
731
|
$logger.debug "All substitutions from #{id1}-#{pos}-#{aa1[0].chr} are masked."
|
699
732
|
next
|
700
733
|
end
|
701
734
|
|
702
|
-
if env_labels[id2][pos].include?('X')
|
735
|
+
if env_labels.has_key?(id2) && env_labels[id2][pos].include?('X')
|
703
736
|
$logger.debug "All substitutions to #{id2}-#{pos}-#{aa2[0].chr} are masked."
|
704
737
|
next
|
705
738
|
end
|
@@ -714,18 +747,21 @@ Options:
|
|
714
747
|
next
|
715
748
|
end
|
716
749
|
|
717
|
-
aa1 = (disulphide.has_key?(id1) && (disulphide[id1][pos] == 'F') && (aa1[0].chr == 'C') && ($cys != 2)) ? 'J' + aa1[1..-1] : aa1
|
718
|
-
aa2 = (disulphide.has_key?(id2) && (disulphide[id2][pos] == 'F') && (aa2[0].chr == 'C') && ($cys != 2)) ? 'J' + aa2[1..-1] : aa2
|
719
|
-
|
720
|
-
|
750
|
+
#aa1 = (disulphide.has_key?(id1) && (disulphide[id1][pos] == 'F') && (aa1[0].chr == 'C') && ($cys != 2)) ? 'J' + aa1[1..-1] : aa1
|
751
|
+
#aa2 = (disulphide.has_key?(id2) && (disulphide[id2][pos] == 'F') && (aa2[0].chr == 'C') && ($cys != 2)) ? 'J' + aa2[1..-1] : aa2
|
752
|
+
#aa1 = (aa1[0].chr == 'C' && (!disulphide.has_key?(id1) || disulphide[id1][pos] == 'F') && $cys != 2) ? 'J' + aa1[1..-1] : aa1
|
753
|
+
#aa2 = (aa2[0].chr == 'C' && (!disulphide.has_key?(id2) || disulphide[id2][pos] == 'F') && $cys != 2) ? 'J' + aa2[1..-1] : aa2
|
754
|
+
cnt1 = 1.0 / cluster1.size
|
755
|
+
cnt2 = 1.0 / cluster2.size
|
721
756
|
jnt_cnt = cnt1 * cnt2
|
722
757
|
env_label1 = $environment == 1 ? aa1 + '-' + aa2[1..-1] : env_labels[id1][pos]
|
723
758
|
env_label2 = $environment == 1 ? aa2 + '-' + aa1[1..-1] : env_labels[id2][pos]
|
724
759
|
|
725
760
|
if $cst_features.empty?
|
726
|
-
$env_classes[env_label1].increase_residue_count(aa2[0].chr, jnt_cnt)
|
727
|
-
$env_classes[env_label2].increase_residue_count(aa1[0].chr, jnt_cnt)
|
728
|
-
elsif (env_labels[id1][pos].split('').values_at(*$cst_features) ==
|
761
|
+
$env_classes[env_label1].increase_residue_count(aa2[0].chr, jnt_cnt)
|
762
|
+
$env_classes[env_label2].increase_residue_count(aa1[0].chr, jnt_cnt)
|
763
|
+
elsif (env_labels[id1][pos].split('').values_at(*$cst_features) ==
|
764
|
+
env_labels[id2][pos].split('').values_at(*$cst_features))
|
729
765
|
$env_classes[env_label1].increase_residue_count(aa2[0].chr, jnt_cnt)
|
730
766
|
$env_classes[env_label2].increase_residue_count(aa1[0].chr, jnt_cnt)
|
731
767
|
else
|
@@ -735,8 +771,71 @@ Options:
|
|
735
771
|
|
736
772
|
$aa_tot_cnt.has_key?(aa1) ? $aa_tot_cnt[aa1] += cnt1 : $aa_tot_cnt[aa1] = cnt1
|
737
773
|
$aa_tot_cnt.has_key?(aa2) ? $aa_tot_cnt[aa2] += cnt2 : $aa_tot_cnt[aa2] = cnt2
|
738
|
-
$aa_mut_cnt.has_key?(aa1) ? $aa_mut_cnt[aa1] += cnt1 : $aa_mut_cnt[aa1] = cnt1 if aa1
|
739
|
-
$aa_mut_cnt.has_key?(aa2) ? $aa_mut_cnt[aa2] += cnt2 : $aa_mut_cnt[aa2] = cnt2 if aa1
|
774
|
+
($aa_mut_cnt.has_key?(aa1) ? $aa_mut_cnt[aa1] += cnt1 : $aa_mut_cnt[aa1] = cnt1) if aa1 != aa2
|
775
|
+
($aa_mut_cnt.has_key?(aa2) ? $aa_mut_cnt[aa2] += cnt2 : $aa_mut_cnt[aa2] = cnt2) if aa1 != aa2
|
776
|
+
|
777
|
+
#if $cst_features.empty?
|
778
|
+
#if $env_classes.has_key?(env_label1)
|
779
|
+
#if $env_classes.has_key?(env_label2)
|
780
|
+
#$env_classes[env_label1].increase_residue_count(aa2[0].chr, jnt_cnt)
|
781
|
+
#else
|
782
|
+
#if (aa1 == 'C' && aa2 == 'J')
|
783
|
+
#$env_classes[env_label1].increase_residue_count('C', jnt_cnt)
|
784
|
+
#else
|
785
|
+
#$env_classes[env_label1].increase_residue_count(aa2, jnt_cnt)
|
786
|
+
#end
|
787
|
+
#end
|
788
|
+
#end
|
789
|
+
#if $env_classes.has_key?(env_label2)
|
790
|
+
#if $env_classes.has_key?(env_label1)
|
791
|
+
#$env_classes[env_label2].increase_residue_count(aa1[0].chr, jnt_cnt)
|
792
|
+
#else
|
793
|
+
#if (aa2 == 'C' && aa1 == 'J')
|
794
|
+
#$env_classes[env_label2].increase_residue_count('C', jnt_cnt)
|
795
|
+
#else
|
796
|
+
#$env_classes[env_label2].increase_residue_count(aa1, jnt_cnt)
|
797
|
+
#end
|
798
|
+
#end
|
799
|
+
#end
|
800
|
+
#elsif (env_labels[id1][pos].split('').values_at(*$cst_features) == env_labels[id2][pos].split('').values_at(*$cst_features))
|
801
|
+
#$env_classes[env_label1].increase_residue_count(aa2[0].chr, jnt_cnt)
|
802
|
+
#$env_classes[env_label2].increase_residue_count(aa1[0].chr, jnt_cnt)
|
803
|
+
#else
|
804
|
+
#$logger.debug "Skipped #{id1}-#{pos}-#{aa1[0].chr} and #{id2}-#{pos}-#{aa2[0].chr} having different symbols for constrained environment features each other."
|
805
|
+
#next
|
806
|
+
#end
|
807
|
+
|
808
|
+
#if $env_classes.has_key?(env_label1)
|
809
|
+
#$aa_tot_cnt.has_key?(aa1) ? $aa_tot_cnt[aa1] += cnt1 : $aa_tot_cnt[aa1] = cnt1
|
810
|
+
|
811
|
+
#if $env_classes.has_key?(env_label2)
|
812
|
+
#if aa1[0].chr != aa2[0].chr
|
813
|
+
#$aa_mut_cnt.has_key?(aa1) ? $aa_mut_cnt[aa1] += cnt1 : $aa_mut_cnt[aa1] = cnt1
|
814
|
+
#end
|
815
|
+
#else
|
816
|
+
#if (aa1[0].chr != aa2)
|
817
|
+
#unless (aa1[0].chr == 'C' && aa2 == 'J') || (aa1[0].chr == 'J' && aa2 == 'C')
|
818
|
+
#$aa_mut_cnt.has_key?(aa1) ? $aa_mut_cnt[aa1] += cnt1 : $aa_mut_cnt[aa1] = cnt1
|
819
|
+
#end
|
820
|
+
#end
|
821
|
+
#end
|
822
|
+
#end
|
823
|
+
|
824
|
+
#if $env_classes.has_key?(env_label2)
|
825
|
+
#$aa_tot_cnt.has_key?(aa2) ? $aa_tot_cnt[aa2] += cnt2 : $aa_tot_cnt[aa2] = cnt2
|
826
|
+
|
827
|
+
#if $env_classes.has_key?(env_label1)
|
828
|
+
#if aa1[0].chr != aa2[0].chr
|
829
|
+
#$aa_mut_cnt.has_key?(aa2) ? $aa_mut_cnt[aa2] += cnt2 : $aa_mut_cnt[aa2] = cnt2
|
830
|
+
#end
|
831
|
+
#else
|
832
|
+
#if (aa1 != aa2[0].chr)
|
833
|
+
#unless (aa1 == 'J' && aa2[0].chr == 'C') || (aa1 == 'C' && aa2[0].chr == 'J')
|
834
|
+
#$aa_mut_cnt.has_key?(aa2) ? $aa_mut_cnt[aa2] += cnt2 : $aa_mut_cnt[aa2] = cnt2
|
835
|
+
#end
|
836
|
+
#end
|
837
|
+
#end
|
838
|
+
#end
|
740
839
|
|
741
840
|
$logger.debug "#{id1}-#{pos}-#{aa1[0].chr} -> #{id2}-#{pos}-#{aa2[0].chr} substitution count (#{"%.2f" % jnt_cnt}) was added to the environments class, #{env_label1}."
|
742
841
|
$logger.debug "#{id2}-#{pos}-#{aa2[0].chr} -> #{id1}-#{pos}-#{aa1[0].chr} substitution count (#{"%.2f" % jnt_cnt}) was added to the environments class, #{env_label2}."
|
@@ -748,6 +847,13 @@ Options:
|
|
748
847
|
$logger.info "Analysing #{tem_file} done."
|
749
848
|
end
|
750
849
|
|
850
|
+
$tot_aa = $aa_tot_cnt.values.sum
|
851
|
+
|
852
|
+
if $tot_aa < 1
|
853
|
+
$logger.warn "No amino acid substitution counted!"
|
854
|
+
exit 1
|
855
|
+
end
|
856
|
+
|
751
857
|
# print out default header
|
752
858
|
$outfh.puts <<HEADER
|
753
859
|
# Environment-specific amino acid substitution matrices
|
@@ -798,18 +904,15 @@ HEADER
|
|
798
904
|
|
799
905
|
# calculate amino acid frequencies and mutabilities, and
|
800
906
|
# print them as default statistics in the header part
|
907
|
+
|
908
|
+
# pre-calculate ALA's mutability
|
801
909
|
if $environment == 0
|
802
|
-
|
803
|
-
|
804
|
-
|
805
|
-
|
806
|
-
else
|
807
|
-
100.0 * $aa_tot_cnt['A'] / $aa_mut_cnt['A'].to_f
|
808
|
-
end
|
910
|
+
ala_mutb = if $aa_tot_cnt['A'] == 0 then 0.0
|
911
|
+
elsif $aa_mut_cnt['A'] == 0 then 0.0
|
912
|
+
else $aa_mut_cnt['A'].to_f / $aa_tot_cnt['A']
|
913
|
+
end
|
809
914
|
end
|
810
915
|
|
811
|
-
$tot_aa = $aa_tot_cnt.values.sum
|
812
|
-
|
813
916
|
$outfh.puts '#'
|
814
917
|
$outfh.puts "# Total amino acid frequencies:\n"
|
815
918
|
|
@@ -843,8 +946,8 @@ HEADER
|
|
843
946
|
end
|
844
947
|
|
845
948
|
if $environment == 0
|
846
|
-
$aa_mutb[aa] = ($aa_tot_cnt[aa] == 0) ?
|
847
|
-
$aa_rel_mutb[aa] = $aa_mutb[aa]
|
949
|
+
$aa_mutb[aa] = ($aa_tot_cnt[aa] == 0) ? 0.0 : ($aa_mut_cnt[aa] / $aa_tot_cnt[aa].to_f)
|
950
|
+
$aa_rel_mutb[aa] = 100 * $aa_mutb[aa] / ala_mutb
|
848
951
|
end
|
849
952
|
|
850
953
|
$aa_tot_freq[aa] = ($aa_tot_cnt[aa] == 0) ? 0.0 : ($aa_tot_cnt[aa] / $tot_aa.to_f)
|
@@ -866,7 +969,7 @@ HEADER
|
|
866
969
|
|
867
970
|
if $noweight
|
868
971
|
if $environment == 0
|
869
|
-
$outfh.puts '# %-3s %9d %9d %5.2f %8d %8.
|
972
|
+
$outfh.puts '# %-3s %9d %9d %5.2f %8d %8.6f' % columns
|
870
973
|
else
|
871
974
|
$outfh.puts "# %-3s %-#{$env_features.size}s %9d %9d %8.4f" % columns
|
872
975
|
end
|