ulla 0.9.9.1 → 0.9.9.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/History.txt +1 -1
- data/Manifest.txt +4 -0
- data/PostInstall.txt +1 -1
- data/README.rdoc +6 -7
- data/Rakefile +2 -2
- data/lib/narray_extensions.rb +0 -3
- data/lib/nmatrix_extensions.rb +0 -11
- data/lib/ulla.rb +45 -1
- data/lib/ulla/cli.rb +192 -89
- data/lib/ulla/esst.rb +32 -0
- data/lib/ulla/essts.rb +84 -0
- data/lib/ulla/heatmap_array.rb +0 -12
- data/lib/ulla/joy_tem.rb +63 -0
- data/lib/ulla/sequence.rb +7 -0
- data/script/txt2html +5 -5
- data/ulla.gemspec +3 -3
- data/website/index.html +286 -19
- data/website/stylesheets/screen.css +80 -81
- metadata +109 -71
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 40c1f0642b5169fc2e4e54c12c2e7b88e06a9f7f
|
4
|
+
data.tar.gz: 4ac3ec4031a177629ae72cf5de1803e3749bbf66
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 050a3eb74dc176396b8ef9d1f10bee85ce227e9694245776d2c0b2de92595ed919697fd79176e25c0ebeb203f1a5cc689f15f3ad6580609c8d0839b944627550
|
7
|
+
data.tar.gz: d05f5de53ed2ac6552af88153c932f9947d47db5dfdcbe9ef09547f54a924ff8eff1fa8cee2dbde4101f40064e5f0299208977cfff80ce239b930d9b3bcc33a4
|
data/History.txt
CHANGED
data/Manifest.txt
CHANGED
@@ -18,6 +18,10 @@ lib/ulla/environment_class_hash.rb
|
|
18
18
|
lib/ulla/environment_feature.rb
|
19
19
|
lib/ulla/environment_feature_array.rb
|
20
20
|
lib/ulla/heatmap_array.rb
|
21
|
+
lib/ulla/esst.rb
|
22
|
+
lib/ulla/essts.rb
|
23
|
+
lib/ulla/joy_tem.rb
|
24
|
+
lib/ulla/sequence.rb
|
21
25
|
script/console
|
22
26
|
script/destroy
|
23
27
|
script/generate
|
data/PostInstall.txt
CHANGED
data/README.rdoc
CHANGED
@@ -25,7 +25,6 @@ Following RubyGems will be automatically installed if you have rubygems installe
|
|
25
25
|
|
26
26
|
* narray (http://narray.rubyforge.org)
|
27
27
|
* bio (http://bioruby.open-bio.org)
|
28
|
-
* Active Support (http://as.rubyonrails.org)
|
29
28
|
* RMagick (http://rmagick.rubyforge.org)
|
30
29
|
|
31
30
|
|
@@ -36,7 +35,7 @@ Following RubyGems will be automatically installed if you have rubygems installe
|
|
36
35
|
|
37
36
|
== Basic Usage
|
38
37
|
|
39
|
-
It's pretty much the same as Kenji's subst (http://
|
38
|
+
It's pretty much the same as Kenji's subst (http://mordred.bioc.cam.ac.uk/~kenji/subst/), so in most cases, you can swap 'subst' with 'ulla'.
|
40
39
|
|
41
40
|
~user $ ulla -l TEMLIST-file -c classdef.dat
|
42
41
|
or
|
@@ -98,7 +97,7 @@ It's pretty much the same as Kenji's subst (http://www-cryst.bioc.cam.ac.uk/~ken
|
|
98
97
|
|
99
98
|
== Usage
|
100
99
|
|
101
|
-
1. Prepare an environmental class definition file. For more details, please check this notes (http://
|
100
|
+
1. Prepare an environmental class definition file. For more details, please check this notes (http://mordred.bioc.cam.ac.uk/~kenji/subst/NOTES). You can download a sample environmental class definition file from http://mordred.bioc.cam.ac.uk/~kenji/subst/classdef.dat
|
102
101
|
|
103
102
|
~user $ cat classdef.dat
|
104
103
|
#
|
@@ -108,7 +107,7 @@ It's pretty much the same as Kenji's subst (http://www-cryst.bioc.cam.ac.uk/~ken
|
|
108
107
|
secondary structure and phi angle;HEPC;HEPC;T;F
|
109
108
|
solvent accessibility;TF;Aa;F;F
|
110
109
|
|
111
|
-
2. Prepare structural alignments and their annotations of above environmental classes in PIR format. You can download sample alignments from http://
|
110
|
+
2. Prepare structural alignments and their annotations of above environmental classes in PIR format. You can download sample alignments from http://mordred.bioc.cam.ac.uk/~kenji/subst/alltem-allmask.tar.gz or from http://www-cryst.bioc.cam.ac.uk/ESST/
|
112
111
|
|
113
112
|
~user $ cat sample1.tem
|
114
113
|
>P1;1mnma
|
@@ -167,7 +166,7 @@ It's pretty much the same as Kenji's subst (http://www-cryst.bioc.cam.ac.uk/~ken
|
|
167
166
|
|
168
167
|
9. In case positions are masked with the character 'X' in any environmental features, all mutations from/to the position will be excluded from substitution counts.
|
169
168
|
|
170
|
-
10. Then, it will produce a file containing all the matrices, which will look like the one below. For more details, please check this notes (http://
|
169
|
+
10. Then, it will produce a file containing all the matrices, which will look like the one below. For more details, please check this notes (http://mordred.bioc.cam.ac.uk/~kenji/subst/NOTES).
|
171
170
|
|
172
171
|
# Environment-specific amino acid substitution matrices
|
173
172
|
# Creator: ulla version 0.0.5
|
@@ -226,7 +225,7 @@ It's pretty much the same as Kenji's subst (http://www-cryst.bioc.cam.ac.uk/~ken
|
|
226
225
|
|
227
226
|
which will look like this,
|
228
227
|
|
229
|
-
http://
|
228
|
+
http://mordred.bioc.cam.ac.uk/~semin/images/0.HA.png
|
230
229
|
|
231
230
|
12. To generate one big figure, 'myheatmaps.gif' containing all the heat maps (4 maps in a row),
|
232
231
|
|
@@ -234,7 +233,7 @@ It's pretty much the same as Kenji's subst (http://www-cryst.bioc.cam.ac.uk/~ken
|
|
234
233
|
|
235
234
|
which will look like this,
|
236
235
|
|
237
|
-
http://
|
236
|
+
http://mordred.bioc.cam.ac.uk/~semin/images/myheatmaps.gif
|
238
237
|
|
239
238
|
== Repository
|
240
239
|
|
data/Rakefile
CHANGED
@@ -2,6 +2,7 @@ require 'rubygems'
|
|
2
2
|
gem 'hoe', '>= 2.1.0'
|
3
3
|
require 'hoe'
|
4
4
|
require 'fileutils'
|
5
|
+
require './lib/ulla.rb'
|
5
6
|
|
6
7
|
Hoe.plugin :newgem
|
7
8
|
# Hoe.plugin :website
|
@@ -12,7 +13,6 @@ Hoe.plugin :newgem
|
|
12
13
|
$hoe = Hoe.spec 'ulla' do
|
13
14
|
self.developer 'Semin Lee', 'seminlee@gmail.com'
|
14
15
|
self.post_install_message = 'PostInstall.txt' # TODO remove if post-install message not required
|
15
|
-
self.rubyforge_name = self.name # TODO this is default value
|
16
16
|
self.extra_deps = [
|
17
17
|
['narray', '>= 0.5.9.5'],
|
18
18
|
['bio', '>= 1.2.1'],
|
@@ -21,4 +21,4 @@ $hoe = Hoe.spec 'ulla' do
|
|
21
21
|
end
|
22
22
|
|
23
23
|
require 'newgem/tasks'
|
24
|
-
Dir['tasks
|
24
|
+
Dir['tasks/*.rake'].each { |t| load t }
|
data/lib/narray_extensions.rb
CHANGED
data/lib/nmatrix_extensions.rb
CHANGED
@@ -1,14 +1,3 @@
|
|
1
|
-
require 'rubygems'
|
2
|
-
require 'narray'
|
3
|
-
|
4
|
-
begin
|
5
|
-
require 'rvg/rvg'
|
6
|
-
include Magick
|
7
|
-
rescue Exception => e
|
8
|
-
$logger.warn "#{e.to_s.chomp} For this reason, heat maps cannot be generated."
|
9
|
-
$no_rmagick = true
|
10
|
-
end
|
11
|
-
|
12
1
|
module NMatrixExtensions
|
13
2
|
|
14
3
|
def pretty_string(options={})
|
data/lib/ulla.rb
CHANGED
@@ -1,6 +1,50 @@
|
|
1
1
|
$:.unshift(File.dirname(__FILE__)) unless
|
2
2
|
$:.include?(File.dirname(__FILE__)) || $:.include?(File.expand_path(File.dirname(__FILE__)))
|
3
3
|
|
4
|
+
require 'bio'
|
5
|
+
require 'set'
|
6
|
+
require 'logger'
|
7
|
+
require 'narray'
|
8
|
+
require 'rubygems'
|
9
|
+
require 'bio'
|
10
|
+
require 'set'
|
11
|
+
require 'inline'
|
12
|
+
require 'narray'
|
13
|
+
require 'logger'
|
14
|
+
require 'narray'
|
15
|
+
require 'stringio'
|
16
|
+
require 'pathname'
|
17
|
+
require 'getoptlong'
|
18
|
+
require 'fork_manager'
|
19
|
+
require 'facets/enumerable'
|
20
|
+
|
21
|
+
begin
|
22
|
+
require 'rvg/rvg'
|
23
|
+
include Magick
|
24
|
+
rescue Exception => e
|
25
|
+
$logger.warn "#{e.to_s.chomp} For this reason, heat maps cannot be generated."
|
26
|
+
$no_rmagick = true
|
27
|
+
end
|
28
|
+
|
29
|
+
require_relative 'math_extensions'
|
30
|
+
require_relative 'array_extensions'
|
31
|
+
require_relative 'string_extensions'
|
32
|
+
require_relative 'narray_extensions'
|
33
|
+
require_relative 'nmatrix_extensions'
|
34
|
+
|
35
|
+
require_relative 'ulla/esst'
|
36
|
+
require_relative 'ulla/essts'
|
37
|
+
require_relative 'ulla/joy_tem'
|
38
|
+
require_relative 'ulla/sequence'
|
39
|
+
require_relative 'ulla/heatmap_array'
|
40
|
+
require_relative 'ulla/environment'
|
41
|
+
require_relative 'ulla/environment_class_hash'
|
42
|
+
require_relative 'ulla/environment_feature'
|
43
|
+
require_relative 'ulla/environment_feature_array'
|
44
|
+
|
4
45
|
module Ulla
|
5
|
-
VERSION = '0.9.9.
|
46
|
+
VERSION = '0.9.9.2'
|
47
|
+
|
48
|
+
$logger = Logger.new(STDOUT)
|
49
|
+
$logger.level = Logger::WARN
|
6
50
|
end
|
data/lib/ulla/cli.rb
CHANGED
@@ -1,15 +1,86 @@
|
|
1
|
-
require 'rubygems'
|
2
|
-
require 'getoptlong'
|
3
|
-
require 'logger'
|
4
|
-
require 'narray'
|
5
|
-
require 'bio'
|
6
|
-
require 'set'
|
7
|
-
|
8
1
|
# This is a module for an actual command line interpreter for Ulla
|
9
2
|
# ---
|
10
3
|
# Copyright (C) 2008-9 Semin Lee
|
11
4
|
module Ulla
|
12
5
|
class CLI
|
6
|
+
|
7
|
+
# Calculate PID between two sequences
|
8
|
+
#
|
9
|
+
# :call-seq:
|
10
|
+
# Ulla::CLI::calculate_pid(seq1, seq2, unit) -> Float
|
11
|
+
#
|
12
|
+
def self.calculate_pid_rb(seq1, seq2, unit)
|
13
|
+
aas1 = seq1.scan(/\S{#{unit}}/)
|
14
|
+
aas2 = seq2.scan(/\S{#{unit}}/)
|
15
|
+
gap = ($gap || '-') * unit
|
16
|
+
align = 0 # no. of aligned columns
|
17
|
+
ident = 0 # no. of identical columns
|
18
|
+
intgp = 0 # no. of internal gaps
|
19
|
+
|
20
|
+
if (aas1.size != aas2.size)
|
21
|
+
$logger.error "Cannot calculate PID between unaligned sequences"
|
22
|
+
$logger.error seq1, seq2
|
23
|
+
exit 1
|
24
|
+
end
|
25
|
+
|
26
|
+
(0...aas1.size).each do |i|
|
27
|
+
if (aas1[i] != gap) && (aas2[i] != gap)
|
28
|
+
align += 1
|
29
|
+
if aas1[i] == aas2[i]
|
30
|
+
ident += 1
|
31
|
+
end
|
32
|
+
elsif (((aas1[i] == gap) && (aas2[i] != gap)) ||
|
33
|
+
((aas1[i] != gap) && (aas2[i] == gap)))
|
34
|
+
intgp += 1
|
35
|
+
end
|
36
|
+
end
|
37
|
+
|
38
|
+
100.0 * ident / (align + intgp)
|
39
|
+
end
|
40
|
+
|
41
|
+
|
42
|
+
inline(:C) do |builder|
|
43
|
+
builder.add_compile_flags '-x c++', '-lstdc++'
|
44
|
+
builder.c_singleton %q{
|
45
|
+
static VALUE calculate_pid_cpp(VALUE seq1, VALUE seq2, VALUE unit) {
|
46
|
+
VALUE re = rb_str_plus(rb_str_plus(rb_str_new2("\\\\S{"), rb_funcall(unit, rb_intern("to_s"), 0)), rb_str_new2("}"));
|
47
|
+
VALUE aas1 = rb_funcall(seq1, rb_intern("scan"), 1, rb_reg_new_str(re, 0));
|
48
|
+
VALUE aas2 = rb_funcall(seq2, rb_intern("scan"), 1, rb_reg_new_str(re, 0));
|
49
|
+
//VALUE aas1 = rb_funcall(seq1, rb_intern("split"), 1, rb_str_new2(""));
|
50
|
+
//VALUE aas2 = rb_funcall(seq2, rb_intern("split"), 1, rb_str_new2(""));
|
51
|
+
VALUE *aas1_p = RARRAY_PTR(aas1);
|
52
|
+
VALUE *aas2_p = RARRAY_PTR(aas2);
|
53
|
+
VALUE gap = rb_str_new2("-");
|
54
|
+
long len1 = RARRAY_LEN(aas1);
|
55
|
+
//long len2 = RARRAY_LEN(aas2);
|
56
|
+
double align = 0.0;
|
57
|
+
double ident = 0.0;
|
58
|
+
double intgp = 0.0;
|
59
|
+
|
60
|
+
for (long i = 0; i < len1; i++) {
|
61
|
+
if ((rb_str_equal(aas1_p[i], gap) == Qfalse) && (rb_str_equal(aas2_p[i], gap) == Qfalse)) {
|
62
|
+
align += 1.0;
|
63
|
+
if (rb_str_equal(aas1_p[i], aas2_p[i]) == Qtrue) {
|
64
|
+
ident += 1.0;
|
65
|
+
}
|
66
|
+
} else if (((rb_str_equal(aas1_p[i], gap) == Qtrue) && (rb_str_equal(aas2_p[i], gap) == Qfalse)) ||
|
67
|
+
((rb_str_equal(aas1_p[i], gap) == Qfalse) && (rb_str_equal(aas2_p[i], gap) == Qtrue))) {
|
68
|
+
intgp += 1.0;
|
69
|
+
}
|
70
|
+
}
|
71
|
+
return DBL2NUM(100.0 * ident / (align + intgp));
|
72
|
+
}
|
73
|
+
}
|
74
|
+
end
|
75
|
+
|
76
|
+
def self.calculate_pid(seq1, seq2, unit)
|
77
|
+
begin
|
78
|
+
self.calculate_pid_cpp(seq1, seq2, unit)
|
79
|
+
rescue
|
80
|
+
self.calculate_pid_rb(seq1, seq2, unit)
|
81
|
+
end
|
82
|
+
end
|
83
|
+
|
13
84
|
class << self
|
14
85
|
|
15
86
|
# :nodoc:
|
@@ -38,7 +109,7 @@ Options:
|
|
38
109
|
--tem-file (-f) FILE: a tem file
|
39
110
|
--tem-list (-l) FILE: a list for tem files
|
40
111
|
--classdef (-c) FILE: a file for the defintion of environmental class
|
41
|
-
if no definition file provided, --cys (-y) 2 and --nosmooth options
|
112
|
+
if no definition file provided, --cys (-y) 2 and --nosmooth options applied
|
42
113
|
--outfile (-o) FILE: output filename (default 'allmat.dat')
|
43
114
|
--weight (-w) INTEGER: clustering level (PID) for the BLOSUM-like weighting (default: 60)
|
44
115
|
--noweight: calculate substitution counts with no weights
|
@@ -58,13 +129,13 @@ Options:
|
|
58
129
|
0 for raw counts (no smoothing performed)
|
59
130
|
1 for probabilities
|
60
131
|
2 for log-odds (default)
|
61
|
-
--noroundoff: do not round off log
|
62
|
-
--scale INTEGER: log-odds matrices in 1/n bit units (default 3)
|
63
|
-
--sigma DOUBLE: change the sigma value for smoothing (default 5.0)
|
132
|
+
--noroundoff: do not round off log-odds ratio
|
133
|
+
--scale INTEGER: log-odds matrices in 1/n bit units (default: 3)
|
134
|
+
--sigma DOUBLE: change the sigma value for smoothing (default: 5.0)
|
64
135
|
--autosigma: automatically adjust the sigma value for smoothing
|
65
|
-
--add DOUBLE: add this value to raw counts when deriving log-odds without smoothing (default 0)
|
66
|
-
--pidmin DOUBLE: count substitutions only for pairs with PID equal to or greater than this value
|
67
|
-
--pidmax DOUBLE: count substitutions only for pairs with PID smaller than this value
|
136
|
+
--add DOUBLE: add this value to raw counts when deriving log-odds without smoothing (default: 0)
|
137
|
+
--pidmin DOUBLE: count substitutions only for pairs with PID equal to or greater than this value
|
138
|
+
--pidmax DOUBLE: count substitutions only for pairs with PID smaller than this value
|
68
139
|
--heatmap INTEGER:
|
69
140
|
0 create a heat map file for each substitution table
|
70
141
|
1 create one big file containing all heat maps from substitution tables
|
@@ -91,35 +162,6 @@ Options:
|
|
91
162
|
puts (verbose ? usage + options : usage)
|
92
163
|
end
|
93
164
|
|
94
|
-
# Calculate PID between two sequences
|
95
|
-
#
|
96
|
-
# :call-seq:
|
97
|
-
# Ulla::CLI::calculate_pid(seq1, seq2) -> Float
|
98
|
-
#
|
99
|
-
def calculate_pid(seq1, seq2, unit)
|
100
|
-
aas1 = seq1.scan(/\w{#{unit}}/)
|
101
|
-
aas2 = seq2.scan(/\w{#{unit}}/)
|
102
|
-
cols = aas1.zip(aas2)
|
103
|
-
gap = ($gap || '-') * unit
|
104
|
-
align = 0 # no. of aligned columns
|
105
|
-
ident = 0 # no. of identical columns
|
106
|
-
intgp = 0 # no. of internal gaps
|
107
|
-
|
108
|
-
cols.each do |col|
|
109
|
-
if (col[0] != gap) && (col[1] != gap)
|
110
|
-
align += 1
|
111
|
-
if col[0] == col[1]
|
112
|
-
ident += 1
|
113
|
-
end
|
114
|
-
elsif (((col[0] == gap) && (col[1] != gap)) ||
|
115
|
-
((col[0] != gap) && (col[1] == gap)))
|
116
|
-
intgp += 1
|
117
|
-
end
|
118
|
-
end
|
119
|
-
|
120
|
-
pid = 100.0 * ident.to_f / (align + intgp)
|
121
|
-
end
|
122
|
-
|
123
165
|
# :nodoc:
|
124
166
|
def execute(arguments=[])
|
125
167
|
#
|
@@ -152,9 +194,6 @@ Options:
|
|
152
194
|
# Global variables and their default values
|
153
195
|
#
|
154
196
|
|
155
|
-
$logger = Logger.new(STDOUT)
|
156
|
-
$logger.level = Logger::WARN
|
157
|
-
|
158
197
|
# default set of 21 amino acids including J (Cysteine, the free thiol form)
|
159
198
|
$amino_acids = 'ACDEFGHIKLMNPQRSTVWYJ'.split('')
|
160
199
|
$gap = '-'
|
@@ -179,7 +218,6 @@ Options:
|
|
179
218
|
$scale = 3
|
180
219
|
$pidmin = nil
|
181
220
|
$pidmax = nil
|
182
|
-
$scale = 3
|
183
221
|
$add = nil
|
184
222
|
$cys = 0
|
185
223
|
$targetenv = false
|
@@ -233,6 +271,9 @@ Options:
|
|
233
271
|
[ '--noroundoff', GetoptLong::NO_ARGUMENT ],
|
234
272
|
[ '--sigma', GetoptLong::REQUIRED_ARGUMENT ],
|
235
273
|
[ '--autosigma', GetoptLong::NO_ARGUMENT ],
|
274
|
+
[ '--scale', GetoptLong::REQUIRED_ARGUMENT ],
|
275
|
+
[ '--pidmax', GetoptLong::REQUIRED_ARGUMENT ],
|
276
|
+
[ '--pidmin', GetoptLong::REQUIRED_ARGUMENT ],
|
236
277
|
[ '--add', GetoptLong::REQUIRED_ARGUMENT ],
|
237
278
|
[ '--heatmap', GetoptLong::REQUIRED_ARGUMENT ],
|
238
279
|
[ '--heatmap-stem', GetoptLong::REQUIRED_ARGUMENT ],
|
@@ -297,7 +338,7 @@ Options:
|
|
297
338
|
when '--penv'
|
298
339
|
warn "--penv option is not supported."
|
299
340
|
exit 1
|
300
|
-
|
341
|
+
#$penv = true
|
301
342
|
when '--heatmap'
|
302
343
|
$heatmap = case arg.to_i
|
303
344
|
when (0..2) then arg.to_i
|
@@ -365,19 +406,6 @@ Options:
|
|
365
406
|
warn "Cannot find environment class definition file, #{$classdef}"
|
366
407
|
exit 1
|
367
408
|
end
|
368
|
-
|
369
|
-
require 'math_extensions'
|
370
|
-
require 'array_extensions'
|
371
|
-
require 'string_extensions'
|
372
|
-
require 'narray_extensions'
|
373
|
-
require 'nmatrix_extensions'
|
374
|
-
|
375
|
-
require 'ulla/environment'
|
376
|
-
require 'ulla/environment_class_hash'
|
377
|
-
require 'ulla/environment_feature'
|
378
|
-
require 'ulla/environment_feature_array'
|
379
|
-
require 'ulla/heatmap_array'
|
380
|
-
|
381
409
|
#
|
382
410
|
# Part 2 END
|
383
411
|
#
|
@@ -425,15 +453,18 @@ Options:
|
|
425
453
|
next
|
426
454
|
elsif (env_ftr = line.split(/;/)).length == 5
|
427
455
|
$logger.info "An environment feature, #{line} detected."
|
456
|
+
|
428
457
|
if env_ftr[-1] == 'T'
|
429
458
|
# skip silenced environment feature
|
430
459
|
$logger.warn "The environment feature, #{line} silent."
|
431
460
|
next
|
432
461
|
end
|
462
|
+
|
433
463
|
if env_ftr[-2] == 'T'
|
434
464
|
$cst_features << env_index
|
435
465
|
$logger.warn "The environment feature, #{line} constrained."
|
436
466
|
end
|
467
|
+
|
437
468
|
$env_features << EnvironmentFeature.new(env_ftr[0],
|
438
469
|
env_ftr[1].split(''),
|
439
470
|
env_ftr[2].split(''),
|
@@ -571,7 +602,7 @@ Options:
|
|
571
602
|
seq2 = seq2.split('').each_with_index.map { |aa, pos| aa == $gap ? $ext_gap : env_labels[id2][pos] }.join
|
572
603
|
end
|
573
604
|
|
574
|
-
pid =
|
605
|
+
pid = calculate_pid_cpp(seq1, seq2, $col_size)
|
575
606
|
s1 = seq1.scan(/\S{#{$col_size}}/)
|
576
607
|
s2 = seq2.scan(/\S{#{$col_size}}/)
|
577
608
|
|
@@ -610,8 +641,10 @@ Options:
|
|
610
641
|
next
|
611
642
|
end
|
612
643
|
|
613
|
-
aa1 = (disulphide.has_key?(id1) && (disulphide[id1][pos] == 'F') && (aa1[0].chr == 'C') && ($cys != 2)) ? 'J' + aa1[1..-1] : aa1
|
614
|
-
aa2 = (disulphide.has_key?(id2) && (disulphide[id2][pos] == 'F') && (aa2[0].chr == 'C') && ($cys != 2)) ? 'J' + aa2[1..-1] : aa2
|
644
|
+
#aa1 = (disulphide.has_key?(id1) && (disulphide[id1][pos] == 'F') && (aa1[0].chr == 'C') && ($cys != 2)) ? 'J' + aa1[1..-1] : aa1
|
645
|
+
#aa2 = (disulphide.has_key?(id2) && (disulphide[id2][pos] == 'F') && (aa2[0].chr == 'C') && ($cys != 2)) ? 'J' + aa2[1..-1] : aa2
|
646
|
+
aa1 = (aa1[0].chr == 'C' && (!disulphide.has_key?(id1) || disulphide[id1][pos] == 'F') && $cys != 2) ? 'J' + aa1[1..-1] : aa1
|
647
|
+
aa2 = (aa2[0].chr == 'C' && (!disulphide.has_key?(id2) || disulphide[id2][pos] == 'F') && $cys != 2) ? 'J' + aa2[1..-1] : aa2
|
615
648
|
env_label = $environment == 1 ? aa1 + '-' + aa2[1..-1] : env_labels[id1][pos]
|
616
649
|
|
617
650
|
if $cst_features.empty?
|
@@ -648,7 +681,7 @@ Options:
|
|
648
681
|
ali = ext_ali
|
649
682
|
end
|
650
683
|
|
651
|
-
#
|
684
|
+
# loop for single linkage clustering
|
652
685
|
begin
|
653
686
|
continue = false
|
654
687
|
0.upto(clusters.size - 2) do |i|
|
@@ -657,7 +690,7 @@ Options:
|
|
657
690
|
found = false
|
658
691
|
clusters[i].each do |c1|
|
659
692
|
clusters[j].each do |c2|
|
660
|
-
if
|
693
|
+
if calculate_pid_cpp(ali[c1], ali[c2], $col_size) >= $weight
|
661
694
|
indexes << j
|
662
695
|
found = true
|
663
696
|
break
|
@@ -694,12 +727,12 @@ Options:
|
|
694
727
|
seq1.each_with_index do |aa1, pos|
|
695
728
|
aa2 = seq2[pos]
|
696
729
|
|
697
|
-
if env_labels[id1][pos].include?('X')
|
730
|
+
if env_labels.has_key?(id1) && env_labels[id1][pos].include?('X')
|
698
731
|
$logger.debug "All substitutions from #{id1}-#{pos}-#{aa1[0].chr} are masked."
|
699
732
|
next
|
700
733
|
end
|
701
734
|
|
702
|
-
if env_labels[id2][pos].include?('X')
|
735
|
+
if env_labels.has_key?(id2) && env_labels[id2][pos].include?('X')
|
703
736
|
$logger.debug "All substitutions to #{id2}-#{pos}-#{aa2[0].chr} are masked."
|
704
737
|
next
|
705
738
|
end
|
@@ -714,18 +747,21 @@ Options:
|
|
714
747
|
next
|
715
748
|
end
|
716
749
|
|
717
|
-
aa1 = (disulphide.has_key?(id1) && (disulphide[id1][pos] == 'F') && (aa1[0].chr == 'C') && ($cys != 2)) ? 'J' + aa1[1..-1] : aa1
|
718
|
-
aa2 = (disulphide.has_key?(id2) && (disulphide[id2][pos] == 'F') && (aa2[0].chr == 'C') && ($cys != 2)) ? 'J' + aa2[1..-1] : aa2
|
719
|
-
|
720
|
-
|
750
|
+
#aa1 = (disulphide.has_key?(id1) && (disulphide[id1][pos] == 'F') && (aa1[0].chr == 'C') && ($cys != 2)) ? 'J' + aa1[1..-1] : aa1
|
751
|
+
#aa2 = (disulphide.has_key?(id2) && (disulphide[id2][pos] == 'F') && (aa2[0].chr == 'C') && ($cys != 2)) ? 'J' + aa2[1..-1] : aa2
|
752
|
+
#aa1 = (aa1[0].chr == 'C' && (!disulphide.has_key?(id1) || disulphide[id1][pos] == 'F') && $cys != 2) ? 'J' + aa1[1..-1] : aa1
|
753
|
+
#aa2 = (aa2[0].chr == 'C' && (!disulphide.has_key?(id2) || disulphide[id2][pos] == 'F') && $cys != 2) ? 'J' + aa2[1..-1] : aa2
|
754
|
+
cnt1 = 1.0 / cluster1.size
|
755
|
+
cnt2 = 1.0 / cluster2.size
|
721
756
|
jnt_cnt = cnt1 * cnt2
|
722
757
|
env_label1 = $environment == 1 ? aa1 + '-' + aa2[1..-1] : env_labels[id1][pos]
|
723
758
|
env_label2 = $environment == 1 ? aa2 + '-' + aa1[1..-1] : env_labels[id2][pos]
|
724
759
|
|
725
760
|
if $cst_features.empty?
|
726
|
-
$env_classes[env_label1].increase_residue_count(aa2[0].chr, jnt_cnt)
|
727
|
-
$env_classes[env_label2].increase_residue_count(aa1[0].chr, jnt_cnt)
|
728
|
-
elsif (env_labels[id1][pos].split('').values_at(*$cst_features) ==
|
761
|
+
$env_classes[env_label1].increase_residue_count(aa2[0].chr, jnt_cnt)
|
762
|
+
$env_classes[env_label2].increase_residue_count(aa1[0].chr, jnt_cnt)
|
763
|
+
elsif (env_labels[id1][pos].split('').values_at(*$cst_features) ==
|
764
|
+
env_labels[id2][pos].split('').values_at(*$cst_features))
|
729
765
|
$env_classes[env_label1].increase_residue_count(aa2[0].chr, jnt_cnt)
|
730
766
|
$env_classes[env_label2].increase_residue_count(aa1[0].chr, jnt_cnt)
|
731
767
|
else
|
@@ -735,8 +771,71 @@ Options:
|
|
735
771
|
|
736
772
|
$aa_tot_cnt.has_key?(aa1) ? $aa_tot_cnt[aa1] += cnt1 : $aa_tot_cnt[aa1] = cnt1
|
737
773
|
$aa_tot_cnt.has_key?(aa2) ? $aa_tot_cnt[aa2] += cnt2 : $aa_tot_cnt[aa2] = cnt2
|
738
|
-
$aa_mut_cnt.has_key?(aa1) ? $aa_mut_cnt[aa1] += cnt1 : $aa_mut_cnt[aa1] = cnt1 if aa1
|
739
|
-
$aa_mut_cnt.has_key?(aa2) ? $aa_mut_cnt[aa2] += cnt2 : $aa_mut_cnt[aa2] = cnt2 if aa1
|
774
|
+
($aa_mut_cnt.has_key?(aa1) ? $aa_mut_cnt[aa1] += cnt1 : $aa_mut_cnt[aa1] = cnt1) if aa1 != aa2
|
775
|
+
($aa_mut_cnt.has_key?(aa2) ? $aa_mut_cnt[aa2] += cnt2 : $aa_mut_cnt[aa2] = cnt2) if aa1 != aa2
|
776
|
+
|
777
|
+
#if $cst_features.empty?
|
778
|
+
#if $env_classes.has_key?(env_label1)
|
779
|
+
#if $env_classes.has_key?(env_label2)
|
780
|
+
#$env_classes[env_label1].increase_residue_count(aa2[0].chr, jnt_cnt)
|
781
|
+
#else
|
782
|
+
#if (aa1 == 'C' && aa2 == 'J')
|
783
|
+
#$env_classes[env_label1].increase_residue_count('C', jnt_cnt)
|
784
|
+
#else
|
785
|
+
#$env_classes[env_label1].increase_residue_count(aa2, jnt_cnt)
|
786
|
+
#end
|
787
|
+
#end
|
788
|
+
#end
|
789
|
+
#if $env_classes.has_key?(env_label2)
|
790
|
+
#if $env_classes.has_key?(env_label1)
|
791
|
+
#$env_classes[env_label2].increase_residue_count(aa1[0].chr, jnt_cnt)
|
792
|
+
#else
|
793
|
+
#if (aa2 == 'C' && aa1 == 'J')
|
794
|
+
#$env_classes[env_label2].increase_residue_count('C', jnt_cnt)
|
795
|
+
#else
|
796
|
+
#$env_classes[env_label2].increase_residue_count(aa1, jnt_cnt)
|
797
|
+
#end
|
798
|
+
#end
|
799
|
+
#end
|
800
|
+
#elsif (env_labels[id1][pos].split('').values_at(*$cst_features) == env_labels[id2][pos].split('').values_at(*$cst_features))
|
801
|
+
#$env_classes[env_label1].increase_residue_count(aa2[0].chr, jnt_cnt)
|
802
|
+
#$env_classes[env_label2].increase_residue_count(aa1[0].chr, jnt_cnt)
|
803
|
+
#else
|
804
|
+
#$logger.debug "Skipped #{id1}-#{pos}-#{aa1[0].chr} and #{id2}-#{pos}-#{aa2[0].chr} having different symbols for constrained environment features each other."
|
805
|
+
#next
|
806
|
+
#end
|
807
|
+
|
808
|
+
#if $env_classes.has_key?(env_label1)
|
809
|
+
#$aa_tot_cnt.has_key?(aa1) ? $aa_tot_cnt[aa1] += cnt1 : $aa_tot_cnt[aa1] = cnt1
|
810
|
+
|
811
|
+
#if $env_classes.has_key?(env_label2)
|
812
|
+
#if aa1[0].chr != aa2[0].chr
|
813
|
+
#$aa_mut_cnt.has_key?(aa1) ? $aa_mut_cnt[aa1] += cnt1 : $aa_mut_cnt[aa1] = cnt1
|
814
|
+
#end
|
815
|
+
#else
|
816
|
+
#if (aa1[0].chr != aa2)
|
817
|
+
#unless (aa1[0].chr == 'C' && aa2 == 'J') || (aa1[0].chr == 'J' && aa2 == 'C')
|
818
|
+
#$aa_mut_cnt.has_key?(aa1) ? $aa_mut_cnt[aa1] += cnt1 : $aa_mut_cnt[aa1] = cnt1
|
819
|
+
#end
|
820
|
+
#end
|
821
|
+
#end
|
822
|
+
#end
|
823
|
+
|
824
|
+
#if $env_classes.has_key?(env_label2)
|
825
|
+
#$aa_tot_cnt.has_key?(aa2) ? $aa_tot_cnt[aa2] += cnt2 : $aa_tot_cnt[aa2] = cnt2
|
826
|
+
|
827
|
+
#if $env_classes.has_key?(env_label1)
|
828
|
+
#if aa1[0].chr != aa2[0].chr
|
829
|
+
#$aa_mut_cnt.has_key?(aa2) ? $aa_mut_cnt[aa2] += cnt2 : $aa_mut_cnt[aa2] = cnt2
|
830
|
+
#end
|
831
|
+
#else
|
832
|
+
#if (aa1 != aa2[0].chr)
|
833
|
+
#unless (aa1 == 'J' && aa2[0].chr == 'C') || (aa1 == 'C' && aa2[0].chr == 'J')
|
834
|
+
#$aa_mut_cnt.has_key?(aa2) ? $aa_mut_cnt[aa2] += cnt2 : $aa_mut_cnt[aa2] = cnt2
|
835
|
+
#end
|
836
|
+
#end
|
837
|
+
#end
|
838
|
+
#end
|
740
839
|
|
741
840
|
$logger.debug "#{id1}-#{pos}-#{aa1[0].chr} -> #{id2}-#{pos}-#{aa2[0].chr} substitution count (#{"%.2f" % jnt_cnt}) was added to the environments class, #{env_label1}."
|
742
841
|
$logger.debug "#{id2}-#{pos}-#{aa2[0].chr} -> #{id1}-#{pos}-#{aa1[0].chr} substitution count (#{"%.2f" % jnt_cnt}) was added to the environments class, #{env_label2}."
|
@@ -748,6 +847,13 @@ Options:
|
|
748
847
|
$logger.info "Analysing #{tem_file} done."
|
749
848
|
end
|
750
849
|
|
850
|
+
$tot_aa = $aa_tot_cnt.values.sum
|
851
|
+
|
852
|
+
if $tot_aa < 1
|
853
|
+
$logger.warn "No amino acid substitution counted!"
|
854
|
+
exit 1
|
855
|
+
end
|
856
|
+
|
751
857
|
# print out default header
|
752
858
|
$outfh.puts <<HEADER
|
753
859
|
# Environment-specific amino acid substitution matrices
|
@@ -798,18 +904,15 @@ HEADER
|
|
798
904
|
|
799
905
|
# calculate amino acid frequencies and mutabilities, and
|
800
906
|
# print them as default statistics in the header part
|
907
|
+
|
908
|
+
# pre-calculate ALA's mutability
|
801
909
|
if $environment == 0
|
802
|
-
|
803
|
-
|
804
|
-
|
805
|
-
|
806
|
-
else
|
807
|
-
100.0 * $aa_tot_cnt['A'] / $aa_mut_cnt['A'].to_f
|
808
|
-
end
|
910
|
+
ala_mutb = if $aa_tot_cnt['A'] == 0 then 0.0
|
911
|
+
elsif $aa_mut_cnt['A'] == 0 then 0.0
|
912
|
+
else $aa_mut_cnt['A'].to_f / $aa_tot_cnt['A']
|
913
|
+
end
|
809
914
|
end
|
810
915
|
|
811
|
-
$tot_aa = $aa_tot_cnt.values.sum
|
812
|
-
|
813
916
|
$outfh.puts '#'
|
814
917
|
$outfh.puts "# Total amino acid frequencies:\n"
|
815
918
|
|
@@ -843,8 +946,8 @@ HEADER
|
|
843
946
|
end
|
844
947
|
|
845
948
|
if $environment == 0
|
846
|
-
$aa_mutb[aa] = ($aa_tot_cnt[aa] == 0) ?
|
847
|
-
$aa_rel_mutb[aa] = $aa_mutb[aa]
|
949
|
+
$aa_mutb[aa] = ($aa_tot_cnt[aa] == 0) ? 0.0 : ($aa_mut_cnt[aa] / $aa_tot_cnt[aa].to_f)
|
950
|
+
$aa_rel_mutb[aa] = 100 * $aa_mutb[aa] / ala_mutb
|
848
951
|
end
|
849
952
|
|
850
953
|
$aa_tot_freq[aa] = ($aa_tot_cnt[aa] == 0) ? 0.0 : ($aa_tot_cnt[aa] / $tot_aa.to_f)
|
@@ -866,7 +969,7 @@ HEADER
|
|
866
969
|
|
867
970
|
if $noweight
|
868
971
|
if $environment == 0
|
869
|
-
$outfh.puts '# %-3s %9d %9d %5.2f %8d %8.
|
972
|
+
$outfh.puts '# %-3s %9d %9d %5.2f %8d %8.6f' % columns
|
870
973
|
else
|
871
974
|
$outfh.puts "# %-3s %-#{$env_features.size}s %9d %9d %8.4f" % columns
|
872
975
|
end
|