egor 0.0.4 → 0.0.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/History.txt +14 -0
- data/Manifest.txt +2 -0
- data/README.rdoc +2 -2
- data/lib/egor/cli.rb +553 -401
- data/lib/egor.rb +1 -1
- data/lib/environment.rb +2 -2
- data/lib/environment_class_hash.rb +18 -0
- data/lib/environment_feature_array.rb +10 -0
- data/website/index.html +2 -2
- metadata +6 -25
- data.tar.gz.sig +0 -0
- metadata.gz.sig +0 -0
data/lib/egor/cli.rb
CHANGED
@@ -1,18 +1,20 @@
|
|
1
|
-
require
|
2
|
-
require
|
3
|
-
require
|
4
|
-
require
|
5
|
-
require
|
6
|
-
require
|
7
|
-
require
|
8
|
-
require
|
9
|
-
|
10
|
-
require
|
11
|
-
require
|
12
|
-
require
|
13
|
-
require
|
14
|
-
require
|
15
|
-
require
|
1
|
+
require 'rubygems'
|
2
|
+
require 'getoptlong'
|
3
|
+
require 'logger'
|
4
|
+
require 'narray'
|
5
|
+
require 'bio'
|
6
|
+
require 'set'
|
7
|
+
require 'facets'
|
8
|
+
require 'simple_memoize'
|
9
|
+
|
10
|
+
require 'narray_extensions'
|
11
|
+
require 'nmatrix_extensions'
|
12
|
+
require 'enumerable_extensions'
|
13
|
+
require 'math_extensions'
|
14
|
+
require 'environment'
|
15
|
+
require 'environment_class_hash'
|
16
|
+
require 'environment_feature'
|
17
|
+
require 'environment_feature_array'
|
16
18
|
|
17
19
|
# This is a module for an actual command line interpreter for Egor
|
18
20
|
# ---
|
@@ -45,29 +47,32 @@ Options:
|
|
45
47
|
--tem-list (-l) FILE: a list for tem files
|
46
48
|
--classdef (-c) FILE: a file for the defintion of environments (default: 'classdef.dat')
|
47
49
|
--outfile (-o) FILE: output filename (default 'allmat.dat')
|
48
|
-
--weight (-w) INTEGER: clustering level (PID) for the BLOSUM-like weighting
|
49
|
-
--noweight: calculate substitution counts with no weights
|
50
|
+
--weight (-w) INTEGER: clustering level (PID) for the BLOSUM-like weighting (default: 60)
|
51
|
+
--noweight: calculate substitution counts with no weights
|
50
52
|
--smooth (-s) INTEGER:
|
51
53
|
0 for partial smoothing (default)
|
52
54
|
1 for full smoothing
|
55
|
+
--p1smooth: perform smoothing for p1 probability calculation when partial smoothing
|
53
56
|
--nosmooth: perform no smoothing operation
|
54
57
|
--cys (-y) INTEGER:
|
55
58
|
0 for using C and J only for structure (default)
|
56
59
|
1 for both structure and sequence
|
57
|
-
2 for using only C for both (
|
60
|
+
2 for using only C for both (must be set when you have no 'disulphide' or 'disulfide' annotation in templates)
|
58
61
|
--output INTEGER:
|
59
|
-
0 for raw counts (no
|
62
|
+
0 for raw counts (no smoothing performed)
|
60
63
|
1 for probabilities
|
61
64
|
2 for log-odds (default)
|
65
|
+
--noround: do not round off log odds ratio
|
62
66
|
--scale INTEGER: log-odds matrices in 1/n bit units (default 3)
|
63
|
-
--sigma DOUBLE: change the sigma value for smoothing (default 5)
|
67
|
+
--sigma DOUBLE: change the sigma value for smoothing (default 5.0)
|
68
|
+
--autosigma: automatically adjust the sigma value for smoothing
|
64
69
|
--add DOUBLE: add this value to raw counts when deriving log-odds without smoothing (default 1/#classes)
|
65
70
|
--penv: use environment-dependent frequencies for log-odds calculation (default false) (NOT implemented yet!!!)
|
66
71
|
--pidmin DOUBLE: count substitutions only for pairs with PID equal to or greater than this value (default none)
|
67
72
|
--pidmax DOUBLE: count substitutions only for pairs with PID smaller than this value (default none)
|
68
73
|
--verbose (-v) INTEGER
|
69
|
-
0 for ERROR level
|
70
|
-
1 for WARN or above level
|
74
|
+
0 for ERROR level
|
75
|
+
1 for WARN or above level (default)
|
71
76
|
2 for INFO or above level
|
72
77
|
3 for DEBUG or above level
|
73
78
|
--version: print version
|
@@ -79,72 +84,85 @@ Options:
|
|
79
84
|
# Calculate PID between two sequences
|
80
85
|
#
|
81
86
|
# :call-seq:
|
82
|
-
# Egor::CLI::
|
87
|
+
# Egor::CLI::calculate_pid(seq1, seq2) -> Float
|
83
88
|
#
|
84
|
-
def
|
85
|
-
s1 = seq1.split(
|
86
|
-
s2 = seq2.split(
|
89
|
+
def calculate_pid(seq1, seq2)
|
90
|
+
s1 = seq1.split('')
|
91
|
+
s2 = seq2.split('')
|
87
92
|
cols = s1.zip(s2)
|
88
93
|
align = 0
|
89
94
|
ident = 0
|
90
95
|
intgp = 0
|
91
96
|
|
92
97
|
cols.each do |col|
|
93
|
-
if (col[0] !=
|
98
|
+
if (col[0] != '-') && (col[1] != '-')
|
94
99
|
align += 1
|
95
100
|
if col[0] == col[1]
|
96
101
|
ident += 1
|
97
102
|
end
|
98
|
-
elsif (((col[0] ==
|
99
|
-
((col[0] != "-") && (col[1] == "-")))
|
103
|
+
elsif (((col[0] == '-') && (col[1] != '-')) || ((col[0] != '-') && (col[1] == '-')))
|
100
104
|
intgp += 1
|
101
105
|
end
|
102
106
|
end
|
103
107
|
|
104
108
|
pid = 100.0 * ident.to_f / (align + intgp)
|
105
109
|
end
|
106
|
-
memoize :
|
110
|
+
memoize :calculate_pid
|
107
111
|
|
108
112
|
# :nodoc:
|
109
113
|
def execute(arguments=[])
|
110
114
|
#
|
111
|
-
# Abbreviations in the
|
112
|
-
#
|
113
|
-
# * env: environment
|
114
|
-
# * tem: (FUGUE) template
|
115
|
-
# * classdef: (envlironment) class definition
|
116
|
-
# * aa: amino acid
|
117
|
-
# * aa: weighted amino acid
|
118
|
-
# * tot: total
|
119
|
-
# * rel: relative
|
120
|
-
# * obs: observation (frequency)
|
121
|
-
# * mut: mutation
|
122
|
-
# * mutb: mutability
|
123
|
-
# * freq: frequency
|
124
|
-
# * prob: probability
|
125
|
-
# * opts: options
|
115
|
+
# * Abbreviations in the codes
|
126
116
|
#
|
117
|
+
# env: environment
|
118
|
+
# tem: (FUGUE) template
|
119
|
+
# classdef: (environment) class definition
|
120
|
+
# aa: amino acid
|
121
|
+
# aa: weighted amino acid
|
122
|
+
# tot: total
|
123
|
+
# rel: relative
|
124
|
+
# obs: observation
|
125
|
+
# cnt: count
|
126
|
+
# mut: mutation
|
127
|
+
# mutb: mutability
|
128
|
+
# freq: frequency
|
129
|
+
# prob: probability
|
130
|
+
# logo: log odds ratio
|
131
|
+
# opts: options
|
132
|
+
# fh: file handle
|
133
|
+
# ff: flat file
|
134
|
+
# ali: alignment
|
135
|
+
# mat: matrix
|
136
|
+
# arr: array
|
137
|
+
|
127
138
|
|
128
139
|
# Part 1.
|
129
140
|
#
|
130
141
|
# Global variables and their default values
|
131
142
|
#
|
143
|
+
|
132
144
|
$logger = Logger.new(STDOUT)
|
133
|
-
$logger.level = Logger::
|
134
|
-
|
145
|
+
$logger.level = Logger::WARN
|
146
|
+
|
147
|
+
# default set of 21 amino acids including J (Cysteine, the free thiol form)
|
148
|
+
$amino_acids = 'ACDEFGHIKLMNPQRSTVWYJ'.split('')
|
149
|
+
|
135
150
|
$tem_list = nil
|
136
151
|
$tem_file = nil
|
137
|
-
$classdef =
|
138
|
-
$outfile =
|
152
|
+
$classdef = 'classdef.dat'
|
153
|
+
$outfile = 'allmat.dat'
|
139
154
|
$outfh = nil # file handle for outfile
|
140
|
-
$output = 2
|
155
|
+
$output = 2 # default: log odds matrix
|
141
156
|
$ali_size = 0
|
142
157
|
$tot_aa = 0
|
143
158
|
$sigma = 5.0
|
159
|
+
$autosigma = false
|
144
160
|
$weight = 60
|
145
161
|
$noweight = false
|
146
162
|
$smooth = :partial
|
147
163
|
$nosmooth = false
|
164
|
+
$noround = false
|
165
|
+
$p1smooth = false
|
148
166
|
$scale = 3
|
149
167
|
$pidmin = nil
|
150
168
|
$pidmax = nil
|
@@ -153,16 +171,21 @@ Options:
|
|
153
171
|
$cys = 0
|
154
172
|
$penv = false
|
155
173
|
|
156
|
-
$
|
157
|
-
$
|
174
|
+
$aa_tot_cnt = Hash.new(0)
|
175
|
+
$aa_mut_cnt = Hash.new(0)
|
158
176
|
$aa_mutb = {}
|
159
177
|
$aa_rel_mutb = {}
|
160
|
-
$
|
161
|
-
$
|
178
|
+
$aa_tot_freq = {}
|
179
|
+
$aa_env_cnt = Hash.new(0)
|
162
180
|
$smooth_prob = {}
|
163
|
-
$
|
181
|
+
$tot_cnt_mat = nil
|
164
182
|
$tot_prob_mat = nil
|
165
183
|
$tot_logo_mat = nil
|
184
|
+
$tot_smooth_prob = {}
|
185
|
+
|
186
|
+
# minimum ratio of amino acid count to sigma value
|
187
|
+
$min_obs_sigma_ratio = 500.0
|
188
|
+
|
166
189
|
#
|
167
190
|
# Part 1 END
|
168
191
|
#
|
@@ -171,6 +194,7 @@ Options:
|
|
171
194
|
#
|
172
195
|
# Parsing options
|
173
196
|
#
|
197
|
+
|
174
198
|
opts = GetoptLong.new(
|
175
199
|
[ '--help', '-h', GetoptLong::NO_ARGUMENT ],
|
176
200
|
[ '--tem-list', '-l', GetoptLong::REQUIRED_ARGUMENT ],
|
@@ -178,9 +202,13 @@ Options:
|
|
178
202
|
[ '--classdef', '-c', GetoptLong::REQUIRED_ARGUMENT ],
|
179
203
|
[ '--smooth', '-s', GetoptLong::REQUIRED_ARGUMENT ],
|
180
204
|
[ '--nosmooth', GetoptLong::NO_ARGUMENT ],
|
205
|
+
[ '--p1smooth', GetoptLong::NO_ARGUMENT ],
|
181
206
|
[ '--weight', '-w', GetoptLong::REQUIRED_ARGUMENT ],
|
182
207
|
[ '--noweight', GetoptLong::NO_ARGUMENT ],
|
183
|
-
[ '--
|
208
|
+
[ '--noround', GetoptLong::NO_ARGUMENT ],
|
209
|
+
[ '--sigma', GetoptLong::REQUIRED_ARGUMENT ],
|
210
|
+
[ '--autosigma', GetoptLong::NO_ARGUMENT ],
|
211
|
+
#[ '--heatmap', GetoptLong::NO_ARGUMENT ],
|
184
212
|
[ '--output', GetoptLong::REQUIRED_ARGUMENT ],
|
185
213
|
[ '--cys', '-y', GetoptLong::REQUIRED_ARGUMENT ],
|
186
214
|
[ '--penv', GetoptLong::NO_ARGUMENT ],
|
@@ -189,70 +217,95 @@ Options:
|
|
189
217
|
[ '--version', GetoptLong::NO_ARGUMENT ]
|
190
218
|
)
|
191
219
|
|
192
|
-
|
193
|
-
|
194
|
-
|
195
|
-
|
196
|
-
|
197
|
-
|
198
|
-
|
199
|
-
|
200
|
-
|
201
|
-
|
202
|
-
|
203
|
-
|
204
|
-
|
205
|
-
|
206
|
-
|
207
|
-
|
208
|
-
|
209
|
-
|
210
|
-
|
211
|
-
|
212
|
-
|
213
|
-
|
214
|
-
|
215
|
-
|
216
|
-
|
217
|
-
|
218
|
-
|
219
|
-
|
220
|
-
|
221
|
-
|
222
|
-
|
223
|
-
|
224
|
-
|
225
|
-
|
226
|
-
|
227
|
-
|
228
|
-
|
229
|
-
|
230
|
-
|
231
|
-
|
232
|
-
|
233
|
-
|
234
|
-
|
235
|
-
|
236
|
-
|
237
|
-
|
238
|
-
|
239
|
-
|
240
|
-
|
241
|
-
|
242
|
-
|
243
|
-
|
244
|
-
|
245
|
-
|
220
|
+
begin
|
221
|
+
opts.each do |opt, arg|
|
222
|
+
case opt
|
223
|
+
when '--help'
|
224
|
+
print_usage
|
225
|
+
exit 0
|
226
|
+
when '--tem-list'
|
227
|
+
$tem_list = arg
|
228
|
+
when '--tem-file'
|
229
|
+
$tem_file = arg
|
230
|
+
when '--classdef'
|
231
|
+
$classdef = arg
|
232
|
+
when '--output'
|
233
|
+
$output = arg.to_i
|
234
|
+
when '--outfile'
|
235
|
+
$outfile = arg
|
236
|
+
when '--cys'
|
237
|
+
$cys = arg.to_i
|
238
|
+
when '--weight'
|
239
|
+
$weight = arg.to_i
|
240
|
+
when '--sigma'
|
241
|
+
$sigma = arg.to_f
|
242
|
+
when '--autosigma'
|
243
|
+
$autosigma = true
|
244
|
+
when '--pidmin'
|
245
|
+
$pidmin = arg.to_f
|
246
|
+
when '--pidmax'
|
247
|
+
$pidmax = arg.to_f
|
248
|
+
when '--noweight'
|
249
|
+
$noweight = true
|
250
|
+
when '--noround'
|
251
|
+
$noround = true
|
252
|
+
when '--smooth'
|
253
|
+
$smooth = (arg.to_i == 1) ? :full : :partial
|
254
|
+
when '--nosmooth'
|
255
|
+
$nosmooth = true
|
256
|
+
when '--p1smooth'
|
257
|
+
$p1smooth = true
|
258
|
+
when '--scale'
|
259
|
+
$scale = arg.to_f
|
260
|
+
when '--add'
|
261
|
+
$add = arg.to_f
|
262
|
+
when '--penv'
|
263
|
+
warn "--penv option is not supported yet."
|
264
|
+
exit 1
|
265
|
+
$penv = true
|
266
|
+
# when '--heatmap'
|
267
|
+
# $heatmap = true
|
268
|
+
when '--verbose'
|
269
|
+
$logger.level = case arg.to_i
|
270
|
+
when 0 then Logger::ERROR
|
271
|
+
when 1 then Logger::WARN
|
272
|
+
when 2 then Logger::INFO
|
273
|
+
when 3 then Logger::DEBUG
|
274
|
+
else Logger::WARN
|
275
|
+
end
|
276
|
+
when '--version'
|
277
|
+
print_version
|
278
|
+
exit 0
|
279
|
+
end
|
246
280
|
end
|
281
|
+
rescue
|
282
|
+
# invalid option
|
283
|
+
exit 1
|
247
284
|
end
|
248
285
|
|
249
286
|
# when arguments are nonsense, print usage
|
250
|
-
if ((ARGV.length != 0) ||
|
251
|
-
(!$tem_list && !$tem_file) ||
|
252
|
-
($tem_list && $tem_file))
|
287
|
+
if ((ARGV.length != 0) || (!$tem_list && !$tem_file) || ($tem_list && $tem_file))
|
253
288
|
print_usage
|
254
289
|
exit 1
|
255
290
|
end
|
291
|
+
|
292
|
+
# warn if any input file is missing
|
293
|
+
if $tem_list && !File.exist?($tem_list)
|
294
|
+
warn "Cannot find template list file, #{$tem_list}"
|
295
|
+
exit 1
|
296
|
+
end
|
297
|
+
|
298
|
+
if $tem_file && !File.exist?($tem_file)
|
299
|
+
warn "Cannot find template file, #{$tem_file}"
|
300
|
+
exit 1
|
301
|
+
end
|
302
|
+
|
303
|
+
if $classdef && !File.exist?($classdef)
|
304
|
+
warn "Cannot find environment class definition file, #{$classdef}"
|
305
|
+
exit 1
|
306
|
+
end
|
307
|
+
|
308
|
+
|
256
309
|
#
|
257
310
|
# Part 2 END
|
258
311
|
#
|
@@ -263,76 +316,68 @@ Options:
|
|
263
316
|
# Reading Environment Class Definition File
|
264
317
|
#
|
265
318
|
|
266
|
-
|
267
|
-
$amino_acids = "ACDEFGHIKLMNPQRSTVWY".split("") if $cys == 2
|
319
|
+
$logger.info "Egor START."
|
268
320
|
|
269
|
-
#
|
270
|
-
$
|
321
|
+
# check --cys option and modify amino_acids set if necessary
|
322
|
+
if $cys == 2
|
323
|
+
$amino_acids = 'ACDEFGHIKLMNPQRSTVWY'.split('')
|
324
|
+
end
|
325
|
+
|
326
|
+
# create an EnvironmentFeatureArray object for storing all environment features
|
327
|
+
$env_features = EnvironmentFeatureArray.new
|
271
328
|
|
272
329
|
# an array for storing indexes of constrained environment features
|
273
330
|
$cst_features = []
|
274
331
|
|
275
|
-
#
|
276
|
-
$env_features << EnvironmentFeature.new(
|
277
|
-
$amino_acids,
|
278
|
-
$amino_acids,
|
279
|
-
"F",
|
280
|
-
"F")
|
332
|
+
# add substituted amino acid (aa1) in a substitution to the environment feature list
|
333
|
+
$env_features << EnvironmentFeature.new('sequence', $amino_acids, $amino_acids, 'F', 'F')
|
281
334
|
|
282
|
-
# read environment class definiton file and
|
283
|
-
# store them into the hash prepared above
|
335
|
+
# read environment class definition file and store them into the hash prepared above
|
284
336
|
env_index = 1
|
285
337
|
|
286
338
|
IO.foreach($classdef) do |line|
|
287
339
|
line.chomp!
|
288
|
-
if line.start_with?(
|
340
|
+
if line.start_with?('#')
|
289
341
|
next
|
290
342
|
elsif (env_ftr = line.chomp.split(/;/)).length == 5
|
291
|
-
$logger.info "
|
292
|
-
if env_ftr[-1] ==
|
343
|
+
$logger.info "An environment feature, #{line} detected."
|
344
|
+
if env_ftr[-1] == 'T'
|
293
345
|
# skip silenced environment feature
|
294
|
-
$logger.warn "
|
346
|
+
$logger.warn "The environment feature, #{line} silent."
|
295
347
|
next
|
296
348
|
end
|
297
|
-
if env_ftr[-2] ==
|
349
|
+
if env_ftr[-2] == 'T'
|
298
350
|
$cst_features << env_index
|
299
|
-
$logger.warn "
|
351
|
+
$logger.warn "The environment feature, #{line} constrained."
|
300
352
|
end
|
301
|
-
$env_features << EnvironmentFeature.new(env_ftr[0],
|
302
|
-
env_ftr[1].split(""),
|
303
|
-
env_ftr[2].split(""),
|
304
|
-
env_ftr[3],
|
305
|
-
env_ftr[4])
|
353
|
+
$env_features << EnvironmentFeature.new(env_ftr[0], env_ftr[1].split(''), env_ftr[2].split(''), env_ftr[3], env_ftr[4])
|
306
354
|
env_index += 1
|
307
355
|
else
|
308
|
-
$logger.error "
|
356
|
+
$logger.error "\"#{line}\" doesn't seem to be a proper format for a environment class definition."
|
309
357
|
exit 1
|
310
358
|
end
|
311
359
|
end
|
312
360
|
|
313
|
-
# a hash for storing all environment
|
314
|
-
$
|
315
|
-
|
316
|
-
# generate all possible combinations of environment labels, and
|
317
|
-
|
318
|
-
|
319
|
-
sum << ec.labels
|
320
|
-
}.inject { |pro, lb|
|
321
|
-
pro.product(lb)
|
322
|
-
}.each_with_index { |e, i|
|
323
|
-
$envs[e.flatten.join] = Environment.new(i, e.flatten.join, $amino_acids)
|
361
|
+
# a hash for storing all environment classes
|
362
|
+
$env_classes = EnvironmentClassHash.new
|
363
|
+
|
364
|
+
# generate all possible combinations of environment labels, and store every environment class into the hash prepared above with the label as a key
|
365
|
+
$env_features.label_combinations.each_with_index { |e, i|
|
366
|
+
$env_classes[e.flatten.join] = Environment.new(i, e.flatten.join, $amino_acids)
|
324
367
|
}
|
368
|
+
|
325
369
|
#
|
326
370
|
# Part 3 END
|
327
371
|
#
|
328
372
|
|
373
|
+
|
329
374
|
# Part 4.
|
330
375
|
#
|
331
376
|
# Reading TEM file or TEMLIST list file and counting substitutions
|
332
377
|
#
|
333
378
|
|
334
379
|
# a global file handle for output
|
335
|
-
$outfh = File.open($outfile,
|
380
|
+
$outfh = File.open($outfile, 'w')
|
336
381
|
|
337
382
|
if $tem_file
|
338
383
|
$tem_list_io = StringIO.new($tem_file)
|
@@ -345,18 +390,19 @@ Options:
|
|
345
390
|
$tem_list_io.each_line do |tem_file|
|
346
391
|
tem_file.chomp!
|
347
392
|
|
348
|
-
$logger.info "
|
393
|
+
$logger.info "Analysing #{tem_file} ..."
|
349
394
|
|
350
395
|
ali = Bio::Alignment::OriginalAlignment.new
|
351
396
|
ff = Bio::FlatFile.auto(tem_file)
|
397
|
+
|
352
398
|
ff.each_entry do |pir|
|
353
|
-
if pir.definition ==
|
354
|
-
ali.add_seq(pir.data.gsub("\n",
|
399
|
+
if (pir.definition == 'sequence') || (pir.definition == 'structure')
|
400
|
+
ali.add_seq(pir.data.gsub("\n", ''), pir.entry_id)
|
355
401
|
end
|
356
402
|
end
|
357
403
|
|
358
404
|
if ali.size < 2
|
359
|
-
$logger.warn "
|
405
|
+
$logger.warn "Skipped #{tem_file}, there is only one unique entry."
|
360
406
|
next
|
361
407
|
end
|
362
408
|
|
@@ -368,8 +414,8 @@ Options:
|
|
368
414
|
# check disulphide bond environment first!
|
369
415
|
ff.rewind
|
370
416
|
ff.each_entry do |pir|
|
371
|
-
if (pir.entry_id == key) && (pir.definition == "disulphide")
|
372
|
-
disulphide[key] = pir.data.gsub("\n",
|
417
|
+
if (pir.entry_id == key) && ((pir.definition == "disulphide") || (pir.definition == "disulfide"))
|
418
|
+
disulphide[key] = pir.data.gsub("\n", '').split('')
|
373
419
|
end
|
374
420
|
end
|
375
421
|
|
@@ -379,14 +425,14 @@ Options:
|
|
379
425
|
ff.rewind
|
380
426
|
ff.each_entry do |pir|
|
381
427
|
if (pir.entry_id == key) && (pir.definition == ec.name)
|
382
|
-
labels = pir.data.gsub("\n",
|
383
|
-
if sym ==
|
384
|
-
|
385
|
-
elsif sym ==
|
386
|
-
|
428
|
+
labels = pir.data.gsub("\n", '').split('').map_with_index do |sym, pos|
|
429
|
+
if sym == '-'
|
430
|
+
'-'
|
431
|
+
elsif sym == 'X' || sym == 'x'
|
432
|
+
'X'
|
387
433
|
else
|
388
434
|
if ei == 0 # Amino Acid Environment Feature
|
389
|
-
(
|
435
|
+
(disulphide.has_key?(key) && (disulphide[key][pos] == 'F') && (sym == 'C')) ? 'J' : sym
|
390
436
|
else
|
391
437
|
ec.labels[ec.symbols.index(sym)]
|
392
438
|
end
|
@@ -407,19 +453,19 @@ Options:
|
|
407
453
|
ali.each_pair do |id1, seq1|
|
408
454
|
ali.each_pair do |id2, seq2|
|
409
455
|
if id1 != id2
|
410
|
-
pid =
|
411
|
-
s1 = seq1.split(
|
412
|
-
s2 = seq2.split(
|
456
|
+
pid = calculate_pid(seq1, seq2)
|
457
|
+
s1 = seq1.split('')
|
458
|
+
s2 = seq2.split('')
|
413
459
|
|
414
460
|
# check PID_MIN
|
415
461
|
if $pidmin && (pid < $pidmin)
|
416
|
-
$logger.info "
|
462
|
+
$logger.info "Skip alignment between #{id1} and #{id2} having PID, #{pid}% less than PID_MIN, #{$pidmin}."
|
417
463
|
next
|
418
464
|
end
|
419
465
|
|
420
466
|
# check PID_MAX
|
421
467
|
if $pidmax && (pid > $pidmax)
|
422
|
-
$logger.info "
|
468
|
+
$logger.info "Skip alignment between #{id1} and #{id2} having PID, #{pid}% greater than PID_MAX, #{$pidmax}."
|
423
469
|
next
|
424
470
|
end
|
425
471
|
|
@@ -427,65 +473,65 @@ Options:
|
|
427
473
|
aa1.upcase!
|
428
474
|
aa2 = s2[pos].upcase
|
429
475
|
|
430
|
-
if env_labels[id1][pos].include?(
|
431
|
-
$logger.info "
|
476
|
+
if env_labels[id1][pos].include?('X')
|
477
|
+
$logger.info "Substitutions from #{id1}-#{pos}-#{aa1} were masked."
|
432
478
|
next
|
433
479
|
end
|
434
480
|
|
435
|
-
if env_labels[id2][pos].include?(
|
436
|
-
$logger.info "
|
481
|
+
if env_labels[id2][pos].include?('X')
|
482
|
+
$logger.info "Substitutions to #{id2}-#{pos}-#{aa2} were masked."
|
437
483
|
next
|
438
484
|
end
|
439
485
|
|
440
|
-
|
441
|
-
$logger.warn "
|
486
|
+
unless $amino_acids.include?(aa1)
|
487
|
+
$logger.warn "#{id1}-#{pos}-#{aa1} is not a standard amino acid." unless aa1 == "-"
|
442
488
|
next
|
443
489
|
end
|
444
490
|
|
445
|
-
|
446
|
-
$logger.warn "
|
491
|
+
unless $amino_acids.include?(aa2)
|
492
|
+
$logger.warn "#{id1}-#{pos}-#{aa2} is not a standard amino acid." unless aa2 == "-"
|
447
493
|
next
|
448
494
|
end
|
449
495
|
|
450
|
-
aa1 = (
|
451
|
-
aa2 = (
|
496
|
+
aa1 = (disulphide.has_key?(id1) && (disulphide[id1][pos] == 'F') && (aa1 == 'C')) ? 'J' : aa1
|
497
|
+
aa2 = (disulphide.has_key?(id2) && (disulphide[id2][pos] == 'F') && (aa2 == 'C')) ? 'J' : aa2
|
452
498
|
|
453
499
|
if $cst_features.empty?
|
454
|
-
$
|
455
|
-
elsif (env_labels[id1][pos].split(
|
456
|
-
$
|
500
|
+
$env_classes[env_labels[id1][pos]].increase_residue_count(aa2)
|
501
|
+
elsif (env_labels[id1][pos].split('').values_at(*$cst_features) == env_labels[id2][pos].split('').values_at(*$cst_features))
|
502
|
+
$env_classes[env_labels[id1][pos]].increase_residue_count(aa2)
|
457
503
|
else
|
458
|
-
$logger.debug "
|
504
|
+
$logger.debug "Skipped #{id1}-#{pos}-#{aa1} and #{id2}-#{pos}-#{aa2}, they have different symbols for constrained environment features each other."
|
459
505
|
next
|
460
506
|
end
|
461
507
|
|
462
508
|
grp_label = env_labels[id1][pos][1..-1]
|
463
509
|
|
464
|
-
if $
|
465
|
-
if $
|
466
|
-
$
|
510
|
+
if $aa_env_cnt.has_key? grp_label
|
511
|
+
if $aa_env_cnt[grp_label].has_key? aa1
|
512
|
+
$aa_env_cnt[grp_label][aa1] += 1
|
467
513
|
else
|
468
|
-
$
|
514
|
+
$aa_env_cnt[grp_label][aa1] = 1
|
469
515
|
end
|
470
516
|
else
|
471
|
-
$
|
472
|
-
$
|
517
|
+
$aa_env_cnt[grp_label] = Hash.new(0)
|
518
|
+
$aa_env_cnt[grp_label][aa1] = 1
|
473
519
|
end
|
474
520
|
|
475
|
-
if $
|
476
|
-
$
|
521
|
+
if $aa_tot_cnt.has_key? aa1
|
522
|
+
$aa_tot_cnt[aa1] += 1
|
477
523
|
else
|
478
|
-
$
|
524
|
+
$aa_tot_cnt[aa1] = 1
|
479
525
|
end
|
480
526
|
|
481
527
|
if aa1 != aa2
|
482
|
-
if $
|
483
|
-
$
|
528
|
+
if $aa_mut_cnt.has_key? aa1
|
529
|
+
$aa_mut_cnt[aa1] += 1
|
484
530
|
else
|
485
|
-
$
|
531
|
+
$aa_mut_cnt[aa1] = 1
|
486
532
|
end
|
487
533
|
end
|
488
|
-
$logger.debug "
|
534
|
+
$logger.debug "#{id1}-#{pos}-#{aa1} -> #{id2}-#{pos}-#{aa2} substitution count (1) was added to the environments class, #{env_labels[id1][pos]}."
|
489
535
|
end
|
490
536
|
end
|
491
537
|
end
|
@@ -504,7 +550,7 @@ Options:
|
|
504
550
|
found = false
|
505
551
|
clusters[i].each do |c1|
|
506
552
|
clusters[j].each do |c2|
|
507
|
-
if
|
553
|
+
if calculate_pid(ali[c1], ali[c2]) >= $weight
|
508
554
|
indexes << j
|
509
555
|
found = true
|
510
556
|
break
|
@@ -527,106 +573,110 @@ Options:
|
|
527
573
|
end
|
528
574
|
end while(continue)
|
529
575
|
|
576
|
+
if clusters.size < 2
|
577
|
+
$logger.debug "Skipped #{tem_file} because there is only one cluster at the #{$weight} PID level."
|
578
|
+
next
|
579
|
+
end
|
580
|
+
|
530
581
|
clusters.combination(2).each do |cluster1, cluster2|
|
531
582
|
cluster1.each do |id1|
|
532
583
|
cluster2.each do |id2|
|
533
|
-
seq1 = ali[id1].split(
|
534
|
-
seq2 = ali[id2].split(
|
584
|
+
seq1 = ali[id1].split('')
|
585
|
+
seq2 = ali[id2].split('')
|
535
586
|
|
536
587
|
seq1.each_with_index do |aa1, pos|
|
537
588
|
aa1.upcase!
|
538
|
-
aa2 = seq2[pos].upcase rescue next # should fix this in sane way!
|
589
|
+
aa2 = seq2[pos].upcase rescue next # should fix this in a sane way!
|
539
590
|
|
540
|
-
if env_labels[id1][pos].include?(
|
541
|
-
$logger.debug "
|
591
|
+
if env_labels[id1][pos].include?('X')
|
592
|
+
$logger.debug "All substitutions from #{id1}-#{pos}-#{aa1} are masked."
|
542
593
|
next
|
543
594
|
end
|
544
595
|
|
545
|
-
if env_labels[id2][pos].include?(
|
546
|
-
$logger.debug "
|
596
|
+
if env_labels[id2][pos].include?('X')
|
597
|
+
$logger.debug "All substitutions to #{id2}-#{pos}-#{aa2} are masked."
|
547
598
|
next
|
548
599
|
end
|
549
600
|
|
550
|
-
|
551
|
-
$logger.warn "
|
601
|
+
unless $amino_acids.include?(aa1)
|
602
|
+
$logger.warn "#{id1}-#{pos}-#{aa1} is not standard amino acid." unless aa1 == "-"
|
552
603
|
next
|
553
604
|
end
|
554
605
|
|
555
|
-
|
556
|
-
$logger.warn "
|
606
|
+
unless $amino_acids.include?(aa2)
|
607
|
+
$logger.warn "#{id2}-#{pos}-#{aa2} is not standard amino acid." unless aa2 == "-"
|
557
608
|
next
|
558
609
|
end
|
559
610
|
|
560
|
-
aa1 = (
|
561
|
-
aa2 = (
|
562
|
-
|
563
|
-
|
564
|
-
|
565
|
-
obs2 = 1.0 / size2
|
611
|
+
aa1 = (disulphide.has_key?(id1) && (disulphide[id1][pos] == 'F') && (aa1 == 'C')) ? 'J' : aa1
|
612
|
+
aa2 = (disulphide.has_key?(id2) && (disulphide[id2][pos] == 'F') && (aa2 == 'C')) ? 'J' : aa2
|
613
|
+
obs1 = 1.0 / cluster1.size
|
614
|
+
obs2 = 1.0 / cluster2.size
|
615
|
+
obs_cnt = obs1 * obs2
|
566
616
|
|
567
617
|
if $cst_features.empty?
|
568
|
-
$
|
569
|
-
$
|
570
|
-
elsif (env_labels[id1][pos].split(
|
571
|
-
$
|
572
|
-
$
|
618
|
+
$env_classes[env_labels[id1][pos]].increase_residue_count(aa2, obs_cnt)
|
619
|
+
$env_classes[env_labels[id2][pos]].increase_residue_count(aa1, obs_cnt)
|
620
|
+
elsif (env_labels[id1][pos].split('').values_at(*$cst_features) == env_labels[id2][pos].split('').values_at(*$cst_features))
|
621
|
+
$env_classes[env_labels[id1][pos]].increase_residue_count(aa2, obs_cnt)
|
622
|
+
$env_classes[env_labels[id2][pos]].increase_residue_count(aa1, obs_cnt)
|
573
623
|
else
|
574
|
-
$logger.debug "
|
624
|
+
$logger.debug "#{id1}-#{pos}-#{aa1} and #{id2}-#{pos}-#{aa2} have different symbols for constrained environment features each other."
|
575
625
|
next
|
576
626
|
end
|
577
627
|
|
578
628
|
grp_label1 = env_labels[id1][pos][1..-1]
|
579
629
|
grp_label2 = env_labels[id2][pos][1..-1]
|
580
630
|
|
581
|
-
if $
|
582
|
-
if $
|
583
|
-
$
|
631
|
+
if $aa_env_cnt.has_key? grp_label1
|
632
|
+
if $aa_env_cnt[grp_label1].has_key? aa1
|
633
|
+
$aa_env_cnt[grp_label1][aa1] += obs1
|
584
634
|
else
|
585
|
-
$
|
635
|
+
$aa_env_cnt[grp_label1][aa1] = obs1
|
586
636
|
end
|
587
637
|
else
|
588
|
-
$
|
589
|
-
$
|
638
|
+
$aa_env_cnt[grp_label1] = Hash.new(0.0)
|
639
|
+
$aa_env_cnt[grp_label1][aa1] = obs1
|
590
640
|
end
|
591
641
|
|
592
|
-
if $
|
593
|
-
if $
|
594
|
-
$
|
642
|
+
if $aa_env_cnt.has_key? grp_label2
|
643
|
+
if $aa_env_cnt[grp_label2].has_key? aa2
|
644
|
+
$aa_env_cnt[grp_label2][aa2] += obs2
|
595
645
|
else
|
596
|
-
$
|
646
|
+
$aa_env_cnt[grp_label2][aa2] = obs2
|
597
647
|
end
|
598
648
|
else
|
599
|
-
$
|
600
|
-
$
|
649
|
+
$aa_env_cnt[grp_label2] = Hash.new(0.0)
|
650
|
+
$aa_env_cnt[grp_label2][aa2] = obs2
|
601
651
|
end
|
602
652
|
|
603
|
-
if $
|
604
|
-
$
|
653
|
+
if $aa_tot_cnt.has_key? aa1
|
654
|
+
$aa_tot_cnt[aa1] += obs1
|
605
655
|
else
|
606
|
-
$
|
656
|
+
$aa_tot_cnt[aa1] = obs1
|
607
657
|
end
|
608
658
|
|
609
|
-
if $
|
610
|
-
$
|
659
|
+
if $aa_tot_cnt.has_key? aa2
|
660
|
+
$aa_tot_cnt[aa2] += obs2
|
611
661
|
else
|
612
|
-
$
|
662
|
+
$aa_tot_cnt[aa2] = obs2
|
613
663
|
end
|
614
664
|
|
615
665
|
if aa1 != aa2
|
616
|
-
if $
|
617
|
-
$
|
666
|
+
if $aa_mut_cnt.has_key? aa1
|
667
|
+
$aa_mut_cnt[aa1] += obs1
|
618
668
|
else
|
619
|
-
$
|
669
|
+
$aa_mut_cnt[aa1] = obs1
|
620
670
|
end
|
621
|
-
if $
|
622
|
-
$
|
671
|
+
if $aa_mut_cnt.has_key? aa2
|
672
|
+
$aa_mut_cnt[aa2] += obs2
|
623
673
|
else
|
624
|
-
$
|
674
|
+
$aa_mut_cnt[aa2] = obs2
|
625
675
|
end
|
626
676
|
end
|
627
677
|
|
628
|
-
$logger.debug "
|
629
|
-
$logger.debug "
|
678
|
+
$logger.debug "#{id1}-#{pos}-#{aa1} -> #{id2}-#{pos}-#{aa2} substitution count (#{"%.2f" % obs_cnt}) was added to the environments class, #{env_labels[id1][pos]}."
|
679
|
+
$logger.debug "#{id2}-#{pos}-#{aa2} -> #{id1}-#{pos}-#{aa1} substitution count (#{"%.2f" % obs_cnt}) was added to the environments class, #{env_labels[id2][pos]}."
|
630
680
|
end
|
631
681
|
end
|
632
682
|
end
|
@@ -636,7 +686,6 @@ Options:
|
|
636
686
|
|
637
687
|
# print out default header
|
638
688
|
$outfh.puts <<HEADER
|
639
|
-
#
|
640
689
|
# Environment-specific amino acid substitution matrices
|
641
690
|
# Creator: egor version #{Egor::VERSION}
|
642
691
|
# Creation Date: #{Time.now.strftime("%d/%m/%Y %H:%M")}
|
@@ -649,55 +698,94 @@ HEADER
|
|
649
698
|
$env_features[1..-1].each { |e| $outfh.puts "# #{e}" }
|
650
699
|
|
651
700
|
$outfh.puts <<HEADER
|
652
|
-
#
|
653
701
|
# (read in from #{$classdef})
|
654
702
|
#
|
655
703
|
# Number of alignments: #{$ali_size}
|
656
704
|
# (list of .tem files read in from #{$tem_list})
|
657
705
|
#
|
658
|
-
# Total number of environments: #{Integer($
|
706
|
+
# Total number of environments: #{Integer($env_classes.size / $amino_acids.size)}
|
659
707
|
#
|
660
708
|
# There are #{$amino_acids.size} amino acids considered.
|
661
709
|
# #{$amino_acids.join}
|
662
710
|
#
|
663
711
|
HEADER
|
664
712
|
|
713
|
+
if $amino_acids.include? 'J'
|
714
|
+
$outfh.puts <<HEADER
|
715
|
+
# C: Cystine (the disulfide-bonded form)
|
716
|
+
# J: Cysteine (the free thiol form)
|
717
|
+
#
|
718
|
+
HEADER
|
719
|
+
end
|
720
|
+
|
665
721
|
if $noweight
|
666
|
-
$outfh.puts
|
722
|
+
$outfh.puts '# Weighting scheme: none'
|
667
723
|
else
|
668
724
|
$outfh.puts "# Weighting scheme: clustering at PID #{$weight} level"
|
669
725
|
end
|
670
726
|
|
671
727
|
# calculate amino acid frequencies and mutabilities, and
|
672
728
|
# print them as default statistics in the header part
|
673
|
-
ala_factor = if $
|
729
|
+
ala_factor = if $aa_tot_cnt['A'] == 0
|
674
730
|
0.0
|
675
|
-
elsif $
|
731
|
+
elsif $aa_mut_cnt['A'] == 0
|
676
732
|
0.0
|
677
733
|
else
|
678
|
-
100.0 * $
|
734
|
+
100.0 * $aa_tot_cnt['A'] / $aa_mut_cnt['A'].to_f
|
679
735
|
end
|
680
|
-
$tot_aa = $
|
736
|
+
$tot_aa = $aa_tot_cnt.values.sum
|
681
737
|
|
682
|
-
$outfh.puts
|
738
|
+
$outfh.puts '#'
|
683
739
|
$outfh.puts "# Total amino acid frequencies:\n"
|
684
|
-
$outfh.puts "# %-3s %9s %9s %5s %8s %8s" % %w[RES TOT_OBS MUT_OBS MUTB REL_MUTB
|
740
|
+
$outfh.puts "# %-3s %9s %9s %5s %8s %8s" % %w[RES TOT_OBS MUT_OBS MUTB REL_MUTB REL_FREQ]
|
741
|
+
|
742
|
+
min_obs = -1
|
743
|
+
min_sigma = nil
|
685
744
|
|
686
745
|
$amino_acids.each do |res|
|
687
|
-
|
746
|
+
if ($aa_tot_cnt[res] / $sigma) < $min_obs_sigma_ratio
|
747
|
+
if min_obs < 0
|
748
|
+
min_obs = $aa_tot_cnt[res]
|
749
|
+
min_sigma = min_obs / $min_obs_sigma_ratio
|
750
|
+
elsif (min_obs > 0) && (min_obs > $aa_tot_cnt[res])
|
751
|
+
min_obs = $aa_tot_cnt[res]
|
752
|
+
min_sigma = min_obs / $min_obs_sigma_ratio
|
753
|
+
end
|
754
|
+
|
755
|
+
$logger.warn "The current sigma value, #{$sigma} seems to be too big for the total observation (#{"%.2f" % $aa_tot_cnt[res]}) of amino acid, #{res}."
|
756
|
+
end
|
757
|
+
|
758
|
+
$aa_mutb[res] = ($aa_tot_cnt[res] == 0) ? 1.0 : ($aa_mut_cnt[res] / $aa_tot_cnt[res].to_f)
|
688
759
|
$aa_rel_mutb[res] = $aa_mutb[res] * ala_factor
|
689
|
-
$
|
760
|
+
$aa_tot_freq[res] = ($aa_tot_cnt[res] == 0) ? 0.0 : ($aa_tot_cnt[res] / $tot_aa.to_f)
|
690
761
|
end
|
691
762
|
|
692
763
|
$amino_acids.each do |res|
|
693
764
|
if $noweight
|
694
|
-
$outfh.puts
|
695
|
-
[res, $
|
765
|
+
$outfh.puts '# %-3s %9d %9d %5.2f %8d %8.4f' %
|
766
|
+
[res, $aa_tot_cnt[res], $aa_mut_cnt[res], $aa_mutb[res], $aa_rel_mutb[res], $aa_tot_freq[res]]
|
696
767
|
else
|
697
|
-
$outfh.puts
|
698
|
-
[res, $
|
768
|
+
$outfh.puts '# %-3s %9.2f %9.2f %5.2f %8d %8.4f' %
|
769
|
+
[res, $aa_tot_cnt[res], $aa_mut_cnt[res], $aa_mutb[res], $aa_rel_mutb[res], $aa_tot_freq[res]]
|
770
|
+
end
|
771
|
+
end
|
772
|
+
|
773
|
+
if min_obs > -1
|
774
|
+
$logger.warn "We recommend you to use a sigma value equal to or smaller than #{min_sigma}."
|
775
|
+
if $autosigma
|
776
|
+
$logger.warn "The sigma value has been changed from #{$sigma} to #{min_sigma}."
|
777
|
+
$sigma = min_sigma
|
699
778
|
end
|
700
779
|
end
|
780
|
+
|
781
|
+
$outfh.puts '#'
|
782
|
+
$outfh.puts '# RES: Amino acid one letter code'
|
783
|
+
$outfh.puts '# TOT_OBS: Total observations of incidence'
|
784
|
+
$outfh.puts '# MUT_OBS: Total observations of mutation'
|
785
|
+
$outfh.puts '# MUTB: Mutability (MUT_OBS / TOT_OBS)'
|
786
|
+
$outfh.puts '# REL_MUTB: Relative mutability (ALA=100)'
|
787
|
+
$outfh.puts '# REL_FREQ: Relative frequency'
|
788
|
+
$outfh.puts '#'
|
701
789
|
#
|
702
790
|
# Part 4. END
|
703
791
|
#
|
@@ -705,48 +793,45 @@ HEADER
|
|
705
793
|
|
706
794
|
# Part 5.
|
707
795
|
#
|
708
|
-
#
|
796
|
+
# Generating substitution frequency matrices
|
709
797
|
#
|
710
798
|
|
711
799
|
# calculating probabilities for each environment
|
712
|
-
$
|
800
|
+
$env_classes.values.each do |e|
|
713
801
|
if e.freq_array.sum != 0
|
714
802
|
e.prob_array = 100.0 * e.freq_array / e.freq_array.sum
|
715
803
|
end
|
716
804
|
end
|
717
805
|
|
718
806
|
# count raw frequencies
|
719
|
-
$
|
807
|
+
$tot_cnt_mat = NMatrix.send($noweight ? 'int' : 'float', $amino_acids.size, $amino_acids.size)
|
720
808
|
|
721
809
|
# for each combination of environment features
|
722
|
-
|
723
|
-
|
724
|
-
env_groups.to_a.sort_by { |env_group|
|
725
|
-
# a bit clumsy sorting here...
|
726
|
-
env_group[0].split("").map_with_index { |l, i|
|
727
|
-
$env_features[i + 1].labels.index(l)
|
728
|
-
}
|
729
|
-
}.each_with_index do |group, group_no|
|
730
|
-
grp_freq_mat = ($noweight ? NMatrix.int($amino_acids.size,$amino_acids.size) : NMatrix.float($amino_acids.size,$amino_acids.size))
|
810
|
+
$env_classes.groups_sorted_by_residue_labels.each_with_index do |group, group_no|
|
811
|
+
grp_cnt_mat = NMatrix.send($noweight ? 'int' : 'float', $amino_acids.size, $amino_acids.size)
|
731
812
|
|
732
|
-
$amino_acids.each_with_index do |aa,
|
813
|
+
$amino_acids.each_with_index do |aa, aj|
|
733
814
|
freq_array = group[1].find { |e| e.label.start_with?(aa) }.freq_array
|
734
|
-
0.upto($amino_acids.size - 1) { |
|
815
|
+
0.upto($amino_acids.size - 1) { |i| grp_cnt_mat[aj, i] = freq_array[i] }
|
735
816
|
end
|
736
817
|
|
737
|
-
$
|
818
|
+
$tot_cnt_mat += grp_cnt_mat
|
738
819
|
|
739
820
|
if $output == 0
|
740
821
|
$outfh.puts ">#{group[0]} #{group_no}"
|
741
|
-
$outfh.puts
|
822
|
+
$outfh.puts grp_cnt_mat.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)
|
742
823
|
end
|
743
824
|
end
|
744
825
|
|
745
826
|
if $output == 0
|
746
|
-
$outfh.puts
|
747
|
-
$outfh.puts $
|
827
|
+
$outfh.puts '>Total'
|
828
|
+
$outfh.puts $tot_cnt_mat.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)
|
829
|
+
$logger.info 'Egor END.'
|
748
830
|
exit 0
|
749
831
|
end
|
832
|
+
|
833
|
+
$logger.info "Counting substitutions is done."
|
834
|
+
|
750
835
|
#
|
751
836
|
# Part 5. END
|
752
837
|
#
|
@@ -770,25 +855,29 @@ HEADER
|
|
770
855
|
|
771
856
|
# when nosmoothing !!!
|
772
857
|
if ($output > 0) && $nosmooth
|
773
|
-
#
|
774
|
-
$
|
858
|
+
# reinitialize $tot_cnt_mat for pseudocounts
|
859
|
+
$tot_cnt_mat = NMatrix.float($amino_acids.size, $amino_acids.size)
|
775
860
|
|
776
861
|
# for each combination of environment features
|
777
|
-
|
778
|
-
env_groups.to_a.sort_by { |env_group|
|
779
|
-
# a bit clumsy sorting here...
|
780
|
-
env_group[0].split("").map_with_index { |l, i|
|
781
|
-
$env_features[i + 1].labels.index(l)
|
782
|
-
}
|
783
|
-
}.each_with_index do |group, group_no|
|
784
|
-
grp_prob_mat = NMatrix.float($amino_acids.size,$amino_acids.size)
|
862
|
+
pseudo_cnt = $add || (1.0 / $env_classes.group_size)
|
785
863
|
|
786
|
-
|
787
|
-
|
788
|
-
|
864
|
+
# add pseudo counts for each frequency vector
|
865
|
+
$env_classes.values.each { |e| e.freq_array += pseudo_cnt }
|
866
|
+
|
867
|
+
# re-calculate probability vector for each environment class
|
868
|
+
$env_classes.values.each { |e| e.prob_array = 100.0 * e.freq_array / e.freq_array.sum }
|
869
|
+
|
870
|
+
$env_classes.groups_sorted_by_residue_labels.each_with_index do |group, group_no|
|
871
|
+
grp_cnt_mat = NMatrix.float($amino_acids.size, $amino_acids.size)
|
872
|
+
grp_prob_mat = NMatrix.float($amino_acids.size, $amino_acids.size)
|
873
|
+
|
874
|
+
$amino_acids.each_with_index do |aa, aj|
|
875
|
+
env_class = group[1].find { |e| e.label.start_with?(aa) }
|
876
|
+
0.upto($amino_acids.size - 1) { |i| grp_cnt_mat[aj, i] = env_class.freq_array[i] }
|
877
|
+
0.upto($amino_acids.size - 1) { |i| grp_prob_mat[aj, i] = env_class.prob_array[i] }
|
789
878
|
end
|
790
879
|
|
791
|
-
$
|
880
|
+
$tot_cnt_mat += grp_cnt_mat
|
792
881
|
|
793
882
|
if ($output == 1)
|
794
883
|
$outfh.puts ">#{group[0]} #{group_no}"
|
@@ -796,10 +885,20 @@ HEADER
|
|
796
885
|
end
|
797
886
|
end
|
798
887
|
|
888
|
+
$tot_prob_mat = NMatrix.float($amino_acids.size, $amino_acids.size)
|
889
|
+
|
890
|
+
0.upto($amino_acids.size - 1) do |aj|
|
891
|
+
col_sum = (0..$amino_acids.size - 1).inject(0) { |s, i| s + $tot_cnt_mat[aj, i] }
|
892
|
+
0.upto($amino_acids.size - 1) { |i| $tot_prob_mat[aj, i] = 100.0 * $tot_cnt_mat[aj, i] / col_sum }
|
893
|
+
end
|
894
|
+
|
895
|
+
$logger.info 'Calculating substitution probabilities is done (no smoothing)'
|
896
|
+
|
799
897
|
if ($output == 1)
|
800
|
-
$outfh.puts
|
898
|
+
$outfh.puts '>Total'
|
801
899
|
$outfh.puts $tot_prob_mat.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)
|
802
900
|
$outfh.close
|
901
|
+
$logger.info 'Egor END.'
|
803
902
|
exit 0
|
804
903
|
end
|
805
904
|
end
|
@@ -807,7 +906,7 @@ HEADER
|
|
807
906
|
# when smoothing!!!
|
808
907
|
if ($output > 0) && !$nosmooth
|
809
908
|
#
|
810
|
-
# p1
|
909
|
+
# p1 probabilities
|
811
910
|
#
|
812
911
|
p1 = NArray.float($amino_acids.size)
|
813
912
|
a0 = NArray.float($amino_acids.size).fill(1.0 / $amino_acids.size)
|
@@ -816,55 +915,73 @@ HEADER
|
|
816
915
|
omega1 = 1.0 / (1 + big_N / ($sigma * small_n))
|
817
916
|
omega2 = 1.0 - omega1
|
818
917
|
|
819
|
-
if $smooth == :
|
820
|
-
# for partial smoothing,
|
821
|
-
0.upto($amino_acids.size - 1) { |i| p1[i] = 100.0 * $
|
918
|
+
if ($smooth == :full) || $p1smooth
|
919
|
+
# smooth p1 probabilities when doing full smoothing, or when --p1smooth is on for partial smoothing
|
920
|
+
0.upto($amino_acids.size - 1) { |i| p1[i] = 100.0 * (omega1 * a0[i] + omega2 * $aa_tot_freq[$amino_acids[i]]) }
|
822
921
|
$smooth_prob[1] = p1
|
823
|
-
|
824
|
-
#
|
825
|
-
|
922
|
+
elsif ($smooth == :partial)
|
923
|
+
# no smoothing for p1 probabilities just as Kenji's subst
|
924
|
+
# in this case, p1 probabilities were taken from the amino acid frequencies of your data set
|
925
|
+
0.upto($amino_acids.size - 1) { |i| p1[i] = 100.0 * $aa_tot_freq[$amino_acids[i]] }
|
826
926
|
$smooth_prob[1] = p1
|
827
927
|
end
|
828
928
|
|
829
929
|
#
|
830
930
|
# p2 and above
|
831
931
|
#
|
832
|
-
env_labels = $env_features.map_with_index {|ef, ei| ef.labels.map { |l| "#{ei}#{l}" } }
|
932
|
+
env_labels = $env_features.map_with_index { |ef, ei| ef.labels.map { |l| "#{ei}#{l}" } }
|
833
933
|
|
834
934
|
if $smooth == :partial
|
835
935
|
$outfh.puts <<HEADER
|
836
936
|
#
|
837
937
|
# Partial Smoothing:
|
838
938
|
#
|
939
|
+
HEADER
|
940
|
+
if $p1smooth
|
941
|
+
$outfh.puts <<HEADER
|
839
942
|
# p1(ri) (i.e., amino acid composition) is estimated by summing over
|
840
|
-
# each row in all matrices (
|
841
|
-
#
|
943
|
+
# each row in all matrices and smoothing them with A0 (a uniform distribution)
|
944
|
+
# ^^^^^^^^^
|
945
|
+
HEADER
|
946
|
+
else
|
947
|
+
$outfh.puts <<HEADER
|
948
|
+
# p1(ri) (i.e., amino acid composition) is estimated by summing over
|
949
|
+
# each row in all matrices without smoothing
|
950
|
+
# ^^^^^^^^^^^^^^^^^
|
951
|
+
HEADER
|
952
|
+
end
|
953
|
+
|
954
|
+
$outfh.puts <<HEADER
|
842
955
|
# p2(ri|Rj) is estimated as:
|
843
956
|
# p2(ri|Rj) = omega1 * p1(ri) + omega2 * W2(ri|Rj)
|
844
957
|
#
|
845
958
|
# p3(ri|Rj,fq) is estimated as:
|
846
959
|
# p3(ri|Rj,fq) = omega1 * A2(ri|fq) + omega2 * W3(ri|Rj,fq)
|
847
960
|
# where
|
848
|
-
# A2(ri|fq) = p2(ri|fq) (fixed fq; partial smoothing)
|
961
|
+
# A2(ri|fq) = p2(ri|fq) (fixed fq to be Rj; partial smoothing)
|
849
962
|
#
|
850
963
|
# The smoothing procedure is curtailed here and finally
|
964
|
+
# ^^^^^^^^^
|
851
965
|
# p5(ri|Rj,...) is estimated as:
|
852
966
|
# p5(ri|Rj,...) = omega1 * A3(ri|Rj,fq) + omega2 * W5(ri|Rj...)
|
853
967
|
# where
|
854
968
|
# A3(ri|Rj,fq) = sum over fq omega_c * pc3(Rj,fq)
|
855
969
|
#
|
856
|
-
# Weights (omegas) are calculated as in Topham et al. 1993)
|
970
|
+
# Weights (omegas) are calculated as in Topham et al. (1993)
|
857
971
|
#
|
858
|
-
# sigma value used is:
|
972
|
+
# sigma value used is: #{$sigma}
|
859
973
|
#
|
860
974
|
HEADER
|
861
975
|
1.upto($env_features.size) do |ci|
|
862
976
|
# for partial smoothing, only P1 ~ P3, and Pn are considered
|
863
|
-
|
977
|
+
if (ci > 2) && (ci < $env_features.size)
|
978
|
+
$logger.debug "Skipped the level #{ci + 1} probabilities, due to partial smoothing."
|
979
|
+
next
|
980
|
+
end
|
864
981
|
|
865
982
|
env_labels.combination(ci) do |c1|
|
866
983
|
Enumerable.cart_prod(*c1).each do |labels|
|
867
|
-
pattern =
|
984
|
+
pattern = '.' * $env_features.size
|
868
985
|
|
869
986
|
labels.each do |label|
|
870
987
|
i = label[0].chr.to_i
|
@@ -873,30 +990,31 @@ HEADER
|
|
873
990
|
end
|
874
991
|
|
875
992
|
if pattern =~ /^\./
|
876
|
-
$logger.debug "
|
993
|
+
$logger.debug "Skipped the environment class, #{pattern}, due to partial smoothing."
|
877
994
|
next
|
878
995
|
end
|
879
996
|
|
880
|
-
# get
|
881
|
-
|
997
|
+
# get environments matching the pattern created above
|
998
|
+
# and calculate amino acid frequencies and their probabilities for all the environments
|
999
|
+
envs = $env_classes.values.select { |env| env.label.match(pattern.to_re) }
|
882
1000
|
freq_arr = envs.inject(NArray.float($amino_acids.size)) { |sum, env| sum + env.freq_array }
|
883
1001
|
prob_arr = NArray.float($amino_acids.size)
|
884
|
-
0.upto($amino_acids.size - 1) { |i| prob_arr[i] = (freq_arr[i] == 0 ? 0 : freq_arr[i] / freq_arr.sum.to_f) }
|
1002
|
+
0.upto($amino_acids.size - 1) { |i| prob_arr[i] = ((freq_arr[i] == 0) ? 0 : (freq_arr[i] / freq_arr.sum.to_f)) }
|
885
1003
|
|
886
1004
|
# # assess whether a residue type j is compatible with a particular combination of structural features
|
887
1005
|
# # corrections for non-zero column vector phenomenon by switching the smoothing procedure off as below
|
888
1006
|
# if ci == $env_features.size
|
889
1007
|
# aa_label = labels.find { |l| l.match(/^0/) }[1].chr
|
890
|
-
# sub_pattern =
|
1008
|
+
# sub_pattern = '.' * $env_features.size
|
891
1009
|
# sub_pattern[0] = aa_label
|
892
1010
|
# sub_freq_sum = 0
|
893
1011
|
#
|
894
1012
|
# labels[1..-1].each do |label|
|
895
|
-
# next if label.start_with?(
|
1013
|
+
# next if label.start_with?('0')
|
896
1014
|
# i = label[0].chr.to_i
|
897
1015
|
# l = label[1].chr
|
898
1016
|
# sub_pattern[i] = l
|
899
|
-
# sub_envs = $
|
1017
|
+
# sub_envs = $env_classes.values.select { |env| env.label.match(pattern.to_re) }
|
900
1018
|
# sub_freq_arr = sub_envs.inject(NArray.float($amino_acids.size)) { |sum, env| sum + env.freq_array }
|
901
1019
|
# sub_freq_sum += sub_freq_arr.sum
|
902
1020
|
# end
|
@@ -908,25 +1026,27 @@ HEADER
|
|
908
1026
|
# $smooth_prob[ci + 1] = {}
|
909
1027
|
# $smooth_prob[ci + 1][labels.to_set] = prob_arr
|
910
1028
|
# end
|
911
|
-
# $logger.warn "
|
1029
|
+
# $logger.warn "Smoothing procedure is off for the environment feature combination, #{pattern}"
|
912
1030
|
# next
|
913
1031
|
# end
|
914
1032
|
# end
|
915
1033
|
|
916
|
-
# collect priors
|
917
|
-
priors
|
1034
|
+
# collect priors
|
1035
|
+
priors = []
|
918
1036
|
|
919
|
-
if ci ==
|
920
|
-
|
1037
|
+
if ci == 1
|
1038
|
+
priors << $smooth_prob[1]
|
1039
|
+
elsif ci == 2
|
1040
|
+
labels.combination(1).select { |c2| c2[0].start_with?('0') }.each { |c3|
|
921
1041
|
priors << $smooth_prob[2][c3.to_set]
|
922
1042
|
}
|
923
1043
|
elsif ci == $env_features.size
|
924
|
-
labels.combination(2).select { |c2| c2[0].start_with?(
|
1044
|
+
labels.combination(2).select { |c2| c2[0].start_with?('0') || c2[1].start_with?('0') }.each { |c3|
|
925
1045
|
priors << $smooth_prob[3][c3.to_set]
|
926
1046
|
}
|
927
1047
|
end
|
928
1048
|
|
929
|
-
# entropy based weighting
|
1049
|
+
# entropy based prior weighting step
|
930
1050
|
entropy_max = Math::log($amino_acids.size)
|
931
1051
|
entropies = priors.map { |prior| -1.0 * prior.to_a.inject(0.0) { |s, p|
|
932
1052
|
begin
|
@@ -952,15 +1072,16 @@ HEADER
|
|
952
1072
|
0.upto($amino_acids.size - 1) { |i| smooth_prob_arr[i] = 100.0 * (smooth_prob_arr[i] / smooth_prob_arr_sum) }
|
953
1073
|
|
954
1074
|
# store smoothed probabilities in a hash using a set of environment labels as a key
|
955
|
-
if
|
956
|
-
$smooth_prob[ci + 1] = {}
|
1075
|
+
if $smooth_prob.has_key?(ci + 1)
|
957
1076
|
$smooth_prob[ci + 1][labels.to_set] = smooth_prob_arr
|
958
1077
|
else
|
1078
|
+
$smooth_prob[ci + 1] = {}
|
959
1079
|
$smooth_prob[ci + 1][labels.to_set] = smooth_prob_arr
|
960
1080
|
end
|
961
1081
|
end
|
962
1082
|
end
|
963
1083
|
end
|
1084
|
+
$logger.info 'Calculating substitution probabilities is done (partial smoothing).'
|
964
1085
|
else
|
965
1086
|
$outfh.puts <<HEADER
|
966
1087
|
#
|
@@ -980,22 +1101,23 @@ HEADER
|
|
980
1101
|
# A2(ri|fq) = p2(ri|fq) (not fixed fq; full smoothing)
|
981
1102
|
#
|
982
1103
|
# The smoothing procedure is NOT curtailed here and it goes upto
|
1104
|
+
# ^^^^^^^^^^^^^
|
983
1105
|
#
|
984
1106
|
# pn(ri|f1q,f2q,...,fn-1q) is estimated as:
|
985
|
-
# pn(ri|f1q,f2q,...,fn-1q) = omega1 * An-1(ri|f1q, f2q,...,fn-2q) + omega2 *
|
1107
|
+
# pn(ri|f1q,f2q,...,fn-1q) = omega1 * An-1(ri|f1q, f2q,...,fn-2q) + omega2 * Wn(ri|f1q,f2q,...,fn-1q)
|
986
1108
|
# where
|
987
1109
|
# An-1(ri|f1q,f2q,...,fn-2q) = sum over fq omega_c * pcn-1(f1q,f2q,...,fn-2q)
|
988
1110
|
#
|
989
|
-
# Weights (omegas) are calculated as in Topham et al. 1993)
|
1111
|
+
# Weights (omegas) are calculated as in Topham et al. (1993)
|
990
1112
|
#
|
991
|
-
# sigma value used is:
|
1113
|
+
# sigma value used is: #{$sigma}
|
992
1114
|
#
|
993
1115
|
HEADER
|
994
1116
|
# full smoothing
|
995
1117
|
1.upto($env_features.size) do |ci|
|
996
1118
|
env_labels.combination(ci) do |c1|
|
997
1119
|
Enumerable.cart_prod(*c1).each do |labels|
|
998
|
-
pattern =
|
1120
|
+
pattern = '.' * $env_features.size
|
999
1121
|
labels.each do |label|
|
1000
1122
|
j = label[0].chr.to_i
|
1001
1123
|
l = label[1].chr
|
@@ -1003,7 +1125,7 @@ HEADER
|
|
1003
1125
|
end
|
1004
1126
|
|
1005
1127
|
# get environments, frequencies, and probabilities
|
1006
|
-
envs = $
|
1128
|
+
envs = $env_classes.values.select { |env| env.label.match(pattern.to_re) }
|
1007
1129
|
freq_arr = envs.inject(NArray.float($amino_acids.size)) { |sum, env| sum + env.freq_array }
|
1008
1130
|
prob_arr = NArray.float($amino_acids.size)
|
1009
1131
|
0.upto($amino_acids.size - 1) { |i| prob_arr[i] = freq_arr[i] == 0 ? 0 : freq_arr[i] / freq_arr.sum.to_f }
|
@@ -1036,58 +1158,57 @@ HEADER
|
|
1036
1158
|
0.upto($amino_acids.size - 1) { |i| smooth_prob_arr[i] = 100.0 * (smooth_prob_arr[i] / smooth_prob_arr_sum) }
|
1037
1159
|
|
1038
1160
|
# store smoothed probabilities in a hash using a set of environment labels as a key
|
1039
|
-
if
|
1040
|
-
$smooth_prob[ci + 1] = {}
|
1161
|
+
if $smooth_prob.has_key?(ci + 1)
|
1041
1162
|
$smooth_prob[ci + 1][labels.to_set] = smooth_prob_arr
|
1042
1163
|
else
|
1164
|
+
$smooth_prob[ci + 1] = {}
|
1043
1165
|
$smooth_prob[ci + 1][labels.to_set] = smooth_prob_arr
|
1044
1166
|
end
|
1045
1167
|
end
|
1046
1168
|
end
|
1047
1169
|
end
|
1170
|
+
$logger.info 'Calculating substitution probabilities is done (full smoothing).'
|
1048
1171
|
end
|
1049
1172
|
|
1050
1173
|
# updating smoothed probability array for each environment
|
1051
|
-
$
|
1052
|
-
|
1053
|
-
|
1054
|
-
$tot_prob_mat = NMatrix.float($amino_acids.size,$amino_acids.size)
|
1055
|
-
|
1056
|
-
# grouping environments by its environment labels but amino acid label
|
1057
|
-
env_groups = $envs.values.group_by { |env| env.label[1..-1] }
|
1174
|
+
$env_classes.values.each do |env|
|
1175
|
+
env.smooth_prob_array = $smooth_prob[$env_features.size + 1][env.label_set]
|
1176
|
+
end
|
1058
1177
|
|
1059
1178
|
# sorting environments and build 21X21 substitution matrices
|
1060
|
-
|
1061
|
-
# a bit clumsy sorting here...
|
1062
|
-
env_group[0].split("").map_with_index { |l, i|
|
1063
|
-
$env_features[i + 1].labels.index(l)
|
1064
|
-
}
|
1065
|
-
}.each_with_index do |group, group_no|
|
1179
|
+
$env_classes.groups_sorted_by_residue_labels.each_with_index do |group, group_no|
|
1066
1180
|
# calculating 21X21 substitution probability matrix for each environment
|
1067
|
-
grp_prob_mat = NMatrix.float($amino_acids.size
|
1181
|
+
grp_prob_mat = NMatrix.float($amino_acids.size, $amino_acids.size)
|
1068
1182
|
|
1069
1183
|
$amino_acids.each_with_index do |aa, ai|
|
1070
|
-
|
1071
|
-
0.upto($amino_acids.size - 1) { |j| grp_prob_mat[ai, j] =
|
1184
|
+
smooth_prob_arr = group[1].find { |e| e.label.start_with?(aa) }.smooth_prob_array
|
1185
|
+
0.upto($amino_acids.size - 1) { |j| grp_prob_mat[ai, j] = smooth_prob_arr[j] }
|
1072
1186
|
end
|
1073
1187
|
|
1074
|
-
$tot_prob_mat += grp_prob_mat
|
1075
|
-
|
1076
1188
|
if $output == 1
|
1077
1189
|
$outfh.puts ">#{group[0]} #{group_no}"
|
1078
1190
|
$outfh.puts grp_prob_mat.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)
|
1079
1191
|
end
|
1080
1192
|
end
|
1081
1193
|
|
1082
|
-
|
1194
|
+
# for a total substitution probability matrix
|
1195
|
+
$tot_prob_mat = NMatrix.float($amino_acids.size, $amino_acids.size)
|
1196
|
+
|
1197
|
+
$amino_acids.each_with_index do |aa, aj|
|
1198
|
+
0.upto($amino_acids.size - 1) do |ai|
|
1199
|
+
$tot_prob_mat[aj, ai] = $smooth_prob[2][["0#{aa}"].to_set][ai]
|
1200
|
+
end
|
1201
|
+
end
|
1083
1202
|
|
1084
1203
|
if $output == 1
|
1085
|
-
$outfh.puts
|
1204
|
+
$outfh.puts '>Total'
|
1086
1205
|
$outfh.puts $tot_prob_mat.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)
|
1087
1206
|
$outfh.close
|
1207
|
+
$logger.info 'Egor END.'
|
1088
1208
|
exit 0
|
1089
1209
|
end
|
1090
1210
|
end
|
1211
|
+
|
1091
1212
|
#
|
1092
1213
|
# Part 6. END
|
1093
1214
|
#
|
@@ -1104,79 +1225,88 @@ HEADER
|
|
1104
1225
|
HEADER
|
1105
1226
|
if $penv
|
1106
1227
|
$outfh.puts <<HEADER
|
1107
|
-
# which were derived from the environment-
|
1108
|
-
#
|
1228
|
+
# which were derived from the environment-dependent amino acid frequencies.
|
1229
|
+
# ^^^^^^^^^^^^^^^^^^^^^
|
1109
1230
|
HEADER
|
1110
1231
|
else
|
1111
1232
|
$outfh.puts <<HEADER
|
1112
|
-
# which were derived from the environment-
|
1113
|
-
#
|
1233
|
+
# which were derived from the environment-independent amino acid frequencies.
|
1234
|
+
# ^^^^^^^^^^^^^^^^^^^^^^^
|
1114
1235
|
HEADER
|
1115
1236
|
end
|
1116
1237
|
|
1117
|
-
$tot_logo_mat = $cys == 0 ? NMatrix.float($amino_acids.size, $amino_acids.size + 1) : NMatrix.float($amino_acids.size, $amino_acids.size)
|
1118
1238
|
grp_logo_mats = []
|
1119
1239
|
factor = $scale / Math::log(2)
|
1120
1240
|
|
1121
|
-
|
1122
|
-
env_groups = $envs.values.group_by { |env| env.label[1..-1] }
|
1123
|
-
|
1124
|
-
# sorting environments and build 21X21 substitution matrices
|
1125
|
-
env_groups.to_a.sort_by { |env_group|
|
1126
|
-
# a bit clumsy sorting here...
|
1127
|
-
env_group[0].split("").map_with_index { |l, i|
|
1128
|
-
$env_features[i + 1].labels.index(l)
|
1129
|
-
}
|
1130
|
-
}.each_with_index do |group, group_no|
|
1241
|
+
$env_classes.groups_sorted_by_residue_labels.each_with_index do |group, group_no|
|
1131
1242
|
# calculating substitution probability matrix for each environment
|
1132
1243
|
grp_label = group[0]
|
1133
1244
|
grp_envs = group[1]
|
1134
1245
|
grp_logo_mat = $cys == 0 ? NMatrix.float($amino_acids.size, $amino_acids.size + 1) : NMatrix.float($amino_acids.size, $amino_acids.size)
|
1135
1246
|
|
1136
|
-
$amino_acids.each_with_index do |aa,
|
1137
|
-
env
|
1138
|
-
|
1139
|
-
|
1140
|
-
|
1141
|
-
|
1142
|
-
|
1143
|
-
|
1247
|
+
$amino_acids.each_with_index do |aa, aj|
|
1248
|
+
env = grp_envs.detect { |e| e.label.start_with?(aa) }
|
1249
|
+
#paj = env.send($nosmooth ? 'prob_array' : 'smooth_prob_array').sum / $tot_cnt_mat.sum
|
1250
|
+
env.logo_array = $cys == 0 ? NArray.float($amino_acids.size + 1) : NArray.float($amino_acids.size)
|
1251
|
+
|
1252
|
+
env.send($nosmooth ? 'prob_array' : 'smooth_prob_array').to_a.each_with_index do |prob, ai|
|
1253
|
+
pai = 100.0 * $aa_tot_freq[$amino_acids[ai]]
|
1254
|
+
#odds = prob == 0.0 ? 0.000001 / pai : prob / pai
|
1255
|
+
odds = prob / pai
|
1256
|
+
env.logo_array[ai] = factor * Math::log(odds)
|
1257
|
+
grp_logo_mat[aj, ai] = env.logo_array[ai]
|
1144
1258
|
end
|
1145
1259
|
|
1146
|
-
|
1147
|
-
|
1148
|
-
# adding log odds ratio for "U" (J or C) when --cyc is 0
|
1260
|
+
# adding log odds ratio for 'U' (J or C) when --cys is 0
|
1149
1261
|
if $cys == 0
|
1150
|
-
|
1151
|
-
prob
|
1152
|
-
|
1153
|
-
odds
|
1154
|
-
|
1155
|
-
|
1262
|
+
pai = 100.0 * ($aa_tot_freq['C'] + $aa_tot_freq['J'])
|
1263
|
+
prob = env.send($nosmooth ? 'prob_array' : 'smooth_prob_array')[$amino_acids.index('C')] +
|
1264
|
+
env.send($nosmooth ? 'prob_array' : 'smooth_prob_array')[$amino_acids.index('J')]
|
1265
|
+
#odds = prob == 0.0 ? 0.000001 / pai : prob / pai
|
1266
|
+
odds = prob / pai
|
1267
|
+
env.logo_array[$amino_acids.size] = factor * Math::log(odds)
|
1268
|
+
grp_logo_mat[aj, $amino_acids.size] = env.logo_array[$amino_acids.size]
|
1156
1269
|
end
|
1157
1270
|
end
|
1158
1271
|
|
1159
|
-
$tot_logo_mat += grp_logo_mat
|
1160
1272
|
grp_logo_mats << [grp_label, grp_logo_mat]
|
1161
1273
|
end
|
1162
1274
|
|
1163
|
-
$tot_logo_mat
|
1275
|
+
$tot_logo_mat = $cys == 0 ? NMatrix.float($amino_acids.size, $amino_acids.size + 1) : NMatrix.float($amino_acids.size, $amino_acids.size)
|
1276
|
+
|
1277
|
+
$amino_acids.each_with_index do |aa1, aj|
|
1278
|
+
$amino_acids.each_with_index do |aa2, ai|
|
1279
|
+
prob = $tot_prob_mat[aj, ai]
|
1280
|
+
pai = 100.0 * $aa_tot_freq[$amino_acids[ai]]
|
1281
|
+
#odds = prob == 0.0 ? 0.000001 / pai : prob / pai
|
1282
|
+
odds = prob / pai
|
1283
|
+
$tot_logo_mat[aj, ai] = factor * Math::log(odds)
|
1284
|
+
end
|
1285
|
+
|
1286
|
+
# adding log odds ratio for 'U' (J or C) when --cys is 0
|
1287
|
+
if $cys == 0
|
1288
|
+
pai = 100.0 * ($aa_tot_freq['C'] + $aa_tot_freq['J'])
|
1289
|
+
prob = $tot_prob_mat[aj, $amino_acids.index('C')] + $tot_prob_mat[aj, $amino_acids.index('J')]
|
1290
|
+
#odds = prob == 0.0 ? 0.000001 / pai : prob / pai
|
1291
|
+
odds = prob / pai
|
1292
|
+
$tot_logo_mat[aj, $amino_acids.size] = factor * Math::log(odds)
|
1293
|
+
end
|
1294
|
+
end
|
1295
|
+
|
1164
1296
|
|
1165
1297
|
# calculating relative entropy for each amino acid pair H and
|
1166
1298
|
# the expected score E in bit units
|
1167
|
-
#
|
1168
|
-
# I'm a bit suspicious about this part...
|
1169
1299
|
tot_E = 0.0
|
1170
1300
|
tot_H = 0.0
|
1171
1301
|
|
1172
|
-
0.upto($tot_logo_mat.shape[0] - 1) do |
|
1173
|
-
0.upto($tot_logo_mat.shape[0] - 1) do |
|
1174
|
-
if
|
1175
|
-
tot_E += $tot_logo_mat[
|
1176
|
-
tot_H += $tot_logo_mat[
|
1302
|
+
0.upto($tot_logo_mat.shape[0] - 1) do |j|
|
1303
|
+
0.upto($tot_logo_mat.shape[0] - 1) do |i| # it's deliberately '0' not '1'
|
1304
|
+
if j != i
|
1305
|
+
tot_E += $tot_logo_mat[j, i] * $aa_tot_freq[$amino_acids[j]] * $aa_tot_freq[$amino_acids[i]] / 2.0
|
1306
|
+
tot_H += $tot_logo_mat[j, i] * $tot_prob_mat[j, i] / 2.0 / 10000.0
|
1177
1307
|
else
|
1178
|
-
tot_E += $tot_logo_mat[
|
1179
|
-
tot_H += $tot_logo_mat[
|
1308
|
+
tot_E += $tot_logo_mat[j, i] * $aa_tot_freq[$amino_acids[i]] * $aa_tot_freq[$amino_acids[i]]
|
1309
|
+
tot_H += $tot_logo_mat[j, i] * $tot_prob_mat[j, i] / 10000.0
|
1180
1310
|
end
|
1181
1311
|
end
|
1182
1312
|
end
|
@@ -1184,8 +1314,14 @@ HEADER
|
|
1184
1314
|
$outfh.puts <<HEADER
|
1185
1315
|
#
|
1186
1316
|
# Shown here are logarithms of these values multiplied by #{$scale}/log(2)
|
1187
|
-
|
1188
|
-
|
1317
|
+
HEADER
|
1318
|
+
unless $noround
|
1319
|
+
$outfh.puts <<HEADER
|
1320
|
+
# rounded to the nearest integer (log-odds scores in 1/#{$scale} bit units).
|
1321
|
+
HEADER
|
1322
|
+
end
|
1323
|
+
|
1324
|
+
$outfh.puts <<HEADER
|
1189
1325
|
# For total (composite) matrix, Entropy = #{"%5.4f" % tot_H} bits, Expected score = #{"%5.4f" % tot_E}
|
1190
1326
|
#
|
1191
1327
|
HEADER
|
@@ -1194,24 +1330,40 @@ HEADER
|
|
1194
1330
|
grp_label = arr[0]
|
1195
1331
|
grp_logo_mat = arr[1]
|
1196
1332
|
|
1333
|
+
unless $noround
|
1334
|
+
grp_logo_mat = grp_logo_mat.round
|
1335
|
+
end
|
1336
|
+
|
1197
1337
|
$outfh.puts ">#{grp_label} #{grp_no}"
|
1198
1338
|
if $cys
|
1199
|
-
$outfh.puts grp_logo_mat.
|
1339
|
+
$outfh.puts grp_logo_mat.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids + %w[U])
|
1200
1340
|
else
|
1201
|
-
$outfh.puts grp_logo_mat.
|
1341
|
+
$outfh.puts grp_logo_mat.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)
|
1202
1342
|
end
|
1203
1343
|
end
|
1204
1344
|
|
1205
1345
|
$outfh.puts ">Total #{grp_logo_mats.size}"
|
1206
1346
|
|
1347
|
+
unless $noround
|
1348
|
+
$tot_logo_mat = $tot_logo_mat.round
|
1349
|
+
end
|
1350
|
+
|
1207
1351
|
if $cys == 0
|
1208
|
-
$outfh.puts $tot_logo_mat.
|
1352
|
+
$outfh.puts $tot_logo_mat.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids + %w[U])
|
1209
1353
|
else
|
1210
|
-
$outfh.puts $tot_logo_mat.
|
1354
|
+
$outfh.puts $tot_logo_mat.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)
|
1211
1355
|
end
|
1212
|
-
|
1213
|
-
|
1356
|
+
|
1357
|
+
$logger.info "Calculating log odds ratio is done."
|
1358
|
+
|
1359
|
+
#
|
1360
|
+
# Part 7. END
|
1361
|
+
#
|
1214
1362
|
end
|
1363
|
+
|
1364
|
+
$outfh.close
|
1365
|
+
$logger.info "Egor END."
|
1366
|
+
exit 0
|
1215
1367
|
end
|
1216
1368
|
end
|
1217
1369
|
|