treesak 1.53.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- TreeSAK/ALE.py +63 -0
- TreeSAK/ALE1.py +268 -0
- TreeSAK/ALE2.py +168 -0
- TreeSAK/ALE2RTC.py +30 -0
- TreeSAK/ALE3.py +205 -0
- TreeSAK/ALE4.py +636 -0
- TreeSAK/ALE5.py +210 -0
- TreeSAK/ALE6.py +401 -0
- TreeSAK/ALE7.py +126 -0
- TreeSAK/ALE_backup.py +1081 -0
- TreeSAK/AssessCVG.py +128 -0
- TreeSAK/AssessMarker.py +306 -0
- TreeSAK/AssessMarkerDeltaLL.py +257 -0
- TreeSAK/AssessMarkerPA.py +317 -0
- TreeSAK/AssessPB.py +113 -0
- TreeSAK/BMGE.jar +0 -0
- TreeSAK/BMGE.py +49 -0
- TreeSAK/C60SR4.nex +127 -0
- TreeSAK/CompareMCMC.py +138 -0
- TreeSAK/ConcateMSA.py +111 -0
- TreeSAK/ConvertMSA.py +135 -0
- TreeSAK/Dir.rb +82 -0
- TreeSAK/ExtractMarkerSeq.py +263 -0
- TreeSAK/FastRoot.py +1175 -0
- TreeSAK/FastRoot_backup.py +1122 -0
- TreeSAK/FigTree.py +34 -0
- TreeSAK/GTDB_tree.py +76 -0
- TreeSAK/GeneTree.py +142 -0
- TreeSAK/KEGG_Luo17.py +807 -0
- TreeSAK/LcaToLeaves.py +66 -0
- TreeSAK/MarkerRef2Tree.py +616 -0
- TreeSAK/MarkerRef2Tree_backup.py +628 -0
- TreeSAK/MarkerSeq2Tree.py +299 -0
- TreeSAK/MarkerSeq2Tree_backup.py +259 -0
- TreeSAK/ModifyTopo.py +116 -0
- TreeSAK/Newick_tree_plotter.py +79 -0
- TreeSAK/OMA.py +170 -0
- TreeSAK/OMA2.py +212 -0
- TreeSAK/OneLineAln.py +50 -0
- TreeSAK/PB.py +155 -0
- TreeSAK/PMSF.py +115 -0
- TreeSAK/PhyloBiAssoc.R +84 -0
- TreeSAK/PhyloBiAssoc.py +167 -0
- TreeSAK/PlotMCMC.py +41 -0
- TreeSAK/PlotMcmcNode.py +152 -0
- TreeSAK/PlotMcmcNode_old.py +252 -0
- TreeSAK/RootTree.py +101 -0
- TreeSAK/RootTreeGTDB.py +371 -0
- TreeSAK/RootTreeGTDB214.py +288 -0
- TreeSAK/RootTreeGTDB220.py +300 -0
- TreeSAK/SequentialDating.py +16 -0
- TreeSAK/SingleAleHGT.py +157 -0
- TreeSAK/SingleLinePhy.py +50 -0
- TreeSAK/SliceMSA.py +142 -0
- TreeSAK/SplitScore.py +21 -0
- TreeSAK/SplitScore1.py +177 -0
- TreeSAK/SplitScore1OMA.py +148 -0
- TreeSAK/SplitScore2.py +608 -0
- TreeSAK/TaxaCountStats.R +256 -0
- TreeSAK/TaxonTree.py +47 -0
- TreeSAK/TreeSAK_config.py +32 -0
- TreeSAK/VERSION +164 -0
- TreeSAK/VisHPD95.R +45 -0
- TreeSAK/VisHPD95.py +200 -0
- TreeSAK/__init__.py +0 -0
- TreeSAK/ale_parser.py +74 -0
- TreeSAK/ale_splitter.py +63 -0
- TreeSAK/alignment_pruner.pl +1471 -0
- TreeSAK/assessOG.py +45 -0
- TreeSAK/batch_itol.py +171 -0
- TreeSAK/catfasta2phy.py +140 -0
- TreeSAK/cogTree.py +185 -0
- TreeSAK/compare_trees.R +30 -0
- TreeSAK/compare_trees.py +255 -0
- TreeSAK/dating.py +264 -0
- TreeSAK/dating_ss.py +361 -0
- TreeSAK/deltall.py +82 -0
- TreeSAK/do_rrtc.rb +464 -0
- TreeSAK/fa2phy.py +42 -0
- TreeSAK/filter_rename_ar53.py +118 -0
- TreeSAK/format_leaf_name.py +70 -0
- TreeSAK/gap_stats.py +38 -0
- TreeSAK/get_SCG_tree.py +742 -0
- TreeSAK/get_arCOG_seq.py +97 -0
- TreeSAK/global_functions.py +222 -0
- TreeSAK/gnm_leaves.py +43 -0
- TreeSAK/iTOL.py +791 -0
- TreeSAK/iTOL_gene_tree.py +80 -0
- TreeSAK/itol_msa_stats.py +56 -0
- TreeSAK/keep_highest_rrtc.py +37 -0
- TreeSAK/koTree.py +194 -0
- TreeSAK/label_gene_tree_by_gnm.py +34 -0
- TreeSAK/label_tree.R +75 -0
- TreeSAK/label_tree.py +121 -0
- TreeSAK/mad.py +708 -0
- TreeSAK/mcmc2tree.py +58 -0
- TreeSAK/mcmcTC copy.py +92 -0
- TreeSAK/mcmcTC.py +104 -0
- TreeSAK/mcmctree_vs_reltime.R +44 -0
- TreeSAK/mcmctree_vs_reltime.py +252 -0
- TreeSAK/merge_pdf.py +32 -0
- TreeSAK/pRTC.py +56 -0
- TreeSAK/parse_mcmctree.py +198 -0
- TreeSAK/parse_reltime.py +141 -0
- TreeSAK/phy2fa.py +37 -0
- TreeSAK/plot_distruibution_th.py +165 -0
- TreeSAK/prep_mcmctree_ctl.py +92 -0
- TreeSAK/print_leaves.py +32 -0
- TreeSAK/pruneMSA.py +63 -0
- TreeSAK/recode.py +73 -0
- TreeSAK/remove_bias.R +112 -0
- TreeSAK/rename_leaves.py +78 -0
- TreeSAK/replace_clade.py +55 -0
- TreeSAK/root_with_out_group.py +84 -0
- TreeSAK/run_TaxaCountStats_R_s1.py +455 -0
- TreeSAK/subsample_drep_gnms.py +74 -0
- TreeSAK/subset.py +69 -0
- TreeSAK/subset_tree_stupid_old_way.py +193 -0
- TreeSAK/supertree.py +330 -0
- TreeSAK/tmp_1.py +19 -0
- TreeSAK/tmp_2.py +19 -0
- TreeSAK/tmp_3.py +120 -0
- TreeSAK/tmp_4.py +43 -0
- TreeSAK/tmp_5.py +12 -0
- TreeSAK/weighted_rand.rb +23 -0
- treesak-1.53.3.data/scripts/TreeSAK +955 -0
- treesak-1.53.3.dist-info/LICENSE +674 -0
- treesak-1.53.3.dist-info/METADATA +27 -0
- treesak-1.53.3.dist-info/RECORD +131 -0
- treesak-1.53.3.dist-info/WHEEL +5 -0
- treesak-1.53.3.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,1471 @@
|
|
|
1
|
+
package alignment_pruner;
|
|
2
|
+
|
|
3
|
+
=head1 NAME
|
|
4
|
+
|
|
5
|
+
alingment_pruner.pl
|
|
6
|
+
|
|
7
|
+
=head1 SYNOPSIS
|
|
8
|
+
|
|
9
|
+
alignment_pruner.pl --file alignment.fasta --gap_threshold 10 > pruned.fasta
|
|
10
|
+
|
|
11
|
+
=head1 DESCRIPTION
|
|
12
|
+
|
|
13
|
+
alignment_pruner.pl removes unconserved or gappy columns from an alignment
|
|
14
|
+
according to criteria specified by the user.
|
|
15
|
+
|
|
16
|
+
The chi2 and overview functions currently only work for amino-acids.
|
|
17
|
+
|
|
18
|
+
=head1 OPTIONS
|
|
19
|
+
|
|
20
|
+
=cut
|
|
21
|
+
|
|
22
|
+
use Moose;
|
|
23
|
+
use MooseX::Types::Path::Class;
|
|
24
|
+
with 'MooseX::Getopt';
|
|
25
|
+
|
|
26
|
+
use feature ':5.10';
|
|
27
|
+
|
|
28
|
+
use List::MoreUtils qw( uniq minmax );
|
|
29
|
+
use List::Util qw( max sum );
|
|
30
|
+
use Bio::AlignIO;
|
|
31
|
+
use Bio::Matrix::IO;
|
|
32
|
+
use Bio::Tools::CodonTable;
|
|
33
|
+
use GD;
|
|
34
|
+
use Statistics::Descriptive;
|
|
35
|
+
no warnings 'experimental';
|
|
36
|
+
#use Benchmark qw/ :all :hireswallclock /;
|
|
37
|
+
#use Data::Dump qw( dd );
|
|
38
|
+
|
|
39
|
+
=head2 --file <file>
|
|
40
|
+
|
|
41
|
+
The file that contains the alignment.
|
|
42
|
+
|
|
43
|
+
=cut
|
|
44
|
+
|
|
45
|
+
has 'file' => (
|
|
46
|
+
is => 'ro',
|
|
47
|
+
isa => 'Path::Class::File',
|
|
48
|
+
required => 1,
|
|
49
|
+
coerce => 1,
|
|
50
|
+
documentation => 'File containing alignment',
|
|
51
|
+
);
|
|
52
|
+
|
|
53
|
+
=head2 --format <format>
|
|
54
|
+
|
|
55
|
+
The alignment format, default is fasta. Can use any format supported by
|
|
56
|
+
bioperl.
|
|
57
|
+
|
|
58
|
+
=cut
|
|
59
|
+
|
|
60
|
+
has 'format' => (
|
|
61
|
+
is => 'ro',
|
|
62
|
+
isa => 'Str',
|
|
63
|
+
required => '',
|
|
64
|
+
default => 'fasta',
|
|
65
|
+
documentation => 'Alignment format, default is fasta',
|
|
66
|
+
);
|
|
67
|
+
|
|
68
|
+
=head2 --subset <subset>
|
|
69
|
+
|
|
70
|
+
Specify a subset of the sequences that all calculations should be based on.
|
|
71
|
+
This is usefull for only taking the ingroup into account for example. The
|
|
72
|
+
argument has to be a regular expression that matches the names of the sequences
|
|
73
|
+
to include.
|
|
74
|
+
|
|
75
|
+
--subset '^INGROUP'
|
|
76
|
+
|
|
77
|
+
--subset '^(Snuffe|Snuffa|Peter)$'
|
|
78
|
+
|
|
79
|
+
=cut
|
|
80
|
+
|
|
81
|
+
has 'subset' => (
|
|
82
|
+
is => 'ro',
|
|
83
|
+
isa => 'Str',
|
|
84
|
+
required => '',
|
|
85
|
+
default => '',
|
|
86
|
+
documentation => 'The subset of sequences to be used for the analysis, argument is a regexp that should match the id of the sequences',
|
|
87
|
+
);
|
|
88
|
+
|
|
89
|
+
=head2 --chi2_test
|
|
90
|
+
|
|
91
|
+
This will run a chi2 test on you alignment and show the result as a table.
|
|
92
|
+
The table contains 4 columns:
|
|
93
|
+
|
|
94
|
+
1. The number of the sequence in the alignment
|
|
95
|
+
2. The name of the sequence
|
|
96
|
+
3. The chi2 statistic
|
|
97
|
+
4. A star if it is significantly deviating
|
|
98
|
+
|
|
99
|
+
Note that if you have specified a subset, the expected distribution of
|
|
100
|
+
aminoacids is based on this set, but the script will calculate the chi2
|
|
101
|
+
test on all sequences.
|
|
102
|
+
|
|
103
|
+
=cut
|
|
104
|
+
|
|
105
|
+
has 'chi2_test' => (
|
|
106
|
+
is => 'ro',
|
|
107
|
+
isa => 'Bool',
|
|
108
|
+
required => '',
|
|
109
|
+
default => '',
|
|
110
|
+
documentation => 'Show chi2 statistics for the taxa of the alignment',
|
|
111
|
+
);
|
|
112
|
+
|
|
113
|
+
=head2 --bowker_symmetry_test
|
|
114
|
+
|
|
115
|
+
This will run bowkers symmetry test on you alignment and show the result as a
|
|
116
|
+
table.
|
|
117
|
+
|
|
118
|
+
=cut
|
|
119
|
+
|
|
120
|
+
has 'bowker_symmetry_test' => (
|
|
121
|
+
is => 'ro',
|
|
122
|
+
isa => 'Bool',
|
|
123
|
+
required => '',
|
|
124
|
+
default => '',
|
|
125
|
+
documentation => 'Run the Bowker symmetry test',
|
|
126
|
+
);
|
|
127
|
+
|
|
128
|
+
=head2 --aminogc
|
|
129
|
+
|
|
130
|
+
Calculate the aminoGC content of the different taxa.
|
|
131
|
+
|
|
132
|
+
=cut
|
|
133
|
+
|
|
134
|
+
has 'aminogc' => (
|
|
135
|
+
is => 'ro',
|
|
136
|
+
isa => 'Bool',
|
|
137
|
+
required => '',
|
|
138
|
+
default => '',
|
|
139
|
+
documentation => 'Calculate aminogc',
|
|
140
|
+
);
|
|
141
|
+
|
|
142
|
+
=head2 --chi2_remove_taxa
|
|
143
|
+
|
|
144
|
+
This will run a chi2 test on you alignment and remove all taxa that are
|
|
145
|
+
significantly deviating.
|
|
146
|
+
|
|
147
|
+
=cut
|
|
148
|
+
|
|
149
|
+
has 'chi2_remove_taxa' => (
|
|
150
|
+
is => 'ro',
|
|
151
|
+
isa => 'Bool',
|
|
152
|
+
required => '',
|
|
153
|
+
default => '',
|
|
154
|
+
documentation => 'Remove taxa that fail the chi2 test',
|
|
155
|
+
);
|
|
156
|
+
|
|
157
|
+
=head2 --remove_columns <columns>
|
|
158
|
+
|
|
159
|
+
Specify a list of columns to remove from the alignment. The first column is
|
|
160
|
+
column 0. It should be a comma separated string of ranges. Example:
|
|
161
|
+
|
|
162
|
+
0-5,22-30
|
|
163
|
+
|
|
164
|
+
=cut
|
|
165
|
+
|
|
166
|
+
has 'remove_columns' => (
|
|
167
|
+
is => 'ro',
|
|
168
|
+
isa => 'Str',
|
|
169
|
+
required => '',
|
|
170
|
+
default => '',
|
|
171
|
+
documentation => 'Remove columns, specified by a commaseparated string (4-10,88,100-140)',
|
|
172
|
+
);
|
|
173
|
+
|
|
174
|
+
=head2 --gap_threshold <threshold>
|
|
175
|
+
|
|
176
|
+
The threshold used for removing gapped positions, either the maximal number of
|
|
177
|
+
sequences that are allowed to have gaps or the fraction of sequences allowed to
|
|
178
|
+
have gaps, either as percent or as a number between 0 and 1. Examples:
|
|
179
|
+
|
|
180
|
+
--gap_threshold 10 # remove columns with gaps in more than 10 sequences.
|
|
181
|
+
|
|
182
|
+
--gap_threshold 10% # remove columns with gaps in more than 10% of the sequences.
|
|
183
|
+
|
|
184
|
+
--gap_threshold .1 # remove columns with gaps in more than 10% of the sequences.
|
|
185
|
+
|
|
186
|
+
=cut
|
|
187
|
+
|
|
188
|
+
has 'gap_threshold' => (
|
|
189
|
+
is => 'ro',
|
|
190
|
+
isa => 'Str',
|
|
191
|
+
required => '',
|
|
192
|
+
default => '',
|
|
193
|
+
documentation => 'The threshold used for removing gapped positions, either a number specifying the minimal number of sequences or %'
|
|
194
|
+
);
|
|
195
|
+
|
|
196
|
+
=head2 --conserved_threshold <threshold>
|
|
197
|
+
|
|
198
|
+
The threshold used for removing unconserved positions. Conservation is
|
|
199
|
+
calculated as the number of times the most frequent aminoacid appears in an
|
|
200
|
+
alignment column. Specify in the same way as for the --gap_threshold. This
|
|
201
|
+
option is probably not that useful. Example:
|
|
202
|
+
|
|
203
|
+
--conserved_threshold 10
|
|
204
|
+
|
|
205
|
+
--conserved_threshold 10%
|
|
206
|
+
|
|
207
|
+
--conserved_threshold .1
|
|
208
|
+
|
|
209
|
+
=cut
|
|
210
|
+
|
|
211
|
+
has 'conserved_threshold' => (
|
|
212
|
+
is => 'ro',
|
|
213
|
+
isa => 'Str',
|
|
214
|
+
required => '',
|
|
215
|
+
default => '',
|
|
216
|
+
documentation => 'The threshold used for removing unconserved positions, same as gap_threshold',
|
|
217
|
+
);
|
|
218
|
+
|
|
219
|
+
=head2 --chi2_prune <half|n#|f#|min|plot>
|
|
220
|
+
|
|
221
|
+
Use the chi2 statistic to choose columns to prune. This will first order
|
|
222
|
+
all of the columns by the chi2 statistic by comparing the chi2
|
|
223
|
+
statistic for the alignment with and without each column. Then the option
|
|
224
|
+
specifies how to remove columns:
|
|
225
|
+
|
|
226
|
+
half Remove half of the sites (starting with the most biased).
|
|
227
|
+
f# Remove sites until # fraction of sites remains, half can be
|
|
228
|
+
specified as f0.5
|
|
229
|
+
n# Remove sites until only # number of sequences show significant bias.
|
|
230
|
+
min Remove sites until a minimum of sequences show significant bias.
|
|
231
|
+
plot Will print statistics to the screen suitable for plotting. It will
|
|
232
|
+
contain 4 columns: idx, number of biased sequences, chi2 delta for
|
|
233
|
+
this column and the names of the biased sequences.
|
|
234
|
+
|
|
235
|
+
=cut
|
|
236
|
+
|
|
237
|
+
|
|
238
|
+
has 'chi2_prune' => (
|
|
239
|
+
is => 'ro',
|
|
240
|
+
isa => 'Str',
|
|
241
|
+
required => '',
|
|
242
|
+
default => '',
|
|
243
|
+
documentation => 'Use the chi2 statistic to prune sites of the alignment, values are Half, n<number>, f<number>, min or Plot.',
|
|
244
|
+
);
|
|
245
|
+
|
|
246
|
+
=head2 --generate_overview
|
|
247
|
+
|
|
248
|
+
With this option you will get an overview image showing your alignment and the
|
|
249
|
+
effect of your pruning settings. The filename will be "infile.png". The colors
|
|
250
|
+
are based on BLOSUM62 score spanning from green for really good to red for
|
|
251
|
+
really bad, yellow means 0. Removed columns will be overlayed with gray and if
|
|
252
|
+
you have specified a subset that will be overlayed with white. Try it to see.
|
|
253
|
+
|
|
254
|
+
=cut
|
|
255
|
+
|
|
256
|
+
has 'generate_overview' => (
|
|
257
|
+
is => 'ro',
|
|
258
|
+
isa => 'Bool',
|
|
259
|
+
required => '',
|
|
260
|
+
default => '',
|
|
261
|
+
documentation => 'Generate an overview image of the alignment',
|
|
262
|
+
);
|
|
263
|
+
|
|
264
|
+
=head2 --inverse
|
|
265
|
+
|
|
266
|
+
Inverse the pruning, that is remove the columns that otherwise would be kept.
|
|
267
|
+
|
|
268
|
+
=cut
|
|
269
|
+
|
|
270
|
+
has 'inverse' => (
|
|
271
|
+
is => 'ro',
|
|
272
|
+
isa => 'Bool',
|
|
273
|
+
required => '',
|
|
274
|
+
default => '',
|
|
275
|
+
documentation => 'Inverse the pruning',
|
|
276
|
+
);
|
|
277
|
+
|
|
278
|
+
=head2 --gap_treatment <ignore|mean|additional>
|
|
279
|
+
|
|
280
|
+
How to treat gaps in the calculations:
|
|
281
|
+
|
|
282
|
+
ignore Just ignore them, default
|
|
283
|
+
mean Add 1/20 on all the other AAs for each gap in a sequence.
|
|
284
|
+
additional As an additional state (not implemented)
|
|
285
|
+
|
|
286
|
+
=cut
|
|
287
|
+
|
|
288
|
+
has 'gap_treatment' => (
|
|
289
|
+
is => 'ro',
|
|
290
|
+
isa => 'Str',
|
|
291
|
+
required => '',
|
|
292
|
+
default => 'ignore',
|
|
293
|
+
documentation => 'How to treat gaps when calculating conservation and chi2 statistics. Ignore (default), Mean: 1/20 of all other AAs or Additional: additional state',
|
|
294
|
+
);
|
|
295
|
+
|
|
296
|
+
=head2 --aa_significance_level <number>
|
|
297
|
+
|
|
298
|
+
The significance level used for the chi2 test. The only cases you'd want to
|
|
299
|
+
change this is probably when running the chi2 pruning. The default is
|
|
300
|
+
30.143527. Can be generated in R with qchisq(0.05, 19, lower.tail=FALSE).
|
|
301
|
+
|
|
302
|
+
=cut
|
|
303
|
+
|
|
304
|
+
has aa_significance_level => (
|
|
305
|
+
is => 'ro',
|
|
306
|
+
isa => 'Num',
|
|
307
|
+
default => 30.143527,
|
|
308
|
+
documentation => 'The chi2 significance level for amino-acids, default 30.143527',
|
|
309
|
+
);
|
|
310
|
+
|
|
311
|
+
=head2 --dna_significance_level <number>
|
|
312
|
+
|
|
313
|
+
Same as above but for dna instead. The default is 7.814728. Can be generated in
|
|
314
|
+
R with qchisq(0.05, 3, lower.tail=FALSE).
|
|
315
|
+
|
|
316
|
+
=cut
|
|
317
|
+
|
|
318
|
+
has dna_significance_level => (
|
|
319
|
+
is => 'ro',
|
|
320
|
+
isa => 'Num',
|
|
321
|
+
default => 7.814728,
|
|
322
|
+
documentation => 'The chi2 significance level for nucleotides, default 7.814728',
|
|
323
|
+
);
|
|
324
|
+
|
|
325
|
+
=head2 --recode <recoding scheme>
|
|
326
|
+
|
|
327
|
+
Recoding scheme for recoding if the alignment, currently only dayhoff4 recoding
|
|
328
|
+
is supported. Schemes:
|
|
329
|
+
|
|
330
|
+
dayhoff4: A=AGPST T=DENQ C=HKR G=FYWILMV -=C
|
|
331
|
+
dayhoff6: 1=AGPST 2=DENQ 3=HKR 4=FYW 5=ILMV 6=C
|
|
332
|
+
hp: 1=ACFGILMVW 2=DEHKNPQRSTY
|
|
333
|
+
|
|
334
|
+
=cut
|
|
335
|
+
|
|
336
|
+
has recode => (
|
|
337
|
+
is => 'ro',
|
|
338
|
+
isa => 'Str',
|
|
339
|
+
default => '',
|
|
340
|
+
documentation => 'Recoding scheme, dayhoff4',
|
|
341
|
+
);
|
|
342
|
+
|
|
343
|
+
|
|
344
|
+
#### END OF OPTIONS ####
|
|
345
|
+
|
|
346
|
+
has _matrix => (
|
|
347
|
+
is => 'ro',
|
|
348
|
+
isa => 'HashRef',
|
|
349
|
+
lazy_build => 1,
|
|
350
|
+
traits => ['Hash'],
|
|
351
|
+
handles => {
|
|
352
|
+
_amino_acids => 'keys',
|
|
353
|
+
}
|
|
354
|
+
);
|
|
355
|
+
|
|
356
|
+
sub _build__matrix {
|
|
357
|
+
my $self = shift;
|
|
358
|
+
|
|
359
|
+
# The matrix is stored in the DATA-block at the end of this file
|
|
360
|
+
my $matrix = Bio::Matrix::IO->new(
|
|
361
|
+
-fh => \*DATA,
|
|
362
|
+
-format => 'scoring'
|
|
363
|
+
)->next_matrix;
|
|
364
|
+
|
|
365
|
+
# Using the matrix directly takes too much time, so we make cache it.
|
|
366
|
+
my %qmatrix;
|
|
367
|
+
my @AAS = grep /[A-Z]/, $matrix->row_names;
|
|
368
|
+
for my $ni ( @AAS ) {
|
|
369
|
+
for my $nj ( @AAS ) {
|
|
370
|
+
$qmatrix{$ni}{$nj} = $matrix->entry($ni,$nj);
|
|
371
|
+
}
|
|
372
|
+
}
|
|
373
|
+
return \%qmatrix;
|
|
374
|
+
}
|
|
375
|
+
|
|
376
|
+
|
|
377
|
+
#### MAIN PROGRAM ####
|
|
378
|
+
|
|
379
|
+
sub run {
|
|
380
|
+
my $self = shift;
|
|
381
|
+
my $alnio = Bio::AlignIO->new(
|
|
382
|
+
-fh => $self->file->openr,
|
|
383
|
+
-format => $self->format,
|
|
384
|
+
);
|
|
385
|
+
$Bio::Root::Root::DEBUG = -1;
|
|
386
|
+
while (my $aln = $alnio->next_aln) {
|
|
387
|
+
$self->prune( $aln );
|
|
388
|
+
}
|
|
389
|
+
}
|
|
390
|
+
|
|
391
|
+
sub prune {
|
|
392
|
+
my $self = shift;
|
|
393
|
+
my ( $aln ) = @_;
|
|
394
|
+
|
|
395
|
+
my $stats = $self->calculate_statistics( $aln );
|
|
396
|
+
my $nseq = $stats->{nseq};
|
|
397
|
+
|
|
398
|
+
if ($self->recode) {
|
|
399
|
+
$self->run_recoding( $aln );
|
|
400
|
+
return;
|
|
401
|
+
}
|
|
402
|
+
|
|
403
|
+
if ( $self->chi2_test ) {
|
|
404
|
+
$self->run_chi2_test( $stats );
|
|
405
|
+
$self->show_chi2_test( $stats );
|
|
406
|
+
return;
|
|
407
|
+
}
|
|
408
|
+
|
|
409
|
+
if ( $self->bowker_symmetry_test ) {
|
|
410
|
+
$self->run_bowker_symmetry_test( $stats );
|
|
411
|
+
return;
|
|
412
|
+
}
|
|
413
|
+
|
|
414
|
+
if ( $self->aminogc ) {
|
|
415
|
+
$self->run_aminogc( $stats );
|
|
416
|
+
return;
|
|
417
|
+
}
|
|
418
|
+
|
|
419
|
+
if ( $self->chi2_remove_taxa ) {
|
|
420
|
+
$stats = $self->run_chi2_remove_taxa( $aln, $stats );
|
|
421
|
+
}
|
|
422
|
+
|
|
423
|
+
my $MAX_COL = $aln->length - 1;
|
|
424
|
+
my @columns_to_remove;
|
|
425
|
+
|
|
426
|
+
## chi2 removal
|
|
427
|
+
if ( $self->chi2_prune ) {
|
|
428
|
+
if ( $self->gap_threshold || $self->conserved_threshold ) {
|
|
429
|
+
die "Can't specify both chi2_prune and any of gap_threshold or conserved_threshold\n";
|
|
430
|
+
}
|
|
431
|
+
# Note that $stats will change after this call, so it's not that usable
|
|
432
|
+
# after this call.
|
|
433
|
+
push @columns_to_remove, $self->run_chi2_prune( $stats );
|
|
434
|
+
}
|
|
435
|
+
|
|
436
|
+
## Gap removal
|
|
437
|
+
if ( $self->gap_threshold ) {
|
|
438
|
+
my $threshold = $self->_convert_threshold( $self->gap_threshold, $nseq );
|
|
439
|
+
for my $pos ( 0 .. $MAX_COL ) {
|
|
440
|
+
next if $stats->{counts}[$pos]{'-'} <= $threshold;
|
|
441
|
+
push @columns_to_remove, $pos;
|
|
442
|
+
}
|
|
443
|
+
}
|
|
444
|
+
|
|
445
|
+
## Unconserved removal
|
|
446
|
+
if ( $self->conserved_threshold ) {
|
|
447
|
+
my $threshold = $self->_convert_threshold( $self->conserved_threshold, $nseq );
|
|
448
|
+
for my $pos ( 0 .. $MAX_COL ) {
|
|
449
|
+
my %counts = %{ $stats->{counts}[$pos] };
|
|
450
|
+
delete $counts{'-'};
|
|
451
|
+
my $best = max values %counts;
|
|
452
|
+
if ( $best < $threshold ) {
|
|
453
|
+
push @columns_to_remove, $pos;
|
|
454
|
+
}
|
|
455
|
+
}
|
|
456
|
+
}
|
|
457
|
+
|
|
458
|
+
## Specific columns removal
|
|
459
|
+
if ( $self->remove_columns ne '' ) {
|
|
460
|
+
my @columns = split /,/, $self->remove_columns;
|
|
461
|
+
for my $c ( @columns ) {
|
|
462
|
+
if ( $c =~ /(\d+)(?:-|\.\.?)(\d+)/ ) {
|
|
463
|
+
push @columns_to_remove, $1 .. $2;
|
|
464
|
+
}
|
|
465
|
+
else {
|
|
466
|
+
push @columns_to_remove, $c;
|
|
467
|
+
}
|
|
468
|
+
}
|
|
469
|
+
}
|
|
470
|
+
|
|
471
|
+
if ( $self->inverse ) {
|
|
472
|
+
my %remove = map { ( $_, 1 ) } @columns_to_remove;
|
|
473
|
+
@columns_to_remove =
|
|
474
|
+
grep { ! exists $remove{$_} } 0 .. $aln->length - 1;
|
|
475
|
+
}
|
|
476
|
+
|
|
477
|
+
if ( $self->generate_overview ) {
|
|
478
|
+
$self->run_overview_window( $stats, \@columns_to_remove );
|
|
479
|
+
}
|
|
480
|
+
|
|
481
|
+
# if ( @columns_to_remove || $self->chi2_remove_taxa ) {
|
|
482
|
+
$aln = $self->run_pruning( $aln, \@columns_to_remove ) if @columns_to_remove;
|
|
483
|
+
$self->write_aln( $aln );
|
|
484
|
+
# }
|
|
485
|
+
}
|
|
486
|
+
|
|
487
|
+
sub write_aln {
|
|
488
|
+
my $self = shift;
|
|
489
|
+
my ($aln) = @_;
|
|
490
|
+
$aln->set_displayname_flat;
|
|
491
|
+
|
|
492
|
+
my $IO = Bio::AlignIO->new(
|
|
493
|
+
-fh => \*STDOUT,
|
|
494
|
+
-format => $self->format,
|
|
495
|
+
);
|
|
496
|
+
|
|
497
|
+
$IO->write_aln( $aln );
|
|
498
|
+
}
|
|
499
|
+
|
|
500
|
+
sub run_recoding {
|
|
501
|
+
my $self = shift;
|
|
502
|
+
my ($aln) = @_;
|
|
503
|
+
|
|
504
|
+
my %schemes = (
|
|
505
|
+
'dayhoff4' => 'A=AGPST,T=DENQ,C=HKR,G=FYWILMV,-=C',
|
|
506
|
+
'dayhoff6' => '1=AGPST,2=DENQ,3=HKR,4=FYW,5=ILMV,6=C',
|
|
507
|
+
'hp' => '1=ACFGILMVW,2=DEHKNPQRSTY',
|
|
508
|
+
);
|
|
509
|
+
if (! exists $schemes{ $self->recode }) {
|
|
510
|
+
die "Unrecognized recoding scheme\n";
|
|
511
|
+
}
|
|
512
|
+
|
|
513
|
+
my %scheme = (
|
|
514
|
+
'-' => '-',
|
|
515
|
+
'X' => 'X',
|
|
516
|
+
);
|
|
517
|
+
for my $class ( split ',', $schemes{$self->recode}) {
|
|
518
|
+
my ($l,undef,@letters) = split //, $class;
|
|
519
|
+
$scheme{$_} = $l for @letters;
|
|
520
|
+
}
|
|
521
|
+
|
|
522
|
+
my @new_seqs;
|
|
523
|
+
for my $seq ( $aln->each_seq ) {
|
|
524
|
+
my $s = $seq->seq;
|
|
525
|
+
$s =~ s{(.)}{$scheme{$1} // die "Invalid alignment character: $1\n"}ge;
|
|
526
|
+
push @new_seqs, Bio::LocatableSeq->new(
|
|
527
|
+
-id => $seq->id,
|
|
528
|
+
-seq => $s,
|
|
529
|
+
);
|
|
530
|
+
}
|
|
531
|
+
|
|
532
|
+
$self->write_aln(Bio::SimpleAlign->new(-seqs => \@new_seqs));
|
|
533
|
+
}
|
|
534
|
+
|
|
535
|
+
sub run_pruning {
|
|
536
|
+
my $self = shift;
|
|
537
|
+
my ( $aln, $columns ) = @_;
|
|
538
|
+
|
|
539
|
+
return $aln if $columns->[0] == -1;
|
|
540
|
+
|
|
541
|
+
my $groups = $self->_group_columns( $columns );
|
|
542
|
+
$aln = $aln->remove_columns( $_ ) for reverse @$groups;
|
|
543
|
+
|
|
544
|
+
return $aln;
|
|
545
|
+
}
|
|
546
|
+
|
|
547
|
+
sub run_aminogc {
|
|
548
|
+
my $self = shift;
|
|
549
|
+
my ($stats) = @_;
|
|
550
|
+
|
|
551
|
+
my $codon_table = Bio::Tools::CodonTable->new( -id => 11 );
|
|
552
|
+
my @AAS = $self->amino_acids;
|
|
553
|
+
|
|
554
|
+
my %chosen;
|
|
555
|
+
for my $AA ( @AAS ) {
|
|
556
|
+
my $codons = join '', map { /^(..)/ } $codon_table->revtranslate( $AA );
|
|
557
|
+
if ( $codons =~ /t|a/i ) {
|
|
558
|
+
$chosen{$AA} = '';
|
|
559
|
+
}
|
|
560
|
+
else {
|
|
561
|
+
$chosen{$AA} = 1;
|
|
562
|
+
}
|
|
563
|
+
}
|
|
564
|
+
|
|
565
|
+
my @counts = @{ $stats->{seqcounts} };
|
|
566
|
+
my @names = @{ $stats->{names} };
|
|
567
|
+
|
|
568
|
+
my @aminogc;
|
|
569
|
+
for my $seqi ( 0 .. $#names ) {
|
|
570
|
+
my $aminogc;
|
|
571
|
+
my $tot;
|
|
572
|
+
|
|
573
|
+
for my $AA ( keys %{ $counts[$seqi] } ) {
|
|
574
|
+
next if $AA eq '-';
|
|
575
|
+
if ( $chosen{$AA} ) {
|
|
576
|
+
$aminogc += $counts[$seqi]{$AA};
|
|
577
|
+
}
|
|
578
|
+
$tot += $counts[$seqi]{$AA}
|
|
579
|
+
}
|
|
580
|
+
|
|
581
|
+
$aminogc /= $tot;
|
|
582
|
+
push @aminogc, $aminogc;
|
|
583
|
+
|
|
584
|
+
printf "%-30s %6.4f\n", $names[$seqi], $aminogc;
|
|
585
|
+
}
|
|
586
|
+
|
|
587
|
+
say '---';
|
|
588
|
+
|
|
589
|
+
my $mean = sum( @aminogc ) / @aminogc;
|
|
590
|
+
my $var = sum( map { ($_ - $mean)**2 } @aminogc ) / @aminogc;
|
|
591
|
+
|
|
592
|
+
printf "%-30s %6.4f\n", 'MEAN', $mean;
|
|
593
|
+
printf "%-30s %6.4f\n", 'STD.DEV.', sqrt($var);
|
|
594
|
+
}
|
|
595
|
+
|
|
596
|
+
sub run_bowker_symmetry_test {
|
|
597
|
+
my $self = shift;
|
|
598
|
+
my ($stats) = @_;
|
|
599
|
+
|
|
600
|
+
my $distrib = $stats->{distribution};
|
|
601
|
+
my $length = $stats->{length};
|
|
602
|
+
my $nseq = $stats->{totseq};
|
|
603
|
+
my @AAS = $self->amino_acids;
|
|
604
|
+
|
|
605
|
+
my @names = @{ $stats->{names} };
|
|
606
|
+
|
|
607
|
+
my $significance_cutoff = 223.1602;
|
|
608
|
+
|
|
609
|
+
my %sign_counts;
|
|
610
|
+
|
|
611
|
+
for my $seqi ( 0 .. $nseq - 2 ) {
|
|
612
|
+
for my $seqj ( $seqi+1 .. $nseq - 1 ) {
|
|
613
|
+
my $matrix;
|
|
614
|
+
for my $pos ( 0 .. $length - 1 ) {
|
|
615
|
+
my $aai = $distrib->[$pos][$seqi];
|
|
616
|
+
my $aaj = $distrib->[$pos][$seqj];
|
|
617
|
+
$matrix->{$aai}{$aaj}++
|
|
618
|
+
}
|
|
619
|
+
my $stat = 0;
|
|
620
|
+
for my $ii ( 0 .. $#AAS - 1 ) {
|
|
621
|
+
for my $jj ( $ii+1 .. $#AAS ) {
|
|
622
|
+
my $ai = $AAS[$ii];
|
|
623
|
+
my $aj = $AAS[$jj];
|
|
624
|
+
my $nij = $matrix->{$ai}{$aj} // 0;
|
|
625
|
+
my $nji = $matrix->{$aj}{$ai} // 0;
|
|
626
|
+
next unless $nij || $nji;
|
|
627
|
+
$stat += ( $nij - $nji )**2 / ($nij + $nji);
|
|
628
|
+
}
|
|
629
|
+
}
|
|
630
|
+
|
|
631
|
+
printf "%-8.8s %-8.8s %3d %3d %7.2f",
|
|
632
|
+
@names[$seqi,$seqj],
|
|
633
|
+
$seqi, $seqj,
|
|
634
|
+
$stat;
|
|
635
|
+
if ( $stat > $significance_cutoff ) {
|
|
636
|
+
$sign_counts{$names[$seqi]}++;
|
|
637
|
+
$sign_counts{$names[$seqj]}++;
|
|
638
|
+
print ' *';
|
|
639
|
+
}
|
|
640
|
+
print "\n";
|
|
641
|
+
}
|
|
642
|
+
}
|
|
643
|
+
}
|
|
644
|
+
|
|
645
|
+
#### OVERVIEW WINDOW GENERATION ####
|
|
646
|
+
|
|
647
|
+
sub run_overview_window {
|
|
648
|
+
my $self = shift;
|
|
649
|
+
my ($stats, $remove) = @_;
|
|
650
|
+
|
|
651
|
+
$self->_calculate_statistics_scores( $stats );
|
|
652
|
+
|
|
653
|
+
my $distrib = $stats->{distribution};
|
|
654
|
+
my $scores = $stats->{scores};
|
|
655
|
+
my $nseq = $stats->{totseq};
|
|
656
|
+
|
|
657
|
+
my $AA_WIDTH = 4;
|
|
658
|
+
my $AA_HEIGHT = 6;
|
|
659
|
+
|
|
660
|
+
my $width = $AA_WIDTH * $stats->{length};
|
|
661
|
+
my $height = $AA_HEIGHT * $nseq;
|
|
662
|
+
|
|
663
|
+
my $gd = GD::Image->new( $width, $height, 1, );
|
|
664
|
+
|
|
665
|
+
## Cache all colors
|
|
666
|
+
my @COLORS = map {
|
|
667
|
+
$gd->colorAllocate( @$_ )
|
|
668
|
+
} _rainbow_gradient();
|
|
669
|
+
my $white = $gd->colorAllocate( 255,255,255 );
|
|
670
|
+
|
|
671
|
+
## Build the image
|
|
672
|
+
for my $pos ( 0 .. $stats->{length} - 1 ) {
|
|
673
|
+
for my $seqi ( 0 .. $nseq - 1 ) {
|
|
674
|
+
my $aa = $distrib->[$pos][$seqi];
|
|
675
|
+
my $color = $white;
|
|
676
|
+
if ( $aa ne '-' ) {
|
|
677
|
+
my $score = $scores->[$pos][$seqi];
|
|
678
|
+
my $idx = int(100*$score);
|
|
679
|
+
die "Too large $pos,$seqi $idx,$score" if $idx > 100;
|
|
680
|
+
die "Too small $pos,$seqi $idx,$score" if $idx < 0;
|
|
681
|
+
$color = $COLORS[ $idx ];
|
|
682
|
+
}
|
|
683
|
+
|
|
684
|
+
$gd->filledRectangle(
|
|
685
|
+
$pos * $AA_WIDTH,
|
|
686
|
+
$seqi * $AA_HEIGHT,
|
|
687
|
+
($pos+1) * $AA_WIDTH,
|
|
688
|
+
($seqi+1) * $AA_HEIGHT - 1,
|
|
689
|
+
$color,
|
|
690
|
+
);
|
|
691
|
+
}
|
|
692
|
+
}
|
|
693
|
+
|
|
694
|
+
## Add remove overlay
|
|
695
|
+
if ( @$remove ) {
|
|
696
|
+
my $groups = $self->_group_columns( $remove );
|
|
697
|
+
|
|
698
|
+
my $color = $gd->colorAllocateAlpha(0,0,0,90);
|
|
699
|
+
|
|
700
|
+
for my $group ( @$groups ) {
|
|
701
|
+
my ($start,$end) = @$group;
|
|
702
|
+
$end++;
|
|
703
|
+
|
|
704
|
+
$gd->filledRectangle(
|
|
705
|
+
$start * $AA_WIDTH,
|
|
706
|
+
0,
|
|
707
|
+
$end * $AA_WIDTH,
|
|
708
|
+
$height,
|
|
709
|
+
$color,
|
|
710
|
+
);
|
|
711
|
+
}
|
|
712
|
+
}
|
|
713
|
+
|
|
714
|
+
## Add subset overlay
|
|
715
|
+
if ( $self->subset ) {
|
|
716
|
+
my %chosen = map { ( $_ => 1 ) } @{ $stats->{chosen} };
|
|
717
|
+
my $color = $gd->colorAllocateAlpha(255,255,255,90);
|
|
718
|
+
for my $c ( 0 .. $nseq - 1 ) {
|
|
719
|
+
next if $chosen{$c};
|
|
720
|
+
|
|
721
|
+
$gd->filledRectangle(
|
|
722
|
+
0,
|
|
723
|
+
$c * $AA_HEIGHT,
|
|
724
|
+
$width,
|
|
725
|
+
($c+1) * $AA_HEIGHT - 1,
|
|
726
|
+
$color,
|
|
727
|
+
);
|
|
728
|
+
}
|
|
729
|
+
}
|
|
730
|
+
|
|
731
|
+
my $outfile = $self->file . '.png';
|
|
732
|
+
say STDERR "Saving png to $outfile";
|
|
733
|
+
open my $OUT, '>', $outfile or die;
|
|
734
|
+
print $OUT $gd->png;
|
|
735
|
+
close $OUT;
|
|
736
|
+
}
|
|
737
|
+
|
|
738
|
+
|
|
739
|
+
#### CHISQUARE METHODS ####
|
|
740
|
+
|
|
741
|
+
sub run_chi2_prune {
|
|
742
|
+
my $self = shift;
|
|
743
|
+
my ( $stats ) = @_;
|
|
744
|
+
|
|
745
|
+
my $length = $stats->{length};
|
|
746
|
+
my @columns_to_remove;
|
|
747
|
+
given ( $self->chi2_prune ) {
|
|
748
|
+
when ( /^h/i ) {
|
|
749
|
+
my @order = $self->_chi2_prune_order( $stats );
|
|
750
|
+
push @columns_to_remove, @order[ 0 .. $#order * .5];
|
|
751
|
+
}
|
|
752
|
+
when ( /^f(\d+\.?\d*)/i ) {
|
|
753
|
+
my $num = $self->_convert_threshold( $1, $length );
|
|
754
|
+
my @order = $self->_chi2_prune_order( $stats );
|
|
755
|
+
push @columns_to_remove, @order[ 0 .. ($num-1) ];
|
|
756
|
+
}
|
|
757
|
+
when ( /^n(\d+\.?\d*)/i ) {
|
|
758
|
+
my $num = $self->_convert_threshold( $1, $length );
|
|
759
|
+
push @columns_to_remove,
|
|
760
|
+
$self->_run_chi2_prune_threshold( $stats, $num );
|
|
761
|
+
}
|
|
762
|
+
when ( /^min/i ) {
|
|
763
|
+
push @columns_to_remove,
|
|
764
|
+
$self->_run_chi2_prune_threshold( $stats, 0, 1 );
|
|
765
|
+
}
|
|
766
|
+
when ( /^plot/i ) {
|
|
767
|
+
$self->_plot_chi2_prune( $stats );
|
|
768
|
+
exit;
|
|
769
|
+
}
|
|
770
|
+
default {
|
|
771
|
+
die "chi2 pruning of type $_ not recongnized\n";
|
|
772
|
+
}
|
|
773
|
+
}
|
|
774
|
+
return @columns_to_remove;
|
|
775
|
+
}
|
|
776
|
+
|
|
777
|
+
sub run_chi2_test {
|
|
778
|
+
my $self = shift;
|
|
779
|
+
my ($stats) = @_;
|
|
780
|
+
|
|
781
|
+
my @AAS = $self->amino_acids;
|
|
782
|
+
my @chosen = @{ $stats->{chosen} };
|
|
783
|
+
my @O = @{ $stats->{seqcounts} };
|
|
784
|
+
my @l = @{ $stats->{seqlength} };
|
|
785
|
+
my %chosen = map { ($_,1) } @chosen;
|
|
786
|
+
|
|
787
|
+
my ( %E );
|
|
788
|
+
|
|
789
|
+
for my $seqi ( @chosen ) {
|
|
790
|
+
for my $AA ( @AAS ) {
|
|
791
|
+
$E{$AA} += $O[$seqi]{$AA};
|
|
792
|
+
}
|
|
793
|
+
}
|
|
794
|
+
|
|
795
|
+
my $L = sum @l;
|
|
796
|
+
$_ /= $L for values %E;
|
|
797
|
+
|
|
798
|
+
my @test;
|
|
799
|
+
|
|
800
|
+
my $tot = 0;
|
|
801
|
+
for my $seqi ( 0 .. $#O ) {
|
|
802
|
+
my $chi2 = 0;
|
|
803
|
+
|
|
804
|
+
for my $AA ( @AAS ) {
|
|
805
|
+
next if $E{$AA} == 0;
|
|
806
|
+
my $E = $E{$AA} * $l[$seqi];
|
|
807
|
+
$chi2 += ( $O[$seqi]{$AA} - $E )**2 / $E;
|
|
808
|
+
}
|
|
809
|
+
|
|
810
|
+
push @test, $chi2;
|
|
811
|
+
$tot += $chi2 if $chosen{$seqi};
|
|
812
|
+
}
|
|
813
|
+
|
|
814
|
+
push @test, $tot;
|
|
815
|
+
$stats->{chi2} = \@test;
|
|
816
|
+
return $tot;
|
|
817
|
+
}
|
|
818
|
+
|
|
819
|
+
sub show_chi2_test {
|
|
820
|
+
my $self = shift;
|
|
821
|
+
my ($stats) = @_;
|
|
822
|
+
|
|
823
|
+
my $SIGN_LEVEL = $self->aa_significance_level;
|
|
824
|
+
|
|
825
|
+
my @names = @{ $stats->{names} };
|
|
826
|
+
my $chi2 = $stats->{chi2};
|
|
827
|
+
|
|
828
|
+
for my $seqi ( 0 .. $#names ) {
|
|
829
|
+
my $sign = $chi2->[$seqi] > $SIGN_LEVEL ? '*' : '';
|
|
830
|
+
|
|
831
|
+
printf "%4d %-30.30s %7.2f %1s\n",
|
|
832
|
+
$seqi, $names[$seqi], $chi2->[$seqi], $sign;
|
|
833
|
+
}
|
|
834
|
+
|
|
835
|
+
printf " %-30.30s %7.2f\n",
|
|
836
|
+
'TOTAL', $chi2->[-1];
|
|
837
|
+
|
|
838
|
+
my $stat = Statistics::Descriptive::Sparse->new;
|
|
839
|
+
$stat->add_data( @$chi2[ 0 .. $#$chi2-1 ] );
|
|
840
|
+
|
|
841
|
+
printf " %-30.30s %7.2f\n",
|
|
842
|
+
'Std.Dev.', $stat->standard_deviation;
|
|
843
|
+
}
|
|
844
|
+
|
|
845
|
+
sub run_chi2_remove_taxa {
|
|
846
|
+
my $self = shift;
|
|
847
|
+
my ($aln, $stats) = @_;
|
|
848
|
+
|
|
849
|
+
my $SIGN_LEVEL = $self->aa_significance_level;
|
|
850
|
+
|
|
851
|
+
if ( ! exists $stats->{chi2} ) {
|
|
852
|
+
$self->run_chi2_test( $stats );
|
|
853
|
+
}
|
|
854
|
+
|
|
855
|
+
my @names = @{ $stats->{names} };
|
|
856
|
+
my $chi2 = $stats->{chi2};
|
|
857
|
+
|
|
858
|
+
my %remove_seqs;
|
|
859
|
+
|
|
860
|
+
for my $seqi ( 0 .. $#names ) {
|
|
861
|
+
if ( $chi2->[$seqi] > $SIGN_LEVEL ) {
|
|
862
|
+
$remove_seqs{ $names[$seqi] }++;
|
|
863
|
+
}
|
|
864
|
+
}
|
|
865
|
+
|
|
866
|
+
for my $seq ( $aln->each_seq ) {
|
|
867
|
+
if ( exists $remove_seqs{ $seq->id } ) {
|
|
868
|
+
$aln->remove_seq( $seq );
|
|
869
|
+
}
|
|
870
|
+
}
|
|
871
|
+
|
|
872
|
+
my $new_stats = $self->calculate_statistics( $aln );
|
|
873
|
+
|
|
874
|
+
return $new_stats;
|
|
875
|
+
}
|
|
876
|
+
|
|
877
|
+
sub _chi2_test_minus {
|
|
878
|
+
my $self = shift;
|
|
879
|
+
my ($stats, $columns, $inplace) = @_;
|
|
880
|
+
|
|
881
|
+
# The inplace flag signifies whether to modify the $stats hash inplace or
|
|
882
|
+
# create a new one.
|
|
883
|
+
|
|
884
|
+
my $new_stats = {};
|
|
885
|
+
|
|
886
|
+
if ( $inplace ) {
|
|
887
|
+
$new_stats = $stats;
|
|
888
|
+
}
|
|
889
|
+
else {
|
|
890
|
+
$new_stats->{$_} = $stats->{$_} for keys %$stats;
|
|
891
|
+
}
|
|
892
|
+
|
|
893
|
+
if ( ref($columns) ne 'ARRAY' ) {
|
|
894
|
+
$columns = [$columns];
|
|
895
|
+
}
|
|
896
|
+
|
|
897
|
+
my @distrib = @{ $stats->{distribution} };
|
|
898
|
+
my @AAS = $self->amino_acids;
|
|
899
|
+
|
|
900
|
+
my @O = @{ $stats->{seqcounts} };
|
|
901
|
+
my @l = @{ $stats->{seqlength} };
|
|
902
|
+
|
|
903
|
+
my @chosen = @{ $stats->{chosen} };
|
|
904
|
+
my %chosen = map { ($_, 1) } @chosen;
|
|
905
|
+
|
|
906
|
+
my (@Oc, @lc, %Ec);
|
|
907
|
+
|
|
908
|
+
for my $seqi ( 0 .. $#O ) {
|
|
909
|
+
$lc[$seqi] = $l[$seqi];
|
|
910
|
+
|
|
911
|
+
for my $AA ( @AAS ) {
|
|
912
|
+
$Oc[$seqi]{$AA} = $O[$seqi]{$AA};
|
|
913
|
+
$Ec{$AA} += $O[$seqi]{$AA} if $chosen{$seqi};
|
|
914
|
+
}
|
|
915
|
+
|
|
916
|
+
for my $c ( @$columns ) {
|
|
917
|
+
my $aa = $distrib[$c][$seqi];
|
|
918
|
+
if ( $aa ne '-' ) {
|
|
919
|
+
$Oc[$seqi]{$aa} -= 1;
|
|
920
|
+
$lc[$seqi] -= 1;
|
|
921
|
+
$Ec{$aa} -= 1 if $chosen{ $seqi };
|
|
922
|
+
}
|
|
923
|
+
}
|
|
924
|
+
}
|
|
925
|
+
|
|
926
|
+
$new_stats->{seqcounts} = \@Oc;
|
|
927
|
+
$new_stats->{seqlength} = \@lc;
|
|
928
|
+
|
|
929
|
+
my $L = sum values %Ec;
|
|
930
|
+
|
|
931
|
+
if ( $L == 0 ) {
|
|
932
|
+
warn "Nothing left of alignment\n";
|
|
933
|
+
return $new_stats;
|
|
934
|
+
}
|
|
935
|
+
|
|
936
|
+
$_ /= $L for values %Ec;
|
|
937
|
+
|
|
938
|
+
my @chi2_test;
|
|
939
|
+
|
|
940
|
+
my $tot;
|
|
941
|
+
for my $seqi ( @chosen ) {
|
|
942
|
+
my $chi2 = 0;
|
|
943
|
+
|
|
944
|
+
for my $AA ( @AAS ) {
|
|
945
|
+
my $E = $Ec{$AA}*$lc[$seqi];
|
|
946
|
+
next if $E == 0;
|
|
947
|
+
$chi2 += ( $Oc[$seqi]{$AA} - $E )**2 / $E;
|
|
948
|
+
}
|
|
949
|
+
|
|
950
|
+
$tot += $chi2;
|
|
951
|
+
push @chi2_test, $chi2;
|
|
952
|
+
}
|
|
953
|
+
push @chi2_test, $tot;
|
|
954
|
+
|
|
955
|
+
$new_stats->{chi2} = \@chi2_test;
|
|
956
|
+
}
|
|
957
|
+
|
|
958
|
+
sub _chi2_delta_exact {
|
|
959
|
+
my $self = shift;
|
|
960
|
+
my ( $stats, $c, $full_tot ) = @_;
|
|
961
|
+
|
|
962
|
+
my @AAS = $self->amino_acids;
|
|
963
|
+
my @distrib = @{ $stats->{distribution} };
|
|
964
|
+
|
|
965
|
+
my @O = @{ $stats->{seqcounts} };
|
|
966
|
+
my @l = @{ $stats->{seqlength} };
|
|
967
|
+
|
|
968
|
+
my @chosen = @{ $stats->{chosen} };
|
|
969
|
+
|
|
970
|
+
my (@Oc,%Ec,@lc);
|
|
971
|
+
|
|
972
|
+
for my $seqi ( @chosen ) {
|
|
973
|
+
$lc[$seqi] = $l[$seqi];
|
|
974
|
+
|
|
975
|
+
for my $AA ( @AAS ) {
|
|
976
|
+
$Oc[$seqi]{$AA} = $O[$seqi]{$AA};
|
|
977
|
+
$Ec{$AA} += $O[$seqi]{$AA};
|
|
978
|
+
}
|
|
979
|
+
|
|
980
|
+
my $aa = $distrib[$c][$seqi];
|
|
981
|
+
if ( $aa ne '-' ) {
|
|
982
|
+
$Oc[$seqi]{$aa} -= 1;
|
|
983
|
+
$lc[$seqi] -= 1;
|
|
984
|
+
$Ec{$aa} -= 1;
|
|
985
|
+
}
|
|
986
|
+
}
|
|
987
|
+
|
|
988
|
+
my $L = sum grep defined, @lc;
|
|
989
|
+
$_ /= $L for values %Ec;
|
|
990
|
+
|
|
991
|
+
my $tot;
|
|
992
|
+
for my $seqi ( @chosen ) {
|
|
993
|
+
my $chi2 = 0;
|
|
994
|
+
|
|
995
|
+
for my $AA ( @AAS ) {
|
|
996
|
+
next if $Ec{$AA} == 0;
|
|
997
|
+
my $E = $Ec{$AA}*$lc[$seqi];
|
|
998
|
+
$chi2 += ( $Oc[$seqi]{$AA} - $E )**2 / $E;
|
|
999
|
+
}
|
|
1000
|
+
|
|
1001
|
+
$tot += $chi2;
|
|
1002
|
+
}
|
|
1003
|
+
return $tot - $full_tot;
|
|
1004
|
+
}
|
|
1005
|
+
|
|
1006
|
+
sub _chi2_prune_order {
|
|
1007
|
+
my $self = shift;
|
|
1008
|
+
my ( $stats ) = @_;
|
|
1009
|
+
|
|
1010
|
+
# TODO: Lazy build...
|
|
1011
|
+
if ( ! $stats->{chi2} ) {
|
|
1012
|
+
$self->run_chi2_test( $stats );
|
|
1013
|
+
}
|
|
1014
|
+
|
|
1015
|
+
my $len = $stats->{length};
|
|
1016
|
+
my $chi2_sum = $stats->{chi2}[-1];
|
|
1017
|
+
my @pos_scores;
|
|
1018
|
+
for my $c ( 0 .. $len - 1 ) {
|
|
1019
|
+
push @pos_scores, $self->_chi2_delta_exact( $stats, $c, $chi2_sum );
|
|
1020
|
+
}
|
|
1021
|
+
|
|
1022
|
+
$stats->{chi2_deltas} = \@pos_scores;
|
|
1023
|
+
|
|
1024
|
+
my @order = sort { $pos_scores[$a] <=> $pos_scores[$b] } 0 .. $len - 1;
|
|
1025
|
+
return @order;
|
|
1026
|
+
}
|
|
1027
|
+
|
|
1028
|
+
sub _plot_chi2_prune {
|
|
1029
|
+
my $self = shift;
|
|
1030
|
+
my ( $stats ) = @_;
|
|
1031
|
+
|
|
1032
|
+
my @order = $self->_chi2_prune_order( $stats );
|
|
1033
|
+
|
|
1034
|
+
my $new_stats = {};
|
|
1035
|
+
$new_stats->{$_} = $stats->{$_} for keys %$stats;
|
|
1036
|
+
my $len = $stats->{length};
|
|
1037
|
+
|
|
1038
|
+
my $SIGN_LEVEL = $self->aa_significance_level;
|
|
1039
|
+
for my $idx_order ( 0 .. $len - 2 ) {
|
|
1040
|
+
$self->_chi2_test_minus( $new_stats, $order[$idx_order], 1 );
|
|
1041
|
+
|
|
1042
|
+
my @delta = @{ $new_stats->{chi2} };
|
|
1043
|
+
pop @delta; # Not interested in total
|
|
1044
|
+
|
|
1045
|
+
my @over = grep { $delta[$_] > $SIGN_LEVEL } 0 .. $#delta;
|
|
1046
|
+
my $num = grep { $_ > $SIGN_LEVEL } @delta;
|
|
1047
|
+
my $pos_delta = $stats->{chi2_deltas}[$order[$idx_order]];
|
|
1048
|
+
|
|
1049
|
+
printf "%5d %5d %5.2f (%s)\n", $idx_order, $num, $pos_delta,
|
|
1050
|
+
join(',', @{$stats->{names}}[@over]);
|
|
1051
|
+
}
|
|
1052
|
+
}
|
|
1053
|
+
|
|
1054
|
+
sub _run_chi2_prune_threshold {
|
|
1055
|
+
my $self = shift;
|
|
1056
|
+
my ( $stats, $stop_num, $MIN ) = @_;
|
|
1057
|
+
|
|
1058
|
+
$stop_num //= 0;
|
|
1059
|
+
|
|
1060
|
+
my @order = $self->_chi2_prune_order( $stats );
|
|
1061
|
+
|
|
1062
|
+
my $new_stats = {};
|
|
1063
|
+
$new_stats->{$_} = $stats->{$_} for keys %$stats;
|
|
1064
|
+
|
|
1065
|
+
my $len = $stats->{length};
|
|
1066
|
+
|
|
1067
|
+
my ($best,$best_idx);
|
|
1068
|
+
|
|
1069
|
+
my $SIGN_LEVEL = $self->aa_significance_level;
|
|
1070
|
+
for my $idx_order ( 0 .. $len - 2 ) {
|
|
1071
|
+
$self->_chi2_test_minus( $new_stats, $order[$idx_order], 1 );
|
|
1072
|
+
|
|
1073
|
+
my @delta = @{ $new_stats->{chi2} };
|
|
1074
|
+
pop @delta; # Not interested in total
|
|
1075
|
+
|
|
1076
|
+
my @over = grep { $delta[$_] > $SIGN_LEVEL } 0 .. $#delta;
|
|
1077
|
+
my $num = grep { $_ > $SIGN_LEVEL } @delta;
|
|
1078
|
+
|
|
1079
|
+
if ( $num <= $stop_num ) {
|
|
1080
|
+
return @order[ 0 .. $idx_order ];
|
|
1081
|
+
}
|
|
1082
|
+
|
|
1083
|
+
if ( !$best || $num < $best ) {
|
|
1084
|
+
($best,$best_idx) = ($num, $idx_order);
|
|
1085
|
+
}
|
|
1086
|
+
}
|
|
1087
|
+
|
|
1088
|
+
if ( $MIN ) {
|
|
1089
|
+
print STDERR "Min is at $best_idx with $best\n";
|
|
1090
|
+
return @order[ 0 .. $best_idx ];
|
|
1091
|
+
}
|
|
1092
|
+
|
|
1093
|
+
die "It is not possible to get $stop_num or fewer sequences above significance threshold\n";
|
|
1094
|
+
}
|
|
1095
|
+
|
|
1096
|
+
## Not in current option list
|
|
1097
|
+
sub _run_reorder_chi2 {
|
|
1098
|
+
my $self = shift;
|
|
1099
|
+
my ( $stats ) = @_;
|
|
1100
|
+
|
|
1101
|
+
my @order = $self->_chi2_prune_order( $stats );
|
|
1102
|
+
|
|
1103
|
+
my $nseq = $stats->{nseq};
|
|
1104
|
+
my @names = @{ $stats->{names} };
|
|
1105
|
+
my @distrib = @{ $stats->{distribution} };
|
|
1106
|
+
|
|
1107
|
+
my @new_seq_str;
|
|
1108
|
+
for my $c ( @order ) {
|
|
1109
|
+
for my $seqi ( 0 .. $nseq - 1 ) {
|
|
1110
|
+
$new_seq_str[$seqi] .= $distrib[$c][$seqi];
|
|
1111
|
+
}
|
|
1112
|
+
}
|
|
1113
|
+
|
|
1114
|
+
my @new_seq;
|
|
1115
|
+
for my $seqi ( 0 .. $nseq - 1 ) {
|
|
1116
|
+
push @new_seq, Bio::LocatableSeq->new(
|
|
1117
|
+
-id => $names[$seqi],
|
|
1118
|
+
-seq => $new_seq_str[$seqi],
|
|
1119
|
+
);
|
|
1120
|
+
}
|
|
1121
|
+
|
|
1122
|
+
return Bio::SimpleAlign->new(
|
|
1123
|
+
-seqs => \@new_seq,
|
|
1124
|
+
);
|
|
1125
|
+
}
|
|
1126
|
+
|
|
1127
|
+
|
|
1128
|
+
# TODO: Should be replaced with a different sub to return the alphabet of the
|
|
1129
|
+
# alignment instead, that way we can support dna quite easily.
|
|
1130
|
+
|
|
1131
|
+
sub amino_acids {
|
|
1132
|
+
my $self = shift;
|
|
1133
|
+
if ( $self->gap_treatment =~ /^a/i ) {
|
|
1134
|
+
return ( $self->_amino_acids, '-');
|
|
1135
|
+
}
|
|
1136
|
+
return $self->_amino_acids;
|
|
1137
|
+
}
|
|
1138
|
+
|
|
1139
|
+
#### STATISTICS ####
|
|
1140
|
+
|
|
1141
|
+
sub calculate_statistics {
|
|
1142
|
+
my $self = shift;
|
|
1143
|
+
my $aln = shift;
|
|
1144
|
+
|
|
1145
|
+
# TODO: This should be refactored into it's own class, but I don't have the
|
|
1146
|
+
# tuits...
|
|
1147
|
+
#
|
|
1148
|
+
# In the end we have the following datastructure.
|
|
1149
|
+
#$stats = {
|
|
1150
|
+
#### Indexed by column
|
|
1151
|
+
# scores Array of Arrays [pos][seqi] = Normalized Blosum62 score
|
|
1152
|
+
# distribution Array of Arrays [pos][seqi] = AA at pos in seqi
|
|
1153
|
+
# counts Array of Hashes [pos]{char} = n(AA) in column pos
|
|
1154
|
+
#
|
|
1155
|
+
#### Indexed by sequence
|
|
1156
|
+
# seqcounts Array of Hashes [seqi]{char} = n(AA) in seqi
|
|
1157
|
+
# frequency Array of Hashes [seqi]{char} = f(AA) in column sequence seqi, between 0 and 1
|
|
1158
|
+
# names Array [seqi] = name of seqi
|
|
1159
|
+
# seqlength Array [seqi] = l(seqi)
|
|
1160
|
+
#
|
|
1161
|
+
#### Other stuff
|
|
1162
|
+
# chosen Array [idx] = chosen seqi
|
|
1163
|
+
# nseq Integer = number of chosen seqs
|
|
1164
|
+
# totseq Integer = total number of seqs
|
|
1165
|
+
# length Integer = length of alignment
|
|
1166
|
+
# overall_freq Hash {char} = Overall frequency
|
|
1167
|
+
#};
|
|
1168
|
+
|
|
1169
|
+
|
|
1170
|
+
# Each of these methods builds upon the previous one.
|
|
1171
|
+
|
|
1172
|
+
my $stats = {};
|
|
1173
|
+
$self->_calculate_statistics_transpose( $stats, $aln );
|
|
1174
|
+
$self->_calculate_statistics_counts( $stats );
|
|
1175
|
+
# This one takes a lot of time, we defer it until it is needed
|
|
1176
|
+
#$self->_calculate_statistics_scores( $stats );
|
|
1177
|
+
|
|
1178
|
+
return $stats;
|
|
1179
|
+
}
|
|
1180
|
+
|
|
1181
|
+
sub _calculate_statistics_transpose {
|
|
1182
|
+
my $self = shift;
|
|
1183
|
+
my ($stats, $aln) = @_;
|
|
1184
|
+
|
|
1185
|
+
my $subset = $self->subset;
|
|
1186
|
+
my $totseq = $aln->num_sequences;
|
|
1187
|
+
my @AAS = $self->amino_acids;
|
|
1188
|
+
|
|
1189
|
+
my (@distrib, @names, @chosen, @seqcounts);
|
|
1190
|
+
my (@frequency, %overall_frequency, @length);
|
|
1191
|
+
|
|
1192
|
+
my $seqi=0;
|
|
1193
|
+
for my $seq ( $aln->each_seq ) {
|
|
1194
|
+
my $chosen = '';
|
|
1195
|
+
if ( !$subset || $seq->id =~ /$subset/ ) {
|
|
1196
|
+
$chosen++;
|
|
1197
|
+
push @chosen, $seqi;
|
|
1198
|
+
}
|
|
1199
|
+
|
|
1200
|
+
push @names, $seq->id;
|
|
1201
|
+
|
|
1202
|
+
my %seqcounts = map { ($_=>0) } ( @AAS, '-' );
|
|
1203
|
+
my @s = split //, $seq->seq;
|
|
1204
|
+
for my $pos ( 0 .. $#s ) {
|
|
1205
|
+
$seqcounts{$s[$pos]}++;
|
|
1206
|
+
push @{ $distrib[$pos] }, $s[$pos];
|
|
1207
|
+
}
|
|
1208
|
+
|
|
1209
|
+
given ( $self->gap_treatment ) {
|
|
1210
|
+
when (/^i/i) { delete $seqcounts{'-'} }
|
|
1211
|
+
when (/^m/i) {
|
|
1212
|
+
$seqcounts{$_} += $seqcounts{'-'}/20 for @AAS;
|
|
1213
|
+
delete $seqcounts{'-'};
|
|
1214
|
+
}
|
|
1215
|
+
when (/^a/i) {
|
|
1216
|
+
die "Treatment of gaps as an additional character is not implemented yet\n";
|
|
1217
|
+
}
|
|
1218
|
+
}
|
|
1219
|
+
|
|
1220
|
+
push @seqcounts, \%seqcounts;
|
|
1221
|
+
|
|
1222
|
+
if ( $chosen ) {
|
|
1223
|
+
$overall_frequency{$_} += $seqcounts{$_} for keys %seqcounts;
|
|
1224
|
+
}
|
|
1225
|
+
|
|
1226
|
+
my $sum = sum values %seqcounts;
|
|
1227
|
+
push @length, $sum;
|
|
1228
|
+
|
|
1229
|
+
my %seq_frequency = %seqcounts;
|
|
1230
|
+
$_ /= $sum for values %seq_frequency;
|
|
1231
|
+
push @frequency, \%seq_frequency;
|
|
1232
|
+
|
|
1233
|
+
$seqi++;
|
|
1234
|
+
}
|
|
1235
|
+
|
|
1236
|
+
my $sum = sum values %overall_frequency;
|
|
1237
|
+
$_ /= $sum for values %overall_frequency;
|
|
1238
|
+
|
|
1239
|
+
$stats->{distribution} = \@distrib;
|
|
1240
|
+
$stats->{chosen} = \@chosen;
|
|
1241
|
+
$stats->{names} = \@names;
|
|
1242
|
+
$stats->{totseq} = $totseq;
|
|
1243
|
+
$stats->{nseq} = scalar(@chosen);
|
|
1244
|
+
$stats->{length} = scalar(@distrib);
|
|
1245
|
+
$stats->{seqcounts} = \@seqcounts;
|
|
1246
|
+
$stats->{seqlength} = \@length;
|
|
1247
|
+
$stats->{frequency} = \@frequency;
|
|
1248
|
+
$stats->{overall_freq} = \%overall_frequency;
|
|
1249
|
+
}
|
|
1250
|
+
|
|
1251
|
+
sub _calculate_statistics_counts {
|
|
1252
|
+
my $self = shift;
|
|
1253
|
+
my ($stats) = @_;
|
|
1254
|
+
|
|
1255
|
+
my @AAS = ($self->amino_acids, '-');
|
|
1256
|
+
my @chosen = @{ $stats->{chosen} };
|
|
1257
|
+
|
|
1258
|
+
my (@counts);
|
|
1259
|
+
|
|
1260
|
+
for my $p ( @{ $stats->{distribution} } ) {
|
|
1261
|
+
my %counts = map { ($_ => 0) } @AAS;
|
|
1262
|
+
|
|
1263
|
+
$counts{$_}++ for @$p[@chosen];
|
|
1264
|
+
push @counts, \%counts;
|
|
1265
|
+
}
|
|
1266
|
+
|
|
1267
|
+
$stats->{counts} = \@counts;
|
|
1268
|
+
}
|
|
1269
|
+
|
|
1270
|
+
sub _calculate_statistics_scores {
|
|
1271
|
+
my $self = shift;
|
|
1272
|
+
my ( $stats ) = @_;
|
|
1273
|
+
|
|
1274
|
+
my $qmatrix = $self->_matrix;
|
|
1275
|
+
my @AAS = $self->amino_acids;
|
|
1276
|
+
|
|
1277
|
+
# Precalculate the scores as weights
|
|
1278
|
+
my @weights;
|
|
1279
|
+
for my $pos ( 0 .. $stats->{length} - 1 ) {
|
|
1280
|
+
my $p = $stats->{distribution}[$pos];
|
|
1281
|
+
|
|
1282
|
+
my %weights;
|
|
1283
|
+
|
|
1284
|
+
for my $aai ( @AAS ) {
|
|
1285
|
+
my $score = 0;
|
|
1286
|
+
|
|
1287
|
+
for my $aaj ( @AAS ) {
|
|
1288
|
+
if ( ! defined $stats->{counts}[$pos]{$aaj} ) {
|
|
1289
|
+
die "Can't find $aaj at $pos of counts";
|
|
1290
|
+
}
|
|
1291
|
+
$score += $qmatrix->{$aai}{$aaj} * $stats->{counts}[$pos]{$aaj};
|
|
1292
|
+
}
|
|
1293
|
+
|
|
1294
|
+
# Remove self comparison
|
|
1295
|
+
$score -= $qmatrix->{$aai}{$aai};
|
|
1296
|
+
|
|
1297
|
+
$weights{$aai} = $score;
|
|
1298
|
+
}
|
|
1299
|
+
my ($smin, $smax) = minmax values %weights;
|
|
1300
|
+
|
|
1301
|
+
# Normalize the scores so they are mapped like this:
|
|
1302
|
+
# (-Inf,0,Inf) -> (0 , 0.5 , 1)
|
|
1303
|
+
for ( values %weights ) {
|
|
1304
|
+
when ( $_ < 0 ) { $_ = (1 - $_/$smin) / 2 }
|
|
1305
|
+
when ( $smax > 0 ) { $_ = 0.5 + ($_/$smax) / 2 }
|
|
1306
|
+
}
|
|
1307
|
+
|
|
1308
|
+
push @weights, \%weights;
|
|
1309
|
+
}
|
|
1310
|
+
|
|
1311
|
+
# Map the weights to the scores
|
|
1312
|
+
my @scores;
|
|
1313
|
+
for my $pos ( 0 .. $#weights ) {
|
|
1314
|
+
for my $seqi ( 0 .. $stats->{totseq}-1 ) {
|
|
1315
|
+
my $c = $stats->{distribution}[$pos][$seqi];
|
|
1316
|
+
next if $c eq '-';
|
|
1317
|
+
|
|
1318
|
+
$scores[$pos][$seqi] = $weights[$pos]{$c};
|
|
1319
|
+
}
|
|
1320
|
+
}
|
|
1321
|
+
|
|
1322
|
+
$stats->{scores} = \@scores;
|
|
1323
|
+
}
|
|
1324
|
+
|
|
1325
|
+
#### UTILITY FUNCTIONS ####
|
|
1326
|
+
|
|
1327
|
+
sub _group_columns {
|
|
1328
|
+
my $self = shift;
|
|
1329
|
+
my $columns = shift;
|
|
1330
|
+
|
|
1331
|
+
my @columns = sort { $a <=> $b } uniq @$columns;
|
|
1332
|
+
my @groups;
|
|
1333
|
+
my $current_group = [];
|
|
1334
|
+
for ( @columns ) {
|
|
1335
|
+
if ( !@$current_group ) {
|
|
1336
|
+
$current_group = [$_,$_];
|
|
1337
|
+
}
|
|
1338
|
+
elsif ( $current_group->[1] + 1 == $_ ) {
|
|
1339
|
+
$current_group->[1] = $_;
|
|
1340
|
+
}
|
|
1341
|
+
else {
|
|
1342
|
+
push @groups, $current_group;
|
|
1343
|
+
$current_group = [$_,$_]
|
|
1344
|
+
}
|
|
1345
|
+
}
|
|
1346
|
+
push @groups, $current_group;
|
|
1347
|
+
return \@groups;
|
|
1348
|
+
}
|
|
1349
|
+
|
|
1350
|
+
sub _convert_threshold {
|
|
1351
|
+
my $self = shift;
|
|
1352
|
+
my ( $value, $nseq ) = @_;
|
|
1353
|
+
|
|
1354
|
+
my $ret;
|
|
1355
|
+
for ( $value ) {
|
|
1356
|
+
when (/^(\d+\.?\d*)\%$/) { $ret = $nseq * $1 / 100 }
|
|
1357
|
+
when (/^(\d+)$/) { $ret = $value }
|
|
1358
|
+
when (/^(\d*\.\d+)$/) { $ret = $nseq * $1 }
|
|
1359
|
+
default { die "Can't interpret threshold: $value\n" }
|
|
1360
|
+
}
|
|
1361
|
+
|
|
1362
|
+
if ( $ret > $nseq ) {
|
|
1363
|
+
die "Threshold can't be larger than nseq $value ($ret) > $nseq\n";
|
|
1364
|
+
}
|
|
1365
|
+
|
|
1366
|
+
return $ret;
|
|
1367
|
+
}
|
|
1368
|
+
|
|
1369
|
+
sub _gradient {
|
|
1370
|
+
map { [ map hex, /(..)(..)(..)/ ] } @_;
|
|
1371
|
+
}
|
|
1372
|
+
|
|
1373
|
+
sub _jalview_gradient {
|
|
1374
|
+
# Generated in R with hsv(2/3, seq(1,101)/101, 1)
|
|
1375
|
+
_gradient qw(
|
|
1376
|
+
FCFCFF FAFAFF F7F7FF F5F5FF F2F2FF F0F0FF EDEDFF EBEBFF E8E8FF E6E6FF
|
|
1377
|
+
E3E3FF E1E1FF DEDEFF DCDCFF D9D9FF D7D7FF D4D4FF D2D2FF CFCFFF CDCDFF
|
|
1378
|
+
CACAFF C7C7FF C5C5FF C2C2FF C0C0FF BDBDFF BBBBFF B8B8FF B6B6FF B3B3FF
|
|
1379
|
+
B1B1FF AEAEFF ACACFF A9A9FF A7A7FF A4A4FF A2A2FF 9F9FFF 9D9DFF 9A9AFF
|
|
1380
|
+
9797FF 9595FF 9292FF 9090FF 8D8DFF 8B8BFF 8888FF 8686FF 8383FF 8181FF
|
|
1381
|
+
7E7EFF 7C7CFF 7979FF 7777FF 7474FF 7272FF 6F6FFF 6D6DFF 6A6AFF 6868FF
|
|
1382
|
+
6565FF 6262FF 6060FF 5D5DFF 5B5BFF 5858FF 5656FF 5353FF 5151FF 4E4EFF
|
|
1383
|
+
4C4CFF 4949FF 4747FF 4444FF 4242FF 3F3FFF 3D3DFF 3A3AFF 3838FF 3535FF
|
|
1384
|
+
3232FF 3030FF 2D2DFF 2B2BFF 2828FF 2626FF 2323FF 2121FF 1E1EFF 1C1CFF
|
|
1385
|
+
1919FF 1717FF 1414FF 1212FF 0F0FFF 0D0DFF 0A0AFF 0808FF 0505FF 0303FF
|
|
1386
|
+
0000FF
|
|
1387
|
+
)
|
|
1388
|
+
}
|
|
1389
|
+
|
|
1390
|
+
sub _rainbow_gradient {
|
|
1391
|
+
# Generated in R with rainbow(101, s=.6, v=.9, start=0, end=1/3)
|
|
1392
|
+
_gradient qw(
|
|
1393
|
+
E65C5C E65F5C E6615C E6645C E6675C E66A5C E66C5C E66F5C E6725C E6755C
|
|
1394
|
+
E6775C E67A5C E67D5C E6805C E6825C E6855C E6885C E68B5C E68D5C E6905C
|
|
1395
|
+
E6935C E6965C E6985C E69B5C E69E5C E6A15C E6A35C E6A65C E6A95C E6AC5C
|
|
1396
|
+
E6AE5C E6B15C E6B45C E6B75C E6B95C E6BC5C E6BF5C E6C25C E6C45C E6C75C
|
|
1397
|
+
E6CA5C E6CD5C E6CF5C E6D25C E6D55C E6D85C E6DA5C E6DD5C E6E05C E6E35C
|
|
1398
|
+
E6E65C E3E65C E0E65C DDE65C DAE65C D8E65C D5E65C D2E65C CFE65C CDE65C
|
|
1399
|
+
CAE65C C7E65C C4E65C C2E65C BFE65C BCE65C B9E65C B7E65C B4E65C B1E65C
|
|
1400
|
+
AEE65C ACE65C A9E65C A6E65C A3E65C A1E65C 9EE65C 9BE65C 98E65C 96E65C
|
|
1401
|
+
93E65C 90E65C 8DE65C 8BE65C 88E65C 85E65C 82E65C 80E65C 7DE65C 7AE65C
|
|
1402
|
+
77E65C 75E65C 72E65C 6FE65C 6CE65C 6AE65C 67E65C 64E65C 61E65C 5FE65C
|
|
1403
|
+
5CE65C
|
|
1404
|
+
)
|
|
1405
|
+
}
|
|
1406
|
+
|
|
1407
|
+
|
|
1408
|
+
__PACKAGE__->new_with_options->run unless caller;
|
|
1409
|
+
|
|
1410
|
+
|
|
1411
|
+
=head1 AUTHOR
|
|
1412
|
+
|
|
1413
|
+
Johan Viklund <johan.viklund@gmail.com>, Github: viklund
|
|
1414
|
+
|
|
1415
|
+
=head1 LICENSE
|
|
1416
|
+
|
|
1417
|
+
Copyright 2018 Johan Viklund
|
|
1418
|
+
|
|
1419
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy of
|
|
1420
|
+
this software and associated documentation files (the "Software"), to deal in
|
|
1421
|
+
the Software without restriction, including without limitation the rights to
|
|
1422
|
+
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
|
|
1423
|
+
of the Software, and to permit persons to whom the Software is furnished to do
|
|
1424
|
+
so, subject to the following conditions:
|
|
1425
|
+
|
|
1426
|
+
The above copyright notice and this permission notice shall be included in all
|
|
1427
|
+
copies or substantial portions of the Software.
|
|
1428
|
+
|
|
1429
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
1430
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
1431
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
1432
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
1433
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
1434
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
1435
|
+
SOFTWARE.
|
|
1436
|
+
|
|
1437
|
+
=cut
|
|
1438
|
+
|
|
1439
|
+
### Don't change anything below this line unless you know how it's used
|
|
1440
|
+
__DATA__
|
|
1441
|
+
# Matrix made by matblas from blosum62.iij
|
|
1442
|
+
# * column uses minimum score
|
|
1443
|
+
# BLOSUM Clustered Scoring Matrix in 1/2 Bit Units
|
|
1444
|
+
# Blocks Database = /data/blocks_5.0/blocks.dat
|
|
1445
|
+
# Cluster Percentage: >= 62
|
|
1446
|
+
# Entropy = 0.6979, Expected = -0.5209
|
|
1447
|
+
A R N D C Q E G H I L K M F P S T W Y V B Z X *
|
|
1448
|
+
A 4 -1 -2 -2 0 -1 -1 0 -2 -1 -1 -1 -1 -2 -1 1 0 -3 -2 0 -2 -1 -1 -4
|
|
1449
|
+
R -1 5 0 -2 -3 1 0 -2 0 -3 -2 2 -1 -3 -2 -1 -1 -3 -2 -3 -1 0 -1 -4
|
|
1450
|
+
N -2 0 6 1 -3 0 0 0 1 -3 -3 0 -2 -3 -2 1 0 -4 -2 -3 3 0 -1 -4
|
|
1451
|
+
D -2 -2 1 6 -3 0 2 -1 -1 -3 -4 -1 -3 -3 -1 0 -1 -4 -3 -3 4 1 -1 -4
|
|
1452
|
+
C 0 -3 -3 -3 9 -3 -4 -3 -3 -1 -1 -3 -1 -2 -3 -1 -1 -2 -2 -1 -3 -3 -1 -4
|
|
1453
|
+
Q -1 1 0 0 -3 5 2 -2 0 -3 -2 1 0 -3 -1 0 -1 -2 -1 -2 0 3 -1 -4
|
|
1454
|
+
E -1 0 0 2 -4 2 5 -2 0 -3 -3 1 -2 -3 -1 0 -1 -3 -2 -2 1 4 -1 -4
|
|
1455
|
+
G 0 -2 0 -1 -3 -2 -2 6 -2 -4 -4 -2 -3 -3 -2 0 -2 -2 -3 -3 -1 -2 -1 -4
|
|
1456
|
+
H -2 0 1 -1 -3 0 0 -2 8 -3 -3 -1 -2 -1 -2 -1 -2 -2 2 -3 0 0 -1 -4
|
|
1457
|
+
I -1 -3 -3 -3 -1 -3 -3 -4 -3 4 2 -3 1 0 -3 -2 -1 -3 -1 3 -3 -3 -1 -4
|
|
1458
|
+
L -1 -2 -3 -4 -1 -2 -3 -4 -3 2 4 -2 2 0 -3 -2 -1 -2 -1 1 -4 -3 -1 -4
|
|
1459
|
+
K -1 2 0 -1 -3 1 1 -2 -1 -3 -2 5 -1 -3 -1 0 -1 -3 -2 -2 0 1 -1 -4
|
|
1460
|
+
M -1 -1 -2 -3 -1 0 -2 -3 -2 1 2 -1 5 0 -2 -1 -1 -1 -1 1 -3 -1 -1 -4
|
|
1461
|
+
F -2 -3 -3 -3 -2 -3 -3 -3 -1 0 0 -3 0 6 -4 -2 -2 1 3 -1 -3 -3 -1 -4
|
|
1462
|
+
P -1 -2 -2 -1 -3 -1 -1 -2 -2 -3 -3 -1 -2 -4 7 -1 -1 -4 -3 -2 -2 -1 -1 -4
|
|
1463
|
+
S 1 -1 1 0 -1 0 0 0 -1 -2 -2 0 -1 -2 -1 4 1 -3 -2 -2 0 0 -1 -4
|
|
1464
|
+
T 0 -1 0 -1 -1 -1 -1 -2 -2 -1 -1 -1 -1 -2 -1 1 5 -2 -2 0 -1 -1 -1 -4
|
|
1465
|
+
W -3 -3 -4 -4 -2 -2 -3 -2 -2 -3 -2 -3 -1 1 -4 -3 -2 11 2 -3 -4 -3 -1 -4
|
|
1466
|
+
Y -2 -2 -2 -3 -2 -1 -2 -3 2 -1 -1 -2 -1 3 -3 -2 -2 2 7 -1 -3 -2 -1 -4
|
|
1467
|
+
V 0 -3 -3 -3 -1 -2 -2 -3 -3 3 1 -2 1 -1 -2 -2 0 -3 -1 4 -3 -2 -1 -4
|
|
1468
|
+
B -2 -1 3 4 -3 0 1 -1 0 -3 -4 0 -3 -3 -2 0 -1 -4 -3 -3 4 1 -1 -4
|
|
1469
|
+
Z -1 0 0 1 -3 3 4 -2 0 -3 -3 1 -1 -3 -1 0 -1 -3 -2 -2 1 4 -1 -4
|
|
1470
|
+
X -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -4
|
|
1471
|
+
* -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 1
|