treesak 1.53.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (131) hide show
  1. TreeSAK/ALE.py +63 -0
  2. TreeSAK/ALE1.py +268 -0
  3. TreeSAK/ALE2.py +168 -0
  4. TreeSAK/ALE2RTC.py +30 -0
  5. TreeSAK/ALE3.py +205 -0
  6. TreeSAK/ALE4.py +636 -0
  7. TreeSAK/ALE5.py +210 -0
  8. TreeSAK/ALE6.py +401 -0
  9. TreeSAK/ALE7.py +126 -0
  10. TreeSAK/ALE_backup.py +1081 -0
  11. TreeSAK/AssessCVG.py +128 -0
  12. TreeSAK/AssessMarker.py +306 -0
  13. TreeSAK/AssessMarkerDeltaLL.py +257 -0
  14. TreeSAK/AssessMarkerPA.py +317 -0
  15. TreeSAK/AssessPB.py +113 -0
  16. TreeSAK/BMGE.jar +0 -0
  17. TreeSAK/BMGE.py +49 -0
  18. TreeSAK/C60SR4.nex +127 -0
  19. TreeSAK/CompareMCMC.py +138 -0
  20. TreeSAK/ConcateMSA.py +111 -0
  21. TreeSAK/ConvertMSA.py +135 -0
  22. TreeSAK/Dir.rb +82 -0
  23. TreeSAK/ExtractMarkerSeq.py +263 -0
  24. TreeSAK/FastRoot.py +1175 -0
  25. TreeSAK/FastRoot_backup.py +1122 -0
  26. TreeSAK/FigTree.py +34 -0
  27. TreeSAK/GTDB_tree.py +76 -0
  28. TreeSAK/GeneTree.py +142 -0
  29. TreeSAK/KEGG_Luo17.py +807 -0
  30. TreeSAK/LcaToLeaves.py +66 -0
  31. TreeSAK/MarkerRef2Tree.py +616 -0
  32. TreeSAK/MarkerRef2Tree_backup.py +628 -0
  33. TreeSAK/MarkerSeq2Tree.py +299 -0
  34. TreeSAK/MarkerSeq2Tree_backup.py +259 -0
  35. TreeSAK/ModifyTopo.py +116 -0
  36. TreeSAK/Newick_tree_plotter.py +79 -0
  37. TreeSAK/OMA.py +170 -0
  38. TreeSAK/OMA2.py +212 -0
  39. TreeSAK/OneLineAln.py +50 -0
  40. TreeSAK/PB.py +155 -0
  41. TreeSAK/PMSF.py +115 -0
  42. TreeSAK/PhyloBiAssoc.R +84 -0
  43. TreeSAK/PhyloBiAssoc.py +167 -0
  44. TreeSAK/PlotMCMC.py +41 -0
  45. TreeSAK/PlotMcmcNode.py +152 -0
  46. TreeSAK/PlotMcmcNode_old.py +252 -0
  47. TreeSAK/RootTree.py +101 -0
  48. TreeSAK/RootTreeGTDB.py +371 -0
  49. TreeSAK/RootTreeGTDB214.py +288 -0
  50. TreeSAK/RootTreeGTDB220.py +300 -0
  51. TreeSAK/SequentialDating.py +16 -0
  52. TreeSAK/SingleAleHGT.py +157 -0
  53. TreeSAK/SingleLinePhy.py +50 -0
  54. TreeSAK/SliceMSA.py +142 -0
  55. TreeSAK/SplitScore.py +21 -0
  56. TreeSAK/SplitScore1.py +177 -0
  57. TreeSAK/SplitScore1OMA.py +148 -0
  58. TreeSAK/SplitScore2.py +608 -0
  59. TreeSAK/TaxaCountStats.R +256 -0
  60. TreeSAK/TaxonTree.py +47 -0
  61. TreeSAK/TreeSAK_config.py +32 -0
  62. TreeSAK/VERSION +164 -0
  63. TreeSAK/VisHPD95.R +45 -0
  64. TreeSAK/VisHPD95.py +200 -0
  65. TreeSAK/__init__.py +0 -0
  66. TreeSAK/ale_parser.py +74 -0
  67. TreeSAK/ale_splitter.py +63 -0
  68. TreeSAK/alignment_pruner.pl +1471 -0
  69. TreeSAK/assessOG.py +45 -0
  70. TreeSAK/batch_itol.py +171 -0
  71. TreeSAK/catfasta2phy.py +140 -0
  72. TreeSAK/cogTree.py +185 -0
  73. TreeSAK/compare_trees.R +30 -0
  74. TreeSAK/compare_trees.py +255 -0
  75. TreeSAK/dating.py +264 -0
  76. TreeSAK/dating_ss.py +361 -0
  77. TreeSAK/deltall.py +82 -0
  78. TreeSAK/do_rrtc.rb +464 -0
  79. TreeSAK/fa2phy.py +42 -0
  80. TreeSAK/filter_rename_ar53.py +118 -0
  81. TreeSAK/format_leaf_name.py +70 -0
  82. TreeSAK/gap_stats.py +38 -0
  83. TreeSAK/get_SCG_tree.py +742 -0
  84. TreeSAK/get_arCOG_seq.py +97 -0
  85. TreeSAK/global_functions.py +222 -0
  86. TreeSAK/gnm_leaves.py +43 -0
  87. TreeSAK/iTOL.py +791 -0
  88. TreeSAK/iTOL_gene_tree.py +80 -0
  89. TreeSAK/itol_msa_stats.py +56 -0
  90. TreeSAK/keep_highest_rrtc.py +37 -0
  91. TreeSAK/koTree.py +194 -0
  92. TreeSAK/label_gene_tree_by_gnm.py +34 -0
  93. TreeSAK/label_tree.R +75 -0
  94. TreeSAK/label_tree.py +121 -0
  95. TreeSAK/mad.py +708 -0
  96. TreeSAK/mcmc2tree.py +58 -0
  97. TreeSAK/mcmcTC copy.py +92 -0
  98. TreeSAK/mcmcTC.py +104 -0
  99. TreeSAK/mcmctree_vs_reltime.R +44 -0
  100. TreeSAK/mcmctree_vs_reltime.py +252 -0
  101. TreeSAK/merge_pdf.py +32 -0
  102. TreeSAK/pRTC.py +56 -0
  103. TreeSAK/parse_mcmctree.py +198 -0
  104. TreeSAK/parse_reltime.py +141 -0
  105. TreeSAK/phy2fa.py +37 -0
  106. TreeSAK/plot_distruibution_th.py +165 -0
  107. TreeSAK/prep_mcmctree_ctl.py +92 -0
  108. TreeSAK/print_leaves.py +32 -0
  109. TreeSAK/pruneMSA.py +63 -0
  110. TreeSAK/recode.py +73 -0
  111. TreeSAK/remove_bias.R +112 -0
  112. TreeSAK/rename_leaves.py +78 -0
  113. TreeSAK/replace_clade.py +55 -0
  114. TreeSAK/root_with_out_group.py +84 -0
  115. TreeSAK/run_TaxaCountStats_R_s1.py +455 -0
  116. TreeSAK/subsample_drep_gnms.py +74 -0
  117. TreeSAK/subset.py +69 -0
  118. TreeSAK/subset_tree_stupid_old_way.py +193 -0
  119. TreeSAK/supertree.py +330 -0
  120. TreeSAK/tmp_1.py +19 -0
  121. TreeSAK/tmp_2.py +19 -0
  122. TreeSAK/tmp_3.py +120 -0
  123. TreeSAK/tmp_4.py +43 -0
  124. TreeSAK/tmp_5.py +12 -0
  125. TreeSAK/weighted_rand.rb +23 -0
  126. treesak-1.53.3.data/scripts/TreeSAK +955 -0
  127. treesak-1.53.3.dist-info/LICENSE +674 -0
  128. treesak-1.53.3.dist-info/METADATA +27 -0
  129. treesak-1.53.3.dist-info/RECORD +131 -0
  130. treesak-1.53.3.dist-info/WHEEL +5 -0
  131. treesak-1.53.3.dist-info/top_level.txt +1 -0
@@ -0,0 +1,1471 @@
1
+ package alignment_pruner;
2
+
3
+ =head1 NAME
4
+
5
+ alingment_pruner.pl
6
+
7
+ =head1 SYNOPSIS
8
+
9
+ alignment_pruner.pl --file alignment.fasta --gap_threshold 10 > pruned.fasta
10
+
11
+ =head1 DESCRIPTION
12
+
13
+ alignment_pruner.pl removes unconserved or gappy columns from an alignment
14
+ according to criteria specified by the user.
15
+
16
+ The chi2 and overview functions currently only work for amino-acids.
17
+
18
+ =head1 OPTIONS
19
+
20
+ =cut
21
+
22
+ use Moose;
23
+ use MooseX::Types::Path::Class;
24
+ with 'MooseX::Getopt';
25
+
26
+ use feature ':5.10';
27
+
28
+ use List::MoreUtils qw( uniq minmax );
29
+ use List::Util qw( max sum );
30
+ use Bio::AlignIO;
31
+ use Bio::Matrix::IO;
32
+ use Bio::Tools::CodonTable;
33
+ use GD;
34
+ use Statistics::Descriptive;
35
+ no warnings 'experimental';
36
+ #use Benchmark qw/ :all :hireswallclock /;
37
+ #use Data::Dump qw( dd );
38
+
39
+ =head2 --file <file>
40
+
41
+ The file that contains the alignment.
42
+
43
+ =cut
44
+
45
+ has 'file' => (
46
+ is => 'ro',
47
+ isa => 'Path::Class::File',
48
+ required => 1,
49
+ coerce => 1,
50
+ documentation => 'File containing alignment',
51
+ );
52
+
53
+ =head2 --format <format>
54
+
55
+ The alignment format, default is fasta. Can use any format supported by
56
+ bioperl.
57
+
58
+ =cut
59
+
60
+ has 'format' => (
61
+ is => 'ro',
62
+ isa => 'Str',
63
+ required => '',
64
+ default => 'fasta',
65
+ documentation => 'Alignment format, default is fasta',
66
+ );
67
+
68
+ =head2 --subset <subset>
69
+
70
+ Specify a subset of the sequences that all calculations should be based on.
71
+ This is usefull for only taking the ingroup into account for example. The
72
+ argument has to be a regular expression that matches the names of the sequences
73
+ to include.
74
+
75
+ --subset '^INGROUP'
76
+
77
+ --subset '^(Snuffe|Snuffa|Peter)$'
78
+
79
+ =cut
80
+
81
+ has 'subset' => (
82
+ is => 'ro',
83
+ isa => 'Str',
84
+ required => '',
85
+ default => '',
86
+ documentation => 'The subset of sequences to be used for the analysis, argument is a regexp that should match the id of the sequences',
87
+ );
88
+
89
+ =head2 --chi2_test
90
+
91
+ This will run a chi2 test on you alignment and show the result as a table.
92
+ The table contains 4 columns:
93
+
94
+ 1. The number of the sequence in the alignment
95
+ 2. The name of the sequence
96
+ 3. The chi2 statistic
97
+ 4. A star if it is significantly deviating
98
+
99
+ Note that if you have specified a subset, the expected distribution of
100
+ aminoacids is based on this set, but the script will calculate the chi2
101
+ test on all sequences.
102
+
103
+ =cut
104
+
105
+ has 'chi2_test' => (
106
+ is => 'ro',
107
+ isa => 'Bool',
108
+ required => '',
109
+ default => '',
110
+ documentation => 'Show chi2 statistics for the taxa of the alignment',
111
+ );
112
+
113
+ =head2 --bowker_symmetry_test
114
+
115
+ This will run bowkers symmetry test on you alignment and show the result as a
116
+ table.
117
+
118
+ =cut
119
+
120
+ has 'bowker_symmetry_test' => (
121
+ is => 'ro',
122
+ isa => 'Bool',
123
+ required => '',
124
+ default => '',
125
+ documentation => 'Run the Bowker symmetry test',
126
+ );
127
+
128
+ =head2 --aminogc
129
+
130
+ Calculate the aminoGC content of the different taxa.
131
+
132
+ =cut
133
+
134
+ has 'aminogc' => (
135
+ is => 'ro',
136
+ isa => 'Bool',
137
+ required => '',
138
+ default => '',
139
+ documentation => 'Calculate aminogc',
140
+ );
141
+
142
+ =head2 --chi2_remove_taxa
143
+
144
+ This will run a chi2 test on you alignment and remove all taxa that are
145
+ significantly deviating.
146
+
147
+ =cut
148
+
149
+ has 'chi2_remove_taxa' => (
150
+ is => 'ro',
151
+ isa => 'Bool',
152
+ required => '',
153
+ default => '',
154
+ documentation => 'Remove taxa that fail the chi2 test',
155
+ );
156
+
157
+ =head2 --remove_columns <columns>
158
+
159
+ Specify a list of columns to remove from the alignment. The first column is
160
+ column 0. It should be a comma separated string of ranges. Example:
161
+
162
+ 0-5,22-30
163
+
164
+ =cut
165
+
166
+ has 'remove_columns' => (
167
+ is => 'ro',
168
+ isa => 'Str',
169
+ required => '',
170
+ default => '',
171
+ documentation => 'Remove columns, specified by a commaseparated string (4-10,88,100-140)',
172
+ );
173
+
174
+ =head2 --gap_threshold <threshold>
175
+
176
+ The threshold used for removing gapped positions, either the maximal number of
177
+ sequences that are allowed to have gaps or the fraction of sequences allowed to
178
+ have gaps, either as percent or as a number between 0 and 1. Examples:
179
+
180
+ --gap_threshold 10 # remove columns with gaps in more than 10 sequences.
181
+
182
+ --gap_threshold 10% # remove columns with gaps in more than 10% of the sequences.
183
+
184
+ --gap_threshold .1 # remove columns with gaps in more than 10% of the sequences.
185
+
186
+ =cut
187
+
188
+ has 'gap_threshold' => (
189
+ is => 'ro',
190
+ isa => 'Str',
191
+ required => '',
192
+ default => '',
193
+ documentation => 'The threshold used for removing gapped positions, either a number specifying the minimal number of sequences or %'
194
+ );
195
+
196
+ =head2 --conserved_threshold <threshold>
197
+
198
+ The threshold used for removing unconserved positions. Conservation is
199
+ calculated as the number of times the most frequent aminoacid appears in an
200
+ alignment column. Specify in the same way as for the --gap_threshold. This
201
+ option is probably not that useful. Example:
202
+
203
+ --conserved_threshold 10
204
+
205
+ --conserved_threshold 10%
206
+
207
+ --conserved_threshold .1
208
+
209
+ =cut
210
+
211
+ has 'conserved_threshold' => (
212
+ is => 'ro',
213
+ isa => 'Str',
214
+ required => '',
215
+ default => '',
216
+ documentation => 'The threshold used for removing unconserved positions, same as gap_threshold',
217
+ );
218
+
219
+ =head2 --chi2_prune <half|n#|f#|min|plot>
220
+
221
+ Use the chi2 statistic to choose columns to prune. This will first order
222
+ all of the columns by the chi2 statistic by comparing the chi2
223
+ statistic for the alignment with and without each column. Then the option
224
+ specifies how to remove columns:
225
+
226
+ half Remove half of the sites (starting with the most biased).
227
+ f# Remove sites until # fraction of sites remains, half can be
228
+ specified as f0.5
229
+ n# Remove sites until only # number of sequences show significant bias.
230
+ min Remove sites until a minimum of sequences show significant bias.
231
+ plot Will print statistics to the screen suitable for plotting. It will
232
+ contain 4 columns: idx, number of biased sequences, chi2 delta for
233
+ this column and the names of the biased sequences.
234
+
235
+ =cut
236
+
237
+
238
+ has 'chi2_prune' => (
239
+ is => 'ro',
240
+ isa => 'Str',
241
+ required => '',
242
+ default => '',
243
+ documentation => 'Use the chi2 statistic to prune sites of the alignment, values are Half, n<number>, f<number>, min or Plot.',
244
+ );
245
+
246
+ =head2 --generate_overview
247
+
248
+ With this option you will get an overview image showing your alignment and the
249
+ effect of your pruning settings. The filename will be "infile.png". The colors
250
+ are based on BLOSUM62 score spanning from green for really good to red for
251
+ really bad, yellow means 0. Removed columns will be overlayed with gray and if
252
+ you have specified a subset that will be overlayed with white. Try it to see.
253
+
254
+ =cut
255
+
256
+ has 'generate_overview' => (
257
+ is => 'ro',
258
+ isa => 'Bool',
259
+ required => '',
260
+ default => '',
261
+ documentation => 'Generate an overview image of the alignment',
262
+ );
263
+
264
+ =head2 --inverse
265
+
266
+ Inverse the pruning, that is remove the columns that otherwise would be kept.
267
+
268
+ =cut
269
+
270
+ has 'inverse' => (
271
+ is => 'ro',
272
+ isa => 'Bool',
273
+ required => '',
274
+ default => '',
275
+ documentation => 'Inverse the pruning',
276
+ );
277
+
278
+ =head2 --gap_treatment <ignore|mean|additional>
279
+
280
+ How to treat gaps in the calculations:
281
+
282
+ ignore Just ignore them, default
283
+ mean Add 1/20 on all the other AAs for each gap in a sequence.
284
+ additional As an additional state (not implemented)
285
+
286
+ =cut
287
+
288
+ has 'gap_treatment' => (
289
+ is => 'ro',
290
+ isa => 'Str',
291
+ required => '',
292
+ default => 'ignore',
293
+ documentation => 'How to treat gaps when calculating conservation and chi2 statistics. Ignore (default), Mean: 1/20 of all other AAs or Additional: additional state',
294
+ );
295
+
296
+ =head2 --aa_significance_level <number>
297
+
298
+ The significance level used for the chi2 test. The only cases you'd want to
299
+ change this is probably when running the chi2 pruning. The default is
300
+ 30.143527. Can be generated in R with qchisq(0.05, 19, lower.tail=FALSE).
301
+
302
+ =cut
303
+
304
+ has aa_significance_level => (
305
+ is => 'ro',
306
+ isa => 'Num',
307
+ default => 30.143527,
308
+ documentation => 'The chi2 significance level for amino-acids, default 30.143527',
309
+ );
310
+
311
+ =head2 --dna_significance_level <number>
312
+
313
+ Same as above but for dna instead. The default is 7.814728. Can be generated in
314
+ R with qchisq(0.05, 3, lower.tail=FALSE).
315
+
316
+ =cut
317
+
318
+ has dna_significance_level => (
319
+ is => 'ro',
320
+ isa => 'Num',
321
+ default => 7.814728,
322
+ documentation => 'The chi2 significance level for nucleotides, default 7.814728',
323
+ );
324
+
325
+ =head2 --recode <recoding scheme>
326
+
327
+ Recoding scheme for recoding if the alignment, currently only dayhoff4 recoding
328
+ is supported. Schemes:
329
+
330
+ dayhoff4: A=AGPST T=DENQ C=HKR G=FYWILMV -=C
331
+ dayhoff6: 1=AGPST 2=DENQ 3=HKR 4=FYW 5=ILMV 6=C
332
+ hp: 1=ACFGILMVW 2=DEHKNPQRSTY
333
+
334
+ =cut
335
+
336
+ has recode => (
337
+ is => 'ro',
338
+ isa => 'Str',
339
+ default => '',
340
+ documentation => 'Recoding scheme, dayhoff4',
341
+ );
342
+
343
+
344
+ #### END OF OPTIONS ####
345
+
346
+ has _matrix => (
347
+ is => 'ro',
348
+ isa => 'HashRef',
349
+ lazy_build => 1,
350
+ traits => ['Hash'],
351
+ handles => {
352
+ _amino_acids => 'keys',
353
+ }
354
+ );
355
+
356
+ sub _build__matrix {
357
+ my $self = shift;
358
+
359
+ # The matrix is stored in the DATA-block at the end of this file
360
+ my $matrix = Bio::Matrix::IO->new(
361
+ -fh => \*DATA,
362
+ -format => 'scoring'
363
+ )->next_matrix;
364
+
365
+ # Using the matrix directly takes too much time, so we make cache it.
366
+ my %qmatrix;
367
+ my @AAS = grep /[A-Z]/, $matrix->row_names;
368
+ for my $ni ( @AAS ) {
369
+ for my $nj ( @AAS ) {
370
+ $qmatrix{$ni}{$nj} = $matrix->entry($ni,$nj);
371
+ }
372
+ }
373
+ return \%qmatrix;
374
+ }
375
+
376
+
377
+ #### MAIN PROGRAM ####
378
+
379
+ sub run {
380
+ my $self = shift;
381
+ my $alnio = Bio::AlignIO->new(
382
+ -fh => $self->file->openr,
383
+ -format => $self->format,
384
+ );
385
+ $Bio::Root::Root::DEBUG = -1;
386
+ while (my $aln = $alnio->next_aln) {
387
+ $self->prune( $aln );
388
+ }
389
+ }
390
+
391
+ sub prune {
392
+ my $self = shift;
393
+ my ( $aln ) = @_;
394
+
395
+ my $stats = $self->calculate_statistics( $aln );
396
+ my $nseq = $stats->{nseq};
397
+
398
+ if ($self->recode) {
399
+ $self->run_recoding( $aln );
400
+ return;
401
+ }
402
+
403
+ if ( $self->chi2_test ) {
404
+ $self->run_chi2_test( $stats );
405
+ $self->show_chi2_test( $stats );
406
+ return;
407
+ }
408
+
409
+ if ( $self->bowker_symmetry_test ) {
410
+ $self->run_bowker_symmetry_test( $stats );
411
+ return;
412
+ }
413
+
414
+ if ( $self->aminogc ) {
415
+ $self->run_aminogc( $stats );
416
+ return;
417
+ }
418
+
419
+ if ( $self->chi2_remove_taxa ) {
420
+ $stats = $self->run_chi2_remove_taxa( $aln, $stats );
421
+ }
422
+
423
+ my $MAX_COL = $aln->length - 1;
424
+ my @columns_to_remove;
425
+
426
+ ## chi2 removal
427
+ if ( $self->chi2_prune ) {
428
+ if ( $self->gap_threshold || $self->conserved_threshold ) {
429
+ die "Can't specify both chi2_prune and any of gap_threshold or conserved_threshold\n";
430
+ }
431
+ # Note that $stats will change after this call, so it's not that usable
432
+ # after this call.
433
+ push @columns_to_remove, $self->run_chi2_prune( $stats );
434
+ }
435
+
436
+ ## Gap removal
437
+ if ( $self->gap_threshold ) {
438
+ my $threshold = $self->_convert_threshold( $self->gap_threshold, $nseq );
439
+ for my $pos ( 0 .. $MAX_COL ) {
440
+ next if $stats->{counts}[$pos]{'-'} <= $threshold;
441
+ push @columns_to_remove, $pos;
442
+ }
443
+ }
444
+
445
+ ## Unconserved removal
446
+ if ( $self->conserved_threshold ) {
447
+ my $threshold = $self->_convert_threshold( $self->conserved_threshold, $nseq );
448
+ for my $pos ( 0 .. $MAX_COL ) {
449
+ my %counts = %{ $stats->{counts}[$pos] };
450
+ delete $counts{'-'};
451
+ my $best = max values %counts;
452
+ if ( $best < $threshold ) {
453
+ push @columns_to_remove, $pos;
454
+ }
455
+ }
456
+ }
457
+
458
+ ## Specific columns removal
459
+ if ( $self->remove_columns ne '' ) {
460
+ my @columns = split /,/, $self->remove_columns;
461
+ for my $c ( @columns ) {
462
+ if ( $c =~ /(\d+)(?:-|\.\.?)(\d+)/ ) {
463
+ push @columns_to_remove, $1 .. $2;
464
+ }
465
+ else {
466
+ push @columns_to_remove, $c;
467
+ }
468
+ }
469
+ }
470
+
471
+ if ( $self->inverse ) {
472
+ my %remove = map { ( $_, 1 ) } @columns_to_remove;
473
+ @columns_to_remove =
474
+ grep { ! exists $remove{$_} } 0 .. $aln->length - 1;
475
+ }
476
+
477
+ if ( $self->generate_overview ) {
478
+ $self->run_overview_window( $stats, \@columns_to_remove );
479
+ }
480
+
481
+ # if ( @columns_to_remove || $self->chi2_remove_taxa ) {
482
+ $aln = $self->run_pruning( $aln, \@columns_to_remove ) if @columns_to_remove;
483
+ $self->write_aln( $aln );
484
+ # }
485
+ }
486
+
487
+ sub write_aln {
488
+ my $self = shift;
489
+ my ($aln) = @_;
490
+ $aln->set_displayname_flat;
491
+
492
+ my $IO = Bio::AlignIO->new(
493
+ -fh => \*STDOUT,
494
+ -format => $self->format,
495
+ );
496
+
497
+ $IO->write_aln( $aln );
498
+ }
499
+
500
+ sub run_recoding {
501
+ my $self = shift;
502
+ my ($aln) = @_;
503
+
504
+ my %schemes = (
505
+ 'dayhoff4' => 'A=AGPST,T=DENQ,C=HKR,G=FYWILMV,-=C',
506
+ 'dayhoff6' => '1=AGPST,2=DENQ,3=HKR,4=FYW,5=ILMV,6=C',
507
+ 'hp' => '1=ACFGILMVW,2=DEHKNPQRSTY',
508
+ );
509
+ if (! exists $schemes{ $self->recode }) {
510
+ die "Unrecognized recoding scheme\n";
511
+ }
512
+
513
+ my %scheme = (
514
+ '-' => '-',
515
+ 'X' => 'X',
516
+ );
517
+ for my $class ( split ',', $schemes{$self->recode}) {
518
+ my ($l,undef,@letters) = split //, $class;
519
+ $scheme{$_} = $l for @letters;
520
+ }
521
+
522
+ my @new_seqs;
523
+ for my $seq ( $aln->each_seq ) {
524
+ my $s = $seq->seq;
525
+ $s =~ s{(.)}{$scheme{$1} // die "Invalid alignment character: $1\n"}ge;
526
+ push @new_seqs, Bio::LocatableSeq->new(
527
+ -id => $seq->id,
528
+ -seq => $s,
529
+ );
530
+ }
531
+
532
+ $self->write_aln(Bio::SimpleAlign->new(-seqs => \@new_seqs));
533
+ }
534
+
535
+ sub run_pruning {
536
+ my $self = shift;
537
+ my ( $aln, $columns ) = @_;
538
+
539
+ return $aln if $columns->[0] == -1;
540
+
541
+ my $groups = $self->_group_columns( $columns );
542
+ $aln = $aln->remove_columns( $_ ) for reverse @$groups;
543
+
544
+ return $aln;
545
+ }
546
+
547
+ sub run_aminogc {
548
+ my $self = shift;
549
+ my ($stats) = @_;
550
+
551
+ my $codon_table = Bio::Tools::CodonTable->new( -id => 11 );
552
+ my @AAS = $self->amino_acids;
553
+
554
+ my %chosen;
555
+ for my $AA ( @AAS ) {
556
+ my $codons = join '', map { /^(..)/ } $codon_table->revtranslate( $AA );
557
+ if ( $codons =~ /t|a/i ) {
558
+ $chosen{$AA} = '';
559
+ }
560
+ else {
561
+ $chosen{$AA} = 1;
562
+ }
563
+ }
564
+
565
+ my @counts = @{ $stats->{seqcounts} };
566
+ my @names = @{ $stats->{names} };
567
+
568
+ my @aminogc;
569
+ for my $seqi ( 0 .. $#names ) {
570
+ my $aminogc;
571
+ my $tot;
572
+
573
+ for my $AA ( keys %{ $counts[$seqi] } ) {
574
+ next if $AA eq '-';
575
+ if ( $chosen{$AA} ) {
576
+ $aminogc += $counts[$seqi]{$AA};
577
+ }
578
+ $tot += $counts[$seqi]{$AA}
579
+ }
580
+
581
+ $aminogc /= $tot;
582
+ push @aminogc, $aminogc;
583
+
584
+ printf "%-30s %6.4f\n", $names[$seqi], $aminogc;
585
+ }
586
+
587
+ say '---';
588
+
589
+ my $mean = sum( @aminogc ) / @aminogc;
590
+ my $var = sum( map { ($_ - $mean)**2 } @aminogc ) / @aminogc;
591
+
592
+ printf "%-30s %6.4f\n", 'MEAN', $mean;
593
+ printf "%-30s %6.4f\n", 'STD.DEV.', sqrt($var);
594
+ }
595
+
596
+ sub run_bowker_symmetry_test {
597
+ my $self = shift;
598
+ my ($stats) = @_;
599
+
600
+ my $distrib = $stats->{distribution};
601
+ my $length = $stats->{length};
602
+ my $nseq = $stats->{totseq};
603
+ my @AAS = $self->amino_acids;
604
+
605
+ my @names = @{ $stats->{names} };
606
+
607
+ my $significance_cutoff = 223.1602;
608
+
609
+ my %sign_counts;
610
+
611
+ for my $seqi ( 0 .. $nseq - 2 ) {
612
+ for my $seqj ( $seqi+1 .. $nseq - 1 ) {
613
+ my $matrix;
614
+ for my $pos ( 0 .. $length - 1 ) {
615
+ my $aai = $distrib->[$pos][$seqi];
616
+ my $aaj = $distrib->[$pos][$seqj];
617
+ $matrix->{$aai}{$aaj}++
618
+ }
619
+ my $stat = 0;
620
+ for my $ii ( 0 .. $#AAS - 1 ) {
621
+ for my $jj ( $ii+1 .. $#AAS ) {
622
+ my $ai = $AAS[$ii];
623
+ my $aj = $AAS[$jj];
624
+ my $nij = $matrix->{$ai}{$aj} // 0;
625
+ my $nji = $matrix->{$aj}{$ai} // 0;
626
+ next unless $nij || $nji;
627
+ $stat += ( $nij - $nji )**2 / ($nij + $nji);
628
+ }
629
+ }
630
+
631
+ printf "%-8.8s %-8.8s %3d %3d %7.2f",
632
+ @names[$seqi,$seqj],
633
+ $seqi, $seqj,
634
+ $stat;
635
+ if ( $stat > $significance_cutoff ) {
636
+ $sign_counts{$names[$seqi]}++;
637
+ $sign_counts{$names[$seqj]}++;
638
+ print ' *';
639
+ }
640
+ print "\n";
641
+ }
642
+ }
643
+ }
644
+
645
+ #### OVERVIEW WINDOW GENERATION ####
646
+
647
+ sub run_overview_window {
648
+ my $self = shift;
649
+ my ($stats, $remove) = @_;
650
+
651
+ $self->_calculate_statistics_scores( $stats );
652
+
653
+ my $distrib = $stats->{distribution};
654
+ my $scores = $stats->{scores};
655
+ my $nseq = $stats->{totseq};
656
+
657
+ my $AA_WIDTH = 4;
658
+ my $AA_HEIGHT = 6;
659
+
660
+ my $width = $AA_WIDTH * $stats->{length};
661
+ my $height = $AA_HEIGHT * $nseq;
662
+
663
+ my $gd = GD::Image->new( $width, $height, 1, );
664
+
665
+ ## Cache all colors
666
+ my @COLORS = map {
667
+ $gd->colorAllocate( @$_ )
668
+ } _rainbow_gradient();
669
+ my $white = $gd->colorAllocate( 255,255,255 );
670
+
671
+ ## Build the image
672
+ for my $pos ( 0 .. $stats->{length} - 1 ) {
673
+ for my $seqi ( 0 .. $nseq - 1 ) {
674
+ my $aa = $distrib->[$pos][$seqi];
675
+ my $color = $white;
676
+ if ( $aa ne '-' ) {
677
+ my $score = $scores->[$pos][$seqi];
678
+ my $idx = int(100*$score);
679
+ die "Too large $pos,$seqi $idx,$score" if $idx > 100;
680
+ die "Too small $pos,$seqi $idx,$score" if $idx < 0;
681
+ $color = $COLORS[ $idx ];
682
+ }
683
+
684
+ $gd->filledRectangle(
685
+ $pos * $AA_WIDTH,
686
+ $seqi * $AA_HEIGHT,
687
+ ($pos+1) * $AA_WIDTH,
688
+ ($seqi+1) * $AA_HEIGHT - 1,
689
+ $color,
690
+ );
691
+ }
692
+ }
693
+
694
+ ## Add remove overlay
695
+ if ( @$remove ) {
696
+ my $groups = $self->_group_columns( $remove );
697
+
698
+ my $color = $gd->colorAllocateAlpha(0,0,0,90);
699
+
700
+ for my $group ( @$groups ) {
701
+ my ($start,$end) = @$group;
702
+ $end++;
703
+
704
+ $gd->filledRectangle(
705
+ $start * $AA_WIDTH,
706
+ 0,
707
+ $end * $AA_WIDTH,
708
+ $height,
709
+ $color,
710
+ );
711
+ }
712
+ }
713
+
714
+ ## Add subset overlay
715
+ if ( $self->subset ) {
716
+ my %chosen = map { ( $_ => 1 ) } @{ $stats->{chosen} };
717
+ my $color = $gd->colorAllocateAlpha(255,255,255,90);
718
+ for my $c ( 0 .. $nseq - 1 ) {
719
+ next if $chosen{$c};
720
+
721
+ $gd->filledRectangle(
722
+ 0,
723
+ $c * $AA_HEIGHT,
724
+ $width,
725
+ ($c+1) * $AA_HEIGHT - 1,
726
+ $color,
727
+ );
728
+ }
729
+ }
730
+
731
+ my $outfile = $self->file . '.png';
732
+ say STDERR "Saving png to $outfile";
733
+ open my $OUT, '>', $outfile or die;
734
+ print $OUT $gd->png;
735
+ close $OUT;
736
+ }
737
+
738
+
739
+ #### CHISQUARE METHODS ####
740
+
741
+ sub run_chi2_prune {
742
+ my $self = shift;
743
+ my ( $stats ) = @_;
744
+
745
+ my $length = $stats->{length};
746
+ my @columns_to_remove;
747
+ given ( $self->chi2_prune ) {
748
+ when ( /^h/i ) {
749
+ my @order = $self->_chi2_prune_order( $stats );
750
+ push @columns_to_remove, @order[ 0 .. $#order * .5];
751
+ }
752
+ when ( /^f(\d+\.?\d*)/i ) {
753
+ my $num = $self->_convert_threshold( $1, $length );
754
+ my @order = $self->_chi2_prune_order( $stats );
755
+ push @columns_to_remove, @order[ 0 .. ($num-1) ];
756
+ }
757
+ when ( /^n(\d+\.?\d*)/i ) {
758
+ my $num = $self->_convert_threshold( $1, $length );
759
+ push @columns_to_remove,
760
+ $self->_run_chi2_prune_threshold( $stats, $num );
761
+ }
762
+ when ( /^min/i ) {
763
+ push @columns_to_remove,
764
+ $self->_run_chi2_prune_threshold( $stats, 0, 1 );
765
+ }
766
+ when ( /^plot/i ) {
767
+ $self->_plot_chi2_prune( $stats );
768
+ exit;
769
+ }
770
+ default {
771
+ die "chi2 pruning of type $_ not recongnized\n";
772
+ }
773
+ }
774
+ return @columns_to_remove;
775
+ }
776
+
777
+ sub run_chi2_test {
778
+ my $self = shift;
779
+ my ($stats) = @_;
780
+
781
+ my @AAS = $self->amino_acids;
782
+ my @chosen = @{ $stats->{chosen} };
783
+ my @O = @{ $stats->{seqcounts} };
784
+ my @l = @{ $stats->{seqlength} };
785
+ my %chosen = map { ($_,1) } @chosen;
786
+
787
+ my ( %E );
788
+
789
+ for my $seqi ( @chosen ) {
790
+ for my $AA ( @AAS ) {
791
+ $E{$AA} += $O[$seqi]{$AA};
792
+ }
793
+ }
794
+
795
+ my $L = sum @l;
796
+ $_ /= $L for values %E;
797
+
798
+ my @test;
799
+
800
+ my $tot = 0;
801
+ for my $seqi ( 0 .. $#O ) {
802
+ my $chi2 = 0;
803
+
804
+ for my $AA ( @AAS ) {
805
+ next if $E{$AA} == 0;
806
+ my $E = $E{$AA} * $l[$seqi];
807
+ $chi2 += ( $O[$seqi]{$AA} - $E )**2 / $E;
808
+ }
809
+
810
+ push @test, $chi2;
811
+ $tot += $chi2 if $chosen{$seqi};
812
+ }
813
+
814
+ push @test, $tot;
815
+ $stats->{chi2} = \@test;
816
+ return $tot;
817
+ }
818
+
819
+ sub show_chi2_test {
820
+ my $self = shift;
821
+ my ($stats) = @_;
822
+
823
+ my $SIGN_LEVEL = $self->aa_significance_level;
824
+
825
+ my @names = @{ $stats->{names} };
826
+ my $chi2 = $stats->{chi2};
827
+
828
+ for my $seqi ( 0 .. $#names ) {
829
+ my $sign = $chi2->[$seqi] > $SIGN_LEVEL ? '*' : '';
830
+
831
+ printf "%4d %-30.30s %7.2f %1s\n",
832
+ $seqi, $names[$seqi], $chi2->[$seqi], $sign;
833
+ }
834
+
835
+ printf " %-30.30s %7.2f\n",
836
+ 'TOTAL', $chi2->[-1];
837
+
838
+ my $stat = Statistics::Descriptive::Sparse->new;
839
+ $stat->add_data( @$chi2[ 0 .. $#$chi2-1 ] );
840
+
841
+ printf " %-30.30s %7.2f\n",
842
+ 'Std.Dev.', $stat->standard_deviation;
843
+ }
844
+
845
+ sub run_chi2_remove_taxa {
846
+ my $self = shift;
847
+ my ($aln, $stats) = @_;
848
+
849
+ my $SIGN_LEVEL = $self->aa_significance_level;
850
+
851
+ if ( ! exists $stats->{chi2} ) {
852
+ $self->run_chi2_test( $stats );
853
+ }
854
+
855
+ my @names = @{ $stats->{names} };
856
+ my $chi2 = $stats->{chi2};
857
+
858
+ my %remove_seqs;
859
+
860
+ for my $seqi ( 0 .. $#names ) {
861
+ if ( $chi2->[$seqi] > $SIGN_LEVEL ) {
862
+ $remove_seqs{ $names[$seqi] }++;
863
+ }
864
+ }
865
+
866
+ for my $seq ( $aln->each_seq ) {
867
+ if ( exists $remove_seqs{ $seq->id } ) {
868
+ $aln->remove_seq( $seq );
869
+ }
870
+ }
871
+
872
+ my $new_stats = $self->calculate_statistics( $aln );
873
+
874
+ return $new_stats;
875
+ }
876
+
877
+ sub _chi2_test_minus {
878
+ my $self = shift;
879
+ my ($stats, $columns, $inplace) = @_;
880
+
881
+ # The inplace flag signifies whether to modify the $stats hash inplace or
882
+ # create a new one.
883
+
884
+ my $new_stats = {};
885
+
886
+ if ( $inplace ) {
887
+ $new_stats = $stats;
888
+ }
889
+ else {
890
+ $new_stats->{$_} = $stats->{$_} for keys %$stats;
891
+ }
892
+
893
+ if ( ref($columns) ne 'ARRAY' ) {
894
+ $columns = [$columns];
895
+ }
896
+
897
+ my @distrib = @{ $stats->{distribution} };
898
+ my @AAS = $self->amino_acids;
899
+
900
+ my @O = @{ $stats->{seqcounts} };
901
+ my @l = @{ $stats->{seqlength} };
902
+
903
+ my @chosen = @{ $stats->{chosen} };
904
+ my %chosen = map { ($_, 1) } @chosen;
905
+
906
+ my (@Oc, @lc, %Ec);
907
+
908
+ for my $seqi ( 0 .. $#O ) {
909
+ $lc[$seqi] = $l[$seqi];
910
+
911
+ for my $AA ( @AAS ) {
912
+ $Oc[$seqi]{$AA} = $O[$seqi]{$AA};
913
+ $Ec{$AA} += $O[$seqi]{$AA} if $chosen{$seqi};
914
+ }
915
+
916
+ for my $c ( @$columns ) {
917
+ my $aa = $distrib[$c][$seqi];
918
+ if ( $aa ne '-' ) {
919
+ $Oc[$seqi]{$aa} -= 1;
920
+ $lc[$seqi] -= 1;
921
+ $Ec{$aa} -= 1 if $chosen{ $seqi };
922
+ }
923
+ }
924
+ }
925
+
926
+ $new_stats->{seqcounts} = \@Oc;
927
+ $new_stats->{seqlength} = \@lc;
928
+
929
+ my $L = sum values %Ec;
930
+
931
+ if ( $L == 0 ) {
932
+ warn "Nothing left of alignment\n";
933
+ return $new_stats;
934
+ }
935
+
936
+ $_ /= $L for values %Ec;
937
+
938
+ my @chi2_test;
939
+
940
+ my $tot;
941
+ for my $seqi ( @chosen ) {
942
+ my $chi2 = 0;
943
+
944
+ for my $AA ( @AAS ) {
945
+ my $E = $Ec{$AA}*$lc[$seqi];
946
+ next if $E == 0;
947
+ $chi2 += ( $Oc[$seqi]{$AA} - $E )**2 / $E;
948
+ }
949
+
950
+ $tot += $chi2;
951
+ push @chi2_test, $chi2;
952
+ }
953
+ push @chi2_test, $tot;
954
+
955
+ $new_stats->{chi2} = \@chi2_test;
956
+ }
957
+
958
+ sub _chi2_delta_exact {
959
+ my $self = shift;
960
+ my ( $stats, $c, $full_tot ) = @_;
961
+
962
+ my @AAS = $self->amino_acids;
963
+ my @distrib = @{ $stats->{distribution} };
964
+
965
+ my @O = @{ $stats->{seqcounts} };
966
+ my @l = @{ $stats->{seqlength} };
967
+
968
+ my @chosen = @{ $stats->{chosen} };
969
+
970
+ my (@Oc,%Ec,@lc);
971
+
972
+ for my $seqi ( @chosen ) {
973
+ $lc[$seqi] = $l[$seqi];
974
+
975
+ for my $AA ( @AAS ) {
976
+ $Oc[$seqi]{$AA} = $O[$seqi]{$AA};
977
+ $Ec{$AA} += $O[$seqi]{$AA};
978
+ }
979
+
980
+ my $aa = $distrib[$c][$seqi];
981
+ if ( $aa ne '-' ) {
982
+ $Oc[$seqi]{$aa} -= 1;
983
+ $lc[$seqi] -= 1;
984
+ $Ec{$aa} -= 1;
985
+ }
986
+ }
987
+
988
+ my $L = sum grep defined, @lc;
989
+ $_ /= $L for values %Ec;
990
+
991
+ my $tot;
992
+ for my $seqi ( @chosen ) {
993
+ my $chi2 = 0;
994
+
995
+ for my $AA ( @AAS ) {
996
+ next if $Ec{$AA} == 0;
997
+ my $E = $Ec{$AA}*$lc[$seqi];
998
+ $chi2 += ( $Oc[$seqi]{$AA} - $E )**2 / $E;
999
+ }
1000
+
1001
+ $tot += $chi2;
1002
+ }
1003
+ return $tot - $full_tot;
1004
+ }
1005
+
1006
+ sub _chi2_prune_order {
1007
+ my $self = shift;
1008
+ my ( $stats ) = @_;
1009
+
1010
+ # TODO: Lazy build...
1011
+ if ( ! $stats->{chi2} ) {
1012
+ $self->run_chi2_test( $stats );
1013
+ }
1014
+
1015
+ my $len = $stats->{length};
1016
+ my $chi2_sum = $stats->{chi2}[-1];
1017
+ my @pos_scores;
1018
+ for my $c ( 0 .. $len - 1 ) {
1019
+ push @pos_scores, $self->_chi2_delta_exact( $stats, $c, $chi2_sum );
1020
+ }
1021
+
1022
+ $stats->{chi2_deltas} = \@pos_scores;
1023
+
1024
+ my @order = sort { $pos_scores[$a] <=> $pos_scores[$b] } 0 .. $len - 1;
1025
+ return @order;
1026
+ }
1027
+
1028
+ sub _plot_chi2_prune {
1029
+ my $self = shift;
1030
+ my ( $stats ) = @_;
1031
+
1032
+ my @order = $self->_chi2_prune_order( $stats );
1033
+
1034
+ my $new_stats = {};
1035
+ $new_stats->{$_} = $stats->{$_} for keys %$stats;
1036
+ my $len = $stats->{length};
1037
+
1038
+ my $SIGN_LEVEL = $self->aa_significance_level;
1039
+ for my $idx_order ( 0 .. $len - 2 ) {
1040
+ $self->_chi2_test_minus( $new_stats, $order[$idx_order], 1 );
1041
+
1042
+ my @delta = @{ $new_stats->{chi2} };
1043
+ pop @delta; # Not interested in total
1044
+
1045
+ my @over = grep { $delta[$_] > $SIGN_LEVEL } 0 .. $#delta;
1046
+ my $num = grep { $_ > $SIGN_LEVEL } @delta;
1047
+ my $pos_delta = $stats->{chi2_deltas}[$order[$idx_order]];
1048
+
1049
+ printf "%5d %5d %5.2f (%s)\n", $idx_order, $num, $pos_delta,
1050
+ join(',', @{$stats->{names}}[@over]);
1051
+ }
1052
+ }
1053
+
1054
+ sub _run_chi2_prune_threshold {
1055
+ my $self = shift;
1056
+ my ( $stats, $stop_num, $MIN ) = @_;
1057
+
1058
+ $stop_num //= 0;
1059
+
1060
+ my @order = $self->_chi2_prune_order( $stats );
1061
+
1062
+ my $new_stats = {};
1063
+ $new_stats->{$_} = $stats->{$_} for keys %$stats;
1064
+
1065
+ my $len = $stats->{length};
1066
+
1067
+ my ($best,$best_idx);
1068
+
1069
+ my $SIGN_LEVEL = $self->aa_significance_level;
1070
+ for my $idx_order ( 0 .. $len - 2 ) {
1071
+ $self->_chi2_test_minus( $new_stats, $order[$idx_order], 1 );
1072
+
1073
+ my @delta = @{ $new_stats->{chi2} };
1074
+ pop @delta; # Not interested in total
1075
+
1076
+ my @over = grep { $delta[$_] > $SIGN_LEVEL } 0 .. $#delta;
1077
+ my $num = grep { $_ > $SIGN_LEVEL } @delta;
1078
+
1079
+ if ( $num <= $stop_num ) {
1080
+ return @order[ 0 .. $idx_order ];
1081
+ }
1082
+
1083
+ if ( !$best || $num < $best ) {
1084
+ ($best,$best_idx) = ($num, $idx_order);
1085
+ }
1086
+ }
1087
+
1088
+ if ( $MIN ) {
1089
+ print STDERR "Min is at $best_idx with $best\n";
1090
+ return @order[ 0 .. $best_idx ];
1091
+ }
1092
+
1093
+ die "It is not possible to get $stop_num or fewer sequences above significance threshold\n";
1094
+ }
1095
+
1096
+ ## Not in current option list
1097
+ sub _run_reorder_chi2 {
1098
+ my $self = shift;
1099
+ my ( $stats ) = @_;
1100
+
1101
+ my @order = $self->_chi2_prune_order( $stats );
1102
+
1103
+ my $nseq = $stats->{nseq};
1104
+ my @names = @{ $stats->{names} };
1105
+ my @distrib = @{ $stats->{distribution} };
1106
+
1107
+ my @new_seq_str;
1108
+ for my $c ( @order ) {
1109
+ for my $seqi ( 0 .. $nseq - 1 ) {
1110
+ $new_seq_str[$seqi] .= $distrib[$c][$seqi];
1111
+ }
1112
+ }
1113
+
1114
+ my @new_seq;
1115
+ for my $seqi ( 0 .. $nseq - 1 ) {
1116
+ push @new_seq, Bio::LocatableSeq->new(
1117
+ -id => $names[$seqi],
1118
+ -seq => $new_seq_str[$seqi],
1119
+ );
1120
+ }
1121
+
1122
+ return Bio::SimpleAlign->new(
1123
+ -seqs => \@new_seq,
1124
+ );
1125
+ }
1126
+
1127
+
1128
+ # TODO: Should be replaced with a different sub to return the alphabet of the
1129
+ # alignment instead, that way we can support dna quite easily.
1130
+
1131
+ sub amino_acids {
1132
+ my $self = shift;
1133
+ if ( $self->gap_treatment =~ /^a/i ) {
1134
+ return ( $self->_amino_acids, '-');
1135
+ }
1136
+ return $self->_amino_acids;
1137
+ }
1138
+
1139
+ #### STATISTICS ####
1140
+
1141
+ sub calculate_statistics {
1142
+ my $self = shift;
1143
+ my $aln = shift;
1144
+
1145
+ # TODO: This should be refactored into it's own class, but I don't have the
1146
+ # tuits...
1147
+ #
1148
+ # In the end we have the following datastructure.
1149
+ #$stats = {
1150
+ #### Indexed by column
1151
+ # scores Array of Arrays [pos][seqi] = Normalized Blosum62 score
1152
+ # distribution Array of Arrays [pos][seqi] = AA at pos in seqi
1153
+ # counts Array of Hashes [pos]{char} = n(AA) in column pos
1154
+ #
1155
+ #### Indexed by sequence
1156
+ # seqcounts Array of Hashes [seqi]{char} = n(AA) in seqi
1157
+ # frequency Array of Hashes [seqi]{char} = f(AA) in column sequence seqi, between 0 and 1
1158
+ # names Array [seqi] = name of seqi
1159
+ # seqlength Array [seqi] = l(seqi)
1160
+ #
1161
+ #### Other stuff
1162
+ # chosen Array [idx] = chosen seqi
1163
+ # nseq Integer = number of chosen seqs
1164
+ # totseq Integer = total number of seqs
1165
+ # length Integer = length of alignment
1166
+ # overall_freq Hash {char} = Overall frequency
1167
+ #};
1168
+
1169
+
1170
+ # Each of these methods builds upon the previous one.
1171
+
1172
+ my $stats = {};
1173
+ $self->_calculate_statistics_transpose( $stats, $aln );
1174
+ $self->_calculate_statistics_counts( $stats );
1175
+ # This one takes a lot of time, we defer it until it is needed
1176
+ #$self->_calculate_statistics_scores( $stats );
1177
+
1178
+ return $stats;
1179
+ }
1180
+
1181
+ sub _calculate_statistics_transpose {
1182
+ my $self = shift;
1183
+ my ($stats, $aln) = @_;
1184
+
1185
+ my $subset = $self->subset;
1186
+ my $totseq = $aln->num_sequences;
1187
+ my @AAS = $self->amino_acids;
1188
+
1189
+ my (@distrib, @names, @chosen, @seqcounts);
1190
+ my (@frequency, %overall_frequency, @length);
1191
+
1192
+ my $seqi=0;
1193
+ for my $seq ( $aln->each_seq ) {
1194
+ my $chosen = '';
1195
+ if ( !$subset || $seq->id =~ /$subset/ ) {
1196
+ $chosen++;
1197
+ push @chosen, $seqi;
1198
+ }
1199
+
1200
+ push @names, $seq->id;
1201
+
1202
+ my %seqcounts = map { ($_=>0) } ( @AAS, '-' );
1203
+ my @s = split //, $seq->seq;
1204
+ for my $pos ( 0 .. $#s ) {
1205
+ $seqcounts{$s[$pos]}++;
1206
+ push @{ $distrib[$pos] }, $s[$pos];
1207
+ }
1208
+
1209
+ given ( $self->gap_treatment ) {
1210
+ when (/^i/i) { delete $seqcounts{'-'} }
1211
+ when (/^m/i) {
1212
+ $seqcounts{$_} += $seqcounts{'-'}/20 for @AAS;
1213
+ delete $seqcounts{'-'};
1214
+ }
1215
+ when (/^a/i) {
1216
+ die "Treatment of gaps as an additional character is not implemented yet\n";
1217
+ }
1218
+ }
1219
+
1220
+ push @seqcounts, \%seqcounts;
1221
+
1222
+ if ( $chosen ) {
1223
+ $overall_frequency{$_} += $seqcounts{$_} for keys %seqcounts;
1224
+ }
1225
+
1226
+ my $sum = sum values %seqcounts;
1227
+ push @length, $sum;
1228
+
1229
+ my %seq_frequency = %seqcounts;
1230
+ $_ /= $sum for values %seq_frequency;
1231
+ push @frequency, \%seq_frequency;
1232
+
1233
+ $seqi++;
1234
+ }
1235
+
1236
+ my $sum = sum values %overall_frequency;
1237
+ $_ /= $sum for values %overall_frequency;
1238
+
1239
+ $stats->{distribution} = \@distrib;
1240
+ $stats->{chosen} = \@chosen;
1241
+ $stats->{names} = \@names;
1242
+ $stats->{totseq} = $totseq;
1243
+ $stats->{nseq} = scalar(@chosen);
1244
+ $stats->{length} = scalar(@distrib);
1245
+ $stats->{seqcounts} = \@seqcounts;
1246
+ $stats->{seqlength} = \@length;
1247
+ $stats->{frequency} = \@frequency;
1248
+ $stats->{overall_freq} = \%overall_frequency;
1249
+ }
1250
+
1251
+ sub _calculate_statistics_counts {
1252
+ my $self = shift;
1253
+ my ($stats) = @_;
1254
+
1255
+ my @AAS = ($self->amino_acids, '-');
1256
+ my @chosen = @{ $stats->{chosen} };
1257
+
1258
+ my (@counts);
1259
+
1260
+ for my $p ( @{ $stats->{distribution} } ) {
1261
+ my %counts = map { ($_ => 0) } @AAS;
1262
+
1263
+ $counts{$_}++ for @$p[@chosen];
1264
+ push @counts, \%counts;
1265
+ }
1266
+
1267
+ $stats->{counts} = \@counts;
1268
+ }
1269
+
1270
+ sub _calculate_statistics_scores {
1271
+ my $self = shift;
1272
+ my ( $stats ) = @_;
1273
+
1274
+ my $qmatrix = $self->_matrix;
1275
+ my @AAS = $self->amino_acids;
1276
+
1277
+ # Precalculate the scores as weights
1278
+ my @weights;
1279
+ for my $pos ( 0 .. $stats->{length} - 1 ) {
1280
+ my $p = $stats->{distribution}[$pos];
1281
+
1282
+ my %weights;
1283
+
1284
+ for my $aai ( @AAS ) {
1285
+ my $score = 0;
1286
+
1287
+ for my $aaj ( @AAS ) {
1288
+ if ( ! defined $stats->{counts}[$pos]{$aaj} ) {
1289
+ die "Can't find $aaj at $pos of counts";
1290
+ }
1291
+ $score += $qmatrix->{$aai}{$aaj} * $stats->{counts}[$pos]{$aaj};
1292
+ }
1293
+
1294
+ # Remove self comparison
1295
+ $score -= $qmatrix->{$aai}{$aai};
1296
+
1297
+ $weights{$aai} = $score;
1298
+ }
1299
+ my ($smin, $smax) = minmax values %weights;
1300
+
1301
+ # Normalize the scores so they are mapped like this:
1302
+ # (-Inf,0,Inf) -> (0 , 0.5 , 1)
1303
+ for ( values %weights ) {
1304
+ when ( $_ < 0 ) { $_ = (1 - $_/$smin) / 2 }
1305
+ when ( $smax > 0 ) { $_ = 0.5 + ($_/$smax) / 2 }
1306
+ }
1307
+
1308
+ push @weights, \%weights;
1309
+ }
1310
+
1311
+ # Map the weights to the scores
1312
+ my @scores;
1313
+ for my $pos ( 0 .. $#weights ) {
1314
+ for my $seqi ( 0 .. $stats->{totseq}-1 ) {
1315
+ my $c = $stats->{distribution}[$pos][$seqi];
1316
+ next if $c eq '-';
1317
+
1318
+ $scores[$pos][$seqi] = $weights[$pos]{$c};
1319
+ }
1320
+ }
1321
+
1322
+ $stats->{scores} = \@scores;
1323
+ }
1324
+
1325
+ #### UTILITY FUNCTIONS ####
1326
+
1327
+ sub _group_columns {
1328
+ my $self = shift;
1329
+ my $columns = shift;
1330
+
1331
+ my @columns = sort { $a <=> $b } uniq @$columns;
1332
+ my @groups;
1333
+ my $current_group = [];
1334
+ for ( @columns ) {
1335
+ if ( !@$current_group ) {
1336
+ $current_group = [$_,$_];
1337
+ }
1338
+ elsif ( $current_group->[1] + 1 == $_ ) {
1339
+ $current_group->[1] = $_;
1340
+ }
1341
+ else {
1342
+ push @groups, $current_group;
1343
+ $current_group = [$_,$_]
1344
+ }
1345
+ }
1346
+ push @groups, $current_group;
1347
+ return \@groups;
1348
+ }
1349
+
1350
+ sub _convert_threshold {
1351
+ my $self = shift;
1352
+ my ( $value, $nseq ) = @_;
1353
+
1354
+ my $ret;
1355
+ for ( $value ) {
1356
+ when (/^(\d+\.?\d*)\%$/) { $ret = $nseq * $1 / 100 }
1357
+ when (/^(\d+)$/) { $ret = $value }
1358
+ when (/^(\d*\.\d+)$/) { $ret = $nseq * $1 }
1359
+ default { die "Can't interpret threshold: $value\n" }
1360
+ }
1361
+
1362
+ if ( $ret > $nseq ) {
1363
+ die "Threshold can't be larger than nseq $value ($ret) > $nseq\n";
1364
+ }
1365
+
1366
+ return $ret;
1367
+ }
1368
+
1369
+ sub _gradient {
1370
+ map { [ map hex, /(..)(..)(..)/ ] } @_;
1371
+ }
1372
+
1373
+ sub _jalview_gradient {
1374
+ # Generated in R with hsv(2/3, seq(1,101)/101, 1)
1375
+ _gradient qw(
1376
+ FCFCFF FAFAFF F7F7FF F5F5FF F2F2FF F0F0FF EDEDFF EBEBFF E8E8FF E6E6FF
1377
+ E3E3FF E1E1FF DEDEFF DCDCFF D9D9FF D7D7FF D4D4FF D2D2FF CFCFFF CDCDFF
1378
+ CACAFF C7C7FF C5C5FF C2C2FF C0C0FF BDBDFF BBBBFF B8B8FF B6B6FF B3B3FF
1379
+ B1B1FF AEAEFF ACACFF A9A9FF A7A7FF A4A4FF A2A2FF 9F9FFF 9D9DFF 9A9AFF
1380
+ 9797FF 9595FF 9292FF 9090FF 8D8DFF 8B8BFF 8888FF 8686FF 8383FF 8181FF
1381
+ 7E7EFF 7C7CFF 7979FF 7777FF 7474FF 7272FF 6F6FFF 6D6DFF 6A6AFF 6868FF
1382
+ 6565FF 6262FF 6060FF 5D5DFF 5B5BFF 5858FF 5656FF 5353FF 5151FF 4E4EFF
1383
+ 4C4CFF 4949FF 4747FF 4444FF 4242FF 3F3FFF 3D3DFF 3A3AFF 3838FF 3535FF
1384
+ 3232FF 3030FF 2D2DFF 2B2BFF 2828FF 2626FF 2323FF 2121FF 1E1EFF 1C1CFF
1385
+ 1919FF 1717FF 1414FF 1212FF 0F0FFF 0D0DFF 0A0AFF 0808FF 0505FF 0303FF
1386
+ 0000FF
1387
+ )
1388
+ }
1389
+
1390
+ sub _rainbow_gradient {
1391
+ # Generated in R with rainbow(101, s=.6, v=.9, start=0, end=1/3)
1392
+ _gradient qw(
1393
+ E65C5C E65F5C E6615C E6645C E6675C E66A5C E66C5C E66F5C E6725C E6755C
1394
+ E6775C E67A5C E67D5C E6805C E6825C E6855C E6885C E68B5C E68D5C E6905C
1395
+ E6935C E6965C E6985C E69B5C E69E5C E6A15C E6A35C E6A65C E6A95C E6AC5C
1396
+ E6AE5C E6B15C E6B45C E6B75C E6B95C E6BC5C E6BF5C E6C25C E6C45C E6C75C
1397
+ E6CA5C E6CD5C E6CF5C E6D25C E6D55C E6D85C E6DA5C E6DD5C E6E05C E6E35C
1398
+ E6E65C E3E65C E0E65C DDE65C DAE65C D8E65C D5E65C D2E65C CFE65C CDE65C
1399
+ CAE65C C7E65C C4E65C C2E65C BFE65C BCE65C B9E65C B7E65C B4E65C B1E65C
1400
+ AEE65C ACE65C A9E65C A6E65C A3E65C A1E65C 9EE65C 9BE65C 98E65C 96E65C
1401
+ 93E65C 90E65C 8DE65C 8BE65C 88E65C 85E65C 82E65C 80E65C 7DE65C 7AE65C
1402
+ 77E65C 75E65C 72E65C 6FE65C 6CE65C 6AE65C 67E65C 64E65C 61E65C 5FE65C
1403
+ 5CE65C
1404
+ )
1405
+ }
1406
+
1407
+
1408
+ __PACKAGE__->new_with_options->run unless caller;
1409
+
1410
+
1411
+ =head1 AUTHOR
1412
+
1413
+ Johan Viklund <johan.viklund@gmail.com>, Github: viklund
1414
+
1415
+ =head1 LICENSE
1416
+
1417
+ Copyright 2018 Johan Viklund
1418
+
1419
+ Permission is hereby granted, free of charge, to any person obtaining a copy of
1420
+ this software and associated documentation files (the "Software"), to deal in
1421
+ the Software without restriction, including without limitation the rights to
1422
+ use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
1423
+ of the Software, and to permit persons to whom the Software is furnished to do
1424
+ so, subject to the following conditions:
1425
+
1426
+ The above copyright notice and this permission notice shall be included in all
1427
+ copies or substantial portions of the Software.
1428
+
1429
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
1430
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
1431
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
1432
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
1433
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
1434
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
1435
+ SOFTWARE.
1436
+
1437
+ =cut
1438
+
1439
+ ### Don't change anything below this line unless you know how it's used
1440
+ __DATA__
1441
+ # Matrix made by matblas from blosum62.iij
1442
+ # * column uses minimum score
1443
+ # BLOSUM Clustered Scoring Matrix in 1/2 Bit Units
1444
+ # Blocks Database = /data/blocks_5.0/blocks.dat
1445
+ # Cluster Percentage: >= 62
1446
+ # Entropy = 0.6979, Expected = -0.5209
1447
+ A R N D C Q E G H I L K M F P S T W Y V B Z X *
1448
+ A 4 -1 -2 -2 0 -1 -1 0 -2 -1 -1 -1 -1 -2 -1 1 0 -3 -2 0 -2 -1 -1 -4
1449
+ R -1 5 0 -2 -3 1 0 -2 0 -3 -2 2 -1 -3 -2 -1 -1 -3 -2 -3 -1 0 -1 -4
1450
+ N -2 0 6 1 -3 0 0 0 1 -3 -3 0 -2 -3 -2 1 0 -4 -2 -3 3 0 -1 -4
1451
+ D -2 -2 1 6 -3 0 2 -1 -1 -3 -4 -1 -3 -3 -1 0 -1 -4 -3 -3 4 1 -1 -4
1452
+ C 0 -3 -3 -3 9 -3 -4 -3 -3 -1 -1 -3 -1 -2 -3 -1 -1 -2 -2 -1 -3 -3 -1 -4
1453
+ Q -1 1 0 0 -3 5 2 -2 0 -3 -2 1 0 -3 -1 0 -1 -2 -1 -2 0 3 -1 -4
1454
+ E -1 0 0 2 -4 2 5 -2 0 -3 -3 1 -2 -3 -1 0 -1 -3 -2 -2 1 4 -1 -4
1455
+ G 0 -2 0 -1 -3 -2 -2 6 -2 -4 -4 -2 -3 -3 -2 0 -2 -2 -3 -3 -1 -2 -1 -4
1456
+ H -2 0 1 -1 -3 0 0 -2 8 -3 -3 -1 -2 -1 -2 -1 -2 -2 2 -3 0 0 -1 -4
1457
+ I -1 -3 -3 -3 -1 -3 -3 -4 -3 4 2 -3 1 0 -3 -2 -1 -3 -1 3 -3 -3 -1 -4
1458
+ L -1 -2 -3 -4 -1 -2 -3 -4 -3 2 4 -2 2 0 -3 -2 -1 -2 -1 1 -4 -3 -1 -4
1459
+ K -1 2 0 -1 -3 1 1 -2 -1 -3 -2 5 -1 -3 -1 0 -1 -3 -2 -2 0 1 -1 -4
1460
+ M -1 -1 -2 -3 -1 0 -2 -3 -2 1 2 -1 5 0 -2 -1 -1 -1 -1 1 -3 -1 -1 -4
1461
+ F -2 -3 -3 -3 -2 -3 -3 -3 -1 0 0 -3 0 6 -4 -2 -2 1 3 -1 -3 -3 -1 -4
1462
+ P -1 -2 -2 -1 -3 -1 -1 -2 -2 -3 -3 -1 -2 -4 7 -1 -1 -4 -3 -2 -2 -1 -1 -4
1463
+ S 1 -1 1 0 -1 0 0 0 -1 -2 -2 0 -1 -2 -1 4 1 -3 -2 -2 0 0 -1 -4
1464
+ T 0 -1 0 -1 -1 -1 -1 -2 -2 -1 -1 -1 -1 -2 -1 1 5 -2 -2 0 -1 -1 -1 -4
1465
+ W -3 -3 -4 -4 -2 -2 -3 -2 -2 -3 -2 -3 -1 1 -4 -3 -2 11 2 -3 -4 -3 -1 -4
1466
+ Y -2 -2 -2 -3 -2 -1 -2 -3 2 -1 -1 -2 -1 3 -3 -2 -2 2 7 -1 -3 -2 -1 -4
1467
+ V 0 -3 -3 -3 -1 -2 -2 -3 -3 3 1 -2 1 -1 -2 -2 0 -3 -1 4 -3 -2 -1 -4
1468
+ B -2 -1 3 4 -3 0 1 -1 0 -3 -4 0 -3 -3 -2 0 -1 -4 -3 -3 4 1 -1 -4
1469
+ Z -1 0 0 1 -3 3 4 -2 0 -3 -3 1 -1 -3 -1 0 -1 -3 -2 -2 1 4 -1 -4
1470
+ X -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -4
1471
+ * -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 1