phykit 2.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (69) hide show
  1. phykit/__init__.py +0 -0
  2. phykit/__main__.py +6 -0
  3. phykit/helpers/__init__.py +0 -0
  4. phykit/helpers/boolean_argument_parsing.py +12 -0
  5. phykit/helpers/caching.py +201 -0
  6. phykit/helpers/files.py +125 -0
  7. phykit/helpers/parallel.py +305 -0
  8. phykit/helpers/stats_summary.py +64 -0
  9. phykit/helpers/streaming.py +152 -0
  10. phykit/phykit.py +2862 -0
  11. phykit/services/__init__.py +0 -0
  12. phykit/services/alignment/__init__.py +17 -0
  13. phykit/services/alignment/alignment_length.py +16 -0
  14. phykit/services/alignment/alignment_length_no_gaps.py +69 -0
  15. phykit/services/alignment/alignment_recoding.py +89 -0
  16. phykit/services/alignment/base.py +103 -0
  17. phykit/services/alignment/column_score.py +66 -0
  18. phykit/services/alignment/compositional_bias_per_site.py +98 -0
  19. phykit/services/alignment/create_concatenation_matrix.py +254 -0
  20. phykit/services/alignment/dna_threader.py +145 -0
  21. phykit/services/alignment/evolutionary_rate_per_site.py +85 -0
  22. phykit/services/alignment/faidx.py +21 -0
  23. phykit/services/alignment/gc_content.py +94 -0
  24. phykit/services/alignment/pairwise_identity.py +159 -0
  25. phykit/services/alignment/parsimony_informative_sites.py +81 -0
  26. phykit/services/alignment/rcv.py +14 -0
  27. phykit/services/alignment/rcvt.py +47 -0
  28. phykit/services/alignment/rename_fasta_entries.py +53 -0
  29. phykit/services/alignment/sum_of_pairs_score.py +157 -0
  30. phykit/services/alignment/variable_sites.py +54 -0
  31. phykit/services/base.py +9 -0
  32. phykit/services/tree/__init__.py +29 -0
  33. phykit/services/tree/base.py +178 -0
  34. phykit/services/tree/bipartition_support_stats.py +48 -0
  35. phykit/services/tree/branch_length_multiplier.py +37 -0
  36. phykit/services/tree/collapse_branches.py +27 -0
  37. phykit/services/tree/covarying_evolutionary_rates.py +272 -0
  38. phykit/services/tree/dvmc.py +37 -0
  39. phykit/services/tree/evolutionary_rate.py +17 -0
  40. phykit/services/tree/hidden_paralogy_check.py +128 -0
  41. phykit/services/tree/internal_branch_stats.py +77 -0
  42. phykit/services/tree/internode_labeler.py +33 -0
  43. phykit/services/tree/last_common_ancestor_subtree.py +35 -0
  44. phykit/services/tree/lb_score.py +196 -0
  45. phykit/services/tree/monophyly_check.py +106 -0
  46. phykit/services/tree/nearest_neighbor_interchange.py +140 -0
  47. phykit/services/tree/patristic_distances.py +113 -0
  48. phykit/services/tree/polytomy_test.py +546 -0
  49. phykit/services/tree/print_tree.py +28 -0
  50. phykit/services/tree/prune_tree.py +40 -0
  51. phykit/services/tree/rename_tree_tips.py +64 -0
  52. phykit/services/tree/rf_distance.py +136 -0
  53. phykit/services/tree/root_tree.py +35 -0
  54. phykit/services/tree/saturation.py +209 -0
  55. phykit/services/tree/spurious_sequence.py +75 -0
  56. phykit/services/tree/terminal_branch_stats.py +87 -0
  57. phykit/services/tree/tip_labels.py +18 -0
  58. phykit/services/tree/tip_to_tip_distance.py +41 -0
  59. phykit/services/tree/tip_to_tip_node_distance.py +41 -0
  60. phykit/services/tree/total_tree_length.py +25 -0
  61. phykit/services/tree/treeness.py +16 -0
  62. phykit/services/tree/treeness_over_rcv.py +40 -0
  63. phykit/version.py +1 -0
  64. phykit-2.1.0.dist-info/METADATA +150 -0
  65. phykit-2.1.0.dist-info/RECORD +69 -0
  66. phykit-2.1.0.dist-info/WHEEL +5 -0
  67. phykit-2.1.0.dist-info/entry_points.txt +121 -0
  68. phykit-2.1.0.dist-info/licenses/LICENSE.md +7 -0
  69. phykit-2.1.0.dist-info/top_level.txt +1 -0
phykit/phykit.py ADDED
@@ -0,0 +1,2862 @@
1
+ #!/usr/bin/env python
2
+
3
+ import logging
4
+ import sys
5
+ import textwrap
6
+
7
+ from .version import __version__
8
+
9
+ from argparse import (
10
+ ArgumentParser,
11
+ SUPPRESS,
12
+ RawDescriptionHelpFormatter,
13
+ )
14
+
15
+ from .services.alignment import (
16
+ AlignmentLength,
17
+ AlignmentLengthNoGaps,
18
+ AlignmentRecoding,
19
+ ColumnScore,
20
+ CompositionalBiasPerSite,
21
+ CreateConcatenationMatrix,
22
+ DNAThreader,
23
+ EvolutionaryRatePerSite,
24
+ Faidx,
25
+ GCContent,
26
+ PairwiseIdentity,
27
+ ParsimonyInformative,
28
+ RelativeCompositionVariability,
29
+ RelativeCompositionVariabilityTaxon,
30
+ RenameFastaEntries,
31
+ SumOfPairsScore,
32
+ VariableSites,
33
+ )
34
+
35
+ from .services.tree import (
36
+ BipartitionSupportStats,
37
+ BranchLengthMultiplier,
38
+ CollapseBranches,
39
+ CovaryingEvolutionaryRates,
40
+ DVMC,
41
+ EvolutionaryRate,
42
+ HiddenParalogyCheck,
43
+ InternalBranchStats,
44
+ InternodeLabeler,
45
+ LastCommonAncestorSubtree,
46
+ LBScore,
47
+ MonophylyCheck,
48
+ NearestNeighborInterchange,
49
+ PatristicDistances,
50
+ PolytomyTest,
51
+ PrintTree,
52
+ PruneTree,
53
+ RenameTreeTips,
54
+ RobinsonFouldsDistance,
55
+ RootTree,
56
+ Saturation,
57
+ SpuriousSequence,
58
+ TerminalBranchStats,
59
+ TipLabels,
60
+ TipToTipDistance,
61
+ TipToTipNodeDistance,
62
+ TotalTreeLength,
63
+ Treeness,
64
+ TreenessOverRCV,
65
+ )
66
+
67
+ from .helpers.boolean_argument_parsing import str2bool
68
+
69
+
70
# Module-level logging setup: a stream handler (stderr unless configured
# otherwise) that passes INFO-and-above records, attached to this module's
# logger.  The logger's own level is left untouched.
ch = logging.StreamHandler()
ch.setLevel(logging.INFO)

logger = logging.getLogger(__name__)
logger.addHandler(ch)
74
+
75
# Banner (ASCII-art logo, version, citation and links) interpolated at the top
# of the sub-command help messages and printed for invalid commands.
#
# This is a *raw* f-string: the logo contains backslashes ("\|", "\_", "\ ")
# that are not valid escape sequences.  In a non-raw literal they raise a
# SyntaxWarning on Python 3.12+ (DeprecationWarning before that); the raw
# prefix keeps the exact same text while silencing the warning.
# ``{__version__}`` is still interpolated.
help_header = rf"""
                 _____  _           _  _______ _______
                |  __ \| |         | |/ /_   _|__   __|
                | |__) | |__  _   _| ' /  | |    | |
                |  ___/| '_ \| | | |  <   | |    | |
                | |    | | | | |_| | . \ _| |_   | |
                |_|    |_| |_|\__, |_|\_\_____|  |_|
                               __/ |
                              |___/

                Version: {__version__}
                Citation: Steenwyk et al. 2021, Bioinformatics. doi: 10.1093/bioinformatics/btab096
                Documentation link: https://jlsteenwyk.com/PhyKIT
                Publication link: https://academic.oup.com/bioinformatics/article-abstract/37/16/2325/6131675

"""
91
+
92
+
93
+ class Phykit(object):
94
# Class-level copy of the banner, read through ``self.help_header`` by the
# constructor's top-level help and by ``version``.
#
# Raw f-string: the logo's backslashes ("\|", "\_", "\ ") are not valid escape
# sequences and would otherwise trigger a SyntaxWarning on Python 3.12+.  The
# resulting text is unchanged and ``{__version__}`` is still interpolated.
help_header = rf"""
                 _____  _           _  _______ _______
                |  __ \| |         | |/ /_   _|__   __|
                | |__) | |__  _   _| ' /  | |    | |
                |  ___/| '_ \| | | |  <   | |    | |
                | |    | | | | |_| | . \ _| |_   | |
                |_|    |_| |_|\__, |_|\_\_____|  |_|
                               __/ |
                              |___/

                Version: {__version__}
                Citation: Steenwyk et al. 2021, Bioinformatics. doi: 10.1093/bioinformatics/btab096
                Documentation link: https://jlsteenwyk.com/PhyKIT
                Publication link: https://academic.oup.com/bioinformatics/article-abstract/37/16/2325/6131675

"""
110
+
111
def __init__(self):
    """Parse the command name from ``sys.argv`` and dispatch to its handler.

    Only ``sys.argv[1]`` is parsed here; everything after it
    (``sys.argv[2:]``) is forwarded untouched to the selected handler, which
    builds its own argument parser.

    Dispatch rules:
      * a public, callable attribute whose name equals the command is called
        as ``handler(argv)``;
      * anything else (aliases, ``version``, unknown names) goes to
        ``run_alias``, which exits with status 1 for unknown commands.

    Names starting with an underscore, non-callable attributes (for example
    ``help_header``) and the two methods that do not follow the
    ``handler(argv)`` convention (``version`` and ``run_alias``) are never
    called directly: doing so used to raise ``TypeError`` instead of
    producing the version banner or the "Invalid command" message.

    Raises:
        SystemExit: propagated from argparse or from the handlers; a
            ``NameError`` raised during dispatch is converted to exit
            status 2.
    """
    parser = ArgumentParser(
        add_help=True,
        usage=SUPPRESS,
        formatter_class=RawDescriptionHelpFormatter,
        description=textwrap.dedent(
            f"""\
                {self.help_header}

                PhyKIT helps process and analyze multiple sequence alignments and phylogenies.

                Generally, all functions are designed to help understand the contents of alignments
                (e.g., gc content or the number of parsimony informative sites) and the shape
                of trees (e.g., treeness, degree of violation of a molecular clock).

                Some help messages indicate that summary statistics are reported (e.g.,
                bipartition_support_stats). Summary statistics include mean, median, 25th percentile,
                75th percentile, minimum, maximum, standard deviation, and variance. These functions
                typically have a verbose option that allows users to get the underlying data
                used to calculate summary statistics.

                Usage: phykit <command> [optional command arguments]

                Command specific help messages can be viewed by adding a
                -h/--help argument after the command. For example, to see the
                help message for the command 'treeness', execute
                "phykit treeness -h" or "phykit treeness --help".

                Lastly, each function comes with aliases to save the user some
                key strokes. For example, to get the help message for the 'treeness'
                function, you can type "phykit tness -h". All aliases are specified
                in parentheses after the long form of the function name.

                Alignment-based commands
                ========================
                alignment_length (alias: aln_len; al)
                    - calculates alignment length
                alignment_length_no_gaps (alias: aln_len_no_gaps; alng)
                    - calculates alignment length after removing sites with gaps
                alignment_recoding (alias: aln_recoding, recode)
                    - recode alignments using reduced character schemes
                column_score (alias: cs)
                    - calculate column score between a reference and query alignment
                compositional_bias_per_site (alias: comp_bias_per_site; cbps)
                    - detects site-wise compositional biases in an alignment
                create_concatenation_matrix (alias: create_concat; cc)
                    - create concatenation matrix from a set of alignments
                evolutionary_rate_per_site (alias: evo_rate_per_site; erps)
                    - estimate evolutionary rate per site in an alignment
                faidx (alias: get_entry; ge)
                    - extract query fasta entry from multi-fasta file
                gc_content (alias: gc)
                    - calculate GC content of a fasta file or entries thereof
                pairwise_identity (alias: pairwise_id, pi)
                    - calculates average pairwise identity among sequences in
                      an alignment file. This is a proxy for evolutionary rate
                parsimony_informative_sites (alias: pis)
                    - calculates the number and percentage of parsimony
                      informative sites in an alignment
                relative_composition_variability (alias: rel_comp_var, rcv)
                    - calculates relative composition variability in an alignment
                relative_composition_variability_taxon (alias: rel_comp_var_taxon, rcvt)
                    - calculates relative composition variability of each taxa in an alignment
                rename_fasta_entries (alias: rename_fasta)
                    - rename entries in a fasta file
                sum_of_pairs_score (alias: sops; sop)
                    - calculate sum-of-pairs score between a reference and query alignment
                thread_dna (alias: pal2nal; p2n)
                    - thread dna sequences over a protein alignment
                variable_sites (alias: vs)
                    - calculates the number and percentage of variable sites
                      in an alignment

                Tree-based commands
                ===================
                bipartition_support_stats (alias: bss)
                    - calculates summary statistics for bipartition support
                branch_length_multiplier (alias: blm)
                    - multiply all branch lengths by a specified factor
                collapse_branches (alias: collapse; cb)
                    - collapses branches according to bipartition support
                covarying_evolutionary_rates (alias: cover)
                    - calculates correlation in the evolutionary rate of two trees
                degree_of_violation_of_a_molecular_clock (alias: dvmc)
                    - reports the degree of violation of the molecular clock
                evolutionary_rate (alias: evo_rate)
                    - reports a tree-based estimation of evolutionary rate for a gene
                hidden_paralogy_check (alias: clan_check)
                    - check for monophyly of specific clades of taxa
                internal_branch_stats (alias: ibs)
                    - calculates summary statistics for internal branch lengths
                internode_labeler (alias: il)
                    - create labels at internodes in a phylogeny
                last_common_ancestor_subtree (alias: lca_subtree)
                    - get last common ancestor of a set of taxa
                long_branch_score (alias: lb_score; lbs)
                    - calculates lb (long branch) score for taxa in a phylogeny
                monophyly_check (alias: is_monophyletic)
                    - determines if a set of tip names are monophyletic
                nearest_neighbor_interchange (alias: nni)
                    - make nearest neighbor interchange moves on a tree
                patristic_distances (alias: pd)
                    - calculate all pairwise distances between tips in a tree
                polytomy_test (alias: polyt_test; polyt; ptt)
                    - conducts a polytomy test using gene
                      support frequencies
                print_tree (alias: print; pt)
                    - prints ascii tree
                prune_tree (alias: prune)
                    - prune taxa from a phylogeny
                rename_tree_tips (alias: rename_tree; rename_tips)
                    - renames tips in a phylogeny according to a file with
                      the desired new tip names
                robinson_foulds_distance (alias: rf_distance; rf_dist; rf)
                    - calculates Robinson-Foulds distance between two trees
                root_tree (alias: root; rt)
                    - roots tree on user-specified taxa or taxon
                spurious_sequence (alias: spurious_seq; ss)
                    - identifies putatively spurious sequences by identifying
                      branch lengths that are atypically long
                terminal_branch_stats (alias: tbs)
                    - calculates summary statistics for terminal branch lengths
                tip_labels (alias: tree_labels; labels; tl)
                    - print leaf names in a phylogeny
                tip_to_tip_distance (alias: t2t_dist; t2t)
                    - calculate tip-to-tip distance in a phylogeny
                tip_to_tip_node_distance (alias: t2t_node_dist; t2t_nd)
                    - calculate tip-to-tip node distance in a phylogeny
                total_tree_length (alias: tree_len)
                    - calculates total tree length
                treeness (alias: tness)
                    - reports treeness or stemminess, a measure of signal-to-
                      noise ratio in a phylogeny

                Alignment- and tree-based commands
                ==================================
                saturation (alias: sat)
                    - calculates saturation by examining the slope of
                      patristic distance and uncorrected distances
                treeness_over_rcv (alias: toverr; tor)
                    - calculates treeness/rcv, treeness, and rcv
                """
        ),
    )
    parser.add_argument("command", help=SUPPRESS)
    # Only the command token is parsed here; the handler parses the rest.
    args = parser.parse_args(sys.argv[1:2])
    command = args.command
    argv = sys.argv[2:]

    try:
        handler = None
        # Long-form commands are public methods named after the command.
        # "version" and "run_alias" have different signatures, and private
        # or dunder names are never commands, so none of them may be called
        # as handler(argv).
        if not command.startswith("_") and command not in ("version", "run_alias"):
            handler = getattr(self, command, None)

        if callable(handler):
            handler(argv)
        else:
            # Not a long-form command: treat it as an alias (or report it
            # as invalid from within run_alias).
            self.run_alias(command, argv)
    except SystemExit:
        # Re-raise SystemExit as-is to preserve exit code
        raise
    except NameError:
        sys.exit(2)
269
+
270
+ ## Aliases
271
def run_alias(self, command, argv):
    """Resolve ``command`` as an alias and run the matching handler.

    ``version``/``v`` print the banner via ``self.version()``.  Every other
    known alias is looked up, in order, in the table below and the
    corresponding method is called with ``argv``; its return value is
    returned.  An unknown command prints the banner plus an error message
    and exits with status 1.
    """
    # version
    if command in ("version", "v"):
        return self.version()

    # (accepted spellings, name of the handler method), checked in order.
    alias_table = (
        # Alignment aliases
        (("aln_len", "al"), "alignment_length"),
        (("aln_len_no_gaps", "alng"), "alignment_length_no_gaps"),
        (("aln_recoding", "recode"), "alignment_recoding"),
        (("cs",), "column_score"),
        (("comp_bias_per_site", "cbps"), "compositional_bias_per_site"),
        (("evo_rate_per_site", "erps"), "evolutionary_rate_per_site"),
        (("get_entry", "ge"), "faidx"),
        (("gc",), "gc_content"),
        (("pairwise_id", "pi"), "pairwise_identity"),
        (("pis",), "parsimony_informative_sites"),
        (("rel_comp_var", "relative_composition_variability"), "rcv"),
        (("relative_composition_variability_taxon", "rel_comp_var_taxon"), "rcvt"),
        (("rename_fasta",), "rename_fasta_entries"),
        (("sum_of_pairs_score", "sops", "sop"), "sum_of_pairs_score"),
        (("vs",), "variable_sites"),
        # Tree aliases
        (("bss",), "bipartition_support_stats"),
        (("blm",), "branch_length_multiplier"),
        (("collapse", "cb"), "collapse_branches"),
        (("cover",), "covarying_evolutionary_rates"),
        (("degree_of_violation_of_a_molecular_clock",), "dvmc"),
        (("evo_rate",), "evolutionary_rate"),
        (("clan_check",), "hidden_paralogy_check"),
        (("ibs",), "internal_branch_stats"),
        (("il",), "internode_labeler"),
        (("lca_subtree",), "last_common_ancestor_subtree"),
        (("long_branch_score", "lbs"), "lb_score"),
        (("is_monophyletic",), "monophyly_check"),
        (("nni",), "nearest_neighbor_interchange"),
        (("pd",), "patristic_distances"),
        (("polyt_test", "ptt", "polyt"), "polytomy_test"),
        (("print", "pt"), "print_tree"),
        (("prune",), "prune_tree"),
        (("rename_tree", "rename_tips"), "rename_tree_tips"),
        (("robinson_foulds_distance", "rf_dist", "rf"), "rf_distance"),
        (("root", "rt"), "root_tree"),
        (("spurious_seq", "ss"), "spurious_sequence"),
        (("tbs",), "terminal_branch_stats"),
        (("labels", "tree_labels", "tl"), "tip_labels"),
        (("t2t_dist", "t2t"), "tip_to_tip_distance"),
        (("t2t_node_dist", "t2t_nd"), "tip_to_tip_node_distance"),
        (("tree_len",), "total_tree_length"),
        (("tness",), "treeness"),
        # Alignment- and tree-based aliases
        (("sat",), "saturation"),
        (("toverr", "tor"), "treeness_over_rcv"),
        # Helper aliases
        (("create_concat", "cc"), "create_concatenation_matrix"),
        (("pal2nal", "p2n"), "thread_dna"),
    )
    for spellings, method_name in alias_table:
        if command in spellings:
            return getattr(self, method_name)(argv)

    # Nothing matched: show the banner and an error, then fail.
    print(textwrap.dedent(help_header))
    print(
        "Invalid command option. See help for a complete list of commands and aliases."
    )
    sys.exit(1)
377
+
378
+ ## print version
379
def version(self, argv=None):
    """Print the PhyKIT banner (logo, version, citation and links).

    Args:
        argv: accepted and ignored.  The constructor calls any attribute
            whose name matches the command as ``handler(argv)``, so without
            this parameter ``phykit version`` raised ``TypeError``; the
            alias path (``self.version()``) keeps working unchanged.
    """
    print(
        textwrap.dedent(
            f"""\
                {self.help_header}
                """
        )
    )
387
+
388
+ ## Alignment functions
389
@staticmethod
def alignment_length(argv):
    """Handle the ``alignment_length`` command.

    ``argv`` holds the tokens after the command name.  They are parsed for a
    single positional ``alignment`` path and the parsed namespace is handed
    to ``AlignmentLength``, whose ``run()`` is then called.
    """
    help_text = textwrap.dedent(
        f"""\
                {help_header}

                Length of an input alignment is calculated using this function.

                Longer alignments are associated with strong phylogenetic signal.

                Association between alignment length and phylogenetic signal
                was determined by Shen et al., Genome Biology and Evolution (2016),
                doi: 10.1093/gbe/evw179.

                Aliases:
                  alignment_length, aln_len, al
                Command line interfaces:
                  pk_alignment_length, pk_aln_len, pk_al

                Usage:
                phykit alignment_length <alignment>

                Options
                =====================================================
                <alignment>                 first argument after
                                            function name should be
                                            an alignment file
                """
    )
    cli = ArgumentParser(
        description=help_text,
        formatter_class=RawDescriptionHelpFormatter,
        usage=SUPPRESS,
        add_help=True,
    )
    cli.add_argument("alignment", type=str, help=SUPPRESS)
    parsed = cli.parse_args(argv)
    AlignmentLength(parsed).run()
426
+
427
@staticmethod
def alignment_length_no_gaps(argv):
    """Handle the ``alignment_length_no_gaps`` command.

    ``argv`` holds the tokens after the command name.  They are parsed for a
    single positional ``alignment`` path and the parsed namespace is handed
    to ``AlignmentLengthNoGaps``, whose ``run()`` is then called.
    """
    help_text = textwrap.dedent(
        f"""\
                {help_header}

                Calculate alignment length excluding sites with gaps.

                Longer alignments when excluding sites with gaps is
                associated with strong phylogenetic signal.

                PhyKIT reports three tab delimited values:
                col1: number of sites without gaps
                col2: total number of sites
                col3: percentage of sites without gaps

                Association between alignment length when excluding sites
                with gaps and phylogenetic signal was determined by Shen
                et al., Genome Biology and Evolution (2016),
                doi: 10.1093/gbe/evw179.

                Aliases:
                  alignment_length_no_gaps, aln_len_no_gaps, alng
                Command line interfaces:
                  pk_alignment_length_no_gaps, pk_aln_len_no_gaps, pk_alng

                Usage:
                phykit alignment_length_no_gaps <alignment>

                Options
                =====================================================
                <alignment>                 first argument after
                                            function name should be
                                            an alignment file
                """
    )
    cli = ArgumentParser(
        description=help_text,
        formatter_class=RawDescriptionHelpFormatter,
        usage=SUPPRESS,
        add_help=True,
    )
    cli.add_argument("alignment", type=str, help=SUPPRESS)
    parsed = cli.parse_args(argv)
    AlignmentLengthNoGaps(parsed).run()
471
+
472
@staticmethod
def alignment_recoding(argv):
    """Handle the ``alignment_recoding`` command.

    ``argv`` holds the tokens after the command name.  They are parsed for a
    positional ``alignment`` path and an optional ``-c/--code`` string, and
    the parsed namespace is handed to ``AlignmentRecoding``, whose ``run()``
    is then called.

    The help text lists the console scripts with the ``pk_`` prefix used by
    every other command in this file (it previously said ``bk_``).
    """
    parser = ArgumentParser(
        add_help=True,
        usage=SUPPRESS,
        formatter_class=RawDescriptionHelpFormatter,
        description=textwrap.dedent(
            f"""\
                {help_header}

                Recode alignments using reduced character states.

                Alignments can be recoded using established or
                custom recoding schemes. Recoding schemes are
                specified using the -c/--code argument. Custom
                recoding schemes can be used and should be formatted
                as a two column file wherein the first column is the
                recoded character and the second column is the character
                in the alignment.

                Aliases:
                  alignment_recoding, aln_recoding, recode
                Command line interfaces:
                  pk_alignment_recoding, pk_aln_recoding, pk_recode

                Usage:
                phykit alignment_recoding <fasta> -c/--code <code>

                Options
                =====================================================
                <fasta>                     first argument after
                                            function name should be
                                            a fasta file

                -c/--code                   recoding scheme to use

                Codes for which recoding scheme to use
                =====================================================
                RY-nucleotide
                    R = purines (i.e., A and G)
                    Y = pyrimidines (i.e., T and C)

                SandR-6
                    0 = A, P, S, and T
                    1 = D, E, N, and G
                    2 = Q, K, and R
                    3 = M, I, V, and L
                    4 = W and C
                    5 = F, Y, and H

                KGB-6
                    0 = A, G, P, and S
                    1 = D, E, N, Q, H, K, R, and T
                    2 = M, I, and L
                    3 = W
                    4 = F and Y
                    5 = C and V

                Dayhoff-6
                    0 = A, G, P, S, and T
                    1 = D, E, N, and Q
                    2 = H, K, and R
                    3 = I, L, M, and V
                    4 = F, W, and Y
                    5 = C

                Dayhoff-9
                    0 = D, E, H, N, and Q
                    1 = I, L, M, and V
                    2 = F and Y
                    3 = A, S, and T
                    4 = K and R
                    5 = G
                    6 = P
                    7 = C
                    8 = W

                Dayhoff-12
                    0 = D, E, and Q
                    1 = M, L, I, and V
                    2 = F and Y
                    3 = K, H, and R
                    4 = G
                    5 = A
                    6 = P
                    7 = S
                    8 = T
                    9 = N
                    A = W
                    B = C

                Dayhoff-15
                    0 = D, E, and Q
                    1 = M and L
                    2 = I and V
                    3 = F and Y
                    4 = G
                    5 = A
                    6 = P
                    7 = S
                    8 = T
                    9 = N
                    A = K
                    B = H
                    C = R
                    D = W
                    E = C

                Dayhoff-18
                    0 = F and Y
                    1 = M and L
                    2 = I
                    3 = V
                    4 = G
                    5 = A
                    6 = P
                    7 = S
                    8 = T
                    9 = D
                    A = E
                    B = Q
                    C = N
                    D = K
                    E = H
                    F = R
                    G = W
                    H = C
                """  # noqa
        ),
    )

    # The help text calls the positional <fasta>; it is stored as
    # ``args.alignment``.
    parser.add_argument("alignment", type=str, help=SUPPRESS)
    parser.add_argument("-c", "--code", type=str, help=SUPPRESS)
    args = parser.parse_args(argv)
    AlignmentRecoding(args).run()
607
+
608
@staticmethod
def column_score(argv):
    """Handle the ``column_score`` command.

    ``argv`` holds the tokens after the command name.  They are parsed for a
    positional ``fasta`` path (the query alignment) and an optional
    ``-r/--reference`` path, and the parsed namespace is handed to
    ``ColumnScore``, whose ``run()`` is then called.

    The help text now names the metric correctly ("Column score is ...").
    """
    parser = ArgumentParser(
        add_help=True,
        usage=SUPPRESS,
        formatter_class=RawDescriptionHelpFormatter,
        description=textwrap.dedent(
            f"""\
                {help_header}

                Calculates column score.

                Column score is an accuracy metric for a multiple alignment relative
                to a reference alignment. It is calculated by summing the correctly
                aligned columns over all columns in an alignment. Thus, values range
                from 0 to 1 and higher values indicate more accurate alignments.

                Aliases:
                  column_score, cs
                Command line interfaces:
                  pk_column_score, pk_cs

                Usage:
                phykit column_score <fasta> -r/--reference <ref.aln>

                Options
                =====================================================
                <fasta>                     first argument after
                                            function name should be a
                                            query fasta alignment file
                                            to be scored for accuracy

                -r/--reference              reference fasta alignment to
                                            compare query alignment to
                """
        ),
    )
    parser.add_argument("fasta", type=str, help=SUPPRESS)
    parser.add_argument("-r", "--reference", type=str, help=SUPPRESS)
    args = parser.parse_args(argv)
    ColumnScore(args).run()
649
+
650
@staticmethod
def compositional_bias_per_site(argv):
    """Handle the ``compositional_bias_per_site`` command.

    ``argv`` holds the tokens after the command name.  They are parsed for a
    single positional ``alignment`` path and the parsed namespace is handed
    to ``CompositionalBiasPerSite``, whose ``run()`` is then called.

    The "Command line interfaces" help line used to repeat
    ``pk_compositional_bias_per_site`` twice; the second entry now mirrors
    the ``comp_bias_per_site`` alias, following the ``pk_<alias>`` pattern of
    the other commands.
    NOTE(review): confirm ``pk_comp_bias_per_site`` against entry_points.txt.
    """
    parser = ArgumentParser(
        add_help=True,
        usage=SUPPRESS,
        formatter_class=RawDescriptionHelpFormatter,
        description=textwrap.dedent(
            f"""\
                {help_header}

                Calculates compositional bias per site in an alignment.

                Site-wise chi-squared tests are conducted in an alignment to
                detect compositional biases. PhyKIT outputs four columns:
                col 1: index in alignment
                col 2: chi-squared statistic (higher values indicate greater bias)
                col 3: multi-test corrected p-value (Benjamini-Hochberg false discovery rate procedure)
                col 4: uncorrected p-value

                Aliases:
                  compositional_bias_per_site; comp_bias_per_site; cbps
                Command line interfaces:
                  pk_compositional_bias_per_site; pk_comp_bias_per_site; pk_cbps

                Usage:
                phykit compositional_bias_per_site <alignment>

                Options
                =====================================================
                <alignment>                 first argument after the
                                            function name should be a
                                            fasta alignment file
                """
        ),
    )
    parser.add_argument("alignment", type=str, help=SUPPRESS)
    args = parser.parse_args(argv)
    CompositionalBiasPerSite(args).run()
688
+
689
@staticmethod
def evolutionary_rate_per_site(argv):
    """Handle the ``evolutionary_rate_per_site`` command.

    ``argv`` holds the tokens after the command name.  They are parsed for a
    single positional stored as ``alignment`` (shown as <fasta> in the help)
    and the parsed namespace is handed to ``EvolutionaryRatePerSite``, whose
    ``run()`` is then called.
    """
    help_text = textwrap.dedent(
        f"""\
                {help_header}

                Estimate evolutionary rate per site.

                Evolutionary rate per site is one minus the sum of squared
                frequency of different characters at a given site. Values
                may range from 0 (slow evolving; no diversity at the given
                site) to 1 (fast evolving; all characters appear only once).

                PhyKIT prints out two columns of information.
                col 1: site in alignment
                col 2: estimated evolutionary rate

                Aliases:
                  evolutionary_rate_per_site; evo_rate_per_site; erps
                Command line interfaces:
                  pk_evolutionary_rate_per_site; pk_evo_rate_per_site; pk_erps


                Usage:
                phykit evo_rate_per_site <fasta>

                Options
                =====================================================
                <fasta>                     first argument after
                                            function name should be a
                                            query fasta file
                """
    )
    cli = ArgumentParser(
        description=help_text,
        formatter_class=RawDescriptionHelpFormatter,
        usage=SUPPRESS,
        add_help=True,
    )
    cli.add_argument("alignment", type=str, help=SUPPRESS)
    parsed = cli.parse_args(argv)
    EvolutionaryRatePerSite(parsed).run()
730
+
731
@staticmethod
def faidx(argv):
    """Handle the ``faidx`` command.

    ``argv`` holds the tokens after the command name.  They are parsed for a
    positional ``fasta`` path and an optional ``-e/--entry`` string, and the
    parsed namespace is handed to ``Faidx``, whose ``run()`` is then called.

    Help text fixes: "does not requiring" -> "does not require", and the
    alias list uses a single separator style.
    """
    parser = ArgumentParser(
        add_help=True,
        usage=SUPPRESS,
        formatter_class=RawDescriptionHelpFormatter,
        description=textwrap.dedent(
            f"""\
                {help_header}

                Extracts sequence entry from fasta file.

                This function works similarly to the faidx function
                in samtools, but does not require an indexing step.

                To obtain multiple entries, input multiple entries separated
                by a comma (,). For example, if you want entries
                named "seq_0" and "seq_1", the string "seq_0,seq_1"
                should be associated with the -e argument.

                Aliases:
                  faidx, get_entry, ge
                Command line interfaces:
                  pk_faidx, pk_get_entry, pk_ge


                Usage:
                phykit faidx <fasta> -e/--entry <fasta entry>

                Options
                =====================================================
                <fasta>                     first argument after
                                            function name should be a
                                            query fasta file

                -e/--entry                  entry name to be extracted
                                            from the inputted fasta file
                """
        ),
    )
    parser.add_argument("fasta", type=str, help=SUPPRESS)
    parser.add_argument("-e", "--entry", type=str, help=SUPPRESS)
    args = parser.parse_args(argv)
    Faidx(args).run()
775
+
776
@staticmethod
def gc_content(argv):
    """Handle the ``gc_content`` command.

    ``argv`` holds the tokens after the command name.  They are parsed for a
    positional ``fasta`` path and an optional ``-v/--verbose`` flag, and the
    parsed namespace is handed to ``GCContent``, whose ``run()`` is then
    called.
    """
    help_text = textwrap.dedent(
        f"""\
                {help_header}

                Calculate GC content of a fasta file.

                GC content is negatively correlated with phylogenetic signal.

                If there are multiple entries, use the -v/--verbose option
                to determine the GC content of each fasta entry separately.

                Association between GC content and phylogenetic signal was
                determined by Shen et al., Genome Biology and Evolution (2016),
                doi: 10.1093/gbe/evw179.

                Aliases:
                  gc_content, gc
                Command line interfaces:
                  pk_gc_content, pk_gc

                Usage:
                phykit gc_content <fasta> [-v/--verbose]

                Options
                =====================================================
                <fasta>                     first argument after
                                            function name should be
                                            a fasta file

                -v/--verbose                optional argument to print
                                            the GC content of each fasta
                                            entry
                """
    )
    cli = ArgumentParser(
        description=help_text,
        formatter_class=RawDescriptionHelpFormatter,
        usage=SUPPRESS,
        add_help=True,
    )
    cli.add_argument("fasta", type=str, help=SUPPRESS)
    # Flag defaults to False and becomes True when given.
    cli.add_argument(
        "-v", "--verbose", action="store_true", required=False, help=SUPPRESS
    )
    parsed = cli.parse_args(argv)
    GCContent(parsed).run()
823
+
824
@staticmethod
def pairwise_identity(argv):
    """Handle the ``pairwise_identity`` command.

    ``argv`` holds the tokens after the command name.  They are parsed for a
    positional ``alignment`` path plus the optional ``-v/--verbose`` and
    ``-e/--exclude_gaps`` flags, and the parsed namespace is handed to
    ``PairwiseIdentity``, whose ``run()`` is then called.

    The usage line now shows ``-e/--exclude_gaps``, which the parser accepts
    and the Options table already described.
    """
    parser = ArgumentParser(
        add_help=True,
        usage=SUPPRESS,
        formatter_class=RawDescriptionHelpFormatter,
        description=textwrap.dedent(
            f"""\
                {help_header}

                Calculate the average pairwise identity among sequences.

                Pairwise identities can be used as proxies for the
                evolutionary rate of sequences.

                Pairwise identity is defined as the number of identical
                columns (including gaps) between two aligned sequences divided
                by the number of columns in the alignment. Summary statistics
                are reported unless used with the verbose option in which
                all pairwise identities will be reported.

                An example of pairwise identities being used as a proxy
                for evolutionary rate can be found here: Chen et al.
                Genome Biology and Evolution (2017), doi: 10.1093/gbe/evx147.

                Aliases:
                  pairwise_identity, pairwise_id, pi
                Command line interfaces:
                  pk_pairwise_identity, pk_pairwise_id, pk_pi

                Usage:
                phykit pairwise_identity <alignment> [-v/--verbose] [-e/--exclude_gaps]

                Options
                =====================================================
                <alignment>                 first argument after
                                            function name should be
                                            an alignment file

                -v/--verbose                optional argument to print
                                            identity per pair

                -e/--exclude_gaps           if a site has a gap, ignore it
                """
        ),
    )
    parser.add_argument("alignment", type=str, help=SUPPRESS)
    parser.add_argument(
        "-v", "--verbose", action="store_true", required=False, help=SUPPRESS
    )
    parser.add_argument(
        "-e", "--exclude_gaps", action="store_true", required=False, help=SUPPRESS
    )
    args = parser.parse_args(argv)
    PairwiseIdentity(args).run()
879
+
880
@staticmethod
def parsimony_informative_sites(argv):
    """Parse ``argv`` for the parsimony_informative_sites command and run it.

    Registers a single positional ``alignment`` argument, then passes the
    parsed namespace to ``ParsimonyInformative`` and calls ``run()``.

    Args:
        argv: argument list handed to ``parser.parse_args``.
    """
    parser = ArgumentParser(
        add_help=True,
        usage=SUPPRESS,
        formatter_class=RawDescriptionHelpFormatter,
        description=textwrap.dedent(
            f"""\
            {help_header}

            Calculate the number and percentage of parsimony
            informative sites in an alignment.

            The number of parsimony informative sites in an alignment
            is associated with strong phylogenetic signal.

            PhyKIT reports three tab delimited values:
            col1: number of parsimony informative sites
            col2: total number of sites
            col3: percentage of parsimony informative sites

            Association between the number of parsimony informative
            sites and phylogenetic signal was determined by Shen
            et al., Genome Biology and Evolution (2016),
            doi: 10.1093/gbe/evw179 and Steenwyk et al., bioRxiv
            (2020), doi: 10.1101/2020.06.08.140384.

            Aliases:
              parsimony_informative_sites, pis
            Command line interfaces:
              pk_parsimony_informative_sites, pk_pis

            Usage:
            phykit parsimony_informative_sites <alignment>

            Options
            =====================================================
            <alignment>                 first argument after
                                        function name should be
                                        an alignment file
            """
        ),
    )
    parser.add_argument("alignment", type=str, help=SUPPRESS)
    args = parser.parse_args(argv)
    ParsimonyInformative(args).run()
926
+
927
@staticmethod
def rcv(argv):
    """Build the RCV command parser, parse ``argv``, and run the service.

    The namespace produced by the parser (one positional ``alignment``
    string) is handed to ``RelativeCompositionVariability``.
    """
    parser_options = {
        "add_help": True,
        "usage": SUPPRESS,
        "formatter_class": RawDescriptionHelpFormatter,
        "description": textwrap.dedent(
            f"""\
            {help_header}

            Calculate RCV (relative composition variability) for an alignment.

            Lower RCV values are thought to be desirable because they represent
            a lower composition bias in an alignment. Statistically, RCV describes
            the average variability in sequence composition among taxa.

            RCV is calculated following Phillips and Penny, Molecular Phylogenetics
            and Evolution (2003), doi: 10.1016/S1055-7903(03)00057-5.

            Aliases:
              relative_composition_variability, rel_comp_var, rcv
            Command line interfaces:
              pk_relative_composition_variability, pk_rel_comp_var, pk_rcv

            Usage:
            phykit relative_composition_variability <alignment>

            Options
            =====================================================
            <alignment>                 first argument after
                                        function name should be
                                        an alignment file
            """
        ),
    }
    cli = ArgumentParser(**parser_options)
    cli.add_argument("alignment", help=SUPPRESS, type=str)
    parsed = cli.parse_args(argv)
    RelativeCompositionVariability(parsed).run()
965
+
966
@staticmethod
def rcvt(argv):
    """Build the RCVT command parser, parse ``argv``, and run the service.

    The namespace produced by the parser (one positional ``alignment``
    string) is handed to ``RelativeCompositionVariabilityTaxon``.
    """
    parser_options = {
        "add_help": True,
        "usage": SUPPRESS,
        "formatter_class": RawDescriptionHelpFormatter,
        "description": textwrap.dedent(
            f"""\
            {help_header}

            Calculate RCVT (relative composition variability, taxon) for an alignment.

            RCVT is the relative composition variability metric for individual taxa.
            This facilitates identifying specific taxa that may have compositional
            biases. Lower RCVT values are more desirable because they indicate
            a lower composition bias for a given taxon in an alignment.

            Aliases:
              relative_composition_variability_taxon, rel_comp_var_taxon, rcvt
            Command line interfaces:
              pk_relative_composition_variability_taxon, pk_rel_comp_var_taxon, pk_rcvt

            Usage:
            phykit relative_composition_variability_taxon <alignment>

            Options
            =====================================================
            <alignment>                 first argument after
                                        function name should be
                                        an alignment file
            """
        ),
    }
    cli = ArgumentParser(**parser_options)
    cli.add_argument("alignment", help=SUPPRESS, type=str)
    parsed = cli.parse_args(argv)
    RelativeCompositionVariabilityTaxon(parsed).run()
1002
+
1003
@staticmethod
def rename_fasta_entries(argv):
    """Parse ``argv`` for the rename_fasta_entries command and run it.

    Registers the positional ``fasta`` argument, the ``-i/--idmap`` option
    and the optional ``-o/--output`` option, then passes the parsed
    namespace to ``RenameFastaEntries`` and calls ``run()``.

    Args:
        argv: argument list handed to ``parser.parse_args``.
    """
    parser = ArgumentParser(
        add_help=True,
        usage=SUPPRESS,
        formatter_class=RawDescriptionHelpFormatter,
        description=textwrap.dedent(
            f"""\
            {help_header}

            Renames fasta entries.

            Renaming fasta entries will follow the scheme of a tab-delimited
            file wherein the first column is the current fasta entry name and
            the second column is the new fasta entry name in the resulting
            output alignment.

            Aliases:
              rename_fasta_entries, rename_fasta
            Command line interfaces:
              pk_rename_fasta_entries, pk_rename_fasta

            Usage:
            phykit rename_fasta_entries <fasta> -i/--idmap <idmap>
                [-o/--output <output_file>]

            Options
            =====================================================
            <fasta>                     first argument after
                                        function name should be
                                        a fasta file

            -i/--idmap                  identifier map of current FASTA
                                        names (col1) and desired FASTA
                                        names (col2)

            -o/--output                 optional argument to write
                                        the renamed fasta file to.
                                        Default output has the same
                                        name as the input file with
                                        the suffix ".renamed.fa" added
                                        to it.
            """
        ),
    )
    parser.add_argument("fasta", type=str, help=SUPPRESS)
    # The usage text presents -i/--idmap as mandatory (no brackets), so
    # enforce it at parse time; argparse then reports a missing flag
    # instead of passing idmap=None on to the service.
    parser.add_argument("-i", "--idmap", type=str, required=True, help=SUPPRESS)
    parser.add_argument("-o", "--output", type=str, required=False, help=SUPPRESS)
    args = parser.parse_args(argv)
    RenameFastaEntries(args).run()
1053
+
1054
@staticmethod
def sum_of_pairs_score(argv):
    """Parse ``argv`` for the sum_of_pairs_score command and run it.

    Registers the positional ``fasta`` argument and the ``-r/--reference``
    option, then passes the parsed namespace to ``SumOfPairsScore`` and
    calls ``run()``.

    Args:
        argv: argument list handed to ``parser.parse_args``.
    """
    parser = ArgumentParser(
        add_help=True,
        usage=SUPPRESS,
        formatter_class=RawDescriptionHelpFormatter,
        description=textwrap.dedent(
            f"""\
            {help_header}

            Calculates sum-of-pairs score.

            Sum-of-pairs is an accuracy metric for a multiple alignment relative
            to a reference alignment. It is calculated by summing the correctly
            aligned residue pairs over all pairs of sequences. Thus, values range
            from 0 to 1 and higher values indicate more accurate alignments.

            Aliases:
              sum_of_pairs_score, sops, sop
            Command line interfaces:
              pk_sum_of_pairs_score, pk_sops, pk_sop

            Usage:
            phykit sum_of_pairs_score <fasta> -r/--reference <ref.aln>

            Options
            =====================================================
            <fasta>                     first argument after
                                        function name should be a
                                        query fasta alignment file
                                        to be scored for accuracy

            -r/--reference              reference fasta alignment to
                                        compare query alignment to
            """
        ),
    )
    parser.add_argument("fasta", type=str, help=SUPPRESS)
    # The usage text presents -r/--reference as mandatory (no brackets), so
    # enforce it at parse time rather than passing reference=None along.
    parser.add_argument(
        "-r", "--reference", type=str, required=True, help=SUPPRESS
    )
    args = parser.parse_args(argv)
    SumOfPairsScore(args).run()
1095
+
1096
@staticmethod
def variable_sites(argv):
    """Build the variable_sites command parser, parse ``argv``, and run it.

    The namespace produced by the parser (one positional ``alignment``
    string) is handed to ``VariableSites``.
    """
    parser_options = {
        "add_help": True,
        "usage": SUPPRESS,
        "formatter_class": RawDescriptionHelpFormatter,
        "description": textwrap.dedent(
            f"""\
            {help_header}

            Calculate the number of variable sites in an alignment.

            The number of variable sites in an alignment is
            associated with strong phylogenetic signal.

            PhyKIT reports three tab delimited values:
            col1: number of variable sites
            col2: total number of sites
            col3: percentage of variable sites

            Association between the number of variable sites and
            phylogenetic signal was determined by Shen et al.,
            Genome Biology and Evolution (2016),
            doi: 10.1093/gbe/evw179.

            Aliases:
              variable_sites, vs
            Command line interfaces:
              pk_variable_sites, pk_vs

            Usage:
            phykit variable_sites <alignment>

            Options
            =====================================================
            <alignment>                 first argument after
                                        function name should be
                                        an alignment file
            """
        ),
    }
    cli = ArgumentParser(**parser_options)
    cli.add_argument("alignment", help=SUPPRESS, type=str)
    parsed = cli.parse_args(argv)
    VariableSites(parsed).run()
1140
+
1141
+ ## Tree functions
1142
@staticmethod
def bipartition_support_stats(argv):
    """Build the bipartition_support_stats parser, parse ``argv``, and run it.

    Accepts a positional ``tree`` and a boolean ``-v/--verbose`` flag; the
    resulting namespace is handed to ``BipartitionSupportStats``.
    """
    parser_options = {
        "add_help": True,
        "usage": SUPPRESS,
        "formatter_class": RawDescriptionHelpFormatter,
        "description": textwrap.dedent(
            f"""\
            {help_header}
            Calculate summary statistics for bipartition support.

            High bipartition support values are thought to be desirable because
            they are indicative of greater certainty in tree topology.

            To obtain all bipartition support values, use the -v/--verbose option.
            In addition to support values for each node, the names of all terminal
            branches tips are also included. Each terminal branch name is separated
            with a semi-colon (;).

            Aliases:
              bipartition_support_stats, bss
            Command line interfaces:
              pk_bipartition_support_stats, pk_bss

            Usage:
            phykit bipartition_support_stats <tree> [-v/--verbose]

            Options
            =====================================================
            <tree>                      first argument after
                                        function name should be
                                        a tree file

            -v/--verbose                optional argument to print
                                        all bipartition support
                                        values
            """
        ),
    }
    cli = ArgumentParser(**parser_options)
    cli.add_argument("tree", help=SUPPRESS, type=str)
    cli.add_argument(
        "-v",
        "--verbose",
        help=SUPPRESS,
        required=False,
        action="store_true",
    )
    parsed = cli.parse_args(argv)
    BipartitionSupportStats(parsed).run()
1187
+
1188
@staticmethod
def branch_length_multiplier(argv):
    """Parse ``argv`` for the branch_length_multiplier command and run it.

    Registers the positional ``tree`` argument, the required float
    ``-f/--factor`` option and the optional ``-o/--output`` option, then
    passes the parsed namespace to ``BranchLengthMultiplier`` and calls
    ``run()``.

    Args:
        argv: argument list handed to ``parser.parse_args``.
    """
    parser = ArgumentParser(
        add_help=True,
        usage=SUPPRESS,
        formatter_class=RawDescriptionHelpFormatter,
        description=textwrap.dedent(
            f"""\
            {help_header}

            Multiply branch lengths in a phylogeny by a given factor.

            This can help modify reference trees when conducting simulations
            or other analyses.

            Aliases:
              branch_length_multiplier, blm
            Command line interfaces:
              pk_branch_length_multiplier, pk_blm

            Usage:
            phykit branch_length_multiplier <tree> -f/--factor n [-o/--output <output_file>]

            Options
            =====================================================
            <tree>                      first argument after
                                        function name should be
                                        a tree file

            -f/--factor                 factor to multiply branch
                                        lengths by

            -o/--output                 optional argument to name
                                        the outputted tree file.
                                        Default output will have
                                        the same name as the input
                                        file but with the suffix
                                        ".factor_(n).tre"
            """
        ),
    )
    parser.add_argument("tree", type=str, help=SUPPRESS)
    parser.add_argument("-f", "--factor", type=float, required=True, help=SUPPRESS)
    parser.add_argument("-o", "--output", type=str, required=False, help=SUPPRESS)
    args = parser.parse_args(argv)
    BranchLengthMultiplier(args).run()
1234
+
1235
@staticmethod
def collapse_branches(argv):
    """Parse ``argv`` for the collapse_branches command and run it.

    Registers the positional ``tree`` argument, the required float
    ``-s/--support`` option and the optional ``-o/--output`` option, then
    passes the parsed namespace to ``CollapseBranches`` and calls ``run()``.

    Args:
        argv: argument list handed to ``parser.parse_args``.
    """
    parser = ArgumentParser(
        add_help=True,
        usage=SUPPRESS,
        formatter_class=RawDescriptionHelpFormatter,
        description=textwrap.dedent(
            f"""\
            {help_header}

            Collapse branches on a phylogeny according to bipartition support.
            Bipartitions will be collapsed if they are less than the user specified
            value.

            Aliases:
              collapse_branches, collapse, cb
            Command line interfaces:
              pk_collapse_branches, pk_collapse, pk_cb

            Usage:
            phykit collapse_branches <tree> -s/--support n [-o/--output <output_file>]

            Options
            =====================================================
            <tree>                      first argument after
                                        function name should be
                                        a tree file

            -s/--support                bipartitions with support less
                                        than this value will be collapsed

            -o/--output                 optional argument to name
                                        the outputted tree file.
                                        Default output will have
                                        the same name as the input
                                        file but with the suffix
                                        ".collapsed_(support).tre"

            """
        ),
    )
    parser.add_argument("tree", type=str, help=SUPPRESS)
    parser.add_argument("-s", "--support", type=float, required=True, help=SUPPRESS)
    parser.add_argument("-o", "--output", type=str, required=False, help=SUPPRESS)
    args = parser.parse_args(argv)
    CollapseBranches(args).run()
1281
+
1282
@staticmethod
def covarying_evolutionary_rates(argv):
    """Parse ``argv`` for the covarying_evolutionary_rates command and run it.

    Registers two positional tree arguments (``tree_zero``, ``tree_one``),
    the required ``-r/--reference`` option and the boolean ``-v/--verbose``
    flag, then passes the parsed namespace to ``CovaryingEvolutionaryRates``
    and calls ``run()``.

    Args:
        argv: argument list handed to ``parser.parse_args``.
    """
    parser = ArgumentParser(
        add_help=True,
        usage=SUPPRESS,
        formatter_class=RawDescriptionHelpFormatter,
        description=textwrap.dedent(
            f"""\
            {help_header}

            Determine if two genes have a signature of covariation with one another.

            Genes that have covarying evolutionary histories tend to have
            similar functions and expression levels.

            Input two phylogenies and calculate the correlation among relative
            evolutionary rates between the two phylogenies. The two input trees
            do not have to have the same taxa. This function will first prune both
            trees to have the same tips. To transform branch lengths into relative
            rates, PhyKIT uses the putative species tree's branch lengths, which is
            inputted by the user. As recommended by the original method developers,
            outlier branch lengths are removed. Outlier branches have a relative
            evolutionary rate greater than five.

            PhyKIT reports two tab delimited values:
            col1: correlation coefficient
            col2: p-value

            Method is empirically evaluated by Clark et al., Genome Research
            (2012), doi: 10.1101/gr.132647.111. Normalization method using a
            species tree follows Sato et al., Bioinformatics (2005), doi:
            10.1093/bioinformatics/bti564.


            Aliases:
              covarying_evolutionary_rates, cover
            Command line interfaces:
              pk_covarying_evolutionary_rates, pk_cover

            Usage:
            phykit covarying_evolutionary_rates <tree_file_zero> <tree_file_one>
                -r/--reference <reference_tree_file> [-v/--verbose]

            Options
            =====================================================
            <tree_file_zero>            first argument after
                                        function name should be
                                        a tree file

            <tree_file_one>             second argument after
                                        function name should be
                                        a tree file

            -r/--reference              a tree to correct branch
                                        lengths by in the two input
                                        trees. Typically, this is a
                                        putative species tree.

            -v/--verbose                print out corrected branch
                                        lengths shared between
                                        tree 0 and tree 1
            """
        ),
    )
    # Positional order matters: tree_zero is consumed first, tree_one second.
    parser.add_argument("tree_zero", type=str, help=SUPPRESS)
    parser.add_argument("tree_one", type=str, help=SUPPRESS)
    parser.add_argument(
        "-r", "--reference", type=str, required=True, help=SUPPRESS, metavar=""
    )
    parser.add_argument(
        "-v", "--verbose", action="store_true", required=False, help=SUPPRESS
    )
    args = parser.parse_args(argv)
    CovaryingEvolutionaryRates(args).run()
1356
+
1357
@staticmethod
def dvmc(argv):
    """Build the DVMC command parser, parse ``argv``, and run the service.

    The namespace produced by the parser (one positional ``tree`` string)
    is handed to ``DVMC``.
    """
    parser_options = {
        "add_help": True,
        "usage": SUPPRESS,
        "formatter_class": RawDescriptionHelpFormatter,
        "description": textwrap.dedent(
            f"""\
            {help_header}

            Calculate degree of violation of a molecular clock (or DVMC) in a phylogeny.

            Lower DVMC values are thought to be desirable because they are indicative
            of a lower degree of violation in the molecular clock assumption.

            Typically, outgroup taxa are not included in molecular clock analysis. Thus,
            prior to calculating DVMC from a single gene tree, users may want to prune
            outgroup taxa from the phylogeny. To prune tips from a phylogeny, see the
            prune_tree function.

            Calculate DVMC in a tree following Liu et al., PNAS (2017), doi: 10.1073/pnas.1616744114.

            Aliases:
              degree_of_violation_of_a_molecular_clock, dvmc
            Command line interfaces:
              pk_degree_of_violation_of_a_molecular_clock, pk_dvmc

            Usage:
            phykit degree_of_violation_of_a_molecular_clock <tree>

            Options
            =====================================================
            <tree>                      first argument after
                                        function name should be
                                        a tree file
            """
        ),
    }
    cli = ArgumentParser(**parser_options)
    cli.add_argument("tree", help=SUPPRESS, type=str)
    parsed = cli.parse_args(argv)
    DVMC(parsed).run()
1398
+
1399
@staticmethod
def evolutionary_rate(argv):
    """Build the evolutionary_rate parser, parse ``argv``, and run the service.

    The namespace produced by the parser (one positional ``tree`` string)
    is handed to ``EvolutionaryRate``.
    """
    parser_options = {
        "add_help": True,
        "usage": SUPPRESS,
        "formatter_class": RawDescriptionHelpFormatter,
        "description": textwrap.dedent(
            f"""\
            {help_header}
            Calculate a tree-based estimation of the evolutionary rate of a gene.

            Evolutionary rate is the total tree length divided by the number
            of terminals.

            Calculate evolutionary rate following Telford et al., Proceedings
            of the Royal Society B (2014).

            Aliases:
              evolutionary_rate, evo_rate
            Command line interfaces:
              pk_evolutionary_rate, pk_evo_rate

            Usage:
            phykit evolutionary_rate <tree>

            Options
            =====================================================
            <tree>                      first argument after
                                        function name should be
                                        a tree file
            """
        ),
    }
    cli = ArgumentParser(**parser_options)
    cli.add_argument("tree", help=SUPPRESS, type=str)
    parsed = cli.parse_args(argv)
    EvolutionaryRate(parsed).run()
1435
+
1436
@staticmethod
def hidden_paralogy_check(argv):
    """Parse ``argv`` for the hidden_paralogy_check command and run it.

    Registers the positional ``tree`` argument and the ``-c/--clade``
    option, then passes the parsed namespace to ``HiddenParalogyCheck``
    and calls ``run()``.

    Args:
        argv: argument list handed to ``parser.parse_args``.
    """
    parser = ArgumentParser(
        add_help=True,
        usage=SUPPRESS,
        formatter_class=RawDescriptionHelpFormatter,
        description=textwrap.dedent(
            f"""\
            {help_header}
            Scan tree for evidence of hidden paralogy.

            This analysis can be used to identify hidden paralogy.
            Specifically, this method will examine if a set of
            well known monophyletic taxa are, in fact, monophyletic.
            If they are not, the evolutionary history of the gene may
            be subject to hidden paralogy. This analysis is typically
            done with single-copy orthologous genes.

            Requires a clade file, which specifies which monophyletic
            lineages to check for. Multiple monophyletic
            lineages can be specified. Each lineage should
            be specified on a single line and each tip name
            (or taxon name) should be separated by a space.
            For example, if it is anticipated that tips
            "A", "B", and "C" are monophyletic and "D",
            "E", and "F" are expected to be monophyletic, the
            clade file should be formatted as follows:
            "
            A B C
            D E F
            "
            Tip names not present in the tree will not be considered
            when assessing hidden paralogy.

            The output will report if the specified taxa were monophyletic
            or not. The number of rows will reflect how many groups of taxa
            were checked for monophyly. For example, if there were three
            rows of clades in the -c file, there will be three rows in the
            output where the first row in the output corresponds to the
            results of the first row in the clade file.

            The concept behind this analysis follows
            Siu-Ting et al., Molecular Biology and Evolution (2019).

            Aliases:
              hidden_paralogy_check, clan_check
            Command line interfaces:
              pk_hidden_paralogy_check, pk_clan_check

            Usage:
            phykit hidden_paralogy_check <tree> -c/--clade <clade_file>

            Options
            =====================================================
            <tree>                      first argument after
                                        function name should be
                                        a tree file

            -c/--clade                  clade file that specifies
                                        what monophyletic clades
                                        to expect
            """
        ),
    )
    parser.add_argument("tree", type=str, help=SUPPRESS)
    # The help text states the clade file is required and the usage line
    # shows -c/--clade without brackets, so enforce it at parse time.
    # NOTE(review): assumes HiddenParalogyCheck cannot run with clade=None;
    # confirm against the service before relying on this.
    parser.add_argument("-c", "--clade", type=str, required=True, help=SUPPRESS)
    args = parser.parse_args(argv)
    HiddenParalogyCheck(args).run()
1504
+
1505
@staticmethod
def internal_branch_stats(argv):
    """Build the internal_branch_stats parser, parse ``argv``, and run it.

    Accepts a positional ``tree`` and a boolean ``-v/--verbose`` flag; the
    resulting namespace is handed to ``InternalBranchStats``.
    """
    parser_options = {
        "add_help": True,
        "usage": SUPPRESS,
        "formatter_class": RawDescriptionHelpFormatter,
        "description": textwrap.dedent(
            f"""\
            {help_header}

            Calculate summary statistics for internal branch lengths in a phylogeny.

            Internal branch lengths can be useful for phylogeny diagnostics.

            To obtain all internal branch lengths, use the -v/--verbose option.

            Aliases:
              internal_branch_stats, ibs
            Command line interfaces:
              pk_internal_branch_stats, pk_ibs

            Usage:
            phykit internal_branch_stats <tree> [-v/--verbose]

            Options
            =====================================================
            <tree>                      first argument after
                                        function name should be
                                        a tree file

            -v/--verbose                optional argument to print
                                        all internal branch lengths
            """
        ),
    }
    cli = ArgumentParser(**parser_options)
    cli.add_argument("tree", help=SUPPRESS, type=str)
    cli.add_argument(
        "-v",
        "--verbose",
        help=SUPPRESS,
        required=False,
        action="store_true",
    )
    parsed = cli.parse_args(argv)
    InternalBranchStats(parsed).run()
1546
+
1547
@staticmethod
def internode_labeler(argv):
    """Build the internode_labeler parser, parse ``argv``, and run the service.

    Accepts a positional ``tree`` and an optional ``-o/--output`` string;
    the resulting namespace is handed to ``InternodeLabeler``.
    """
    parser_options = {
        "add_help": True,
        "usage": SUPPRESS,
        "formatter_class": RawDescriptionHelpFormatter,
        "description": textwrap.dedent(
            f"""\
            {help_header}

            Appends numerical identifiers to bipartitions in place
            of support values. This is helpful for pointing to
            specific internodes in supplementary files or otherwise.

            Alias:
              internode_labeler, il
            Command line interfaces:
              pk_internode_labeler, pk_il

            Usage:
            phykit internode_labeler <tree> [-o/--output <file>]

            Options
            =====================================================
            <tree>                      first argument after
                                        function name should be
                                        a tree file

            -o/--output                 optional argument to name
                                        the outputted tree file
            """
        ),
    }
    cli = ArgumentParser(**parser_options)
    cli.add_argument("tree", help=SUPPRESS, type=str)
    cli.add_argument("-o", "--output", help=SUPPRESS, required=False, type=str)
    parsed = cli.parse_args(argv)
    InternodeLabeler(parsed).run()
1584
+
1585
@staticmethod
def last_common_ancestor_subtree(argv):
    """Parse ``argv`` for the last_common_ancestor_subtree command and run it.

    Registers the positional ``tree`` and ``list_of_taxa`` arguments and the
    optional ``-o/--output`` option, then passes the parsed namespace to
    ``LastCommonAncestorSubtree`` and calls ``run()``.

    Args:
        argv: argument list handed to ``parser.parse_args``.
    """
    parser = ArgumentParser(
        add_help=True,
        usage=SUPPRESS,
        formatter_class=RawDescriptionHelpFormatter,
        description=textwrap.dedent(
            f"""\
            {help_header}

            Obtains subtree from a phylogeny by getting
            the last common ancestor from a list of taxa.

            Aliases:
              last_common_ancestor_subtree, lca_subtree
            Command line interfaces:
              pk_last_common_ancestor_subtree, pk_lca_subtree

            Usage:
            phykit last_common_ancestor_subtree <tree> <list_of_taxa>
                [-o/--output <file>]

            Options
            =====================================================
            <tree>                      first argument after
                                        function name should be
                                        a tree file

            <list_of_taxa>              list of taxa to get the last
                                        common ancestor subtree for

            -o/--output                 optional argument to name
                                        the outputted tree file
            """
        ),
    )
    parser.add_argument("tree", type=str, help=SUPPRESS)
    parser.add_argument("list_of_taxa", type=str, help=SUPPRESS)
    parser.add_argument("-o", "--output", type=str, required=False, help=SUPPRESS)
    args = parser.parse_args(argv)
    LastCommonAncestorSubtree(args).run()
1626
+
1627
@staticmethod
def lb_score(argv):
    """Build the long-branch-score parser, parse ``argv``, and run the service.

    Accepts a positional ``tree`` and a boolean ``-v/--verbose`` flag; the
    resulting namespace is handed to ``LBScore``.
    """
    parser_options = {
        "add_help": True,
        "usage": SUPPRESS,
        "formatter_class": RawDescriptionHelpFormatter,
        "description": textwrap.dedent(
            f"""\
            {help_header}

            Calculate long branch (LB) scores in a phylogeny.

            Lower LB scores are thought to be desirable because
            they are indicative of taxa or trees that likely do
            not have issues with long branch attraction.

            LB score is the mean pairwise patristic distance of
            taxon i compared to all other taxa over the average
            pairwise patristic distance.

            PhyKIT reports summary statistics. To obtain LB scores
            for each taxa, use the -v/--verbose option.

            LB scores are calculated following Struck, Evolutionary
            Bioinformatics (2014), doi: 10.4137/EBO.S14239.

            Aliases:
              long_branch_score, lb_score, lbs
            Command line interfaces:
              pk_long_branch_score, pk_lb_score, pk_lbs

            Usage:
            phykit long_branch_score <tree> [-v/--verbose]

            Options
            =====================================================
            <tree>                      first argument after
                                        function name should be
                                        a tree file

            -v/--verbose                optional argument to print
                                        all LB score values
            """
        ),
    }
    cli = ArgumentParser(**parser_options)
    cli.add_argument("tree", help=SUPPRESS, type=str)
    cli.add_argument(
        "-v",
        "--verbose",
        help=SUPPRESS,
        required=False,
        action="store_true",
    )
    parsed = cli.parse_args(argv)
    LBScore(parsed).run()
1678
+
1679
@staticmethod
def monophyly_check(argv):
    """Parse ``argv`` for the monophyly_check command and run it.

    Registers the positional ``tree`` and ``list_of_taxa`` arguments, then
    passes the parsed namespace to ``MonophylyCheck`` and calls ``run()``.

    Args:
        argv: argument list handed to ``parser.parse_args``.
    """
    parser = ArgumentParser(
        add_help=True,
        usage=SUPPRESS,
        formatter_class=RawDescriptionHelpFormatter,
        description=textwrap.dedent(
            f"""\
            {help_header}
            Check for monophyly of a lineage.

            This analysis can be used to determine if a set of
            taxa are monophyletic.

            Requires a taxa file, which specifies which tip names
            are expected to be monophyletic. File format is a
            single column file with tip names. Tip names not
            present in the tree will not be considered when
            examining monophyly.

            The output will have six columns.
            col 1: if the clade was or wasn't monophyletic
            col 2: average bipartition support value in the clade of interest
            col 3: maximum bipartition support value in the clade of interest
            col 4: minimum bipartition support value in the clade of interest
            col 5: standard deviation of bipartition support values in the clade of interest
            col 6: tip names of taxa monophyletic with the lineage of interest
                   excluding those that are listed in the taxa_of_interest file

            Aliases:
              monophyly_check, is_monophyletic
            Command line interfaces:
              pk_monophyly_check, pk_is_monophyletic

            Usage:
            phykit monophyly_check <tree> <list_of_taxa>

            Options
            =====================================================
            <tree>                      first argument after
                                        function name should be
                                        a tree file

            <list_of_taxa>              single column file with
                                        list of tip names to
                                        examine the monophyly of
            """
        ),
    )
    parser.add_argument("tree", type=str, help=SUPPRESS)
    parser.add_argument("list_of_taxa", type=str, help=SUPPRESS)
    args = parser.parse_args(argv)
    MonophylyCheck(args).run()
1732
+
1733
@staticmethod
def nearest_neighbor_interchange(argv):
    """Build the nearest_neighbor_interchange parser, parse ``argv``, and run it.

    Accepts a positional ``tree`` and an optional ``-o/--output`` string;
    the resulting namespace is handed to ``NearestNeighborInterchange``.
    """
    parser_options = {
        "add_help": True,
        "usage": SUPPRESS,
        "formatter_class": RawDescriptionHelpFormatter,
        "description": textwrap.dedent(
            f"""\
            {help_header}

            Generate all nearest neighbor interchange moves for a binary
            rooted tree.

            The output file will also include the original phylogeny.

            Aliases:
              nearest_neighbor_interchange, nni
            Command line interfaces:
              pk_nearest_neighbor_interchange, pk_nni

            Usage:
            phykit nearest_neighbor_interchange <tree> [-o/--output <output_file>]

            Options
            =====================================================
            <tree>                      first argument after
                                        function name should be
                                        a tree file

            -o/--output                 name of output file that will
                                        contain all trees with the
                                        nearest neighbor interchange
                                        moves.
                                        Default output will have
                                        the same name as the input
                                        file but with the suffix
                                        ".NNIs"
            """
        ),
    }
    cli = ArgumentParser(**parser_options)
    cli.add_argument("tree", help=SUPPRESS, type=str)
    cli.add_argument("-o", "--output", help=SUPPRESS, required=False, type=str)
    parsed = cli.parse_args(argv)
    NearestNeighborInterchange(parsed).run()
1777
+
1778
@staticmethod
def patristic_distances(argv):
    """Build the patristic_distances parser, parse ``argv``, and run the service.

    Accepts a positional ``tree`` and a boolean ``-v/--verbose`` flag; the
    resulting namespace is handed to ``PatristicDistances``.
    """
    parser_options = {
        "add_help": True,
        "usage": SUPPRESS,
        "formatter_class": RawDescriptionHelpFormatter,
        "description": textwrap.dedent(
            f"""\
            {help_header}

            Calculate summary statistics among patristic distances in a phylogeny.

            Patristic distances are all tip-to-tip distances in a phylogeny.

            To obtain all patristic distances, use the -v/--verbose option.
            With the -v option, the first column will have two taxon names
            separated by a '-' followed by the patristic distance. Features
            will be tab separated.

            Aliases:
              patristic_distances, pd
            Command line interfaces:
              pk_patristic_distances, pk_pd

            Usage:
            phykit patristic_distances <tree> [-v/--verbose]

            Options
            =====================================================
            <tree>                      first argument after
                                        function name should be
                                        a tree file

            -v/--verbose                optional argument to print
                                        all patristic distances between
                                        taxa
            """
        ),
    }
    cli = ArgumentParser(**parser_options)
    cli.add_argument("tree", help=SUPPRESS, type=str)
    cli.add_argument(
        "-v",
        "--verbose",
        help=SUPPRESS,
        required=False,
        action="store_true",
    )
    parsed = cli.parse_args(argv)
    PatristicDistances(parsed).run()
1823
+
1824
@staticmethod
def polytomy_test(argv):
    """Parse ``argv`` for the polytomy_test command and run it.

    Registers the ``-t/--trees`` and ``-g/--groups`` options, then passes
    the parsed namespace to ``PolytomyTest`` and calls ``run()``.

    Args:
        argv: argument list handed to ``parser.parse_args``.
    """
    parser = ArgumentParser(
        add_help=True,
        usage=SUPPRESS,
        formatter_class=RawDescriptionHelpFormatter,
        description=textwrap.dedent(
            f"""\
            {help_header}

            Conduct a polytomy test for three clades in a phylogeny.

            Polytomy tests can be used to identify putative radiations
            as well as identify well supported alternative topologies.

            The polytomy testing function takes as input a file with
            the three groups of taxa to test the relationships for and
            a single column file with the names of the desired tree files
            to use for polytomy testing. Next, the script examines
            support for the grouping of the three taxa using triplets
            and gene support frequencies.

            This function can account for uncertainty in gene trees -
            that is, the input phylogenies can have collapsed bipartitions.

            Thereafter, a chi-squared test is conducted to determine if there
            is evidence to reject the null hypothesis wherein the null
            hypothesis is that the three possible topologies among the three
            groups are equally supported. This test is done using gene support
            frequencies.

            Aliases:
              polytomy_test, polyt_test, polyt, ptt
            Command line interfaces:
              pk_polytomy_test, pk_polyt_test, pk_polyt, pk_ptt

            Usage:
            phykit polytomy_test -t/--trees <trees> -g/--groups <groups>

            Options
            =====================================================
            -t/--trees                  single column file with names
                                        of phylogenies to use for
                                        polytomy testing

            -g/--groups                 a tab-delimited file with the
                                        grouping designations to test.
                                        Lines starting with comments
                                        are not considered. Names
                                        of individual taxa should be
                                        separated by a semi-colon ';'

            For example, the groups file could look like the following:
            #labels group0 group1 group2
            name_of_test tip_name_A;tip_name_B tip_name_C tip_name_D;tip_name_E
            """
        ),
    )
    # Both options appear without brackets in the usage line and are the
    # command's only inputs, so enforce them at parse time instead of
    # passing None values on to the service.
    parser.add_argument("-t", "--trees", type=str, required=True, help=SUPPRESS)
    parser.add_argument("-g", "--groups", type=str, required=True, help=SUPPRESS)
    args = parser.parse_args(argv)
    PolytomyTest(args).run()
1886
+
1887
@staticmethod
def print_tree(argv):
    """Parse ``argv`` for the print_tree command and run ``PrintTree``.

    Registers one positional ``tree`` argument and an optional
    ``-r/--remove`` flag.

    Args:
        argv: Argument strings passed to ``ArgumentParser.parse_args``.
    """
    # The literal keeps its deep indentation on purpose: dedent() runs after
    # {help_header} is interpolated, so re-indenting could alter the margin.
    help_text = textwrap.dedent(
        f"""\
                {help_header}

                Print ascii tree of input phylogeny.

                Phylogeny can be printed with or without branch lengths.
                By default, the phylogeny will be printed with branch lengths
                but branch lengths can be removed using the -r/--remove argument.

                Aliases:
                  print_tree, print, pt
                Command line interfaces:
                  pk_print_tree, pk_print, pk_pt

                Usage:
                phykit print_tree <tree> [-r/--remove]

                Options
                =====================================================
                <tree>                      first argument after
                                            function name should be
                                            a tree file

                -r/--remove                 optional argument to print
                                            the phylogeny without branch
                                            lengths
                """
    )
    cli = ArgumentParser(
        description=help_text,
        formatter_class=RawDescriptionHelpFormatter,
        usage=SUPPRESS,
        add_help=True,
    )
    cli.add_argument("tree", type=str, help=SUPPRESS)
    cli.add_argument(
        "-r", "--remove", action="store_true", required=False, help=SUPPRESS
    )
    namespace = cli.parse_args(argv)
    PrintTree(namespace).run()
1929
+
1930
@staticmethod
def prune_tree(argv):
    """Parse ``argv`` for the prune_tree command and run ``PruneTree``.

    ``-k/--keep`` is declared with ``nargs="?"``. Without a ``const`` a bare
    ``-k`` stores ``None``, which is falsy, so the documented flag usage
    would not switch on keep mode. ``const=True`` makes a bare ``-k`` mean
    True, while ``-k <value>`` still goes through ``str2bool`` and an absent
    flag still yields the default ``False``.

    Args:
        argv: Argument strings passed to ``ArgumentParser.parse_args``.
    """
    # The literal keeps its deep indentation on purpose: dedent() runs after
    # {help_header} is interpolated, so re-indenting could alter the margin.
    parser = ArgumentParser(
        add_help=True,
        usage=SUPPRESS,
        formatter_class=RawDescriptionHelpFormatter,
        description=textwrap.dedent(
            f"""\
                {help_header}

                Prune tips from a phylogeny.

                Provide a single column file with the names of the tips
                in the input phylogeny you would like to prune from the
                tree.

                Aliases:
                  prune_tree, prune
                Command line interfaces:
                  pk_prune_tree, pk_prune

                Usage:
                phykit prune_tree <tree> <list_of_taxa> [-o/--output <output_file>
                  -k/--keep]

                Options
                =====================================================
                <tree>                      first argument after
                                            function name should be
                                            a tree file

                <list_of_taxa>              single column file with the
                                            names of the tips to remove
                                            from the phylogeny

                -o/--output                 name of output file for the
                                            pruned phylogeny.
                                            Default output will have
                                            the same name as the input
                                            file but with the suffix
                                            ".pruned"

                -k/--keep                   optional argument. If used
                                            instead of pruning taxa in
                                            <list_of_taxa>, keep them
                """
        ),
    )
    parser.add_argument("tree", type=str, help=SUPPRESS)
    parser.add_argument("list_of_taxa", type=str, help=SUPPRESS)
    parser.add_argument("-o", "--output", type=str, required=False, help=SUPPRESS)
    parser.add_argument(
        "-k",
        "--keep",
        type=str2bool,
        nargs="?",
        const=True,  # value stored when -k is given without an argument
        default=False,
        help=SUPPRESS,
    )
    args = parser.parse_args(argv)
    PruneTree(args).run()
1986
+
1987
@staticmethod
def rename_tree_tips(argv):
    """Parse ``argv`` for the rename_tree_tips command and run ``RenameTreeTips``.

    ``-i/--idmap`` is shown as mandatory in the usage line, so it is declared
    ``required=True`` (as ``root_tree`` does for ``-r``); a missing map is
    then reported by argparse rather than passed on as ``None``.

    Args:
        argv: Argument strings passed to ``ArgumentParser.parse_args``.
    """
    # The literal keeps its deep indentation on purpose: dedent() runs after
    # {help_header} is interpolated, so re-indenting could alter the margin.
    parser = ArgumentParser(
        add_help=True,
        usage=SUPPRESS,
        formatter_class=RawDescriptionHelpFormatter,
        description=textwrap.dedent(
            f"""\
                {help_header}

                Renames tips in a phylogeny.

                Renaming tip files will follow the scheme of a tab-delimited
                file wherein the first column is the current tip name and the
                second column is the desired tip name in the resulting
                phylogeny.

                Aliases:
                  rename_tree_tips, rename_tree, rename_tips
                Command line interfaces:
                  pk_rename_tree_tips, pk_rename_tree, pk_rename_tips

                Usage:
                phykit rename_tree_tips <tree> -i/--idmap <idmap.txt>
                  [-o/--output <output_file>]

                Options
                =====================================================
                <tree>                      first argument after
                                            function name should be
                                            a tree file

                -i/--idmap                  identifier map of current tip
                                            names (col1) and desired tip
                                            names (col2)

                -o/--output                 optional argument to write
                                            the renamed tree files to.
                                            Default output will have
                                            the same name as the input
                                            file but with the suffix
                                            ".renamed"
                """
        ),
    )
    parser.add_argument("tree", type=str, help=SUPPRESS)
    parser.add_argument("-i", "--idmap", type=str, required=True, help=SUPPRESS)
    parser.add_argument("-o", "--output", type=str, required=False, help=SUPPRESS)
    args = parser.parse_args(argv)
    RenameTreeTips(args).run()
2037
+
2038
@staticmethod
def rf_distance(argv):
    """Parse ``argv`` for the rf_distance command and run ``RobinsonFouldsDistance``.

    Registers two positional arguments, ``tree_zero`` and ``tree_one``.
    The help text now labels both output columns with a colon ("col 1:",
    "col 2:"); the first one previously used a semicolon.

    Args:
        argv: Argument strings passed to ``ArgumentParser.parse_args``.
    """
    # The literal keeps its deep indentation on purpose: dedent() runs after
    # {help_header} is interpolated, so re-indenting could alter the margin.
    parser = ArgumentParser(
        add_help=True,
        usage=SUPPRESS,
        formatter_class=RawDescriptionHelpFormatter,
        description=textwrap.dedent(
            f"""\
                {help_header}

                Calculate Robinson-Foulds (RF) distance between two trees.

                Low RF distances reflect greater similarity between two phylogenies.
                This function prints out two values, the plain RF value and the
                normalized RF value, which are separated by a tab. Normalized RF values
                are calculated by taking the plain RF value and dividing it by 2(n-3)
                where n is the number of tips in the phylogeny. Prior to calculating
                an RF value, PhyKIT will first determine the number of shared tips
                between the two input phylogenies and prune them to a common set of
                tips. Thus, users can input trees with different topologies and
                infer an RF value among subtrees with shared tips.

                PhyKIT will print out
                col 1: the plain RF distance and
                col 2: the normalized RF distance.

                RF distances are calculated following Robinson & Foulds, Mathematical
                Biosciences (1981), doi: 10.1016/0025-5564(81)90043-2.

                Aliases:
                  robinson_foulds_distance, rf_distance, rf_dist, rf
                Command line interfaces:
                  pk_robinson_foulds_distance, pk_rf_distance, pk_rf_dist, pk_rf

                Usage:
                phykit robinson_foulds_distance <tree_file_zero> <tree_file_one>

                Options
                =====================================================
                <tree_file_zero>            first argument after
                                            function name should be
                                            a tree file

                <tree_file_one>             second argument after
                                            function name should be
                                            a tree file
                """
        ),
    )
    parser.add_argument("tree_zero", type=str, help=SUPPRESS)
    parser.add_argument("tree_one", type=str, help=SUPPRESS)
    args = parser.parse_args(argv)
    RobinsonFouldsDistance(args).run()
2091
+
2092
@staticmethod
def root_tree(argv):
    """Parse ``argv`` for the root_tree command and run ``RootTree``.

    Registers a positional ``tree`` argument, a required ``-r/--root``
    option and an optional ``-o/--output`` option.

    Args:
        argv: Argument strings passed to ``ArgumentParser.parse_args``.
    """
    # The literal keeps its deep indentation on purpose: dedent() runs after
    # {help_header} is interpolated, so re-indenting could alter the margin.
    help_text = textwrap.dedent(
        f"""\
                {help_header}

                Roots phylogeny using user-specified taxa.

                A list of taxa to root the phylogeny on should be
                specified using the -r argument. The root_taxa file
                should be a single-column file with taxa names. The
                outputted file will have the same name as the inputted
                tree file but with the suffix ".rooted".

                Aliases:
                  root_tree, root, rt
                Command line interfaces:
                  pk_root_tree, pk_root, pk_rt

                Usage:
                phykit root_tree <tree> -r/--root <root_taxa>
                  [-o/--output <output_file>]

                Options
                =====================================================
                <tree>                      first argument after
                                            function name should be
                                            a tree file

                -r/--root                   single column file with
                                            tip names of root taxa

                -o/--output                 optional argument to write
                                            the rooted tree file to.
                                            Default output will have
                                            the same name as the input
                                            file but with the suffix
                                            ".rooted"
                """
    )
    cli = ArgumentParser(
        description=help_text,
        formatter_class=RawDescriptionHelpFormatter,
        usage=SUPPRESS,
        add_help=True,
    )
    cli.add_argument("tree", type=str, help=SUPPRESS)
    cli.add_argument("-r", "--root", type=str, required=True, help=SUPPRESS)
    cli.add_argument("-o", "--output", type=str, required=False, help=SUPPRESS)
    RootTree(cli.parse_args(argv)).run()
2142
+
2143
@staticmethod
def spurious_sequence(argv):
    """Parse ``argv`` for the spurious_sequence command and run ``SpuriousSequence``.

    Help-text fixes only: "an tree file" becomes "a tree file", and the
    threshold is described as "equal to or greater than" 20 times the median.

    Args:
        argv: Argument strings passed to ``ArgumentParser.parse_args``.
    """
    # The literal keeps its deep indentation on purpose: dedent() runs after
    # {help_header} is interpolated, so re-indenting could alter the margin.
    parser = ArgumentParser(
        add_help=True,
        usage=SUPPRESS,
        formatter_class=RawDescriptionHelpFormatter,
        description=textwrap.dedent(
            f"""\
                {help_header}

                Determines potentially spurious homologs using branch lengths.

                Identifies potentially spurious sequences and reports
                tips in the phylogeny that could possibly be removed
                from the associated multiple sequence alignment. PhyKIT
                does so by identifying and reporting long terminal branches
                defined as branches that are equal to or greater than 20 times
                the median length of all branches.

                PhyKIT reports the following information
                col1: name of tip that is a putatively spurious sequence
                col2: length of branch leading to putatively spurious sequence
                col3: threshold used to identify putatively spurious sequences
                col4: median branch length in the phylogeny

                If there are no putatively spurious sequences, "None" is reported.

                Using this method to identify potentially spurious sequences
                was, to my knowledge, first introduced by Shen et al., (2018)
                Cell doi: 10.1016/j.cell.2018.10.023.

                Aliases:
                  spurious_sequence, spurious_seq, ss
                Command line interfaces:
                  pk_spurious_sequence, pk_spurious_seq, pk_ss

                Usage:
                phykit spurious_sequence <file> [-f 20]

                Options
                =====================================================
                <file>                      first argument after
                                            function name should be
                                            a tree file

                -f/--factor                 factor to multiply median
                                            branch length by to calculate
                                            the threshold of long branches.
                                            (Default: 20)
                """
        ),
    )
    parser.add_argument("tree", type=str, help=SUPPRESS)
    # No default is set here, so args.factor is None when -f is omitted.
    # NOTE(review): presumably SpuriousSequence applies the documented
    # default of 20 itself — confirm.
    parser.add_argument("-f", "--factor", type=float, required=False, help=SUPPRESS)
    args = parser.parse_args(argv)
    SpuriousSequence(args).run()
2199
+
2200
@staticmethod
def terminal_branch_stats(argv):
    """Parse ``argv`` for the terminal_branch_stats command and run ``TerminalBranchStats``.

    The ``-v/--verbose`` help entry now says "terminal" branch lengths; it
    previously said "internal", which contradicted the rest of this command's
    description.

    Args:
        argv: Argument strings passed to ``ArgumentParser.parse_args``.
    """
    # The literal keeps its deep indentation on purpose: dedent() runs after
    # {help_header} is interpolated, so re-indenting could alter the margin.
    parser = ArgumentParser(
        add_help=True,
        usage=SUPPRESS,
        formatter_class=RawDescriptionHelpFormatter,
        description=textwrap.dedent(
            f"""\
                {help_header}

                Calculate summary statistics for terminal branch lengths in a phylogeny.

                Terminal branch lengths can be useful for phylogeny diagnostics.

                To obtain all terminal branch lengths, use the -v/--verbose option.

                Aliases:
                  terminal_branch_stats, tbs
                Command line interfaces:
                  pk_terminal_branch_stats, pk_tbs

                Usage:
                phykit terminal_branch_stats <tree> [-v/--verbose]

                Options
                =====================================================
                <tree>                      first argument after
                                            function name should be
                                            a tree file

                -v/--verbose                optional argument to print
                                            all terminal branch lengths
                """
        ),
    )
    parser.add_argument("tree", type=str, help=SUPPRESS)
    parser.add_argument(
        "-v", "--verbose", action="store_true", required=False, help=SUPPRESS
    )
    args = parser.parse_args(argv)
    TerminalBranchStats(args).run()
2241
+
2242
@staticmethod
def tip_labels(argv):
    """Parse ``argv`` for the tip_labels command and run ``TipLabels``.

    Help-text fixes only: adds the missing "of" in the summary sentence and
    separates the alias lists with commas, as the sibling commands do.

    Args:
        argv: Argument strings passed to ``ArgumentParser.parse_args``.
    """
    # The literal keeps its deep indentation on purpose: dedent() runs after
    # {help_header} is interpolated, so re-indenting could alter the margin.
    parser = ArgumentParser(
        add_help=True,
        usage=SUPPRESS,
        formatter_class=RawDescriptionHelpFormatter,
        description=textwrap.dedent(
            f"""\
                {help_header}

                Prints the tip labels (or names) of a phylogeny.

                Aliases:
                  tip_labels, tree_labels, labels, tl
                Command line interfaces:
                  pk_tip_labels, pk_tree_labels, pk_labels, pk_tl

                Usage:
                phykit tip_labels <tree>

                Options
                =====================================================
                <tree>                      first argument after
                                            function name should be
                                            a tree file
                """
        ),
    )
    parser.add_argument("tree", type=str, help=SUPPRESS)
    args = parser.parse_args(argv)
    TipLabels(args).run()
2273
+
2274
@staticmethod
def tip_to_tip_distance(argv):
    """Parse ``argv`` for the tip_to_tip_distance command and run ``TipToTipDistance``.

    Registers three positional string arguments, in order: ``tree_zero``,
    ``tip_1`` and ``tip_2``.

    Args:
        argv: Argument strings passed to ``ArgumentParser.parse_args``.
    """
    # The literal keeps its deep indentation on purpose: dedent() runs after
    # {help_header} is interpolated, so re-indenting could alter the margin.
    help_text = textwrap.dedent(
        f"""\
                {help_header}

                Calculate distance between two tips (or leaves) in a phylogeny.

                Distances are in substitutions per site.

                Aliases:
                  tip_to_tip_distance, t2t_dist, t2t
                Command line interfaces:
                  pk_tip_to_tip_distance, pk_t2t_dist, pk_t2t

                Usage:
                phykit tip_to_tip_distance <tree_file> <tip_1> <tip_2>

                Options
                =====================================================
                <tree_file>                 first argument after
                                            function name should be
                                            a tree file

                <tip_1>                     second argument after
                                            function name should be
                                            one of the tip names

                <tip_2>                     third argument after
                                            function name should be
                                            the second tip name
                """
    )
    cli = ArgumentParser(
        description=help_text,
        formatter_class=RawDescriptionHelpFormatter,
        usage=SUPPRESS,
        add_help=True,
    )
    # Positionals are registered in the order they must appear on the command line.
    for positional in ("tree_zero", "tip_1", "tip_2"):
        cli.add_argument(positional, type=str, help=SUPPRESS)
    TipToTipDistance(cli.parse_args(argv)).run()
2317
+
2318
@staticmethod
def tip_to_tip_node_distance(argv):
    """Parse ``argv`` for the tip_to_tip_node_distance command and run ``TipToTipNodeDistance``.

    Registers three positional string arguments, in order: ``tree_zero``,
    ``tip_1`` and ``tip_2``.

    Args:
        argv: Argument strings passed to ``ArgumentParser.parse_args``.
    """
    # The literal keeps its deep indentation on purpose: dedent() runs after
    # {help_header} is interpolated, so re-indenting could alter the margin.
    help_text = textwrap.dedent(
        f"""\
                {help_header}

                Calculate distance between two tips (or leaves) in a phylogeny.

                Distance is measured by the number of nodes between one tip
                and another.

                Aliases:
                  tip_to_tip_node_distance, t2t_node_dist, t2t_nd
                Command line interfaces:
                  pk_tip_to_tip_node_distance, pk_t2t_node_dist, pk_t2t_nd

                Usage:
                phykit tip_to_tip_node_distance <tree_file> <tip_1> <tip_2>

                Options
                =====================================================
                <tree_file>                 first argument after
                                            function name should be
                                            a tree file

                <tip_1>                     second argument after
                                            function name should be
                                            one of the tip names

                <tip_2>                     third argument after
                                            function name should be
                                            the second tip name
                """
    )
    cli = ArgumentParser(
        description=help_text,
        formatter_class=RawDescriptionHelpFormatter,
        usage=SUPPRESS,
        add_help=True,
    )
    # Positionals are registered in the order they must appear on the command line.
    for positional in ("tree_zero", "tip_1", "tip_2"):
        cli.add_argument(positional, type=str, help=SUPPRESS)
    namespace = cli.parse_args(argv)
    TipToTipNodeDistance(namespace).run()
2362
+
2363
@staticmethod
def total_tree_length(argv):
    """Parse ``argv`` for the total_tree_length command and run ``TotalTreeLength``.

    Registers a single positional ``tree`` argument.

    Args:
        argv: Argument strings passed to ``ArgumentParser.parse_args``.
    """
    # The literal keeps its deep indentation on purpose: dedent() runs after
    # {help_header} is interpolated, so re-indenting could alter the margin.
    help_text = textwrap.dedent(
        f"""\
                {help_header}

                Calculate total tree length, which is a sum of all branches.

                Aliases:
                  total_tree_length, tree_len
                Command line interfaces:
                  pk_total_tree_length, pk_tree_len

                Usage:
                phykit total_tree_length <tree>

                Options
                =====================================================
                <tree>                      first argument after
                                            function name should be
                                            a tree file
                """
    )
    cli = ArgumentParser(
        description=help_text,
        formatter_class=RawDescriptionHelpFormatter,
        usage=SUPPRESS,
        add_help=True,
    )
    cli.add_argument("tree", type=str, help=SUPPRESS)
    TotalTreeLength(cli.parse_args(argv)).run()
2394
+
2395
@staticmethod
def treeness(argv):
    """Parse ``argv`` for the treeness command and run ``Treeness``.

    Registers a single positional ``tree`` argument.

    Args:
        argv: Argument strings passed to ``ArgumentParser.parse_args``.
    """
    # The literal keeps its deep indentation on purpose: dedent() runs after
    # {help_header} is interpolated, so re-indenting could alter the margin.
    help_text = textwrap.dedent(
        f"""\
                {help_header}

                Calculate treeness statistic for a phylogeny.

                Higher treeness values are thought to be desirable because they
                represent a higher signal-to-noise ratio.

                Treeness describes the proportion of the tree distance found on
                internal branches. Treeness can be used as a measure of the
                signal-to-noise ratio in a phylogeny.

                Calculate treeness (also referred to as stemminess) following
                Lanyon, The Auk (1988), doi: 10.1093/auk/105.3.565 and
                Phillips and Penny, Molecular Phylogenetics and Evolution
                (2003), doi: 10.1016/S1055-7903(03)00057-5.

                Aliases:
                  treeness, tness
                Command line interfaces:
                  pk_treeness, pk_tness

                Usage:
                phykit treeness <tree>

                Options
                =====================================================
                <tree>                      first argument after
                                            function name should be
                                            a tree file
                """
    )
    cli = ArgumentParser(
        description=help_text,
        formatter_class=RawDescriptionHelpFormatter,
        usage=SUPPRESS,
        add_help=True,
    )
    cli.add_argument("tree", type=str, help=SUPPRESS)
    namespace = cli.parse_args(argv)
    Treeness(namespace).run()
2438
+
2439
+ ## Alignment and tree functions
2440
@staticmethod
def saturation(argv):
    """Parse ``argv`` for the saturation command and run ``Saturation``.

    ``-a/--alignment`` and ``-t/--tree`` are required; ``-v/--verbose`` and
    ``-e/--exclude_gaps`` are optional flags. The usage line now lists
    ``-e/--exclude_gaps``, which the parser accepts and the options table
    documents but the usage line previously omitted.

    Args:
        argv: Argument strings passed to ``ArgumentParser.parse_args``.
    """
    # The literal keeps its deep indentation on purpose: dedent() runs after
    # {help_header} is interpolated, so re-indenting could alter the margin.
    parser = ArgumentParser(
        add_help=True,
        usage=SUPPRESS,
        formatter_class=RawDescriptionHelpFormatter,
        description=textwrap.dedent(
            f"""\
                {help_header}

                Calculate saturation for a given tree and alignment.

                Saturation is defined as sequences in multiple sequence
                alignments that have undergone numerous substitutions such
                that the distances between taxa are underestimated.

                Data with no saturation will have a value of 1. The closer
                the value is to 1, the less saturated the data.

                This function outputs two values (as of v1.19.9). The first
                value is the saturation value and the second column is the absolute
                value of saturation minus 1. Thus, lower values in the second column
                are indicative of values closer to one and, thus, less saturation.

                Saturation is calculated following Philippe et al., PLoS
                Biology (2011), doi: 10.1371/journal.pbio.1000602.

                Aliases:
                  saturation, sat
                Command line interfaces:
                  pk_saturation, pk_sat

                Usage:
                phykit saturation -a <alignment> -t <tree> [-e/--exclude_gaps]
                  [-v/--verbose]

                Options
                =====================================================
                -a/--alignment              an alignment file

                -t/--tree                   a tree file

                -e/--exclude_gaps           if a site has a gap, ignore it

                -v/--verbose                print out patristic distances
                                            and uncorrected distances used
                                            to determine saturation
                """
        ),
    )
    parser.add_argument(
        "-a", "--alignment", type=str, required=True, help=SUPPRESS, metavar=""
    )
    parser.add_argument(
        "-t", "--tree", type=str, required=True, help=SUPPRESS, metavar=""
    )
    parser.add_argument(
        "-v", "--verbose", action="store_true", required=False, help=SUPPRESS
    )
    parser.add_argument(
        "-e", "--exclude_gaps", action="store_true", required=False, help=SUPPRESS
    )
    args = parser.parse_args(argv)
    Saturation(args).run()
2503
+
2504
@staticmethod
def treeness_over_rcv(argv):
    """Parse ``argv`` for the treeness_over_rcv command and run ``TreenessOverRCV``.

    ``-a/--alignment`` and ``-t/--tree`` are both required. The help text
    fixes a dropped conjunction ("ratio and are least susceptible").

    Args:
        argv: Argument strings passed to ``ArgumentParser.parse_args``.
    """
    # The literal keeps its deep indentation on purpose: dedent() runs after
    # {help_header} is interpolated, so re-indenting could alter the margin.
    parser = ArgumentParser(
        add_help=True,
        usage=SUPPRESS,
        formatter_class=RawDescriptionHelpFormatter,
        description=textwrap.dedent(
            f"""\
                {help_header}

                Calculate treeness/RCV for a given alignment and tree.

                Higher treeness/RCV values are thought to be desirable because
                they harbor a high signal-to-noise ratio and are least susceptible
                to composition bias.

                PhyKIT reports three tab delimited values:
                col1: treeness/RCV
                col2: treeness
                col3: RCV

                Calculate treeness/RCV following Phillips and Penny, Molecular
                Phylogenetics and Evolution (2003), doi: 10.1016/S1055-7903(03)00057-5.

                Aliases:
                  treeness_over_rcv, toverr, tor
                Command line interfaces:
                  pk_treeness_over_rcv, pk_toverr, pk_tor

                Usage:
                phykit treeness_over_rcv -a/--alignment <alignment> -t/--tree <tree>

                Options
                =====================================================
                -a/--alignment              an alignment file

                -t/--tree                   a tree file
                """
        ),
    )
    parser.add_argument(
        "-a", "--alignment", type=str, required=True, help=SUPPRESS, metavar=""
    )
    parser.add_argument(
        "-t", "--tree", type=str, required=True, help=SUPPRESS, metavar=""
    )
    args = parser.parse_args(argv)
    TreenessOverRCV(args).run()
2552
+
2553
+ ### Helper commands
2554
@staticmethod
def create_concatenation_matrix(argv):
    """Parse ``argv`` for create_concatenation_matrix and run ``CreateConcatenationMatrix``.

    The usage line shows ``-a`` and ``-p`` as mandatory, so both are declared
    ``required=True``; a missing option is then reported by argparse instead
    of reaching the service as ``None``.

    Args:
        argv: Argument strings passed to ``ArgumentParser.parse_args``.
    """
    # The literal keeps its deep indentation on purpose: dedent() runs after
    # {help_header} is interpolated, so re-indenting could alter the margin.
    parser = ArgumentParser(
        add_help=True,
        usage=SUPPRESS,
        formatter_class=RawDescriptionHelpFormatter,
        description=textwrap.dedent(
            f"""\
                {help_header}

                Create a concatenated alignment file. This function is
                used to help in the construction of multi-locus data
                matrices.

                PhyKIT will output three files:
                1) A fasta file with '.fa' appended to the prefix specified
                   with the -p/--prefix parameter.
                2) A partition file ready for input into RAxML or IQ-tree.
                3) An occupancy file that summarizes the taxon occupancy
                   per sequence.
                   - column 1: alignment name
                   - column 2: # of taxa present
                   - column 3: # of taxa missing
                   - column 4: fraction of occupancy
                   - column 5: names of missing taxa (; separated)

                Aliases:
                  create_concatenation_matrix, create_concat, cc
                Command line interfaces:
                  pk_create_concatenation_matrix, pk_create_concat, pk_cc

                Usage:
                phykit create_concatenation_matrix -a <file> -p <string>

                Options
                =====================================================
                -a/--alignment              alignment list file. File
                                            should contain a single
                                            column list of alignment
                                            sequence files to concatenate
                                            into a single matrix. Provide
                                            path to files relative to
                                            working directory or provide
                                            absolute path.

                -p/--prefix                 prefix of output files
                """
        ),
    )
    # The long option is registered as --alignment_list; the documented
    # --alignment spelling is accepted through argparse's prefix matching.
    parser.add_argument(
        "-a", "--alignment_list", type=str, required=True, help=SUPPRESS
    )
    parser.add_argument("-p", "--prefix", type=str, required=True, help=SUPPRESS)
    args = parser.parse_args(argv)
    CreateConcatenationMatrix(args).run()
2607
+
2608
@staticmethod
def thread_dna(argv):
    """Parse ``argv`` for the thread_dna command and run ``DNAThreader``.

    Changes from the previous version:
      * the options table names the log option ``-c/--clipkit_log_file``,
        matching the registered option and the usage line;
      * ``-p`` and ``-n`` are declared ``required=True`` because the usage
        line shows them as mandatory.

    Args:
        argv: Argument strings passed to ``ArgumentParser.parse_args``.
    """
    # The literal keeps its deep indentation on purpose: dedent() runs after
    # {help_header} is interpolated, so re-indenting could alter the margin.
    parser = ArgumentParser(
        add_help=True,
        usage=SUPPRESS,
        formatter_class=RawDescriptionHelpFormatter,
        description=textwrap.dedent(
            f"""\
                {help_header}

                Thread DNA sequence onto a protein alignment to create a
                codon-based alignment.

                This function requires input alignments are in fasta format.
                Codon alignments are then printed to stdout. Note, paired
                sequences are assumed to have the same name between the
                protein and nucleotide file. The order does not matter.

                To thread nucleotide sequences over a trimmed amino acid
                alignment, provide PhyKIT with a log file specifying which
                sites have been trimmed and which have been kept. The log
                file must be formatted the same as the log files outputted
                by the alignment trimming toolkit ClipKIT (see -l in ClipKIT
                documentation.) Details about ClipKIT can be seen here:
                https://github.com/JLSteenwyk/ClipKIT.

                If using a ClipKIT log file, the untrimmed protein alignment
                should be provided in the -p/--protein argument.

                Aliases:
                  thread_dna, pal2nal, p2n
                Command line interfaces:
                  pk_thread_dna, pk_pal2nal, pk_p2n

                Usage:
                phykit thread_dna -p <file> -n <file> [-c/--clipkit_log_file
                  <clipkit outputted log file> -s]

                Options
                =====================================================
                -p/--protein                protein alignment file

                -n/--nucleotide             nucleotide sequence file

                -c/--clipkit_log_file       clipkit outputted log file

                -s/--stop                   boolean for whether or not
                                            stop codons should be kept.
                                            If used, stop codons will
                                            be removed.
                """
        ),
    )
    parser.add_argument("-p", "--protein", type=str, required=True, help=SUPPRESS)
    parser.add_argument(
        "-n", "--nucleotide", type=str, required=True, help=SUPPRESS
    )
    parser.add_argument(
        "-c",
        "--clipkit_log_file",
        type=str,
        required=False,
        help=SUPPRESS,
    )
    # With nargs="?" and no const, a bare -s stores None (falsy); the default
    # when -s is absent is True.
    # NOTE(review): confirm DNAThreader treats None the same as False.
    parser.add_argument(
        "-s", "--stop", type=str2bool, nargs="?", default=True, help=SUPPRESS
    )
    args = parser.parse_args(argv)
    DNAThreader(args).run()
2675
+
2676
+
2677
def main(argv=None):
    """Instantiate ``Phykit``; that is the only action taken here.

    ``argv`` is accepted but never read in this function.
    NOTE(review): presumably ``Phykit.__init__`` reads ``sys.argv`` and
    dispatches to a sub-command itself — confirm against the class.
    """
    Phykit()
2679
+
2680
+
2681
+ # Alignment-based functions
2682
def alignment_length(argv=None):
    """Forward command-line arguments to ``Phykit.alignment_length``.

    Args:
        argv: Argument list to parse; ``sys.argv[1:]`` is used when ``None``.
            Previously this parameter was accepted but ignored.
    """
    Phykit.alignment_length(sys.argv[1:] if argv is None else argv)
2684
+
2685
+
2686
def alignment_length_no_gaps(argv=None):
    """Forward command-line arguments to ``Phykit.alignment_length_no_gaps``.

    Args:
        argv: Argument list to parse; ``sys.argv[1:]`` is used when ``None``.
            Previously this parameter was accepted but ignored.
    """
    Phykit.alignment_length_no_gaps(sys.argv[1:] if argv is None else argv)
2688
+
2689
+
2690
def column_score(argv=None):
    """Forward command-line arguments to ``Phykit.column_score``.

    Args:
        argv: Argument list to parse; ``sys.argv[1:]`` is used when ``None``.
            Previously this parameter was accepted but ignored.
    """
    Phykit.column_score(sys.argv[1:] if argv is None else argv)
2692
+
2693
+
2694
def compositional_bias_per_site(argv=None):
    """Forward command-line arguments to ``Phykit.compositional_bias_per_site``.

    Args:
        argv: Argument list to parse; ``sys.argv[1:]`` is used when ``None``.
            Previously this parameter was accepted but ignored.
    """
    Phykit.compositional_bias_per_site(sys.argv[1:] if argv is None else argv)
2696
+
2697
+
2698
def evolutionary_rate_per_site(argv=None):
    """Forward command-line arguments to ``Phykit.evolutionary_rate_per_site``.

    Args:
        argv: Argument list to parse; ``sys.argv[1:]`` is used when ``None``.
            Previously this parameter was accepted but ignored.
    """
    Phykit.evolutionary_rate_per_site(sys.argv[1:] if argv is None else argv)
2700
+
2701
+
2702
def faidx(argv=None):
    """Forward command-line arguments to ``Phykit.faidx``.

    Args:
        argv: Argument list to parse; ``sys.argv[1:]`` is used when ``None``.
            Previously this parameter was accepted but ignored.
    """
    Phykit.faidx(sys.argv[1:] if argv is None else argv)
2704
+
2705
+
2706
def gc_content(argv=None):
    """Forward command-line arguments to ``Phykit.gc_content``.

    Args:
        argv: Argument list to parse; ``sys.argv[1:]`` is used when ``None``.
            Previously this parameter was accepted but ignored.
    """
    Phykit.gc_content(sys.argv[1:] if argv is None else argv)
2708
+
2709
+
2710
def pairwise_identity(argv=None):
    """Forward command-line arguments to ``Phykit.pairwise_identity``.

    Args:
        argv: Argument list to parse; ``sys.argv[1:]`` is used when ``None``.
            Previously this parameter was accepted but ignored.
    """
    Phykit.pairwise_identity(sys.argv[1:] if argv is None else argv)
2712
+
2713
+
2714
def parsimony_informative_sites(argv=None):
    """Forward command-line arguments to ``Phykit.parsimony_informative_sites``.

    Args:
        argv: Argument list to parse; ``sys.argv[1:]`` is used when ``None``.
            Previously this parameter was accepted but ignored.
    """
    Phykit.parsimony_informative_sites(sys.argv[1:] if argv is None else argv)
2716
+
2717
+
2718
def rcv(argv=None):
    """Forward command-line arguments to ``Phykit.rcv``.

    Args:
        argv: Argument list to parse; ``sys.argv[1:]`` is used when ``None``.
            Previously this parameter was accepted but ignored.
    """
    Phykit.rcv(sys.argv[1:] if argv is None else argv)
2720
+
2721
+
2722
def rcvt(argv=None):
    """Forward command-line arguments to ``Phykit.rcvt``.

    Args:
        argv: Argument list to parse; ``sys.argv[1:]`` is used when ``None``.
            Previously this parameter was accepted but ignored.
    """
    Phykit.rcvt(sys.argv[1:] if argv is None else argv)
2724
+
2725
+
2726
def rename_fasta_entries(argv=None):
    """Forward command-line arguments to ``Phykit.rename_fasta_entries``.

    Args:
        argv: Argument list to parse; ``sys.argv[1:]`` is used when ``None``.
            Previously this parameter was accepted but ignored.
    """
    Phykit.rename_fasta_entries(sys.argv[1:] if argv is None else argv)
2728
+
2729
+
2730
def sum_of_pairs_score(argv=None):
    """Forward command-line arguments to ``Phykit.sum_of_pairs_score``.

    Args:
        argv: Argument list to parse; ``sys.argv[1:]`` is used when ``None``.
            Previously this parameter was accepted but ignored.
    """
    Phykit.sum_of_pairs_score(sys.argv[1:] if argv is None else argv)
2732
+
2733
+
2734
def variable_sites(argv=None):
    """Forward command-line arguments to ``Phykit.variable_sites``.

    Args:
        argv: Argument list to parse; ``sys.argv[1:]`` is used when ``None``.
            Previously this parameter was accepted but ignored.
    """
    Phykit.variable_sites(sys.argv[1:] if argv is None else argv)
2736
+
2737
+
2738
+ # Tree-based functions
2739
def bipartition_support_stats(argv=None):
    """Forward command-line arguments to ``Phykit.bipartition_support_stats``.

    Args:
        argv: Argument list to parse; ``sys.argv[1:]`` is used when ``None``.
            Previously this parameter was accepted but ignored.
    """
    Phykit.bipartition_support_stats(sys.argv[1:] if argv is None else argv)
2741
+
2742
+
2743
def branch_length_multiplier(argv=None):
    """Forward command-line arguments to ``Phykit.branch_length_multiplier``.

    Args:
        argv: Argument list to parse; ``sys.argv[1:]`` is used when ``None``.
            Previously this parameter was accepted but ignored.
    """
    Phykit.branch_length_multiplier(sys.argv[1:] if argv is None else argv)
2745
+
2746
+
2747
def collapse_branches(argv=None):
    """Forward command-line arguments to ``Phykit.collapse_branches``.

    Args:
        argv: Argument list to parse; ``sys.argv[1:]`` is used when ``None``.
            Previously this parameter was accepted but ignored.
    """
    Phykit.collapse_branches(sys.argv[1:] if argv is None else argv)
2749
+
2750
+
2751
def covarying_evolutionary_rates(argv=None):
    """Forward command-line arguments to ``Phykit.covarying_evolutionary_rates``.

    Args:
        argv: Argument list to parse; ``sys.argv[1:]`` is used when ``None``.
            Previously this parameter was accepted but ignored.
    """
    Phykit.covarying_evolutionary_rates(sys.argv[1:] if argv is None else argv)
2753
+
2754
+
2755
def dvmc(argv=None):
    """Forward command-line arguments to ``Phykit.dvmc``.

    Args:
        argv: Argument list to parse; ``sys.argv[1:]`` is used when ``None``.
            Previously this parameter was accepted but ignored.
    """
    Phykit.dvmc(sys.argv[1:] if argv is None else argv)
2757
+
2758
+
2759
def evolutionary_rate(argv=None):
    """Forward command-line arguments to ``Phykit.evolutionary_rate``.

    Args:
        argv: Argument list to parse; ``sys.argv[1:]`` is used when ``None``.
            Previously this parameter was accepted but ignored.
    """
    Phykit.evolutionary_rate(sys.argv[1:] if argv is None else argv)
2761
+
2762
+
2763
def hidden_paralogy_check(argv=None):
    """Forward command-line arguments to ``Phykit.hidden_paralogy_check``.

    Args:
        argv: Argument list to parse; ``sys.argv[1:]`` is used when ``None``.
            Previously this parameter was accepted but ignored.
    """
    Phykit.hidden_paralogy_check(sys.argv[1:] if argv is None else argv)
2765
+
2766
+
2767
def internal_branch_stats(argv=None):
    """Forward command-line arguments to ``Phykit.internal_branch_stats``.

    Args:
        argv: Argument list to parse; ``sys.argv[1:]`` is used when ``None``.
            Previously this parameter was accepted but ignored.
    """
    Phykit.internal_branch_stats(sys.argv[1:] if argv is None else argv)
2769
+
2770
+
2771
def internode_labeler(argv=None):
    """Forward command-line arguments to ``Phykit.internode_labeler``.

    Args:
        argv: Argument list to parse; ``sys.argv[1:]`` is used when ``None``.
            Previously this parameter was accepted but ignored.
    """
    Phykit.internode_labeler(sys.argv[1:] if argv is None else argv)
2773
+
2774
+
2775
def last_common_ancestor_subtree(argv=None):
    """Forward command-line arguments to ``Phykit.last_common_ancestor_subtree``.

    Args:
        argv: Argument list to parse; ``sys.argv[1:]`` is used when ``None``.
            Previously this parameter was accepted but ignored.
    """
    Phykit.last_common_ancestor_subtree(sys.argv[1:] if argv is None else argv)
2777
+
2778
+
2779
def lb_score(argv=None):
    """Forward command-line arguments to ``Phykit.lb_score``.

    Args:
        argv: Argument list to parse; ``sys.argv[1:]`` is used when ``None``.
            Previously this parameter was accepted but ignored.
    """
    Phykit.lb_score(sys.argv[1:] if argv is None else argv)
2781
+
2782
+
2783
def monophyly_check(argv=None):
    """Forward command-line arguments to ``Phykit.monophyly_check``.

    Args:
        argv: Argument list to parse; ``sys.argv[1:]`` is used when ``None``.
            Previously this parameter was accepted but ignored.
    """
    Phykit.monophyly_check(sys.argv[1:] if argv is None else argv)
2785
+
2786
+
2787
def nearest_neighbor_interchange(argv=None):
    """Forward command-line arguments to ``Phykit.nearest_neighbor_interchange``.

    Args:
        argv: Argument list to parse; ``sys.argv[1:]`` is used when ``None``.
            Previously this parameter was accepted but ignored.
    """
    Phykit.nearest_neighbor_interchange(sys.argv[1:] if argv is None else argv)
2789
+
2790
+
2791
def patristic_distances(argv=None):
    """Forward command-line arguments to ``Phykit.patristic_distances``.

    Args:
        argv: Argument list to parse; ``sys.argv[1:]`` is used when ``None``.
            Previously this parameter was accepted but ignored.
    """
    Phykit.patristic_distances(sys.argv[1:] if argv is None else argv)
2793
+
2794
+
2795
def polytomy_test(argv=None):
    """Forward command-line arguments to ``Phykit.polytomy_test``.

    Args:
        argv: Argument list to parse; ``sys.argv[1:]`` is used when ``None``.
            Previously this parameter was accepted but ignored.
    """
    Phykit.polytomy_test(sys.argv[1:] if argv is None else argv)
2797
+
2798
+
2799
def print_tree(argv=None):
    """Forward command-line arguments to ``Phykit.print_tree``.

    Args:
        argv: Argument list to parse; ``sys.argv[1:]`` is used when ``None``.
            Previously this parameter was accepted but ignored.
    """
    Phykit.print_tree(sys.argv[1:] if argv is None else argv)
2801
+
2802
+
2803
def prune_tree(argv=None):
    """Forward command-line arguments to ``Phykit.prune_tree``.

    Args:
        argv: Argument list to parse; ``sys.argv[1:]`` is used when ``None``.
            Previously this parameter was accepted but ignored.
    """
    Phykit.prune_tree(sys.argv[1:] if argv is None else argv)
2805
+
2806
+
2807
def rename_tree_tips(argv=None):
    """Forward command-line arguments to ``Phykit.rename_tree_tips``.

    Args:
        argv: Argument list to parse; ``sys.argv[1:]`` is used when ``None``.
            Previously this parameter was accepted but ignored.
    """
    Phykit.rename_tree_tips(sys.argv[1:] if argv is None else argv)
2809
+
2810
+
2811
def rf_distance(argv=None):
    """Forward command-line arguments to ``Phykit.rf_distance``.

    Args:
        argv: Argument list to parse; ``sys.argv[1:]`` is used when ``None``.
            Previously this parameter was accepted but ignored.
    """
    Phykit.rf_distance(sys.argv[1:] if argv is None else argv)
2813
+
2814
+
2815
def root_tree(argv=None):
    """Forward command-line arguments to ``Phykit.root_tree``.

    Args:
        argv: Argument list to parse; ``sys.argv[1:]`` is used when ``None``.
            Previously this parameter was accepted but ignored.
    """
    Phykit.root_tree(sys.argv[1:] if argv is None else argv)
2817
+
2818
+
2819
def spurious_sequence(argv=None):
    """Forward command-line arguments to ``Phykit.spurious_sequence``.

    Args:
        argv: Argument list to parse; ``sys.argv[1:]`` is used when ``None``.
            Previously this parameter was accepted but ignored.
    """
    Phykit.spurious_sequence(sys.argv[1:] if argv is None else argv)
2821
+
2822
+
2823
def terminal_branch_stats(argv=None):
    """Forward command-line arguments to ``Phykit.terminal_branch_stats``.

    Args:
        argv: Argument list to parse; ``sys.argv[1:]`` is used when ``None``.
            Previously this parameter was accepted but ignored.
    """
    Phykit.terminal_branch_stats(sys.argv[1:] if argv is None else argv)
2825
+
2826
+
2827
def tip_labels(argv=None):
    """Forward command-line arguments to ``Phykit.tip_labels``.

    Args:
        argv: Argument list to parse; ``sys.argv[1:]`` is used when ``None``.
            Previously this parameter was accepted but ignored.
    """
    Phykit.tip_labels(sys.argv[1:] if argv is None else argv)
2829
+
2830
+
2831
def tip_to_tip_distance(argv=None):
    """Forward command-line arguments to ``Phykit.tip_to_tip_distance``.

    Args:
        argv: Argument list to parse; ``sys.argv[1:]`` is used when ``None``.
            Previously this parameter was accepted but ignored.
    """
    Phykit.tip_to_tip_distance(sys.argv[1:] if argv is None else argv)
2833
+
2834
+
2835
def tip_to_tip_node_distance(argv=None):
    """Forward command-line arguments to ``Phykit.tip_to_tip_node_distance``.

    Args:
        argv: Argument list to parse; ``sys.argv[1:]`` is used when ``None``.
            Previously this parameter was accepted but ignored.
    """
    Phykit.tip_to_tip_node_distance(sys.argv[1:] if argv is None else argv)
2837
+
2838
+
2839
def total_tree_length(argv=None):
    """Forward command-line arguments to ``Phykit.total_tree_length``.

    Args:
        argv: Argument list to parse; ``sys.argv[1:]`` is used when ``None``.
            Previously this parameter was accepted but ignored.
    """
    Phykit.total_tree_length(sys.argv[1:] if argv is None else argv)
2841
+
2842
+
2843
def treeness(argv=None):
    """Forward command-line arguments to ``Phykit.treeness``.

    Args:
        argv: Argument list to parse; ``sys.argv[1:]`` is used when ``None``.
            Previously this parameter was accepted but ignored.
    """
    Phykit.treeness(sys.argv[1:] if argv is None else argv)
2845
+
2846
+
2847
+ # Alignment- and tree-based functions
2848
def saturation(argv=None):
    """Forward command-line arguments to ``Phykit.saturation``.

    Args:
        argv: Argument list to parse; ``sys.argv[1:]`` is used when ``None``.
            Previously this parameter was accepted but ignored.
    """
    Phykit.saturation(sys.argv[1:] if argv is None else argv)
2850
+
2851
+
2852
def treeness_over_rcv(argv=None):
    """Forward command-line arguments to ``Phykit.treeness_over_rcv``.

    Args:
        argv: Argument list to parse; ``sys.argv[1:]`` is used when ``None``.
            Previously this parameter was accepted but ignored.
    """
    Phykit.treeness_over_rcv(sys.argv[1:] if argv is None else argv)
2854
+
2855
+
2856
+ # Helper functions
2857
def create_concatenation_matrix(argv=None):
    """Forward command-line arguments to ``Phykit.create_concatenation_matrix``.

    Args:
        argv: Argument list to parse; ``sys.argv[1:]`` is used when ``None``.
            Previously this parameter was accepted but ignored.
    """
    Phykit.create_concatenation_matrix(sys.argv[1:] if argv is None else argv)
2859
+
2860
+
2861
def thread_dna(argv=None):
    """Forward command-line arguments to ``Phykit.thread_dna``.

    Args:
        argv: Argument list to parse; ``sys.argv[1:]`` is used when ``None``.
            Previously this parameter was accepted but ignored.
    """
    Phykit.thread_dna(sys.argv[1:] if argv is None else argv)