treesak 1.53.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (131) hide show
  1. TreeSAK/ALE.py +63 -0
  2. TreeSAK/ALE1.py +268 -0
  3. TreeSAK/ALE2.py +168 -0
  4. TreeSAK/ALE2RTC.py +30 -0
  5. TreeSAK/ALE3.py +205 -0
  6. TreeSAK/ALE4.py +636 -0
  7. TreeSAK/ALE5.py +210 -0
  8. TreeSAK/ALE6.py +401 -0
  9. TreeSAK/ALE7.py +126 -0
  10. TreeSAK/ALE_backup.py +1081 -0
  11. TreeSAK/AssessCVG.py +128 -0
  12. TreeSAK/AssessMarker.py +306 -0
  13. TreeSAK/AssessMarkerDeltaLL.py +257 -0
  14. TreeSAK/AssessMarkerPA.py +317 -0
  15. TreeSAK/AssessPB.py +113 -0
  16. TreeSAK/BMGE.jar +0 -0
  17. TreeSAK/BMGE.py +49 -0
  18. TreeSAK/C60SR4.nex +127 -0
  19. TreeSAK/CompareMCMC.py +138 -0
  20. TreeSAK/ConcateMSA.py +111 -0
  21. TreeSAK/ConvertMSA.py +135 -0
  22. TreeSAK/Dir.rb +82 -0
  23. TreeSAK/ExtractMarkerSeq.py +263 -0
  24. TreeSAK/FastRoot.py +1175 -0
  25. TreeSAK/FastRoot_backup.py +1122 -0
  26. TreeSAK/FigTree.py +34 -0
  27. TreeSAK/GTDB_tree.py +76 -0
  28. TreeSAK/GeneTree.py +142 -0
  29. TreeSAK/KEGG_Luo17.py +807 -0
  30. TreeSAK/LcaToLeaves.py +66 -0
  31. TreeSAK/MarkerRef2Tree.py +616 -0
  32. TreeSAK/MarkerRef2Tree_backup.py +628 -0
  33. TreeSAK/MarkerSeq2Tree.py +299 -0
  34. TreeSAK/MarkerSeq2Tree_backup.py +259 -0
  35. TreeSAK/ModifyTopo.py +116 -0
  36. TreeSAK/Newick_tree_plotter.py +79 -0
  37. TreeSAK/OMA.py +170 -0
  38. TreeSAK/OMA2.py +212 -0
  39. TreeSAK/OneLineAln.py +50 -0
  40. TreeSAK/PB.py +155 -0
  41. TreeSAK/PMSF.py +115 -0
  42. TreeSAK/PhyloBiAssoc.R +84 -0
  43. TreeSAK/PhyloBiAssoc.py +167 -0
  44. TreeSAK/PlotMCMC.py +41 -0
  45. TreeSAK/PlotMcmcNode.py +152 -0
  46. TreeSAK/PlotMcmcNode_old.py +252 -0
  47. TreeSAK/RootTree.py +101 -0
  48. TreeSAK/RootTreeGTDB.py +371 -0
  49. TreeSAK/RootTreeGTDB214.py +288 -0
  50. TreeSAK/RootTreeGTDB220.py +300 -0
  51. TreeSAK/SequentialDating.py +16 -0
  52. TreeSAK/SingleAleHGT.py +157 -0
  53. TreeSAK/SingleLinePhy.py +50 -0
  54. TreeSAK/SliceMSA.py +142 -0
  55. TreeSAK/SplitScore.py +21 -0
  56. TreeSAK/SplitScore1.py +177 -0
  57. TreeSAK/SplitScore1OMA.py +148 -0
  58. TreeSAK/SplitScore2.py +608 -0
  59. TreeSAK/TaxaCountStats.R +256 -0
  60. TreeSAK/TaxonTree.py +47 -0
  61. TreeSAK/TreeSAK_config.py +32 -0
  62. TreeSAK/VERSION +164 -0
  63. TreeSAK/VisHPD95.R +45 -0
  64. TreeSAK/VisHPD95.py +200 -0
  65. TreeSAK/__init__.py +0 -0
  66. TreeSAK/ale_parser.py +74 -0
  67. TreeSAK/ale_splitter.py +63 -0
  68. TreeSAK/alignment_pruner.pl +1471 -0
  69. TreeSAK/assessOG.py +45 -0
  70. TreeSAK/batch_itol.py +171 -0
  71. TreeSAK/catfasta2phy.py +140 -0
  72. TreeSAK/cogTree.py +185 -0
  73. TreeSAK/compare_trees.R +30 -0
  74. TreeSAK/compare_trees.py +255 -0
  75. TreeSAK/dating.py +264 -0
  76. TreeSAK/dating_ss.py +361 -0
  77. TreeSAK/deltall.py +82 -0
  78. TreeSAK/do_rrtc.rb +464 -0
  79. TreeSAK/fa2phy.py +42 -0
  80. TreeSAK/filter_rename_ar53.py +118 -0
  81. TreeSAK/format_leaf_name.py +70 -0
  82. TreeSAK/gap_stats.py +38 -0
  83. TreeSAK/get_SCG_tree.py +742 -0
  84. TreeSAK/get_arCOG_seq.py +97 -0
  85. TreeSAK/global_functions.py +222 -0
  86. TreeSAK/gnm_leaves.py +43 -0
  87. TreeSAK/iTOL.py +791 -0
  88. TreeSAK/iTOL_gene_tree.py +80 -0
  89. TreeSAK/itol_msa_stats.py +56 -0
  90. TreeSAK/keep_highest_rrtc.py +37 -0
  91. TreeSAK/koTree.py +194 -0
  92. TreeSAK/label_gene_tree_by_gnm.py +34 -0
  93. TreeSAK/label_tree.R +75 -0
  94. TreeSAK/label_tree.py +121 -0
  95. TreeSAK/mad.py +708 -0
  96. TreeSAK/mcmc2tree.py +58 -0
  97. TreeSAK/mcmcTC copy.py +92 -0
  98. TreeSAK/mcmcTC.py +104 -0
  99. TreeSAK/mcmctree_vs_reltime.R +44 -0
  100. TreeSAK/mcmctree_vs_reltime.py +252 -0
  101. TreeSAK/merge_pdf.py +32 -0
  102. TreeSAK/pRTC.py +56 -0
  103. TreeSAK/parse_mcmctree.py +198 -0
  104. TreeSAK/parse_reltime.py +141 -0
  105. TreeSAK/phy2fa.py +37 -0
  106. TreeSAK/plot_distruibution_th.py +165 -0
  107. TreeSAK/prep_mcmctree_ctl.py +92 -0
  108. TreeSAK/print_leaves.py +32 -0
  109. TreeSAK/pruneMSA.py +63 -0
  110. TreeSAK/recode.py +73 -0
  111. TreeSAK/remove_bias.R +112 -0
  112. TreeSAK/rename_leaves.py +78 -0
  113. TreeSAK/replace_clade.py +55 -0
  114. TreeSAK/root_with_out_group.py +84 -0
  115. TreeSAK/run_TaxaCountStats_R_s1.py +455 -0
  116. TreeSAK/subsample_drep_gnms.py +74 -0
  117. TreeSAK/subset.py +69 -0
  118. TreeSAK/subset_tree_stupid_old_way.py +193 -0
  119. TreeSAK/supertree.py +330 -0
  120. TreeSAK/tmp_1.py +19 -0
  121. TreeSAK/tmp_2.py +19 -0
  122. TreeSAK/tmp_3.py +120 -0
  123. TreeSAK/tmp_4.py +43 -0
  124. TreeSAK/tmp_5.py +12 -0
  125. TreeSAK/weighted_rand.rb +23 -0
  126. treesak-1.53.3.data/scripts/TreeSAK +955 -0
  127. treesak-1.53.3.dist-info/LICENSE +674 -0
  128. treesak-1.53.3.dist-info/METADATA +27 -0
  129. treesak-1.53.3.dist-info/RECORD +131 -0
  130. treesak-1.53.3.dist-info/WHEEL +5 -0
  131. treesak-1.53.3.dist-info/top_level.txt +1 -0
TreeSAK/AssessPB.py ADDED
@@ -0,0 +1,113 @@
1
+ import os
2
+ import argparse
3
+
4
+
5
+ AssessPB_usage = '''
6
+ ====================== AssessPB example commands ======================
7
+
8
+ # Dependency: bpcomp and tracecomp (from PhyloBayes-MPI)
9
+ export OMPI_MCA_btl=^openib
10
+ TreeSAK AssessPB -c all_chains.txt
11
+
12
+ # This is a wrapper for (take 4 chains as an example):
13
+ bpcomp -x 1000 10 c1 c2 c3 c4
14
+ tracecomp -x 1000 c1 c2 c3 c4
15
+
16
+ # format of the file provided to -c: directory_path/output_prefix
17
+ GTDB_SCG_best50p0_pb_chain1/GTDB_SCG_best50p0_pb_chain1
18
+ GTDB_SCG_best50p0_pb_chain2/GTDB_SCG_best50p0_pb_chain2
19
+ GTDB_SCG_best50p0_pb_chain3/GTDB_SCG_best50p0_pb_chain3
20
+ GTDB_SCG_best50p0_pb_chain4/GTDB_SCG_best50p0_pb_chain4
21
+
22
+ =======================================================================
23
+ '''
24
+
25
+
26
def compare2chains(chain_1, chain_2, chain_3, chain_4, burn_in, sample_interval, op_dir, cmd_txt):
    """Compare two to four PhyloBayes chains with bpcomp and tracecomp.

    Both commands are appended to cmd_txt and then executed through the
    shell; a short guideline for reading each report is printed afterwards.

    Args:
        chain_1: first chain (directory_path/output_prefix).
        chain_2: second chain.
        chain_3: optional third chain, or None.
        chain_4: optional fourth chain, or None; it is only used when
            chain_3 is provided as well.
        burn_in: value passed to -x as burn-in for both commands.
        sample_interval: value passed to bpcomp right after the burn-in.
        op_dir: directory used for the output prefixes op_dir/bpcomp and
            op_dir/tracecomp.
        cmd_txt: text file to which the two commands are appended.
    """
    import shlex

    # bpcomp:    -x <burnin> [<every> <until>]. default burnin = 10 percent of the chain
    # tracecomp: -x <burnin> [<every> <until>]. default burnin = 20 percent of the chain

    # collect the chains to compare; a fourth chain without a third is ignored
    chains = [chain_1, chain_2]
    if chain_3 is not None:
        chains.append(chain_3)
        if chain_4 is not None:
            chains.append(chain_4)

    # quote every path so that spaces or shell metacharacters cannot break
    # (or hijack) the command line; plain paths are left unchanged by quote()
    chain_args = ' '.join(shlex.quote(str(each_chain)) for each_chain in chains)
    bpcomp_prefix = shlex.quote('%s/bpcomp' % op_dir)
    tracecomp_prefix = shlex.quote('%s/tracecomp' % op_dir)

    bpcomp_cmd = 'bpcomp -o %s -x %s %s %s' % (bpcomp_prefix, burn_in, sample_interval, chain_args)
    tracecomp_cmd = 'tracecomp -o %s -x %s %s' % (tracecomp_prefix, burn_in, chain_args)

    # write out commands (the handle is closed even if a write fails)
    with open(cmd_txt, 'a') as cmd_txt_handle:
        cmd_txt_handle.write(bpcomp_cmd + '\n')
        cmd_txt_handle.write(tracecomp_cmd + '\n')

    # execute commands
    print('\n================================ bpcomp ================================')
    os.system(bpcomp_cmd)
    print('\nGuideline')
    print('maxdiff < 0.1: good')
    print('maxdiff < 0.3: acceptable, gives a good qualitative picture of the posterior consensus.')
    print('0.3 < maxdiff < 1: the sample is not yet sufficiently large and have not converged, but on right track.')
    print('If maxdiff = 1 even after 10,000 points: at least one run stuck in a local maximum.')
    print('\n============================== tracecomp ==============================\n')
    os.system(tracecomp_cmd)
    print('\nGuideline')
    print('good: rel diff < 0.1 and minimum effective size > 300')
    print('acceptable: rel diff < 0.3 and minimum effective size > 50')
    print('\n========================================================================\n')
62
+
63
+
64
def AssessPB(args):
    """Assess convergence of two to four PhyloBayes chains.

    Args:
        args: dict with the keys
            'c'  - text file listing one chain per line
                   (directory_path/output_prefix),
            'bi' - burn-in forwarded to bpcomp/tracecomp,
            'si' - sample interval forwarded to bpcomp,
            'o'  - output directory,
            'f'  - True to overwrite an existing output directory.

    The program exits (after printing a message) when the chain file is
    missing, lists fewer than two or more than four chains, or when the
    output directory exists and overwriting was not requested.
    """
    import shutil

    chain_file = args['c']
    burn_in = args['bi']
    sample_interval = args['si']
    op_dir = args['o']
    force_overwrite = args['f']

    cmd_txt = '%s/cmds.txt' % op_dir

    # check if chain_file exists; it may be None because -c is optional
    # on the command line
    if (chain_file is None) or (os.path.isfile(chain_file) is False):
        print('%s not found, program exited!' % chain_file)
        exit()

    # read the chains, ignoring blank lines (e.g. a trailing newline)
    with open(chain_file) as chain_file_handle:
        chain_list = [each_chain.strip() for each_chain in chain_file_handle if each_chain.strip()]

    if len(chain_list) < 2:
        print('Provided %s chains, need at least two chains, program exited!' % len(chain_list))
        exit()

    # compare2chains accepts at most four chains; fail loudly instead of
    # silently producing an empty output directory
    if len(chain_list) > 4:
        print('Provided %s chains, at most four chains are supported, program exited!' % len(chain_list))
        exit()

    # create output dir
    if os.path.isdir(op_dir) is True:
        if force_overwrite is True:
            shutil.rmtree(op_dir)
        else:
            print('output folder already exist, program exited!')
            exit()
    os.mkdir(op_dir)

    # compare2chains always takes four chain slots: pad the missing third
    # and/or fourth chain with None
    chain_3, chain_4 = (chain_list[2:] + [None, None])[:2]
    compare2chains(chain_list[0], chain_list[1], chain_3, chain_4, burn_in, sample_interval, op_dir, cmd_txt)
102
+
103
+
104
if __name__ == '__main__':

    # command-line entry point used when this module is run directly
    AssessPB_parser = argparse.ArgumentParser()
    # -c is mandatory: without a chain file there is nothing to compare,
    # so let argparse report the missing option up front
    AssessPB_parser.add_argument('-c',  required=True,                       help='a txt file contain all the chains')
    AssessPB_parser.add_argument('-bi', required=False, default=1000,        help='burn-in, default: 1000')
    AssessPB_parser.add_argument('-si', required=False, default=10,          help='sample interval, default: 10')
    AssessPB_parser.add_argument('-o',  required=True,  default=None,        help='output directory')
    AssessPB_parser.add_argument('-f',  required=False, action="store_true", help='force overwrite')
    args = vars(AssessPB_parser.parse_args())
    AssessPB(args)
TreeSAK/BMGE.jar ADDED
Binary file
TreeSAK/BMGE.py ADDED
@@ -0,0 +1,49 @@
1
+ import os
2
+ import argparse
3
+
4
+
5
+ BMGE_usage = '''
6
+ ======================= BMGE example commands =======================
7
+
8
+ # require: java
9
+
10
+ TreeSAK BMGE -p demo -i input.aln -m BLOSUM30 -esc 0.55
11
+
12
+ # Settings for calculating split score (Nina Dombrowski):
13
+ # -t AA -m BLOSUM30 -h 0.55
14
+
15
+ =====================================================================
16
+ '''
17
+
18
+
19
def BMGE(args):
    """Trim an amino-acid MSA with the BMGE.jar located next to this module.

    Args:
        args: dict with the keys
            'p'   - output prefix; the trimmed alignment is written to
                    <prefix>.BMGE.fasta,
            'i'   - input MSA,
            'm'   - trim model passed to BMGE via -m,
            'esc' - entropy score cutoff passed to BMGE via -h.
    """
    import shlex

    op_prefix = args['p']
    msa_in = args['i']
    trim_model = args['m']
    entropy_score_cutoff = args['esc']

    # define file name
    msa_out_fasta = '%s.BMGE.fasta' % op_prefix

    # specify path to BMGE.jar: it sits in the same folder as this file
    # (os.path handles the separator, so this also works off POSIX)
    current_file_path = os.path.dirname(os.path.realpath(__file__))
    pwd_bmge_jar = os.path.join(current_file_path, 'BMGE.jar')

    # run BMGE; paths are quoted so that spaces in the install location or
    # in the file names do not split the command line
    bmge_cmd = 'java -jar %s -i %s -m %s -t AA -h %s -of %s' % (shlex.quote(pwd_bmge_jar),
                                                                 shlex.quote(str(msa_in)),
                                                                 trim_model,
                                                                 entropy_score_cutoff,
                                                                 shlex.quote(msa_out_fasta))
    print(bmge_cmd)
    os.system(bmge_cmd)

    print('Done!')
39
+
40
+
41
if __name__ == '__main__':

    # command-line entry point used when this module is run directly;
    # options are declared in a table and registered in the listed order
    cli = argparse.ArgumentParser()
    option_table = (
        ('-p',   dict(required=True, help='output prefix')),
        ('-i',   dict(required=True, help='input MSA')),
        ('-m',   dict(required=False, default='BLOSUM30', help='trim model, default: BLOSUM30')),
        ('-esc', dict(required=False, default='0.55', help='entropy score cutoff, default: 0.55')),
    )
    for flag, settings in option_table:
        cli.add_argument(flag, **settings)

    BMGE(vars(cli.parse_args()))
TreeSAK/C60SR4.nex ADDED
@@ -0,0 +1,127 @@
1
+ #nexus
2
+
3
+ begin models;
4
+
5
+ frequency C60NT1= 0.6671684132 0.0296031604 0.263873245 0.0393551815;
6
+
7
+ frequency C60NT2= 0.1276338907 0.0209812196 0.7927062299 0.0586786597;
8
+
9
+ frequency C60NT3= 0.0163962994 0.0051413956 0.0030074531 0.975454852;
10
+
11
+ frequency C60NT4= 0.43894983 0.1822394649 0.0174009227 0.3614097825;
12
+
13
+ frequency C60NT5= 0.0979385646 0.2096596198 0.0222763602 0.6701254554;
14
+
15
+ frequency C60NT6= 0.245811863 0.1332129821 0.5371646791 0.083810476;
16
+
17
+ frequency C60NT7= 0.0226930145 0.3959457214 0.0091927563 0.5721685078;
18
+
19
+ frequency C60NT8= 0.5015838424 0.4308125086 0.0079650295 0.0596386196;
20
+
21
+ frequency C60NT9= 0.2221963854 0.3881879363 0.1020538807 0.2875617976;
22
+
23
+ frequency C60NT10= 0.2637737779 0.1509966633 0.2145338888 0.3706956701;
24
+
25
+ frequency C60NT11= 0.192147964 0.0110024977 0.7823009601 0.0145485781;
26
+
27
+ frequency C60NT12= 0.1484234234 0.0338349152 0.4455828719 0.3721587895;
28
+
29
+ frequency C60NT13= 0.7701156248 0.0361294714 0.0511613978 0.142593506;
30
+
31
+ frequency C60NT14= 0.4210820539 0.0363523971 0.0088100421 0.5337555069;
32
+
33
+ frequency C60NT15= 0.4443921233 0.0693293801 0.4521540079 0.0341244887;
34
+
35
+ frequency C60NT16= 0.3775042704 0.0269821801 0.580257087 0.0152564625;
36
+
37
+ frequency C60NT17= 0.4767453479 0.034379142 0.1231147917 0.3657607185;
38
+
39
+ frequency C60NT18= 0.0268679326 0.0169276748 0.0098917288 0.9463126639;
40
+
41
+ frequency C60NT19= 0.3018006878 0.0294470475 0.6420354145 0.0267168502;
42
+
43
+ frequency C60NT20= 0.3419571734 0.0262071154 0.6184824034 0.0133533078;
44
+
45
+ frequency C60NT21= 0.0096247416 0.9088039403 0.0103815544 0.0711897638;
46
+
47
+ frequency C60NT22= 0.1999965426 0.0297517307 0.2954127651 0.4748389616;
48
+
49
+ frequency C60NT23= 0.1127907248 0.5495098576 0.0691492727 0.268550145;
50
+
51
+ frequency C60NT24= 0.0776920852 0.0329607437 0.0190469923 0.8703001788;
52
+
53
+ frequency C60NT25= 0.8938602529 0.0175688123 0.0516233175 0.0369476173;
54
+
55
+ frequency C60NT26= 0.27898143 0.2724136798 0.2567784573 0.1918264331;
56
+
57
+ frequency C60NT27= 0.498929145 0.0363065742 0.3503605267 0.1144037542;
58
+
59
+ frequency C60NT28= 0.2681948281 0.06920056 0.5207582638 0.1418463481;
60
+
61
+ frequency C60NT29= 0.2045159736 0.0339698809 0.0436529982 0.7178611476;
62
+
63
+ frequency C60NT30= 0.2691226215 0.0356845278 0.162767839 0.5324250117;
64
+
65
+ frequency C60NT31= 0.7694198604 0.0996406504 0.090926784 0.040012705;
66
+
67
+ frequency C60NT32= 0.015577726 0.0724312301 0.0067063212 0.9052847226;
68
+
69
+ frequency C60NT33= 0.0832030401 0.1132575475 0.2862435644 0.5172958479;
70
+
71
+ frequency C60NT34= 0.0509990348 0.0082496135 0.0061309345 0.9346204172;
72
+
73
+ frequency C60NT35= 0.7537265064 0.0213859704 0.1711138164 0.0537737068;
74
+
75
+ frequency C60NT36= 0.909000965 0.0289286367 0.0266394188 0.0354309794;
76
+
77
+ frequency C60NT37= 0.9546241163 0.008441069 0.0316827792 0.0052520355;
78
+
79
+ frequency C60NT38= 0.5192884454 0.0572862542 0.3797587173 0.0436665831;
80
+
81
+ frequency C60NT39= 0.1341646585 0.5925517098 0.1874442682 0.0858393636;
82
+
83
+ frequency C60NT40= 0.0448326475 0.0307205308 0.0250541698 0.8993926521;
84
+
85
+ frequency C60NT41= 0.5664706531 0.0393032078 0.3812930448 0.0129330943;
86
+
87
+ frequency C60NT42= 0.8370819783 0.0190256158 0.1290970633 0.0147953427;
88
+
89
+ frequency C60NT43= 0.1120464953 0.0574737723 0.799428803 0.0310509292;
90
+
91
+ frequency C60NT44= 0.5567531248 0.030047458 0.113077382 0.3001220351;
92
+
93
+ frequency C60NT45= 0.7530681463 0.0296715581 0.1919452504 0.0253150453;
94
+
95
+ frequency C60NT46= 0.0813111668 0.4663342365 0.3315560742 0.1207985224;
96
+
97
+ frequency C60NT47= 0.3493942031 0.0181602391 0.5911205418 0.0413250161;
98
+
99
+ frequency C60NT48= 0.7273938304 0.0224960247 0.2440348688 0.0060752763;
100
+
101
+ frequency C60NT49= 0.144800238 0.0781260939 0.6995327663 0.0775409016;
102
+
103
+ frequency C60NT50= 0.5558261942 0.0217135959 0.355400139 0.0670600707;
104
+
105
+ frequency C60NT51= 0.0142277933 0.577157862 0.0046886186 0.4039257261;
106
+
107
+ frequency C60NT52= 0.566952106 0.1662596481 0.0829557817 0.1838324641;
108
+
109
+ frequency C60NT53= 0.3710307018 0.2568582793 0.3304267825 0.0416842365;
110
+
111
+ frequency C60NT54= 0.4673591892 0.0443703034 0.0644933219 0.4237771855;
112
+
113
+ frequency C60NT55= 0.2935718655 0.0371033744 0.5025606284 0.1667641317;
114
+
115
+ frequency C60NT56= 0.1445054403 0.0175105032 0.8019947085 0.035989348;
116
+
117
+ frequency C60NT57= 0.5953413269 0.0543418469 0.3379976485 0.0123191777;
118
+
119
+ frequency C60NT58= 0.5011346064 0.0186312309 0.4456054968 0.034628666;
120
+
121
+ frequency C60NT59= 0.8862685333 0.0262544484 0.0131639188 0.0743130995;
122
+
123
+ frequency C60NT60= 0.0386456469 0.0058035261 0.0121187396 0.9434320874;
124
+
125
+ model C60SR4=GTR+G+FMIX{C60NT1,C60NT2,C60NT3,C60NT4,C60NT5,C60NT6,C60NT7,C60NT8,C60NT9,C60NT10,C60NT11,C60NT12,C60NT13,C60NT14,C60NT15,C60NT16,C60NT17,C60NT18,C60NT19,C60NT20,C60NT21,C60NT22,C60NT23,C60NT24,C60NT25,C60NT26,C60NT27,C60NT28,C60NT29,C60NT30,C60NT31,C60NT32,C60NT33,C60NT34,C60NT35,C60NT36,C60NT37,C60NT38,C60NT39,C60NT40,C60NT41,C60NT42,C60NT43,C60NT44,C60NT45,C60NT46,C60NT47,C60NT48,C60NT49,C60NT50,C60NT51,C60NT52,C60NT53,C60NT54,C60NT55,C60NT56,C60NT57,C60NT58,C60NT59,C60NT60}+F;
126
+
127
+ end;
TreeSAK/CompareMCMC.py ADDED
@@ -0,0 +1,138 @@
1
+ import os
2
+ import argparse
3
+ import arviz as az
4
+ import pandas as pd
5
+ import matplotlib as mpl
6
+ mpl.use('Agg')
7
+ import matplotlib.pyplot as plt
8
+ from matplotlib.pyplot import figure
9
+
10
+
11
+ CompareMCMC_usage = '''
12
+ ====================================== CompareMCMC example commands ======================================
13
+
14
+ TreeSAK CompareMCMC -mx IR_mcmc.txt -my AR_mcmc.txt -lx IR -ly AR -o convergence_plot.png -max 40 -fs 12
15
+
16
+ cd /Users/songweizhi/Desktop
17
+ TreeSAK CompareMCMC -mx /Users/songweizhi/Desktop/Sponge_r220/6_dating/MCMCTree/dating_outputs/topo2p10_clock3_nsample250000_run1_mcmc.txt -my /Users/songweizhi/Desktop/Sponge_r220/6_dating/MCMCTree/dating_outputs/topo2p10_clock3_nsample250000_run2_mcmc.txt -lx IR -ly AR -o convergence_plot.png -max 40 -fs 12
18
+
19
+ ==========================================================================================================
20
+ '''
21
+
22
+
23
def sep_path_basename_ext(file_in):
    """Split a file path into (directory, base name, extension).

    The directory is reported as '.' when file_in has no directory part;
    the extension keeps its leading dot and is '' when there is none.
    """
    folder = os.path.dirname(file_in) or '.'
    stem, suffix = os.path.splitext(os.path.basename(file_in))
    return folder, stem, suffix
29
+
30
+
31
def CompareMCMC(args):
    """Plot per-column means and 95% HDIs of two mcmc.txt files against each other.

    Every column of the x file except 'mu', 'sigma2' and 'lnL' becomes one
    point: its mean in the x file versus the mean of the same column in the
    y file, with error bars spanning the 95% highest-density intervals.

    Args:
        args: dict with the keys
            'mx', 'my' - tab-separated mcmc.txt files for the x and y axis
                         (first column is used as index),
            'lx', 'ly' - axis labels; the file base name is used when None,
            'o'        - output plot file,
            'max'      - upper limit of both axes, or None for automatic,
            'fs'       - font size of tick and axis labels.

    Raises:
        KeyError: if a plotted column of the x file is missing in the y file.
    """
    mcmc_txt_x = args['mx']
    mcmc_txt_y = args['my']
    label_x = args['lx']
    label_y = args['ly']
    pwd_figure = args['o']
    max_axis_value = args['max']
    label_fs = args['fs']

    # default axis labels are the input file base names
    if label_x is None:
        label_x = sep_path_basename_ext(mcmc_txt_x)[1]
    if label_y is None:
        label_y = sep_path_basename_ext(mcmc_txt_y)[1]

    # read in dataframe
    df_x = pd.read_table(mcmc_txt_x, index_col=0)
    df_y = pd.read_table(mcmc_txt_y, index_col=0)

    # get Mean value for each column
    # (.items() replaces .iteritems(), which was removed in pandas 2.0)
    df_x_col_to_mean_dict = dict(df_x.mean().items())
    df_y_col_to_mean_dict = dict(df_y.mean().items())

    num_list_x = []
    num_list_y = []
    x_err_l = []
    x_err_r = []
    y_err_l = []
    y_err_u = []
    max_value = 0
    min_value = 100000000000000
    for col_name, col in df_x.items():

        # these columns are not plotted
        if col_name in ['mu', 'sigma2', 'lnL']:
            continue

        x_value = df_x_col_to_mean_dict[col_name]
        y_value = df_y_col_to_mean_dict[col_name]

        # get CI95, only for the columns that are actually plotted
        x_range = az.hdi(col.values, hdi_prob=0.95)
        y_range = az.hdi(df_y[col_name].values, hdi_prob=0.95)

        num_list_x.append(x_value)
        num_list_y.append(y_value)

        # errorbar() expects distances from the point, not interval bounds
        x_err_l.append(abs(x_value - x_range[0]))
        x_err_r.append(abs(x_range[1] - x_value))
        y_err_l.append(abs(y_value - y_range[0]))
        y_err_u.append(abs(y_range[1] - y_value))

        # track the overall value range, used for the dashed diagonal
        max_value = max(max_value, x_value, y_value, x_range[0], x_range[1], y_range[0], y_range[1])
        min_value = min(min_value, x_value, y_value, x_range[0], x_range[1], y_range[0], y_range[1])

    figure(figsize=(6, 6), dpi=300)
    plt.plot([min_value, max_value], [min_value, max_value], color='black', linestyle='dashed', linewidth=1, alpha=0.5)
    plt.scatter(num_list_x, num_list_y, s=0)
    plt.errorbar(num_list_x, num_list_y, xerr=[x_err_l, x_err_r], yerr=[y_err_l, y_err_u],
                 ls='none', ecolor='skyblue', elinewidth=1, alpha=0.5)

    if max_axis_value is not None:
        plt.xlim([0, max_axis_value])
        plt.ylim([0, max_axis_value])

    # Set the font size of xticks and yticks
    plt.xticks(fontsize=label_fs)
    plt.yticks(fontsize=label_fs)
    plt.xlabel(label_x, fontsize=label_fs)
    plt.ylabel(label_y, fontsize=label_fs)

    # write out
    plt.tight_layout()
    plt.savefig(pwd_figure)
    plt.close()

    print('Plot exported to %s, done!' % pwd_figure)
124
+
125
+
126
if __name__ == '__main__':

    # command-line entry point used when this module is run directly;
    # options are declared in a table and registered in the listed order
    cli = argparse.ArgumentParser()
    option_table = (
        ('-mx',  dict(required=True, help='mcmc.txt for x axis')),
        ('-my',  dict(required=True, help='mcmc.txt for y axis')),
        ('-lx',  dict(required=False, default=None, help='label for x axis')),
        ('-ly',  dict(required=False, default=None, help='label for y axis')),
        ('-max', dict(required=False, default=None, type=int, help='maximum axis value')),
        ('-fs',  dict(required=False, default=16, type=int, help='label font size, default: 16')),
        ('-o',   dict(required=True, help='output plot')),
    )
    for flag, settings in option_table:
        cli.add_argument(flag, **settings)

    CompareMCMC(vars(cli.parse_args()))
TreeSAK/ConcateMSA.py ADDED
@@ -0,0 +1,111 @@
1
+ import os
2
+ import glob
3
+ import argparse
4
+ from Bio import SeqIO
5
+ from Bio import AlignIO
6
+
7
+
8
+ ConcateMSA_usage = '''
9
+ ================= ConcateMSA example commands =================
10
+
11
+ TreeSAK ConcateMSA -i aln -x aln -p concatenated -gene2gnm
12
+
13
+ # output file include:
14
+ concatenated.fasta
15
+ concatenated.phylip
16
+ concatenated.partition.txt
17
+
18
+ ===============================================================
19
+ '''
20
+
21
+
22
def ConcateMSA(args):
    """Concatenate all fasta MSAs of a folder into one alignment.

    Writes <prefix>.fasta, <prefix>.phylip (relaxed phylip) and
    <prefix>.partition.txt. MSAs are concatenated in sorted file-name order;
    a genome that is absent from an MSA is padded with gaps of that MSA's
    length.

    Args:
        args: dict with the keys
            'i'        - folder holding the input MSAs (fasta format),
            'x'        - extension of the MSA files (without the dot),
            'p'        - output prefix,
            'gene2gnm' - True to turn gene ids into genome ids by dropping
                         everything from the last '_' of each sequence id.

    The program exits (after printing a message) when no input file is
    found or when an MSA does not have exactly one sequence length.
    """
    msa_dir = args['i']
    msa_ext = args['x']
    op_prefix = args['p']
    gene2gnm = args['gene2gnm']

    concatenated_msa_phy = '%s.phylip' % op_prefix
    concatenated_msa_fasta = '%s.fasta' % op_prefix
    partition_file = '%s.partition.txt' % op_prefix

    msa_file_re = '%s/*.%s' % (msa_dir, msa_ext)
    msa_file_list_sorted = sorted(os.path.basename(file_name) for file_name in glob.glob(msa_file_re))

    # nothing to concatenate: stop before any output file is created
    if len(msa_file_list_sorted) == 0:
        print('No file with extension %s found in %s, program exited!' % (msa_ext, msa_dir))
        exit()

    def get_gnm_id(seq_id):
        # gene id -> genome id: keep everything before the last '_'
        if gene2gnm is True:
            return '_'.join(seq_id.split('_')[:-1])
        return seq_id

    # first pass: collect the ids of all genomes found in any MSA
    complete_gnm_set = set()
    for each_msa_file in msa_file_list_sorted:
        pwd_msa = '%s/%s' % (msa_dir, each_msa_file)
        for each_seq in SeqIO.parse(pwd_msa, 'fasta'):
            complete_gnm_set.add(get_gnm_id(each_seq.id))

    complete_gnm_list_sorted = sorted(complete_gnm_set)

    # second pass: collect the per-MSA pieces of every genome; the pieces are
    # joined once at write time instead of growing strings with +=
    gnm_to_seq_pieces_dict = {each_gnm: [] for each_gnm in complete_gnm_list_sorted}
    msa_len_list = []  # alignment lengths, parallel to msa_file_list_sorted
    for each_msa_file in msa_file_list_sorted:

        # read in msa
        current_msa_len = 0
        current_msa_len_set = set()
        pwd_current_msa = '%s/%s' % (msa_dir, each_msa_file)
        current_msa_seq_dict = dict()
        for each_seq in SeqIO.parse(pwd_current_msa, 'fasta'):
            current_msa_seq_dict[get_gnm_id(each_seq.id)] = str(each_seq.seq)
            current_msa_len = len(each_seq.seq)
            current_msa_len_set.add(current_msa_len)

        if len(current_msa_len_set) != 1:
            print('Sequences with different length were found in %s, program exited!' % each_msa_file)
            exit()

        # lengths are tracked per file (not per derived id), so two files
        # can never overwrite each other's length
        msa_len_list.append(current_msa_len)

        # add sequence to concatenated msa, padding absent genomes with gaps
        gap_only_seq = current_msa_len * '-'
        for each_gnm in complete_gnm_list_sorted:
            gnm_to_seq_pieces_dict[each_gnm].append(current_msa_seq_dict.get(each_gnm, gap_only_seq))

    # write out concatenated msa
    with open(concatenated_msa_fasta, 'w') as concatenated_msa_handle:
        for each_gnm in complete_gnm_list_sorted:
            concatenated_msa_handle.write('>%s\n' % each_gnm)
            concatenated_msa_handle.write('%s\n' % ''.join(gnm_to_seq_pieces_dict[each_gnm]))

    # write out partition file (1-based, inclusive coordinates)
    end_pos = 0
    with open(partition_file, 'w') as partition_file_handle:
        for each_m, current_m_len in zip(msa_file_list_sorted, msa_len_list):
            partition_file_handle.write('%s = %s-%s\n' % (each_m, (end_pos + 1), (end_pos + current_m_len)))
            end_pos += current_m_len

    # convert msa in fasta to phy
    AlignIO.convert(concatenated_msa_fasta, 'fasta', concatenated_msa_phy, 'phylip-relaxed')
98
+
99
+
100
+
101
if __name__ == '__main__':

    # command-line entry point used when this module is run directly;
    # options are declared in a table and registered in the listed order
    cli = argparse.ArgumentParser()
    option_table = (
        ('-i',        dict(required=True, help='input MSA folder')),
        ('-x',        dict(required=False, default='aln', help='input file extension')),
        ('-p',        dict(required=True, help='output prefix')),
        ('-gene2gnm', dict(required=False, action="store_true", help='gene id to gnm id, split sequence id before the last _')),
    )
    for flag, settings in option_table:
        cli.add_argument(flag, **settings)

    ConcateMSA(vars(cli.parse_args()))
TreeSAK/ConvertMSA.py ADDED
@@ -0,0 +1,135 @@
1
+ import os
2
+ import glob
3
+ import argparse
4
+ from Bio import SeqIO
5
+ from Bio import AlignIO
6
+
7
+
8
+ ConvertMSA_usage = '''
9
+ ================================= ConvertMSA example commands =================================
10
+
11
+ # phylip to fasta
12
+ TreeSAK ConvertMSA -i concatenated.phy -fi phylip-relaxed -o concatenated.fasta -fo fasta
13
+ TreeSAK ConvertMSA -i phy_files -fi phylip-relaxed -xi phy -o MSA_in_fasta -fo fasta -xo fa
14
+
15
+ # examples of alignment format (https://biopython.org/wiki/AlignIO):
16
+ fasta, phylip, phylip-relaxed, phylip-sequential, clustal
17
+
18
+ ===============================================================================================
19
+ '''
20
+
21
+
22
def sep_path_basename_ext(file_in):
    """Return (directory, base name, extension) of a file path.

    '.' is returned as directory when file_in contains no directory part;
    the extension includes its leading dot and is '' when absent.
    """
    # directory part, falling back to the current directory
    folder = os.path.dirname(file_in) or '.'

    # file name split into base name and extension
    stem, suffix = os.path.splitext(os.path.basename(file_in))

    return folder, stem, suffix
33
+
34
+
35
def _write_single_line_fasta(fasta_in, fasta_out, no_gap):
    """Copy fasta_in to fasta_out with each sequence on a single line.

    With no_gap set, '-' characters are removed and sequences that end up
    empty are skipped.
    """
    with open(fasta_out, 'w') as fasta_out_handle:
        for each_seq in SeqIO.parse(fasta_in, 'fasta'):
            seq_sequence = str(each_seq.seq)
            if no_gap is True:
                seq_sequence = seq_sequence.replace('-', '')
                if len(seq_sequence) == 0:
                    continue
            fasta_out_handle.write('>%s\n' % each_seq.id)
            fasta_out_handle.write('%s\n' % seq_sequence)


def _convert_single_msa(aln_in, aln_in_format, aln_out, aln_out_tmp, aln_out_format, one_line, no_gap):
    """Convert one alignment file, optionally post-processing the fasta output."""
    if (one_line is False) and (no_gap is False):
        AlignIO.convert(aln_in, aln_in_format, aln_out, aln_out_format)
        return

    # convert into a temporary file first, then rewrite it record by record
    AlignIO.convert(aln_in, aln_in_format, aln_out_tmp, aln_out_format)
    try:
        _write_single_line_fasta(aln_out_tmp, aln_out, no_gap)
    finally:
        # never leave the temporary file behind, even if rewriting fails
        os.remove(aln_out_tmp)


def ConvertMSA(args):
    """Convert one alignment file, or every alignment in a folder, to another format.

    Args:
        args: dict with the keys
            'i'       - input alignment file or folder,
            'xi'      - extension of the input files (folder mode),
            'fi'      - input format understood by Bio.AlignIO,
            'o'       - output alignment file (file mode) or folder
                        (folder mode),
            'xo'      - extension of the output files (folder mode),
            'fo'      - output format understood by Bio.AlignIO,
            'oneline' - True to write each sequence on a single line,
            'nogap'   - True to remove gaps; sequences left empty are
                        dropped, in both file and folder mode,
            'f'       - True to overwrite an existing output folder.

    'oneline' and 'nogap' are only accepted when 'fo' is 'fasta'. The
    program exits (after printing a message) on invalid options, when no
    input file matches in folder mode, or when the output folder exists and
    overwriting was not requested.
    """
    import shutil

    aln_in = args['i']
    aln_in_ext = args['xi']
    aln_in_format = args['fi']
    aln_out = args['o']
    aln_out_ext = args['xo']
    aln_out_format = args['fo']
    one_line = args['oneline']
    no_gap = args['nogap']
    force_overwriting = args['f']

    if ((one_line is True) or (no_gap is True)) and (aln_out_format != 'fasta'):
        print('Please provide "-oneline" and/or "-nogap" only if "-fo" is fasta')
        exit()

    if os.path.isfile(aln_in) is True:
        _convert_single_msa(aln_in, aln_in_format, aln_out, aln_out + '.tmp', aln_out_format, one_line, no_gap)
        print('Done!')

    elif os.path.isdir(aln_in) is True:
        aln_in_re = '%s/*.%s' % (aln_in, aln_in_ext)
        aln_in_list = [os.path.basename(file_name) for file_name in glob.glob(aln_in_re)]

        # check input
        if len(aln_in_list) == 0:
            print('Input file not detected, program exited!')
            exit()

        # check output folder
        if os.path.isdir(aln_out) is True:
            if force_overwriting is True:
                shutil.rmtree(aln_out)
            else:
                print('Output folder already exist, program exited!')
                exit()
        os.mkdir(aln_out)

        # convert
        for each_aln_in in aln_in_list:
            # each_aln_in is a bare file name, so splitext yields its base name
            aln_in_basename = os.path.splitext(each_aln_in)[0]
            pwd_aln_in = '%s/%s' % (aln_in, each_aln_in)
            pwd_aln_out = '%s/%s.%s' % (aln_out, aln_in_basename, aln_out_ext)
            pwd_aln_out_tmp = '%s/%s_tmp.%s' % (aln_out, aln_in_basename, aln_out_ext)
            _convert_single_msa(pwd_aln_in, aln_in_format, pwd_aln_out, pwd_aln_out_tmp, aln_out_format, one_line, no_gap)
        print('Done!')

    else:
        print('Input file not found, program exited!')
119
+
120
+
121
if __name__ == '__main__':

    # command-line entry point used when this module is run directly;
    # options are declared in a table and registered in the listed order
    cli = argparse.ArgumentParser()
    option_table = (
        ('-i',       dict(required=True, help='input alignment')),
        ('-xi',      dict(required=False, default='aln', help='input alignment extension')),
        ('-fi',      dict(required=True, help='input alignment format, e.g., fasta, phylip')),
        ('-o',       dict(required=True, help='output alignment')),
        ('-xo',      dict(required=False, default='aln', help='output alignment extension')),
        ('-fo',      dict(required=True, help='output alignment format, e.g., fasta, phylip')),
        ('-oneline', dict(required=False, action="store_true", help='put sequence in single line, available if -fo is fasta')),
        ('-nogap',   dict(required=False, action="store_true", help='remove gaps from alignment, available if -fo is fasta')),
        ('-f',       dict(required=False, action="store_true", help='force overwrite existing output folder')),
    )
    for flag, settings in option_table:
        cli.add_argument(flag, **settings)

    ConvertMSA(vars(cli.parse_args()))