treesak 1.51.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of treesak might be problematic. Click here for more details.

Files changed (125) hide show
  1. TreeSAK/ALE.py +63 -0
  2. TreeSAK/ALE1.py +268 -0
  3. TreeSAK/ALE2.py +168 -0
  4. TreeSAK/ALE2RTC.py +30 -0
  5. TreeSAK/ALE3.py +205 -0
  6. TreeSAK/ALE4.py +636 -0
  7. TreeSAK/ALE5.py +210 -0
  8. TreeSAK/ALE6.py +401 -0
  9. TreeSAK/ALE7.py +126 -0
  10. TreeSAK/ALE_backup.py +1081 -0
  11. TreeSAK/AssessCVG.py +128 -0
  12. TreeSAK/AssessMarker.py +306 -0
  13. TreeSAK/AssessMarkerDeltaLL.py +257 -0
  14. TreeSAK/AssessMarkerPA.py +317 -0
  15. TreeSAK/AssessPB.py +130 -0
  16. TreeSAK/BMGE.jar +0 -0
  17. TreeSAK/BMGE.py +49 -0
  18. TreeSAK/CompareMCMC.py +138 -0
  19. TreeSAK/ConcateMSA.py +111 -0
  20. TreeSAK/ConvertMSA.py +135 -0
  21. TreeSAK/Dir.rb +82 -0
  22. TreeSAK/ExtractMarkerSeq.py +263 -0
  23. TreeSAK/FastRoot.py +1175 -0
  24. TreeSAK/FastRoot_backup.py +1122 -0
  25. TreeSAK/FigTree.py +34 -0
  26. TreeSAK/GTDB_tree.py +76 -0
  27. TreeSAK/GeneTree.py +142 -0
  28. TreeSAK/KEGG_Luo17.py +807 -0
  29. TreeSAK/LcaToLeaves.py +66 -0
  30. TreeSAK/MarkerRef2Tree.py +616 -0
  31. TreeSAK/MarkerRef2Tree_backup.py +628 -0
  32. TreeSAK/MarkerSeq2Tree.py +290 -0
  33. TreeSAK/MarkerSeq2Tree_backup.py +259 -0
  34. TreeSAK/ModifyTopo.py +116 -0
  35. TreeSAK/Newick_tree_plotter.py +79 -0
  36. TreeSAK/OMA.py +170 -0
  37. TreeSAK/OMA2.py +212 -0
  38. TreeSAK/OneLineAln.py +50 -0
  39. TreeSAK/PB.py +155 -0
  40. TreeSAK/PMSF.py +106 -0
  41. TreeSAK/PhyloBiAssoc.R +84 -0
  42. TreeSAK/PhyloBiAssoc.py +167 -0
  43. TreeSAK/PlotMCMC.py +41 -0
  44. TreeSAK/PlotMcmcNode.py +152 -0
  45. TreeSAK/PlotMcmcNode_old.py +252 -0
  46. TreeSAK/RootTree.py +101 -0
  47. TreeSAK/RootTreeGTDB214.py +288 -0
  48. TreeSAK/RootTreeGTDB220.py +300 -0
  49. TreeSAK/RootTreeGTDB226.py +300 -0
  50. TreeSAK/SequentialDating.py +16 -0
  51. TreeSAK/SingleAleHGT.py +157 -0
  52. TreeSAK/SingleLinePhy.py +50 -0
  53. TreeSAK/SliceMSA.py +142 -0
  54. TreeSAK/SplitScore.py +19 -0
  55. TreeSAK/SplitScore1.py +178 -0
  56. TreeSAK/SplitScore1OMA.py +148 -0
  57. TreeSAK/SplitScore2.py +597 -0
  58. TreeSAK/TaxaCountStats.R +256 -0
  59. TreeSAK/TaxonTree.py +47 -0
  60. TreeSAK/TreeSAK_config.py +32 -0
  61. TreeSAK/VERSION +158 -0
  62. TreeSAK/VisHPD95.R +45 -0
  63. TreeSAK/VisHPD95.py +200 -0
  64. TreeSAK/__init__.py +0 -0
  65. TreeSAK/ale_parser.py +74 -0
  66. TreeSAK/ale_splitter.py +63 -0
  67. TreeSAK/alignment_pruner.pl +1471 -0
  68. TreeSAK/assessOG.py +45 -0
  69. TreeSAK/catfasta2phy.py +140 -0
  70. TreeSAK/cogTree.py +185 -0
  71. TreeSAK/compare_trees.R +30 -0
  72. TreeSAK/compare_trees.py +255 -0
  73. TreeSAK/dating.py +264 -0
  74. TreeSAK/dating_ss.py +361 -0
  75. TreeSAK/deltall.py +82 -0
  76. TreeSAK/do_rrtc.rb +464 -0
  77. TreeSAK/fa2phy.py +42 -0
  78. TreeSAK/format_leaf_name.py +70 -0
  79. TreeSAK/gap_stats.py +38 -0
  80. TreeSAK/get_SCG_tree.py +742 -0
  81. TreeSAK/get_arCOG_seq.py +97 -0
  82. TreeSAK/global_functions.py +222 -0
  83. TreeSAK/gnm_leaves.py +43 -0
  84. TreeSAK/iTOL.py +791 -0
  85. TreeSAK/iTOL_gene_tree.py +80 -0
  86. TreeSAK/itol_msa_stats.py +56 -0
  87. TreeSAK/keep_highest_rrtc.py +37 -0
  88. TreeSAK/koTree.py +194 -0
  89. TreeSAK/label_tree.R +75 -0
  90. TreeSAK/label_tree.py +121 -0
  91. TreeSAK/mad.py +708 -0
  92. TreeSAK/mcmc2tree.py +58 -0
  93. TreeSAK/mcmcTC copy.py +92 -0
  94. TreeSAK/mcmcTC.py +104 -0
  95. TreeSAK/mcmctree_vs_reltime.R +44 -0
  96. TreeSAK/mcmctree_vs_reltime.py +252 -0
  97. TreeSAK/merge_pdf.py +32 -0
  98. TreeSAK/pRTC.py +56 -0
  99. TreeSAK/parse_mcmctree.py +198 -0
  100. TreeSAK/parse_reltime.py +141 -0
  101. TreeSAK/phy2fa.py +37 -0
  102. TreeSAK/plot_distruibution_th.py +165 -0
  103. TreeSAK/prep_mcmctree_ctl.py +92 -0
  104. TreeSAK/print_leaves.py +32 -0
  105. TreeSAK/pruneMSA.py +63 -0
  106. TreeSAK/recode.py +73 -0
  107. TreeSAK/remove_bias.R +112 -0
  108. TreeSAK/rename_leaves.py +77 -0
  109. TreeSAK/replace_clade.py +55 -0
  110. TreeSAK/root_with_out_group.py +84 -0
  111. TreeSAK/run_TaxaCountStats_R_s1.py +455 -0
  112. TreeSAK/subsample_drep_gnms.py +74 -0
  113. TreeSAK/subset.py +69 -0
  114. TreeSAK/subset_tree_stupid_old_way.py +193 -0
  115. TreeSAK/supertree.py +330 -0
  116. TreeSAK/tmp_1.py +19 -0
  117. TreeSAK/tmp_2.py +19 -0
  118. TreeSAK/tmp_3.py +120 -0
  119. TreeSAK/weighted_rand.rb +23 -0
  120. treesak-1.51.2.data/scripts/TreeSAK +950 -0
  121. treesak-1.51.2.dist-info/LICENSE +674 -0
  122. treesak-1.51.2.dist-info/METADATA +27 -0
  123. treesak-1.51.2.dist-info/RECORD +125 -0
  124. treesak-1.51.2.dist-info/WHEEL +5 -0
  125. treesak-1.51.2.dist-info/top_level.txt +1 -0
@@ -0,0 +1,193 @@
1
+ import os
2
+ import copy
3
+ import argparse
4
+ from Bio import Phylo
5
+ from datetime import datetime
6
+
7
+
8
+ subset_tree_usage = '''
9
+ ========================== subset_tree example command ==========================
10
+
11
+ BioSAK subset_tree -tree tree_in.newick -taxon leaves.txt -out tree_out.newick
12
+
13
+ =================================================================================
14
+ '''
15
+
16
+
17
def check_to_keep(clade_name_str, identified_taxon_list):
    """Return 1 if a clade name mentions one of the wanted taxa, otherwise 0.

    When the name contains ':', only the text between the first and the
    second ':' is looked at.  That text is split on ';', one leading and one
    trailing space are removed from every piece, and the clade is kept when
    at least one wanted taxon equals one of the pieces.  Names without ';'
    are compared as a whole.

    Args:
        clade_name_str: clade (or leaf) name as a string.
        identified_taxon_list: iterable of taxon names to look for.

    Returns:
        1 when a wanted taxon is found in the name, 0 otherwise.
    """
    # remove the part in front of the colon from the clade name
    name_no_colon = clade_name_str
    if ':' in name_no_colon:
        name_no_colon = name_no_colon.split(':')[1]

    # split the clade name if there are ';'
    if ';' in name_no_colon:
        for_check = []
        for piece in name_no_colon.split(';'):
            # startswith/endswith (rather than piece[0] / piece[-1]) keep this
            # safe for empty pieces produced by ';;' or a trailing ';'
            if piece.startswith(' '):
                piece = piece[1:]
            if piece.endswith(' '):
                piece = piece[:-1]
            # an empty piece is not a taxon name, so it is never compared
            if piece:
                for_check.append(piece)
    else:
        for_check = [name_no_colon]

    # check to keep or not
    for identified_taxon in identified_taxon_list:
        if identified_taxon in for_check:
            return 1
    return 0
48
+
49
+
50
def remove_unwanted_leaf_nodes(tree, identified_taxon_list):
    """Collapse every unwanted terminal on a deep copy of *tree*.

    The input tree is left untouched.  Each terminal of the copy whose name
    is rejected by check_to_keep() is passed to the copy's collapse() method.

    Args:
        tree: tree object providing get_terminals() and collapse().
        identified_taxon_list: taxon names that mark a terminal as wanted.

    Returns:
        Tuple (edited copy of the tree, number of terminals collapsed).
    """
    pruned = copy.deepcopy(tree)

    dropped = 0
    for terminal in pruned.get_terminals():
        # wanted terminals are left alone
        if check_to_keep(str(terminal.name), identified_taxon_list) != 0:
            continue
        pruned.collapse(terminal)
        dropped += 1

    return pruned, dropped
66
+
67
+
68
def subset_tree(args):
    """Write a subset of a newick tree that only keeps the requested taxa.

    Steps:
      1. read the tree and the list of taxa to keep (one per line),
      2. repeatedly collapse unwanted terminals until a round removes none,
      3. strip the "xxx:" prefix from clade names,
      4. rename terminals whose name is a ';'-separated list to the matching
         requested taxon,
      5. write the result and warn about requested taxa that were not found.

    Two temporary files ('<out>.tmp_1.tree' and '<out>.tmp_2.tree') are
    written next to the output file and removed at the end.

    Args:
        args: dict with the keys 'tree' (input newick file), 'taxon' (file
            listing the taxa to keep), 'out' (output newick file) and
            'q' (True to suppress progress messages).
    """
    ################################################# input #################################################

    tree_file_in = args['tree']
    group_to_taxon_file = args['taxon']
    tree_file_out = args['out']
    keep_quiet = args['q']

    # define tmp file name
    tree_file_tmp_1 = '%s.tmp_1.tree' % tree_file_out
    tree_file_tmp_2 = '%s.tmp_2.tree' % tree_file_out
    time_format = '[%Y-%m-%d %H:%M:%S] '

    ################################################ store input information ###########################################

    # read in tree
    tree_in = Phylo.read(tree_file_in, 'newick')

    # read in all identified taxons; blank lines are ignored so that they are
    # neither counted nor reported as a missing taxon
    identified_taxon_list = set()
    with open(group_to_taxon_file) as taxon_handle:
        for each_group in taxon_handle:
            taxon_name = each_group.strip()
            if taxon_name:
                identified_taxon_list.add(taxon_name)

    if keep_quiet is False:
        print(datetime.now().strftime(time_format) + 'The number of provided taxon: %s' % len(identified_taxon_list))

    ########################################## remove unwanted nodes recursively #######################################

    # remove unwanted nodes recursively: collapsing terminals can expose new
    # terminals, so repeat until a round removes nothing
    if keep_quiet is False:
        print(datetime.now().strftime(time_format) + 'Recursively removing unwanted nodes')
    deleted_leaf_num = 1
    n = 0
    tree_in_copy = copy.deepcopy(tree_in)
    while deleted_leaf_num > 0:
        tree_in_copy, deleted_leaf_num = remove_unwanted_leaf_nodes(tree_in_copy, identified_taxon_list)
        n += 1
        if keep_quiet is False:
            print(datetime.now().strftime(time_format) + 'Removed %s nodes in %sth round' % (deleted_leaf_num, n))

    # write out tree
    Phylo.write(tree_in_copy, tree_file_tmp_1, 'newick')

    ############################################# remove "100:" in clade name ##########################################

    # read in tree
    tree_tmp_1 = Phylo.read(tree_file_tmp_1, 'newick')
    tree_tmp_1_copy = copy.deepcopy(tree_tmp_1)

    for clade in tree_tmp_1_copy.find_clades():
        clade_name = str(clade.name)
        if ':' in clade_name:
            clade.name = clade_name.split(':')[1]

    Phylo.write(tree_tmp_1_copy, tree_file_tmp_2, 'newick')

    ################################################ rename leaf nodes name ############################################

    # read in tree
    tree_tmp_2 = Phylo.read(tree_file_tmp_2, 'newick')
    tree_tmp_2_copy = copy.deepcopy(tree_tmp_2)

    # get all leaf nodes
    all_leaf_nodes = tree_tmp_2_copy.get_terminals()
    for leaf_node in all_leaf_nodes:
        leaf_node_name_str = str(leaf_node.name)

        if ';' in leaf_node_name_str:
            leaf_node_name_split = leaf_node_name_str.split(';')

            # remove one space at the beginning or end; startswith/endswith
            # are used so that empty pieces (';;', trailing ';') cannot raise
            leaf_node_name_split_no_space = []
            for each_name in leaf_node_name_split:
                if each_name.startswith(' '):
                    each_name = each_name[1:]
                if each_name.endswith(' '):
                    each_name = each_name[:-1]
                leaf_node_name_split_no_space.append(each_name)

            # the leaf is renamed to the requested taxon found in its name
            # (empty string if none is found)
            leaf_node_name_new = ''
            for identified_taxon in identified_taxon_list:
                if identified_taxon in leaf_node_name_split_no_space:
                    leaf_node_name_new = identified_taxon

            leaf_node.name = leaf_node_name_new

    # write out tree
    Phylo.write(tree_tmp_2_copy, tree_file_out, 'newick')

    # report
    if keep_quiet is False:
        print(datetime.now().strftime(time_format) + 'Tree subset exported to: %s' % tree_file_out)

    # print warning message if some provided node(s) were not found
    extracted_leaf_nodes = tree_tmp_2_copy.get_terminals()
    if len(extracted_leaf_nodes) < len(identified_taxon_list):

        extracted_leaf_node_list = []
        for extracted_leaf_node in extracted_leaf_nodes:
            extracted_leaf_node_list.append(str(extracted_leaf_node.name))

        un_extracted_nodes = []
        for provided_node in identified_taxon_list:
            if provided_node not in extracted_leaf_node_list:
                un_extracted_nodes.append(provided_node)

        if keep_quiet is False:
            print(datetime.now().strftime(time_format) + 'Warning!!! Found %s of %s provided nodes, missed: %s' % (len(extracted_leaf_nodes), len(identified_taxon_list), ', '.join(un_extracted_nodes)))

    ################################################### remove tmp files ###############################################

    # remove tmp files
    os.remove(tree_file_tmp_1)
    os.remove(tree_file_tmp_2)
183
+
184
+
185
if __name__ == '__main__':
    # Command-line entry point: parse the options and run subset_tree().
    parser = argparse.ArgumentParser(description='', add_help=False)
    parser.add_argument('-h', action='help', help='Show this help message and exit')
    parser.add_argument('-tree', required=True, type=str, help='input tree file')
    parser.add_argument('-taxon', required=True, type=str, help='A file containing list of leaves to keep, one leaf per line')
    parser.add_argument('-out', required=True, type=str, help='Output tree file')
    parser.add_argument('-q', required=False, action="store_true", help='do not report progress')
    args = vars(parser.parse_args())
    # the script previously ended in a stray name that raised NameError
    # instead of running the subsetting
    subset_tree(args)
TreeSAK/supertree.py ADDED
@@ -0,0 +1,330 @@
1
+ import os
2
+ import glob
3
+ import argparse
4
+ from ete3 import Tree
5
+ from Bio import AlignIO
6
+ import multiprocessing as mp
7
+
8
+
9
+ supertree_usage = '''
10
+ ====================== supertree example commands ======================
11
+
12
+ Dependencies: mafft, trimal, bmge and iqtree2
13
+
14
+ TreeSAK supertree -i best10 -x fa -o best10_astral_tree -bmge -t 12 -f
15
+
16
+ ========================================================================
17
+ '''
18
+
19
+
20
def sep_path_basename_ext(file_in):
    """Split a file path into its components.

    Args:
        file_in: path to a file.

    Returns:
        Tuple (file name, directory, file name without extension, extension
        without the leading dot).  The directory is '.' when *file_in* has
        no directory part.
    """
    directory, filename = os.path.split(file_in)
    stem, dotted_ext = os.path.splitext(filename)
    return filename, directory or '.', stem, dotted_ext[1:]
28
+
29
+
30
def fa2phy(fasta_in, phy_out):
    """Rewrite a fasta alignment as a one-line-per-sequence phylip file.

    The first output line holds the number of sequences and the alignment
    length.  Every following line holds a sequence id, padded with spaces so
    that all sequences start two columns after the longest id, followed by
    the sequence.

    Args:
        fasta_in: alignment file read with AlignIO in 'fasta' format.
        phy_out: path of the file to write.
    """
    msa = AlignIO.read(fasta_in, 'fasta')

    # width of the id column: longest id plus two spaces
    id_column_width = max((len(record.id) for record in msa), default=0) + 2

    with open(phy_out, 'w') as phy_handle:
        phy_handle.write('%s %s\n' % (len(msa), msa.get_alignment_length()))
        for record in msa:
            phy_handle.write('%s%s\n' % (record.id.ljust(id_column_width), str(record.seq)))
46
+
47
+
48
def PB(msa_in, op_dir, op_prefix, fa_to_plp, num_of_threads, num_of_chains, force_overwrite):
    """Build and run pb_mpi commands for one alignment.

    The output directory is (re)created, the alignment is optionally
    converted to phylip with fa2phy(), one 'mpirun ... pb_mpi' command is
    prepared per chain, all commands (plus the commands to restart them) are
    written to '<op_dir>/<alignment basename>_cmds.txt', and the chain
    commands are executed through a multiprocessing pool.

    Args:
        msa_in: input alignment file.
        op_dir: output directory.
        op_prefix: prefix of the chain names.
        fa_to_plp: True to convert the input to phylip before running.
        num_of_threads: total number of cores to use.
        num_of_chains: number of chains to run.
        force_overwrite: True to delete an existing output directory;
            otherwise the program exits when the directory exists.
    """
    msa_stem = sep_path_basename_ext(msa_in)[2]

    pb_settings = '-cat -gtr -x 10 -1 -dgam 4'
    phylip_path = '%s/%s.phylip' % (op_dir, msa_stem)
    cmd_log = '%s/%s_cmds.txt' % (op_dir, msa_stem)

    # create output dir
    if os.path.isdir(op_dir):
        if force_overwrite is not True:
            print('output folder already exist, program exited!')
            exit()
        os.system('rm -r %s' % op_dir)
    os.system('mkdir %s' % op_dir)

    # fa_to_phylip
    alignment = msa_in
    if fa_to_plp is True:
        fa2phy(msa_in, phylip_path)
        alignment = phylip_path

    run_cmds = []
    chain_paths = []
    if num_of_chains == 1:
        # a single chain gets all cores and writes directly into op_dir
        pool_size = 1
        np_per_chain = num_of_threads
        chain_paths.append('%s/%s' % (op_dir, op_prefix))
        run_cmds.append('mpirun -np %s pb_mpi -d %s %s -s %s/%s' % (num_of_threads, alignment, pb_settings, op_dir, op_prefix))
    else:
        if num_of_threads <= num_of_chains:
            # not more cores than chains: one core per chain
            pool_size = num_of_threads
            np_per_chain = 1
        else:
            # cores are shared evenly between the chains
            pool_size = num_of_chains
            np_per_chain = num_of_threads // num_of_chains

        # every chain runs in its own sub-directory
        for idx in range(1, num_of_chains + 1):
            chain_dir = '%s/%s_chain%s' % (op_dir, op_prefix, idx)
            os.mkdir(chain_dir)
            chain_paths.append('%s/%s_chain%s' % (chain_dir, op_prefix, idx))
            run_cmds.append('mpirun -np %s pb_mpi -d %s %s -s %s/%s_chain%s' % (np_per_chain, alignment, pb_settings, chain_dir, op_prefix, idx))

    # write out commands
    with open(cmd_log, 'w') as log_handle:
        for run_cmd in run_cmds:
            log_handle.write(run_cmd + '\n')
        log_handle.write('\n# To restart a terminated run (e.g., due to walltime limitation)\n')
        for chain_path in chain_paths:
            log_handle.write('mpirun -np %s pb_mpi %s\n' % (np_per_chain, chain_path))

    # run chains with mp
    print('Running pb_mpi with multiprocessing')
    pool = mp.Pool(processes=pool_size)
    pool.map(os.system, run_cmds)
    pool.close()
    pool.join()

    # assess the results
    if num_of_chains > 1:
        joined_chains = ' '.join(chain_paths)
        assess_tree_cmd = 'bpcomp -x 1000 10 %s' % joined_chains
        assess_trace_cmd = 'tracecomp -x 1000 %s' % joined_chains

        # write out commands
        with open(cmd_log, 'a') as log_handle:
            log_handle.write(assess_tree_cmd + '\n')
            log_handle.write(assess_trace_cmd + '\n')

        # report
        print('You may want to use the following commands to assess the results:')
        print(assess_tree_cmd)
        print(assess_trace_cmd)

    print('Done!')
142
+
143
+
144
def supertree(args):
    """Infer gene trees for a folder of protein families and combine them.

    For every fasta file '<i>/*.<x>' the function prepares a mafft-einsi
    alignment command, a trimming command (trimal, or BMGE when requested)
    and a tree inference command (iqtree2, or 'TreeSAK PB' when requested).
    All commands are written to text files in the output directory.  The
    alignment and trimming commands are executed with a multiprocessing
    pool.  Unless PhyloBayes was requested, the iqtree2 commands are run one
    after another, the resulting gene trees are combined, a gene-to-genome
    mapping file is written and astral is run on the combined trees.

    Args:
        args: dict with the keys 'i' (input folder), 'x' (file extension),
            'o' (output folder), 'bmge' (trim with BMGE), 'bmge_m' (BMGE
            model), 'bmge_esc' (BMGE entropy score cutoff), 'iqtree_m'
            (iqtree model), 'f' (overwrite existing output folder),
            't' (number of threads) and 'pb' (prepare PhyloBayes commands
            instead of running iqtree2).
    """
    oma_op_fasta = args['i']
    fasta_file_ext = args['x']
    op_dir = args['o']
    trim_with_bmge = args['bmge']
    trim_model = args['bmge_m']
    entropy_score_cutoff = args['bmge_esc']
    iqtree_model = args['iqtree_m']
    force_overwrite = args['f']
    num_of_threads = args['t']
    infer_pb_tree = args['pb']

    # specify path to BMGE.jar (looked up in the folder holding this file)
    current_file_path = '/'.join(os.path.realpath(__file__).split('/')[:-1])
    pwd_bmge_jar = '%s/BMGE.jar' % current_file_path

    fa_file_re = '%s/*.%s' % (oma_op_fasta, fasta_file_ext)
    fa_file_list = ['.'.join(os.path.basename(i).split('.')[:-1]) for i in glob.glob(fa_file_re)]

    if len(fa_file_list) == 0:
        print('No file found in %s, program exited!' % oma_op_fasta)
        exit()

    ################################################################################

    # define file name
    cmd_1_mafft_txt = '%s/cmd_1_mafft.txt' % op_dir
    cmd_2_trim_txt = '%s/cmd_2_trim.txt' % op_dir
    cmd_3_tree_txt = '%s/cmd_3_tree.txt' % op_dir
    cmd_4_astral_txt = '%s/cmd_4_astral.txt' % op_dir
    aln_dir = '%s/dir_1_msa' % op_dir
    trimmed_aln_dir = '%s/dir_2_trimmed_msa' % op_dir
    tree_dir = '%s/dir_3_tree' % op_dir
    combined_gene_tree_file = '%s/combined_trees.txt' % op_dir
    astral_mapping_txt = '%s/name_mapping.txt' % op_dir
    consensus_tree_txt = '%s/consensus_tree.treefile' % op_dir

    ################################################################################

    # create output folder
    if os.path.isdir(op_dir) is True:
        if force_overwrite is True:
            os.system('rm -r %s' % op_dir)
        else:
            print('%s exist, program exited!' % op_dir)
            exit()

    os.mkdir(op_dir)
    os.mkdir(aln_dir)
    os.mkdir(trimmed_aln_dir)
    os.mkdir(tree_dir)

    ################################################################################

    cmd_list_mafft = []
    cmd_list_trim = []
    cmd_list_tree = []
    with open(cmd_1_mafft_txt, 'w') as cmd_1_mafft_txt_handle, \
            open(cmd_2_trim_txt, 'w') as cmd_2_trim_txt_handle, \
            open(cmd_3_tree_txt, 'w') as cmd_3_tree_txt_handle:
        for each_og in sorted(fa_file_list):

            # define file name
            current_gene_tree_dir = '%s/dir_3_tree/%s' % (op_dir, each_og)
            og_fa = '%s/%s.%s' % (oma_op_fasta, each_og, fasta_file_ext)
            og_aln = '%s/%s.aln' % (aln_dir, each_og)
            og_aln_trimmed = '%s/%s_trimal.aln' % (trimmed_aln_dir, each_og)
            if trim_with_bmge is True:
                og_aln_trimmed = '%s/%s_bmge.aln' % (trimmed_aln_dir, each_og)

            # every gene tree gets its own sub-folder of dir_3_tree
            os.mkdir(current_gene_tree_dir)

            # prepare commands
            mafft_cmd = 'mafft-einsi --thread %s --quiet %s > %s' % (1, og_fa, og_aln)

            trim_cmd = 'trimal -in %s -out %s -automated1' % (og_aln, og_aln_trimmed)
            if trim_with_bmge is True:
                trim_cmd = 'java -jar %s -i %s -m %s -t AA -h %s -of %s' % (pwd_bmge_jar, og_aln, trim_model, entropy_score_cutoff, og_aln_trimmed)

            infer_tree_cmd = 'iqtree2 -s %s --seqtype AA -m %s -B 1000 --wbtl --bnni --prefix %s/%s -T %s --quiet' % (og_aln_trimmed, iqtree_model, current_gene_tree_dir, each_og, num_of_threads)
            if infer_pb_tree is True:
                infer_tree_cmd = 'TreeSAK PB -i %s -o %s -p %s -t %s -n %s -fa2plp' % (og_aln_trimmed, current_gene_tree_dir, each_og, num_of_threads, 4)

            # add commands to list
            cmd_list_mafft.append(mafft_cmd)
            cmd_list_trim.append(trim_cmd)
            cmd_list_tree.append(infer_tree_cmd)

            # write out commands
            cmd_1_mafft_txt_handle.write(mafft_cmd + '\n')
            cmd_2_trim_txt_handle.write(trim_cmd + '\n')
            cmd_3_tree_txt_handle.write(infer_tree_cmd + '\n')

    # run mafft commands (each command uses one thread, so run them in parallel)
    print('Running mafft with %s cores for %s OGs' % (num_of_threads, len(fa_file_list)))
    pool = mp.Pool(processes=num_of_threads)
    pool.map(os.system, cmd_list_mafft)
    pool.close()
    pool.join()

    # run trim commands
    print('Trimming with %s cores for %s OGs' % (num_of_threads, len(fa_file_list)))
    pool = mp.Pool(processes=num_of_threads)
    pool.map(os.system, cmd_list_trim)
    pool.close()
    pool.join()

    # run iqtree commands (one at a time, each command already uses all threads)
    if infer_pb_tree is False:
        print('Running iqtree with %s cores for %s OGs' % (num_of_threads, len(fa_file_list)))
        for each_iqtree_cmd in sorted(cmd_list_tree):
            print(each_iqtree_cmd)
            os.system(each_iqtree_cmd)
    else:
        print('Commands for inferring PhyloBayes tree exported to %s' % cmd_3_tree_txt)

    #################################################### run astral ####################################################

    if infer_pb_tree is False:

        # cat gene trees; the iqtree2 prefix is <tree_dir>/<OG>/<OG>, so the
        # tree files sit one level below tree_dir
        os.system('cat %s/*/*.treefile > %s' % (tree_dir, combined_gene_tree_file))

        # genome id = leaf name without its last '_'-separated field
        gnm_to_gene_dict = dict()
        with open(combined_gene_tree_file) as combined_gene_tree_handle:
            for each_tree in combined_gene_tree_handle:
                tree_str = each_tree.strip()
                if not tree_str:
                    # blank lines are not trees
                    continue
                current_tree = Tree(tree_str, quoted_node_names=True, format=1)
                for node in current_tree.traverse():
                    if node.is_leaf():
                        leaf_name = node.name
                        leaf_gnm = '_'.join(leaf_name.split('_')[:-1])
                        if leaf_gnm not in gnm_to_gene_dict:
                            gnm_to_gene_dict[leaf_gnm] = {leaf_name}
                        else:
                            gnm_to_gene_dict[leaf_gnm].add(leaf_name)

        # get the mapping file (gene id, tab, genome id)
        with open(astral_mapping_txt, 'w') as astral_mapping_txt_handle:
            for each_gnm in gnm_to_gene_dict:
                current_gene_set = gnm_to_gene_dict[each_gnm]
                for each_gene in current_gene_set:
                    astral_mapping_txt_handle.write('%s\t%s\n' % (each_gene, each_gnm))

        # may need to add more parameters
        astral_cmd = 'astral -i %s -o %s -t %s -a %s' % (combined_gene_tree_file, consensus_tree_txt, num_of_threads, astral_mapping_txt)
        # -r  --round      Integer  4  Number of initial rounds of placements
        # -s  --subsample  Integer  4  Number of rounds of subsampling per exploration step

        # write out command
        with open(cmd_4_astral_txt, 'w') as cmd_4_astral_txt_handle:
            cmd_4_astral_txt_handle.write(astral_cmd + '\n')

        # run astral
        os.system(astral_cmd)

    else:
        print('Things to do:')
        print('Run PhyloBayes with commands exported to %s' % cmd_3_tree_txt)
        print('Wait until your chains, for each of your protein family, reached convergence')
        print('1. get consensus gene tree for each protein family usingn')
        print('2. get species tree based on the consensus gene trees with trimal')
312
+
313
+ ####################################################################################################################
314
+
315
+
316
if __name__ == '__main__':

    # Command-line entry point: every option is declared as (flag, settings)
    # and registered in order, then the parsed options are handed to supertree().
    supertree_parser = argparse.ArgumentParser()
    for option_flag, option_settings in (
            ('-i', dict(required=True, help='orthologous gene sequence')),
            ('-x', dict(required=True, help='faa file extension')),
            ('-o', dict(required=True, help='output directory')),
            ('-bmge', dict(required=False, action="store_true", help='trim with BMGE, default is trimal')),
            ('-bmge_m', dict(required=False, default='BLOSUM30', help='trim model, default: BLOSUM30')),
            ('-bmge_esc', dict(required=False, default='0.55', help='entropy score cutoff, default: 0.55')),
            ('-iqtree_m', dict(required=False, default='LG+G+I', help='iqtree_model, default: LG+G+I')),
            ('-pb', dict(required=False, action="store_true", help='infer tree with PhyloBayes-MPI, default is iqtree')),
            ('-f', dict(required=False, action="store_true", help='force overwrite')),
            ('-t', dict(required=False, type=int, default=1, help='num of threads, default: 1')),
    ):
        supertree_parser.add_argument(option_flag, **option_settings)
    args = vars(supertree_parser.parse_args())
    supertree(args)
TreeSAK/tmp_1.py ADDED
@@ -0,0 +1,19 @@
1
+
2
def gene_gain_and_loss(TableEvents_tsv):
    """Print selected columns of a tab-separated events table.

    Header lines (starting with 'Family<TAB>BranchType<TAB>Branch') and blank
    lines are skipped.  For every other line the function prints, separated
    by tabs: column 1 (protein family id), column 3 (node id), column 6
    (losses), an empty field, and column 10 (extinction probability).

    Args:
        TableEvents_tsv: path to the events table.

    Raises:
        IndexError: a data line has fewer than ten columns.
        ValueError: column 6, 7 or 10 of a data line is not a number.
    """
    header_prefix = 'Family\tBranchType\tBranch'
    with open(TableEvents_tsv) as table_handle:
        for each_line in table_handle:
            if each_line.startswith(header_prefix):
                continue
            if not each_line.strip():
                # blank lines (e.g. at the end of the file) hold no record
                continue

            each_line_split = each_line.strip().split('\t')
            protein_family_id = each_line_split[0]
            node_id = each_line_split[2]
            Losses = float(each_line_split[5])
            # NOTE(review): the originations value is parsed (and therefore
            # validated) but the fourth output column is left empty - confirm
            # whether it was meant to be printed there.
            Originations = float(each_line_split[6])
            Extinctinonprob = float(each_line_split[9])

            print('%s\t%s\t%s\t%s\t%s' % (protein_family_id, node_id, Losses, '', Extinctinonprob))
14
+
15
+
16
# Example input used when this file is run as a script.
TableEvents_tsv = '/Users/songweizhi/Desktop/Sponge_r220/10_dRep95_255_ALE_wd/dRep95_255_ALE3_op_dir_80/TableEvents.tsv'

# Only run when executed directly, so that importing the module does not try
# to read a machine-specific path.
if __name__ == '__main__':
    gene_gain_and_loss(TableEvents_tsv)
19
+
TreeSAK/tmp_2.py ADDED
@@ -0,0 +1,19 @@
1
+ from Bio import SeqIO
2
+
3
+
4
def filter_by_gap(file_in, max_gap_pct, file_out):
    """Copy the sequences of a fasta file whose gap content is low enough.

    A sequence is written to *file_out* (as '>id' plus the sequence on one
    line) when the percentage of '-' characters in it is not higher than
    *max_gap_pct*.

    Args:
        file_in: fasta file parsed with SeqIO.
        max_gap_pct: highest accepted gap percentage (0-100); anything
            accepted by float().
        file_out: path of the fasta file to write.
    """
    gap_cutoff = float(max_gap_pct)
    with open(file_out, 'w') as file_out_handle:
        for each_seq in SeqIO.parse(file_in, 'fasta'):
            seq_str = str(each_seq.seq)
            if seq_str:
                gap_pct = seq_str.count('-') * 100 / len(seq_str)
            else:
                # an empty record holds no residue at all; treat it like an
                # all-gap row instead of dividing by zero
                gap_pct = 100.0
            if gap_pct <= gap_cutoff:
                file_out_handle.write('>%s\n%s\n' % (each_seq.id, seq_str))
13
+
14
+
15
# Example settings used when this file is run as a script.
file_in = '/Users/songweizhi/Desktop/Jianwei_Maggie/mmseqs_cov0.85_iden0.35/All_PeptidaseS9-RiPPs_dna100_realRiPPS9100_addref.mmseqs.iden0.35.cov0.85.representatives.trimal.aln'
max_gap_pct = 40
file_out = '/Users/songweizhi/Desktop/Jianwei_Maggie/mmseqs_cov0.85_iden0.35/All_PeptidaseS9-RiPPs_dna100_realRiPPS9100_addref.mmseqs.iden0.35.cov0.85.representatives.trimal.maxgap40.aln'

# Only run when executed directly, so that importing the module does not try
# to read or write machine-specific paths.
if __name__ == '__main__':
    filter_by_gap(file_in, max_gap_pct, file_out)
TreeSAK/tmp_3.py ADDED
@@ -0,0 +1,120 @@
1
+ import os
2
+ import argparse
3
+ import arviz as az
4
+ import pandas as pd
5
+ import matplotlib as mpl
6
+ mpl.use('Agg')
7
+ import matplotlib.pyplot as plt
8
+ from matplotlib.pyplot import figure
9
+
10
+
11
+ CompareMCMC_usage = '''
12
+ ====================================== CompareMCMC example commands ======================================
13
+
14
+ TreeSAK CompareMCMC -mx IR_mcmc.txt -my AR_mcmc.txt -lx IR -ly AR -o convergence_plot.png -max 40 -fs 12
15
+
16
+ cd /Users/songweizhi/Desktop
17
+ TreeSAK CompareMCMC -mx /Users/songweizhi/Desktop/Sponge_r220/6_dating/MCMCTree/dating_outputs/topo2p10_clock3_nsample250000_run1_mcmc.txt -my /Users/songweizhi/Desktop/Sponge_r220/6_dating/MCMCTree/dating_outputs/topo2p10_clock3_nsample250000_run2_mcmc.txt -lx IR -ly AR -o convergence_plot.png -max 40 -fs 12
18
+
19
+ ==========================================================================================================
20
+ '''
21
+
22
+
23
def sep_path_basename_ext(file_in):
    """Split a file path into directory, base name and extension.

    Args:
        file_in: path to a file.

    Returns:
        Tuple (directory, file name without extension, extension including
        the leading dot).  The directory is '.' when *file_in* has no
        directory part.
    """
    directory, filename = os.path.split(file_in)
    stem, dotted_ext = os.path.splitext(filename)
    return directory or '.', stem, dotted_ext
29
+
30
+
31
def _read_interval_table(table_path):
    """Read labels, values and error-bar lengths from a tab-separated table.

    The first line is skipped.  Every other non-blank line must provide at
    least four columns: label, value, lower bound, upper bound.

    Returns:
        Tuple of four lists: labels, values, (value - lower bound) and
        (upper bound - value).
    """
    labels, values, err_low, err_high = [], [], [], []
    with open(table_path) as table_handle:
        for line_index, line in enumerate(table_handle):
            if line_index == 0 or not line.strip():
                continue
            line_split = line.strip().split('\t')
            value = float(line_split[1])
            labels.append(line_split[0])
            values.append(value)
            err_low.append(value - float(line_split[2]))
            err_high.append(float(line_split[3]) - value)
    return labels, values, err_low, err_high


def CompareMCMC(file_x='/Users/songweizhi/Desktop/x.txt',
                file_y='/Users/songweizhi/Desktop/y.txt',
                pwd_figure='/Users/songweizhi/Desktop/Figures.pdf',
                label_fs=16,
                max_axis_value=1):
    """Plot the values of two tables against each other with error bars.

    Both tables are read with _read_interval_table().  The values of
    *file_x* go on the x axis and those of *file_y* on the y axis; the
    bounds of each table give the horizontal and vertical error bars.  A
    dashed diagonal from (0, 0) to (1, 1) is drawn for reference and the
    figure is saved to *pwd_figure*.

    Args:
        file_x: table providing the x values.
        file_y: table providing the y values.
        pwd_figure: path of the figure to write.
        label_fs: font size of ticks and axis labels.
        max_axis_value: upper limit of both axes; None keeps the limits
            chosen by matplotlib.
    """
    min_value = 0
    max_value = 1

    label_x, num_list_x, x_err_l, x_err_r = _read_interval_table(file_x)
    label_y, num_list_y, y_err_l, y_err_u = _read_interval_table(file_y)

    figure(figsize=(6, 6), dpi=300)
    plt.plot([min_value, max_value], [min_value, max_value], color='black', linestyle='dashed', linewidth=1, alpha=0.5)
    # zero-size markers: only the error bars are meant to be visible
    plt.scatter(num_list_x, num_list_y, s=0)
    plt.errorbar(num_list_x, num_list_y, xerr=[x_err_l, x_err_r], yerr=[y_err_l, y_err_u],
                 ls='none', ecolor='skyblue', elinewidth=1, alpha=0.5)

    if max_axis_value is not None:
        plt.xlim([0, max_axis_value])
        plt.ylim([0, max_axis_value])

    # Set the font size of xticks and yticks
    plt.xticks(fontsize=label_fs)
    plt.yticks(fontsize=label_fs)
    # NOTE(review): label_x / label_y are the lists of row labels read from
    # the tables, not axis titles - confirm this is the intended axis label.
    plt.xlabel(label_x, fontsize=label_fs)
    plt.ylabel(label_y, fontsize=label_fs)

    # write out
    plt.tight_layout()
    plt.savefig(pwd_figure)
    plt.close()

    print('Plot exported to %s, done!' % pwd_figure)
104
+
105
+
106
# Only plot when executed directly, so that importing the module does not try
# to read machine-specific files.
if __name__ == '__main__':
    CompareMCMC()
107
+
108
+ # if __name__ == '__main__':
109
+ #
110
+ # # initialize the options parser
111
+ # parser = argparse.ArgumentParser()
112
+ # parser.add_argument('-mx', required=True, help='mcmc.txt for x axis')
113
+ # parser.add_argument('-my', required=True, help='mcmc.txt for y axis')
114
+ # parser.add_argument('-lx', required=False, default=None, help='label for x axis')
115
+ # parser.add_argument('-ly', required=False, default=None, help='label for y axis')
116
+ # parser.add_argument('-max', required=False, default=None, type=int, help='maximum axis value')
117
+ # parser.add_argument('-fs', required=False, default=16, type=int, help='label font size, default: 16')
118
+ # parser.add_argument('-o', required=True, help='output plot')
119
+ # args = vars(parser.parse_args())
120
+ # CompareMCMC(args)