treesak 1.53.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- TreeSAK/ALE.py +63 -0
- TreeSAK/ALE1.py +268 -0
- TreeSAK/ALE2.py +168 -0
- TreeSAK/ALE2RTC.py +30 -0
- TreeSAK/ALE3.py +205 -0
- TreeSAK/ALE4.py +636 -0
- TreeSAK/ALE5.py +210 -0
- TreeSAK/ALE6.py +401 -0
- TreeSAK/ALE7.py +126 -0
- TreeSAK/ALE_backup.py +1081 -0
- TreeSAK/AssessCVG.py +128 -0
- TreeSAK/AssessMarker.py +306 -0
- TreeSAK/AssessMarkerDeltaLL.py +257 -0
- TreeSAK/AssessMarkerPA.py +317 -0
- TreeSAK/AssessPB.py +113 -0
- TreeSAK/BMGE.jar +0 -0
- TreeSAK/BMGE.py +49 -0
- TreeSAK/C60SR4.nex +127 -0
- TreeSAK/CompareMCMC.py +138 -0
- TreeSAK/ConcateMSA.py +111 -0
- TreeSAK/ConvertMSA.py +135 -0
- TreeSAK/Dir.rb +82 -0
- TreeSAK/ExtractMarkerSeq.py +263 -0
- TreeSAK/FastRoot.py +1175 -0
- TreeSAK/FastRoot_backup.py +1122 -0
- TreeSAK/FigTree.py +34 -0
- TreeSAK/GTDB_tree.py +76 -0
- TreeSAK/GeneTree.py +142 -0
- TreeSAK/KEGG_Luo17.py +807 -0
- TreeSAK/LcaToLeaves.py +66 -0
- TreeSAK/MarkerRef2Tree.py +616 -0
- TreeSAK/MarkerRef2Tree_backup.py +628 -0
- TreeSAK/MarkerSeq2Tree.py +299 -0
- TreeSAK/MarkerSeq2Tree_backup.py +259 -0
- TreeSAK/ModifyTopo.py +116 -0
- TreeSAK/Newick_tree_plotter.py +79 -0
- TreeSAK/OMA.py +170 -0
- TreeSAK/OMA2.py +212 -0
- TreeSAK/OneLineAln.py +50 -0
- TreeSAK/PB.py +155 -0
- TreeSAK/PMSF.py +115 -0
- TreeSAK/PhyloBiAssoc.R +84 -0
- TreeSAK/PhyloBiAssoc.py +167 -0
- TreeSAK/PlotMCMC.py +41 -0
- TreeSAK/PlotMcmcNode.py +152 -0
- TreeSAK/PlotMcmcNode_old.py +252 -0
- TreeSAK/RootTree.py +101 -0
- TreeSAK/RootTreeGTDB.py +371 -0
- TreeSAK/RootTreeGTDB214.py +288 -0
- TreeSAK/RootTreeGTDB220.py +300 -0
- TreeSAK/SequentialDating.py +16 -0
- TreeSAK/SingleAleHGT.py +157 -0
- TreeSAK/SingleLinePhy.py +50 -0
- TreeSAK/SliceMSA.py +142 -0
- TreeSAK/SplitScore.py +21 -0
- TreeSAK/SplitScore1.py +177 -0
- TreeSAK/SplitScore1OMA.py +148 -0
- TreeSAK/SplitScore2.py +608 -0
- TreeSAK/TaxaCountStats.R +256 -0
- TreeSAK/TaxonTree.py +47 -0
- TreeSAK/TreeSAK_config.py +32 -0
- TreeSAK/VERSION +164 -0
- TreeSAK/VisHPD95.R +45 -0
- TreeSAK/VisHPD95.py +200 -0
- TreeSAK/__init__.py +0 -0
- TreeSAK/ale_parser.py +74 -0
- TreeSAK/ale_splitter.py +63 -0
- TreeSAK/alignment_pruner.pl +1471 -0
- TreeSAK/assessOG.py +45 -0
- TreeSAK/batch_itol.py +171 -0
- TreeSAK/catfasta2phy.py +140 -0
- TreeSAK/cogTree.py +185 -0
- TreeSAK/compare_trees.R +30 -0
- TreeSAK/compare_trees.py +255 -0
- TreeSAK/dating.py +264 -0
- TreeSAK/dating_ss.py +361 -0
- TreeSAK/deltall.py +82 -0
- TreeSAK/do_rrtc.rb +464 -0
- TreeSAK/fa2phy.py +42 -0
- TreeSAK/filter_rename_ar53.py +118 -0
- TreeSAK/format_leaf_name.py +70 -0
- TreeSAK/gap_stats.py +38 -0
- TreeSAK/get_SCG_tree.py +742 -0
- TreeSAK/get_arCOG_seq.py +97 -0
- TreeSAK/global_functions.py +222 -0
- TreeSAK/gnm_leaves.py +43 -0
- TreeSAK/iTOL.py +791 -0
- TreeSAK/iTOL_gene_tree.py +80 -0
- TreeSAK/itol_msa_stats.py +56 -0
- TreeSAK/keep_highest_rrtc.py +37 -0
- TreeSAK/koTree.py +194 -0
- TreeSAK/label_gene_tree_by_gnm.py +34 -0
- TreeSAK/label_tree.R +75 -0
- TreeSAK/label_tree.py +121 -0
- TreeSAK/mad.py +708 -0
- TreeSAK/mcmc2tree.py +58 -0
- TreeSAK/mcmcTC copy.py +92 -0
- TreeSAK/mcmcTC.py +104 -0
- TreeSAK/mcmctree_vs_reltime.R +44 -0
- TreeSAK/mcmctree_vs_reltime.py +252 -0
- TreeSAK/merge_pdf.py +32 -0
- TreeSAK/pRTC.py +56 -0
- TreeSAK/parse_mcmctree.py +198 -0
- TreeSAK/parse_reltime.py +141 -0
- TreeSAK/phy2fa.py +37 -0
- TreeSAK/plot_distruibution_th.py +165 -0
- TreeSAK/prep_mcmctree_ctl.py +92 -0
- TreeSAK/print_leaves.py +32 -0
- TreeSAK/pruneMSA.py +63 -0
- TreeSAK/recode.py +73 -0
- TreeSAK/remove_bias.R +112 -0
- TreeSAK/rename_leaves.py +78 -0
- TreeSAK/replace_clade.py +55 -0
- TreeSAK/root_with_out_group.py +84 -0
- TreeSAK/run_TaxaCountStats_R_s1.py +455 -0
- TreeSAK/subsample_drep_gnms.py +74 -0
- TreeSAK/subset.py +69 -0
- TreeSAK/subset_tree_stupid_old_way.py +193 -0
- TreeSAK/supertree.py +330 -0
- TreeSAK/tmp_1.py +19 -0
- TreeSAK/tmp_2.py +19 -0
- TreeSAK/tmp_3.py +120 -0
- TreeSAK/tmp_4.py +43 -0
- TreeSAK/tmp_5.py +12 -0
- TreeSAK/weighted_rand.rb +23 -0
- treesak-1.53.3.data/scripts/TreeSAK +955 -0
- treesak-1.53.3.dist-info/LICENSE +674 -0
- treesak-1.53.3.dist-info/METADATA +27 -0
- treesak-1.53.3.dist-info/RECORD +131 -0
- treesak-1.53.3.dist-info/WHEEL +5 -0
- treesak-1.53.3.dist-info/top_level.txt +1 -0
TreeSAK/assessOG.py
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import glob
|
|
3
|
+
from Bio import SeqIO
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def sep_path_basename_ext(file_in):
    """Break a file path into its components.

    Returns a 4-tuple: (file name, directory, file name without its last
    extension, last extension without the leading dot). The directory is
    reported as '.' when file_in contains no directory part.
    """
    directory, file_name = os.path.split(file_in)
    stem, dotted_ext = os.path.splitext(file_name)

    # a bare file name lives in the current directory
    directory = '.' if directory == '' else directory

    return file_name, directory, stem, dotted_ext[1:]
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def get_gnm_og_cov(og_dir, og_ext, og_cov_txt):
    """Report, for every genome, the percentage of orthologous groups it occurs in.

    Every file matching og_dir/*.og_ext is read as FASTA and treated as one
    orthologous group (OG) named after the file (last extension removed).
    The genome id of a sequence is its id with the part after the last
    underscore removed.

    Args:
        og_dir: folder holding one FASTA file per OG.
        og_ext: extension (without dot) of the OG files.
        og_cov_txt: output file; one "genome<TAB>coverage" line per genome,
            sorted by genome id, coverage rounded to two decimals.
    """
    og_file_list = glob.glob('%s/*.%s' % (og_dir, og_ext))

    # genome id -> set of OG ids containing at least one of its sequences
    gnm_to_og_dict = dict()
    for og_file in og_file_list:
        og_id = os.path.splitext(os.path.basename(og_file))[0]
        for each_seq in SeqIO.parse(og_file, 'fasta'):
            # everything before the last '_' ('' when the id has no '_')
            gnm_id = each_seq.id.rpartition('_')[0]
            gnm_to_og_dict.setdefault(gnm_id, set()).add(og_id)

    # the with-block closes the handle even if a write fails
    with open(og_cov_txt, 'w') as og_cov_txt_handle:
        for each_gnm in sorted(gnm_to_og_dict):
            og_cov = len(gnm_to_og_dict[each_gnm]) * 100 / len(og_file_list)
            og_cov = float("{0:.2f}".format(og_cov))
            og_cov_txt_handle.write('%s\t%s\n' % (each_gnm, og_cov))
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
# Example inputs (paths on the author's machine).
og_dir = '/Users/songweizhi/Desktop/OrthologousGroupsFasta_cov95'
og_ext = 'fa'
og_cov_txt = '/Users/songweizhi/Desktop/gnm_og_cov.txt'

# Run the example only when this file is executed as a script, so that
# importing the module does not try to read/write the hard-coded paths.
if __name__ == '__main__':
    get_gnm_og_cov(og_dir, og_ext, og_cov_txt)
|
|
45
|
+
|
TreeSAK/batch_itol.py
ADDED
|
@@ -0,0 +1,171 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import glob
|
|
3
|
+
import argparse
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from itolapi import Itol
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
batch_itol_usage = '''
|
|
9
|
+
======================================= batch_itol example commands =======================================
|
|
10
|
+
|
|
11
|
+
TreeSAK batch_itol -f -api API_key -ip batch_access_tmp -a annotation_files.txt -i input.tree -o out.pdf
|
|
12
|
+
TreeSAK batch_itol -f -api API_key -ip batch_access_tmp -a annotation_files.txt -i tree_dir -x tree -o out_pdf
|
|
13
|
+
|
|
14
|
+
Manual
|
|
15
|
+
https://github.com/albertyw/itolapi
|
|
16
|
+
http://itol.embl.de/help.cgi#batch
|
|
17
|
+
|
|
18
|
+
# An example of the parameter file is available here
|
|
19
|
+
# to be added
|
|
20
|
+
|
|
21
|
+
===========================================================================================================
|
|
22
|
+
'''
|
|
23
|
+
|
|
24
|
+
def sep_path_basename_ext(file_in):
    """Break a file path into its components.

    Returns a 4-tuple: (file name, directory, file name without its last
    extension, last extension without the leading dot). The directory is
    reported as '.' when file_in contains no directory part.
    """
    directory, file_name = os.path.split(file_in)
    stem, dotted_ext = os.path.splitext(file_name)

    # a bare file name lives in the current directory
    directory = '.' if directory == '' else directory

    return file_name, directory, stem, dotted_ext[1:]
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def itol_single_tree(tree_file, annotation_files_txt, project_name, APIkey, parameter_dict, op_plot):
    """Upload one tree (plus optional annotation files) to iTOL and export a plot.

    Args:
        tree_file: path to the tree file to upload.
        annotation_files_txt: text file listing one annotation file path per
            line, or None when no annotation file should be uploaded.
        project_name: iTOL project that receives the tree.
        APIkey: iTOL API key.
        parameter_dict: export parameter name -> value; entries override the
            defaults listed in this function.
        op_plot: output plot path; the text after its last '.' is passed to
            iTOL as the export format.

    Raises:
        RuntimeError: if the upload is reported as failed.
    """
    # the annotation list is optional, so tolerate None; skip blank lines
    annotation_file_list = []
    if annotation_files_txt is not None:
        with open(annotation_files_txt) as annotation_files_handle:
            for each_file in annotation_files_handle:
                each_file = each_file.strip()
                if each_file:
                    annotation_file_list.append(each_file)

    op_plot_ext = op_plot.split('.')[-1]

    # upload tree to iTOL
    itol_uploader = Itol()
    itol_uploader.params['projectName'] = project_name  # better to create a project with a unique name
    itol_uploader.params['APIkey'] = APIkey
    itol_uploader.params['treeName'] = tree_file
    itol_uploader.add_file(Path(tree_file))

    # upload annotation files to iTOL
    for annotation_file in annotation_file_list:
        itol_uploader.add_file(Path(annotation_file))

    # explicit check instead of assert, which is removed under "python -O"
    status = itol_uploader.upload()
    if not status:
        raise RuntimeError('Failed to upload %s to iTOL' % tree_file)

    # make every uploaded dataset visible: '0', '0,1', '0,1,2', ...
    datasets_visible_str = ','.join(str(i) for i in range(len(annotation_file_list)))

    # Export parameters and their defaults; the full list of options is at
    # https://itol.embl.de/help.cgi#batchExp
    export_defaults = (
        ('display_mode', '1'),
        ('vertical_shift_factor', '1'),
        ('horizontal_scale_factor', '0.9'),
        # range; range_mode: 0=off, 1=cover labels only, 2=cover full clades
        ('range_mode', '2'),
        ('include_ranges_legend', '0'),
        # label (current_font_size is left at the iTOL default)
        ('current_font_name', 'Courier'),
        ('default_label_color', '#000000'),
        # branch
        ('line_width', '2'),
        ('dashed_lines', '1'),
        ('default_branch_color', '#000000'),
        # bootstrap; bootstrap_type: 1=Symbol, 2=Text label, 3=Branch color, 4=Branch width
        ('metadata_source', 'bootstrap'),
        ('bootstrap_display', '1'),
        ('bootstrap_type', '2'),
        ('bootstrap_label_size', '15'),
        ('bootstrap_label_percent_factor', '10'),
    )

    itol_exporter = itol_uploader.get_itol_export()
    itol_exporter.set_export_param_value('internal_scale', parameter_dict.get('internal_scale', '0'))
    itol_exporter.set_export_param_value('datasets_visible', datasets_visible_str)
    for para_id, para_default in export_defaults:
        itol_exporter.set_export_param_value(para_id, parameter_dict.get(para_id, para_default))

    # write out
    itol_exporter.set_export_param_value('format', op_plot_ext)
    itol_exporter.export(op_plot)
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
def batch_itol(args):
    """Plot one tree, or every tree in a folder, with iTOL.

    Args:
        args: dict of command-line options with the keys
            'i' (tree file or folder), 'x' (tree file extension, folder mode),
            'a' (txt file listing annotation files), 'o' (output plot, or
            output folder in folder mode), 'f' (force overwrite),
            'api' (iTOL API key), 'ip' (iTOL project name) and
            'para' (optional parameter file).

    The parameter file holds one "name<TAB>value" pair per line. Lines starting
    with '#' are comments, and a '#' after the first character of a value starts
    a trailing comment, so colour values such as '#000000' are kept.

    Prints a message and exits when the input, the parameter file or the
    output location is unusable.
    """
    import shutil  # only needed here, for removing an existing output folder

    tree_file_dir        = args['i']
    tree_file_ext        = args['x']
    annotation_files_txt = args['a']
    op_file_dir          = args['o']
    force_overwrite      = args['f']
    API_key              = args['api']
    project_name         = args['ip']
    para_txt             = args['para']

    para_dict = dict()
    if para_txt is not None:
        # check the parameter file itself, not the tree input
        if os.path.isfile(para_txt) is False:
            print('The specified parameter file does not exist, program exited!')
            exit()

        with open(para_txt) as para_txt_handle:
            for each_line in para_txt_handle:
                each_line = each_line.strip()
                if (not each_line) or each_line.startswith('#'):
                    continue

                each_line_split = each_line.split('\t')
                if len(each_line_split) < 2:
                    print('Ignored malformed line in parameter file: %s' % each_line)
                    continue

                para_id = each_line_split[0].strip()
                para_value = each_line_split[1].strip()

                # search from index 1 so a leading '#' (hex colour) is not
                # mistaken for the start of a comment
                comment_pos = para_value.find('#', 1)
                if comment_pos != -1:
                    para_value = para_value[:comment_pos].strip()

                para_dict[para_id] = para_value

    if os.path.isfile(tree_file_dir) is True:
        itol_single_tree(tree_file_dir, annotation_files_txt, project_name, API_key, para_dict, op_file_dir)

    elif os.path.isdir(tree_file_dir) is True:

        if tree_file_ext is None:
            print('please provide file extension with -x, program exited!')
            exit()

        file_re = '%s/*.%s' % (tree_file_dir, tree_file_ext)
        file_list = glob.glob(file_re)

        if len(file_list) == 0:
            print('no file found in %s, please check file extension, program exited!' % tree_file_dir)
            exit()

        # create output folder
        if os.path.isdir(op_file_dir) is True:
            if force_overwrite is True:
                shutil.rmtree(op_file_dir)
            else:
                print('Output folder detected, program exited!')
                exit()
        os.mkdir(op_file_dir)

        for each_file in file_list:
            f_base = os.path.splitext(os.path.basename(each_file))[0]
            op_pdf = '%s/%s.pdf' % (op_file_dir, f_base)
            itol_single_tree(each_file, annotation_files_txt, project_name, API_key, para_dict, op_pdf)
    else:
        print('please provide input file with -i, program exited!')
        exit()
|
|
157
|
+
|
|
158
|
+
|
|
159
|
+
if __name__ == '__main__':

    # command-line options, in the order they appear in the help message
    cli_options = (
        ('-i',    dict(required=True, help='input tree file or folder')),
        ('-x',    dict(required=False, default=None, help='file extension')),
        ('-o',    dict(required=True, help='output file or folder')),
        ('-a',    dict(required=False, default=None, help='a txt file contain absolute to all annotation files')),
        ('-para', dict(required=False, default=None, help='parameter file')),
        ('-api',  dict(required=True, help='iTOL API key')),
        ('-ip',   dict(required=False, default='batch_access_tmp', help='iTOL project name, default: batch_access_tmp')),
        ('-f',    dict(required=False, action="store_true", help='force overwrite')),
    )

    batch_itol_parser = argparse.ArgumentParser(usage=batch_itol_usage)
    for option_flag, option_settings in cli_options:
        batch_itol_parser.add_argument(option_flag, **option_settings)

    # hand the parsed options over as a plain dict
    batch_itol(vars(batch_itol_parser.parse_args()))
|
TreeSAK/catfasta2phy.py
ADDED
|
@@ -0,0 +1,140 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import glob
|
|
3
|
+
from Bio import SeqIO
|
|
4
|
+
from Bio import AlignIO
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def catfasta2phy(msa_dir, msa_ext, concatenated_msa_phy, partition_file):
    """Concatenate the alignments in a folder and write a partition file.

    Every file matching msa_dir/*.msa_ext is read as a FASTA alignment.
    Alignments are concatenated in sorted file-name order; a sequence id
    missing from an alignment is filled with gaps ('-') of that alignment's
    length.

    Args:
        msa_dir: folder holding the individual alignments.
        msa_ext: extension (without dot) of the alignment files.
        concatenated_msa_phy: output alignment in relaxed PHYLIP format; a
            FASTA copy is written to "<concatenated_msa_phy>.fasta".
        partition_file: output file with one "file name = start-end" line per
            alignment (1-based, inclusive columns).

    Prints a message and exits if an alignment is empty or holds sequences
    of different lengths.
    """
    concatenated_msa_fasta = '%s.fasta' % concatenated_msa_phy
    msa_file_re = '%s/*.%s' % (msa_dir, msa_ext)
    msa_file_list_sorted = sorted(os.path.basename(file_name) for file_name in glob.glob(msa_file_re))

    # Read every alignment once. Dicts are keyed by file name, which is
    # unique within the folder.
    complete_gnm_set = set()
    msa_seq_dict = dict()  # file name -> {sequence id: aligned sequence}
    msa_len_dict = dict()  # file name -> alignment length
    for each_msa_file in msa_file_list_sorted:
        pwd_current_msa = '%s/%s' % (msa_dir, each_msa_file)
        current_msa_seq_dict = dict()
        current_msa_len_set = set()
        for each_seq in SeqIO.parse(pwd_current_msa, 'fasta'):
            complete_gnm_set.add(each_seq.id)
            current_msa_seq_dict[each_seq.id] = str(each_seq.seq)
            current_msa_len_set.add(len(each_seq.seq))

        if len(current_msa_len_set) == 0:
            print('No sequence was found in %s, program exited!' % each_msa_file)
            exit()

        if len(current_msa_len_set) != 1:
            print('Sequences with different length were found in %s, program exited!' % each_msa_file)
            exit()

        msa_seq_dict[each_msa_file] = current_msa_seq_dict
        msa_len_dict[each_msa_file] = next(iter(current_msa_len_set))

    complete_gnm_list_sorted = sorted(complete_gnm_set)

    # write out concatenated msa
    with open(concatenated_msa_fasta, 'w') as concatenated_msa_handle:
        for each_gnm in complete_gnm_list_sorted:
            # join once per genome instead of growing a string with +=
            concatenated_seq = ''.join(
                msa_seq_dict[each_m].get(each_gnm, msa_len_dict[each_m] * '-')
                for each_m in msa_file_list_sorted)
            concatenated_msa_handle.write('>%s\n' % each_gnm)
            concatenated_msa_handle.write('%s\n' % concatenated_seq)

    # write out partition file
    end_pos = 0
    with open(partition_file, 'w') as partition_file_handle:
        for each_m in msa_file_list_sorted:
            current_m_len = msa_len_dict[each_m]
            partition_file_handle.write('%s = %s-%s\n' % (each_m, (end_pos + 1), (end_pos + current_m_len)))
            end_pos += current_m_len

    # convert msa in fasta to phy
    AlignIO.convert(concatenated_msa_fasta, 'fasta', concatenated_msa_phy, 'phylip-relaxed')
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
msa_dir = '/Users/songweizhi/Desktop/s06_identified_marker_aln_trimmed'
|
|
72
|
+
msa_ext = 'aln'
|
|
73
|
+
concatenated_msa_phy = '/Users/songweizhi/Desktop/s06_identified_marker_aln_trimmed_concatenated.phy'
|
|
74
|
+
partition_file = '/Users/songweizhi/Desktop/s06_identified_marker_aln_trimmed_concatenated_partition.txt'
|
|
75
|
+
# catfasta2phy(msa_dir, msa_ext, concatenated_msa_phy, partition_file)
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
msa_file = '/Users/songweizhi/Desktop/PA_75_DeltaLL_75_concatenated.phy'
|
|
80
|
+
msa_file_subset = '/Users/songweizhi/Desktop/PA_75_DeltaLL_75_concatenated_subset.phy'
|
|
81
|
+
|
|
82
|
+
from Bio import AlignIO
|
|
83
|
+
|
|
84
|
+
def slice_msa_by_col(msa_in, range_str, msa_out):
    """Extract a column range from a relaxed PHYLIP alignment.

    range_str is "start-end" with 1-based, inclusive columns; the selected
    columns are written to msa_out in relaxed PHYLIP format via AlignIO.
    """
    full_aln = AlignIO.read(msa_in, 'phylip-relaxed')

    # 1-based inclusive range -> 0-based half-open slice
    range_parts = range_str.split('-')
    col_from = int(range_parts[0]) - 1
    col_to = int(range_parts[1])

    AlignIO.write(full_aln[:, col_from:col_to], msa_out, 'phylip-relaxed')
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
def slice_msa_by_col_manual(msa_in, range_str, msa_out):
    """Extract a column range from a relaxed PHYLIP alignment, writing the file by hand.

    range_str is "start-end" with 1-based, inclusive columns. The output has
    a "<number of sequences> <alignment length>" header followed by one line
    per sequence: the id padded with spaces to the longest id plus two, then
    the sequence. The longest id length and every padded id are also printed.
    """
    full_aln = AlignIO.read(msa_in, 'phylip-relaxed')

    # 1-based inclusive range -> 0-based half-open slice
    range_parts = range_str.split('-')
    col_from = int(range_parts[0]) - 1
    col_to = int(range_parts[1])
    sliced_aln = full_aln[:, col_from:col_to]

    # longest sequence id decides the width of the id column
    longest_id = 0
    for record in sliced_aln:
        longest_id = max(longest_id, len(record.id))
    print(longest_id)

    id_column_width = longest_id + 2
    with open(msa_out, 'w') as out_handle:
        out_handle.write('%s %s\n' % (len(sliced_aln), sliced_aln.get_alignment_length()))
        for record in sliced_aln:
            padded_id = record.id.ljust(id_column_width)
            print(padded_id)
            out_handle.write('%s%s\n' % (padded_id, str(record.seq)))
|
|
118
|
+
|
|
119
|
+
|
|
120
|
+
# column ranges (1-based, inclusive) used to cut the example alignment
slice_range = ['1-500', '501-1000', '1001-1500', '1501-2000', '2001-2500', '2501-3000', '3001-3500', '3501-4000', '4001-4500', '4501-4879']

# Slice the example alignment only when executed as a script: the paths are
# hard-coded to the author's machine, and running this at import time would
# abort the import before the definitions further down are reached.
if __name__ == '__main__':
    for each_range in slice_range:
        pwd_msa_op = '/Users/songweizhi/Desktop/%s.phy' % each_range
        slice_msa_by_col_manual(msa_file, each_range, pwd_msa_op)
|
|
125
|
+
|
|
126
|
+
|
|
127
|
+
def fa2phy(fasta_in, phy_out):
    """Convert a FASTA alignment into a hand-written PHYLIP-style file.

    The output has a "<number of sequences> <alignment length>" header
    followed by one line per sequence: the id padded with spaces to the
    longest id plus two, then the sequence.
    """
    aln = AlignIO.read(fasta_in, 'fasta')

    # longest sequence id decides the width of the id column
    longest_id = 0
    for record in aln:
        longest_id = max(longest_id, len(record.id))
    id_column_width = longest_id + 2

    with open(phy_out, 'w') as phy_handle:
        phy_handle.write('%s %s\n' % (len(aln), aln.get_alignment_length()))
        for record in aln:
            phy_handle.write('%s%s\n' % (record.id.ljust(id_column_width), str(record.seq)))
|
TreeSAK/cogTree.py
ADDED
|
@@ -0,0 +1,185 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import glob
|
|
3
|
+
import argparse
|
|
4
|
+
from Bio import SeqIO
|
|
5
|
+
import multiprocessing as mp
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
cogTree_usage = '''
|
|
9
|
+
================================ cogTree example commands ================================
|
|
10
|
+
|
|
11
|
+
TreeSAK cogTree -i combined.faa -cog arCOG_wd -o op_dir -bmge -t 12 -f -fun arCOG_id.txt
|
|
12
|
+
TreeSAK cogTree -i combined.faa -cog arCOG_wd -o op_dir -bmge -t 12 -f -fun arCOG00724
|
|
13
|
+
TreeSAK cogTree -i combined.faa -cog arCOG_wd -o op_dir -bmge -t 12 -f -fun arCOG00724,arCOG02271
|
|
14
|
+
|
|
15
|
+
==========================================================================================
|
|
16
|
+
'''
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def select_seq(seq_file, seq_id_set, output_file):
    """Copy the selected records of a FASTA file into a new file.

    Args:
        seq_file: input FASTA file.
        seq_id_set: ids of the records to keep.
        output_file: output FASTA file, written one line per sequence
            ('fasta-2line'), in the order the records occur in seq_file.
    """
    # the with-block closes the output even if parsing or writing fails
    with open(output_file, 'w') as output_file_handle:
        for seq_record in SeqIO.parse(seq_file, 'fasta'):
            if seq_record.id in seq_id_set:
                SeqIO.write(seq_record, output_file_handle, 'fasta-2line')
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def cogTree(args):
    """Build one gene tree per function (COG) of interest.

    For every requested function with annotated genes, the sequences are
    extracted from the combined faa file, aligned with mafft-einsi, trimmed
    with trimal (or BMGE) and passed to iqtree2. The commands are also
    written to cmd_1_mafft.txt, cmd_2_trim.txt and cmd_3_tree.txt in the
    output folder.

    Args:
        args: dict of command-line options with the keys
            'i' (combined faa file), 'cog' (COG annotation folder or None),
            'fun' (function ids: a file with one id in the first column per
            line, a single id, or a comma-separated list of ids),
            'o' (output folder), 'bmge' (trim with BMGE instead of trimal),
            'bmge_m' (BMGE model), 'bmge_esc' (BMGE entropy score cutoff),
            'iqtree_m' (iqtree model), 'f' (force overwrite) and
            't' (number of threads).

    Prints a message and exits when the output folder exists without 'f',
    or when no COG annotation file is found.
    """
    import shutil  # only needed here, for removing an existing output folder

    combined_faa         = args['i']
    cog_annotation_wd    = args['cog']
    interested_fun_txt   = args['fun']
    op_dir               = args['o']
    trim_with_bmge       = args['bmge']
    trim_model           = args['bmge_m']
    entropy_score_cutoff = args['bmge_esc']
    iqtree_model         = args['iqtree_m']
    force_overwrite      = args['f']
    num_of_threads       = args['t']

    # BMGE.jar is expected next to this file
    current_file_path = os.path.dirname(os.path.realpath(__file__))
    pwd_bmge_jar = '%s/BMGE.jar' % current_file_path

    # always a set, whichever way the ids were provided
    interested_fun_set = set()
    if os.path.isfile(interested_fun_txt) is False:
        interested_fun_set = {each_fun for each_fun in interested_fun_txt.split(',') if each_fun}
    else:
        with open(interested_fun_txt) as interested_fun_handle:
            for each_fun in interested_fun_handle:
                each_fun_split = each_fun.strip().split()
                if each_fun_split:  # ignore blank lines
                    interested_fun_set.add(each_fun_split[0])

    ################################################################################

    faa_dir         = '%s/dir_1_faa'         % op_dir
    aln_dir         = '%s/dir_2_msa'         % op_dir
    trimmed_aln_dir = '%s/dir_3_trimmed_msa' % op_dir
    tree_dir        = '%s/dir_4_tree'        % op_dir
    cmd_1_mafft_txt = '%s/cmd_1_mafft.txt'   % op_dir
    cmd_2_trim_txt  = '%s/cmd_2_trim.txt'    % op_dir
    cmd_3_tree_txt  = '%s/cmd_3_tree.txt'    % op_dir

    ################################################################################

    # create output folder
    if os.path.isdir(op_dir) is True:
        if force_overwrite is True:
            shutil.rmtree(op_dir)
        else:
            print('%s exist, program exited!' % op_dir)
            exit()

    os.mkdir(op_dir)
    os.mkdir(faa_dir)
    os.mkdir(aln_dir)
    os.mkdir(trimmed_aln_dir)
    os.mkdir(tree_dir)

    ################################################################################

    # function id -> set of gene ids annotated with it
    fun_to_gene_dict = dict()
    if cog_annotation_wd is not None:

        print('Reading in COG annotation results')
        file_re = '%s/*COG_wd/*_query_to_cog.txt' % (cog_annotation_wd)
        file_list = glob.glob(file_re)
        if len(file_list) == 0:
            print('COG annotation file not detected, program exited!')
            exit()

        for each_file in file_list:
            with open(each_file) as each_file_handle:
                next(each_file_handle, None)  # the first line is skipped
                for each_line in each_file_handle:
                    each_line_split = each_line.strip().split('\t')
                    if len(each_line_split) == 4:
                        gene_id = each_line_split[0]
                        cog_id = each_line_split[1]
                        if cog_id in interested_fun_set:
                            fun_to_gene_dict.setdefault(cog_id, set()).add(gene_id)

    cmd_list_mafft = []
    cmd_list_trim = []
    cmd_list_tree = []
    for each_fun in sorted(fun_to_gene_dict):

        # define file name
        fun_faa               = '%s/%s.faa' % (faa_dir, each_fun)
        current_gene_tree_dir = '%s/%s'     % (tree_dir, each_fun)
        fun_aln               = '%s/%s.aln' % (aln_dir, each_fun)
        fun_aln_trimmed       = '%s/%s_trimal.aln' % (trimmed_aln_dir, each_fun)
        if trim_with_bmge is True:
            fun_aln_trimmed   = '%s/%s_bmge.aln' % (trimmed_aln_dir, each_fun)

        # extract sequences
        select_seq(combined_faa, fun_to_gene_dict[each_fun], fun_faa)

        os.mkdir(current_gene_tree_dir)

        # prepare commands; mafft gets one thread per command because the
        # commands themselves are run in parallel below
        mafft_cmd = 'mafft-einsi --thread %s --quiet %s > %s' % (1, fun_faa, fun_aln)
        trim_cmd = 'trimal -in %s -out %s -automated1' % (fun_aln, fun_aln_trimmed)
        if trim_with_bmge is True:
            trim_cmd = 'java -jar %s -i %s -m %s -t AA -h %s -of %s' % (pwd_bmge_jar, fun_aln, trim_model, entropy_score_cutoff, fun_aln_trimmed)
        infer_tree_cmd = 'iqtree2 -s %s --seqtype AA -m %s -B 1000 --wbtl --bnni --prefix %s/%s -T %s --quiet' % (fun_aln_trimmed, iqtree_model, current_gene_tree_dir, each_fun, num_of_threads)

        # add commands to list
        cmd_list_mafft.append(mafft_cmd)
        cmd_list_trim.append(trim_cmd)
        cmd_list_tree.append(infer_tree_cmd)

    # write out commands
    for cmd_txt, cmd_list in ((cmd_1_mafft_txt, cmd_list_mafft),
                              (cmd_2_trim_txt, cmd_list_trim),
                              (cmd_3_tree_txt, cmd_list_tree)):
        with open(cmd_txt, 'w') as cmd_txt_handle:
            for each_cmd in cmd_list:
                cmd_txt_handle.write(each_cmd + '\n')

    # run mafft commands
    print('Running mafft with %s cores for %s commands' % (num_of_threads, len(cmd_list_mafft)))
    pool = mp.Pool(processes=num_of_threads)
    pool.map(os.system, cmd_list_mafft)
    pool.close()
    pool.join()

    # run trim commands
    print('Trimming with %s cores for %s commands' % (num_of_threads, len(cmd_list_trim)))
    pool = mp.Pool(processes=num_of_threads)
    pool.map(os.system, cmd_list_trim)
    pool.close()
    pool.join()

    # run iqtree commands one after another; each uses num_of_threads threads
    print('Running iqtree with %s cores' % num_of_threads)
    for each_iqtree_cmd in sorted(cmd_list_tree):
        print(each_iqtree_cmd)
        os.system(each_iqtree_cmd)
|
|
169
|
+
|
|
170
|
+
|
|
171
|
+
if __name__ == '__main__':

    # command-line options, in the order they appear in the help message
    cli_options = (
        ('-i',        dict(required=True, help='orthologous gene sequence')),
        ('-fun',      dict(required=True, help='interested functions')),
        ('-cog',      dict(required=False, default=None, help='COG annotation results')),
        ('-o',        dict(required=True, help='output directory')),
        ('-bmge',     dict(required=False, action="store_true", help='trim with BMGE, default is trimal')),
        ('-bmge_m',   dict(required=False, default='BLOSUM30', help='trim model, default: BLOSUM30')),
        ('-bmge_esc', dict(required=False, default='0.55', help='entropy score cutoff, default: 0.55')),
        ('-iqtree_m', dict(required=False, default='LG+G+I', help='iqtree_model, default: LG+G+I')),
        ('-f',        dict(required=False, action="store_true", help='force overwrite')),
        ('-t',        dict(required=False, type=int, default=1, help='num of threads, default: 1')),
    )

    cogTree_parser = argparse.ArgumentParser()
    for option_flag, option_settings in cli_options:
        cogTree_parser.add_argument(option_flag, **option_settings)

    # hand the parsed options over as a plain dict
    cogTree(vars(cogTree_parser.parse_args()))
|
TreeSAK/compare_trees.R
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
|
|
2
|
+
# check.packages function: install and load multiple R packages.
|
|
3
|
+
# Check to see if packages are installed. Install them if they are not, then load them into the R session.
|
|
4
|
+
check.packages <- function(pkg){
    # names of the requested packages that are not installed yet
    installed_names <- installed.packages()[, "Package"]
    missing_pkg <- pkg[!is.element(pkg, installed_names)]

    if (length(missing_pkg) > 0) {
        install.packages(missing_pkg, dependencies = TRUE)
    }

    # load every requested package; returns one logical per package
    sapply(pkg, require, character.only = TRUE)
}
|
|
9
|
+
|
|
10
|
+
# install packages if not installed
|
|
11
|
+
packages <- c("optparse", "ape", "vegan")
invisible(suppressMessages(check.packages(packages)))

option_list = list(
    make_option(c("-a", "--treeo"), type="character", default=NULL, help="the first tree"),
    make_option(c("-b", "--treet"), type="character", default=NULL, help="the second tree"))

opt_parser = OptionParser(option_list=option_list)
opt = parse_args(opt_parser)

# both trees are mandatory: show the help and stop with a clear message
if (is.null(opt$treeo) || is.null(opt$treet)) {
    print_help(opt_parser)
    stop("Please provide the two trees with -a and -b", call. = FALSE)
}

TREE1 = read.tree(opt$treeo)
TREE2 = read.tree(opt$treet)

# pairwise leaf-to-leaf distances, rows and columns sorted by leaf name so
# that the two matrices line up
D1 = cophenetic(TREE1)
D1 = D1[order(row.names(D1)), order(row.names(D1))]
D2 = cophenetic(TREE2)
D2 = D2[order(row.names(D2)), order(row.names(D2))]

# the comparison pairs cells by position, so it is only meaningful when both
# trees carry exactly the same leaves
if (!identical(row.names(D1), row.names(D2))) {
    stop("The two trees must have identical leaf names", call. = FALSE)
}

mantel(xdis = D1, ydis = D2, permutations = 999)
|
|
30
|
+
|