treesak 1.51.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of treesak might be problematic. Click here for more details.
- TreeSAK/ALE.py +63 -0
- TreeSAK/ALE1.py +268 -0
- TreeSAK/ALE2.py +168 -0
- TreeSAK/ALE2RTC.py +30 -0
- TreeSAK/ALE3.py +205 -0
- TreeSAK/ALE4.py +636 -0
- TreeSAK/ALE5.py +210 -0
- TreeSAK/ALE6.py +401 -0
- TreeSAK/ALE7.py +126 -0
- TreeSAK/ALE_backup.py +1081 -0
- TreeSAK/AssessCVG.py +128 -0
- TreeSAK/AssessMarker.py +306 -0
- TreeSAK/AssessMarkerDeltaLL.py +257 -0
- TreeSAK/AssessMarkerPA.py +317 -0
- TreeSAK/AssessPB.py +130 -0
- TreeSAK/BMGE.jar +0 -0
- TreeSAK/BMGE.py +49 -0
- TreeSAK/CompareMCMC.py +138 -0
- TreeSAK/ConcateMSA.py +111 -0
- TreeSAK/ConvertMSA.py +135 -0
- TreeSAK/Dir.rb +82 -0
- TreeSAK/ExtractMarkerSeq.py +263 -0
- TreeSAK/FastRoot.py +1175 -0
- TreeSAK/FastRoot_backup.py +1122 -0
- TreeSAK/FigTree.py +34 -0
- TreeSAK/GTDB_tree.py +76 -0
- TreeSAK/GeneTree.py +142 -0
- TreeSAK/KEGG_Luo17.py +807 -0
- TreeSAK/LcaToLeaves.py +66 -0
- TreeSAK/MarkerRef2Tree.py +616 -0
- TreeSAK/MarkerRef2Tree_backup.py +628 -0
- TreeSAK/MarkerSeq2Tree.py +290 -0
- TreeSAK/MarkerSeq2Tree_backup.py +259 -0
- TreeSAK/ModifyTopo.py +116 -0
- TreeSAK/Newick_tree_plotter.py +79 -0
- TreeSAK/OMA.py +170 -0
- TreeSAK/OMA2.py +212 -0
- TreeSAK/OneLineAln.py +50 -0
- TreeSAK/PB.py +155 -0
- TreeSAK/PMSF.py +106 -0
- TreeSAK/PhyloBiAssoc.R +84 -0
- TreeSAK/PhyloBiAssoc.py +167 -0
- TreeSAK/PlotMCMC.py +41 -0
- TreeSAK/PlotMcmcNode.py +152 -0
- TreeSAK/PlotMcmcNode_old.py +252 -0
- TreeSAK/RootTree.py +101 -0
- TreeSAK/RootTreeGTDB214.py +288 -0
- TreeSAK/RootTreeGTDB220.py +300 -0
- TreeSAK/RootTreeGTDB226.py +300 -0
- TreeSAK/SequentialDating.py +16 -0
- TreeSAK/SingleAleHGT.py +157 -0
- TreeSAK/SingleLinePhy.py +50 -0
- TreeSAK/SliceMSA.py +142 -0
- TreeSAK/SplitScore.py +19 -0
- TreeSAK/SplitScore1.py +178 -0
- TreeSAK/SplitScore1OMA.py +148 -0
- TreeSAK/SplitScore2.py +597 -0
- TreeSAK/TaxaCountStats.R +256 -0
- TreeSAK/TaxonTree.py +47 -0
- TreeSAK/TreeSAK_config.py +32 -0
- TreeSAK/VERSION +158 -0
- TreeSAK/VisHPD95.R +45 -0
- TreeSAK/VisHPD95.py +200 -0
- TreeSAK/__init__.py +0 -0
- TreeSAK/ale_parser.py +74 -0
- TreeSAK/ale_splitter.py +63 -0
- TreeSAK/alignment_pruner.pl +1471 -0
- TreeSAK/assessOG.py +45 -0
- TreeSAK/catfasta2phy.py +140 -0
- TreeSAK/cogTree.py +185 -0
- TreeSAK/compare_trees.R +30 -0
- TreeSAK/compare_trees.py +255 -0
- TreeSAK/dating.py +264 -0
- TreeSAK/dating_ss.py +361 -0
- TreeSAK/deltall.py +82 -0
- TreeSAK/do_rrtc.rb +464 -0
- TreeSAK/fa2phy.py +42 -0
- TreeSAK/format_leaf_name.py +70 -0
- TreeSAK/gap_stats.py +38 -0
- TreeSAK/get_SCG_tree.py +742 -0
- TreeSAK/get_arCOG_seq.py +97 -0
- TreeSAK/global_functions.py +222 -0
- TreeSAK/gnm_leaves.py +43 -0
- TreeSAK/iTOL.py +791 -0
- TreeSAK/iTOL_gene_tree.py +80 -0
- TreeSAK/itol_msa_stats.py +56 -0
- TreeSAK/keep_highest_rrtc.py +37 -0
- TreeSAK/koTree.py +194 -0
- TreeSAK/label_tree.R +75 -0
- TreeSAK/label_tree.py +121 -0
- TreeSAK/mad.py +708 -0
- TreeSAK/mcmc2tree.py +58 -0
- TreeSAK/mcmcTC copy.py +92 -0
- TreeSAK/mcmcTC.py +104 -0
- TreeSAK/mcmctree_vs_reltime.R +44 -0
- TreeSAK/mcmctree_vs_reltime.py +252 -0
- TreeSAK/merge_pdf.py +32 -0
- TreeSAK/pRTC.py +56 -0
- TreeSAK/parse_mcmctree.py +198 -0
- TreeSAK/parse_reltime.py +141 -0
- TreeSAK/phy2fa.py +37 -0
- TreeSAK/plot_distruibution_th.py +165 -0
- TreeSAK/prep_mcmctree_ctl.py +92 -0
- TreeSAK/print_leaves.py +32 -0
- TreeSAK/pruneMSA.py +63 -0
- TreeSAK/recode.py +73 -0
- TreeSAK/remove_bias.R +112 -0
- TreeSAK/rename_leaves.py +77 -0
- TreeSAK/replace_clade.py +55 -0
- TreeSAK/root_with_out_group.py +84 -0
- TreeSAK/run_TaxaCountStats_R_s1.py +455 -0
- TreeSAK/subsample_drep_gnms.py +74 -0
- TreeSAK/subset.py +69 -0
- TreeSAK/subset_tree_stupid_old_way.py +193 -0
- TreeSAK/supertree.py +330 -0
- TreeSAK/tmp_1.py +19 -0
- TreeSAK/tmp_2.py +19 -0
- TreeSAK/tmp_3.py +120 -0
- TreeSAK/weighted_rand.rb +23 -0
- treesak-1.51.2.data/scripts/TreeSAK +950 -0
- treesak-1.51.2.dist-info/LICENSE +674 -0
- treesak-1.51.2.dist-info/METADATA +27 -0
- treesak-1.51.2.dist-info/RECORD +125 -0
- treesak-1.51.2.dist-info/WHEEL +5 -0
- treesak-1.51.2.dist-info/top_level.txt +1 -0
TreeSAK/SingleAleHGT.py
ADDED
|
@@ -0,0 +1,157 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import argparse
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
def sep_path_basename_ext(file_in):
    """Split a file path into (directory, base name, extension).

    The directory is reported as '.' when file_in has no directory part.
    The extension is whatever os.path.splitext returns for the file name,
    i.e. it keeps its leading dot, or is '' when there is none.
    """
    directory, name_with_ext = os.path.split(file_in)
    base_name, extension = os.path.splitext(name_with_ext)
    return (directory if directory != '' else '.'), base_name, extension
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
# Help text with an example command.
# The input flag is -msa (or -faa): those are the only input keys that
# SingleAleHGT() reads from its args dict, so the former "-i" could not work.
# The iTOL API key is a placeholder so no real credential ships in the package.
SingleAleHGT_usage = '''
============================================ SingleAleHGT example commands ============================================

TreeSAK SingleAleHGT -msa concatenated.fasta -s genome.treefile -fc 0.3 -c genome_taxon.txt -color phylum_color.txt -api your_iTOL_API_key -t 9 -f -o demo_SingleAleHGT_wd

=======================================================================================================================
'''
|
|
25
|
+
|
|
26
|
+
def SingleAleHGT(args):
    """Infer a gene tree for one gene family, then run TreeSAK ALE2 and ALE4.

    Every command line is appended to <o>/log.txt and then executed through
    the shell, in this order: mafft-einsi (only when 'faa' is given),
    trimal (only when 'trim' is True), iqtree2, `TreeSAK ALE2`,
    `TreeSAK ALE4`.

    Args:
        args (dict): parsed command-line options. Keys read here:
            'faa', 'msa'  input sequence file or input alignment; exactly
                          one of the two must be non-None
            'o'           output directory
            's'           rooted species tree, forwarded to ALE2 (-s)
            'c', 'color'  forwarded to ALE4 (-c, -color)
            'fc'          forwarded to ALE4 (-fc)
            'api'         forwarded to ALE4 (-api)
            'docker'      Docker image forwarded to ALE2, or None
            'trim'        True to trim the alignment with trimal
            't'           number of threads
            'f'           True to overwrite an existing output directory

    Exits via exit() when neither or both of 'faa'/'msa' are given, or when
    the output directory already exists and 'f' is not True.
    """
    # function-scope imports keep this block self-contained
    import shlex
    import shutil

    faa_in = args['faa']
    msa_in = args['msa']
    op_dir = args['o']
    genome_tree_file_rooted = args['s']
    API_key = args['api']
    hgt_freq_cutoff = args['fc']
    ar_phylum_color_code_txt = args['color']
    genome_taxon_txt = args['c']
    force_overwrite = args['f']
    trim_msa = args['trim']
    docker_image = args['docker']
    num_threads = args['t']

    def quote(value):
        # shell-escape a value so paths with spaces or metacharacters survive
        return shlex.quote(str(value))

    ######################################## check input files #######################################

    # exactly one of -faa / -msa has to be provided
    if (faa_in is None) == (msa_in is None):
        print('Please specify either -faa or -msa, program exited!')
        exit()

    seq_in = faa_in if faa_in is not None else msa_in
    f_base = os.path.splitext(os.path.basename(seq_in))[0]

    ######################################## define file name ########################################

    ale1_op_dir = '%s/ALE1_op_dir' % op_dir
    ale2_op_dir = '%s/ALE2_op_dir' % op_dir
    ale4_op_dir = '%s/ALE4_op_dir' % op_dir
    log_txt = '%s/log.txt' % op_dir
    msa_file = '%s/%s.aln' % (ale1_op_dir, f_base)
    msa_trimmed = '%s/%s_trimmed.aln' % (ale1_op_dir, f_base)
    tree_prefix = '%s/%s' % (ale1_op_dir, f_base)

    ###################################### create output folder ######################################

    if os.path.isdir(op_dir):
        if force_overwrite is True:
            shutil.rmtree(op_dir)
        else:
            print('%s exist, program exited!' % op_dir)
            exit()
    os.mkdir(op_dir)
    os.mkdir(ale1_op_dir)

    ##################################################################################################

    # run mafft-einsi (only needed when unaligned sequences were provided)
    if faa_in is not None:
        mafft_cmd = 'mafft-einsi --thread %s --quiet %s > %s' % (num_threads, quote(faa_in), quote(msa_file))
        _run_logged_cmd(mafft_cmd, log_txt)
        msa_for_tree = msa_file
    else:
        msa_for_tree = msa_in

    # run trimal; the tree is then inferred from the trimmed alignment
    if trim_msa is True:
        trimal_cmd = 'trimal -in %s -out %s -automated1' % (quote(msa_for_tree), quote(msa_trimmed))
        _run_logged_cmd(trimal_cmd, log_txt)
        msa_for_tree = msa_trimmed

    # run iqtree2
    iqtree2_cmd = 'iqtree2 -m LG+G+I -bb 1000 --wbtl -nt %s -s %s -pre %s' % (num_threads, quote(msa_for_tree), quote(tree_prefix))
    _run_logged_cmd(iqtree2_cmd, log_txt)

    # run ALE2; -docker is only passed when an image was given, otherwise
    # the literal string "None" would be handed over as the image name
    docker_opt = ''
    if docker_image is not None:
        docker_opt = ' -docker %s' % quote(docker_image)
    ale2_cmd = 'TreeSAK ALE2 -i %s -s %s -t %s -f -runALE%s -o %s' % (quote(ale1_op_dir), quote(genome_tree_file_rooted), num_threads, docker_opt, quote(ale2_op_dir))
    _run_logged_cmd(ale2_cmd, log_txt)

    # run ALE4
    ale4_cmd = 'TreeSAK ALE4 -i1 %s -i2 %s -c %s -color %s -o %s -fc %s -f -api %s' % (quote(ale1_op_dir), quote(ale2_op_dir), quote(genome_taxon_txt), quote(ar_phylum_color_code_txt), quote(ale4_op_dir), hgt_freq_cutoff, quote(API_key))
    _run_logged_cmd(ale4_cmd, log_txt)


def _run_logged_cmd(cmd, log_txt):
    """Append cmd to the log file log_txt, then execute it with os.system."""
    with open(log_txt, 'a') as log_txt_handle:
        log_txt_handle.write(cmd + '\n')
    os.system(cmd)
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
if __name__ == '__main__':

    # (flag, add_argument keyword options); registered in exactly this order
    option_specs = [
        ('-faa', dict(required=False, default=None, help='input aa file, e.g., OMA0001.faa')),
        ('-msa', dict(required=False, default=None, help='input MSA file, e.g., OMA0001.aln')),
        ('-o', dict(required=True, help='output dir, e.g., SingleAleHGT_wd')),
        ('-s', dict(required=True, help='rooted species tree')),
        ('-c', dict(required=True, help='genome_taxon, GTDB format')),
        ('-color', dict(required=True, help='phylum color code')),
        ('-fc', dict(required=False, type=float, default=0.5, help='hgt_freq_cutoff, default: 0.5')),
        ('-mld', dict(required=False, type=int, default=5, help='donor_node_min_leaf_num, default: 5')),
        ('-mlr', dict(required=False, type=int, default=5, help='recipient_node_min_leaf_num, default: 5')),
        ('-trim', dict(required=False, action="store_true", help='trim MSA')),
        ('-docker', dict(required=False, default=None, help='Docker image, if ALE was installed with Docker, e.g., gregmich/alesuite_new')),
        ('-itol', dict(required=False, default='batch_access_tmp', help='iTOL project_name, default: batch_access_tmp')),
        ('-api', dict(required=True, help='iTOL API key')),
        ('-t', dict(required=False, type=int, default=6, help='number of threads, default: 6')),
        ('-f', dict(required=False, action="store_true", help='force overwrite')),
    ]

    SingleAleHGT_parser = argparse.ArgumentParser()
    for option_flag, option_kwargs in option_specs:
        SingleAleHGT_parser.add_argument(option_flag, **option_kwargs)

    # hand the parsed options over as a plain dict
    args = vars(SingleAleHGT_parser.parse_args())
    SingleAleHGT(args)
|
|
135
|
+
|
|
136
|
+
|
|
137
|
+
'''
|
|
138
|
+
|
|
139
|
+
cd /Users/songweizhi/Desktop/DateArTree/01_HGT_ALE_with_OMA/ALE1_op_dir_OMA05484_OMA07484_trimmed
|
|
140
|
+
trimal -in ../ALE1_op_dir_OMA05484_OMA07484/concatenated.fasta -out concatenated.fasta -automated1
|
|
141
|
+
iqtree2 -m LG+G+I -bb 1000 --wbtl -nt 10 -s concatenated.fasta -pre OMA05484_OMA07484
|
|
142
|
+
cd /Users/songweizhi/Desktop/DateArTree/01_HGT_ALE_with_OMA
|
|
143
|
+
TreeSAK ALE2 -i ALE1_op_dir_OMA05484_OMA07484_trimmed -s genome_tree.newick -t 10 -f -runALE -docker gregmich/alesuite_new -o ALE2_op_dir_OMA05484_OMA07484_trimmed
|
|
144
|
+
TreeSAK ALE4 -i1 ALE1_op_dir_OMA05484_OMA07484_trimmed -i2 ALE2_op_dir_OMA05484_OMA07484_trimmed -c genome_taxon.txt -color phylum_color.txt -o ALE4_op_dir_OMA05484_OMA07484_trimmed_0.01 -fc 0.01 -f -api S1kZZuDHc0d5M7J5vLnUNQ
|
|
145
|
+
|
|
146
|
+
cd /Users/songweizhi/Desktop/DateArTree/01_HGT_ALE_with_OMA
|
|
147
|
+
/usr/local/bin/python3.7 /Users/songweizhi/PycharmProjects/TreeSAK/TreeSAK/SingleAleHGT.py -msa ALE1_op_dir_OMA05484_OMA07484_trimmed/concatenated.fasta -s genome_tree_rooted_noEU.treefile -fc 0.3 -c genome_taxon.txt -color phylum_color.txt -api S1kZZuDHc0d5M7J5vLnUNQ -t 9 -f -o demo_SingleAleHGT_wd -trim
|
|
148
|
+
|
|
149
|
+
cd /Users/songweizhi/Desktop/DateArTree/01_HGT_ALE_with_OMA/demo_SingleAleHGT_wd
|
|
150
|
+
TreeSAK ALE2 -i ALE1_op_dir -s ../genome_tree.newick -t 10 -f -runALE -docker gregmich/alesuite_new -o ALE2_op_dir
|
|
151
|
+
TreeSAK ALE4 -i1 ALE1_op_dir_OMA05484_OMA07484_trimmed -i2 ALE2_op_dir_OMA05484_OMA07484_trimmed -c genome_taxon.txt -color phylum_color.txt -o ALE4_op_dir_OMA05484_OMA07484_trimmed_0.01 -fc 0.01 -f -api S1kZZuDHc0d5M7J5vLnUNQ
|
|
152
|
+
|
|
153
|
+
/usr/local/bin/python3.7 /Users/songweizhi/PycharmProjects/TreeSAK/TreeSAK/SingleAleHGT.py -o demo_SingleAleHGT_wd -msa ALE1_op_dir/OMA15312.aln -s genome_tree_rooted_noEU.treefile -fc 0.3 -c genome_taxon.txt -color phylum_color.txt -api S1kZZuDHc0d5M7J5vLnUNQ -t 10 -f -trim -docker gregmich/alesuite_new
|
|
154
|
+
/usr/local/bin/python3.7 /Users/songweizhi/PycharmProjects/TreeSAK/TreeSAK/SingleAleHGT.py -o OMA01402_ALE_HGT_wd -msa ALE1_op_dir/OMA01402.aln -s genome_tree_rooted_noEU.treefile -fc 0.3 -c genome_taxon.txt -color phylum_color.txt -api S1kZZuDHc0d5M7J5vLnUNQ -t 10 -f -trim -docker gregmich/alesuite_new
|
|
155
|
+
/usr/local/bin/python3.7 /Users/songweizhi/PycharmProjects/TreeSAK/TreeSAK/SingleAleHGT.py -o OMA01402_ALE_HGT_wd_no_trim -msa ALE1_op_dir/OMA01402.aln -s genome_tree_rooted_noEU.treefile -fc 0.3 -c genome_taxon.txt -color phylum_color.txt -api S1kZZuDHc0d5M7J5vLnUNQ -t 10 -f -docker gregmich/alesuite_new
|
|
156
|
+
|
|
157
|
+
'''
|
TreeSAK/SingleLinePhy.py
ADDED
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import argparse
|
|
3
|
+
from Bio import AlignIO
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
SingleLinePhy_usage = '''
|
|
7
|
+
======== SingleLinePhy example commands ========
|
|
8
|
+
|
|
9
|
+
TreeSAK SingleLinePhy -i in.phy -o out.phy
|
|
10
|
+
|
|
11
|
+
================================================
|
|
12
|
+
'''
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def SingleLinePhy(args):
    """Rewrite a relaxed-PHYLIP alignment with each sequence on one line.

    Args:
        args (dict): 'i' is the input alignment (read with AlignIO as
            'phylip-relaxed'), 'o' is the output file.

    The output starts with a "<number of sequences> <alignment length>"
    header line. Each following line holds a sequence id, padded with
    spaces to the longest id plus two, and then the full sequence.
    Exits via exit() when the input file does not exist.
    """
    phy_in = args['i']
    phy_out = args['o']

    # check input file
    if not os.path.isfile(phy_in):
        print('input file not found, program exited!')
        exit()

    alignment = AlignIO.read(phy_in, 'phylip-relaxed')

    # width of the id column: longest id plus two separating spaces
    id_column_width = max([len(record.id) for record in alignment] + [0]) + 2

    with open(phy_out, 'w') as msa_out_handle:
        msa_out_handle.write('%s %s\n' % (len(alignment), alignment.get_alignment_length()))
        for record in alignment:
            msa_out_handle.write('%s%s\n' % (record.id.ljust(id_column_width), str(record.seq)))

    print('Done!')
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
if __name__ == '__main__':

    # command-line interface: one required input path, one required output path
    parser = argparse.ArgumentParser()
    for option_flag, option_help in (('-i', 'input file'), ('-o', 'output file')):
        parser.add_argument(option_flag, required=True, help=option_help)
    args = vars(parser.parse_args())
    SingleLinePhy(args)
|
TreeSAK/SliceMSA.py
ADDED
|
@@ -0,0 +1,142 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import argparse
|
|
3
|
+
from Bio import AlignIO
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
# Help text for the SliceMSA module.
# "-100" selects columns 1-100: SliceMSA() maps a leading '-' to a start
# column of 1 and uses the number after the dash as the end column.
SliceMSA_usage = '''
========================= SliceMSA example commands =========================

TreeSAK SliceMSA -i 16S_aln.fasta -s 200-300 -o 16S_aln_200-300.fasta
TreeSAK SliceMSA -i 16S_aln.phylip -fi phylip-relaxed -s sections.txt -o SliceMSA_op -fo phylip-relaxed

# example
200-300 select columns 200-300
-100 select columns 1-100
500- select columns from 500 to the end

# Example of sections.txt (one section per line):
200-300
-100
500-

# Examples of alignment format (https://biopython.org/wiki/AlignIO):
fasta, phylip, phylip-relaxed, phylip-sequential, clustal

=============================================================================
'''
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def msa2fasta(msa_object, fasta_out):
    """Write every record of msa_object to the file fasta_out as FASTA.

    Each record contributes a '>' + id header line followed by its whole
    sequence on a single line.
    """
    with open(fasta_out, 'w') as out_handle:
        out_handle.writelines('>%s\n%s\n' % (record.id, str(record.seq)) for record in msa_object)
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def msa2phylip(msa_object, phylip_out):
    """Write msa_object to phylip_out as sequential PHYLIP, one line per record.

    The first line is "<number of records> <alignment length>". Each
    following line holds a record id, padded with spaces to the longest id
    plus two, and then the full sequence.
    """
    # width of the id column: longest id plus two separating spaces
    id_column_width = max([len(record.id) for record in msa_object] + [0]) + 2

    with open(phylip_out, 'w') as out_handle:
        out_handle.write('%s %s\n' % (len(msa_object), msa_object.get_alignment_length()))
        for record in msa_object:
            out_handle.write('%s%s\n' % (record.id.ljust(id_column_width), str(record.seq)))
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def SliceMSA(args):
    """Export one or more column ranges of a multiple sequence alignment.

    Args:
        args (dict): parsed command-line options. Keys read here:
            'i'      input alignment file
            'fi'     input format name passed to AlignIO.read
            's'      either one section ("200-300", "-100", "500-", "-")
                     or the path of a file with one section per line
            'o'      output file (single section) or output folder
                     (several sections, one file per section)
            'fo'     'phylip-relaxed' writes PHYLIP, anything else FASTA
            'force'  True to overwrite an existing output folder

    Columns are 1-based and inclusive. A missing start means column 1 and a
    missing end means the last column of the alignment.
    Exits via exit() in four cases: the input is missing, no section was
    provided, a section is malformed, or the output folder already exists
    and 'force' is not True.
    """
    # function-scope import keeps this block self-contained
    import shutil

    msa_in_file = args['i']
    aln_in_format = args['fi']
    col_to_select_txt = args['s']
    op_dir = args['o']
    aln_out_format = args['fo']
    force_overwriting = args['force']

    aln_out_ext = 'phylip' if aln_out_format == 'phylip-relaxed' else 'fasta'

    if not os.path.isfile(msa_in_file):
        print('Input MSA not found, program exited!')
        exit()

    # read in msa
    msa_in = AlignIO.read(msa_in_file, aln_in_format)
    aln_len = msa_in.get_alignment_length()

    # collect the section specs: an existing file holds one per line,
    # anything else is treated as a single spec given on the command line
    if os.path.isfile(col_to_select_txt):
        with open(col_to_select_txt) as section_txt_handle:
            # blank lines are skipped instead of crashing on int('')
            section_spec_list = [line.strip() for line in section_txt_handle if line.strip()]
    else:
        section_spec_list = [col_to_select_txt.strip()]

    if len(section_spec_list) == 0:
        print('No section found in %s, program exited!' % col_to_select_txt)
        exit()

    section_to_select_list = []
    for each_spec in section_spec_list:
        each_section = _parse_msa_section(each_spec, aln_len)
        if each_section is None:
            print('Invalid section: %s, expected e.g. 200-300, -100 or 500-, program exited!' % each_spec)
            exit()
        section_to_select_list.append(each_section)

    # check output folder (only needed when several files are written)
    if len(section_to_select_list) > 1:
        if os.path.isdir(op_dir):
            if force_overwriting is True:
                shutil.rmtree(op_dir)
            else:
                print('Output folder already exist, program exited!')
                exit()
        os.mkdir(op_dir)

    write_msa = msa2phylip if aln_out_ext == 'phylip' else msa2fasta

    # write out sections; 1-based inclusive columns map to [start - 1:end]
    if len(section_to_select_list) == 1:
        col_start, col_end = section_to_select_list[0]
        write_msa(msa_in[:, (int(col_start) - 1):int(col_end)], op_dir)
    else:
        for col_start, col_end in section_to_select_list:
            pwd_op_file = '%s/%s-%s.%s' % (op_dir, col_start, col_end, aln_out_ext)
            write_msa(msa_in[:, (int(col_start) - 1):int(col_end)], pwd_op_file)

    print('MSA subset(s) exported to %s, Done!' % op_dir)


def _parse_msa_section(section_spec, aln_len):
    """Turn "a-b", "-b", "a-" or "-" into [start, end] strings, or None if malformed."""
    if section_spec == '-':
        section = ['1', str(aln_len)]
    elif section_spec.startswith('-'):
        section = ['1', section_spec[1:]]
    elif section_spec.endswith('-'):
        section = [section_spec[:-1], str(aln_len)]
    else:
        section = section_spec.split('-')

    # both ends must be plain positive integers with start <= end
    if len(section) != 2 or not all(part.isdigit() for part in section):
        return None
    if int(section[0]) < 1 or int(section[1]) < int(section[0]):
        return None
    return section
|
|
128
|
+
|
|
129
|
+
|
|
130
|
+
if __name__ == '__main__':

    # (flag, add_argument keyword options) for SliceMSA, registered in this order
    option_specs = [
        ('-i', dict(required=True, help='input MSA in fasta format')),
        ('-fi', dict(required=False, default='fasta', help='format (NOT file extension) of input MSA, default: fasta')),
        ('-s', dict(required=True, help='columns to export, e.g. 200-300, -100, 50-')),
        ('-o', dict(required=True, help='output file or folder')),
        ('-fo', dict(required=False, default='fasta', help='format of output MSA, select from fasta and phylip-relaxed, default: fasta')),
        ('-force', dict(required=False, action="store_true", help='force overwrite existing output folder')),
    ]

    SliceMSA_parser = argparse.ArgumentParser()
    for option_flag, option_kwargs in option_specs:
        SliceMSA_parser.add_argument(option_flag, **option_kwargs)

    # hand the parsed options over as a plain dict
    args = vars(SliceMSA_parser.parse_args())
    SliceMSA(args)
|
|
142
|
+
|
TreeSAK/SplitScore.py
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
|
|
2
|
+
SplitScore_usage = '''
|
|
3
|
+
============================================= SplitScore example commands =============================================
|
|
4
|
+
|
|
5
|
+
# SplitScore modules
|
|
6
|
+
TreeSAK SplitScore1 -> Step 1: Infer gene tree
|
|
7
|
+
TreeSAK SplitScore1OMA -> Step 1: Infer gene tree (based on OMA outputs)
|
|
8
|
+
TreeSAK SplitScore2 -> Step 2: Calculate split score
|
|
9
|
+
|
|
10
|
+
# SplitScore1
|
|
11
|
+
TreeSAK SplitScore1 -i OrthologousGroups.txt -s OrthologousGroupsFasta -o step1_op_dir -t 6 -f
|
|
12
|
+
TreeSAK SplitScore1 -i OrthologousGroups.txt -s OrthologousGroupsFasta -o step1_op_dir -t 6 -f -u interested_gnm.txt
|
|
13
|
+
|
|
14
|
+
# SplitScore2
|
|
15
|
+
# Please ensure that all the commands produced in step one have been executed before proceeding to step two.
|
|
16
|
+
TreeSAK SplitScore2 -i step1_op_dir -g gnm_cluster.tsv -k gnm_taxon.txt -f -t 10 -o step_2_op_dir
|
|
17
|
+
|
|
18
|
+
=======================================================================================================================
|
|
19
|
+
'''
|
TreeSAK/SplitScore1.py
ADDED
|
@@ -0,0 +1,178 @@
|
|
|
1
|
+
from __future__ import print_function
|
|
2
|
+
import os
|
|
3
|
+
import glob
|
|
4
|
+
import argparse
|
|
5
|
+
from Bio import SeqIO
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
SplitScore1_usage = '''
|
|
9
|
+
======================== SplitScore1 example commands ========================
|
|
10
|
+
|
|
11
|
+
TreeSAK SplitScore1 -i marker_seq -x fa -o SplitScore1_op_dir -jst 9 -f
|
|
12
|
+
|
|
13
|
+
# Format of gene id
|
|
14
|
+
APA_bin56_00001
|
|
15
|
+
APA_bin56_00002
|
|
16
|
+
APA_bin56_00003
|
|
17
|
+
|
|
18
|
+
==============================================================================
|
|
19
|
+
'''
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def sep_path_basename_ext(file_in):
    """Return (directory, base name, extension) for the path file_in.

    A path without a directory part yields '.' as directory. The extension
    is the os.path.splitext suffix of the file name (leading dot included,
    '' when absent).
    """
    parent_dir, full_name = os.path.split(file_in)
    if not parent_dir:
        parent_dir = '.'
    stem, suffix = os.path.splitext(full_name)
    return parent_dir, stem, suffix
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def SplitScore1(args):
    """Select orthologous groups and export mafft/trimal/iqtree2 commands.

    For every "<name>.<x>" FASTA file in the input folder, the sequences
    from the genomes to process are copied to <o>/qualified_OGs. The genome
    id of a sequence is its id without the last '_'-separated field.
    Groups covering fewer than 'c' percent of the genomes to process are
    dropped and listed in <o>/ignored_markers.txt. For the other groups one
    mafft, one trimal and one iqtree2 command line is written to
    <o>/cmd_1_mafft.txt, <o>/cmd_2_trimal.txt and <o>/cmd_3_iqtree.txt.
    The commands are only exported, not executed.

    Args:
        args (dict): parsed command-line options. Keys read here:
            'i'    folder holding the input FASTA files
            'x'    file extension of the input FASTA files
            'u'    file with interested genome ids (one per line), or None
                   to process every genome found
            'm'    model string passed to iqtree2 (-m)
            'c'    coverage cutoff in percent
            'f'    True to overwrite an existing output directory
            'jst'  thread number written into the mafft and iqtree2 commands
            'o'    output directory

    Exits via exit() in five cases: 'u' is given but not found, no input
    file matches, a group holds several genes from one genome, no genome is
    left to process, or the output directory already exists and 'f' is not
    True.
    """
    # function-scope import keeps this block self-contained
    import shutil

    oma_op_fasta = args['i']
    fasta_file_ext = args['x']
    interested_gnm_txt = args['u']
    iqtree_model = args['m']
    cov_cutoff = args['c']
    force_overwrite = args['f']
    num_of_js_threads = args['jst']
    op_dir = args['o']

    ################################################################################

    # read in interested genomes
    interested_gnm_set = set()
    if interested_gnm_txt is not None:
        if not os.path.isfile(interested_gnm_txt):
            print('%s not found, program exited' % interested_gnm_txt)
            exit()
        with open(interested_gnm_txt) as interested_gnm_txt_handle:
            # blank lines are skipped so that '' is never treated as a genome id
            interested_gnm_set = {line.strip() for line in interested_gnm_txt_handle if line.strip()}

    ################################################################################

    fa_file_re = '%s/*.%s' % (oma_op_fasta, fasta_file_ext)
    fa_file_list = glob.glob(fa_file_re)
    if len(fa_file_list) == 0:
        print('No file found in %s, program exited!' % oma_op_fasta)
        exit()

    og_to_gene_dict = dict()
    for each_fa in fa_file_list:
        _, f_base, _ = sep_path_basename_ext(each_fa)
        og_to_gene_dict[f_base] = {each_seq.id for each_seq in SeqIO.parse(each_fa, 'fasta')}

    ################################################################################

    gnm_to_process = set()
    for each_og in sorted(og_to_gene_dict):
        gene_set = og_to_gene_dict[each_og]
        gnm_set = {'_'.join(each_gene.split('_')[:-1]) for each_gene in gene_set}

        # every genome may contribute at most one gene to a group
        if len(gene_set) != len(gnm_set):
            print('%s contains more than one gene from the same genome, program exited!' % each_og)
            exit()

        if interested_gnm_txt is None:
            gnm_to_process.update(gnm_set)
        else:
            gnm_to_process.update(gnm_set & interested_gnm_set)

    # checked before the output folder is touched: the coverage below divides
    # by the number of genomes to process
    if len(gnm_to_process) == 0:
        print('No genome to process, program exited!')
        exit()

    ################################################################################

    # define file name
    qualified_og_dir = '%s/qualified_OGs' % op_dir
    cmd_1_mafft_txt = '%s/cmd_1_mafft.txt' % op_dir
    cmd_2_trimal_txt = '%s/cmd_2_trimal.txt' % op_dir
    cmd_3_iqtree_txt = '%s/cmd_3_iqtree.txt' % op_dir
    ignored_marker_txt = '%s/ignored_markers.txt' % op_dir

    # create output folder
    if os.path.isdir(op_dir):
        if force_overwrite is True:
            shutil.rmtree(op_dir)
        else:
            print('%s exist, program exited!' % op_dir)
            exit()
    os.mkdir(op_dir)
    os.mkdir(qualified_og_dir)

    ################################################################################

    ignored_og_dict = dict()
    with open(cmd_1_mafft_txt, 'w') as cmd_1_mafft_txt_handle, \
            open(cmd_2_trimal_txt, 'w') as cmd_2_trimal_txt_handle, \
            open(cmd_3_iqtree_txt, 'w') as cmd_3_iqtree_txt_handle:

        for each_og in sorted(og_to_gene_dict):
            seq_file_in = '%s/%s.%s' % (oma_op_fasta, each_og, fasta_file_ext)
            file_out_seq = '%s/%s.%s' % (qualified_og_dir, each_og, fasta_file_ext)
            file_out_aln = '%s.aln' % each_og
            file_out_aln_trimmed = '%s_trimmed.aln' % each_og

            # keep only the sequences from the genomes to process
            current_gnm_set = set()
            with open(file_out_seq, 'w') as seq_file_out_handle:
                for each_seq in SeqIO.parse(seq_file_in, 'fasta'):
                    gnm_id = '_'.join(each_seq.id.split('_')[:-1])
                    if gnm_id in gnm_to_process:
                        current_gnm_set.add(gnm_id)
                        seq_file_out_handle.write('>%s\n' % each_seq.id)
                        seq_file_out_handle.write('%s\n' % each_seq.seq)

            # 100.0 forces true division also under Python 2
            cov_value = len(current_gnm_set) * 100.0 / len(gnm_to_process)
            cov_value = float("{0:.2f}".format(cov_value))

            if cov_value < cov_cutoff:
                report_str = 'Ignored %s, contains proteins from %s (%s%s) genomes, < %s%s.' % (each_og, len(current_gnm_set), cov_value, '%', cov_cutoff, '%')
                ignored_og_dict[each_og] = report_str
                os.remove(file_out_seq)
            else:
                # align, trim and iqtree (file names are relative, without folder)
                mafft_cmd = 'mafft-einsi --thread %s --quiet %s.%s > %s' % (num_of_js_threads, each_og, fasta_file_ext, file_out_aln)
                trimal_cmd = 'trimal -in %s -out %s -automated1' % (file_out_aln, file_out_aln_trimmed)
                iqtree_cmd = 'iqtree2 -s %s --seqtype AA -m %s -B 1000 --wbtl --bnni --prefix %s -T %s --quiet' % (file_out_aln_trimmed, iqtree_model, each_og, num_of_js_threads)
                # Undinarchaeota illuminate DPANN phylogeny and the impact of gene transfer on archaeal evolution, settings: -m LG+G -bb 1000 -wbtl -bnni
                cmd_1_mafft_txt_handle.write(mafft_cmd + '\n')
                cmd_2_trimal_txt_handle.write(trimal_cmd + '\n')
                cmd_3_iqtree_txt_handle.write(iqtree_cmd + '\n')

    # report ignored markers
    if len(ignored_og_dict) > 0:
        ignored_og_list = sorted(ignored_og_dict)
        print('The following %s markers were ignored due to low genome coverage, see details in %s:' % (len(ignored_og_dict), ignored_marker_txt))
        print('\n'.join(ignored_og_list))
        with open(ignored_marker_txt, 'w') as ignored_marker_txt_handle:
            for each_ignored_marker in ignored_og_list:
                ignored_marker_txt_handle.write(ignored_og_dict[each_ignored_marker] + '\n')

    # report
    print('You will need to execute the commands exported to the following three files before moving to SplitScore2')
    print(cmd_1_mafft_txt)
    print(cmd_2_trimal_txt)
    print(cmd_3_iqtree_txt)
    print('Done!')
|
|
164
|
+
|
|
165
|
+
|
|
166
|
+
if __name__ == '__main__':

    # (flag, add_argument keyword options) for SplitScore1, registered in this order
    option_specs = [
        ('-i', dict(required=True, help='orthologous gene sequence')),
        ('-x', dict(required=True, help='fasta file extension')),
        ('-o', dict(required=True, help='output directory')),
        ('-u', dict(required=False, default=None, help='interested genomes, no file extension')),
        ('-m', dict(required=False, default='LG+G+I', help='iqtree_model, default: LG+G+I')),
        ('-c', dict(required=False, type=int, default=85, help='coverage cutoff, default: 85')),
        ('-f', dict(required=False, action="store_true", help='force overwrite')),
        ('-jst', dict(required=False, type=int, default=1, help='num of threads for iqtree2, default: 1')),
    ]

    SplitScore1_parser = argparse.ArgumentParser()
    for option_flag, option_kwargs in option_specs:
        SplitScore1_parser.add_argument(option_flag, **option_kwargs)

    # hand the parsed options over as a plain dict
    args = vars(SplitScore1_parser.parse_args())
    SplitScore1(args)
|