treesak 1.53.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- TreeSAK/ALE.py +63 -0
- TreeSAK/ALE1.py +268 -0
- TreeSAK/ALE2.py +168 -0
- TreeSAK/ALE2RTC.py +30 -0
- TreeSAK/ALE3.py +205 -0
- TreeSAK/ALE4.py +636 -0
- TreeSAK/ALE5.py +210 -0
- TreeSAK/ALE6.py +401 -0
- TreeSAK/ALE7.py +126 -0
- TreeSAK/ALE_backup.py +1081 -0
- TreeSAK/AssessCVG.py +128 -0
- TreeSAK/AssessMarker.py +306 -0
- TreeSAK/AssessMarkerDeltaLL.py +257 -0
- TreeSAK/AssessMarkerPA.py +317 -0
- TreeSAK/AssessPB.py +113 -0
- TreeSAK/BMGE.jar +0 -0
- TreeSAK/BMGE.py +49 -0
- TreeSAK/C60SR4.nex +127 -0
- TreeSAK/CompareMCMC.py +138 -0
- TreeSAK/ConcateMSA.py +111 -0
- TreeSAK/ConvertMSA.py +135 -0
- TreeSAK/Dir.rb +82 -0
- TreeSAK/ExtractMarkerSeq.py +263 -0
- TreeSAK/FastRoot.py +1175 -0
- TreeSAK/FastRoot_backup.py +1122 -0
- TreeSAK/FigTree.py +34 -0
- TreeSAK/GTDB_tree.py +76 -0
- TreeSAK/GeneTree.py +142 -0
- TreeSAK/KEGG_Luo17.py +807 -0
- TreeSAK/LcaToLeaves.py +66 -0
- TreeSAK/MarkerRef2Tree.py +616 -0
- TreeSAK/MarkerRef2Tree_backup.py +628 -0
- TreeSAK/MarkerSeq2Tree.py +299 -0
- TreeSAK/MarkerSeq2Tree_backup.py +259 -0
- TreeSAK/ModifyTopo.py +116 -0
- TreeSAK/Newick_tree_plotter.py +79 -0
- TreeSAK/OMA.py +170 -0
- TreeSAK/OMA2.py +212 -0
- TreeSAK/OneLineAln.py +50 -0
- TreeSAK/PB.py +155 -0
- TreeSAK/PMSF.py +115 -0
- TreeSAK/PhyloBiAssoc.R +84 -0
- TreeSAK/PhyloBiAssoc.py +167 -0
- TreeSAK/PlotMCMC.py +41 -0
- TreeSAK/PlotMcmcNode.py +152 -0
- TreeSAK/PlotMcmcNode_old.py +252 -0
- TreeSAK/RootTree.py +101 -0
- TreeSAK/RootTreeGTDB.py +371 -0
- TreeSAK/RootTreeGTDB214.py +288 -0
- TreeSAK/RootTreeGTDB220.py +300 -0
- TreeSAK/SequentialDating.py +16 -0
- TreeSAK/SingleAleHGT.py +157 -0
- TreeSAK/SingleLinePhy.py +50 -0
- TreeSAK/SliceMSA.py +142 -0
- TreeSAK/SplitScore.py +21 -0
- TreeSAK/SplitScore1.py +177 -0
- TreeSAK/SplitScore1OMA.py +148 -0
- TreeSAK/SplitScore2.py +608 -0
- TreeSAK/TaxaCountStats.R +256 -0
- TreeSAK/TaxonTree.py +47 -0
- TreeSAK/TreeSAK_config.py +32 -0
- TreeSAK/VERSION +164 -0
- TreeSAK/VisHPD95.R +45 -0
- TreeSAK/VisHPD95.py +200 -0
- TreeSAK/__init__.py +0 -0
- TreeSAK/ale_parser.py +74 -0
- TreeSAK/ale_splitter.py +63 -0
- TreeSAK/alignment_pruner.pl +1471 -0
- TreeSAK/assessOG.py +45 -0
- TreeSAK/batch_itol.py +171 -0
- TreeSAK/catfasta2phy.py +140 -0
- TreeSAK/cogTree.py +185 -0
- TreeSAK/compare_trees.R +30 -0
- TreeSAK/compare_trees.py +255 -0
- TreeSAK/dating.py +264 -0
- TreeSAK/dating_ss.py +361 -0
- TreeSAK/deltall.py +82 -0
- TreeSAK/do_rrtc.rb +464 -0
- TreeSAK/fa2phy.py +42 -0
- TreeSAK/filter_rename_ar53.py +118 -0
- TreeSAK/format_leaf_name.py +70 -0
- TreeSAK/gap_stats.py +38 -0
- TreeSAK/get_SCG_tree.py +742 -0
- TreeSAK/get_arCOG_seq.py +97 -0
- TreeSAK/global_functions.py +222 -0
- TreeSAK/gnm_leaves.py +43 -0
- TreeSAK/iTOL.py +791 -0
- TreeSAK/iTOL_gene_tree.py +80 -0
- TreeSAK/itol_msa_stats.py +56 -0
- TreeSAK/keep_highest_rrtc.py +37 -0
- TreeSAK/koTree.py +194 -0
- TreeSAK/label_gene_tree_by_gnm.py +34 -0
- TreeSAK/label_tree.R +75 -0
- TreeSAK/label_tree.py +121 -0
- TreeSAK/mad.py +708 -0
- TreeSAK/mcmc2tree.py +58 -0
- TreeSAK/mcmcTC copy.py +92 -0
- TreeSAK/mcmcTC.py +104 -0
- TreeSAK/mcmctree_vs_reltime.R +44 -0
- TreeSAK/mcmctree_vs_reltime.py +252 -0
- TreeSAK/merge_pdf.py +32 -0
- TreeSAK/pRTC.py +56 -0
- TreeSAK/parse_mcmctree.py +198 -0
- TreeSAK/parse_reltime.py +141 -0
- TreeSAK/phy2fa.py +37 -0
- TreeSAK/plot_distruibution_th.py +165 -0
- TreeSAK/prep_mcmctree_ctl.py +92 -0
- TreeSAK/print_leaves.py +32 -0
- TreeSAK/pruneMSA.py +63 -0
- TreeSAK/recode.py +73 -0
- TreeSAK/remove_bias.R +112 -0
- TreeSAK/rename_leaves.py +78 -0
- TreeSAK/replace_clade.py +55 -0
- TreeSAK/root_with_out_group.py +84 -0
- TreeSAK/run_TaxaCountStats_R_s1.py +455 -0
- TreeSAK/subsample_drep_gnms.py +74 -0
- TreeSAK/subset.py +69 -0
- TreeSAK/subset_tree_stupid_old_way.py +193 -0
- TreeSAK/supertree.py +330 -0
- TreeSAK/tmp_1.py +19 -0
- TreeSAK/tmp_2.py +19 -0
- TreeSAK/tmp_3.py +120 -0
- TreeSAK/tmp_4.py +43 -0
- TreeSAK/tmp_5.py +12 -0
- TreeSAK/weighted_rand.rb +23 -0
- treesak-1.53.3.data/scripts/TreeSAK +955 -0
- treesak-1.53.3.dist-info/LICENSE +674 -0
- treesak-1.53.3.dist-info/METADATA +27 -0
- treesak-1.53.3.dist-info/RECORD +131 -0
- treesak-1.53.3.dist-info/WHEEL +5 -0
- treesak-1.53.3.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,80 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import argparse
|
|
3
|
+
from ete3 import Tree
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
iTOL_gene_tree_usage = '''
|
|
7
|
+
====================== iTOL_gene_tree example commands ======================
|
|
8
|
+
|
|
9
|
+
TreeSAK iTOL_gene_tree -tree genes.tree -i gnm_taxon.txt -o gene_taxon.txt
|
|
10
|
+
TreeSAK iTOL_gene_tree -txt gene_id.txt -i gnm_taxon.txt -o gene_taxon.txt
|
|
11
|
+
|
|
12
|
+
=============================================================================
|
|
13
|
+
'''
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def iTOL_gene_tree(args):
    """Propagate genome-level metadata to gene ids and write a gene-to-metadata table.

    Gene ids are collected from the leaves of a tree file and/or from a plain
    text file (one id per line). The genome id of a gene is everything before
    the last underscore of the gene id. Each output line is
    ``gene_id<TAB>metadata``; genes whose genome has no metadata get ``na``
    and are only written when ``args['na']`` is True.

    Args:
        args: dict with the keys
            'tree' - tree file holding gene ids as leaf names, or None
            'txt'  - text file with one gene id per line, or None
            'i'    - tab-separated metadata file (genome id, value)
            'o'    - output file
            'na'   - True to also write genes without metadata
    """

    input_tree_file = args['tree']
    input_txt_file = args['txt']
    meta_txt = args['i']
    op_txt = args['o']
    include_na = args['na']

    if (input_tree_file is None) and (input_txt_file is None):
        print('Please provide gene id with at least one approach, program exited!')
        exit()

    if os.path.isfile(meta_txt) is False:
        print('Metadata file not found, program exited!')
        exit()

    # only lines with exactly two tab-separated fields are treated as metadata
    metadata_dict = dict()
    with open(meta_txt) as meta_txt_handle:
        for each_gnm in meta_txt_handle:
            each_gnm_split = each_gnm.strip().split('\t')
            if len(each_gnm_split) == 2:
                metadata_dict[each_gnm_split[0]] = each_gnm_split[1]

    gene_id_set = set()
    if input_tree_file is not None:
        if os.path.isfile(input_tree_file) is False:
            print('Tree file not found, program exited!')
            exit()
        for leaf in Tree(input_tree_file, format=1):
            gene_id_set.add(leaf.name)

    if input_txt_file is not None:
        if os.path.isfile(input_txt_file) is False:
            print('Txt file not found, program exited!')
            exit()
        with open(input_txt_file) as input_txt_handle:
            for each_id in input_txt_handle:
                gene_id = each_id.strip()
                # skip blank lines, they would otherwise become an empty gene id
                if gene_id:
                    gene_id_set.add(gene_id)

    # sorted so that repeated runs produce identical files
    with open(op_txt, 'w') as op_txt_handle:
        for gene_id in sorted(gene_id_set):
            gnm_id = '_'.join(gene_id.split('_')[:-1])
            gnm_meta = metadata_dict.get(gnm_id, 'na')
            if (include_na is True) or (gnm_meta != 'na'):
                op_txt_handle.write('%s\t%s\n' % (gene_id, gnm_meta))

    print('Done!')
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
if __name__ == '__main__':

    # command-line entry point: declare the options as data, register them in
    # order, then pass the parsed values to iTOL_gene_tree() as a plain dict
    iTOL_gene_tree_parser = argparse.ArgumentParser(usage=iTOL_gene_tree_usage)
    option_specs = [
        ('-i',    dict(required=True,                       help='input metadata')),
        ('-tree', dict(required=False, default=None,        help='gene id, in tree file')),
        ('-txt',  dict(required=False, default=None,        help='gene id, in txt file')),
        ('-o',    dict(required=True,                       help='output metadata')),
        ('-na',   dict(required=False, action='store_true', help='include leaves with na values')),
    ]
    for option_flag, option_settings in option_specs:
        iTOL_gene_tree_parser.add_argument(option_flag, **option_settings)
    args = vars(iTOL_gene_tree_parser.parse_args())
    iTOL_gene_tree(args)
|
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import math
|
|
3
|
+
import argparse
|
|
4
|
+
from Bio import SeqIO
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
iTOL_msa_stats_usage = '''
|
|
8
|
+
========= iTOL_msa_stats example command =========
|
|
9
|
+
|
|
10
|
+
TreeSAK iTOL_msa_stats -i concatenated.phy.fasta
|
|
11
|
+
|
|
12
|
+
==================================================
|
|
13
|
+
'''
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def sep_path_basename_ext(file_in):
    """Split a path into (file name, directory, base name, extension).

    The directory is reported as '.' when the path has no directory part and
    the extension is returned without its leading dot.
    """
    directory, file_name = os.path.split(file_in)
    base_name, dotted_ext = os.path.splitext(file_name)
    return file_name, (directory if directory != '' else '.'), base_name, dotted_ext[1:]
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def iTOL_msa_stats(args):
    """Compute the gap percentage of every sequence in an alignment for iTOL.

    Writes ``<aln_base>_gap_pct.txt`` next to the input alignment, one line
    per sequence as ``seq_id<TAB>gap_pct`` with the percentage of '-'
    characters rounded to two decimals. It then runs ``TreeSAK iTOL
    -SimpleBar`` on that table to produce ``<aln_base>_gap_pct_iTOL.txt``.

    Args:
        args: dict with the key 'i', the path to the alignment, which is
            parsed as FASTA.
    """

    aln_file = args['i']

    _, aln_path, aln_base, _ = sep_path_basename_ext(aln_file)

    stats_txt = '%s/%s_gap_pct.txt' % (aln_path, aln_base)
    stats_txt_itol = '%s/%s_gap_pct_iTOL.txt' % (aln_path, aln_base)

    with open(stats_txt, 'w') as stats_txt_handle:
        for each_seq in SeqIO.parse(aln_file, 'fasta'):
            seq_id = each_seq.id
            seq_seq = str(each_seq.seq)
            if len(seq_seq) == 0:
                # a record without any residues would divide by zero; report it as gap-free
                gap_pct = 0.0
            else:
                gap_pct = seq_seq.count('-')*100/len(seq_seq)
                gap_pct = float("{0:.2f}".format(gap_pct))
            stats_txt_handle.write('%s\t%s\n' % (seq_id, gap_pct))

    # the table must be closed (flushed) before the external command reads it
    gap_pct_itol_cmd = 'TreeSAK iTOL -SimpleBar -lv %s -scale 0-25-50-75-100 -lt Gap_Pecentage -o %s' % (stats_txt, stats_txt_itol)
    os.system(gap_pct_itol_cmd)
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
if __name__ == '__main__':

    # command-line entry point: a single required option, the alignment file
    iTOL_msa_stats_parser = argparse.ArgumentParser(usage=iTOL_msa_stats_usage)
    iTOL_msa_stats_parser.add_argument(
        '-i',
        required=True,
        help='MSA file',
    )
    args = vars(iTOL_msa_stats_parser.parse_args())
    iTOL_msa_stats(args)
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
import operator
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
def keep_highest_rrtc(rrtc_in, rrtc_out):
    """Keep, for every (recipient, donor) pair, only the highest value.

    Each input line is expected to look like ``recipient<TAB>donor:value``.
    The pairs are written to ``rrtc_out`` in the same format, ordered from
    the highest to the lowest value. Blank lines in the input are ignored.

    Args:
        rrtc_in: path to the input file.
        rrtc_out: path to the output file (overwritten).
    """

    rrtc_highest_prob_dict = dict()
    with open(rrtc_in) as rrtc_in_handle:
        for each_rrtc in rrtc_in_handle:
            each_rrtc = each_rrtc.strip()
            if each_rrtc == '':
                continue

            # split once: the part before the first ':' holds the two ids
            colon_split = each_rrtc.split(':')
            id_split = colon_split[0].split('\t')
            rrtc_r = id_split[0]
            rrtc_d = id_split[1]
            rrtc_v = float(colon_split[1])
            rrtc_key = '%s___%s' % (rrtc_r, rrtc_d)
            if (rrtc_key not in rrtc_highest_prob_dict) or (rrtc_v > rrtc_highest_prob_dict[rrtc_key]):
                rrtc_highest_prob_dict[rrtc_key] = rrtc_v

    with open(rrtc_out, 'w') as rrtc_out_handle:
        # ascending sort followed by a reversal, which also fixes the order of ties
        for each_rrtc in sorted(rrtc_highest_prob_dict.items(), key=operator.itemgetter(1))[::-1]:
            print(each_rrtc)
            rrtc_r = each_rrtc[0].split('___')[0]
            rrtc_d = each_rrtc[0].split('___')[1]
            rrtc_v = each_rrtc[1]
            rrtc_out_handle.write('%s\t%s:%s\n' % (rrtc_r, rrtc_d, rrtc_v))
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
if __name__ == '__main__':

    # Developer example run. It is guarded so that importing this module no
    # longer tries to read these machine-specific paths.
    rrtc_in = '/Users/songweizhi/Desktop/rrtc.txt'
    rrtc_out = '/Users/songweizhi/Desktop/rrtc_out.txt'
    keep_highest_rrtc(rrtc_in, rrtc_out)

    # small demonstration of the value-descending ordering used above
    demo_dict = {'a': 6, 'b': 2, 'c': 2}
    for each in sorted(demo_dict.items(), key=operator.itemgetter(1))[::-1]:
        print(each[0])
        print(each[1])
|
|
37
|
+
|
TreeSAK/koTree.py
ADDED
|
@@ -0,0 +1,194 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import glob
|
|
3
|
+
import argparse
|
|
4
|
+
from Bio import SeqIO
|
|
5
|
+
import multiprocessing as mp
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
koTree_usage = '''
|
|
9
|
+
================================ koTree example commands ================================
|
|
10
|
+
|
|
11
|
+
TreeSAK koTree -i combined.faa -kegg KEGG_wd -o op_dir -bmge -t 12 -f -fun ko_id.txt
|
|
12
|
+
TreeSAK koTree -i combined.faa -kegg KEGG_wd -o op_dir -bmge -t 12 -f -fun K01995
|
|
13
|
+
TreeSAK koTree -i combined.faa -kegg KEGG_wd -o op_dir -bmge -t 12 -f -fun K01995,K01996
|
|
14
|
+
|
|
15
|
+
=========================================================================================
|
|
16
|
+
'''
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def select_seq(seq_file, seq_id_set, output_file):
    """Copy the records whose id is in ``seq_id_set`` from a FASTA file.

    Args:
        seq_file: input file, parsed as FASTA.
        seq_id_set: collection of sequence ids to keep.
        output_file: destination file, written in 'fasta-2line' format.
    """
    # the context manager closes the output even if parsing or writing fails
    with open(output_file, 'w') as output_file_handle:
        for seq_record in SeqIO.parse(seq_file, 'fasta'):
            if seq_record.id in seq_id_set:
                SeqIO.write(seq_record, output_file_handle, 'fasta-2line')
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def koTree(args):
    """Build one gene tree per KEGG function of interest.

    For every requested function id the matching sequences are extracted from
    the combined faa file, aligned with mafft-einsi, trimmed with trimal (or
    BMGE when requested) and passed to iqtree2. The generated commands are
    also written to ``cmd_1_mafft.txt``, ``cmd_2_trim.txt`` and
    ``cmd_3_tree.txt`` in the output directory.

    Args:
        args: dict with the keys
            'i'        - combined protein sequences (FASTA)
            'kegg'     - directory searched for '*KEGG_wd/*_ko_assignment_ABCD.txt', or None
            'fun'      - function id, comma-separated ids, or a file with one id per line
            'o'        - output directory
            'bmge'     - True to trim with BMGE instead of trimal
            'bmge_m'   - BMGE model
            'bmge_esc' - BMGE entropy score cutoff
            'iqtree_m' - iqtree model
            'f'        - True to overwrite an existing output directory
            't'        - number of threads
    """
    # function-scope import: only needed when an existing output folder is overwritten
    import shutil

    combined_faa = args['i']
    kegg_annotation_wd = args['kegg']
    interested_fun_txt = args['fun']
    op_dir = args['o']
    trim_with_bmge = args['bmge']
    trim_model = args['bmge_m']
    entropy_score_cutoff = args['bmge_esc']
    iqtree_model = args['iqtree_m']
    force_overwrite = args['f']
    num_of_threads = args['t']

    # specify path to BMGE.jar (shipped next to this file)
    current_file_path = os.path.dirname(os.path.realpath(__file__))
    pwd_bmge_jar = '%s/BMGE.jar' % current_file_path

    # functions of interest: either given directly (optionally comma-separated)
    # or listed in a file, first whitespace-separated field of each line
    interested_fun_set = set()
    if os.path.isfile(interested_fun_txt) is False:
        for each_fun in interested_fun_txt.split(','):
            if each_fun.strip() != '':
                interested_fun_set.add(each_fun.strip())
    else:
        with open(interested_fun_txt) as interested_fun_handle:
            for each_fun in interested_fun_handle:
                each_fun_split = each_fun.strip().split()
                # ignore blank lines
                if len(each_fun_split) > 0:
                    interested_fun_set.add(each_fun_split[0])

    ################################################################################

    faa_dir = '%s/dir_1_faa' % op_dir
    aln_dir = '%s/dir_2_msa' % op_dir
    trimmed_aln_dir = '%s/dir_3_trimmed_msa' % op_dir
    tree_dir = '%s/dir_4_tree' % op_dir
    cmd_1_mafft_txt = '%s/cmd_1_mafft.txt' % op_dir
    cmd_2_trim_txt = '%s/cmd_2_trim.txt' % op_dir
    cmd_3_tree_txt = '%s/cmd_3_tree.txt' % op_dir

    ################################################################################

    # create output folder
    if os.path.isdir(op_dir) is True:
        if force_overwrite is True:
            # removed without going through a shell, so unusual characters in the path are safe
            shutil.rmtree(op_dir)
        else:
            print('%s exist, program exited!' % op_dir)
            exit()

    os.mkdir(op_dir)
    os.mkdir(faa_dir)
    os.mkdir(aln_dir)
    os.mkdir(trimmed_aln_dir)
    os.mkdir(tree_dir)

    ################################################################################

    fun_to_gene_dict = dict()
    if kegg_annotation_wd is not None:

        print('Reading in KEGG annotation results')
        file_re = '%s/*KEGG_wd/*_ko_assignment_ABCD.txt' % (kegg_annotation_wd)
        file_list = glob.glob(file_re)

        if len(file_list) == 0:
            print('KEGG annotation file not detected, program exited!')
            exit()

        for each_file in file_list:
            with open(each_file) as each_file_handle:
                for line_index, each_line in enumerate(each_file_handle):
                    # the first line is skipped (treated as header)
                    if line_index == 0:
                        continue
                    each_line_split = each_line.strip().split('\t')
                    if len(each_line_split) == 9:
                        gene_id = each_line_split[0]
                        # the first two characters of the fifth column are dropped before matching
                        ko_d_id = each_line_split[4][2:]
                        if ko_d_id in interested_fun_set:
                            fun_to_gene_dict.setdefault(ko_d_id, set()).add(gene_id)

    cmd_list_mafft = []
    cmd_list_trim = []
    cmd_list_tree = []
    with open(cmd_1_mafft_txt, 'w') as cmd_1_mafft_txt_handle, \
            open(cmd_2_trim_txt, 'w') as cmd_2_trim_txt_handle, \
            open(cmd_3_tree_txt, 'w') as cmd_3_tree_txt_handle:

        for each_fun in sorted(fun_to_gene_dict):

            # define file name
            fun_faa = '%s/%s.faa' % (faa_dir, each_fun)
            current_gene_tree_dir = '%s/%s' % (tree_dir, each_fun)
            fun_aln = '%s/%s.aln' % (aln_dir, each_fun)
            fun_aln_trimmed = '%s/%s_trimal.aln' % (trimmed_aln_dir, each_fun)
            if trim_with_bmge is True:
                fun_aln_trimmed = '%s/%s_bmge.aln' % (trimmed_aln_dir, each_fun)

            # extract sequences
            current_fun_gene_set = fun_to_gene_dict[each_fun]
            select_seq(combined_faa, current_fun_gene_set, fun_faa)

            os.mkdir(current_gene_tree_dir)

            # prepare commands; mafft gets one thread because the commands
            # themselves are run in parallel below
            mafft_cmd = 'mafft-einsi --thread %s --quiet %s > %s' % (1, fun_faa, fun_aln)
            trim_cmd = 'trimal -in %s -out %s -automated1' % (fun_aln, fun_aln_trimmed)
            if trim_with_bmge is True:
                trim_cmd = 'java -jar %s -i %s -m %s -t AA -h %s -of %s' % (pwd_bmge_jar, fun_aln, trim_model, entropy_score_cutoff, fun_aln_trimmed)
            infer_tree_cmd = 'iqtree2 -s %s --seqtype AA -m %s -B 1000 --wbtl --bnni --prefix %s/%s -T %s --quiet' % (fun_aln_trimmed, iqtree_model, current_gene_tree_dir, each_fun, num_of_threads)

            # add commands to list
            cmd_list_mafft.append(mafft_cmd)
            cmd_list_trim.append(trim_cmd)
            cmd_list_tree.append(infer_tree_cmd)

            # write out commands
            cmd_1_mafft_txt_handle.write(mafft_cmd + '\n')
            cmd_2_trim_txt_handle.write(trim_cmd + '\n')
            cmd_3_tree_txt_handle.write(infer_tree_cmd + '\n')

    # run mafft commands
    print('Running mafft with %s cores for %s commands' % (num_of_threads, len(cmd_list_mafft)))
    pool = mp.Pool(processes=num_of_threads)
    pool.map(os.system, cmd_list_mafft)
    pool.close()
    pool.join()

    # run trim commands
    print('Trimming with %s cores for %s commands' % (num_of_threads, len(cmd_list_trim)))
    pool = mp.Pool(processes=num_of_threads)
    pool.map(os.system, cmd_list_trim)
    pool.close()
    pool.join()

    # run iqtree commands (one after another, each using all threads)
    print('Running iqtree with %s cores' % num_of_threads)
    for each_iqtree_cmd in sorted(cmd_list_tree):
        print(each_iqtree_cmd)
        os.system(each_iqtree_cmd)
|
|
170
|
+
|
|
171
|
+
|
|
172
|
+
if __name__ == '__main__':

    # command-line entry point
    koTree_parser = argparse.ArgumentParser()
    koTree_parser.add_argument('-i',        required=True,                         help='orthologous gene sequence')
    koTree_parser.add_argument('-fun',      required=True,                         help='interested functions')
    # koTree() reads args['kegg']; without this option every run failed with a KeyError
    koTree_parser.add_argument('-kegg',     required=False, default=None,          help='KEGG annotation results')
    koTree_parser.add_argument('-cog',      required=False, default=None,          help='COG annotation results')
    koTree_parser.add_argument('-o',        required=True,                         help='output directory')
    koTree_parser.add_argument('-bmge',     required=False, action="store_true",   help='trim with BMGE, default is trimal')
    koTree_parser.add_argument('-bmge_m',   required=False, default='BLOSUM30',    help='trim model, default: BLOSUM30')
    koTree_parser.add_argument('-bmge_esc', required=False, default='0.55',        help='entropy score cutoff, default: 0.55')
    koTree_parser.add_argument('-iqtree_m', required=False, default='LG+G+I',      help='iqtree_model, default: LG+G+I')
    koTree_parser.add_argument('-f',        required=False, action="store_true",   help='force overwrite')
    koTree_parser.add_argument('-t',        required=False, type=int, default=1,   help='num of threads, default: 1')
    args = vars(koTree_parser.parse_args())
    koTree(args)
|
|
187
|
+
|
|
188
|
+
|
|
189
|
+
'''
|
|
190
|
+
|
|
191
|
+
cd /scratch/PI/ocessongwz/Sponge_r220/4_OMA_wd/OMA_wd/Output
|
|
192
|
+
TreeSAK FunTree -i /scratch/PI/ocessongwz/Sponge_r220/3_combined_genomes_50_5_dRep97_291.faa -fun K01995,K01996,K01997,K01998,K01999 -kegg /scratch/PI/ocessongwz/Sponge_r220/3_combined_genomes_50_5_dRep97_291_KEGG_wd -o interested_fun_tree_branched_chain_aa_transport_system -bmge -t 12 -f
|
|
193
|
+
|
|
194
|
+
'''
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import glob
|
|
3
|
+
import argparse
|
|
4
|
+
from ete3 import Tree
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def sep_path_basename_ext(file_in):
    """Return (file name, directory, base name, extension) for a path.

    A path without a directory part yields '.' as directory; the extension
    comes back without the leading dot.
    """
    parent_dir, name_with_ext = os.path.split(file_in)
    stem, suffix = os.path.splitext(name_with_ext)
    if parent_dir == '':
        return name_with_ext, '.', stem, suffix[1:]
    return name_with_ext, parent_dir, stem, suffix[1:]
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
# Settings: every '<file_dir>/*.<file_ext>' tree is read and a copy with
# renamed leaves is written under the same file name into opdir.
file_dir = '/Users/songweizhi/Desktop/00'
file_ext = 'treefile'
tree_format = 1
opdir = '/Users/songweizhi/Desktop/00_renamed'


file_re = '%s/*.%s' % (file_dir, file_ext)
file_list = glob.glob(file_re)

for tree_file in file_list:
    f_name, f_path, f_base, f_ext = sep_path_basename_ext(tree_file)
    tree_out = '%s/%s' % (opdir, f_name)
    t = Tree(tree_file, format=tree_format)
    for leaf in t:
        # drop the part after the last underscore (gene id -> genome id);
        # a name without any underscore becomes an empty string
        leaf_name_new = leaf.name.rpartition('_')[0]
        leaf.name = leaf_name_new
    t.write(format=tree_format, outfile=tree_out)
|
TreeSAK/label_tree.R
ADDED
|
@@ -0,0 +1,75 @@
|
|
|
1
|
+
#!/usr/bin/env Rscript
|
|
2
|
+
|
|
3
|
+
######################################## Usage ########################################
|
|
4
|
+
|
|
5
|
+
# usage
|
|
6
|
+
# Rscript label_tree.R -t input_tree.newick -g grouping_file.txt
|
|
7
|
+
|
|
8
|
+
#######################################################################################
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
# check.packages function: install and load multiple R packages.
|
|
12
|
+
# Check to see if packages are installed. Install them if they are not, then load them into the R session.
|
|
13
|
+
check.packages <- function(pkg){
    # Install the requested packages that are not installed yet (with their
    # dependencies), then attach every requested package. Returns the named
    # logical vector produced by require(), one entry per package.
    already_installed <- installed.packages()[, "Package"]
    missing_pkg <- pkg[!(pkg %in% already_installed)]
    if (length(missing_pkg)) {
        install.packages(missing_pkg, dependencies = TRUE)
    }
    sapply(pkg, require, character.only = TRUE)
}
|
|
19
|
+
|
|
20
|
+
# Usage example
|
|
21
|
+
# packages needed by this script
packages <- c("ape", "tools", "optparse")
invisible(suppressMessages(check.packages(packages)))


# command-line options
option_list <- list(

    make_option(c("-t", "--tree"),     type="character", help="tree file",                       metavar="character"),
    make_option(c("-g", "--grouping"), type="character", help="grouping file (group_id,bin_id)", metavar="character"))

opt_parser <- OptionParser(option_list=option_list)
opt <- parse_args(opt_parser)
grouping_file <- opt$grouping
tree_file_in <- opt$tree
tree_file_path <- dirname(tree_file_in)

# output file names are built from the base name of the grouping file and
# placed in the folder of the input tree
tree_file_in_name_no_extension <- file_path_sans_ext(basename(grouping_file))
tree_txt_file_with_group <- paste0(tree_file_in_name_no_extension, '_', 'with_group.tree')
tree_txt_file_only_group <- paste0(tree_file_in_name_no_extension, '_', 'only_group.tree')

pwd_tree_txt_file_with_group <- paste(tree_file_path, tree_txt_file_with_group, sep = '/')
pwd_tree_txt_file_only_group <- paste(tree_file_path, tree_txt_file_only_group, sep = '/')

# read in grouping file (no header: V1 = group id, V2 = tip label)
grouping_df <- read.csv(grouping_file, header = FALSE)


#################### get tree with group ####################

# every tip label becomes '<group>_<original label>'
SCG_tree_with_group <- read.tree(tree_file_in)
for (i in 1:length(SCG_tree_with_group$tip.label)) {
    label_name <- SCG_tree_with_group$tip.label[i]
    group_id <- grouping_df$V1[which(grouping_df$V2 == label_name)]
    SCG_tree_with_group$tip.label[i] <- paste(group_id, label_name, sep = '_')
}

# write out tree
write.tree(SCG_tree_with_group, file=pwd_tree_txt_file_with_group)


#################### get tree with group only ####################

# every tip label is replaced by its group id
SCG_tree_only_group <- read.tree(tree_file_in)
for (i in 1:length(SCG_tree_only_group$tip.label)) {
    label_name <- SCG_tree_only_group$tip.label[i]
    group_id <- grouping_df$V1[which(grouping_df$V2 == label_name)]
    SCG_tree_only_group$tip.label[i] <- paste(group_id)
}

# write out tree
write.tree(SCG_tree_only_group, file=pwd_tree_txt_file_only_group)
|
|
75
|
+
|
TreeSAK/label_tree.py
ADDED
|
@@ -0,0 +1,121 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import argparse
|
|
3
|
+
from BioSAK.BioSAK_config import config_dict
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
label_tree_usage = '''
|
|
7
|
+
======================== label_tree example commands ========================
|
|
8
|
+
|
|
9
|
+
module load R
|
|
10
|
+
|
|
11
|
+
# label tree with customized grouping file
|
|
12
|
+
BioSAK label_tree -tree NorthSea.tree -label labels.txt
|
|
13
|
+
|
|
14
|
+
# label tree by taxonomic classification at phylum and class levels
|
|
15
|
+
BioSAK label_tree -tree NorthSea.tree -taxon GTDB_output.tsv -rank p
|
|
16
|
+
BioSAK label_tree -tree NorthSea.tree -taxon GTDB_output.tsv -rank c
|
|
17
|
+
|
|
18
|
+
# label file format:
|
|
19
|
+
label_A,tree_leaf_1
|
|
20
|
+
label_B,tree_leaf_2
|
|
21
|
+
label_B,tree_leaf_3
|
|
22
|
+
label_C,tree_leaf_4
|
|
23
|
+
|
|
24
|
+
=============================================================================
|
|
25
|
+
'''
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def sep_path_basename_ext(file_in):
    """Split a path into (directory, base name, extension).

    The directory is '.' when the path has no directory part; the extension
    keeps its leading dot (empty string when there is none).
    """
    # directory and file name first, then base name and extension
    directory, name_with_ext = os.path.split(file_in)
    stem, extension = os.path.splitext(name_with_ext)
    return (directory if directory != '' else '.'), stem, extension
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def label_tree(args, config_dict):
    """Label tree leaves with a custom grouping or with their taxonomy.

    Exactly one of two modes must be selected:

    * ``args['label']`` given: the label file (``label,leaf`` per line) is
      passed to the R script as is.
    * ``args['taxon']`` and ``args['rank']`` given: a grouping file
      ``<tree_dir>/<tree_basename>_<rank>.txt`` is derived from the taxonomy
      table (leaf id and ';'-separated lineage, tab-separated) and passed to
      the R script.

    Args:
        args: dict with the keys 'tree', 'label', 'taxon' and 'rank'.
        config_dict: dict providing the path of the R script under 'label_tree_R'.
    """

    tree_in = args['tree']
    label_file = args['label']
    leaf_taxon = args['taxon']
    taxon_rank = args['rank']
    label_tree_R = config_dict['label_tree_R']

    if (label_file is not None) and (leaf_taxon is None) and (taxon_rank is None):
        label_tree_cmd = 'Rscript %s -t %s -g %s' % (label_tree_R, tree_in, label_file)
        os.system(label_tree_cmd)

    elif (label_file is None) and (leaf_taxon is not None) and (taxon_rank is not None):

        rank_prefix_list = ['d__', 'p__', 'c__', 'o__', 'f__', 'g__', 's__']
        rank_to_position_dict = {'d': 0, 'p': 1, 'c': 2, 'o': 3, 'f': 4, 'g': 5, 's': 6}

        # fail early with a readable message instead of a KeyError later on
        if taxon_rank not in rank_to_position_dict:
            print('Unrecognized rank: %s, please choose from d, p, c, o, f, g and s' % taxon_rank)
            print('Program exited!')
            exit()
        specified_rank_pos = rank_to_position_dict[taxon_rank]

        # define tmp file name
        tree_file_path, tree_file_basename, tree_file_extension = sep_path_basename_ext(tree_in)
        taxon_grouping = '%s/%s_%s.txt' % (tree_file_path, tree_file_basename, taxon_rank)

        # read GTDB output into dict
        taxon_assignment_dict = {}
        with open(leaf_taxon) as leaf_taxon_handle:
            for each_genome in leaf_taxon_handle:

                # skip the header and blank lines
                if each_genome.startswith('user_genome') or (each_genome.strip() == ''):
                    continue

                each_split = each_genome.strip().split('\t')
                bin_name = each_split[0]

                # no lineage column means nothing is assigned
                assignment = []
                if len(each_split) > 1:
                    assignment = each_split[1].split(';')

                # pad missing lower ranks with their empty prefixes; a lineage
                # with seven or more ranks is kept as it is
                taxon_assignment_dict[bin_name] = assignment + rank_prefix_list[len(assignment):]

        # get all identified taxon at defined ranks
        with open(taxon_grouping, 'w') as taxon_grouping_handle:
            for each_TaxonAssign in taxon_assignment_dict:
                specified_rank_id = taxon_assignment_dict[each_TaxonAssign][specified_rank_pos]
                taxon_grouping_handle.write('%s,%s\n' % (specified_rank_id, each_TaxonAssign))

        # run R script
        label_tree_cmd = 'Rscript %s -t %s -g %s' % (label_tree_R, tree_in, taxon_grouping)
        os.system(label_tree_cmd)

    else:
        print('Please provide either a customized label file or the taxonomy info of tree leaves together with a taxonomic rank')
        print('Program exited!')
        exit()
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
if __name__ == '__main__':

    # command-line entry point: the tree is mandatory, the labelling source
    # (-label, or -taxon together with -rank) is checked inside label_tree()
    parser = argparse.ArgumentParser(usage=label_tree_usage)

    parser.add_argument('-tree', required=True, help='tree file in newick format')
    optional_settings = [
        ('-label', 'label file (label,leaf)'),
        ('-taxon', 'taxonomic classification'),
        ('-rank',  'taxonomic rank to label'),
    ]
    for option_flag, option_help in optional_settings:
        parser.add_argument(option_flag, required=False, default=None, help=option_help)

    args = vars(parser.parse_args())
    label_tree(args, config_dict)
|