treesak 1.53.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- TreeSAK/ALE.py +63 -0
- TreeSAK/ALE1.py +268 -0
- TreeSAK/ALE2.py +168 -0
- TreeSAK/ALE2RTC.py +30 -0
- TreeSAK/ALE3.py +205 -0
- TreeSAK/ALE4.py +636 -0
- TreeSAK/ALE5.py +210 -0
- TreeSAK/ALE6.py +401 -0
- TreeSAK/ALE7.py +126 -0
- TreeSAK/ALE_backup.py +1081 -0
- TreeSAK/AssessCVG.py +128 -0
- TreeSAK/AssessMarker.py +306 -0
- TreeSAK/AssessMarkerDeltaLL.py +257 -0
- TreeSAK/AssessMarkerPA.py +317 -0
- TreeSAK/AssessPB.py +113 -0
- TreeSAK/BMGE.jar +0 -0
- TreeSAK/BMGE.py +49 -0
- TreeSAK/C60SR4.nex +127 -0
- TreeSAK/CompareMCMC.py +138 -0
- TreeSAK/ConcateMSA.py +111 -0
- TreeSAK/ConvertMSA.py +135 -0
- TreeSAK/Dir.rb +82 -0
- TreeSAK/ExtractMarkerSeq.py +263 -0
- TreeSAK/FastRoot.py +1175 -0
- TreeSAK/FastRoot_backup.py +1122 -0
- TreeSAK/FigTree.py +34 -0
- TreeSAK/GTDB_tree.py +76 -0
- TreeSAK/GeneTree.py +142 -0
- TreeSAK/KEGG_Luo17.py +807 -0
- TreeSAK/LcaToLeaves.py +66 -0
- TreeSAK/MarkerRef2Tree.py +616 -0
- TreeSAK/MarkerRef2Tree_backup.py +628 -0
- TreeSAK/MarkerSeq2Tree.py +299 -0
- TreeSAK/MarkerSeq2Tree_backup.py +259 -0
- TreeSAK/ModifyTopo.py +116 -0
- TreeSAK/Newick_tree_plotter.py +79 -0
- TreeSAK/OMA.py +170 -0
- TreeSAK/OMA2.py +212 -0
- TreeSAK/OneLineAln.py +50 -0
- TreeSAK/PB.py +155 -0
- TreeSAK/PMSF.py +115 -0
- TreeSAK/PhyloBiAssoc.R +84 -0
- TreeSAK/PhyloBiAssoc.py +167 -0
- TreeSAK/PlotMCMC.py +41 -0
- TreeSAK/PlotMcmcNode.py +152 -0
- TreeSAK/PlotMcmcNode_old.py +252 -0
- TreeSAK/RootTree.py +101 -0
- TreeSAK/RootTreeGTDB.py +371 -0
- TreeSAK/RootTreeGTDB214.py +288 -0
- TreeSAK/RootTreeGTDB220.py +300 -0
- TreeSAK/SequentialDating.py +16 -0
- TreeSAK/SingleAleHGT.py +157 -0
- TreeSAK/SingleLinePhy.py +50 -0
- TreeSAK/SliceMSA.py +142 -0
- TreeSAK/SplitScore.py +21 -0
- TreeSAK/SplitScore1.py +177 -0
- TreeSAK/SplitScore1OMA.py +148 -0
- TreeSAK/SplitScore2.py +608 -0
- TreeSAK/TaxaCountStats.R +256 -0
- TreeSAK/TaxonTree.py +47 -0
- TreeSAK/TreeSAK_config.py +32 -0
- TreeSAK/VERSION +164 -0
- TreeSAK/VisHPD95.R +45 -0
- TreeSAK/VisHPD95.py +200 -0
- TreeSAK/__init__.py +0 -0
- TreeSAK/ale_parser.py +74 -0
- TreeSAK/ale_splitter.py +63 -0
- TreeSAK/alignment_pruner.pl +1471 -0
- TreeSAK/assessOG.py +45 -0
- TreeSAK/batch_itol.py +171 -0
- TreeSAK/catfasta2phy.py +140 -0
- TreeSAK/cogTree.py +185 -0
- TreeSAK/compare_trees.R +30 -0
- TreeSAK/compare_trees.py +255 -0
- TreeSAK/dating.py +264 -0
- TreeSAK/dating_ss.py +361 -0
- TreeSAK/deltall.py +82 -0
- TreeSAK/do_rrtc.rb +464 -0
- TreeSAK/fa2phy.py +42 -0
- TreeSAK/filter_rename_ar53.py +118 -0
- TreeSAK/format_leaf_name.py +70 -0
- TreeSAK/gap_stats.py +38 -0
- TreeSAK/get_SCG_tree.py +742 -0
- TreeSAK/get_arCOG_seq.py +97 -0
- TreeSAK/global_functions.py +222 -0
- TreeSAK/gnm_leaves.py +43 -0
- TreeSAK/iTOL.py +791 -0
- TreeSAK/iTOL_gene_tree.py +80 -0
- TreeSAK/itol_msa_stats.py +56 -0
- TreeSAK/keep_highest_rrtc.py +37 -0
- TreeSAK/koTree.py +194 -0
- TreeSAK/label_gene_tree_by_gnm.py +34 -0
- TreeSAK/label_tree.R +75 -0
- TreeSAK/label_tree.py +121 -0
- TreeSAK/mad.py +708 -0
- TreeSAK/mcmc2tree.py +58 -0
- TreeSAK/mcmcTC copy.py +92 -0
- TreeSAK/mcmcTC.py +104 -0
- TreeSAK/mcmctree_vs_reltime.R +44 -0
- TreeSAK/mcmctree_vs_reltime.py +252 -0
- TreeSAK/merge_pdf.py +32 -0
- TreeSAK/pRTC.py +56 -0
- TreeSAK/parse_mcmctree.py +198 -0
- TreeSAK/parse_reltime.py +141 -0
- TreeSAK/phy2fa.py +37 -0
- TreeSAK/plot_distruibution_th.py +165 -0
- TreeSAK/prep_mcmctree_ctl.py +92 -0
- TreeSAK/print_leaves.py +32 -0
- TreeSAK/pruneMSA.py +63 -0
- TreeSAK/recode.py +73 -0
- TreeSAK/remove_bias.R +112 -0
- TreeSAK/rename_leaves.py +78 -0
- TreeSAK/replace_clade.py +55 -0
- TreeSAK/root_with_out_group.py +84 -0
- TreeSAK/run_TaxaCountStats_R_s1.py +455 -0
- TreeSAK/subsample_drep_gnms.py +74 -0
- TreeSAK/subset.py +69 -0
- TreeSAK/subset_tree_stupid_old_way.py +193 -0
- TreeSAK/supertree.py +330 -0
- TreeSAK/tmp_1.py +19 -0
- TreeSAK/tmp_2.py +19 -0
- TreeSAK/tmp_3.py +120 -0
- TreeSAK/tmp_4.py +43 -0
- TreeSAK/tmp_5.py +12 -0
- TreeSAK/weighted_rand.rb +23 -0
- treesak-1.53.3.data/scripts/TreeSAK +955 -0
- treesak-1.53.3.dist-info/LICENSE +674 -0
- treesak-1.53.3.dist-info/METADATA +27 -0
- treesak-1.53.3.dist-info/RECORD +131 -0
- treesak-1.53.3.dist-info/WHEEL +5 -0
- treesak-1.53.3.dist-info/top_level.txt +1 -0
TreeSAK/ALE.py
ADDED
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
|
|
2
|
+
# Overview and example command lines for the ALE family of TreeSAK modules.
# Credentials in the examples are placeholders on purpose: never commit a real
# iTOL API key here, this text ships with the package.
ALE_usage = '''
================================================= ALE example commands =================================================

# ALE modules
TreeSAK ALE1 -> Step 1: get gene tree
TreeSAK ALE2 -> Step 2: run ALE
TreeSAK ALE3 -> Step 3: parse ALE outputs (ancestral genome reconstruction, transfer propensity/verticality and gain/loss)
TreeSAK ALE4 -> Filter ALE identified HGTs
TreeSAK ALE5 -> Get RTC file based on ALE detected HGTs
TreeSAK SingleAleHGT -> Perform HGT analysis using ALE for single protein family
TreeSAK ALE6 -> faa ancestral genomes
TreeSAK ALE7 -> get function P/A in ancestral genomes

# Example commands
TreeSAK ALE1 -i OrthologousGroups.txt -s combined_d__Archaea_o_rs.faa -p oma -m 50 -t 12 -jst 3 -f -o ALE1_op_dir
TreeSAK ALE2 -1 ALE1_op_dir -s genome_tree_rooted_noEU.treefile -t 10 -f -o ALE2_op_dir -runALE -docker gregmich/alesuite_new
TreeSAK ALE3 -i ALE2_op_dir -o ALE3_op_dir_c0.75 -f -c 0.75
TreeSAK ALE4 -i1 ALE1_op_dir -i2 ALE2_op_dir -c genome_taxon.txt -color phylum_color.txt -o ALE4_op_dir_0.3 -fc 0.3 -f -api your_own_itol_api
TreeSAK ALE4 -i1 ALE1_op_dir -i2 ALE2_op_dir -c genome_taxon.txt -color phylum_color.txt -o ALE4_op_dir_0.5 -fc 0.5 -f -api your_own_itol_api
TreeSAK ALE4 -i1 ALE1_op_dir -i2 ALE2_op_dir -c genome_taxon.txt -color phylum_color.txt -o ALE4_op_dir_0.8 -fc 0.8 -f -api your_own_itol_api
TreeSAK ALE7 -6 ALE6_op_dir -fun ko.txt -node internal_node.txt -o Fun_PA.txt
TreeSAK ALE7 -6 ALE6_op_dir -fun K01995,K01995 -node 359,466,470 -o Fun_PA.txt
TreeSAK ALE7 -6 ALE6_op_dir -fun arCOG07811,K01995 -node 359,466,470 -o Fun_PA.txt
TreeSAK SingleAleHGT -i OMA00001.aln -s genome.treefile -fc 0.3 -c genome_taxon.txt -color phylum_color.txt -api your_own_itol_api -t 9 -f -o OMA00001_ALE_HGT_wd

Note:
Genome names should NOT contain "_".

========================================================================================================================
'''

# The two bare string literals below are developer notes (command history).
# They are plain expression statements: nothing reads them at runtime.
'''
cd /Users/songweizhi/Desktop/run_ALE_wd
TreeSAK ALE2 -i ALE1_op_dir -s genome_tree_rooted_noEU.treefile -t 10 -f -o ALE2_op_dir -runALE -docker gregmich/alesuite_new
TreeSAK ALE3 -i ALE2_op_dir -c 0.8 -f -o ALE3_op_dir
TreeSAK ALE4 -i1 ALE1_op_dir -i2 ALE2_op_dir -c genome_taxon.txt -color phylum_color.txt -o ALE4_op_dir_0.3 -fc 0.3 -f -api your_own_itol_api
TreeSAK ALE4 -i1 ALE1_op_dir -i2 ALE2_op_dir -c genome_taxon.txt -color phylum_color.txt -o ALE4_op_dir_0.5 -fc 0.5 -f -api your_own_itol_api
TreeSAK ALE4 -i1 ALE1_op_dir -i2 ALE2_op_dir -c genome_taxon.txt -color phylum_color.txt -o ALE4_op_dir_0.8 -fc 0.8 -f -api your_own_itol_api

python3 /Users/songweizhi/PycharmProjects/TreeSAK/TreeSAK/ALE4.py -i1 ALE1_op_dir -i2 ALE2_op_dir -c genome_taxon.txt -color phylum_color.txt -o ALE4_op_dir_0.3 -fc 0.3 -f -api your_own_itol_api
python3 /Users/songweizhi/PycharmProjects/TreeSAK/TreeSAK/ALE4.py -i1 ALE1_op_dir -i2 ALE2_op_dir -c genome_taxon.txt -color phylum_color.txt -o ALE4_op_dir_0.5 -fc 0.5 -f -api your_own_itol_api
python3 /Users/songweizhi/PycharmProjects/TreeSAK/TreeSAK/ALE4.py -i1 ALE1_op_dir -i2 ALE2_op_dir -c genome_taxon.txt -color phylum_color.txt -o ALE4_op_dir_0.8 -fc 0.8 -f -api your_own_itol_api
python3 /Users/songweizhi/PycharmProjects/TreeSAK/TreeSAK/ALE3.py -i ALE2_op_dir -c 0.8 -f -o ALE3_op_dir
'''

'''
cd /Users/songweizhi/Documents/Research/Sponge_Hologenome/6_ALE_wd
python3 /Users/songweizhi/PycharmProjects/TreeSAK/TreeSAK/ALE1.py -i OMA_op_filtered/OrthologousGroups.txt -s OMA_op_filtered/OrthologousGroups.fasta -p oma -m 3 -t 10 -jt 2 -f -o ALE1_op_dir
TreeSAK ALE2 -i ALE1_op_dir -s genome_tree_rooted_noEU.treefile -t 10 -f -o ALE2_op_dir -runALE -docker gregmich/alesuite_new

cd /Users/songweizhi/Documents/Research/Sponge_Hologenome/8_ALE_wd_all_OGs
TreeSAK ALE2 -i ALE1_op_dir_ufboot -s concatenated_rooted.treefile -t 10 -f -o ALE2_op_dir -runALE -docker gregmich/alesuite_new

cd /home-user/wzsong/tmp
TreeSAK ALE2 -i ALE1_op_dir_ufboot -s concatenated_rooted.treefile -t 32 -f -o ALE2_op_dir -runALE -docker gregmich/alesuite_new

cd /Users/songweizhi/Documents/Research/Sponge_Hologenome/8_ALE_wd_all_OGs
TreeSAK ALE3 -i ALE2_op_dir -o ALE3_op_dir_c0.75 -f -c 0.75

cd /Users/songweizhi/Documents/Research/Sponge_Hologenome/8_ALE_wd_all_OGs
/usr/local/bin/python3.7 /Users/songweizhi/PycharmProjects/TreeSAK/TreeSAK/ALE3.py -i ALE2_op_dir -o ALE3_op_dir_c0.75 -f -c 0.75 -a ALE1_arcog_description.txt
'''
|
TreeSAK/ALE1.py
ADDED
|
@@ -0,0 +1,268 @@
|
|
|
1
|
+
import argparse
import glob
import os

from Bio import SeqIO
from ete3 import Tree

try:
    # distutils is deprecated (PEP 632) and no longer ships with Python >= 3.12
    from distutils.spawn import find_executable
except ImportError:
    from shutil import which as find_executable
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
ALE1_usage = '''
|
|
10
|
+
====================================== ALE1 example commands ======================================
|
|
11
|
+
|
|
12
|
+
TreeSAK ALE1 -i OrthologousGroups.txt -s combined.faa -p oma -m 50 -jst 3 -f -o ALE1_op_dir -bmge
|
|
13
|
+
TreeSAK ALE1 -ms s03_marker_seq -msx fa -p marker_set_1 -m 50 -jst 3 -f -o ALE1_op_dir -bmge
|
|
14
|
+
|
|
15
|
+
===================================================================================================
|
|
16
|
+
'''
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def check_dependencies(program_list):
    """Abort the program when a required executable cannot be found on PATH.

    Args:
        program_list: iterable of executable names to look up.

    Raises:
        SystemExit: with status 1 if at least one program is missing; the
            names of the missing programs are printed first.
    """
    # Imported locally so the lookup does not depend on distutils, which is
    # deprecated (PEP 632) and absent from Python >= 3.12.
    import shutil

    not_detected_programs = [
        needed_program for needed_program in program_list
        if shutil.which(needed_program) is None
    ]

    if not_detected_programs:
        print('%s not found, program exited!' % ','.join(not_detected_programs))
        # non-zero status so calling shells and job schedulers see the failure
        raise SystemExit(1)
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def sep_path_basename_ext(file_in):
    """Split a file path into (directory, base name, extension).

    The directory is reported as '.' when file_in has no directory part.
    The extension keeps its leading dot, exactly as os.path.splitext returns it.
    """
    folder, name_with_ext = os.path.split(file_in)
    stem, suffix = os.path.splitext(name_with_ext)

    return (folder if folder != '' else '.'), stem, suffix
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def subset_tree(tree_file_in, leaves_to_keep_list, tree_file_out):
    """Prune a tree down to the requested leaves.

    The tree is loaded from tree_file_in, copied, and pruned to
    leaves_to_keep_list with preserve_branch_length=True.

    Returns:
        The pruned tree as a newick string when tree_file_out is None.
        Otherwise the tree is written to tree_file_out and None is returned.
    """
    pruned = Tree(tree_file_in).copy()
    pruned.prune(leaves_to_keep_list, preserve_branch_length=True)

    if tree_file_out is not None:
        pruned.write(outfile=tree_file_out)
        return None

    return pruned.write()
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def get_ortho_to_gene_dict(ortho_groups_txt, og_program):
    """Parse an orthogroup table into {orthogroup id: [gene id, ...]}.

    Lines starting with '#' and blank lines are skipped.

    Two layouts are understood:
      * 'orthofinder': space-separated; the first field is the group id
        followed by one trailing character (dropped), the rest are gene ids.
      * 'oma': tab-separated; the first field is the group id, every other
        field looks like '<prefix>:<gene id>[ extra text]'.

    Args:
        ortho_groups_txt: path to the orthogroup table.
        og_program: 'orthofinder' or 'oma'.

    Returns:
        dict mapping orthogroup id to the list of its gene ids.

    Raises:
        ValueError: if og_program is not one of the supported values.
    """
    if og_program not in ('orthofinder', 'oma'):
        raise ValueError("unsupported orthology program %r, expected 'orthofinder' or 'oma'" % (og_program,))

    ortho_to_gene_dict = dict()
    with open(ortho_groups_txt) as og_handle:
        for each_og in og_handle:
            if each_og.startswith('#'):
                continue
            each_og = each_og.strip()
            if not each_og:
                # a blank line would otherwise register an empty '' orthogroup
                continue

            if og_program == 'orthofinder':
                each_og_split = each_og.split(' ')
                og_id = each_og_split[0][:-1]
                gene_list = each_og_split[1:]
            else:
                each_og_split = each_og.split('\t')
                og_id = each_og_split[0]
                gene_list = [member.split(' ')[0].split(':')[1] for member in each_og_split[1:]]

            ortho_to_gene_dict[og_id] = gene_list

    return ortho_to_gene_dict
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
def _qualified_og_ids(og_to_gene_dict, min_og_genome_num):
    """Return ids of the groups whose genes come from at least min_og_genome_num genomes."""
    qualified_og_set = set()
    for og_id, og_genes in og_to_gene_dict.items():
        # the genome of a gene is everything before the last '_' in its id
        og_genomes = {'_'.join(each_gene.split('_')[:-1]) for each_gene in og_genes}
        if len(og_genomes) >= min_og_genome_num:
            qualified_og_set.add(og_id)
    return qualified_og_set


def ALE1(args):
    """Export per-group sequence files and the shell commands that build their gene trees.

    Two input modes are supported:
      * orthogroup mode: args['i'] (orthogroup table) and args['s'] (combined
        sequence file) are given and args['ms'] is None;
      * marker mode: args['ms'] (folder with one sequence file per marker,
        extension args['msx']) is given and args['i'] / args['s'] are None.

    Groups covering fewer than args['m'] genomes are dropped. For every
    remaining group the sequence file is placed in the output folder
    args['o'], and one line 'mafft-einsi ...; [BMGE ...;] iqtree ...' is
    appended to '<args['o']>_cmds.txt'. The BMGE step is included when
    args['bmge'] is True. The commands are only written, never executed here.

    Other keys read from args: 'p' (orthofinder/oma), 'jst' (threads per
    command), 'f' (overwrite an existing output folder), 'bmge_m',
    'bmge_esc'.

    Raises:
        SystemExit: if a dependency is missing, the input options do not form
            one of the two modes, or the output folder exists and 'f' is not set.
    """
    # local import: needed for executable lookup, rmtree and copy below
    import shutil

    orthogroups_op_txt        = args['i']
    combined_faa              = args['s']
    og_program                = args['p']
    marker_seq_dir            = args['ms']
    marker_seq_ext            = args['msx']
    min_og_genome_num         = args['m']
    js_num_threads            = args['jst']
    force_create_op_dir       = args['f']
    op_dir                    = args['o']
    use_bmge                  = args['bmge'] is True
    bmge_trim_model           = args['bmge_m']
    bmge_entropy_score_cutoff = args['bmge_esc']
    designate_ogs             = []   # developer switch: restrict the run to these group ids
    to_ignore_ogs_list        = []   # developer switch: group ids to leave out

    # check dependencies
    check_dependencies(['java', 'blastp', 'mafft-einsi'])

    # BMGE.jar is expected next to this file
    pwd_bmge_jar = '%s/BMGE.jar' % os.path.dirname(os.path.realpath(__file__))

    # define output file name
    get_gene_tree_cmds_txt = '%s_cmds.txt' % op_dir

    # determine the version of iqtree available on the system (iqtree2 preferred)
    iqtree_exe = next((exe for exe in ('iqtree2', 'iqtree') if shutil.which(exe)), None)
    if iqtree_exe is None:
        print('iqtree not detected, program exited!')
        raise SystemExit(1)

    # Work out the input mode before touching the output folder, so that an
    # invalid option combination can never wipe existing results.
    og_mode     = (orthogroups_op_txt is not None) and (combined_faa is not None) and (marker_seq_dir is None)
    marker_mode = (orthogroups_op_txt is None) and (combined_faa is None) and (marker_seq_dir is not None)
    if not (og_mode or marker_mode):
        print('Please provide either -i together with -s, or -ms alone, program exited!')
        raise SystemExit(1)

    # create op_dir
    if os.path.isdir(op_dir):
        if force_create_op_dir is True:
            shutil.rmtree(op_dir)
        else:
            print('Output folder detected, program exited!')
            raise SystemExit(1)
    os.makedirs(op_dir)

    # collect the genes of every group
    if og_mode:
        og_to_gene_dict = get_ortho_to_gene_dict(orthogroups_op_txt, og_program)
    else:
        og_to_gene_dict = dict()
        for each_file in glob.glob('%s/*.%s' % (marker_seq_dir, marker_seq_ext)):
            _, f_base, _ = sep_path_basename_ext(each_file)
            og_to_gene_dict[f_base] = {each_seq.id for each_seq in SeqIO.parse(each_file, 'fasta')}

    # get qualified orthogroups
    qualified_og_set = _qualified_og_ids(og_to_gene_dict, min_og_genome_num)
    print('The total number of identified orthogroups is %s.' % len(og_to_gene_dict))
    print('The number of orthogroups spanning >= %s genomes is %s.' % (min_og_genome_num, len(qualified_og_set)))

    # process qualified OG
    og_to_process = qualified_og_set
    if len(designate_ogs) > 0:
        print('The number of designated OGs to process: %s' % len(designate_ogs))
        og_to_process = designate_ogs
    og_to_process = sorted(set(og_to_process) - set(to_ignore_ogs_list))

    def gene_tree_cmd_line(og_id, seq_file_name):
        """Build the one-line command chain (align, optionally trim, infer tree) for og_id."""
        og_aln = '%s.aln' % og_id
        og_aln_trimmed = '%s_trimmed.aln' % og_id
        mafft_cmd = 'mafft-einsi --thread %s --quiet %s > %s' % (js_num_threads, seq_file_name, og_aln)
        if use_bmge:
            trim_cmd = 'java -jar %s -i %s -m %s -t AA -h %s -of %s' % (pwd_bmge_jar, og_aln, bmge_trim_model, bmge_entropy_score_cutoff, og_aln_trimmed)
            iqtree_cmd = '%s -m LG+G+I -bb 1000 --wbtl -nt %s -s %s -pre %s' % (iqtree_exe, js_num_threads, og_aln_trimmed, og_id)
            return '%s; %s; %s\n' % (mafft_cmd, trim_cmd, iqtree_cmd)
        iqtree_cmd = '%s -m LG+G+I -bb 1000 --wbtl -nt %s -s %s -pre %s' % (iqtree_exe, js_num_threads, og_aln, og_id)
        return '%s; %s\n' % (mafft_cmd, iqtree_cmd)

    # extract gene sequences and prepare commands for building gene tree
    if og_mode:
        # read sequence into dict
        gene_seq_dict = {each_seq.id: str(each_seq.seq) for each_seq in SeqIO.parse(combined_faa, 'fasta')}
        print('Preparing commands and sequence files for building gene trees')
    else:
        gene_seq_dict = dict()
        print('Preparing commands for building gene trees')

    with open(get_gene_tree_cmds_txt, 'w') as cmds_handle:
        for qualified_og in og_to_process:
            if og_mode:
                # write out sequences
                seq_file_name = '%s.faa' % qualified_og
                with open('%s/%s' % (op_dir, seq_file_name), 'w') as faa_handle:
                    for each_gene in og_to_gene_dict[qualified_og]:
                        faa_handle.write('>%s\n' % each_gene)
                        faa_handle.write('%s\n' % gene_seq_dict[each_gene])
            else:
                # copy sequence file into output directory
                seq_file_name = '%s.%s' % (qualified_og, marker_seq_ext)
                shutil.copy('%s/%s' % (marker_seq_dir, seq_file_name), op_dir)

            # write out commands
            cmds_handle.write(gene_tree_cmd_line(qualified_og, seq_file_name))

    print('Sequence files exported to %s.' % op_dir)
    print('Commands for inferring gene tree exported to %s.' % get_gene_tree_cmds_txt)
    print('Done!')
|
|
250
|
+
|
|
251
|
+
|
|
252
|
+
if __name__ == '__main__':

    # command-line options as (flag, add_argument keywords), in --help order
    option_specs = [
        ('-i',        dict(required=False, default=None, help='orthologous groups, either from orthofinder or oma')),
        ('-s',        dict(required=False, default=None, help='sequence file, e.g., combined.faa')),
        ('-ms',       dict(required=False, default=None, help='input is a folder holds the sequence of each marker')),
        ('-msx',      dict(required=False, default='fa', help='file extension of marker sequence file, default: fa')),
        ('-p',        dict(required=True, help='orthologous identification program, orthofinder or oma')),
        ('-m',        dict(required=False, type=int, default=50, help='min_og_genome_num, default: 50')),
        ('-bmge',     dict(required=False, action="store_true", help='trim MSA with BMGE, default no trimming')),
        ('-bmge_m',   dict(required=False, default='BLOSUM30', help='BMGE trim model, default: BLOSUM30')),
        ('-bmge_esc', dict(required=False, default='0.55', help='BMGE entropy score cutoff, default: 0.55')),
        ('-o',        dict(required=True, help='output dir, i.e., OMA working directory')),
        ('-jst',      dict(required=False, type=int, default=3, help='number of threads specified in job script, default: 3')),
        ('-f',        dict(required=False, action="store_true", help='force overwrite')),
    ]

    cli_parser = argparse.ArgumentParser()
    for flag, keywords in option_specs:
        cli_parser.add_argument(flag, **keywords)

    # ALE1 takes the parsed options as a plain dict keyed by option name
    ALE1(vars(cli_parser.parse_args()))
|
TreeSAK/ALE2.py
ADDED
|
@@ -0,0 +1,168 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import glob
|
|
3
|
+
import argparse
|
|
4
|
+
from ete3 import Tree
|
|
5
|
+
import multiprocessing as mp
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
ALE2_usage = '''
|
|
9
|
+
============================================ ALE2 example commands ===========================================
|
|
10
|
+
|
|
11
|
+
TreeSAK ALE2 -1 ALE1_op_dir -s genome.treefile -t 10 -f -runALE -docker gregmich/alesuite_new -o ALE2_op_dir
|
|
12
|
+
|
|
13
|
+
Note:
|
|
14
|
+
Genome names should NOT contain "_", the program will tackle this automatically.
|
|
15
|
+
|
|
16
|
+
# You can try to add this while building the docker images
|
|
17
|
+
--platform linux/arm64/v8
|
|
18
|
+
|
|
19
|
+
# Only the ufboot files in ALE1_op_dir will be needed in this step.
|
|
20
|
+
|
|
21
|
+
===============================================================================================================
|
|
22
|
+
'''
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def sep_path_basename_ext(file_in):
    """Return (directory, base name, extension) for a file path.

    A path without a directory part yields '.' as the directory. The
    extension includes its leading dot and is '' when there is none.
    """
    directory, full_name = os.path.split(file_in)
    if not directory:
        # os.path.split gives '' for a bare file name
        directory = '.'

    base_name, extension = os.path.splitext(full_name)
    return directory, base_name, extension
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def subset_tree(tree_file_in, leaves_to_keep_list, tree_file_out):
    """Keep only the listed leaves of a tree.

    Loads tree_file_in, works on a copy, and prunes it to
    leaves_to_keep_list with preserve_branch_length=True.

    Returns:
        The newick string of the pruned tree if tree_file_out is None.
        Otherwise None, after writing the pruned tree to tree_file_out.
    """
    kept = Tree(tree_file_in).copy()
    kept.prune(leaves_to_keep_list, preserve_branch_length=True)

    if tree_file_out is None:
        newick_str = kept.write()
        return newick_str

    kept.write(outfile=tree_file_out)
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def prepare_ale_ip_worker(arg_list):
    """Copy a file of gene trees, stripping '_' from the genome part of every leaf name.

    Each non-empty line of the input is parsed as one newick tree (format=1).
    A leaf named '<genome>_<gene index>' is renamed so that the genome part
    contains no underscores, leaving the last '_' as the only separator.
    Each renamed tree is written on its own line using the tree writer's
    default format.

    Args:
        arg_list: [input ufboot path, output ufboot path]; a single list so
            the function can be used directly with Pool.map.
    """
    ufboot_in = arg_list[0]
    ufboot_out = arg_list[1]

    # context managers close both files even if a tree fails to parse
    with open(ufboot_in) as ufboot_in_handle, open(ufboot_out, 'w') as ufboot_out_handle:
        for each_gene_tree in ufboot_in_handle:
            gene_tree_str = each_gene_tree.strip()
            if not gene_tree_str:
                # tolerate blank lines (e.g. a trailing newline at end of file)
                continue

            gene_tree_in = Tree(gene_tree_str, format=1)
            for leaf in gene_tree_in:
                # split at the LAST underscore: genome id on the left, gene index on the right
                gnm_id, _, gene_index = leaf.name.rpartition('_')
                leaf.name = '%s_%s' % (gnm_id.replace('_', ''), gene_index)

            ufboot_out_handle.write(gene_tree_in.write() + '\n')
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def ALE2(args):
    """Prepare the input files and commands for ALE, and optionally run them.

    Steps:
      1. list the *.ufboot files in the ALE1 output folder (args['1']);
      2. write a copy of the rooted genome tree (args['s']) with '_' removed
         from every leaf name to '<args['o']>/genome_tree.newick', and record
         the new-name/old-name pairs in 'genome_tree_leaf_rename.txt';
      3. write one 'ALEobserve ...; ALEml_undated ...' line per ufboot file to
         '<args['o']>_cmds.txt', each command wrapped in 'docker run' when
         args['docker'] is given;
      4. copy every ufboot file into args['o'] with renamed leaves, using
         args['t'] worker processes;
      5. if args['runALE'] is True, change into args['o'] and execute the
         commands with args['t'] worker processes.

    With args['f'] set, an existing output folder is removed first. The
    output folder is created whenever it does not exist.
    """
    # local import: only needed for removing an existing output folder
    import shutil

    ale1_op_dir             = args['1']
    genome_tree_file_rooted = args['s']
    force_create_ale_wd     = args['f']
    num_threads             = args['t']
    ale2_op_dir             = args['o']
    run_ale                 = args['runALE']
    docker_image            = args['docker']
    run_ale_cmds_txt        = '%s_cmds.txt' % ale2_op_dir

    og_to_process_list = []
    for each_ufboot in glob.glob('%s/*.ufboot' % ale1_op_dir):
        _, ufboot_base, _ = sep_path_basename_ext(each_ufboot)
        og_to_process_list.append(ufboot_base)

    # define file name
    gnm_tree_no_underscore       = 'genome_tree.newick'
    gnm_tree_leaf_rename_txt     = 'genome_tree_leaf_rename.txt'
    gnm_tree_no_underscore_in_wd = '%s/%s' % (ale2_op_dir, gnm_tree_no_underscore)

    # create ale2_op_dir; without -f an existing folder is reused, but a
    # missing one must still be created or the first write below fails
    if (force_create_ale_wd is True) and os.path.isdir(ale2_op_dir):
        shutil.rmtree(ale2_op_dir)
    if not os.path.isdir(ale2_op_dir):
        os.makedirs(ale2_op_dir)

    # prepare genome tree for running ALE
    gnm_tree_in = Tree(genome_tree_file_rooted, format=1)
    # NOTE(review): the rename table is written relative to the current
    # working directory, not inside ale2_op_dir - confirm where downstream
    # steps expect to find it before moving it.
    with open(gnm_tree_leaf_rename_txt, 'w') as gnm_tree_leaf_rename_txt_handle:
        for leaf in gnm_tree_in:
            leaf_name_new = leaf.name.replace('_', '')
            gnm_tree_leaf_rename_txt_handle.write('%s\t%s\n' % (leaf_name_new, leaf.name))
            leaf.name = leaf_name_new
    gnm_tree_in.write(outfile=gnm_tree_no_underscore_in_wd)

    # prepare gene tree for running ALE
    prepare_ale_ip_worker_arg_lol = []
    ale_cmd_list = []
    with open(run_ale_cmds_txt, 'w') as run_ale_cmds_txt_handle:
        for qualified_og in og_to_process_list:
            pwd_gene_tree_ufboot_in = '%s/%s.ufboot' % (ale1_op_dir, qualified_og)
            if os.path.isfile(pwd_gene_tree_ufboot_in) is False:
                print('%s not found, please build gene tree first!' % pwd_gene_tree_ufboot_in)
                continue
            pwd_gene_tree_ufboot_out = '%s/%s.ufboot' % (ale2_op_dir, qualified_og)

            # get commands for ALEobserve and ALEml_undated; file names are
            # relative because the commands are run from inside ale2_op_dir
            obtain_ale_file_cmd = 'ALEobserve %s.ufboot > %s.ALEobserve.log' % (qualified_og, qualified_og)
            reconciliation_cmd = 'ALEml_undated %s %s.ufboot.ale > %s.ALEml_undated.log' % (gnm_tree_no_underscore, qualified_og, qualified_og)
            if docker_image is not None:
                obtain_ale_file_cmd = 'docker run -v $PWD:$PWD -w $PWD %s %s' % (docker_image, obtain_ale_file_cmd)
                reconciliation_cmd = 'docker run -v $PWD:$PWD -w $PWD %s %s' % (docker_image, reconciliation_cmd)

            ale_cmd_line = '%s; %s\n' % (obtain_ale_file_cmd, reconciliation_cmd)
            run_ale_cmds_txt_handle.write(ale_cmd_line)
            ale_cmd_list.append(ale_cmd_line)
            prepare_ale_ip_worker_arg_lol.append([pwd_gene_tree_ufboot_in, pwd_gene_tree_ufboot_out])

    # prepare input files and job script for running ALE with multiprocessing
    print('Preparing files for running ALE with %s cores for %s OGs' % (num_threads, len(prepare_ale_ip_worker_arg_lol)))
    pool = mp.Pool(processes=num_threads)
    pool.map(prepare_ale_ip_worker, prepare_ale_ip_worker_arg_lol)
    pool.close()
    pool.join()

    # run ALE
    if run_ale is True:
        print('running ALE with %s cores for %s OGs' % (num_threads, len(prepare_ale_ip_worker_arg_lol)))
        # the process stays in ale2_op_dir after this call
        os.chdir(ale2_op_dir)
        pool = mp.Pool(processes=num_threads)
        pool.map(os.system, ale_cmd_list)
        pool.close()
        pool.join()

    print('Done!')
|
|
155
|
+
|
|
156
|
+
|
|
157
|
+
if __name__ == '__main__':

    # command-line options as (flag, add_argument keywords), in --help order
    option_specs = [
        ('-1',      dict(required=True, help='ALE1 output directory')),
        ('-s',      dict(required=True, help='rooted species tree')),
        ('-o',      dict(required=True, help='output dir, i.e., OMA working directory')),
        ('-runALE', dict(required=False, action="store_true", help='run ALE')),
        ('-docker', dict(required=False, default=None, help='Docker image, if ALE was installed with Docker, e.g., gregmich/alesuite_new')),
        ('-f',      dict(required=False, action="store_true", help='force overwrite')),
        ('-t',      dict(required=False, type=int, default=6, help='number of threads, default: 6')),
    ]

    cli_parser = argparse.ArgumentParser()
    for flag, keywords in option_specs:
        cli_parser.add_argument(flag, **keywords)

    # ALE2 takes the parsed options as a plain dict keyed by option name
    ALE2(vars(cli_parser.parse_args()))
|
TreeSAK/ALE2RTC.py
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import argparse
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
ALE2RTC_usage = '''
|
|
6
|
+
================================ ALE2RTC example commands ================================
|
|
7
|
+
|
|
8
|
+
TreeSAK ALE2RTC -h
|
|
9
|
+
|
|
10
|
+
==========================================================================================
|
|
11
|
+
'''
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def ALE2RTC():
    """Placeholder for the ALE2RTC module: takes no arguments, does nothing, returns None."""

    pass

    # file_in = args['i']
    # op_dir  = args['o']


# Commented-out command-line entry point kept from the original source:
# if __name__ == '__main__':
#
#     ALE2RTC_parser = argparse.ArgumentParser()
#     ALE2RTC_parser.add_argument('-i', required=True, help='the file "out" generated by MCMCTree')
#     ALE2RTC_parser.add_argument('-o', required=True, help='output directory, which will be the input to the pRTC module (-rrtc)')
#     args = vars(ALE2RTC_parser.parse_args())
#     ALE2RTC(args)


if __name__ == '__main__':
    # run only when executed as a script, so importing this module has no side effects
    ALE2RTC()
|