treesak 1.53.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- TreeSAK/ALE.py +63 -0
- TreeSAK/ALE1.py +268 -0
- TreeSAK/ALE2.py +168 -0
- TreeSAK/ALE2RTC.py +30 -0
- TreeSAK/ALE3.py +205 -0
- TreeSAK/ALE4.py +636 -0
- TreeSAK/ALE5.py +210 -0
- TreeSAK/ALE6.py +401 -0
- TreeSAK/ALE7.py +126 -0
- TreeSAK/ALE_backup.py +1081 -0
- TreeSAK/AssessCVG.py +128 -0
- TreeSAK/AssessMarker.py +306 -0
- TreeSAK/AssessMarkerDeltaLL.py +257 -0
- TreeSAK/AssessMarkerPA.py +317 -0
- TreeSAK/AssessPB.py +113 -0
- TreeSAK/BMGE.jar +0 -0
- TreeSAK/BMGE.py +49 -0
- TreeSAK/C60SR4.nex +127 -0
- TreeSAK/CompareMCMC.py +138 -0
- TreeSAK/ConcateMSA.py +111 -0
- TreeSAK/ConvertMSA.py +135 -0
- TreeSAK/Dir.rb +82 -0
- TreeSAK/ExtractMarkerSeq.py +263 -0
- TreeSAK/FastRoot.py +1175 -0
- TreeSAK/FastRoot_backup.py +1122 -0
- TreeSAK/FigTree.py +34 -0
- TreeSAK/GTDB_tree.py +76 -0
- TreeSAK/GeneTree.py +142 -0
- TreeSAK/KEGG_Luo17.py +807 -0
- TreeSAK/LcaToLeaves.py +66 -0
- TreeSAK/MarkerRef2Tree.py +616 -0
- TreeSAK/MarkerRef2Tree_backup.py +628 -0
- TreeSAK/MarkerSeq2Tree.py +299 -0
- TreeSAK/MarkerSeq2Tree_backup.py +259 -0
- TreeSAK/ModifyTopo.py +116 -0
- TreeSAK/Newick_tree_plotter.py +79 -0
- TreeSAK/OMA.py +170 -0
- TreeSAK/OMA2.py +212 -0
- TreeSAK/OneLineAln.py +50 -0
- TreeSAK/PB.py +155 -0
- TreeSAK/PMSF.py +115 -0
- TreeSAK/PhyloBiAssoc.R +84 -0
- TreeSAK/PhyloBiAssoc.py +167 -0
- TreeSAK/PlotMCMC.py +41 -0
- TreeSAK/PlotMcmcNode.py +152 -0
- TreeSAK/PlotMcmcNode_old.py +252 -0
- TreeSAK/RootTree.py +101 -0
- TreeSAK/RootTreeGTDB.py +371 -0
- TreeSAK/RootTreeGTDB214.py +288 -0
- TreeSAK/RootTreeGTDB220.py +300 -0
- TreeSAK/SequentialDating.py +16 -0
- TreeSAK/SingleAleHGT.py +157 -0
- TreeSAK/SingleLinePhy.py +50 -0
- TreeSAK/SliceMSA.py +142 -0
- TreeSAK/SplitScore.py +21 -0
- TreeSAK/SplitScore1.py +177 -0
- TreeSAK/SplitScore1OMA.py +148 -0
- TreeSAK/SplitScore2.py +608 -0
- TreeSAK/TaxaCountStats.R +256 -0
- TreeSAK/TaxonTree.py +47 -0
- TreeSAK/TreeSAK_config.py +32 -0
- TreeSAK/VERSION +164 -0
- TreeSAK/VisHPD95.R +45 -0
- TreeSAK/VisHPD95.py +200 -0
- TreeSAK/__init__.py +0 -0
- TreeSAK/ale_parser.py +74 -0
- TreeSAK/ale_splitter.py +63 -0
- TreeSAK/alignment_pruner.pl +1471 -0
- TreeSAK/assessOG.py +45 -0
- TreeSAK/batch_itol.py +171 -0
- TreeSAK/catfasta2phy.py +140 -0
- TreeSAK/cogTree.py +185 -0
- TreeSAK/compare_trees.R +30 -0
- TreeSAK/compare_trees.py +255 -0
- TreeSAK/dating.py +264 -0
- TreeSAK/dating_ss.py +361 -0
- TreeSAK/deltall.py +82 -0
- TreeSAK/do_rrtc.rb +464 -0
- TreeSAK/fa2phy.py +42 -0
- TreeSAK/filter_rename_ar53.py +118 -0
- TreeSAK/format_leaf_name.py +70 -0
- TreeSAK/gap_stats.py +38 -0
- TreeSAK/get_SCG_tree.py +742 -0
- TreeSAK/get_arCOG_seq.py +97 -0
- TreeSAK/global_functions.py +222 -0
- TreeSAK/gnm_leaves.py +43 -0
- TreeSAK/iTOL.py +791 -0
- TreeSAK/iTOL_gene_tree.py +80 -0
- TreeSAK/itol_msa_stats.py +56 -0
- TreeSAK/keep_highest_rrtc.py +37 -0
- TreeSAK/koTree.py +194 -0
- TreeSAK/label_gene_tree_by_gnm.py +34 -0
- TreeSAK/label_tree.R +75 -0
- TreeSAK/label_tree.py +121 -0
- TreeSAK/mad.py +708 -0
- TreeSAK/mcmc2tree.py +58 -0
- TreeSAK/mcmcTC copy.py +92 -0
- TreeSAK/mcmcTC.py +104 -0
- TreeSAK/mcmctree_vs_reltime.R +44 -0
- TreeSAK/mcmctree_vs_reltime.py +252 -0
- TreeSAK/merge_pdf.py +32 -0
- TreeSAK/pRTC.py +56 -0
- TreeSAK/parse_mcmctree.py +198 -0
- TreeSAK/parse_reltime.py +141 -0
- TreeSAK/phy2fa.py +37 -0
- TreeSAK/plot_distruibution_th.py +165 -0
- TreeSAK/prep_mcmctree_ctl.py +92 -0
- TreeSAK/print_leaves.py +32 -0
- TreeSAK/pruneMSA.py +63 -0
- TreeSAK/recode.py +73 -0
- TreeSAK/remove_bias.R +112 -0
- TreeSAK/rename_leaves.py +78 -0
- TreeSAK/replace_clade.py +55 -0
- TreeSAK/root_with_out_group.py +84 -0
- TreeSAK/run_TaxaCountStats_R_s1.py +455 -0
- TreeSAK/subsample_drep_gnms.py +74 -0
- TreeSAK/subset.py +69 -0
- TreeSAK/subset_tree_stupid_old_way.py +193 -0
- TreeSAK/supertree.py +330 -0
- TreeSAK/tmp_1.py +19 -0
- TreeSAK/tmp_2.py +19 -0
- TreeSAK/tmp_3.py +120 -0
- TreeSAK/tmp_4.py +43 -0
- TreeSAK/tmp_5.py +12 -0
- TreeSAK/weighted_rand.rb +23 -0
- treesak-1.53.3.data/scripts/TreeSAK +955 -0
- treesak-1.53.3.dist-info/LICENSE +674 -0
- treesak-1.53.3.dist-info/METADATA +27 -0
- treesak-1.53.3.dist-info/RECORD +131 -0
- treesak-1.53.3.dist-info/WHEEL +5 -0
- treesak-1.53.3.dist-info/top_level.txt +1 -0
TreeSAK/SplitScore.py
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
|
|
2
|
+
# Overview help text for the SplitScore workflow (a plain string; nothing in this module prints it).
# The SplitScore1 examples use -x/-jst and a sequence folder for -i, matching what SplitScore1()
# reads from its args dict; the OMA-based examples are listed under SplitScore1OMA.
SplitScore_usage = '''
============================================= SplitScore example commands =============================================

# SplitScore modules
TreeSAK SplitScore1 -> Step 1: Infer gene tree
TreeSAK SplitScore1OMA -> Step 1: Infer gene tree (based on OMA outputs)
TreeSAK SplitScore2 -> Step 2: Calculate split score

# SplitScore1
TreeSAK SplitScore1 -i marker_seq -x fa -o step1_op_dir -jst 6 -f
TreeSAK SplitScore1 -i marker_seq -x fa -o step1_op_dir -jst 6 -f -u interested_gnm.txt

# SplitScore1OMA
TreeSAK SplitScore1OMA -i OrthologousGroups.txt -s OrthologousGroupsFasta -o step1_op_dir -jst 6 -f
TreeSAK SplitScore1OMA -i OrthologousGroups.txt -s OrthologousGroupsFasta -o step1_op_dir -jst 6 -f -u interested_gnm.txt

# SplitScore2
# Please ensure that all the commands produced in step one have been executed before proceeding to step two.
TreeSAK SplitScore2 -i step1_op_dir -g gnm_cluster.tsv -k gnm_taxon.txt -f -t 10 -o step_2_op_dir

# As described in the Undinarchaeota paper (Nina Dombrowski 2020, NC)

=======================================================================================================================
'''
|
TreeSAK/SplitScore1.py
ADDED
|
@@ -0,0 +1,177 @@
|
|
|
1
|
+
from __future__ import print_function
|
|
2
|
+
import os
|
|
3
|
+
import glob
|
|
4
|
+
import argparse
|
|
5
|
+
from Bio import SeqIO
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
SplitScore1_usage = '''
|
|
9
|
+
======================== SplitScore1 example commands ========================
|
|
10
|
+
|
|
11
|
+
TreeSAK SplitScore1 -i marker_seq -x fa -o SplitScore1_op_dir -jst 9 -f
|
|
12
|
+
|
|
13
|
+
# As described in the Undinarchaeota paper (Nina Dombrowski 2020, NC)
|
|
14
|
+
|
|
15
|
+
==============================================================================
|
|
16
|
+
'''
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def sep_path_basename_ext(file_in):
    """Split a file path into its folder, base name and extension.

    Args:
        file_in: path to a file (may be a bare file name).

    Returns:
        A 3-tuple ``(folder, base_name, extension)``. The folder is ``'.'``
        when ``file_in`` has no directory component, and the extension keeps
        its leading dot (it is ``''`` when the name has no extension).
    """
    folder, leaf_name = os.path.split(file_in)
    stem, suffix = os.path.splitext(leaf_name)
    # a bare file name lives in the current working directory
    return (folder if folder != '' else '.'), stem, suffix
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def SplitScore1(args):
    """Filter marker FASTA files by genome coverage and write the gene-tree commands.

    Every ``*.<x>`` file found in the input folder is treated as one marker
    (orthologous group). For each marker, the sequences that belong to the
    genomes to process are written to ``<o>/qualified_OGs``. Markers covering
    fewer than ``c`` percent of those genomes are dropped and listed in
    ``<o>/ignored_markers.txt``. For the remaining markers one line holding
    the mafft, BMGE and iqtree2 commands is written to
    ``<o>/cmds_mafft_bmge_iqtree.txt``. The commands are only written, not
    executed, and refer to the sequence/alignment files by bare file name.

    Args:
        args: dict with the keys
            'i' (folder with the marker FASTA files),
            'x' (FASTA file extension, without the dot),
            'u' (optional text file with one genome id per line, or None),
            'm' (iqtree2 model),
            'c' (coverage cutoff in percent),
            'f' (True to overwrite an existing output folder),
            'jst' (number of threads passed to mafft and iqtree2),
            'o' (output folder),
            'seq_named_by_gnm' (True if a sequence id is the genome id itself;
            otherwise the genome id is the sequence id without its last
            '_'-separated field).

    The function prints a message and calls ``exit()`` when the genome list
    file is missing, no input file is found, a marker holds more than one
    sequence id for the same genome, no genome is left to process, or the
    output folder exists and 'f' is not True.
    """
    # local import: shutil is not among this module's top-level imports
    import shutil

    oma_op_fasta = args['i']
    fasta_file_ext = args['x']
    interested_gnm_txt = args['u']
    iqtree_model = args['m']
    cov_cutoff = args['c']
    force_overwrite = args['f']
    num_of_js_threads = args['jst']
    op_dir = args['o']
    seq_named_by_gnm = args['seq_named_by_gnm']
    bmge_trim_model = 'BLOSUM30'
    bmge_entropy_score_cutoff = '0.55'

    def seq_id_to_gnm_id(seq_id):
        # the sequence id either is the genome id, or is the genome id plus a
        # trailing '_'-separated field
        if seq_named_by_gnm is True:
            return seq_id
        return '_'.join(seq_id.split('_')[:-1])

    ################################################################################

    interested_gnm_set = set()
    if interested_gnm_txt is not None:
        if not os.path.isfile(interested_gnm_txt):
            print('%s not found, program exited' % interested_gnm_txt)
            exit()
        with open(interested_gnm_txt) as interested_gnm_txt_handle:
            for each_gnm in interested_gnm_txt_handle:
                gnm_id = each_gnm.strip()
                # blank lines are not genome ids
                if gnm_id != '':
                    interested_gnm_set.add(gnm_id)

    ################################################################################

    # specify path to BMGE.jar (expected next to this file)
    current_file_path = os.path.dirname(os.path.realpath(__file__))
    pwd_bmge_jar = '%s/BMGE.jar' % current_file_path

    fa_file_re = '%s/*.%s' % (oma_op_fasta, fasta_file_ext)
    fa_file_list = glob.glob(fa_file_re)
    if len(fa_file_list) == 0:
        print('No file found in %s, program exited!' % oma_op_fasta)
        exit()

    # marker (file base name) -> set of sequence ids
    og_to_gene_dict = dict()
    for each_fa in fa_file_list:
        _, f_base, _ = sep_path_basename_ext(each_fa)
        og_to_gene_dict[f_base] = set(each_seq.id for each_seq in SeqIO.parse(each_fa, 'fasta'))

    ################################################################################

    gnm_to_process = set()
    for each_og in og_to_gene_dict:
        gene_set = og_to_gene_dict[each_og]
        gnm_set = set()
        for each_gene in gene_set:
            gnm_id = seq_id_to_gnm_id(each_gene)
            gnm_set.add(gnm_id)
            if (interested_gnm_txt is None) or (gnm_id in interested_gnm_set):
                gnm_to_process.add(gnm_id)

        # fewer genomes than sequence ids means at least two ids map to the same genome
        if len(gene_set) != len(gnm_set):
            print('More than one sequence from the same genome found in %s, program exited!' % each_og)
            exit()

    # stop before touching the output folder; the coverage calculation below
    # would otherwise divide by zero
    if len(gnm_to_process) == 0:
        print('No genome to process, program exited!')
        exit()

    ################################################################################

    # define file name
    qualified_og_dir = '%s/qualified_OGs' % op_dir
    cmds_in_one_line_txt = '%s/cmds_mafft_bmge_iqtree.txt' % op_dir
    ignored_marker_txt = '%s/ignored_markers.txt' % op_dir

    # create output folder
    if os.path.isdir(op_dir) is True:
        if force_overwrite is True:
            # no shell involved, so folder names with spaces or shell characters are safe
            shutil.rmtree(op_dir)
        else:
            print('%s exist, program exited!' % op_dir)
            exit()
    os.mkdir(op_dir)
    os.mkdir(qualified_og_dir)

    ################################################################################

    ignored_og_dict = dict()
    with open(cmds_in_one_line_txt, 'w') as cmds_in_one_line_txt_handle:
        for each_og in sorted(og_to_gene_dict):
            seq_file_in = '%s/%s.%s' % (oma_op_fasta, each_og, fasta_file_ext)
            file_out_seq = '%s/%s.%s' % (qualified_og_dir, each_og, fasta_file_ext)
            file_out_aln = '%s.aln' % each_og
            file_out_aln_trimmed = '%s_trimmed.aln' % each_og

            # keep only the sequences from the genomes to process
            current_gnm_set = set()
            with open(file_out_seq, 'w') as seq_file_out_handle:
                for each_seq in SeqIO.parse(seq_file_in, 'fasta'):
                    gnm_id = seq_id_to_gnm_id(each_seq.id)
                    if gnm_id in gnm_to_process:
                        current_gnm_set.add(gnm_id)
                        seq_file_out_handle.write('>%s\n' % each_seq.id)
                        seq_file_out_handle.write('%s\n' % each_seq.seq)

            # 100.0 keeps this a true division under Python 2 as well
            cov_value = len(current_gnm_set) * 100.0 / len(gnm_to_process)
            cov_value = float("{0:.2f}".format(cov_value))

            if cov_value < cov_cutoff:
                report_str = 'Ignored %s, contains proteins from %s (%s%s) genomes, < %s%s.' % (each_og, len(current_gnm_set), cov_value, '%', cov_cutoff, '%')
                ignored_og_dict[each_og] = report_str
                os.remove(file_out_seq)
            else:
                # align, trim and iqtree
                mafft_cmd = 'mafft-einsi --thread %s --quiet %s.%s > %s' % (num_of_js_threads, each_og, fasta_file_ext, file_out_aln)
                bmge_cmd = 'java -jar %s -i %s -m %s -t AA -h %s -of %s' % (pwd_bmge_jar, file_out_aln, bmge_trim_model, bmge_entropy_score_cutoff, file_out_aln_trimmed)
                iqtree_cmd = 'iqtree2 -s %s --seqtype AA -m %s -B 1000 --wbtl --bnni --prefix %s -T %s --quiet' % (file_out_aln_trimmed, iqtree_model, each_og, num_of_js_threads)
                # Undinarchaeota illuminate DPANN phylogeny and the impact of gene transfer on archaeal evolution, settings: -m LG+G -bb 1000 -wbtl -bnni
                cmds_in_one_line_txt_handle.write('%s; %s; %s\n' % (mafft_cmd, bmge_cmd, iqtree_cmd))

    # report ignored markers
    if len(ignored_og_dict) > 0:
        print('The following %s markers were ignored due to low genome coverage, see details in %s:' % (len(ignored_og_dict), ignored_marker_txt))
        print('\n'.join(sorted(ignored_og_dict)))
        with open(ignored_marker_txt, 'w') as ignored_marker_txt_handle:
            for each_ignored_marker in sorted(ignored_og_dict):
                ignored_marker_txt_handle.write(ignored_og_dict[each_ignored_marker] + '\n')

    # report
    print('You will need to execute the commands exported to the following file before moving to SplitScore2')
    print(cmds_in_one_line_txt)
    print('Done!')
|
|
162
|
+
|
|
163
|
+
|
|
164
|
+
if __name__ == '__main__':

    # Stand-alone entry point: parse the command-line options and pass them to
    # SplitScore1() as a plain dict keyed by option name.
    SplitScore1_parser = argparse.ArgumentParser()
    # -i is used by SplitScore1() as a folder that is searched for '*.<x>' files
    SplitScore1_parser.add_argument('-i', required=True, help='orthologous gene sequence')
    SplitScore1_parser.add_argument('-x', required=True, help='fasta file extension')
    SplitScore1_parser.add_argument('-o', required=True, help='output directory')
    SplitScore1_parser.add_argument('-u', required=False, default=None, help='interested genomes, no file extension')
    SplitScore1_parser.add_argument('-m', required=False, default='LG+G', help='iqtree_model, default: LG+G')
    SplitScore1_parser.add_argument('-c', required=False, type=int, default=75, help='coverage cutoff, default: 75')
    SplitScore1_parser.add_argument('-f', required=False, action="store_true", help='force overwrite')
    SplitScore1_parser.add_argument('-seq_named_by_gnm', required=False, action="store_true", help='named_by_gnm, specify if sequence named by gnm')
    # the same thread count is also written into the mafft command by SplitScore1()
    SplitScore1_parser.add_argument('-jst', required=False, type=int, default=1, help='num of threads for iqtree2, default: 1')
    args = vars(SplitScore1_parser.parse_args())
    SplitScore1(args)
|
|
@@ -0,0 +1,148 @@
|
|
|
1
|
+
from __future__ import print_function
|
|
2
|
+
import os
|
|
3
|
+
import argparse
|
|
4
|
+
from Bio import SeqIO
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
# Example command lines for the OMA-based step 1 (a plain string constant).
# The thread option is written as -jst, the option SplitScore1OMA() reads from its args dict.
SplitScore1OMA_usage = '''
======================== SplitScore1OMA example commands ========================

# SplitScore1OMA
TreeSAK SplitScore1OMA -i OrthologousGroups.txt -s OrthologousGroupsFasta -o step1_op_dir -jst 6 -f
TreeSAK SplitScore1OMA -i OrthologousGroups.txt -s OrthologousGroupsFasta -o step1_op_dir -jst 6 -f -u interested_gnm.txt
# Please ensure that all the commands in iqtree_cmds.txt have been executed before proceeding to step 2.

=================================================================================
'''
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def select_seq(seq_file, seq_id_list, output_file):
    """Copy the selected records of a FASTA file into a new FASTA file.

    Args:
        seq_file: path to the input FASTA file.
        seq_id_list: iterable of sequence ids to keep.
        output_file: path of the FASTA file to write; each kept record is
            written as a '>id' line followed by its sequence on one line.
    """
    # build the lookup once so every membership test is O(1), however long the id list is
    seq_id_set = set(seq_id_list)

    # the with-block closes the output file even if parsing fails part-way
    with open(output_file, 'w') as output_file_handle:
        for seq_record in SeqIO.parse(seq_file, 'fasta'):
            seq_id = seq_record.id
            if seq_id in seq_id_set:
                output_file_handle.write('>%s\n' % seq_id)
                output_file_handle.write('%s\n' % str(seq_record.seq))
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def get_gene_tree(oma_op_txt, oma_op_fasta, interested_gnm_txt, cov_cutoff, oma_op_fasta_qualified, iqtree_model, num_of_js_threads, force_overwrite, get_gene_tree_cmd_txt):
    """Select well-covered OMA groups and write the commands to infer their gene trees.

    Non-comment lines of ``oma_op_txt`` are read as tab-separated records: a
    group id followed by one field per gene. The gene id is taken from the
    text after the first ':' up to the first space, and the genome id is the
    gene id without its last '_'-separated field. Groups whose genes cover at
    least ``cov_cutoff`` percent of the interested genomes are kept. Their
    sequences are placed in ``oma_op_fasta_qualified`` and one line of mafft,
    trimal and iqtree2 commands per group is written to
    ``get_gene_tree_cmd_txt``. The commands are only written, not executed,
    and have the ``oma_op_fasta_qualified`` prefix removed from their paths.

    Args:
        oma_op_txt: group table described above.
        oma_op_fasta: folder holding one ``OG<number>.fa`` file per group.
        interested_gnm_txt: text file with one genome id per line, or None to
            use every genome found in ``oma_op_txt``.
        cov_cutoff: minimum genome coverage in percent.
        oma_op_fasta_qualified: output folder for the selected sequences.
        iqtree_model: model passed to iqtree2.
        num_of_js_threads: thread count passed to mafft and iqtree2.
        force_overwrite: True to replace an existing output folder.
        get_gene_tree_cmd_txt: path of the command file to write.

    The function prints a message and calls ``exit()`` when there is no genome
    to process, or when the output folder exists and ``force_overwrite`` is
    not True.
    """
    # local import: shutil is not among this module's top-level imports
    import shutil

    # read the group table once: (group id, gene ids) per record, plus every genome seen
    group_gene_list = []
    genome_id_set = set()
    with open(oma_op_txt) as oma_op_txt_handle:
        for each_group in oma_op_txt_handle:
            if each_group.startswith('#'):
                continue
            each_group_split = each_group.strip().split('\t')
            current_gene_list = [i.split(':')[1].split(' ')[0] for i in each_group_split[1:]]
            group_gene_list.append((each_group_split[0], current_gene_list))
            for each_gene in current_gene_list:
                genome_id_set.add('_'.join(each_gene.split('_')[:-1]))

    if interested_gnm_txt is not None:
        interested_gnm_set = set()
        with open(interested_gnm_txt) as interested_gnm_txt_handle:
            for each_gnm in interested_gnm_txt_handle:
                gnm_id = each_gnm.strip()
                # blank lines are not genome ids
                if gnm_id != '':
                    interested_gnm_set.add(gnm_id)
    else:
        interested_gnm_set = genome_id_set

    # the coverage calculation below would otherwise divide by zero
    if len(interested_gnm_set) == 0:
        print('No genome to process, program exited!')
        exit()

    # create output folder
    if os.path.isdir(oma_op_fasta_qualified) is True:
        if force_overwrite is True:
            # no shell involved, so folder names with spaces or shell characters are safe
            shutil.rmtree(oma_op_fasta_qualified)
        else:
            print('%s already exist, program exited!' % oma_op_fasta_qualified)
            exit()
    os.mkdir(oma_op_fasta_qualified)

    # filter OMA output
    qualified_grp_to_gene_dict = dict()
    for group_id, current_gene_list in group_gene_list:
        current_gene_list_interested = [gene for gene in current_gene_list if '_'.join(gene.split('_')[:-1]) in interested_gnm_set]
        # 100.0 keeps this a true division under Python 2 as well
        current_cov = len(current_gene_list_interested) * 100.0 / len(interested_gnm_set)
        if current_cov >= cov_cutoff:
            qualified_grp_to_gene_dict[group_id] = current_gene_list_interested

    print('The number of orthologous groups with coverage >= %s is %s.' % (cov_cutoff, len(qualified_grp_to_gene_dict)))

    # A group file can be copied unfiltered only when every genome in the table
    # is interested. Comparing set sizes is not enough: a genome list of the
    # same size but with different ids would let unwanted sequences through.
    copy_whole_file = genome_id_set.issubset(interested_gnm_set)

    # prepare commands for getting gene tree
    with open(get_gene_tree_cmd_txt, 'w') as get_gene_tree_cmd_txt_handle:
        for qualified_grp in sorted(qualified_grp_to_gene_dict):
            # drop the 'OMA' tag and the zero padding; lstrip also copes with an all-zero id
            group_id_only_num = qualified_grp.replace('OMA', '').lstrip('0')

            # define file name
            og_id = 'OG%s' % group_id_only_num
            pwd_seq_file_in = '%s/%s.fa' % (oma_op_fasta, og_id)
            pwd_og_seq = '%s/%s.fa' % (oma_op_fasta_qualified, og_id)
            pwd_og_aln = '%s/%s.aln' % (oma_op_fasta_qualified, og_id)
            pwd_og_aln_trimmed = '%s/%s_trimmed.aln' % (oma_op_fasta_qualified, og_id)

            # get sequence
            if copy_whole_file:
                shutil.copyfile(pwd_seq_file_in, pwd_og_seq)
            else:
                select_seq(pwd_seq_file_in, qualified_grp_to_gene_dict[qualified_grp], pwd_og_seq)

            # align, trim and iqtree
            mafft_cmd = 'mafft-einsi --thread %s --quiet %s > %s' % (num_of_js_threads, pwd_og_seq, pwd_og_aln)
            trimal_cmd = 'trimal -in %s -out %s -automated1' % (pwd_og_aln, pwd_og_aln_trimmed)
            iqtree_cmd = 'iqtree2 -s %s --seqtype AA -m %s -T %s -B 1000 --quiet --wbtl --prefix %s/%s' % (pwd_og_aln_trimmed, iqtree_model, num_of_js_threads, oma_op_fasta_qualified, og_id)
            cmds_one_line = '%s; %s; %s' % (mafft_cmd, trimal_cmd, iqtree_cmd)
            # write the paths relative to the qualified-OG folder
            get_gene_tree_cmd_txt_handle.write(cmds_one_line.replace((oma_op_fasta_qualified + '/'), '') + '\n')
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
def SplitScore1OMA(args):
    """Run SplitScore step 1 on OMA outputs.

    Creates the output folder and calls get_gene_tree(), which fills
    ``<o>/qualified_OGs`` and writes the gene-tree commands to
    ``<o>/iqtree_cmds.txt``.

    Args:
        args: dict with the keys
            'i' (group table, passed to get_gene_tree as oma_op_txt),
            's' (folder with the group FASTA files),
            'u' (optional text file with interested genome ids, or None),
            'm' (iqtree2 model),
            'c' (coverage cutoff in percent),
            'f' (True to overwrite an existing output folder),
            'jst' (number of threads),
            'o' (output folder).

    The function prints a message and calls ``exit()`` when the output folder
    exists and 'f' is not True.
    """
    # local import: shutil is not among this module's top-level imports
    import shutil

    oma_op_txt = args['i']
    oma_op_fasta = args['s']
    interested_gnm_txt = args['u']
    iqtree_model = args['m']
    cov_cutoff = args['c']
    force_overwrite = args['f']
    num_of_js_threads = args['jst']
    step_1_op_dir = args['o']

    # define file name
    qualified_og_dir = '%s/qualified_OGs' % step_1_op_dir
    iqtree_cmds_txt = '%s/iqtree_cmds.txt' % step_1_op_dir

    # create output folder
    if os.path.isdir(step_1_op_dir) is True:
        if force_overwrite is True:
            # no shell involved, so folder names with spaces or shell characters are safe
            shutil.rmtree(step_1_op_dir)
        else:
            print('%s exist, program exited!' % step_1_op_dir)
            exit()
    os.mkdir(step_1_op_dir)
    # qualified_og_dir is deliberately NOT created here: get_gene_tree() creates
    # it itself and exits if the folder already exists without force overwrite,
    # so pre-creating it made every run without -f abort.

    # get get_gene_tree
    get_gene_tree(oma_op_txt, oma_op_fasta, interested_gnm_txt, cov_cutoff, qualified_og_dir, iqtree_model, num_of_js_threads, force_overwrite, iqtree_cmds_txt)
|
|
134
|
+
|
|
135
|
+
|
|
136
|
+
if __name__ == '__main__':

    # Stand-alone entry point: parse the command-line options and pass them to
    # SplitScore1OMA() as a plain dict keyed by option name.
    SplitScore1OMA_parser = argparse.ArgumentParser()
    SplitScore1OMA_parser.add_argument('-i', required=True, help='OrthologousGroups.txt, produced by OMA')
    SplitScore1OMA_parser.add_argument('-s', required=True, help='OrthologousGroupsFasta, produced by OMA')
    SplitScore1OMA_parser.add_argument('-u', required=False, default= None, help='ID of interested genomes, no file extension')
    SplitScore1OMA_parser.add_argument('-o', required=True, help='output directory')
    SplitScore1OMA_parser.add_argument('-m', required=False, default='LG+G+I', help='iqtree_model, default: LG+G+I')
    SplitScore1OMA_parser.add_argument('-c', required=False, type=int, default=80, help='coverage cutoff, default: 80')
    SplitScore1OMA_parser.add_argument('-f', required=False, action="store_true", help='force overwrite')
    # the thread count ends up in both the mafft and the iqtree2 command written by get_gene_tree()
    SplitScore1OMA_parser.add_argument('-jst', required=False, type=int, default=1, help='num of threads for inferring gene tree, default: 1')
    args = vars(SplitScore1OMA_parser.parse_args())
    SplitScore1OMA(args)
|