treesak 1.51.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of treesak might be problematic. Click here for more details.
- TreeSAK/ALE.py +63 -0
- TreeSAK/ALE1.py +268 -0
- TreeSAK/ALE2.py +168 -0
- TreeSAK/ALE2RTC.py +30 -0
- TreeSAK/ALE3.py +205 -0
- TreeSAK/ALE4.py +636 -0
- TreeSAK/ALE5.py +210 -0
- TreeSAK/ALE6.py +401 -0
- TreeSAK/ALE7.py +126 -0
- TreeSAK/ALE_backup.py +1081 -0
- TreeSAK/AssessCVG.py +128 -0
- TreeSAK/AssessMarker.py +306 -0
- TreeSAK/AssessMarkerDeltaLL.py +257 -0
- TreeSAK/AssessMarkerPA.py +317 -0
- TreeSAK/AssessPB.py +130 -0
- TreeSAK/BMGE.jar +0 -0
- TreeSAK/BMGE.py +49 -0
- TreeSAK/CompareMCMC.py +138 -0
- TreeSAK/ConcateMSA.py +111 -0
- TreeSAK/ConvertMSA.py +135 -0
- TreeSAK/Dir.rb +82 -0
- TreeSAK/ExtractMarkerSeq.py +263 -0
- TreeSAK/FastRoot.py +1175 -0
- TreeSAK/FastRoot_backup.py +1122 -0
- TreeSAK/FigTree.py +34 -0
- TreeSAK/GTDB_tree.py +76 -0
- TreeSAK/GeneTree.py +142 -0
- TreeSAK/KEGG_Luo17.py +807 -0
- TreeSAK/LcaToLeaves.py +66 -0
- TreeSAK/MarkerRef2Tree.py +616 -0
- TreeSAK/MarkerRef2Tree_backup.py +628 -0
- TreeSAK/MarkerSeq2Tree.py +290 -0
- TreeSAK/MarkerSeq2Tree_backup.py +259 -0
- TreeSAK/ModifyTopo.py +116 -0
- TreeSAK/Newick_tree_plotter.py +79 -0
- TreeSAK/OMA.py +170 -0
- TreeSAK/OMA2.py +212 -0
- TreeSAK/OneLineAln.py +50 -0
- TreeSAK/PB.py +155 -0
- TreeSAK/PMSF.py +106 -0
- TreeSAK/PhyloBiAssoc.R +84 -0
- TreeSAK/PhyloBiAssoc.py +167 -0
- TreeSAK/PlotMCMC.py +41 -0
- TreeSAK/PlotMcmcNode.py +152 -0
- TreeSAK/PlotMcmcNode_old.py +252 -0
- TreeSAK/RootTree.py +101 -0
- TreeSAK/RootTreeGTDB214.py +288 -0
- TreeSAK/RootTreeGTDB220.py +300 -0
- TreeSAK/RootTreeGTDB226.py +300 -0
- TreeSAK/SequentialDating.py +16 -0
- TreeSAK/SingleAleHGT.py +157 -0
- TreeSAK/SingleLinePhy.py +50 -0
- TreeSAK/SliceMSA.py +142 -0
- TreeSAK/SplitScore.py +19 -0
- TreeSAK/SplitScore1.py +178 -0
- TreeSAK/SplitScore1OMA.py +148 -0
- TreeSAK/SplitScore2.py +597 -0
- TreeSAK/TaxaCountStats.R +256 -0
- TreeSAK/TaxonTree.py +47 -0
- TreeSAK/TreeSAK_config.py +32 -0
- TreeSAK/VERSION +158 -0
- TreeSAK/VisHPD95.R +45 -0
- TreeSAK/VisHPD95.py +200 -0
- TreeSAK/__init__.py +0 -0
- TreeSAK/ale_parser.py +74 -0
- TreeSAK/ale_splitter.py +63 -0
- TreeSAK/alignment_pruner.pl +1471 -0
- TreeSAK/assessOG.py +45 -0
- TreeSAK/catfasta2phy.py +140 -0
- TreeSAK/cogTree.py +185 -0
- TreeSAK/compare_trees.R +30 -0
- TreeSAK/compare_trees.py +255 -0
- TreeSAK/dating.py +264 -0
- TreeSAK/dating_ss.py +361 -0
- TreeSAK/deltall.py +82 -0
- TreeSAK/do_rrtc.rb +464 -0
- TreeSAK/fa2phy.py +42 -0
- TreeSAK/format_leaf_name.py +70 -0
- TreeSAK/gap_stats.py +38 -0
- TreeSAK/get_SCG_tree.py +742 -0
- TreeSAK/get_arCOG_seq.py +97 -0
- TreeSAK/global_functions.py +222 -0
- TreeSAK/gnm_leaves.py +43 -0
- TreeSAK/iTOL.py +791 -0
- TreeSAK/iTOL_gene_tree.py +80 -0
- TreeSAK/itol_msa_stats.py +56 -0
- TreeSAK/keep_highest_rrtc.py +37 -0
- TreeSAK/koTree.py +194 -0
- TreeSAK/label_tree.R +75 -0
- TreeSAK/label_tree.py +121 -0
- TreeSAK/mad.py +708 -0
- TreeSAK/mcmc2tree.py +58 -0
- TreeSAK/mcmcTC copy.py +92 -0
- TreeSAK/mcmcTC.py +104 -0
- TreeSAK/mcmctree_vs_reltime.R +44 -0
- TreeSAK/mcmctree_vs_reltime.py +252 -0
- TreeSAK/merge_pdf.py +32 -0
- TreeSAK/pRTC.py +56 -0
- TreeSAK/parse_mcmctree.py +198 -0
- TreeSAK/parse_reltime.py +141 -0
- TreeSAK/phy2fa.py +37 -0
- TreeSAK/plot_distruibution_th.py +165 -0
- TreeSAK/prep_mcmctree_ctl.py +92 -0
- TreeSAK/print_leaves.py +32 -0
- TreeSAK/pruneMSA.py +63 -0
- TreeSAK/recode.py +73 -0
- TreeSAK/remove_bias.R +112 -0
- TreeSAK/rename_leaves.py +77 -0
- TreeSAK/replace_clade.py +55 -0
- TreeSAK/root_with_out_group.py +84 -0
- TreeSAK/run_TaxaCountStats_R_s1.py +455 -0
- TreeSAK/subsample_drep_gnms.py +74 -0
- TreeSAK/subset.py +69 -0
- TreeSAK/subset_tree_stupid_old_way.py +193 -0
- TreeSAK/supertree.py +330 -0
- TreeSAK/tmp_1.py +19 -0
- TreeSAK/tmp_2.py +19 -0
- TreeSAK/tmp_3.py +120 -0
- TreeSAK/weighted_rand.rb +23 -0
- treesak-1.51.2.data/scripts/TreeSAK +950 -0
- treesak-1.51.2.dist-info/LICENSE +674 -0
- treesak-1.51.2.dist-info/METADATA +27 -0
- treesak-1.51.2.dist-info/RECORD +125 -0
- treesak-1.51.2.dist-info/WHEEL +5 -0
- treesak-1.51.2.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,148 @@
|
|
|
1
|
+
from __future__ import print_function
|
|
2
|
+
import os
|
|
3
|
+
import argparse
|
|
4
|
+
from Bio import SeqIO
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
# Example invocations shown to the user. The thread option is spelled `-jst`
# here because that is the flag this module's argument parser defines; the
# parser has no `-t` option.
SplitScore1OMA_usage = '''
======================== SplitScore1OMA example commands ========================

# SplitScore1
TreeSAK SplitScore1OMA -i OrthologousGroups.txt -s OrthologousGroupsFasta -o step1_op_dir -jst 6 -f
TreeSAK SplitScore1OMA -i OrthologousGroups.txt -s OrthologousGroupsFasta -o step1_op_dir -jst 6 -f -u interested_gnm.txt
# Please ensure that all the commands in iqtree_cmds.txt have been executed before proceeding to step 2.

=================================================================================
'''
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def select_seq(seq_file, seq_id_list, output_file):
    """Copy selected FASTA records from seq_file into output_file.

    Args:
        seq_file: path of the input FASTA file.
        seq_id_list: iterable of record ids to keep.
        output_file: path of the FASTA file to write (overwritten if present).

    Records are written in the order they occur in seq_file. Only the record
    id is written to the header line (any description is dropped) and each
    sequence is written on a single line.
    """
    # Build the lookup once: set membership is O(1) per record, a list is O(n).
    wanted_ids = set(seq_id_list)

    # The context manager closes the output even if parsing or writing fails.
    with open(output_file, 'w') as output_file_handle:
        for seq_record in SeqIO.parse(seq_file, 'fasta'):
            seq_id = seq_record.id
            if seq_id in wanted_ids:
                output_file_handle.write('>%s\n' % seq_id)
                output_file_handle.write('%s\n' % str(seq_record.seq))
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def get_gene_tree(oma_op_txt, oma_op_fasta, interested_gnm_txt, cov_cutoff, oma_op_fasta_qualified, iqtree_model, num_of_js_threads, force_overwrite, get_gene_tree_cmd_txt):
    """Select well-covered orthologous groups and write their gene-tree commands.

    Args:
        oma_op_txt: tab-separated group table; lines starting with '#' are
            skipped. Column 1 is the group id, every further column is parsed
            as '<prefix>:<gene_id>[ <extra>]', and the genome id is the gene id
            with its last '_'-separated field removed.
        oma_op_fasta: directory holding one 'OG<number>.fa' file per group.
        interested_gnm_txt: file with one genome id per line, or None to use
            every genome found in oma_op_txt.
        cov_cutoff: minimum percentage of interested genomes a group must
            contain to be kept.
        oma_op_fasta_qualified: output directory for the kept groups.
        iqtree_model: value passed to iqtree2 '-m'.
        num_of_js_threads: thread count passed to mafft and iqtree2.
        force_overwrite: if True, an existing output directory is replaced.
        get_gene_tree_cmd_txt: file that receives one command line per group.

    Raises:
        ValueError: if interested_gnm_txt is given but contains no genome id.
        SystemExit: if the output directory exists and force_overwrite is not
            True.

    The commands are only written, not executed. The output-directory prefix
    is removed from them, presumably so they can be run from inside that
    directory.
    """
    import shutil  # function-scope import: only this function needs it

    # Read the group table once; the handle is closed by the context manager.
    parsed_groups = []
    genome_id_set = set()
    with open(oma_op_txt) as oma_op_txt_handle:
        for each_group in oma_op_txt_handle:
            if each_group.startswith('#') or not each_group.strip():
                continue
            each_group_split = each_group.strip().split('\t')
            gene_list = [i.split(':')[1].split(' ')[0] for i in each_group_split[1:]]
            gnm_list = ['_'.join(gene.split('_')[:-1]) for gene in gene_list]
            genome_id_set.update(gnm_list)
            parsed_groups.append((each_group_split[0], gene_list, gnm_list))

    if interested_gnm_txt is not None:
        # Blank lines are ignored; otherwise '' would count as a genome and
        # inflate the coverage denominator.
        with open(interested_gnm_txt) as interested_gnm_txt_handle:
            interested_gnm_set = {line.strip() for line in interested_gnm_txt_handle if line.strip()}
        if not interested_gnm_set:
            raise ValueError('no genome id found in %s' % interested_gnm_txt)
    else:
        interested_gnm_set = genome_id_set

    # create output folder
    if os.path.isdir(oma_op_fasta_qualified):
        if force_overwrite is True:
            shutil.rmtree(oma_op_fasta_qualified)
        else:
            print('%s already exist, program exited!' % oma_op_fasta_qualified)
            raise SystemExit()
    os.mkdir(oma_op_fasta_qualified)

    # filter OMA output
    qualified_grp_to_gene_dict = dict()
    for group_id, gene_list, gnm_list in parsed_groups:
        current_gene_list_interested = [gene for gene, gnm in zip(gene_list, gnm_list) if gnm in interested_gnm_set]
        if interested_gnm_set:
            current_cov = len(current_gene_list_interested) * 100 / len(interested_gnm_set)
        else:
            current_cov = 0
        if current_cov >= cov_cutoff:
            qualified_grp_to_gene_dict[group_id] = current_gene_list_interested

    print('The number of orthologous groups with coverage >= %s is %s.' % (cov_cutoff, len(qualified_grp_to_gene_dict)))

    # The whole FASTA file can be copied only when every genome is of interest.
    # Compare the sets themselves: equal sizes do not imply equal content.
    keep_all_genomes = (interested_gnm_set == genome_id_set)

    # prepare commands for getting gene tree
    with open(get_gene_tree_cmd_txt, 'w') as get_gene_tree_cmd_txt_handle:
        for qualified_grp in sorted(qualified_grp_to_gene_dict):

            # 'OMA00012' -> 'OG12'; lstrip cannot fail on an all-zero id
            og_id = 'OG%s' % qualified_grp.replace('OMA', '').lstrip('0')

            # define file name
            pwd_seq_file_in = '%s/%s.fa' % (oma_op_fasta, og_id)
            pwd_og_seq = '%s/%s.fa' % (oma_op_fasta_qualified, og_id)
            pwd_og_aln = '%s/%s.aln' % (oma_op_fasta_qualified, og_id)
            pwd_og_aln_trimmed = '%s/%s_trimmed.aln' % (oma_op_fasta_qualified, og_id)

            # get sequence
            if keep_all_genomes:
                shutil.copy(pwd_seq_file_in, pwd_og_seq)
            else:
                select_seq(pwd_seq_file_in, qualified_grp_to_gene_dict[qualified_grp], pwd_og_seq)

            # align, trim and iqtree
            mafft_cmd = 'mafft-einsi --thread %s --quiet %s > %s' % (num_of_js_threads, pwd_og_seq, pwd_og_aln)
            trimal_cmd = 'trimal -in %s -out %s -automated1' % (pwd_og_aln, pwd_og_aln_trimmed)
            iqtree_cmd = 'iqtree2 -s %s --seqtype AA -m %s -T %s -B 1000 --quiet --wbtl --prefix %s/%s' % (pwd_og_aln_trimmed, iqtree_model, num_of_js_threads, oma_op_fasta_qualified, og_id)
            cmds_one_line = '%s; %s; %s' % (mafft_cmd, trimal_cmd, iqtree_cmd)
            get_gene_tree_cmd_txt_handle.write(cmds_one_line.replace((oma_op_fasta_qualified + '/'), '') + '\n')
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
def SplitScore1OMA(args):
    """Run step 1: prepare the output directory and write the gene-tree commands.

    Args:
        args: mapping with the keys 'i' (group table), 's' (FASTA directory),
            'u' (interested-genome file or None), 'm' (iqtree model),
            'c' (coverage cutoff), 'f' (force overwrite), 'jst' (threads) and
            'o' (output directory).

    Raises:
        SystemExit: if the output directory already exists and 'f' is not True.

    Inside the output directory, get_gene_tree is given 'qualified_OGs' as its
    output folder and 'iqtree_cmds.txt' as its command file.
    """
    import shutil  # function-scope import: only needed for the overwrite path

    oma_op_txt = args['i']
    oma_op_fasta = args['s']
    interested_gnm_txt = args['u']
    iqtree_model = args['m']
    cov_cutoff = args['c']
    force_overwrite = args['f']
    num_of_js_threads = args['jst']
    step_1_op_dir = args['o']

    # define file name
    qualified_og_dir = '%s/qualified_OGs' % step_1_op_dir
    iqtree_cmds_txt = '%s/iqtree_cmds.txt' % step_1_op_dir

    # create output folder
    if os.path.isdir(step_1_op_dir):
        if force_overwrite is True:
            shutil.rmtree(step_1_op_dir)
        else:
            print('%s exist, program exited!' % step_1_op_dir)
            raise SystemExit()
    os.mkdir(step_1_op_dir)

    # qualified_og_dir is deliberately not created here. get_gene_tree creates
    # it itself and aborts when it already exists without force_overwrite, so
    # pre-creating it made every run without -f exit with "already exist".
    get_gene_tree(oma_op_txt, oma_op_fasta, interested_gnm_txt, cov_cutoff, qualified_og_dir, iqtree_model, num_of_js_threads, force_overwrite, iqtree_cmds_txt)
|
|
134
|
+
|
|
135
|
+
|
|
136
|
+
if __name__ == '__main__':

    # Command-line options as (flag, add_argument keyword arguments), listed
    # in the order they are registered with the parser.
    option_specs = [
        ('-i', dict(required=True, help='OrthologousGroups.txt, produced by OMA')),
        ('-s', dict(required=True, help='OrthologousGroupsFasta, produced by OMA')),
        ('-u', dict(required=False, default=None, help='ID of interested genomes, no file extension')),
        ('-o', dict(required=True, help='output directory')),
        ('-m', dict(required=False, default='LG+G+I', help='iqtree_model, default: LG+G+I')),
        ('-c', dict(required=False, type=int, default=80, help='coverage cutoff, default: 80')),
        ('-f', dict(required=False, action="store_true", help='force overwrite')),
        ('-jst', dict(required=False, type=int, default=1, help='num of threads for inferring gene tree, default: 1')),
    ]

    cli_parser = argparse.ArgumentParser()
    for option_flag, option_kwargs in option_specs:
        cli_parser.add_argument(option_flag, **option_kwargs)

    # Hand the parsed options over as a plain dict keyed by flag name.
    SplitScore1OMA(vars(cli_parser.parse_args()))
|