treesak 1.53.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- TreeSAK/ALE.py +63 -0
- TreeSAK/ALE1.py +268 -0
- TreeSAK/ALE2.py +168 -0
- TreeSAK/ALE2RTC.py +30 -0
- TreeSAK/ALE3.py +205 -0
- TreeSAK/ALE4.py +636 -0
- TreeSAK/ALE5.py +210 -0
- TreeSAK/ALE6.py +401 -0
- TreeSAK/ALE7.py +126 -0
- TreeSAK/ALE_backup.py +1081 -0
- TreeSAK/AssessCVG.py +128 -0
- TreeSAK/AssessMarker.py +306 -0
- TreeSAK/AssessMarkerDeltaLL.py +257 -0
- TreeSAK/AssessMarkerPA.py +317 -0
- TreeSAK/AssessPB.py +113 -0
- TreeSAK/BMGE.jar +0 -0
- TreeSAK/BMGE.py +49 -0
- TreeSAK/C60SR4.nex +127 -0
- TreeSAK/CompareMCMC.py +138 -0
- TreeSAK/ConcateMSA.py +111 -0
- TreeSAK/ConvertMSA.py +135 -0
- TreeSAK/Dir.rb +82 -0
- TreeSAK/ExtractMarkerSeq.py +263 -0
- TreeSAK/FastRoot.py +1175 -0
- TreeSAK/FastRoot_backup.py +1122 -0
- TreeSAK/FigTree.py +34 -0
- TreeSAK/GTDB_tree.py +76 -0
- TreeSAK/GeneTree.py +142 -0
- TreeSAK/KEGG_Luo17.py +807 -0
- TreeSAK/LcaToLeaves.py +66 -0
- TreeSAK/MarkerRef2Tree.py +616 -0
- TreeSAK/MarkerRef2Tree_backup.py +628 -0
- TreeSAK/MarkerSeq2Tree.py +299 -0
- TreeSAK/MarkerSeq2Tree_backup.py +259 -0
- TreeSAK/ModifyTopo.py +116 -0
- TreeSAK/Newick_tree_plotter.py +79 -0
- TreeSAK/OMA.py +170 -0
- TreeSAK/OMA2.py +212 -0
- TreeSAK/OneLineAln.py +50 -0
- TreeSAK/PB.py +155 -0
- TreeSAK/PMSF.py +115 -0
- TreeSAK/PhyloBiAssoc.R +84 -0
- TreeSAK/PhyloBiAssoc.py +167 -0
- TreeSAK/PlotMCMC.py +41 -0
- TreeSAK/PlotMcmcNode.py +152 -0
- TreeSAK/PlotMcmcNode_old.py +252 -0
- TreeSAK/RootTree.py +101 -0
- TreeSAK/RootTreeGTDB.py +371 -0
- TreeSAK/RootTreeGTDB214.py +288 -0
- TreeSAK/RootTreeGTDB220.py +300 -0
- TreeSAK/SequentialDating.py +16 -0
- TreeSAK/SingleAleHGT.py +157 -0
- TreeSAK/SingleLinePhy.py +50 -0
- TreeSAK/SliceMSA.py +142 -0
- TreeSAK/SplitScore.py +21 -0
- TreeSAK/SplitScore1.py +177 -0
- TreeSAK/SplitScore1OMA.py +148 -0
- TreeSAK/SplitScore2.py +608 -0
- TreeSAK/TaxaCountStats.R +256 -0
- TreeSAK/TaxonTree.py +47 -0
- TreeSAK/TreeSAK_config.py +32 -0
- TreeSAK/VERSION +164 -0
- TreeSAK/VisHPD95.R +45 -0
- TreeSAK/VisHPD95.py +200 -0
- TreeSAK/__init__.py +0 -0
- TreeSAK/ale_parser.py +74 -0
- TreeSAK/ale_splitter.py +63 -0
- TreeSAK/alignment_pruner.pl +1471 -0
- TreeSAK/assessOG.py +45 -0
- TreeSAK/batch_itol.py +171 -0
- TreeSAK/catfasta2phy.py +140 -0
- TreeSAK/cogTree.py +185 -0
- TreeSAK/compare_trees.R +30 -0
- TreeSAK/compare_trees.py +255 -0
- TreeSAK/dating.py +264 -0
- TreeSAK/dating_ss.py +361 -0
- TreeSAK/deltall.py +82 -0
- TreeSAK/do_rrtc.rb +464 -0
- TreeSAK/fa2phy.py +42 -0
- TreeSAK/filter_rename_ar53.py +118 -0
- TreeSAK/format_leaf_name.py +70 -0
- TreeSAK/gap_stats.py +38 -0
- TreeSAK/get_SCG_tree.py +742 -0
- TreeSAK/get_arCOG_seq.py +97 -0
- TreeSAK/global_functions.py +222 -0
- TreeSAK/gnm_leaves.py +43 -0
- TreeSAK/iTOL.py +791 -0
- TreeSAK/iTOL_gene_tree.py +80 -0
- TreeSAK/itol_msa_stats.py +56 -0
- TreeSAK/keep_highest_rrtc.py +37 -0
- TreeSAK/koTree.py +194 -0
- TreeSAK/label_gene_tree_by_gnm.py +34 -0
- TreeSAK/label_tree.R +75 -0
- TreeSAK/label_tree.py +121 -0
- TreeSAK/mad.py +708 -0
- TreeSAK/mcmc2tree.py +58 -0
- TreeSAK/mcmcTC copy.py +92 -0
- TreeSAK/mcmcTC.py +104 -0
- TreeSAK/mcmctree_vs_reltime.R +44 -0
- TreeSAK/mcmctree_vs_reltime.py +252 -0
- TreeSAK/merge_pdf.py +32 -0
- TreeSAK/pRTC.py +56 -0
- TreeSAK/parse_mcmctree.py +198 -0
- TreeSAK/parse_reltime.py +141 -0
- TreeSAK/phy2fa.py +37 -0
- TreeSAK/plot_distruibution_th.py +165 -0
- TreeSAK/prep_mcmctree_ctl.py +92 -0
- TreeSAK/print_leaves.py +32 -0
- TreeSAK/pruneMSA.py +63 -0
- TreeSAK/recode.py +73 -0
- TreeSAK/remove_bias.R +112 -0
- TreeSAK/rename_leaves.py +78 -0
- TreeSAK/replace_clade.py +55 -0
- TreeSAK/root_with_out_group.py +84 -0
- TreeSAK/run_TaxaCountStats_R_s1.py +455 -0
- TreeSAK/subsample_drep_gnms.py +74 -0
- TreeSAK/subset.py +69 -0
- TreeSAK/subset_tree_stupid_old_way.py +193 -0
- TreeSAK/supertree.py +330 -0
- TreeSAK/tmp_1.py +19 -0
- TreeSAK/tmp_2.py +19 -0
- TreeSAK/tmp_3.py +120 -0
- TreeSAK/tmp_4.py +43 -0
- TreeSAK/tmp_5.py +12 -0
- TreeSAK/weighted_rand.rb +23 -0
- treesak-1.53.3.data/scripts/TreeSAK +955 -0
- treesak-1.53.3.dist-info/LICENSE +674 -0
- treesak-1.53.3.dist-info/METADATA +27 -0
- treesak-1.53.3.dist-info/RECORD +131 -0
- treesak-1.53.3.dist-info/WHEEL +5 -0
- treesak-1.53.3.dist-info/top_level.txt +1 -0
TreeSAK/dating_ss.py
ADDED
|
@@ -0,0 +1,361 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import argparse
|
|
3
|
+
import itertools
|
|
4
|
+
from ete3 import Tree
|
|
5
|
+
from Bio import AlignIO
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
# Usage text for the Dating_ss subcommand. The example lists every option that
# the argument parser in this module marks as required (-deltall, -aod, -og,
# -eu, -o, -to_test) so that it can be copied and run as written.
Dating_usage = '''
============================= Dating example commands =============================

# example commands
TreeSAK Dating_ss -deltall DeltaLL_stdout.txt -aod s11_marker_sets_by_DeltaLL -og out_group.txt -eu 27.nwk -to_test settings_to_test.txt -o s12_dating_wd -c 25-50-75-100 -mmn 20 -f

# format of the -to_test file: one parameter per line, name and comma-separated
# settings separated by whitespace, e.g. "clock 2,3"

===================================================================================
'''
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def sep_path_basename_ext(file_in):
    """Split a path into (directory, base name, extension).

    The directory is reported as '.' when the path has no directory part.
    The extension keeps its leading dot and is '' when there is none.
    """
    directory, leaf = os.path.split(file_in)
    stem, suffix = os.path.splitext(leaf)
    return (directory if directory != '' else '.'), stem, suffix
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def submit_js(js):
    """Submit a job script with ``qsub`` from the directory that contains it.

    The process temporarily changes into the script's directory (or stays in
    the current one when ``js`` has no directory part), runs
    ``qsub <script file name>`` through the shell, and then changes back.

    Args:
        js: path to the job script.
    """
    js_path, js_file_name = os.path.split(js)
    if js_path == '':
        js_path = '.'

    current_wd = os.getcwd()
    os.chdir(js_path)
    try:
        os.system('qsub %s' % js_file_name)
    finally:
        # restore the caller's working directory even if the submission is
        # interrupted, so later relative paths keep resolving correctly
        os.chdir(current_wd)
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def root_with_out_group(tree_file, out_group_txt, tree_file_rooted):
    """Root a tree on the common ancestor of the listed outgroup leaves.

    Args:
        tree_file: Newick tree to root; parsed by ete3 with ``format=1``.
        out_group_txt: text file holding one outgroup leaf id per line.
        tree_file_rooted: path the re-rooted tree is written to.
    """
    # Read the ids inside a context manager so the handle is closed promptly,
    # and drop blank lines: they would otherwise put an empty string into the
    # set of names handed to ete3.
    with open(out_group_txt) as out_group_handle:
        out_group_set = {each_og.strip() for each_og in out_group_handle if each_og.strip()}

    tre = Tree(tree_file, format=1)
    out_group_lca = tre.get_common_ancestor(out_group_set)
    tre.set_outgroup(out_group_lca)
    tre.write(outfile=tree_file_rooted)
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def replace_clades(main_tree, sub_tree, tree_out, quote_node_name):
    """Replace the clade of main_tree that spans sub_tree's leaves with sub_tree.

    The common ancestor of the subtree's leaf names is located in the main
    tree; if that ancestor holds a different number of leaves than the
    subtree, a message is printed and the program exits. Otherwise the
    ancestor is detached from its parent, the subtree is attached in its
    place, and the main tree is written to tree_out with format=8.
    """
    replacement = Tree(sub_tree, format=1, quoted_node_names=quote_node_name)
    replacement_leaves = replacement.get_leaf_names()
    host = Tree(main_tree)
    clade_to_drop = host.get_common_ancestor(replacement_leaves)

    same_size = len(clade_to_drop.get_leaf_names()) == len(replacement_leaves)
    if not same_size:
        print('LCA of subtree leaves in main tree contain extra leaves, program exited!')
        exit()

    # swap the old clade for the replacement under the same parent node
    parent = clade_to_drop.up
    parent.remove_child(clade_to_drop)
    parent.add_child(replacement)
    host.write(outfile=tree_out, format=8, quoted_node_names=quote_node_name)
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def prep_mcmctree_ctl(ctl_para_dict, mcmctree_ctl_file):
    """Write an MCMCtree control file from a dict of settings.

    Args:
        ctl_para_dict: settings keyed by control-file variable name. The keys
            'seqfile', 'treefile', 'mcmcfile', 'outfile', 'seqtype',
            'usedata' and 'clock' are required (a missing one raises
            KeyError); every other variable falls back to the default
            written below.
        mcmctree_ctl_file: path of the control file to create/overwrite.
    """
    with open(mcmctree_ctl_file, 'w') as ctl_file_handle:
        # the value stored under 'seed' belongs to the 'seed' variable; it was
        # previously emitted under a second 'finetune' label
        ctl_file_handle.write(' seed = %s\n' % ctl_para_dict.get('seed', '-1'))
        ctl_file_handle.write(' seqfile = %s\n' % ctl_para_dict['seqfile'])
        ctl_file_handle.write(' treefile = %s\n' % ctl_para_dict['treefile'])
        ctl_file_handle.write(' mcmcfile = %s\n' % ctl_para_dict['mcmcfile'])
        ctl_file_handle.write(' outfile = %s\n' % ctl_para_dict['outfile'])
        ctl_file_handle.write(' ndata = %s\n' % ctl_para_dict.get('ndata', 1))
        ctl_file_handle.write(' seqtype = %s * 0: nucleotides; 1:codons; 2:AAs\n' % ctl_para_dict['seqtype'])
        ctl_file_handle.write(' usedata = %s * 0: no data; 1:seq like; 2:normal approximation; 3:out.BV (in.BV)\n' % ctl_para_dict['usedata'])
        ctl_file_handle.write(' clock = %s * 1: global clock; 2: independent rates; 3: correlated rates\n' % ctl_para_dict['clock'])
        ctl_file_handle.write(' RootAge = %s * safe constraint on root age, used if no fossil for root.\n' % ctl_para_dict.get('RootAge', '<1.0'))
        ctl_file_handle.write(' model = %s * 0:JC69, 1:K80, 2:F81, 3:F84, 4:HKY85\n' % ctl_para_dict.get('model', 0))
        ctl_file_handle.write(' alpha = %s * alpha for gamma rates at sites\n' % ctl_para_dict.get('alpha', 0.5))
        ctl_file_handle.write(' ncatG = %s * No. categories in discrete gamma\n' % ctl_para_dict.get('ncatG', 4))
        ctl_file_handle.write(' cleandata = %s * remove sites with ambiguity data (1:yes, 0:no)?\n' % ctl_para_dict.get('cleandata', 0))
        ctl_file_handle.write(' BDparas = %s * birth, death, sampling\n' % ctl_para_dict.get('BDparas', '1 1 0.1'))
        ctl_file_handle.write(' kappa_gamma = %s * gamma prior for kappa\n' % ctl_para_dict.get('kappa_gamma', '6 2'))
        ctl_file_handle.write(' alpha_gamma = %s * gamma prior for alpha\n' % ctl_para_dict.get('alpha_gamma', '1 1'))
        ctl_file_handle.write(' rgene_gamma = %s * gammaDir prior for rate for genes\n' % ctl_para_dict.get('rgene_gamma', '1 50 1'))
        ctl_file_handle.write(' sigma2_gamma = %s * gammaDir prior for sigma^2 (for clock=2 or 3)\n' % ctl_para_dict.get('sigma2_gamma', '1 10 1'))
        ctl_file_handle.write(' finetune = %s * auto (0 or 1): times, musigma2, rates, mixing, paras, FossilErr\n' % ctl_para_dict.get('finetune', '1: .1 .1 .1 .1 .1 .1'))
        ctl_file_handle.write(' print = %s * 0: no mcmc sample; 1: everything except branch rates 2: everything\n' % ctl_para_dict.get('print', 1))
        ctl_file_handle.write(' burnin = %s\n' % ctl_para_dict.get('burnin', 50000))
        ctl_file_handle.write(' sampfreq = %s\n' % ctl_para_dict.get('sampfreq', 5))
        ctl_file_handle.write(' nsample = %s\n' % ctl_para_dict.get('nsample', 150000))
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
def get_parameter_combinations(para_to_test_dict):
    """Expand {parameter: [settings]} into every combination of settings.

    Parameters are taken in sorted name order and each parameter's settings
    in sorted order. The result maps a label to the settings of one
    combination; the label joins '<name><setting>' pieces with '_', with any
    space inside a piece replaced by '_'.
    """
    ordered_names = sorted(para_to_test_dict)
    setting_choices = [sorted(para_to_test_dict[name]) for name in ordered_names]

    para_dod = {}
    for chosen in itertools.product(*setting_choices):
        pieces = [('%s%s' % (name, setting)).replace(' ', '_')
                  for name, setting in zip(ordered_names, chosen)]
        para_dod['_'.join(pieces)] = dict(zip(ordered_names, chosen))

    return para_dod
|
|
127
|
+
|
|
128
|
+
|
|
129
|
+
def fa2phy(fasta_in, phy_out):
    """Rewrite a FASTA alignment as PHYLIP with each sequence on one line.

    The first output line holds the number of sequences and the alignment
    length. Each following line holds a sequence id, padded with spaces to
    two characters past the longest id, followed by the full sequence.
    """
    records = AlignIO.read(fasta_in, 'fasta')

    # width of the id column: longest id plus two spaces of separation
    id_column_width = max([len(record.id) for record in records] + [0]) + 2

    with open(phy_out, 'w') as phy_handle:
        phy_handle.write('%s %s\n' % (len(records), records.get_alignment_length()))
        for record in records:
            padded_id = record.id.ljust(id_column_width)
            phy_handle.write('%s%s\n' % (padded_id, str(record.seq)))
|
|
145
|
+
|
|
146
|
+
|
|
147
|
+
def _read_parameters_to_test(para_to_test):
    """Read 'name setting1,setting2,...' lines into {name: [setting, ...]}."""
    para_to_test_dict = dict()
    with open(para_to_test) as para_to_test_handle:
        for each_para in para_to_test_handle:
            each_para_split = each_para.strip().split()
            if not each_para_split:
                continue  # tolerate blank lines instead of failing on them
            para_to_test_dict[each_para_split[0]] = each_para_split[1].split(',')
    return para_to_test_dict


def _rank_markers_by_deltall(deltall_stdout_txt):
    """Return marker ids ordered by the sum of their two per-metric ranks."""
    deltall_op_dict = dict()
    with open(deltall_stdout_txt) as deltall_stdout_handle:
        for each_line in deltall_stdout_handle:
            if each_line.startswith(('WARNING:', 'awk:')):
                continue
            if not each_line.strip():
                continue  # tolerate blank lines instead of failing on them
            each_line_split = each_line.strip().split('\t')
            deltall_op_dict.setdefault(each_line_split[0], []).append(float(each_line_split[1]))

    # first and second value reported for each marker, kept to two decimals
    metric_1_dict = {m: float("{0:.2f}".format(v[0])) for m, v in deltall_op_dict.items()}
    metric_2_dict = {m: float("{0:.2f}".format(v[1])) for m, v in deltall_op_dict.items()}

    # metric 1 is ranked from largest to smallest, metric 2 from smallest to
    # largest; [::-1] (rather than reverse=True) keeps the tie order unchanged
    metric_1_ranked = sorted(metric_1_dict.items(), key=lambda item: item[1])[::-1]
    metric_2_ranked = sorted(metric_2_dict.items(), key=lambda item: item[1])
    metric_1_score_dict = {marker: rank for rank, (marker, _) in enumerate(metric_1_ranked, 1)}
    metric_2_score_dict = {marker: rank for rank, (marker, _) in enumerate(metric_2_ranked, 1)}

    overall_score_dict = {m: metric_1_score_dict[m] + metric_2_score_dict[m] for m in deltall_op_dict}
    return [k for k, v in sorted(overall_score_dict.items(), key=lambda item: item[1])]


def dating_ss(args):
    """Prepare MCMCtree working directories and job scripts per DeltaLL cutoff.

    For every cutoff in ``args['c']`` that keeps at least ``args['mmn']``
    markers, the matching concatenated alignment and C60 tree are taken from
    the ``args['aod']`` directory, the tree is rooted with the outgroup,
    the time-constrained ``args['eu']`` clade is spliced in, the root age is
    appended, and a "get BV" directory plus one dating directory per tested
    parameter combination are written under ``args['o']`` together with
    their shell job scripts.

    Args:
        args: dict with the keys 'deltall', 'aod', 'og', 'eu', 'o', 'c',
            'mmn', 'f', 'ra', 'qsub' and 'to_test' (see the argument parser
            at the bottom of this module). The control file requires a
            'clock' value that is not set here, so the 'to_test' file has
            to supply it.

    The program exits if the 'eu' tree is missing, or if the output folder
    exists and 'f' is not set.
    """
    deltall_stdout_txt = args['deltall']
    aod = args['aod']
    out_group_txt = args['og']
    eu_tree = args['eu']
    op_dir = args['o']
    deltall_keep_pct_str = args['c']
    min_marker_num = args['mmn']
    force_overwrite = args['f']
    root_age = args['ra']
    submit_job = args['qsub']
    para_to_test = args['to_test']
    js_cpu_num = 1
    quote_node_name = False

    para_to_test_dict = _read_parameters_to_test(para_to_test)
    print('Parameters to test: %s' % para_to_test_dict)

    if os.path.isfile(eu_tree) is False:
        print('%s not found, program exited!' % eu_tree)
        exit()

    deltall_keep_pct_list = [int(i) for i in deltall_keep_pct_str.split('-')]
    _, deltall_stdout_basename, _ = sep_path_basename_ext(deltall_stdout_txt)

    # part of the DeltaLL stdout file name shared by all files of this marker set
    marker_set_prefix = deltall_stdout_basename.split('_DeltaLL_stdout')[0]

    # create dir
    if os.path.isdir(op_dir) is True:
        if force_overwrite is True:
            os.system('rm -r %s' % op_dir)
        else:
            print('output folder detected, program exited!')
            exit()
    os.system('mkdir %s' % op_dir)

    # read in deltall_stdout_txt and rank the markers; only the number of
    # ranked markers is used below to size each cutoff
    marker_list_sorted_by_deltall = _rank_markers_by_deltall(deltall_stdout_txt)

    # get qualified marker list
    for each_keep_pct in deltall_keep_pct_list:
        marker_num_to_keep = round(len(marker_list_sorted_by_deltall)*each_keep_pct/100)

        if marker_num_to_keep < min_marker_num:
            print('Ignored DeltaLL cutoff at %s , the number of qualified markers (%s) less than %s' % (each_keep_pct, marker_num_to_keep, min_marker_num))
            continue

        prefix_base = '%s_DeltaLL_%s' % (marker_set_prefix, each_keep_pct)
        aln_concatenated = '%s_concatenated.phy' % prefix_base
        aln_concatenated_in_aod_wd_fasta = '%s_concatenated.phy.fasta' % prefix_base
        c60_tree_file_rooted_with_time_final = '%s_rooted_with_time_final.treefile' % prefix_base
        pwd_c60_tree_file = '%s/%s_iqtree_C60_PMSF/concatenated.treefile' % (aod, prefix_base)
        pwd_c60_tree_file_renamed = '%s/%s_raw.treefile' % (op_dir, prefix_base)
        pwd_c60_tree_file_rooted = '%s/%s_rooted.treefile' % (op_dir, prefix_base)
        pwd_c60_tree_file_rooted_with_time = '%s/%s_rooted_with_time.treefile' % (op_dir, prefix_base)
        pwd_aln_concatenated_in_aod_wd_fasta = '%s/%s' % (aod, aln_concatenated_in_aod_wd_fasta)
        pwd_aln_concatenated_in_op_wd_phylip = '%s/%s' % (op_dir, aln_concatenated)
        pwd_c60_tree_file_rooted_with_time_final = '%s/%s' % (op_dir, c60_tree_file_rooted_with_time_final)
        get_BV_wd = '%s/%s_get_BV_wd' % (op_dir, prefix_base)
        pwd_aln_concatenated_in_bv_wd_phylip = '%s/%s' % (get_BV_wd, aln_concatenated)

        fa2phy(pwd_aln_concatenated_in_aod_wd_fasta, pwd_aln_concatenated_in_op_wd_phylip)
        os.system('cp %s %s' % (pwd_c60_tree_file, pwd_c60_tree_file_renamed))

        # root genome tree with outgroup
        root_with_out_group(pwd_c60_tree_file_renamed, out_group_txt, pwd_c60_tree_file_rooted)

        # add time constraints
        replace_clades(pwd_c60_tree_file_rooted, eu_tree, pwd_c60_tree_file_rooted_with_time, quote_node_name)

        # remove "NoName" from the rooted tree with time constraints
        with open(pwd_c60_tree_file_rooted_with_time) as rooted_with_time_handle:
            tree_str = rooted_with_time_handle.readline().strip().replace('NoName', '')

        # add root age
        tree_str = tree_str.replace(';', '<%s;' % root_age)
        tre_object = Tree(tree_str, format=8, quoted_node_names=quote_node_name)
        with open(pwd_c60_tree_file_rooted_with_time_final, 'w') as final_tree_handle:
            # header line: number of leaves, then the number of trees (1)
            final_tree_handle.write('%s\t1\n' % len(tre_object.get_leaf_names()))
            final_tree_handle.write(tree_str.replace('""', '') + '\n')

        # rm tmp tree files
        os.system('rm %s' % pwd_c60_tree_file_renamed)
        os.system('rm %s' % pwd_c60_tree_file_rooted)
        os.system('rm %s' % pwd_c60_tree_file_rooted_with_time)

        # get BV file
        os.mkdir(get_BV_wd)
        fa2phy(pwd_aln_concatenated_in_aod_wd_fasta, pwd_aln_concatenated_in_bv_wd_phylip)  # sequence in phylip format need to be in one line
        os.system('cp %s %s/' % (pwd_c60_tree_file_rooted_with_time_final, get_BV_wd))

        get_BV_js = '%s/%s_get_BV.sh' % (op_dir, prefix_base)
        get_BV_mcmctree_ctl = '%s_get_BV_mcmctree.ctl' % (prefix_base)
        pwd_get_BV_mcmctree_ctl = '%s/%s' % (get_BV_wd, get_BV_mcmctree_ctl)

        get_BV_para_dict = dict()
        get_BV_para_dict['seqfile'] = aln_concatenated
        get_BV_para_dict['treefile'] = c60_tree_file_rooted_with_time_final
        get_BV_para_dict['mcmcfile'] = '%s_mcmc.txt' % prefix_base
        get_BV_para_dict['outfile'] = '%s_out.txt' % prefix_base
        get_BV_para_dict['seqtype'] = '2'
        get_BV_para_dict['usedata'] = '3'
        get_BV_para_dict['clock'] = '3'
        prep_mcmctree_ctl(get_BV_para_dict, pwd_get_BV_mcmctree_ctl)

        with open(get_BV_js, 'w') as get_BV_js_handle:
            get_BV_js_handle.write('#!/bin/bash\n#SBATCH --ntasks 1\n#SBATCH --cpus-per-task %s\n\n' % js_cpu_num)
            # abspath instead of cwd + '/' + path, so an absolute output
            # directory does not produce a doubled, non-existent path
            get_BV_js_handle.write('cd %s\n' % os.path.abspath(get_BV_wd))
            get_BV_js_handle.write('mcmctree %s\n' % get_BV_mcmctree_ctl)

        # prepare files for dating
        para_dod = get_parameter_combinations(para_to_test_dict)
        for para_combination in para_dod:
            mcmctree_ctl = '%s_%s_mcmctree.ctl' % (prefix_base, para_combination)
            current_dating_wd = '%s/%s_%s_dating_wd' % (op_dir, prefix_base, para_combination)
            pwd_mcmctree_ctl = '%s/%s' % (current_dating_wd, mcmctree_ctl)
            js_mcmctree = '%s/js_%s_%s.sh' % (op_dir, prefix_base, para_combination)
            pwd_aln_in_dating_wd = '%s/%s' % (current_dating_wd, aln_concatenated)

            # create dating wd and copy tree and alignment files into it
            os.mkdir(current_dating_wd)
            fa2phy(pwd_aln_concatenated_in_aod_wd_fasta, pwd_aln_in_dating_wd)  # sequence in phylip format need to be in one line
            os.system('cp %s %s/' % (pwd_c60_tree_file_rooted_with_time_final, current_dating_wd))

            current_para_dict = para_dod[para_combination]
            current_para_dict['seqfile'] = aln_concatenated
            current_para_dict['treefile'] = c60_tree_file_rooted_with_time_final
            current_para_dict['mcmcfile'] = '%s_%s_mcmc.txt' % (prefix_base, para_combination)
            current_para_dict['outfile'] = '%s_%s_out.txt' % (prefix_base, para_combination)
            current_para_dict['seqtype'] = '2'
            current_para_dict['usedata'] = '2'

            prep_mcmctree_ctl(current_para_dict, pwd_mcmctree_ctl)

            with open(js_mcmctree, 'w') as js_mcmctree_handle:
                js_mcmctree_handle.write('#!/bin/bash\n\n')
                js_mcmctree_handle.write('cd %s\n' % os.path.abspath(current_dating_wd))
                # the dating run reuses the out.BV produced by the get-BV job
                js_mcmctree_handle.write('cp ../%s_get_BV_wd/out.BV in.BV\n' % prefix_base)
                js_mcmctree_handle.write('mcmctree %s\n' % mcmctree_ctl)
            print('Job script for performing dating exported to %s' % js_mcmctree)

        # NOTE(review): submission is done once per accepted cutoff, after its
        # dating scripts are written -- confirm this nesting against the
        # upstream file, whose indentation is not visible here.
        if submit_job is True:
            submit_js(get_BV_js)
|
|
333
|
+
|
|
334
|
+
|
|
335
|
+
if __name__ == '__main__':

    # command-line options, listed in the order they are registered
    option_specs = [
        ('-deltall', dict(required=True, help='DeltaLL stdout')),
        ('-aod', dict(required=True, help='AssessMarkerDeltaLL output dir')),
        ('-og', dict(required=True, help='outgroup leaves, one id per line')),
        ('-eu', dict(required=True, help='EU tree with time constraints')),
        ('-o', dict(required=True, help='dating wd')),
        ('-c', dict(required=False, default='25-50-75-100', help='cutoffs, default: 25-50-75-100')),
        ('-mmn', dict(required=False, default=20, type=int, help='minimal marker number, default: 20')),
        ('-ra', dict(required=False, default=45, type=int, help='root age, default: 45')),
        ('-qsub', dict(required=False, action="store_true", help='submit job scripts for getting in.BV')),
        ('-f', dict(required=False, action="store_true", help='force overwrite')),
        ('-to_test', dict(required=True, help='Settings to test')),
    ]

    parser = argparse.ArgumentParser()
    for flag, options in option_specs:
        parser.add_argument(flag, **options)

    # hand the parsed options to the workflow as a plain dict
    args = vars(parser.parse_args())
    dating_ss(args)
|
|
351
|
+
|
|
352
|
+
|
|
353
|
+
'''
|
|
354
|
+
|
|
355
|
+
cd /Users/songweizhi/Desktop/dating_test
|
|
356
|
+
python3 /Users/songweizhi/PycharmProjects/TreeSAK/TreeSAK/dating_ss.py -deltall Marker_set_2_Betts_2018_29_arCOG_Marker2Tree_e30/s10_assess_marker_deltaLL/PA_75_DeltaLL_stdout.txt -aod Marker_set_2_Betts_2018_29_arCOG_Marker2Tree_e30/s11_marker_sets_by_DeltaLL -og out_group.txt -eu 27.nwk -o Marker_set_2_Betts_2018_29_arCOG_Marker2Tree_e30/s12_dating_wd -c 25-50-75-100 -mmn 20 -f
|
|
357
|
+
|
|
358
|
+
cd /home-user/wzsong/DateArTree
|
|
359
|
+
python3 dating_ss.py -deltall Marker_set_2_Betts_2018_29_arCOG_Marker2Tree_e30/s10_assess_marker_deltaLL/PA_75_DeltaLL_stdout.txt -aod Marker_set_2_Betts_2018_29_arCOG_Marker2Tree_e30/s11_marker_sets_by_DeltaLL -og out_group.txt -eu 27.nwk -o Marker_set_2_Betts_2018_29_arCOG_Marker2Tree_e30/s12_dating_wd -c 25-50-75-100 -mmn 20 -f
|
|
360
|
+
|
|
361
|
+
'''
|
TreeSAK/deltall.py
ADDED
|
@@ -0,0 +1,82 @@
|
|
|
1
|
+
import argparse
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
deltall_usage = '''
|
|
5
|
+
========================= deltall example commands =========================
|
|
6
|
+
|
|
7
|
+
TreeSAK deltall -i nohup.out -o DeltaLL_op_summary.txt
|
|
8
|
+
|
|
9
|
+
# This script was wrote to parse the stdout of deltaLL.rb from Sishuo Wang
|
|
10
|
+
|
|
11
|
+
============================================================================
|
|
12
|
+
'''
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def deltall(args):
    """Summarise DeltaLL stdout into a tab-separated per-marker ranking table.

    Each data line of the input is ``<marker id><TAB><value>``; lines that
    start with 'WARNING:' or 'awk:' and blank lines are ignored. The first
    two values seen for a marker are its metric 1 and metric 2 (kept to two
    decimals). Markers are ranked from largest to smallest metric 1 and from
    smallest to largest metric 2; the overall score is the sum of both ranks
    and the table is written in ascending order of that score.

    Args:
        args: dict with 'i' (input file, e.g. nohup.out) and 'o' (summary
            file to write).

    Raises:
        IndexError: if a marker has fewer than two values.
    """
    deltall_stdout_txt = args['i']
    summary_txt = args['o']

    deltall_op_dict = dict()
    with open(deltall_stdout_txt) as deltall_stdout_handle:
        for each_line in deltall_stdout_handle:
            if each_line.startswith(('WARNING:', 'awk:')):
                continue
            if not each_line.strip():
                continue  # a blank line carries no marker/value pair
            each_line_split = each_line.strip().split('\t')
            marker_id = each_line_split[0]
            value = float(each_line_split[1])
            deltall_op_dict.setdefault(marker_id, []).append(value)

    metric_1_dict = dict()
    metric_2_dict = dict()
    for each_marker in deltall_op_dict:
        metric_1_dict[each_marker] = float("{0:.2f}".format(deltall_op_dict[each_marker][0]))
        metric_2_dict[each_marker] = float("{0:.2f}".format(deltall_op_dict[each_marker][1]))

    # [::-1] on the ascending sort (rather than reverse=True) keeps the order
    # in which tied markers have always been ranked
    metric_1_ranked = sorted(metric_1_dict.items(), key=lambda item: item[1])[::-1]
    metric_2_ranked = sorted(metric_2_dict.items(), key=lambda item: item[1])

    metric_1_score_dict = {marker: rank for rank, (marker, _) in enumerate(metric_1_ranked, 1)}
    metric_2_score_dict = {marker: rank for rank, (marker, _) in enumerate(metric_2_ranked, 1)}

    overall_score_dict = dict()
    for each_marker in deltall_op_dict:
        overall_score_dict[each_marker] = metric_1_score_dict[each_marker] + metric_2_score_dict[each_marker]

    overall_score_dict_sorted = {k: v for k, v in sorted(overall_score_dict.items(), key=lambda item: item[1])}

    with open(summary_txt, 'w') as summary_txt_handle:
        summary_txt_handle.write('Marker\tmetric1\tmetric1_score\tmetric2\tmetric2_score\toverall_score\n')
        for each_marker in overall_score_dict_sorted:
            summary_txt_handle.write('%s\t%s\t%s\t%s\t%s\t%s\n' % (each_marker,
                                                                   metric_1_dict[each_marker],
                                                                   metric_1_score_dict[each_marker],
                                                                   metric_2_dict[each_marker],
                                                                   metric_2_score_dict[each_marker],
                                                                   overall_score_dict_sorted[each_marker]))
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
if __name__ == '__main__':

    # build the command-line parser: both options are mandatory
    parser = argparse.ArgumentParser()
    for flag, help_text in (('-i', 'input file (e.g., nohup.out)'),
                            ('-o', 'output summary')):
        parser.add_argument(flag, required=True, help=help_text)

    # run the summary with the parsed options as a plain dict
    args = vars(parser.parse_args())
    deltall(args)
|