treesak-1.51.2-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of treesak might be problematic.
- TreeSAK/ALE.py +63 -0
- TreeSAK/ALE1.py +268 -0
- TreeSAK/ALE2.py +168 -0
- TreeSAK/ALE2RTC.py +30 -0
- TreeSAK/ALE3.py +205 -0
- TreeSAK/ALE4.py +636 -0
- TreeSAK/ALE5.py +210 -0
- TreeSAK/ALE6.py +401 -0
- TreeSAK/ALE7.py +126 -0
- TreeSAK/ALE_backup.py +1081 -0
- TreeSAK/AssessCVG.py +128 -0
- TreeSAK/AssessMarker.py +306 -0
- TreeSAK/AssessMarkerDeltaLL.py +257 -0
- TreeSAK/AssessMarkerPA.py +317 -0
- TreeSAK/AssessPB.py +130 -0
- TreeSAK/BMGE.jar +0 -0
- TreeSAK/BMGE.py +49 -0
- TreeSAK/CompareMCMC.py +138 -0
- TreeSAK/ConcateMSA.py +111 -0
- TreeSAK/ConvertMSA.py +135 -0
- TreeSAK/Dir.rb +82 -0
- TreeSAK/ExtractMarkerSeq.py +263 -0
- TreeSAK/FastRoot.py +1175 -0
- TreeSAK/FastRoot_backup.py +1122 -0
- TreeSAK/FigTree.py +34 -0
- TreeSAK/GTDB_tree.py +76 -0
- TreeSAK/GeneTree.py +142 -0
- TreeSAK/KEGG_Luo17.py +807 -0
- TreeSAK/LcaToLeaves.py +66 -0
- TreeSAK/MarkerRef2Tree.py +616 -0
- TreeSAK/MarkerRef2Tree_backup.py +628 -0
- TreeSAK/MarkerSeq2Tree.py +290 -0
- TreeSAK/MarkerSeq2Tree_backup.py +259 -0
- TreeSAK/ModifyTopo.py +116 -0
- TreeSAK/Newick_tree_plotter.py +79 -0
- TreeSAK/OMA.py +170 -0
- TreeSAK/OMA2.py +212 -0
- TreeSAK/OneLineAln.py +50 -0
- TreeSAK/PB.py +155 -0
- TreeSAK/PMSF.py +106 -0
- TreeSAK/PhyloBiAssoc.R +84 -0
- TreeSAK/PhyloBiAssoc.py +167 -0
- TreeSAK/PlotMCMC.py +41 -0
- TreeSAK/PlotMcmcNode.py +152 -0
- TreeSAK/PlotMcmcNode_old.py +252 -0
- TreeSAK/RootTree.py +101 -0
- TreeSAK/RootTreeGTDB214.py +288 -0
- TreeSAK/RootTreeGTDB220.py +300 -0
- TreeSAK/RootTreeGTDB226.py +300 -0
- TreeSAK/SequentialDating.py +16 -0
- TreeSAK/SingleAleHGT.py +157 -0
- TreeSAK/SingleLinePhy.py +50 -0
- TreeSAK/SliceMSA.py +142 -0
- TreeSAK/SplitScore.py +19 -0
- TreeSAK/SplitScore1.py +178 -0
- TreeSAK/SplitScore1OMA.py +148 -0
- TreeSAK/SplitScore2.py +597 -0
- TreeSAK/TaxaCountStats.R +256 -0
- TreeSAK/TaxonTree.py +47 -0
- TreeSAK/TreeSAK_config.py +32 -0
- TreeSAK/VERSION +158 -0
- TreeSAK/VisHPD95.R +45 -0
- TreeSAK/VisHPD95.py +200 -0
- TreeSAK/__init__.py +0 -0
- TreeSAK/ale_parser.py +74 -0
- TreeSAK/ale_splitter.py +63 -0
- TreeSAK/alignment_pruner.pl +1471 -0
- TreeSAK/assessOG.py +45 -0
- TreeSAK/catfasta2phy.py +140 -0
- TreeSAK/cogTree.py +185 -0
- TreeSAK/compare_trees.R +30 -0
- TreeSAK/compare_trees.py +255 -0
- TreeSAK/dating.py +264 -0
- TreeSAK/dating_ss.py +361 -0
- TreeSAK/deltall.py +82 -0
- TreeSAK/do_rrtc.rb +464 -0
- TreeSAK/fa2phy.py +42 -0
- TreeSAK/format_leaf_name.py +70 -0
- TreeSAK/gap_stats.py +38 -0
- TreeSAK/get_SCG_tree.py +742 -0
- TreeSAK/get_arCOG_seq.py +97 -0
- TreeSAK/global_functions.py +222 -0
- TreeSAK/gnm_leaves.py +43 -0
- TreeSAK/iTOL.py +791 -0
- TreeSAK/iTOL_gene_tree.py +80 -0
- TreeSAK/itol_msa_stats.py +56 -0
- TreeSAK/keep_highest_rrtc.py +37 -0
- TreeSAK/koTree.py +194 -0
- TreeSAK/label_tree.R +75 -0
- TreeSAK/label_tree.py +121 -0
- TreeSAK/mad.py +708 -0
- TreeSAK/mcmc2tree.py +58 -0
- TreeSAK/mcmcTC copy.py +92 -0
- TreeSAK/mcmcTC.py +104 -0
- TreeSAK/mcmctree_vs_reltime.R +44 -0
- TreeSAK/mcmctree_vs_reltime.py +252 -0
- TreeSAK/merge_pdf.py +32 -0
- TreeSAK/pRTC.py +56 -0
- TreeSAK/parse_mcmctree.py +198 -0
- TreeSAK/parse_reltime.py +141 -0
- TreeSAK/phy2fa.py +37 -0
- TreeSAK/plot_distruibution_th.py +165 -0
- TreeSAK/prep_mcmctree_ctl.py +92 -0
- TreeSAK/print_leaves.py +32 -0
- TreeSAK/pruneMSA.py +63 -0
- TreeSAK/recode.py +73 -0
- TreeSAK/remove_bias.R +112 -0
- TreeSAK/rename_leaves.py +77 -0
- TreeSAK/replace_clade.py +55 -0
- TreeSAK/root_with_out_group.py +84 -0
- TreeSAK/run_TaxaCountStats_R_s1.py +455 -0
- TreeSAK/subsample_drep_gnms.py +74 -0
- TreeSAK/subset.py +69 -0
- TreeSAK/subset_tree_stupid_old_way.py +193 -0
- TreeSAK/supertree.py +330 -0
- TreeSAK/tmp_1.py +19 -0
- TreeSAK/tmp_2.py +19 -0
- TreeSAK/tmp_3.py +120 -0
- TreeSAK/weighted_rand.rb +23 -0
- treesak-1.51.2.data/scripts/TreeSAK +950 -0
- treesak-1.51.2.dist-info/LICENSE +674 -0
- treesak-1.51.2.dist-info/METADATA +27 -0
- treesak-1.51.2.dist-info/RECORD +125 -0
- treesak-1.51.2.dist-info/WHEEL +5 -0
- treesak-1.51.2.dist-info/top_level.txt +1 -0
TreeSAK/AssessCVG.py
ADDED
@@ -0,0 +1,128 @@
+import io
+import argparse
+import arviz as az
+import pandas as pd
+import plotly.graph_objects as go
+
+
+AssessCVG_usage = '''
+================================= AssessCVG example commands =================================
+
+TreeSAK AssessCVG -m1 r1_mcmc.txt -m2 r2_mcmc.txt -o convergence_plot.png
+
+# This script was modified based on the script from Tianhua Liao:
+# https://github.com/444thLiao/evol_tk/blob/master/dating_workflow/vis/assess_convergence.py
+
+==============================================================================================
+'''
+
+
+def read_mcmc(mcmc, all_col=False):
+    if not isinstance(mcmc, str):
+        return mcmc
+    if all_col:
+        mcmc_df = pd.read_csv(mcmc, sep='\t', index_col=0)
+    else:
+        with open(mcmc) as f1:
+            header = next(f1).strip().split('\t')
+            r_header = [_ for _ in header if not _.startswith('r_g')]
+            # iterate over the rows, ignoring the columns that hold rates
+            text = '\t'.join(r_header) + '\n'
+            r_header = set(r_header)
+            for row in f1:
+                text += '\t'.join([r for r, h in zip(row.strip().split('\t'), header) if h in r_header]) + '\n'
+        mcmc_df = pd.read_csv(io.StringIO(text), sep='\t', index_col=0)
+    return mcmc_df
+
+
+def cal_HPD_CI(df, burn_in=2000):
+    """
+    Get the HPD CI from mcmc.txt directly instead of reading the log/out file.
+    Only calculates the 95% highest posterior density interval.
+    Args:
+        df (pd.DataFrame): MCMC samples, one column per parameter.
+        burn_in (int, optional): number of initial samples to discard. Defaults to 2000.
+    """
+    col2CI = {}
+    for colname, col in df.items():
+        vals = col.values[burn_in:]
+        col2CI[colname] = az.hdi(vals, hdi_prob=0.95)
+    return col2CI
+
+
+def get_posterior_df(mcmc, burn_in=2000, scale=1, all_col=True):
+    mcmc_df = read_mcmc(mcmc, all_col=all_col)
+    if pd.isna(mcmc_df.iloc[-1, -1]):
+        # drop the last row if the run did not complete it
+        mcmc_df = mcmc_df.drop(mcmc_df.index[-1])
+    mcmc_df = mcmc_df.loc[~mcmc_df.isna().any(axis=1), :]
+    node_names = [_ for _ in mcmc_df.columns if _.startswith('t_n')]
+    rates = [_ for _ in mcmc_df.columns if _.startswith('r_g')]
+    paras = [_ for _ in mcmc_df.columns if _.startswith('mu') or _.startswith('sigma2')]
+
+    post_df = pd.DataFrame(columns=['Posterior mean time (100 Ma)',
+                                    'CI_width', 'CIs'],
+                           index=node_names)
+    raw_n2CI = cal_HPD_CI(mcmc_df, burn_in=burn_in)
+    if 'lnL' in mcmc_df.columns:
+        post_df.loc['lnL', :] = 'NA'
+        post_df.loc['lnL', :] = [round(mcmc_df.loc[:, 'lnL'].mean(), 2),
+                                 round(raw_n2CI['lnL'][1] - raw_n2CI['lnL'][0], 2),
+                                 f"{round(raw_n2CI['lnL'][0], 2)} - {round(raw_n2CI['lnL'][1], 2)}",
+                                 ]
+
+    n2CI = {k: f"{round(v[0] * scale, 2)} - {round(v[1] * scale, 2)}"
+            for k, v in raw_n2CI.items()}
+    n2mean_time = {k: round(v * scale, 2)
+                   for k, v in mcmc_df.mean().to_dict().items()}
+
+    post_df.loc[node_names, 'Posterior mean time (100 Ma)'] = [n2mean_time[_]
+                                                               for _ in post_df.index
+                                                               if _ != 'lnL']
+    post_df.loc[node_names, 'CIs'] = [n2CI[_]
+                                      for _ in post_df.index
+                                      if _ != 'lnL']
+    post_df.loc[node_names, 'CI_width'] = [raw_n2CI[_][1] * scale - raw_n2CI[_][0] * scale
+                                           for _ in post_df.index
+                                           if _ != 'lnL']
+    return post_df
+
+
+def AssessCVG(args):
+
+    mcmc_txt_1 = args['m1']
+    mcmc_txt_2 = args['m2']
+    output_plot = args['o']
+
+    CI_1 = get_posterior_df(mcmc_txt_1)
+    CI_2 = get_posterior_df(mcmc_txt_2)
+
+    # remove the lnL row
+    CI_1 = CI_1.iloc[:-1, :]
+    CI_2 = CI_2.iloc[:-1, :]
+
+    dis1 = list(CI_1['Posterior mean time (100 Ma)'])
+    dis2 = list(CI_2['Posterior mean time (100 Ma)'])
+
+    fig = go.Figure()
+    fig.add_scatter(x=dis1, y=dis2, name='compared', mode='markers')
+    fig.add_scatter(x=[min(dis1 + dis2), max(dis1 + dis2)],
+                    y=[min(dis1 + dis2), max(dis1 + dis2)],
+                    mode='lines', name='y=x')
+
+    fig.layout.width = 750
+    fig.layout.height = 750
+    fig.layout.xaxis.title = "run1 posterior mean time"
+    fig.layout.yaxis.title = "run2 posterior mean time"
+    fig.write_image(output_plot)
+
+
+if __name__ == '__main__':
+
+    # initialize the options parser
+    parser = argparse.ArgumentParser()
+    parser.add_argument('-m1', required=True, help='mcmc.txt from run 1')
+    parser.add_argument('-m2', required=True, help='mcmc.txt from run 2')
+    parser.add_argument('-o', required=True, help='output convergence plot')
+    args = vars(parser.parse_args())
+    AssessCVG(args)
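The core of AssessCVG.py is a simple two-run check: per-node posterior mean times from two independent MCMC runs should fall on the y=x line, and az.hdi supplies each node's 95% HPD interval. Below is a minimal, self-contained sketch of the same check on synthetic chains; the column names (t_n170, t_n171) and all values are hypothetical, not taken from the package.

# Minimal sketch of the convergence check performed above, on synthetic
# chains; column names (t_n170, t_n171) and all values are made up.
import numpy as np
import pandas as pd
import arviz as az

rng = np.random.default_rng(0)
run1 = pd.DataFrame({'t_n170': rng.normal(35.0, 1.0, 5000),
                     't_n171': rng.normal(28.0, 1.0, 5000)})
run2 = pd.DataFrame({'t_n170': rng.normal(35.0, 1.0, 5000),
                     't_n171': rng.normal(28.0, 1.0, 5000)})

burn_in = 2000  # same default as cal_HPD_CI above
for col in run1.columns:
    post1 = run1[col].values[burn_in:]
    post2 = run2[col].values[burn_in:]
    lo, hi = az.hdi(post1, hdi_prob=0.95)
    # well-converged runs give near-identical means, i.e. points on y=x
    print(f'{col}: run1 mean {post1.mean():.2f} '
          f'(95% HPD {lo:.2f}-{hi:.2f}), run2 mean {post2.mean():.2f}')

Points drifting off the y=x line, or HPD intervals that fail to overlap between runs, indicate the chains have not converged.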
TreeSAK/AssessMarker.py
ADDED
@@ -0,0 +1,306 @@
+import os
+import glob
+from Bio import SeqIO
+
+
+def parse_deltall_stdout(deltall_stdout_txt, summary_txt):
+
+    deltall_op_dict = dict()
+    for each_line in open(deltall_stdout_txt):
+        if not ((each_line.startswith('WARNING:')) or (each_line.startswith('awk:'))):
+            each_line_split = each_line.strip().split('\t')
+            marker_id = each_line_split[0]
+            value = float(each_line_split[1])
+            if marker_id not in deltall_op_dict:
+                deltall_op_dict[marker_id] = [value]
+            else:
+                deltall_op_dict[marker_id].append(value)
+
+    metric_1_dict = dict()
+    metric_2_dict = dict()
+    for each_marker in deltall_op_dict:
+        metric_1_value = float("{0:.2f}".format(deltall_op_dict[each_marker][0]))
+        metric_2_value = float("{0:.2f}".format(deltall_op_dict[each_marker][1]))
+        metric_1_dict[each_marker] = metric_1_value
+        metric_2_dict[each_marker] = metric_2_value
+
+    metric_1_dict_sorted = {k: v for k, v in sorted(metric_1_dict.items(), key=lambda item: item[1])[::-1]}  # descending
+    metric_2_dict_sorted = {k: v for k, v in sorted(metric_2_dict.items(), key=lambda item: item[1])}  # ascending
+
+    metric_1_score_dict = dict()
+    metric_1_score = 1
+    for each_marker_1 in metric_1_dict_sorted:
+        metric_1_score_dict[each_marker_1] = metric_1_score
+        metric_1_score += 1
+
+    metric_2_score_dict = dict()
+    metric_2_score = 1
+    for each_marker_2 in metric_2_dict_sorted:
+        metric_2_score_dict[each_marker_2] = metric_2_score
+        metric_2_score += 1
+
+    overall_score_dict = dict()
+    for each_marker in deltall_op_dict:
+        metric_score_1 = metric_1_score_dict[each_marker]
+        metric_score_2 = metric_2_score_dict[each_marker]
+        metric_score_overall = metric_score_1 + metric_score_2
+        overall_score_dict[each_marker] = metric_score_overall
+
+    overall_score_dict_sorted = {k: v for k, v in sorted(overall_score_dict.items(), key=lambda item: item[1])}
+
+    summary_txt_handle = open(summary_txt, 'w')
+    summary_txt_handle.write('Marker\tmetric1\tmetric1_score\tmetric2\tmetric2_score\toverall_score\n')
+    for each_marker in overall_score_dict_sorted:
+        metric_value_1 = metric_1_dict[each_marker]
+        metric_value_2 = metric_2_dict[each_marker]
+        metric_score_1 = metric_1_score_dict[each_marker]
+        metric_score_2 = metric_2_score_dict[each_marker]
+        metric_score_overall = overall_score_dict_sorted[each_marker]
+        summary_txt_handle.write('%s\t%s\t%s\t%s\t%s\t%s\n' % (each_marker, metric_value_1, metric_score_1, metric_value_2, metric_score_2, metric_score_overall))
+    summary_txt_handle.close()
+
+
+def assess_markers_pa(trimmed_aln_dir, gnm_meta_txt, present_pct_cutoff_list, assess_summary_1_txt, assess_summary_2_txt):
+
+    trimmed_aln_file_re = '%s/*.aln' % trimmed_aln_dir
+    trimmed_aln_file_list = [os.path.basename(file_name) for file_name in glob.glob(trimmed_aln_file_re)]
+
+    # read in genome metadata
+    domain_to_gnm_dict = dict()
+    gnm_to_domain_dict = dict()
+    for each_gnm in open(gnm_meta_txt):
+        each_gnm_split = each_gnm.strip().split('\t')
+        gnm_id = each_gnm_split[0]
+        domain_name = each_gnm_split[1]
+        gnm_to_domain_dict[gnm_id] = domain_name
+
+        if domain_name not in domain_to_gnm_dict:
+            domain_to_gnm_dict[domain_name] = {gnm_id}
+        else:
+            domain_to_gnm_dict[domain_name].add(gnm_id)
+
+    assess_summary_1_txt_handle = open(assess_summary_1_txt, 'w')
+    assess_summary_2_txt_handle = open(assess_summary_2_txt, 'w')
+    gnm_to_marker_dict = dict()
+    marker_to_gnm_dict = dict()
+    cutoff_to_qualified_marker_dict = dict()
+    assess_summary_1_txt_handle.write('Marker\tArchaea\tEukaryota\n')
+    assess_summary_2_txt_handle.write('Marker\t%s\n' % '\t'.join([str(i) for i in present_pct_cutoff_list]))
+    for each_aln in trimmed_aln_file_list:
+        marker_id = each_aln.split('.aln')[0]
+        pwd_aln = '%s/%s' % (trimmed_aln_dir, each_aln)
+        gnm_set = set()
+        for each_seq in SeqIO.parse(pwd_aln, 'fasta'):
+            gnm_id = each_seq.id
+            gnm_set.add(gnm_id)
+            if gnm_id not in gnm_to_marker_dict:
+                gnm_to_marker_dict[gnm_id] = {marker_id}
+            else:
+                gnm_to_marker_dict[gnm_id].add(marker_id)
+        marker_to_gnm_dict[marker_id] = gnm_set
+
+        gnm_num_ar = 0
+        gnm_num_eu = 0
+        for each_g in gnm_set:
+            g_domain = gnm_to_domain_dict[each_g]
+            if g_domain == 'Archaea':
+                gnm_num_ar += 1
+            if g_domain == 'Eukaryota':
+                gnm_num_eu += 1
+        gnm_pct_ar = float("{0:.2f}".format(gnm_num_ar / 133 * 100))  # 133 archaeal genomes in total
+        gnm_pct_eu = float("{0:.2f}".format(gnm_num_eu / 27 * 100))  # 27 eukaryotic genomes in total
+
+        # assessment
+        assessment_result_list = []
+        for present_pct_cutoff in present_pct_cutoff_list:
+            if (gnm_pct_ar >= present_pct_cutoff) and (gnm_pct_eu >= present_pct_cutoff):
+                assessment_result_list.append('1')
+                if str(present_pct_cutoff) not in cutoff_to_qualified_marker_dict:
+                    cutoff_to_qualified_marker_dict[str(present_pct_cutoff)] = [marker_id]
+                else:
+                    cutoff_to_qualified_marker_dict[str(present_pct_cutoff)].append(marker_id)
+            else:
+                assessment_result_list.append('0')
+        assess_summary_1_txt_handle.write('%s\t%s\t%s\n' % (marker_id, gnm_pct_ar, gnm_pct_eu))
+        assess_summary_2_txt_handle.write('%s\t%s\n' % (marker_id, '\t'.join(assessment_result_list)))
+
+    summary_list = [len(cutoff_to_qualified_marker_dict.get(str(i), [])) for i in present_pct_cutoff_list]
+    summary_list_str = [str(j) for j in summary_list]
+    assess_summary_2_txt_handle.write('Total\t%s\n' % ('\t'.join(summary_list_str)))
+    assess_summary_1_txt_handle.close()
+    assess_summary_2_txt_handle.close()
+
+    return cutoff_to_qualified_marker_dict, gnm_to_marker_dict, marker_to_gnm_dict
+
+
+def read_in_assessment_pa_2_txt(assessment_pa_2_txt):
+
+    pa_pct_list = []
+    pa_pct_to_marker_dict = dict()
+    for each_marker in open(assessment_pa_2_txt):
+        each_marker_split = each_marker.strip().split('\t')
+        if each_marker.startswith('Marker\t'):
+            pa_pct_list = [int(i) for i in each_marker_split[1:]]
+
+            # initialize pa_pct_to_marker_dict
+            for pa_pct in pa_pct_list:
+                pa_pct_to_marker_dict[pa_pct] = set()
+
+        elif not each_marker.startswith('Total\t'):
+            marker_id = each_marker_split[0]
+            for (pct, pa) in zip(pa_pct_list, each_marker_split[1:]):
+                if pa == '1':
+                    pa_pct_to_marker_dict[pct].add(marker_id)
+
+    return pa_pct_to_marker_dict
+
+
+def get_marker_set_dict(assessment_pa_2_txt, assessment_deltall_txt, deltall_keep_pct_list, min_marker_num):
+
+    # read in assessment_pa_2_txt
+    pa_pct_to_marker_dict = read_in_assessment_pa_2_txt(assessment_pa_2_txt)
+    # for each in pa_pct_to_marker_dict:
+    #     print('%s(%s)\t%s' % (each, len(pa_pct_to_marker_dict[each]), pa_pct_to_marker_dict[each]))
+
+    # store markers in a list according to their DeltaLL scores
+    marker_list_by_score = []
+    for each_marker in open(assessment_deltall_txt):
+        if not each_marker.startswith('Marker\t'):
+            each_marker_split = each_marker.strip().split('\t')
+            marker_id = each_marker_split[0]
+            marker_list_by_score.append(marker_id)
+
+    # get intersections
+    marker_set_dict = dict()
+    for each_keep_pct in deltall_keep_pct_list:
+        keep_num = round(len(marker_list_by_score) * each_keep_pct / 100)
+        if keep_num >= min_marker_num:
+            marker_to_keep = marker_list_by_score[:keep_num]
+            # print('deltall_keep_pct(top %s%s)(%s)\t%s' % (each_keep_pct, '%', keep_num, marker_to_keep))
+            for each_pa_pct in pa_pct_to_marker_dict:
+                current_pa_pct_marker_set = pa_pct_to_marker_dict[each_pa_pct]
+                marker_set_key = 'deltall%s_pa%s' % (each_keep_pct, each_pa_pct)
+                marker_shared = set(marker_to_keep).intersection(current_pa_pct_marker_set)
+                marker_set_dict[marker_set_key] = marker_shared
+
+    return marker_set_dict
+
+
+def AssessMarker(assess_marker_wd, trimmed_aln_dir, gnm_group_txt, deltall_stdout_txt, present_pct_cutoff_list, deltall_keep_pct_list, min_marker_pct_per_gnm, min_marker_num, force_create_dir, catfasta2phyml_pl):
+
+    # define output file names
+    assess_summary_deltall_txt = '%s/assessment_deltall.txt' % assess_marker_wd
+    assess_summary_1_txt_by_marker = '%s/assessment_pa_1.txt' % assess_marker_wd
+    assess_summary_2_txt_by_marker = '%s/assessment_pa_2.txt' % assess_marker_wd
+    assess_summary_txt_by_genome = '%s/assessment_pa_by_genome.txt' % assess_marker_wd
+
+    # parse deltall stdout
+    parse_deltall_stdout(deltall_stdout_txt, assess_summary_deltall_txt)
+
+    # assess markers
+    cutoff_to_qualified_marker_dict, gnm_to_marker_dict, marker_to_gnm_dict = assess_markers_pa(trimmed_aln_dir, gnm_group_txt, present_pct_cutoff_list, assess_summary_1_txt_by_marker, assess_summary_2_txt_by_marker)
+
+    # write out qualified markers
+    for each_cutoff in cutoff_to_qualified_marker_dict:
+        qualified_m_list = sorted(cutoff_to_qualified_marker_dict[each_cutoff])
+        pwd_op_txt = '%s/assessment_pa_qualified_marker_%s.txt' % (assess_marker_wd, each_cutoff)
+        with open(pwd_op_txt, 'w') as pwd_op_txt_handle:
+            pwd_op_txt_handle.write('\n'.join(qualified_m_list))
+
+    # write out summary by genome
+    assess_summary_txt_by_genome_handle = open(assess_summary_txt_by_genome, 'w')
+    assess_summary_txt_by_genome_handle.write('Cutoff\tGenome\tMarker_all\tMarker_qualified\tMarker_qualified_pct(cutoff:%s)\n' % min_marker_pct_per_gnm)
+    for each_cutoff in present_pct_cutoff_list:
+        current_cutoff_qualified_marker_list = cutoff_to_qualified_marker_dict.get(str(each_cutoff), [])
+        if len(current_cutoff_qualified_marker_list) > 0:
+            qualified_gnm_set = set()
+            for each_gnm in gnm_to_marker_dict:
+                gnm_identified_marker_set = gnm_to_marker_dict[each_gnm]
+                gnm_identified_marker_set_qualified = set()
+                for identified_marker in gnm_identified_marker_set:
+                    if identified_marker in current_cutoff_qualified_marker_list:
+                        gnm_identified_marker_set_qualified.add(identified_marker)
+                gnm_identified_marker_set_qualified_pct = len(gnm_identified_marker_set_qualified) * 100 / len(current_cutoff_qualified_marker_list)
+                gnm_identified_marker_set_qualified_pct = float("{0:.2f}".format(gnm_identified_marker_set_qualified_pct))
+                if gnm_identified_marker_set_qualified_pct >= min_marker_pct_per_gnm:
+                    qualified_gnm_set.add(each_gnm)
+                else:
+                    assess_summary_txt_by_genome_handle.write('%s\t%s\t%s\t%s\t%s\n' % (each_cutoff, each_gnm, len(gnm_identified_marker_set), len(gnm_identified_marker_set_qualified), gnm_identified_marker_set_qualified_pct))
+            assess_summary_txt_by_genome_handle.write('\n')
+    assess_summary_txt_by_genome_handle.close()
+
+    # select marker genes and concatenate their alignments
+    marker_set_dict = get_marker_set_dict(assess_summary_2_txt_by_marker, assess_summary_deltall_txt, deltall_keep_pct_list, min_marker_num)
+    for each_marker_set in marker_set_dict:
+        current_marker_set = marker_set_dict[each_marker_set]
+        if len(current_marker_set) >= min_marker_num:
+
+            # print('%s\t%s\t%s' % (each_marker_set, len(current_marker_set), current_marker_set))
+            pwd_iqtree_dir = '%s/%s_iqtree_wd' % (assess_marker_wd, each_marker_set)
+            pwd_marker_id_txt = '%s/%s_marker_id.txt' % (pwd_iqtree_dir, each_marker_set)
+            pwd_aln_dir = '%s/%s_aln_trimmed' % (pwd_iqtree_dir, each_marker_set)
+
+            # copy marker alignments into the corresponding dir
+            if force_create_dir is True:
+                if os.path.isdir(pwd_iqtree_dir) is True:
+                    os.system('rm -r %s' % pwd_iqtree_dir)
+            os.system('mkdir %s' % pwd_iqtree_dir)
+            os.system('mkdir %s' % pwd_aln_dir)
+
+            # write out marker ids
+            with open(pwd_marker_id_txt, 'w') as pwd_marker_id_txt_handle:
+                pwd_marker_id_txt_handle.write('\n'.join(current_marker_set))
+
+            for each_aln in current_marker_set:
+                pwd_aln = '%s/%s.aln' % (trimmed_aln_dir, each_aln)
+                cp_cmd = 'cp %s %s/' % (pwd_aln, pwd_aln_dir)
+                os.system(cp_cmd)
+
+            # concatenate alignments
+            pwd_concatenate_aln = '%s/%s_concatenated.phy' % (pwd_iqtree_dir, each_marker_set)
+            pwd_concatenate_aln_partitions = '%s/%s_partitions.txt' % (pwd_iqtree_dir, each_marker_set)
+            catfasta2phyml_cmd = 'perl %s --sequential --concatenate %s/*.aln > %s 2> %s' % (catfasta2phyml_pl, pwd_aln_dir, pwd_concatenate_aln, pwd_concatenate_aln_partitions)
+            # print(catfasta2phyml_cmd)
+            os.system(catfasta2phyml_cmd)
+
+            # get guide tree
+            get_guide_cmd = 'iqtree -s %s_concatenated.phy --prefix %s_guide_tree --seqtype AA -m LG -T 12 -B 1000 --alrt 1000' % (each_marker_set, each_marker_set)
+            # print(get_guide_cmd)
+
+            # run C60 + PMSF
+            c60_pmsf_cmd = 'iqtree -s %s_concatenated.phy --prefix %s --seqtype AA -m LG+G+F+C60 -T 12 -B 1000 --alrt 1000 -ft %s_guide_tree.treefile' % (each_marker_set, each_marker_set, each_marker_set)
+            # print(c60_pmsf_cmd)
+
+            # generate job script
+            pwd_js = '%s/js_%s_iqtree.sh' % (assess_marker_wd, each_marker_set)
+            with open(pwd_js, 'w') as pwd_js_handle:
+                # pwd_js_handle.write('#!/bin/bash\n#SBATCH --nodelist cl007\n#SBATCH --ntasks 1\n#SBATCH --cpus-per-task 12\n\n')
+                pwd_js_handle.write('#!/bin/bash\n#SBATCH\n#SBATCH --ntasks 1\n#SBATCH --cpus-per-task 12\n\n')
+                pwd_js_handle.write('cd %s_iqtree_wd\n' % each_marker_set)
+                pwd_js_handle.write(get_guide_cmd + '\n')
+                pwd_js_handle.write(c60_pmsf_cmd + '\n')
+
+
+# inputs
+assess_marker_wd = '/home-user/wzsong/DateArTree/04_dating_Williams_2017_45_arCOG_assess_marker'
+trimmed_aln_dir = '/home-user/wzsong/DateArTree/02_identify_marker_gene_Williams_2017_45_arCOG/best_hit_by_marker_5_aln_trimmed'
+gnm_group_txt = '/home-user/wzsong/DateArTree/01_genome_selection/gnm_metadata.txt'
+deltall_stdout_txt = '/home-user/wzsong/DateArTree/02_identify_marker_gene_Williams_2017_45_arCOG_DeltaLL/nohup.out'
+present_pct_cutoff_list = [25, 50, 75, 85, 100]
+deltall_keep_pct_list = [25, 50, 75, 100]
+min_marker_pct_per_gnm = 75
+min_marker_num = 20
+force_create_dir = True
+# catfasta2phyml_pl = '/Users/songweizhi/PycharmProjects/Sponge_Hologenome/Scripts/catfasta2phyml.pl'
+catfasta2phyml_pl = '/home-user/wzsong/Scripts/catfasta2phyml.pl'
+
+AssessMarker(assess_marker_wd, trimmed_aln_dir, gnm_group_txt, deltall_stdout_txt, present_pct_cutoff_list, deltall_keep_pct_list, min_marker_pct_per_gnm, min_marker_num, force_create_dir, catfasta2phyml_pl)
+
+
+'''
+Note
+1. Extra genomes in gnm_metadata.txt won't affect assessment results.
+2. Genomes that cannot be found in gnm_metadata.txt will trigger an error.
+3. Alignments in {trimmed_aln_dir} need to be trimmed before assessment.
+4. Sequences in MSAs need to be named by genome id.
+'''
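For reference, the marker-ranking step in parse_deltall_stdout reduces to a rank sum: markers are ranked by metric 1 in descending order and by metric 2 in ascending order, and the two ranks are added, so markers with the lowest combined rank are written to the summary first. A self-contained sketch of that logic with hypothetical marker IDs and values (not taken from any real DeltaLL output):

# Hypothetical DeltaLL-style values for three markers; the rank sum
# mirrors parse_deltall_stdout above (metric 1 ranked descending,
# metric 2 ranked ascending, overall = sum of the two ranks).
metric_1 = {'arCOG00779': 0.91, 'arCOG00781': 0.42, 'arCOG01358': 0.77}
metric_2 = {'arCOG00779': 1.3, 'arCOG00781': 5.8, 'arCOG01358': 2.1}

rank_1 = {m: i + 1 for i, (m, _) in enumerate(
    sorted(metric_1.items(), key=lambda kv: kv[1], reverse=True))}
rank_2 = {m: i + 1 for i, (m, _) in enumerate(
    sorted(metric_2.items(), key=lambda kv: kv[1]))}

# lower overall score = earlier in the summary file
overall = {m: rank_1[m] + rank_2[m] for m in metric_1}
for marker, score in sorted(overall.items(), key=lambda kv: kv[1]):
    print(marker, score)

This ordering is what get_marker_set_dict relies on when it keeps the top N percent of markers from the summary file.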