treesak-1.51.2-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This version of treesak might be problematic.

Files changed (125)
  1. TreeSAK/ALE.py +63 -0
  2. TreeSAK/ALE1.py +268 -0
  3. TreeSAK/ALE2.py +168 -0
  4. TreeSAK/ALE2RTC.py +30 -0
  5. TreeSAK/ALE3.py +205 -0
  6. TreeSAK/ALE4.py +636 -0
  7. TreeSAK/ALE5.py +210 -0
  8. TreeSAK/ALE6.py +401 -0
  9. TreeSAK/ALE7.py +126 -0
  10. TreeSAK/ALE_backup.py +1081 -0
  11. TreeSAK/AssessCVG.py +128 -0
  12. TreeSAK/AssessMarker.py +306 -0
  13. TreeSAK/AssessMarkerDeltaLL.py +257 -0
  14. TreeSAK/AssessMarkerPA.py +317 -0
  15. TreeSAK/AssessPB.py +130 -0
  16. TreeSAK/BMGE.jar +0 -0
  17. TreeSAK/BMGE.py +49 -0
  18. TreeSAK/CompareMCMC.py +138 -0
  19. TreeSAK/ConcateMSA.py +111 -0
  20. TreeSAK/ConvertMSA.py +135 -0
  21. TreeSAK/Dir.rb +82 -0
  22. TreeSAK/ExtractMarkerSeq.py +263 -0
  23. TreeSAK/FastRoot.py +1175 -0
  24. TreeSAK/FastRoot_backup.py +1122 -0
  25. TreeSAK/FigTree.py +34 -0
  26. TreeSAK/GTDB_tree.py +76 -0
  27. TreeSAK/GeneTree.py +142 -0
  28. TreeSAK/KEGG_Luo17.py +807 -0
  29. TreeSAK/LcaToLeaves.py +66 -0
  30. TreeSAK/MarkerRef2Tree.py +616 -0
  31. TreeSAK/MarkerRef2Tree_backup.py +628 -0
  32. TreeSAK/MarkerSeq2Tree.py +290 -0
  33. TreeSAK/MarkerSeq2Tree_backup.py +259 -0
  34. TreeSAK/ModifyTopo.py +116 -0
  35. TreeSAK/Newick_tree_plotter.py +79 -0
  36. TreeSAK/OMA.py +170 -0
  37. TreeSAK/OMA2.py +212 -0
  38. TreeSAK/OneLineAln.py +50 -0
  39. TreeSAK/PB.py +155 -0
  40. TreeSAK/PMSF.py +106 -0
  41. TreeSAK/PhyloBiAssoc.R +84 -0
  42. TreeSAK/PhyloBiAssoc.py +167 -0
  43. TreeSAK/PlotMCMC.py +41 -0
  44. TreeSAK/PlotMcmcNode.py +152 -0
  45. TreeSAK/PlotMcmcNode_old.py +252 -0
  46. TreeSAK/RootTree.py +101 -0
  47. TreeSAK/RootTreeGTDB214.py +288 -0
  48. TreeSAK/RootTreeGTDB220.py +300 -0
  49. TreeSAK/RootTreeGTDB226.py +300 -0
  50. TreeSAK/SequentialDating.py +16 -0
  51. TreeSAK/SingleAleHGT.py +157 -0
  52. TreeSAK/SingleLinePhy.py +50 -0
  53. TreeSAK/SliceMSA.py +142 -0
  54. TreeSAK/SplitScore.py +19 -0
  55. TreeSAK/SplitScore1.py +178 -0
  56. TreeSAK/SplitScore1OMA.py +148 -0
  57. TreeSAK/SplitScore2.py +597 -0
  58. TreeSAK/TaxaCountStats.R +256 -0
  59. TreeSAK/TaxonTree.py +47 -0
  60. TreeSAK/TreeSAK_config.py +32 -0
  61. TreeSAK/VERSION +158 -0
  62. TreeSAK/VisHPD95.R +45 -0
  63. TreeSAK/VisHPD95.py +200 -0
  64. TreeSAK/__init__.py +0 -0
  65. TreeSAK/ale_parser.py +74 -0
  66. TreeSAK/ale_splitter.py +63 -0
  67. TreeSAK/alignment_pruner.pl +1471 -0
  68. TreeSAK/assessOG.py +45 -0
  69. TreeSAK/catfasta2phy.py +140 -0
  70. TreeSAK/cogTree.py +185 -0
  71. TreeSAK/compare_trees.R +30 -0
  72. TreeSAK/compare_trees.py +255 -0
  73. TreeSAK/dating.py +264 -0
  74. TreeSAK/dating_ss.py +361 -0
  75. TreeSAK/deltall.py +82 -0
  76. TreeSAK/do_rrtc.rb +464 -0
  77. TreeSAK/fa2phy.py +42 -0
  78. TreeSAK/format_leaf_name.py +70 -0
  79. TreeSAK/gap_stats.py +38 -0
  80. TreeSAK/get_SCG_tree.py +742 -0
  81. TreeSAK/get_arCOG_seq.py +97 -0
  82. TreeSAK/global_functions.py +222 -0
  83. TreeSAK/gnm_leaves.py +43 -0
  84. TreeSAK/iTOL.py +791 -0
  85. TreeSAK/iTOL_gene_tree.py +80 -0
  86. TreeSAK/itol_msa_stats.py +56 -0
  87. TreeSAK/keep_highest_rrtc.py +37 -0
  88. TreeSAK/koTree.py +194 -0
  89. TreeSAK/label_tree.R +75 -0
  90. TreeSAK/label_tree.py +121 -0
  91. TreeSAK/mad.py +708 -0
  92. TreeSAK/mcmc2tree.py +58 -0
  93. TreeSAK/mcmcTC copy.py +92 -0
  94. TreeSAK/mcmcTC.py +104 -0
  95. TreeSAK/mcmctree_vs_reltime.R +44 -0
  96. TreeSAK/mcmctree_vs_reltime.py +252 -0
  97. TreeSAK/merge_pdf.py +32 -0
  98. TreeSAK/pRTC.py +56 -0
  99. TreeSAK/parse_mcmctree.py +198 -0
  100. TreeSAK/parse_reltime.py +141 -0
  101. TreeSAK/phy2fa.py +37 -0
  102. TreeSAK/plot_distruibution_th.py +165 -0
  103. TreeSAK/prep_mcmctree_ctl.py +92 -0
  104. TreeSAK/print_leaves.py +32 -0
  105. TreeSAK/pruneMSA.py +63 -0
  106. TreeSAK/recode.py +73 -0
  107. TreeSAK/remove_bias.R +112 -0
  108. TreeSAK/rename_leaves.py +77 -0
  109. TreeSAK/replace_clade.py +55 -0
  110. TreeSAK/root_with_out_group.py +84 -0
  111. TreeSAK/run_TaxaCountStats_R_s1.py +455 -0
  112. TreeSAK/subsample_drep_gnms.py +74 -0
  113. TreeSAK/subset.py +69 -0
  114. TreeSAK/subset_tree_stupid_old_way.py +193 -0
  115. TreeSAK/supertree.py +330 -0
  116. TreeSAK/tmp_1.py +19 -0
  117. TreeSAK/tmp_2.py +19 -0
  118. TreeSAK/tmp_3.py +120 -0
  119. TreeSAK/weighted_rand.rb +23 -0
  120. treesak-1.51.2.data/scripts/TreeSAK +950 -0
  121. treesak-1.51.2.dist-info/LICENSE +674 -0
  122. treesak-1.51.2.dist-info/METADATA +27 -0
  123. treesak-1.51.2.dist-info/RECORD +125 -0
  124. treesak-1.51.2.dist-info/WHEEL +5 -0
  125. treesak-1.51.2.dist-info/top_level.txt +1 -0
TreeSAK/AssessCVG.py ADDED
@@ -0,0 +1,128 @@
+ import io
+ import argparse
+ import arviz as az
+ import pandas as pd
+ import plotly.graph_objects as go
+
+
+ AssessCVG_usage = '''
+ ================================= AssessCVG example commands =================================
+
+ TreeSAK AssessCVG -m1 r1_mcmc.txt -m2 r2_mcmc.txt -o convergence_plot.png
+
+ # This script was modified from a script by Tianhua Liao:
+ https://github.com/444thLiao/evol_tk/blob/master/dating_workflow/vis/assess_convergence.py
+
+ ==============================================================================================
+ '''
+
+
+ def read_mcmc(mcmc, all_col=False):
+     if type(mcmc) != str:
+         return mcmc
+     if all_col:
+         mcmc_df = pd.read_csv(mcmc, sep='\t', index_col=0)
+     else:
+         f1 = open(mcmc)
+         header = [_ for _ in next(f1).strip().split('\t')]
+         r_header = [_ for _ in header if not _.startswith('r_g')]
+         # iterate over the rows, ignoring the columns that represent rates
+         text = '\t'.join(r_header) + '\n'
+         r_header = set(r_header)
+         for row in f1:
+             text += '\t'.join([r for r, h in zip(row.strip().split('\t'), header) if h in r_header]) + '\n'
+         mcmc_df = pd.read_csv(io.StringIO(text), sep='\t', index_col=0)
+     return mcmc_df
+
+
+ def cal_HPD_CI(df, burn_in=2000):
+     """
+     Get the HPD CIs from mcmc.txt directly instead of reading the log/out file.
+     Only the 95% highest posterior density interval is calculated.
+     Args:
+         df (pd.DataFrame): posterior samples, one column per parameter
+         burn_in (int, optional): number of initial samples to discard. Defaults to 2000.
+     """
+     col2CI = {}
+     for colname, col in df.items():  # DataFrame.iteritems() was removed in pandas 2.0
+         vals = col.values[burn_in:]
+         col2CI[colname] = az.hdi(vals, hdi_prob=.95)
+     return col2CI
+
+
+ def get_posterior_df(mcmc, burn_in=2000, scale=1, all_col=True):
+     mcmc_df = read_mcmc(mcmc, all_col=all_col)
+     if pd.isna(mcmc_df.iloc[-1, -1]):
+         # drop the last row if the run did not complete
+         mcmc_df = mcmc_df.drop(mcmc_df.index[-1])
+     mcmc_df = mcmc_df.loc[~mcmc_df.isna().any(axis=1), :]
+     node_names = [_ for _ in mcmc_df.columns if _.startswith('t_n')]
+     rates = [_ for _ in mcmc_df.columns if _.startswith('r_g')]
+     paras = [_ for _ in mcmc_df.columns if _.startswith('mu') or _.startswith('sigma2')]
+
+     post_df = pd.DataFrame(columns=['Posterior mean time (100 Ma)',
+                                     'CI_width', 'CIs'],
+                            index=node_names)
+     raw_n2CI = cal_HPD_CI(mcmc_df, burn_in=burn_in)
+     if 'lnL' in mcmc_df.columns:
+         post_df.loc['lnL', :] = 'NA'
+         post_df.loc['lnL', :] = [round(mcmc_df.loc[:, 'lnL'].mean(), 2),
+                                  round(raw_n2CI['lnL'][1] - raw_n2CI['lnL'][0], 2),
+                                  f"{round(raw_n2CI['lnL'][0], 2)} - {round(raw_n2CI['lnL'][1], 2)}",
+                                  ]
+
+     n2CI = {k: f"{round(v[0] * scale, 2)} - {round(v[1] * scale, 2)}"
+             for k, v in raw_n2CI.items()}
+     n2mean_time = {k: round(v * scale, 2)
+                    for k, v in mcmc_df.mean().to_dict().items()}
+
+     post_df.loc[node_names, 'Posterior mean time (100 Ma)'] = [n2mean_time[_]
+                                                                for _ in post_df.index
+                                                                if _ != 'lnL']
+     post_df.loc[node_names, 'CIs'] = [n2CI[_]
+                                       for _ in post_df.index
+                                       if _ != 'lnL']
+     post_df.loc[node_names, 'CI_width'] = [raw_n2CI[_][1] * scale - raw_n2CI[_][0] * scale
+                                            for _ in post_df.index
+                                            if _ != 'lnL']
+     return post_df
+
+
+ def AssessCVG(args):
+
+     mcmc_txt_1 = args['m1']
+     mcmc_txt_2 = args['m2']
+     output_plot = args['o']
+
+     CI_1 = get_posterior_df(mcmc_txt_1)
+     CI_2 = get_posterior_df(mcmc_txt_2)
+
+     # remove the lnL row
+     CI_1 = CI_1.iloc[:-1, :]
+     CI_2 = CI_2.iloc[:-1, :]
+
+     dis1 = list(CI_1['Posterior mean time (100 Ma)'])
+     dis2 = list(CI_2['Posterior mean time (100 Ma)'])
+
+     fig = go.Figure()
+     fig.add_scatter(x=dis1, y=dis2, name='compared', mode='markers')
+     fig.add_scatter(x=[min(dis1 + dis2), max(dis1 + dis2)],
+                     y=[min(dis1 + dis2), max(dis1 + dis2)],
+                     mode='lines', name='y=x')
+
+     fig.layout.width = 750
+     fig.layout.height = 750
+     fig.layout.xaxis.title = "run1 posterior mean time"
+     fig.layout.yaxis.title = "run2 posterior mean time"
+     fig.write_image(output_plot)
+
+
+ if __name__ == '__main__':
+
+     # initialize the options parser
+     parser = argparse.ArgumentParser()
+     parser.add_argument('-m1', required=True, help='mcmc.txt from run 1')
+     parser.add_argument('-m2', required=True, help='mcmc.txt from run 2')
+     parser.add_argument('-o', required=True, help='output convergence plot')
+     args = vars(parser.parse_args())
+     AssessCVG(args)
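The scatter plot produced above is a visual check: if the two independent chains have converged, the posterior mean node ages fall on the y=x line. Below is a minimal sketch of an equivalent numeric check; it assumes the module is importable as TreeSAK.AssessCVG and reuses get_posterior_df as defined above, with placeholder file names.

import pandas as pd
from TreeSAK.AssessCVG import get_posterior_df  # assumed import path

# posterior mean node ages from two independent runs, lnL row dropped
run1 = get_posterior_df('r1_mcmc.txt').iloc[:-1, :]
run2 = get_posterior_df('r2_mcmc.txt').iloc[:-1, :]
t1 = pd.to_numeric(run1['Posterior mean time (100 Ma)'])
t2 = pd.to_numeric(run2['Posterior mean time (100 Ma)'])

# converged chains give near-identical ages: correlation ~1, small max deviation
print('Pearson r: %.4f' % t1.corr(t2))
print('max abs difference: %.4f (100 Ma)' % (t1 - t2).abs().max())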
TreeSAK/AssessMarker.py ADDED
@@ -0,0 +1,306 @@
+ import os
+ import glob
+ from Bio import SeqIO
+
+
+ def parse_deltall_stdout(deltall_stdout_txt, summary_txt):
+
+     deltall_op_dict = dict()
+     for each_line in open(deltall_stdout_txt):
+         if not ((each_line.startswith('WARNING:')) or (each_line.startswith('awk:'))):
+             each_line_split = each_line.strip().split('\t')
+             marker_id = each_line_split[0]
+             value = float(each_line_split[1])
+             if marker_id not in deltall_op_dict:
+                 deltall_op_dict[marker_id] = [value]
+             else:
+                 deltall_op_dict[marker_id].append(value)
+
+     metric_1_dict = dict()
+     metric_2_dict = dict()
+     for each_marker in deltall_op_dict:
+         metric_1_value = float("{0:.2f}".format(deltall_op_dict[each_marker][0]))
+         metric_2_value = float("{0:.2f}".format(deltall_op_dict[each_marker][1]))
+         metric_1_dict[each_marker] = metric_1_value
+         metric_2_dict[each_marker] = metric_2_value
+
+     # metric 1 is ranked from high to low, metric 2 from low to high
+     metric_1_dict_sorted = {k: v for k, v in sorted(metric_1_dict.items(), key=lambda item: item[1])[::-1]}
+     metric_2_dict_sorted = {k: v for k, v in sorted(metric_2_dict.items(), key=lambda item: item[1])}
+
+     metric_1_score_dict = dict()
+     metric_1_score = 1
+     for each_marker_1 in metric_1_dict_sorted:
+         metric_1_score_dict[each_marker_1] = metric_1_score
+         metric_1_score += 1
+
+     metric_2_score_dict = dict()
+     metric_2_score = 1
+     for each_marker_2 in metric_2_dict_sorted:
+         metric_2_score_dict[each_marker_2] = metric_2_score
+         metric_2_score += 1
+
+     # the overall score of a marker is the sum of its two rank scores
+     overall_score_dict = dict()
+     for each_marker in deltall_op_dict:
+         metric_score_1 = metric_1_score_dict[each_marker]
+         metric_score_2 = metric_2_score_dict[each_marker]
+         metric_score_overall = metric_score_1 + metric_score_2
+         overall_score_dict[each_marker] = metric_score_overall
+
+     overall_score_dict_sorted = {k: v for k, v in sorted(overall_score_dict.items(), key=lambda item: item[1])}
+
+     summary_txt_handle = open(summary_txt, 'w')
+     summary_txt_handle.write('Marker\tmetric1\tmetric1_score\tmetric2\tmetric2_score\toverall_score\n')
+     for each_marker in overall_score_dict_sorted:
+         metric_value_1 = metric_1_dict[each_marker]
+         metric_value_2 = metric_2_dict[each_marker]
+         metric_score_1 = metric_1_score_dict[each_marker]
+         metric_score_2 = metric_2_score_dict[each_marker]
+         metric_score_overall = overall_score_dict_sorted[each_marker]
+         summary_txt_handle.write('%s\t%s\t%s\t%s\t%s\t%s\n' % (each_marker, metric_value_1, metric_score_1, metric_value_2, metric_score_2, metric_score_overall))
+     summary_txt_handle.close()
+
+
+ def assess_markers_pa(trimmed_aln_dir, gnm_meta_txt, present_pct_cutoff_list, assess_summary_1_txt, assess_summary_2_txt):
+
+     trimmed_aln_file_re = '%s/*.aln' % trimmed_aln_dir
+     trimmed_aln_file_list = [os.path.basename(file_name) for file_name in glob.glob(trimmed_aln_file_re)]
+
+     # read in genome metadata
+     domain_to_gnm_dict = dict()
+     gnm_to_domain_dict = dict()
+     for each_gnm in open(gnm_meta_txt):
+         each_gnm_split = each_gnm.strip().split('\t')
+         gnm_id = each_gnm_split[0]
+         domain_name = each_gnm_split[1]
+         gnm_to_domain_dict[gnm_id] = domain_name
+
+         if domain_name not in domain_to_gnm_dict:
+             domain_to_gnm_dict[domain_name] = {gnm_id}
+         else:
+             domain_to_gnm_dict[domain_name].add(gnm_id)
+
+     assess_summary_1_txt_handle = open(assess_summary_1_txt, 'w')
+     assess_summary_2_txt_handle = open(assess_summary_2_txt, 'w')
+     gnm_to_marker_dict = dict()
+     marker_to_gnm_dict = dict()
+     cutoff_to_qualified_marker_dict = dict()
+     assess_summary_1_txt_handle.write('Marker\tArchaea\tEukaryota\n')
+     assess_summary_2_txt_handle.write('Marker\t%s\n' % '\t'.join([str(i) for i in present_pct_cutoff_list]))
+     for each_aln in trimmed_aln_file_list:
+         marker_id = each_aln.split('.aln')[0]
+         pwd_aln = '%s/%s' % (trimmed_aln_dir, each_aln)
+         gnm_set = set()
+         for each_seq in SeqIO.parse(pwd_aln, 'fasta'):
+             gnm_id = each_seq.id
+             gnm_set.add(gnm_id)
+             if gnm_id not in gnm_to_marker_dict:
+                 gnm_to_marker_dict[gnm_id] = {marker_id}
+             else:
+                 gnm_to_marker_dict[gnm_id].add(marker_id)
+         marker_to_gnm_dict[marker_id] = gnm_set
+
+         gnm_num_ar = 0
+         gnm_num_eu = 0
+         for each_g in gnm_set:
+             g_domain = gnm_to_domain_dict[each_g]
+             if g_domain == 'Archaea':
+                 gnm_num_ar += 1
+             if g_domain == 'Eukaryota':
+                 gnm_num_eu += 1
+         # percentages of the 133 archaeal and 27 eukaryotic genomes (hard-coded totals for this dataset)
+         gnm_pct_ar = float("{0:.2f}".format(gnm_num_ar / 133 * 100))
+         gnm_pct_eu = float("{0:.2f}".format(gnm_num_eu / 27 * 100))
+
+         # assessment
+         assessment_result_list = []
+         for present_pct_cutoff in present_pct_cutoff_list:
+             if (gnm_pct_ar >= present_pct_cutoff) and (gnm_pct_eu >= present_pct_cutoff):
+                 assessment_result_list.append('1')
+                 if str(present_pct_cutoff) not in cutoff_to_qualified_marker_dict:
+                     cutoff_to_qualified_marker_dict[str(present_pct_cutoff)] = [marker_id]
+                 else:
+                     cutoff_to_qualified_marker_dict[str(present_pct_cutoff)].append(marker_id)
+             else:
+                 assessment_result_list.append('0')
+         assess_summary_1_txt_handle.write('%s\t%s\t%s\n' % (marker_id, gnm_pct_ar, gnm_pct_eu))
+         assess_summary_2_txt_handle.write('%s\t%s\n' % (marker_id, '\t'.join(assessment_result_list)))
+
+     summary_list = [len(cutoff_to_qualified_marker_dict.get(str(i), [])) for i in present_pct_cutoff_list]
+     summary_list_str = [str(j) for j in summary_list]
+     assess_summary_2_txt_handle.write('Total\t%s\n' % ('\t'.join(summary_list_str)))
+     assess_summary_1_txt_handle.close()
+     assess_summary_2_txt_handle.close()
+
+     return cutoff_to_qualified_marker_dict, gnm_to_marker_dict, marker_to_gnm_dict
+
+
+ def read_in_assessment_pa_2_txt(assessment_pa_2_txt):
+
+     pa_pct_list = []
+     pa_pct_to_marker_dict = dict()
+     for each_marker in open(assessment_pa_2_txt):
+         each_marker_split = each_marker.strip().split('\t')
+         if each_marker.startswith('Marker\t'):
+             pa_pct_list = [int(i) for i in each_marker_split[1:]]
+
+             # initialize pa_pct_to_marker_dict
+             for pa_pct in pa_pct_list:
+                 pa_pct_to_marker_dict[pa_pct] = set()
+
+         elif not each_marker.startswith('Total\t'):
+             marker_id = each_marker_split[0]
+             for (pct, pa) in zip(pa_pct_list, each_marker_split[1:]):
+                 if pa == '1':
+                     pa_pct_to_marker_dict[pct].add(marker_id)
+
+     return pa_pct_to_marker_dict
+
+
+ def get_marker_set_dict(assessment_pa_2_txt, assessment_deltall_txt, deltall_keep_pct_list, min_marker_num):
+
+     # read in assessment_pa_2_txt
+     pa_pct_to_marker_dict = read_in_assessment_pa_2_txt(assessment_pa_2_txt)
+     # for each in pa_pct_to_marker_dict:
+     #     print('%s(%s)\t%s' % (each, len(pa_pct_to_marker_dict[each]), pa_pct_to_marker_dict[each]))
+
+     # store markers in a list sorted by their DeltaLL scores
+     marker_list_by_score = []
+     for each_marker in open(assessment_deltall_txt):
+         if not each_marker.startswith('Marker\t'):
+             each_marker_split = each_marker.strip().split('\t')
+             marker_id = each_marker_split[0]
+             marker_list_by_score.append(marker_id)
+
+     # get intersections
+     marker_set_dict = dict()
+     for each_keep_pct in deltall_keep_pct_list:
+         keep_num = round(len(marker_list_by_score) * each_keep_pct / 100)
+         if keep_num >= min_marker_num:
+             marker_to_keep = marker_list_by_score[:keep_num]
+             # print('deltall_keep_pct(top %s%s)(%s)\t%s' % (each_keep_pct, '%', keep_num, marker_to_keep))
+             for each_pa_pct in pa_pct_to_marker_dict:
+                 current_pa_pct_marker_set = pa_pct_to_marker_dict[each_pa_pct]
+                 marker_set_key = 'deltall%s_pa%s' % (each_keep_pct, each_pa_pct)
+                 marker_shared = set(marker_to_keep).intersection(current_pa_pct_marker_set)
+                 marker_set_dict[marker_set_key] = marker_shared
+
+     return marker_set_dict
+
+
+ def AssessMarker(assess_marker_wd, trimmed_aln_dir, gnm_group_txt, deltall_stdout_txt, present_pct_cutoff_list, deltall_keep_pct_list, min_marker_pct_per_gnm, min_marker_num, force_create_dir, catfasta2phyml_pl):
+
+     # define output file names
+     assess_summary_deltall_txt = '%s/assessment_deltall.txt' % assess_marker_wd
+     assess_summary_1_txt_by_marker = '%s/assessment_pa_1.txt' % assess_marker_wd
+     assess_summary_2_txt_by_marker = '%s/assessment_pa_2.txt' % assess_marker_wd
+     assess_summary_txt_by_genome = '%s/assessment_pa_by_genome.txt' % assess_marker_wd
+
+     # parse deltall stdout
+     parse_deltall_stdout(deltall_stdout_txt, assess_summary_deltall_txt)
+
+     # assess markers
+     cutoff_to_qualified_marker_dict, gnm_to_marker_dict, marker_to_gnm_dict = assess_markers_pa(trimmed_aln_dir, gnm_group_txt, present_pct_cutoff_list, assess_summary_1_txt_by_marker, assess_summary_2_txt_by_marker)
+
+     # write out qualified markers
+     for each_cutoff in cutoff_to_qualified_marker_dict:
+         qualified_m_list = sorted(cutoff_to_qualified_marker_dict[each_cutoff])
+         pwd_op_txt = '%s/assessment_pa_qualified_marker_%s.txt' % (assess_marker_wd, each_cutoff)
+         with open(pwd_op_txt, 'w') as pwd_op_txt_handle:
+             pwd_op_txt_handle.write('\n'.join(qualified_m_list))
+
+     # write out summary by genome (only genomes below the cutoff are listed)
+     assess_summary_txt_by_genome_handle = open(assess_summary_txt_by_genome, 'w')
+     assess_summary_txt_by_genome_handle.write('Cutoff\tGenome\tMarker_all\tMarker_qualified\tMarker_qualified_pct(cutoff:%s)\n' % min_marker_pct_per_gnm)
+     for each_cutoff in present_pct_cutoff_list:
+         current_cutoff_qualified_marker_list = cutoff_to_qualified_marker_dict.get(str(each_cutoff), [])
+         if len(current_cutoff_qualified_marker_list) > 0:
+             qualified_gnm_set = set()
+             for each_gnm in gnm_to_marker_dict:
+                 gnm_identified_marker_set = gnm_to_marker_dict[each_gnm]
+                 gnm_identified_marker_set_qualified = set()
+                 for identified_marker in gnm_identified_marker_set:
+                     if identified_marker in current_cutoff_qualified_marker_list:
+                         gnm_identified_marker_set_qualified.add(identified_marker)
+                 gnm_identified_marker_set_qualified_pct = len(gnm_identified_marker_set_qualified) * 100 / len(current_cutoff_qualified_marker_list)
+                 gnm_identified_marker_set_qualified_pct = float("{0:.2f}".format(gnm_identified_marker_set_qualified_pct))
+                 if gnm_identified_marker_set_qualified_pct >= min_marker_pct_per_gnm:
+                     qualified_gnm_set.add(each_gnm)
+                 else:
+                     assess_summary_txt_by_genome_handle.write('%s\t%s\t%s\t%s\t%s\n' % (each_cutoff, each_gnm, len(gnm_identified_marker_set), len(gnm_identified_marker_set_qualified), gnm_identified_marker_set_qualified_pct))
+             assess_summary_txt_by_genome_handle.write('\n')
+     assess_summary_txt_by_genome_handle.close()
+
+     # select marker sets and concatenate their alignments
+     marker_set_dict = get_marker_set_dict(assess_summary_2_txt_by_marker, assess_summary_deltall_txt, deltall_keep_pct_list, min_marker_num)
+     for each_marker_set in marker_set_dict:
+         current_marker_set = marker_set_dict[each_marker_set]
+         if len(current_marker_set) >= min_marker_num:
+
+             # print('%s\t%s\t%s' % (each_marker_set, len(current_marker_set), current_marker_set))
+             pwd_iqtree_dir = '%s/%s_iqtree_wd' % (assess_marker_wd, each_marker_set)
+             pwd_marker_id_txt = '%s/%s_marker_id.txt' % (pwd_iqtree_dir, each_marker_set)
+             pwd_aln_dir = '%s/%s_aln_trimmed' % (pwd_iqtree_dir, each_marker_set)
+
+             # copy marker alignments into the corresponding dir
+             if force_create_dir is True:
+                 if os.path.isdir(pwd_iqtree_dir) is True:
+                     os.system('rm -r %s' % pwd_iqtree_dir)
+             os.system('mkdir %s' % pwd_iqtree_dir)
+             os.system('mkdir %s' % pwd_aln_dir)
+
+             # write out marker ids
+             with open(pwd_marker_id_txt, 'w') as pwd_marker_id_txt_handle:
+                 pwd_marker_id_txt_handle.write('\n'.join(current_marker_set))
+
+             for each_aln in current_marker_set:
+                 pwd_aln = '%s/%s.aln' % (trimmed_aln_dir, each_aln)
+                 cp_cmd = 'cp %s %s/' % (pwd_aln, pwd_aln_dir)
+                 os.system(cp_cmd)
+
+             # concatenate alignments
+             pwd_concatenate_aln = '%s/%s_concatenated.phy' % (pwd_iqtree_dir, each_marker_set)
+             pwd_concatenate_aln_partitions = '%s/%s_partitions.txt' % (pwd_iqtree_dir, each_marker_set)
+             catfasta2phyml_cmd = 'perl %s --sequential --concatenate %s/*.aln > %s 2> %s' % (catfasta2phyml_pl, pwd_aln_dir, pwd_concatenate_aln, pwd_concatenate_aln_partitions)
+             # print(catfasta2phyml_cmd)
+             os.system(catfasta2phyml_cmd)
+
+             # get guide tree
+             get_guide_cmd = 'iqtree -s %s_concatenated.phy --prefix %s_guide_tree --seqtype AA -m LG -T 12 -B 1000 --alrt 1000' % (each_marker_set, each_marker_set)
+             # print(get_guide_cmd)
+
+             # run C60 + PMSF
+             c60_pmsf_cmd = 'iqtree -s %s_concatenated.phy --prefix %s --seqtype AA -m LG+G+F+C60 -T 12 -B 1000 --alrt 1000 -ft %s_guide_tree.treefile' % (each_marker_set, each_marker_set, each_marker_set)
+             # print(c60_pmsf_cmd)
+
+             # generate job script
+             pwd_js = '%s/js_%s_iqtree.sh' % (assess_marker_wd, each_marker_set)
+             with open(pwd_js, 'w') as pwd_js_handle:
+                 # pwd_js_handle.write('#!/bin/bash\n#SBATCH --nodelist cl007\n#SBATCH --ntasks 1\n#SBATCH --cpus-per-task 12\n\n')
+                 pwd_js_handle.write('#!/bin/bash\n#SBATCH\n#SBATCH --ntasks 1\n#SBATCH --cpus-per-task 12\n\n')
+                 pwd_js_handle.write('cd %s_iqtree_wd\n' % each_marker_set)
+                 pwd_js_handle.write(get_guide_cmd + '\n')
+                 pwd_js_handle.write(c60_pmsf_cmd + '\n')
+
+
+ # inputs
+ assess_marker_wd = '/home-user/wzsong/DateArTree/04_dating_Williams_2017_45_arCOG_assess_marker'
+ trimmed_aln_dir = '/home-user/wzsong/DateArTree/02_identify_marker_gene_Williams_2017_45_arCOG/best_hit_by_marker_5_aln_trimmed'
+ gnm_group_txt = '/home-user/wzsong/DateArTree/01_genome_selection/gnm_metadata.txt'
+ deltall_stdout_txt = '/home-user/wzsong/DateArTree/02_identify_marker_gene_Williams_2017_45_arCOG_DeltaLL/nohup.out'
+ present_pct_cutoff_list = [25, 50, 75, 85, 100]
+ deltall_keep_pct_list = [25, 50, 75, 100]
+ min_marker_pct_per_gnm = 75
+ min_marker_num = 20
+ force_create_dir = True
+ # catfasta2phyml_pl = '/Users/songweizhi/PycharmProjects/Sponge_Hologenome/Scripts/catfasta2phyml.pl'
+ catfasta2phyml_pl = '/home-user/wzsong/Scripts/catfasta2phyml.pl'
+
+ AssessMarker(assess_marker_wd, trimmed_aln_dir, gnm_group_txt, deltall_stdout_txt, present_pct_cutoff_list, deltall_keep_pct_list, min_marker_pct_per_gnm, min_marker_num, force_create_dir, catfasta2phyml_pl)
+
+
+ '''
+ Note
+ 1. Extra genomes in gnm_metadata.txt won't affect the assessment results.
+ 2. Genomes that cannot be found in gnm_metadata.txt will trigger an error.
+ 3. Alignments in {trimmed_aln_dir} need to be trimmed before assessment.
+ 4. Sequences in the MSAs need to be named by genome id.
+ '''