treesak 1.51.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of treesak might be problematic. Click here for more details.

Files changed (125) hide show
  1. TreeSAK/ALE.py +63 -0
  2. TreeSAK/ALE1.py +268 -0
  3. TreeSAK/ALE2.py +168 -0
  4. TreeSAK/ALE2RTC.py +30 -0
  5. TreeSAK/ALE3.py +205 -0
  6. TreeSAK/ALE4.py +636 -0
  7. TreeSAK/ALE5.py +210 -0
  8. TreeSAK/ALE6.py +401 -0
  9. TreeSAK/ALE7.py +126 -0
  10. TreeSAK/ALE_backup.py +1081 -0
  11. TreeSAK/AssessCVG.py +128 -0
  12. TreeSAK/AssessMarker.py +306 -0
  13. TreeSAK/AssessMarkerDeltaLL.py +257 -0
  14. TreeSAK/AssessMarkerPA.py +317 -0
  15. TreeSAK/AssessPB.py +130 -0
  16. TreeSAK/BMGE.jar +0 -0
  17. TreeSAK/BMGE.py +49 -0
  18. TreeSAK/CompareMCMC.py +138 -0
  19. TreeSAK/ConcateMSA.py +111 -0
  20. TreeSAK/ConvertMSA.py +135 -0
  21. TreeSAK/Dir.rb +82 -0
  22. TreeSAK/ExtractMarkerSeq.py +263 -0
  23. TreeSAK/FastRoot.py +1175 -0
  24. TreeSAK/FastRoot_backup.py +1122 -0
  25. TreeSAK/FigTree.py +34 -0
  26. TreeSAK/GTDB_tree.py +76 -0
  27. TreeSAK/GeneTree.py +142 -0
  28. TreeSAK/KEGG_Luo17.py +807 -0
  29. TreeSAK/LcaToLeaves.py +66 -0
  30. TreeSAK/MarkerRef2Tree.py +616 -0
  31. TreeSAK/MarkerRef2Tree_backup.py +628 -0
  32. TreeSAK/MarkerSeq2Tree.py +290 -0
  33. TreeSAK/MarkerSeq2Tree_backup.py +259 -0
  34. TreeSAK/ModifyTopo.py +116 -0
  35. TreeSAK/Newick_tree_plotter.py +79 -0
  36. TreeSAK/OMA.py +170 -0
  37. TreeSAK/OMA2.py +212 -0
  38. TreeSAK/OneLineAln.py +50 -0
  39. TreeSAK/PB.py +155 -0
  40. TreeSAK/PMSF.py +106 -0
  41. TreeSAK/PhyloBiAssoc.R +84 -0
  42. TreeSAK/PhyloBiAssoc.py +167 -0
  43. TreeSAK/PlotMCMC.py +41 -0
  44. TreeSAK/PlotMcmcNode.py +152 -0
  45. TreeSAK/PlotMcmcNode_old.py +252 -0
  46. TreeSAK/RootTree.py +101 -0
  47. TreeSAK/RootTreeGTDB214.py +288 -0
  48. TreeSAK/RootTreeGTDB220.py +300 -0
  49. TreeSAK/RootTreeGTDB226.py +300 -0
  50. TreeSAK/SequentialDating.py +16 -0
  51. TreeSAK/SingleAleHGT.py +157 -0
  52. TreeSAK/SingleLinePhy.py +50 -0
  53. TreeSAK/SliceMSA.py +142 -0
  54. TreeSAK/SplitScore.py +19 -0
  55. TreeSAK/SplitScore1.py +178 -0
  56. TreeSAK/SplitScore1OMA.py +148 -0
  57. TreeSAK/SplitScore2.py +597 -0
  58. TreeSAK/TaxaCountStats.R +256 -0
  59. TreeSAK/TaxonTree.py +47 -0
  60. TreeSAK/TreeSAK_config.py +32 -0
  61. TreeSAK/VERSION +158 -0
  62. TreeSAK/VisHPD95.R +45 -0
  63. TreeSAK/VisHPD95.py +200 -0
  64. TreeSAK/__init__.py +0 -0
  65. TreeSAK/ale_parser.py +74 -0
  66. TreeSAK/ale_splitter.py +63 -0
  67. TreeSAK/alignment_pruner.pl +1471 -0
  68. TreeSAK/assessOG.py +45 -0
  69. TreeSAK/catfasta2phy.py +140 -0
  70. TreeSAK/cogTree.py +185 -0
  71. TreeSAK/compare_trees.R +30 -0
  72. TreeSAK/compare_trees.py +255 -0
  73. TreeSAK/dating.py +264 -0
  74. TreeSAK/dating_ss.py +361 -0
  75. TreeSAK/deltall.py +82 -0
  76. TreeSAK/do_rrtc.rb +464 -0
  77. TreeSAK/fa2phy.py +42 -0
  78. TreeSAK/format_leaf_name.py +70 -0
  79. TreeSAK/gap_stats.py +38 -0
  80. TreeSAK/get_SCG_tree.py +742 -0
  81. TreeSAK/get_arCOG_seq.py +97 -0
  82. TreeSAK/global_functions.py +222 -0
  83. TreeSAK/gnm_leaves.py +43 -0
  84. TreeSAK/iTOL.py +791 -0
  85. TreeSAK/iTOL_gene_tree.py +80 -0
  86. TreeSAK/itol_msa_stats.py +56 -0
  87. TreeSAK/keep_highest_rrtc.py +37 -0
  88. TreeSAK/koTree.py +194 -0
  89. TreeSAK/label_tree.R +75 -0
  90. TreeSAK/label_tree.py +121 -0
  91. TreeSAK/mad.py +708 -0
  92. TreeSAK/mcmc2tree.py +58 -0
  93. TreeSAK/mcmcTC copy.py +92 -0
  94. TreeSAK/mcmcTC.py +104 -0
  95. TreeSAK/mcmctree_vs_reltime.R +44 -0
  96. TreeSAK/mcmctree_vs_reltime.py +252 -0
  97. TreeSAK/merge_pdf.py +32 -0
  98. TreeSAK/pRTC.py +56 -0
  99. TreeSAK/parse_mcmctree.py +198 -0
  100. TreeSAK/parse_reltime.py +141 -0
  101. TreeSAK/phy2fa.py +37 -0
  102. TreeSAK/plot_distruibution_th.py +165 -0
  103. TreeSAK/prep_mcmctree_ctl.py +92 -0
  104. TreeSAK/print_leaves.py +32 -0
  105. TreeSAK/pruneMSA.py +63 -0
  106. TreeSAK/recode.py +73 -0
  107. TreeSAK/remove_bias.R +112 -0
  108. TreeSAK/rename_leaves.py +77 -0
  109. TreeSAK/replace_clade.py +55 -0
  110. TreeSAK/root_with_out_group.py +84 -0
  111. TreeSAK/run_TaxaCountStats_R_s1.py +455 -0
  112. TreeSAK/subsample_drep_gnms.py +74 -0
  113. TreeSAK/subset.py +69 -0
  114. TreeSAK/subset_tree_stupid_old_way.py +193 -0
  115. TreeSAK/supertree.py +330 -0
  116. TreeSAK/tmp_1.py +19 -0
  117. TreeSAK/tmp_2.py +19 -0
  118. TreeSAK/tmp_3.py +120 -0
  119. TreeSAK/weighted_rand.rb +23 -0
  120. treesak-1.51.2.data/scripts/TreeSAK +950 -0
  121. treesak-1.51.2.dist-info/LICENSE +674 -0
  122. treesak-1.51.2.dist-info/METADATA +27 -0
  123. treesak-1.51.2.dist-info/RECORD +125 -0
  124. treesak-1.51.2.dist-info/WHEEL +5 -0
  125. treesak-1.51.2.dist-info/top_level.txt +1 -0
TreeSAK/AssessPB.py ADDED
@@ -0,0 +1,130 @@
1
+ import os
2
+ import argparse
3
+
4
+
5
+ AssessPB_usage = '''
6
+ ====================== AssessPB example commands ======================
7
+
8
+ # Dependency: bpcomp and tracecomp (from PhyloBayes-MPI)
9
+
10
+ export OMPI_MCA_btl=^openib
11
+
12
+ TreeSAK AssessPB -c1 c1dir/c1 -c2 c2dir/c2
13
+ TreeSAK AssessPB -c1 c1dir/c1 -c2 c2dir/c2 -c3 c3dir/c3
14
+ TreeSAK AssessPB -c1 c1dir/c1 -c2 c2dir/c2 -c3 c3dir/c3 -c4 c4dir/c4
15
+ TreeSAK AssessPB -cdir chain_dir
16
+
17
+ # This is a wrapper for:
18
+ bpcomp -x 1000 10 c1 c2
19
+ bpcomp -x 1000 10 c1 c2 c3 c4
20
+ tracecomp -x 1000 c1 c2
21
+ tracecomp -x 1000 c1 c2 c3 c4
22
+
23
+ =======================================================================
24
+ '''
25
+
26
+
27
def compare2chains(chain_1, chain_2, chain_3, chain_4, burn_in, sample_interval, with_bpcomp, with_tracecomp, op_dir, cmd_txt):
    """Build, record and optionally run the bpcomp/tracecomp commands for up to four chains.

    Args:
        chain_1, chain_2: chain prefixes handed to bpcomp/tracecomp.
        chain_3, chain_4: optional extra chains; any that is None is left out.
        burn_in: value placed after ``-x`` for both programs.
        sample_interval: sampling interval placed after the burn-in (bpcomp only).
        with_bpcomp: run the bpcomp command when truthy.
        with_tracecomp: run the tracecomp command when truthy.
        op_dir: output folder; used as prefix for the ``-o`` option.
        cmd_txt: file the two commands are appended to.

    The commands are always appended to ``cmd_txt``; they are only executed
    when the matching ``with_*`` flag is set.
    """
    import shlex  # local import: the module header only brings in os and argparse

    # bpcomp:    -x <burnin> [<every> <until>]. default burnin = 10 percent of the chain
    # tracecomp: -x <burnin> [<every> <until>]. default burnin = 20 percent of the chain

    # Every supplied chain takes part in the comparison. The previous
    # implementation silently dropped chain_4 when chain_3 was missing.
    provided_chains = [c for c in (chain_1, chain_2, chain_3, chain_4) if c is not None]

    # quote every piece so that paths containing spaces or shell metacharacters survive os.system
    chains_str = ' '.join(shlex.quote(str(c)) for c in provided_chains)
    burn_in_str = shlex.quote(str(burn_in))
    interval_str = shlex.quote(str(sample_interval))

    bpcomp_cmd = 'bpcomp -o %s -x %s %s %s' % (shlex.quote('%s/bpcomp' % op_dir), burn_in_str, interval_str, chains_str)
    tracecomp_cmd = 'tracecomp -o %s -x %s %s' % (shlex.quote('%s/tracecomp' % op_dir), burn_in_str, chains_str)

    # write out commands
    with open(cmd_txt, 'a') as cmd_txt_handle:
        cmd_txt_handle.write(bpcomp_cmd + '\n')
        cmd_txt_handle.write(tracecomp_cmd + '\n')

    # execute commands
    if with_bpcomp:
        print()
        print('====================== bpcomp ======================')
        print()
        # flush so our own output is not reordered after the child's output
        print(bpcomp_cmd, flush=True)
        os.system(bpcomp_cmd)
        print('Guideline')
        print('1. maxdiff < 0.1: good run.')
        print('2. maxdiff < 0.3: acceptable: gives a good qualitative picture of the posterior consensus.')
        print('3. 0.3 < maxdiff < 1: the sample is not yet sufficiently large and have not converged, but on right track.')
        print('4. if maxdiff = 1 even after 10,000 points: at least one run stuck in a local maximum.')
        print()

    if with_tracecomp:
        print('==================== tracecomp ====================')
        print()
        print(tracecomp_cmd)
        print(flush=True)
        os.system(tracecomp_cmd)
        print()
        print('Guideline')
        print('1. rel diff < 0.1 and minimum effective size > 300: good run.')
        print('2. rel diff < 0.3 and minimum effective size > 50: acceptable run.')
        print()

    print('====================================================')
76
+
77
+
78
def AssessPB(args):
    """Entry point of the AssessPB module: validate the options, prepare the output folder, compare chains.

    Args:
        args: dict with the keys 'c1', 'c2', 'c3', 'c4' (chains), 'cdir'
            (chain folder), 'bi' (burn-in), 'si' (sample interval), 'o'
            (output folder) and 'f' (force overwrite).

    Raises:
        SystemExit: when the chain options are inconsistent, or when the
            output folder exists and overwriting was not requested.
    """
    import shutil  # local imports: the module header only brings in os and argparse
    import sys

    chain_1 = args['c1']
    chain_2 = args['c2']
    chain_3 = args['c3']
    chain_4 = args['c4']
    chain_dir = args['cdir']
    burn_in = args['bi']
    sample_interval = args['si']
    op_dir = args['o']
    force_overwrite = args['f']
    with_bpcomp = True
    with_tracecomp = True

    cmd_txt = '%s/cmds.txt' % op_dir

    chains_given = (chain_1 is not None) and (chain_2 is not None) and (chain_dir is None)
    chain_dir_given = (chain_1 is None) and (chain_2 is None) and (chain_dir is not None)

    # Validate the options BEFORE touching the file system, so that a typo on
    # the command line combined with -f cannot wipe an existing result folder.
    if not (chains_given or chain_dir_given):
        print('Please compare either no more than four chains (specified by -c1, -c2, -c3 and -c4) or multiple chains provided within -cdir')
        print('Program exited!')
        sys.exit(1)

    if chain_dir_given:
        # not implemented yet; nothing is written, so the output folder is left alone
        print('Compare multiple chains')
        print('Function to be added!')
        print('Program exited!')
        return

    # create output dir (without going through a shell, so any path is safe)
    if os.path.isdir(op_dir):
        if force_overwrite:
            shutil.rmtree(op_dir)
        else:
            print('output folder already exist, program exited!')
            sys.exit(1)
    os.makedirs(op_dir)

    compare2chains(chain_1, chain_2, chain_3, chain_4, burn_in, sample_interval, with_bpcomp, with_tracecomp, op_dir, cmd_txt)
115
+
116
+
117
if __name__ == '__main__':

    # command-line options, registered in the order they are listed by --help
    option_specs = [
        ('-c1', dict(required=False, default=None, help='chain 1')),
        ('-c2', dict(required=False, default=None, help='chain 2')),
        ('-c3', dict(required=False, default=None, help='chain 3')),
        ('-c4', dict(required=False, default=None, help='chain 4')),
        ('-cdir', dict(required=False, default=None, help='chain folder')),
        ('-bi', dict(required=False, default=1000, help='burn-in, default: 1000')),
        ('-si', dict(required=False, default=10, help='sample interval, default: 10')),
        ('-o', dict(required=True, default=None, help='output directory')),
        ('-f', dict(required=False, action="store_true", help='force overwrite')),
    ]

    AssessPB_parser = argparse.ArgumentParser()
    for option_flag, option_settings in option_specs:
        AssessPB_parser.add_argument(option_flag, **option_settings)

    # the worker function expects a plain dict keyed by option name
    args = vars(AssessPB_parser.parse_args())
    AssessPB(args)
TreeSAK/BMGE.jar ADDED
Binary file
TreeSAK/BMGE.py ADDED
@@ -0,0 +1,49 @@
1
+ import os
2
+ import argparse
3
+
4
+
5
+ BMGE_usage = '''
6
+ ======================= BMGE example commands =======================
7
+
8
+ # require: java
9
+
10
+ TreeSAK BMGE -p demo -i input.aln -m BLOSUM30 -esc 0.55
11
+
12
+ # Settings for calculating split score (Nina Dombrowski):
13
+ # -t AA -m BLOSUM30 -h 0.55
14
+
15
+ =====================================================================
16
+ '''
17
+
18
+
19
def BMGE(args):
    """Trim an alignment with the BMGE.jar that sits next to this module.

    Args:
        args: dict with the keys 'p' (output prefix), 'i' (input MSA),
            'm' (trim model passed to ``-m``) and 'esc' (entropy score
            cutoff passed to ``-h``).

    The trimmed alignment is written to ``<prefix>.BMGE.fasta``. The command
    is printed before it is run; a non-zero exit status is reported instead
    of the usual 'Done!'.
    """
    import shlex  # local import: the module header only brings in os and argparse

    op_prefix = args['p']
    msa_in = args['i']
    trim_model = args['m']
    entropy_score_cutoff = args['esc']

    # define file name
    msa_out_fasta = '%s.BMGE.fasta' % op_prefix

    # BMGE.jar is located in the same folder as this file
    current_file_path = os.path.dirname(os.path.realpath(__file__))
    pwd_bmge_jar = os.path.join(current_file_path, 'BMGE.jar')

    # quote every argument: install locations and user paths may contain spaces
    cmd_args = (pwd_bmge_jar, msa_in, trim_model, entropy_score_cutoff, msa_out_fasta)
    bmge_cmd = 'java -jar %s -i %s -m %s -t AA -h %s -of %s' % tuple(shlex.quote(str(a)) for a in cmd_args)

    # flush so the command line is shown before anything java prints
    print(bmge_cmd, flush=True)
    exit_status = os.system(bmge_cmd)

    if exit_status != 0:
        print('BMGE command failed (exit status: %s), please check the messages above!' % exit_status)
        return

    print('Done!')
39
+
40
+
41
if __name__ == '__main__':

    # command-line options, registered in the order they are listed by --help
    option_specs = [
        ('-p', dict(required=True, help='output prefix')),
        ('-i', dict(required=True, help='input MSA')),
        ('-m', dict(required=False, default='BLOSUM30', help='trim model, default: BLOSUM30')),
        ('-esc', dict(required=False, default='0.55', help='entropy score cutoff, default: 0.55')),
    ]

    BMGE_parser = argparse.ArgumentParser()
    for option_flag, option_settings in option_specs:
        BMGE_parser.add_argument(option_flag, **option_settings)

    # the worker function expects a plain dict keyed by option name
    args = vars(BMGE_parser.parse_args())
    BMGE(args)
TreeSAK/CompareMCMC.py ADDED
@@ -0,0 +1,138 @@
1
+ import os
2
+ import argparse
3
+ import arviz as az
4
+ import pandas as pd
5
+ import matplotlib as mpl
6
+ mpl.use('Agg')
7
+ import matplotlib.pyplot as plt
8
+ from matplotlib.pyplot import figure
9
+
10
+
11
+ CompareMCMC_usage = '''
12
+ ====================================== CompareMCMC example commands ======================================
13
+
14
+ TreeSAK CompareMCMC -mx IR_mcmc.txt -my AR_mcmc.txt -lx IR -ly AR -o convergence_plot.png -max 40 -fs 12
15
+
16
+ cd /Users/songweizhi/Desktop
17
+ TreeSAK CompareMCMC -mx /Users/songweizhi/Desktop/Sponge_r220/6_dating/MCMCTree/dating_outputs/topo2p10_clock3_nsample250000_run1_mcmc.txt -my /Users/songweizhi/Desktop/Sponge_r220/6_dating/MCMCTree/dating_outputs/topo2p10_clock3_nsample250000_run2_mcmc.txt -lx IR -ly AR -o convergence_plot.png -max 40 -fs 12
18
+
19
+ ==========================================================================================================
20
+ '''
21
+
22
+
23
def sep_path_basename_ext(file_in):
    """Split a path into (folder, basename without extension, extension).

    The folder is reported as '.' when ``file_in`` has no directory part.
    """
    parent_dir = os.path.dirname(file_in) or '.'
    stem, extension = os.path.splitext(os.path.basename(file_in))
    return parent_dir, stem, extension
29
+
30
+
31
def CompareMCMC(args):
    """Plot the per-column means of two mcmc.txt tables against each other.

    Every column of the x table, except 'mu', 'sigma2' and 'lnL', becomes one
    point: its mean in the x table versus its mean in the y table, with the
    95% HDI of each table drawn as error bars. A dashed diagonal spanning the
    plotted value range is added for reference.

    Args:
        args: dict with the keys 'mx' / 'my' (input tables), 'lx' / 'ly'
            (axis labels, file basenames are used when None), 'o' (output
            figure), 'max' (upper axis limit or None) and 'fs' (font size).

    Raises:
        KeyError: if a plotted column of the x table is missing in the y table.
    """
    mcmc_txt_x = args['mx']
    mcmc_txt_y = args['my']
    label_x = args['lx']
    label_y = args['ly']
    pwd_figure = args['o']
    max_axis_value = args['max']
    label_fs = args['fs']

    # fall back to the input file basenames when no axis label was given
    if label_x is None:
        label_x = os.path.splitext(os.path.basename(mcmc_txt_x))[0]
    if label_y is None:
        label_y = os.path.splitext(os.path.basename(mcmc_txt_y))[0]

    # read in dataframe
    df_x = pd.read_table(mcmc_txt_x, index_col=0)
    df_y = pd.read_table(mcmc_txt_y, index_col=0)

    # Mean value of each column. Indexing the resulting Series replaces the
    # former .iteritems() loops; iteritems() was removed in pandas 2.0.
    mean_x = df_x.mean()
    mean_y = df_y.mean()

    # these columns are excluded from the plot
    skipped_cols = ('mu', 'sigma2', 'lnL')

    num_list_x = []
    num_list_y = []
    x_err_l = []
    x_err_r = []
    y_err_l = []
    y_err_u = []
    max_value = 0
    min_value = 100000000000000
    for col_name in df_x.columns:
        if col_name in skipped_cols:
            continue

        x_value = mean_x[col_name]
        y_value = mean_y[col_name]

        # CI95 (highest density interval), computed only for plotted columns
        x_range = az.hdi(df_x[col_name].values, hdi_prob=0.95)
        y_range = az.hdi(df_y[col_name].values, hdi_prob=0.95)

        num_list_x.append(x_value)
        num_list_y.append(y_value)

        # errorbar() expects distances from the point, not absolute bounds
        x_err_l.append(abs(x_value - x_range[0]))
        x_err_r.append(abs(x_range[1] - x_value))
        y_err_l.append(abs(y_value - y_range[0]))
        y_err_u.append(abs(y_range[1] - y_value))

        # track the overall value range, it defines the extent of the diagonal
        current_values = (x_value, y_value, x_range[0], x_range[1], y_range[0], y_range[1])
        current_max = max(current_values)
        current_min = min(current_values)
        if current_max > max_value:
            max_value = current_max
        if current_min < min_value:
            min_value = current_min

    figure(figsize=(6, 6), dpi=300)
    plt.plot([min_value, max_value], [min_value, max_value], color='black', linestyle='dashed', linewidth=1, alpha=0.5)
    plt.scatter(num_list_x, num_list_y, s=0)
    plt.errorbar(num_list_x, num_list_y, xerr=[x_err_l, x_err_r], yerr=[y_err_l, y_err_u],
                 ls='none', ecolor='skyblue', elinewidth=1, alpha=0.5)

    if max_axis_value is not None:
        plt.xlim([0, max_axis_value])
        plt.ylim([0, max_axis_value])

    # Set the font size of xticks and yticks
    plt.xticks(fontsize=label_fs)
    plt.yticks(fontsize=label_fs)
    plt.xlabel(label_x, fontsize=label_fs)
    plt.ylabel(label_y, fontsize=label_fs)

    # write out
    plt.tight_layout()
    plt.savefig(pwd_figure)
    plt.close()

    print('Plot exported to %s, done!' % pwd_figure)
124
+
125
+
126
if __name__ == '__main__':

    # command-line options, registered in the order they are listed by --help
    option_specs = [
        ('-mx', dict(required=True, help='mcmc.txt for x axis')),
        ('-my', dict(required=True, help='mcmc.txt for y axis')),
        ('-lx', dict(required=False, default=None, help='label for x axis')),
        ('-ly', dict(required=False, default=None, help='label for y axis')),
        ('-max', dict(required=False, default=None, type=int, help='maximum axis value')),
        ('-fs', dict(required=False, default=16, type=int, help='label font size, default: 16')),
        ('-o', dict(required=True, help='output plot')),
    ]

    parser = argparse.ArgumentParser()
    for option_flag, option_settings in option_specs:
        parser.add_argument(option_flag, **option_settings)

    # the worker function expects a plain dict keyed by option name
    args = vars(parser.parse_args())
    CompareMCMC(args)
TreeSAK/ConcateMSA.py ADDED
@@ -0,0 +1,111 @@
1
+ import os
2
+ import glob
3
+ import argparse
4
+ from Bio import SeqIO
5
+ from Bio import AlignIO
6
+
7
+
8
+ ConcateMSA_usage = '''
9
+ ================= ConcateMSA example commands =================
10
+
11
+ TreeSAK ConcateMSA -i aln -x aln -p concatenated -gene2gnm
12
+
13
+ # output file include:
14
+ concatenated.fasta
15
+ concatenated.phylip
16
+ concatenated.partition.txt
17
+
18
+ ===============================================================
19
+ '''
20
+
21
+
22
def ConcateMSA(args):
    """Concatenate all alignments of a folder into one alignment plus a partition file.

    Args:
        args: dict with the keys 'i' (MSA folder), 'x' (file extension),
            'p' (output prefix) and 'gene2gnm' (when True, the part after
            the last '_' is removed from every sequence id).

    Writes ``<prefix>.fasta``, ``<prefix>.phylip`` and
    ``<prefix>.partition.txt``. Files are concatenated in sorted file name
    order; an id missing from an alignment is filled with gaps of that
    alignment's length.

    Raises:
        SystemExit: when no input file is found, or when the sequences of one
            input file do not all have the same length.
    """
    import sys  # local import: not part of the module header

    msa_dir = args['i']
    msa_ext = args['x']
    op_prefix = args['p']
    gene2gnm = args['gene2gnm']

    concatenated_msa_phy = '%s.phylip' % op_prefix
    concatenated_msa_fasta = '%s.fasta' % op_prefix
    partition_file = '%s.partition.txt' % op_prefix

    msa_file_re = '%s/*.%s' % (msa_dir, msa_ext)
    msa_file_list_sorted = sorted(os.path.basename(file_name) for file_name in glob.glob(msa_file_re))

    # nothing to concatenate: stop before any output file is created
    if not msa_file_list_sorted:
        print('No file with extension %s found in %s, program exited!' % (msa_ext, msa_dir))
        sys.exit(1)

    # Read every alignment exactly once. Sequences and lengths are kept in
    # lists parallel to msa_file_list_sorted, so nothing depends on an id
    # derived from the file name.
    complete_gnm_set = set()
    msa_seq_dict_list = []
    msa_len_list = []
    for each_msa_file in msa_file_list_sorted:
        pwd_current_msa = '%s/%s' % (msa_dir, each_msa_file)
        current_msa_len = 0
        current_msa_len_set = set()
        current_msa_seq_dict = dict()
        for each_seq in SeqIO.parse(pwd_current_msa, 'fasta'):
            seq_id = each_seq.id
            if gene2gnm is True:
                seq_id = '_'.join(seq_id.split('_')[:-1])
            complete_gnm_set.add(seq_id)
            current_msa_seq_dict[seq_id] = str(each_seq.seq)
            current_msa_len = len(each_seq.seq)
            current_msa_len_set.add(current_msa_len)

        if len(current_msa_len_set) != 1:
            print('Sequences with different length were found in %s, program exited!' % each_msa_file)
            sys.exit(1)

        msa_seq_dict_list.append(current_msa_seq_dict)
        msa_len_list.append(current_msa_len)

    complete_gnm_list_sorted = sorted(complete_gnm_set)

    # write out concatenated msa
    with open(concatenated_msa_fasta, 'w') as concatenated_msa_handle:
        for each_gnm in complete_gnm_list_sorted:
            # an id absent from an alignment contributes a run of gaps of that alignment's length
            concatenated_seq = ''.join(
                seq_dict.get(each_gnm, msa_len * '-')
                for seq_dict, msa_len in zip(msa_seq_dict_list, msa_len_list))
            concatenated_msa_handle.write('>%s\n' % each_gnm)
            concatenated_msa_handle.write('%s\n' % concatenated_seq)

    # write out partition file (1-based, inclusive coordinates)
    end_pos = 0
    with open(partition_file, 'w') as partition_file_handle:
        for each_m, current_m_len in zip(msa_file_list_sorted, msa_len_list):
            partition_file_handle.write('%s = %s-%s\n' % (each_m, (end_pos + 1), (end_pos + current_m_len)))
            end_pos += current_m_len

    # convert msa in fasta to phy
    AlignIO.convert(concatenated_msa_fasta, 'fasta', concatenated_msa_phy, 'phylip-relaxed')
98
+
99
+
100
+
101
if __name__ == '__main__':

    # command-line options, registered in the order they are listed by --help
    option_specs = [
        ('-i', dict(required=True, help='input MSA folder')),
        ('-x', dict(required=False, default='aln', help='input file extension')),
        ('-p', dict(required=True, help='output prefix')),
        ('-gene2gnm', dict(required=False, action="store_true", help='gene id to gnm id, split sequence id before the last _')),
    ]

    parser = argparse.ArgumentParser()
    for option_flag, option_settings in option_specs:
        parser.add_argument(option_flag, **option_settings)

    # the worker function expects a plain dict keyed by option name
    args = vars(parser.parse_args())
    ConcateMSA(args)
TreeSAK/ConvertMSA.py ADDED
@@ -0,0 +1,135 @@
1
+ import os
2
+ import glob
3
+ import argparse
4
+ from Bio import SeqIO
5
+ from Bio import AlignIO
6
+
7
+
8
+ ConvertMSA_usage = '''
9
+ ================================= ConvertMSA example commands =================================
10
+
11
+ # phylip to fasta
12
+ TreeSAK ConvertMSA -i concatenated.phy -fi phylip-relaxed -o concatenated.fasta -fo fasta
13
+ TreeSAK ConvertMSA -i phy_files -fi phylip-relaxed -xi phy -o MSA_in_fasta -fo fasta -xo fa
14
+
15
+ # examples of alignment format (https://biopython.org/wiki/AlignIO):
16
+ fasta, phylip, phylip-relaxed, phylip-sequential, clustal
17
+
18
+ ===============================================================================================
19
+ '''
20
+
21
+
22
def sep_path_basename_ext(file_in):
    """Return (folder, basename, extension) for ``file_in``; the folder defaults to '.'."""

    # split off the folder first, then the extension from what is left
    head, tail = os.path.split(file_in)
    stem, suffix = os.path.splitext(tail)

    return (head if head != '' else '.'), stem, suffix
33
+
34
+
35
def _convert_one_msa(pwd_in, format_in, pwd_out, pwd_out_tmp, format_out, one_line, no_gap):
    """Convert a single alignment file, optionally rewriting it as one-line and/or gap-free FASTA."""
    if (one_line is False) and (no_gap is False):
        AlignIO.convert(pwd_in, format_in, pwd_out, format_out)
        return

    # convert to a temporary file first, then rewrite it record by record
    AlignIO.convert(pwd_in, format_in, pwd_out_tmp, format_out)
    try:
        with open(pwd_out, 'w') as pwd_out_handle:
            for each_seq in SeqIO.parse(pwd_out_tmp, 'fasta'):
                seq_sequence = str(each_seq.seq)
                if no_gap is True:
                    seq_sequence = seq_sequence.replace('-', '')
                    # a sequence made of gaps only would become an empty record, skip it
                    if len(seq_sequence) == 0:
                        continue
                pwd_out_handle.write('>%s\n' % each_seq.id)
                pwd_out_handle.write('%s\n' % seq_sequence)
    finally:
        # remove the temporary file even if rewriting failed
        os.remove(pwd_out_tmp)


def ConvertMSA(args):
    """Convert one alignment file, or every matching file of a folder, to another format.

    Args:
        args: dict with the keys 'i' (input file or folder), 'xi' (input
            extension, folder mode), 'fi' (input format), 'o' (output file or
            folder), 'xo' (output extension, folder mode), 'fo' (output
            format), 'oneline', 'nogap' (both only valid when 'fo' is fasta)
            and 'f' (overwrite an existing output folder).

    Raises:
        SystemExit: on an invalid option combination, when the input folder
            holds no matching file, or when the output folder cannot be used.
    """
    import shutil  # local imports: not part of the module header
    import sys

    aln_in = args['i']
    aln_in_ext = args['xi']
    aln_in_format = args['fi']
    aln_out = args['o']
    aln_out_ext = args['xo']
    aln_out_format = args['fo']
    one_line = args['oneline']
    no_gap = args['nogap']
    force_overwriting = args['f']

    if ((one_line is True) or (no_gap is True)) and (aln_out_format != 'fasta'):
        print('Please provide "-oneline" and/or "-nogap" only if "-fo" is fasta')
        sys.exit(1)

    if os.path.isfile(aln_in) is True:
        _convert_one_msa(aln_in, aln_in_format, aln_out, aln_out + '.tmp', aln_out_format, one_line, no_gap)
        print('Done!')

    elif os.path.isdir(aln_in) is True:
        aln_in_re = '%s/*.%s' % (aln_in, aln_in_ext)
        aln_in_list = [os.path.basename(file_name) for file_name in glob.glob(aln_in_re)]

        # check input
        if len(aln_in_list) == 0:
            print('Input file not detected, program exited!')
            sys.exit(1)

        # check output folder
        if os.path.isdir(aln_out) is True:
            # never let -f wipe the folder the input is read from
            if os.path.samefile(aln_in, aln_out):
                print('Output folder is the same as input folder, program exited!')
                sys.exit(1)
            if force_overwriting is True:
                shutil.rmtree(aln_out)
            else:
                print('Output folder already exist, program exited!')
                sys.exit(1)
        os.makedirs(aln_out)

        # convert
        for each_aln_in in aln_in_list:
            aln_in_basename = os.path.splitext(each_aln_in)[0]
            pwd_aln_in = '%s/%s' % (aln_in, each_aln_in)
            pwd_aln_out = '%s/%s.%s' % (aln_out, aln_in_basename, aln_out_ext)
            pwd_aln_out_tmp = '%s/%s_tmp.%s' % (aln_out, aln_in_basename, aln_out_ext)
            _convert_one_msa(pwd_aln_in, aln_in_format, pwd_aln_out, pwd_aln_out_tmp, aln_out_format, one_line, no_gap)
        print('Done!')
    else:
        print('Input file not found, program exited!')
119
+
120
+
121
if __name__ == '__main__':

    # command-line options, registered in the order they are listed by --help
    option_specs = [
        ('-i', dict(required=True, help='input alignment')),
        ('-xi', dict(required=False, default='aln', help='input alignment extension')),
        ('-fi', dict(required=True, help='input alignment format, e.g., fasta, phylip')),
        ('-o', dict(required=True, help='output alignment')),
        ('-xo', dict(required=False, default='aln', help='output alignment extension')),
        ('-fo', dict(required=True, help='output alignment format, e.g., fasta, phylip')),
        ('-oneline', dict(required=False, action="store_true", help='put sequence in single line, available if -fo is fasta')),
        ('-nogap', dict(required=False, action="store_true", help='remove gaps from alignment, available if -fo is fasta')),
        ('-f', dict(required=False, action="store_true", help='force overwrite existing output folder')),
    ]

    parser = argparse.ArgumentParser()
    for option_flag, option_settings in option_specs:
        parser.add_argument(option_flag, **option_settings)

    # the worker function expects a plain dict keyed by option name
    args = vars(parser.parse_args())
    ConvertMSA(args)
TreeSAK/Dir.rb ADDED
@@ -0,0 +1,82 @@
1
+ require 'find'
2
+
3
+
4
+ ################################################################################
5
class Dir
  # Create +path+ together with any missing parent directory.
  # Returns true once the directory exists; false only if creating a
  # parent reported false.
  def self.mkdirs(path)
    return true if File.directory?(path)

    # make sure the parent is there before creating the directory itself
    return false unless mkdirs(File.dirname(path))

    mkdir(path)
    true
  end
end
16
+
17
+
18
+ ################################################################################
19
# Make sure +outdir+ exists as a directory.
#
# outdir      - String path of the directory.
# is_force    - when the directory already exists, delete and recreate it.
# is_tolerate - when the directory already exists, leave it untouched
#               (takes precedence over is_force).
#
# Raises RuntimeError if outdir is not a String, or if the directory exists
# and neither flag is set.
def mkdir_with_force(outdir, is_force=false, is_tolerate=false)
  # required here so the method does not depend on the file header
  require 'fileutils'

  raise "outdir wrong? Exiting ......" unless outdir.is_a?(String)

  # Dir.exist? replaces Dir.exists?, which was removed in Ruby 3.2.
  # FileUtils replaces the former shell calls, so paths containing spaces
  # or shell metacharacters are handled correctly.
  if !Dir.exist?(outdir)
    FileUtils.mkdir_p(outdir)
  elsif is_tolerate
    # existing directory is accepted as is
  elsif is_force
    FileUtils.rm_rf(outdir)
    FileUtils.mkdir_p(outdir)
  else
    raise "The outdir #{outdir} has already existed!"
  end
end
37
+
38
+
39
# Collect the entries of +indir+ whose names match +suffix+.
#
# indir            - directory to read.
# suffix           - String or Array of Strings, interpolated into a regular
#                    expression anchored at the end of the name. An empty
#                    String selects everything.
# is_all_subfolder - false: list the direct entries of indir (names starting
#                    with '.' are skipped); true: walk indir recursively and
#                    return files only. In recursive mode a String suffix
#                    must follow a literal dot.
#
# Returns an Array of paths.
def read_infiles(indir, suffix='', is_all_subfolder=false)
  infiles = Array.new
  if !is_all_subfolder
    Dir.foreach(indir) do |b|
      next if b =~ /^\./
      if suffix.is_a?(String)
        next if suffix != '' && b !~ /#{suffix}$/
      elsif suffix.is_a?(Array)
        next unless suffix.any? { |i| b =~ /#{i}$/ }
      end
      infiles << File.join(indir, b)
    end
  else
    Find.find(indir) do |path|
      next if File.directory?(path)
      next if File.basename(path) =~ /^\./
      matched =
        if suffix.is_a?(String)
          # An empty suffix means "no filter", as in the non-recursive branch.
          # It used to build /\.$/ and therefore matched no regular file.
          suffix == '' || path =~ /\.#{suffix}$/
        else
          suffix.any? { |i| path =~ /#{i}$/ }
        end
      infiles << path if matched
    end
  end
  return(infiles)
end
62
+
63
+
64
# Return the entries of +indir+ (as listed by read_infiles) whose extension,
# as reported by File.extname, is included in +suffices+.
def getFilesBySuffices(indir, suffices)
  read_infiles(indir).select do |candidate|
    suffices.include?(File.extname(candidate))
  end
end
74
+
75
+
76
# Return the link target when +file+ is a symbolic link, otherwise +file+ itself.
def get_file_path(file)
  return File.readlink(file) if File.symlink?(file)

  file
end
80
+
81
+ ################################################################################
82
+