treesak 1.51.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of treesak might be problematic. Click here for more details.

Files changed (125) hide show
  1. TreeSAK/ALE.py +63 -0
  2. TreeSAK/ALE1.py +268 -0
  3. TreeSAK/ALE2.py +168 -0
  4. TreeSAK/ALE2RTC.py +30 -0
  5. TreeSAK/ALE3.py +205 -0
  6. TreeSAK/ALE4.py +636 -0
  7. TreeSAK/ALE5.py +210 -0
  8. TreeSAK/ALE6.py +401 -0
  9. TreeSAK/ALE7.py +126 -0
  10. TreeSAK/ALE_backup.py +1081 -0
  11. TreeSAK/AssessCVG.py +128 -0
  12. TreeSAK/AssessMarker.py +306 -0
  13. TreeSAK/AssessMarkerDeltaLL.py +257 -0
  14. TreeSAK/AssessMarkerPA.py +317 -0
  15. TreeSAK/AssessPB.py +130 -0
  16. TreeSAK/BMGE.jar +0 -0
  17. TreeSAK/BMGE.py +49 -0
  18. TreeSAK/CompareMCMC.py +138 -0
  19. TreeSAK/ConcateMSA.py +111 -0
  20. TreeSAK/ConvertMSA.py +135 -0
  21. TreeSAK/Dir.rb +82 -0
  22. TreeSAK/ExtractMarkerSeq.py +263 -0
  23. TreeSAK/FastRoot.py +1175 -0
  24. TreeSAK/FastRoot_backup.py +1122 -0
  25. TreeSAK/FigTree.py +34 -0
  26. TreeSAK/GTDB_tree.py +76 -0
  27. TreeSAK/GeneTree.py +142 -0
  28. TreeSAK/KEGG_Luo17.py +807 -0
  29. TreeSAK/LcaToLeaves.py +66 -0
  30. TreeSAK/MarkerRef2Tree.py +616 -0
  31. TreeSAK/MarkerRef2Tree_backup.py +628 -0
  32. TreeSAK/MarkerSeq2Tree.py +290 -0
  33. TreeSAK/MarkerSeq2Tree_backup.py +259 -0
  34. TreeSAK/ModifyTopo.py +116 -0
  35. TreeSAK/Newick_tree_plotter.py +79 -0
  36. TreeSAK/OMA.py +170 -0
  37. TreeSAK/OMA2.py +212 -0
  38. TreeSAK/OneLineAln.py +50 -0
  39. TreeSAK/PB.py +155 -0
  40. TreeSAK/PMSF.py +106 -0
  41. TreeSAK/PhyloBiAssoc.R +84 -0
  42. TreeSAK/PhyloBiAssoc.py +167 -0
  43. TreeSAK/PlotMCMC.py +41 -0
  44. TreeSAK/PlotMcmcNode.py +152 -0
  45. TreeSAK/PlotMcmcNode_old.py +252 -0
  46. TreeSAK/RootTree.py +101 -0
  47. TreeSAK/RootTreeGTDB214.py +288 -0
  48. TreeSAK/RootTreeGTDB220.py +300 -0
  49. TreeSAK/RootTreeGTDB226.py +300 -0
  50. TreeSAK/SequentialDating.py +16 -0
  51. TreeSAK/SingleAleHGT.py +157 -0
  52. TreeSAK/SingleLinePhy.py +50 -0
  53. TreeSAK/SliceMSA.py +142 -0
  54. TreeSAK/SplitScore.py +19 -0
  55. TreeSAK/SplitScore1.py +178 -0
  56. TreeSAK/SplitScore1OMA.py +148 -0
  57. TreeSAK/SplitScore2.py +597 -0
  58. TreeSAK/TaxaCountStats.R +256 -0
  59. TreeSAK/TaxonTree.py +47 -0
  60. TreeSAK/TreeSAK_config.py +32 -0
  61. TreeSAK/VERSION +158 -0
  62. TreeSAK/VisHPD95.R +45 -0
  63. TreeSAK/VisHPD95.py +200 -0
  64. TreeSAK/__init__.py +0 -0
  65. TreeSAK/ale_parser.py +74 -0
  66. TreeSAK/ale_splitter.py +63 -0
  67. TreeSAK/alignment_pruner.pl +1471 -0
  68. TreeSAK/assessOG.py +45 -0
  69. TreeSAK/catfasta2phy.py +140 -0
  70. TreeSAK/cogTree.py +185 -0
  71. TreeSAK/compare_trees.R +30 -0
  72. TreeSAK/compare_trees.py +255 -0
  73. TreeSAK/dating.py +264 -0
  74. TreeSAK/dating_ss.py +361 -0
  75. TreeSAK/deltall.py +82 -0
  76. TreeSAK/do_rrtc.rb +464 -0
  77. TreeSAK/fa2phy.py +42 -0
  78. TreeSAK/format_leaf_name.py +70 -0
  79. TreeSAK/gap_stats.py +38 -0
  80. TreeSAK/get_SCG_tree.py +742 -0
  81. TreeSAK/get_arCOG_seq.py +97 -0
  82. TreeSAK/global_functions.py +222 -0
  83. TreeSAK/gnm_leaves.py +43 -0
  84. TreeSAK/iTOL.py +791 -0
  85. TreeSAK/iTOL_gene_tree.py +80 -0
  86. TreeSAK/itol_msa_stats.py +56 -0
  87. TreeSAK/keep_highest_rrtc.py +37 -0
  88. TreeSAK/koTree.py +194 -0
  89. TreeSAK/label_tree.R +75 -0
  90. TreeSAK/label_tree.py +121 -0
  91. TreeSAK/mad.py +708 -0
  92. TreeSAK/mcmc2tree.py +58 -0
  93. TreeSAK/mcmcTC copy.py +92 -0
  94. TreeSAK/mcmcTC.py +104 -0
  95. TreeSAK/mcmctree_vs_reltime.R +44 -0
  96. TreeSAK/mcmctree_vs_reltime.py +252 -0
  97. TreeSAK/merge_pdf.py +32 -0
  98. TreeSAK/pRTC.py +56 -0
  99. TreeSAK/parse_mcmctree.py +198 -0
  100. TreeSAK/parse_reltime.py +141 -0
  101. TreeSAK/phy2fa.py +37 -0
  102. TreeSAK/plot_distruibution_th.py +165 -0
  103. TreeSAK/prep_mcmctree_ctl.py +92 -0
  104. TreeSAK/print_leaves.py +32 -0
  105. TreeSAK/pruneMSA.py +63 -0
  106. TreeSAK/recode.py +73 -0
  107. TreeSAK/remove_bias.R +112 -0
  108. TreeSAK/rename_leaves.py +77 -0
  109. TreeSAK/replace_clade.py +55 -0
  110. TreeSAK/root_with_out_group.py +84 -0
  111. TreeSAK/run_TaxaCountStats_R_s1.py +455 -0
  112. TreeSAK/subsample_drep_gnms.py +74 -0
  113. TreeSAK/subset.py +69 -0
  114. TreeSAK/subset_tree_stupid_old_way.py +193 -0
  115. TreeSAK/supertree.py +330 -0
  116. TreeSAK/tmp_1.py +19 -0
  117. TreeSAK/tmp_2.py +19 -0
  118. TreeSAK/tmp_3.py +120 -0
  119. TreeSAK/weighted_rand.rb +23 -0
  120. treesak-1.51.2.data/scripts/TreeSAK +950 -0
  121. treesak-1.51.2.dist-info/LICENSE +674 -0
  122. treesak-1.51.2.dist-info/METADATA +27 -0
  123. treesak-1.51.2.dist-info/RECORD +125 -0
  124. treesak-1.51.2.dist-info/WHEEL +5 -0
  125. treesak-1.51.2.dist-info/top_level.txt +1 -0
@@ -0,0 +1,157 @@
1
+ import os
2
+ import argparse
3
+
4
+
5
def sep_path_basename_ext(file_in):
    """Split a file path into (directory, base name, extension).

    The directory is reported as '.' when file_in has no directory
    component; the extension keeps its leading dot (or is '' if absent).
    """
    directory, leaf_name = os.path.split(file_in)
    stem, suffix = os.path.splitext(leaf_name)
    return ('.' if directory == '' else directory), stem, suffix
16
+
17
+
18
# Example command for the SingleAleHGT module. The input is passed with -msa
# (or -faa), which are the keys SingleAleHGT() reads from its args dict, and the
# iTOL API key is a placeholder so that no real credential ships with the package.
SingleAleHGT_usage = '''
============================================ SingleAleHGT example commands ============================================

TreeSAK SingleAleHGT -msa concatenated.fasta -s genome.treefile -fc 0.3 -c genome_taxon.txt -color phylum_color.txt -api your_iTOL_API_key -t 9 -f -o demo_SingleAleHGT_wd

=======================================================================================================================
'''
25
+
26
def _log_and_run(cmd, log_txt):
    """Append cmd to the log file log_txt, then execute it through the shell."""
    with open(log_txt, 'a') as log_txt_handle:
        log_txt_handle.write(cmd + '\n')
    os.system(cmd)


def SingleAleHGT(args):
    """Run the align -> (trim) -> gene tree -> ALE2 -> ALE4 pipeline for one gene family.

    args is a dict with the keys 'faa', 'msa', 'o', 's', 'api', 'fc', 'color',
    'c', 'f', 'trim', 'docker' and 't'. Exactly one of 'faa' (unaligned
    sequences, aligned here with mafft-einsi) and 'msa' (existing alignment)
    must be set; otherwise a message is printed and the program exits.

    Every external command is appended to <o>/log.txt before it is executed.
    The exit status of the external commands is not checked.
    """
    import shlex
    import shutil

    faa_in                   = args['faa']
    msa_in                   = args['msa']
    op_dir                   = args['o']
    genome_tree_file_rooted  = args['s']
    API_key                  = args['api']
    hgt_freq_cutoff          = args['fc']
    ar_phylum_color_code_txt = args['color']
    genome_taxon_txt         = args['c']
    force_overwrite          = args['f']
    trim_msa                 = args['trim']
    docker_image             = args['docker']
    num_threads              = args['t']

    ######################################## check input files #######################################

    # exactly one of -faa / -msa has to be provided
    if (faa_in is None) == (msa_in is None):
        print('Please specify either -faa or -msa, program exited!')
        exit()

    align_needed = faa_in is not None
    _, f_base, _ = sep_path_basename_ext(faa_in if align_needed else msa_in)

    ######################################## define file name ########################################

    ale1_op_dir = '%s/ALE1_op_dir'      % op_dir
    ale2_op_dir = '%s/ALE2_op_dir'      % op_dir
    ale4_op_dir = '%s/ALE4_op_dir'      % op_dir
    log_txt     = '%s/log.txt'          % op_dir
    msa_file    = '%s/%s.aln'           % (ale1_op_dir, f_base)
    msa_trimmed = '%s/%s_trimmed.aln'   % (ale1_op_dir, f_base)
    tree_prefix = '%s/%s'               % (ale1_op_dir, f_base)

    ###################################### create output folder ######################################

    if os.path.isdir(op_dir) is True:
        if force_overwrite is True:
            # remove without going through the shell, so unusual characters in
            # the path cannot be misinterpreted
            shutil.rmtree(op_dir)
        else:
            print('%s exist, program exited!' % op_dir)
            exit()
    os.mkdir(op_dir)
    os.mkdir(ale1_op_dir)

    ##################################################################################################

    # shlex.quote leaves ordinary paths untouched and protects those containing
    # spaces or shell metacharacters
    q = shlex.quote

    # run mafft-einsi
    if align_needed:
        mafft_cmd = 'mafft-einsi --thread %s --quiet %s > %s' % (num_threads, q(faa_in), q(msa_file))
        _log_and_run(mafft_cmd, log_txt)
        msa_file_for_next_step = msa_file
    else:
        msa_file_for_next_step = msa_in

    # run trimal (optional); the tree is inferred from the trimmed alignment if trimming was requested
    if trim_msa is True:
        trimal_cmd = 'trimal -in %s -out %s -automated1' % (q(msa_file_for_next_step), q(msa_trimmed))
        _log_and_run(trimal_cmd, log_txt)
        msa_for_tree = msa_trimmed
    else:
        msa_for_tree = msa_file_for_next_step

    # run iqtree2
    iqtree2_cmd = 'iqtree2 -m LG+G+I -bb 1000 --wbtl -nt %s -s %s -pre %s' % (num_threads, q(msa_for_tree), q(tree_prefix))
    _log_and_run(iqtree2_cmd, log_txt)

    # run ALE2; -docker is only passed on when an image was actually specified,
    # otherwise the literal string "None" would be handed over as image name
    docker_opt = '' if docker_image is None else ' -docker %s' % q(docker_image)
    ale2_cmd = 'TreeSAK ALE2 -i %s -s %s -t %s -f -runALE%s -o %s' % (q(ale1_op_dir), q(genome_tree_file_rooted), num_threads, docker_opt, q(ale2_op_dir))
    _log_and_run(ale2_cmd, log_txt)

    # run ALE4
    ale4_cmd = 'TreeSAK ALE4 -i1 %s -i2 %s -c %s -color %s -o %s -fc %s -f -api %s' % (q(ale1_op_dir), q(ale2_op_dir), q(genome_taxon_txt), q(ar_phylum_color_code_txt), q(ale4_op_dir), hgt_freq_cutoff, q(API_key))
    _log_and_run(ale4_cmd, log_txt)
113
+
114
+
115
if __name__ == '__main__':

    # Stand-alone entry point: the parsed options are turned into a dict keyed
    # by option name and handed to SingleAleHGT(). -mld, -mlr and -itol are
    # accepted here although SingleAleHGT() does not read them.
    SingleAleHGT_parser = argparse.ArgumentParser()
    add_option = SingleAleHGT_parser.add_argument
    add_option('-faa',    required=False, default=None,
               help='input aa file, e.g., OMA0001.faa')
    add_option('-msa',    required=False, default=None,
               help='input MSA file, e.g., OMA0001.aln')
    add_option('-o',      required=True,
               help='output dir, e.g., SingleAleHGT_wd')
    add_option('-s',      required=True,
               help='rooted species tree')
    add_option('-c',      required=True,
               help='genome_taxon, GTDB format')
    add_option('-color',  required=True,
               help='phylum color code')
    add_option('-fc',     required=False, type=float, default=0.5,
               help='hgt_freq_cutoff, default: 0.5')
    add_option('-mld',    required=False, type=int, default=5,
               help='donor_node_min_leaf_num, default: 5')
    add_option('-mlr',    required=False, type=int, default=5,
               help='recipient_node_min_leaf_num, default: 5')
    add_option('-trim',   required=False, action="store_true",
               help='trim MSA')
    add_option('-docker', required=False, default=None,
               help='Docker image, if ALE was installed with Docker, e.g., gregmich/alesuite_new')
    add_option('-itol',   required=False, default='batch_access_tmp',
               help='iTOL project_name, default: batch_access_tmp')
    add_option('-api',    required=True,
               help='iTOL API key')
    add_option('-t',      required=False, type=int, default=6,
               help='number of threads, default: 6')
    add_option('-f',      required=False, action="store_true",
               help='force overwrite')
    args = vars(SingleAleHGT_parser.parse_args())
    SingleAleHGT(args)
135
+
136
+
137
+ '''
138
+
139
+ cd /Users/songweizhi/Desktop/DateArTree/01_HGT_ALE_with_OMA/ALE1_op_dir_OMA05484_OMA07484_trimmed
140
+ trimal -in ../ALE1_op_dir_OMA05484_OMA07484/concatenated.fasta -out concatenated.fasta -automated1
141
+ iqtree2 -m LG+G+I -bb 1000 --wbtl -nt 10 -s concatenated.fasta -pre OMA05484_OMA07484
142
+ cd /Users/songweizhi/Desktop/DateArTree/01_HGT_ALE_with_OMA
143
+ TreeSAK ALE2 -i ALE1_op_dir_OMA05484_OMA07484_trimmed -s genome_tree.newick -t 10 -f -runALE -docker gregmich/alesuite_new -o ALE2_op_dir_OMA05484_OMA07484_trimmed
144
+ TreeSAK ALE4 -i1 ALE1_op_dir_OMA05484_OMA07484_trimmed -i2 ALE2_op_dir_OMA05484_OMA07484_trimmed -c genome_taxon.txt -color phylum_color.txt -o ALE4_op_dir_OMA05484_OMA07484_trimmed_0.01 -fc 0.01 -f -api S1kZZuDHc0d5M7J5vLnUNQ
145
+
146
+ cd /Users/songweizhi/Desktop/DateArTree/01_HGT_ALE_with_OMA
147
+ /usr/local/bin/python3.7 /Users/songweizhi/PycharmProjects/TreeSAK/TreeSAK/SingleAleHGT.py -msa ALE1_op_dir_OMA05484_OMA07484_trimmed/concatenated.fasta -s genome_tree_rooted_noEU.treefile -fc 0.3 -c genome_taxon.txt -color phylum_color.txt -api S1kZZuDHc0d5M7J5vLnUNQ -t 9 -f -o demo_SingleAleHGT_wd -trim
148
+
149
+ cd /Users/songweizhi/Desktop/DateArTree/01_HGT_ALE_with_OMA/demo_SingleAleHGT_wd
150
+ TreeSAK ALE2 -i ALE1_op_dir -s ../genome_tree.newick -t 10 -f -runALE -docker gregmich/alesuite_new -o ALE2_op_dir
151
+ TreeSAK ALE4 -i1 ALE1_op_dir_OMA05484_OMA07484_trimmed -i2 ALE2_op_dir_OMA05484_OMA07484_trimmed -c genome_taxon.txt -color phylum_color.txt -o ALE4_op_dir_OMA05484_OMA07484_trimmed_0.01 -fc 0.01 -f -api S1kZZuDHc0d5M7J5vLnUNQ
152
+
153
+ /usr/local/bin/python3.7 /Users/songweizhi/PycharmProjects/TreeSAK/TreeSAK/SingleAleHGT.py -o demo_SingleAleHGT_wd -msa ALE1_op_dir/OMA15312.aln -s genome_tree_rooted_noEU.treefile -fc 0.3 -c genome_taxon.txt -color phylum_color.txt -api S1kZZuDHc0d5M7J5vLnUNQ -t 10 -f -trim -docker gregmich/alesuite_new
154
+ /usr/local/bin/python3.7 /Users/songweizhi/PycharmProjects/TreeSAK/TreeSAK/SingleAleHGT.py -o OMA01402_ALE_HGT_wd -msa ALE1_op_dir/OMA01402.aln -s genome_tree_rooted_noEU.treefile -fc 0.3 -c genome_taxon.txt -color phylum_color.txt -api S1kZZuDHc0d5M7J5vLnUNQ -t 10 -f -trim -docker gregmich/alesuite_new
155
+ /usr/local/bin/python3.7 /Users/songweizhi/PycharmProjects/TreeSAK/TreeSAK/SingleAleHGT.py -o OMA01402_ALE_HGT_wd_no_trim -msa ALE1_op_dir/OMA01402.aln -s genome_tree_rooted_noEU.treefile -fc 0.3 -c genome_taxon.txt -color phylum_color.txt -api S1kZZuDHc0d5M7J5vLnUNQ -t 10 -f -docker gregmich/alesuite_new
156
+
157
+ '''
@@ -0,0 +1,50 @@
1
+ import os
2
+ import argparse
3
+ from Bio import AlignIO
4
+
5
+
6
# Example command for the SingleLinePhy module; -i and -o are the two options
# SingleLinePhy() reads. The string is not referenced elsewhere in this file -
# presumably it is displayed as help text by the TreeSAK entry script (confirm).
SingleLinePhy_usage = '''
======== SingleLinePhy example commands ========

TreeSAK SingleLinePhy -i in.phy -o out.phy

================================================
'''
13
+
14
+
15
def SingleLinePhy(args):
    """Rewrite a PHYLIP alignment so that every sequence occupies one line.

    args is a dict with the keys 'i' (input file, read with the
    'phylip-relaxed' parser) and 'o' (output file). The output starts with a
    "<number of sequences> <alignment length>" header; each following line
    holds a sequence id, padded with spaces to two columns beyond the longest
    id, directly followed by the full sequence.

    Prints a message and exits if the input file does not exist.
    """
    phy_in, phy_out = args['i'], args['o']

    # check input file
    if not os.path.isfile(phy_in):
        print('input file not found, program exited!')
        exit()

    alignment = AlignIO.read(phy_in, 'phylip-relaxed')

    # width of the id column: longest id plus two spaces
    id_lengths = [len(record.id) for record in alignment]
    id_column_width = (max(id_lengths) if id_lengths else 0) + 2

    with open(phy_out, 'w') as msa_out_handle:
        msa_out_handle.write('%s %s\n' % (len(alignment), alignment.get_alignment_length()))
        for record in alignment:
            padded_id = record.id.ljust(id_column_width)
            msa_out_handle.write('%s%s\n' % (padded_id, str(record.seq)))

    print('Done!')
41
+
42
+
43
if __name__ == '__main__':

    # stand-alone entry point: both options are mandatory and are passed on to
    # SingleLinePhy() as a dict keyed by option name
    parser = argparse.ArgumentParser()
    add_option = parser.add_argument
    add_option('-i', required=True, help='input file')
    add_option('-o', required=True, help='output file')
    args = vars(parser.parse_args())
    SingleLinePhy(args)
TreeSAK/SliceMSA.py ADDED
@@ -0,0 +1,142 @@
1
+ import os
2
+ import argparse
3
+ from Bio import AlignIO
4
+
5
+
6
# Example commands and section syntax for the SliceMSA module. A leading '-'
# means "from column 1", a trailing '-' means "to the last column", which is
# how SliceMSA() interprets the -s value.
SliceMSA_usage = '''
========================= SliceMSA example commands =========================

TreeSAK SliceMSA -i 16S_aln.fasta -s 200-300 -o 16S_aln_200-300.fasta
TreeSAK SliceMSA -i 16S_aln.phylip -fi phylip-relaxed -s sections.txt -o SliceMSA_op -fo phylip-relaxed

# example
200-300     select columns 200-300
-100        select columns 1-100
500-        select columns from 500 to the end

# Example of sections.txt (one section per line):
200-300
-100
500-

# Examples of alignment format (https://biopython.org/wiki/AlignIO):
fasta, phylip, phylip-relaxed, phylip-sequential, clustal

=============================================================================
'''
27
+
28
+
29
def msa2fasta(msa_object, fasta_out):
    """Write the records of msa_object to fasta_out in FASTA format.

    Each record contributes a '>id' header line followed by its complete
    sequence on a single line.
    """
    with open(fasta_out, 'w') as out_handle:
        out_handle.writelines(
            '>%s\n%s\n' % (record.id, str(record.seq)) for record in msa_object
        )
35
+
36
+
37
def msa2phylip(msa_object, phylip_out):
    """Write msa_object to phylip_out with one sequence per line.

    The first line is "<number of sequences> <alignment length>". Every
    following line holds a sequence id, padded with spaces to two columns
    beyond the longest id, directly followed by the full sequence.
    """
    id_lengths = [len(record.id) for record in msa_object]
    name_column_width = (max(id_lengths) if id_lengths else 0) + 2

    with open(phylip_out, 'w') as out_handle:
        out_handle.write('%s %s\n' % (len(msa_object), msa_object.get_alignment_length()))
        for record in msa_object:
            out_handle.write('%s%s\n' % (record.id.ljust(name_column_width), str(record.seq)))
51
+
52
+
53
def _parse_section(section_str, aln_len):
    """Convert one section spec into a [start, end] pair of strings (1-based, inclusive).

    Accepted forms: 'A-B', '-B' (from column 1), 'A-' (to the last column)
    and '-' (all columns). Prints a message and exits on anything else.
    """
    section_str = section_str.strip()
    section_split = section_str.split('-')

    if section_str == '-':
        section = ['1', str(aln_len)]
    elif section_str.startswith('-'):
        section = ['1', section_split[1]]
    elif section_str.endswith('-'):
        section = [section_split[0], str(aln_len)]
    else:
        section = section_split

    # reject malformed specs here instead of failing later with an
    # IndexError/ValueError while slicing
    if len(section) != 2 or not (section[0].isdigit() and section[1].isdigit()):
        print('Invalid section: %s, program exited!' % section_str)
        exit()

    return section


def SliceMSA(args):
    """Export one or more column ranges of a multiple sequence alignment.

    args is a dict with the keys 'i' (input MSA), 'fi' (input format passed
    to AlignIO.read), 's' (a section such as '200-300', '-100', '500-', or
    the path to a file listing one section per line), 'o' (output), 'fo'
    (output format) and 'force' (overwrite an existing output folder).

    With a single section, 'o' is used as the output file. With several
    sections, 'o' is created as a folder holding one file per section, named
    '<start>-<end>.<ext>'. Output is written in phylip layout when 'fo' is
    'phylip-relaxed' and as fasta for any other value.

    Prints a message and exits if the input MSA is missing, a section is
    malformed, no section is given, or the output folder already exists and
    'force' is not set.
    """
    import shutil

    msa_in_file         = args['i']
    aln_in_format       = args['fi']
    col_to_select_txt   = args['s']
    op_dir              = args['o']
    aln_out_format      = args['fo']
    force_overwriting   = args['force']

    aln_out_ext = 'phylip' if aln_out_format == 'phylip-relaxed' else 'fasta'

    if os.path.isfile(msa_in_file) is False:
        print('Input MSA not found, program exited!')
        exit()

    # read in msa
    msa_in = AlignIO.read(msa_in_file, aln_in_format)
    aln_len = msa_in.get_alignment_length()

    # collect section specs: either the -s value itself or the non-blank
    # lines of the file it points to
    if os.path.isfile(col_to_select_txt) is False:
        section_spec_list = [col_to_select_txt]
    else:
        with open(col_to_select_txt) as section_handle:
            section_spec_list = [line.strip() for line in section_handle if line.strip() != '']

    if len(section_spec_list) == 0:
        print('No section found in %s, program exited!' % col_to_select_txt)
        exit()

    section_to_select_list = [_parse_section(each_spec, aln_len) for each_spec in section_spec_list]

    # pick the writer matching the requested output format
    write_msa = msa2phylip if aln_out_ext == 'phylip' else msa2fasta

    if len(section_to_select_list) == 1:
        # single section: -o names the output file
        start_col, end_col = section_to_select_list[0]
        write_msa(msa_in[:, (int(start_col) - 1):int(end_col)], op_dir)
    else:
        # several sections: -o names the output folder
        if os.path.isdir(op_dir) is True:
            if force_overwriting is True:
                shutil.rmtree(op_dir)
            else:
                print('Output folder already exist, program exited!')
                exit()
        os.mkdir(op_dir)

        for each_section in section_to_select_list:
            pwd_op_file = '%s/%s.%s' % (op_dir, '-'.join(each_section), aln_out_ext)
            current_section = msa_in[:, (int(each_section[0]) - 1):int(each_section[1])]
            write_msa(current_section, pwd_op_file)

    print('MSA subset(s) exported to %s, Done!' % op_dir)
128
+
129
+
130
if __name__ == '__main__':

    # stand-alone entry point: parsed options are passed to SliceMSA() as a
    # dict keyed by option name
    SliceMSA_parser = argparse.ArgumentParser()
    add_option = SliceMSA_parser.add_argument
    add_option('-i',     required=True,
               help='input MSA in fasta format')
    add_option('-fi',    required=False, default='fasta',
               help='format (NOT file extension) of input MSA, default: fasta')
    add_option('-s',     required=True,
               help='columns to export, e.g. 200-300, -100, 50-')
    add_option('-o',     required=True,
               help='output file or folder')
    add_option('-fo',    required=False, default='fasta',
               help='format of output MSA, select from fasta and phylip-relaxed, default: fasta')
    add_option('-force', required=False, action="store_true",
               help='force overwrite existing output folder')
    args = vars(SliceMSA_parser.parse_args())
    SliceMSA(args)
142
+
TreeSAK/SplitScore.py ADDED
@@ -0,0 +1,19 @@
1
+
2
# Overview of the SplitScore workflow and example commands for its steps.
# NOTE(review): the two SplitScore1 examples below pass -s and -t, but the
# SplitScore1 parser in SplitScore1.py defines -i/-x/-o/-u/-m/-c/-f/-jst only;
# they look like SplitScore1OMA arguments - confirm and update the examples.
SplitScore_usage = '''
============================================= SplitScore example commands =============================================

# SplitScore modules
TreeSAK SplitScore1 -> Step 1: Infer gene tree
TreeSAK SplitScore1OMA -> Step 1: Infer gene tree (based on OMA outputs)
TreeSAK SplitScore2 -> Step 2: Calculate split score

# SplitScore1
TreeSAK SplitScore1 -i OrthologousGroups.txt -s OrthologousGroupsFasta -o step1_op_dir -t 6 -f
TreeSAK SplitScore1 -i OrthologousGroups.txt -s OrthologousGroupsFasta -o step1_op_dir -t 6 -f -u interested_gnm.txt

# SplitScore2
# Please ensure that all the commands produced in step one have been executed before proceeding to step two.
TreeSAK SplitScore2 -i step1_op_dir -g gnm_cluster.tsv -k gnm_taxon.txt -f -t 10 -o step_2_op_dir

=======================================================================================================================
'''
TreeSAK/SplitScore1.py ADDED
@@ -0,0 +1,178 @@
1
+ from __future__ import print_function
2
+ import os
3
+ import glob
4
+ import argparse
5
+ from Bio import SeqIO
6
+
7
+
8
# Example command for the SplitScore1 module. The gene ids illustrate the
# naming SplitScore1() relies on: it derives the genome id by dropping the last
# '_'-separated field (APA_bin56_00001 -> APA_bin56).
SplitScore1_usage = '''
======================== SplitScore1 example commands ========================

TreeSAK SplitScore1 -i marker_seq -x fa -o SplitScore1_op_dir -jst 9 -f

# Format of gene id
APA_bin56_00001
APA_bin56_00002
APA_bin56_00003

==============================================================================
'''
20
+
21
+
22
def sep_path_basename_ext(file_in):
    """Return (directory, base name, extension) for the path file_in.

    A path without a directory component yields '.' as directory; the
    extension includes its leading dot and is '' when there is none.
    """
    folder, name_with_ext = os.path.split(file_in)
    base_name, extension = os.path.splitext(name_with_ext)
    if folder == '':
        folder = '.'
    return folder, base_name, extension
28
+
29
+
30
def SplitScore1(args):
    """Select qualified orthologous groups and write the commands to build their gene trees.

    args is a dict with the keys 'i' (folder with one fasta file per
    orthologous group), 'x' (extension of those files), 'u' (optional file
    listing the genomes of interest, one per line), 'm' (iqtree model), 'c'
    (genome coverage cutoff in percent), 'f' (overwrite existing output
    folder), 'jst' (threads used in the generated commands) and 'o' (output
    folder).

    The genome id of a gene is its id without the last '_'-separated field.
    For every group, the sequences from the genomes to process are written to
    <o>/qualified_OGs; groups covering fewer than 'c' percent of these genomes
    are dropped and reported in <o>/ignored_markers.txt. For the remaining
    groups, mafft/trimal/iqtree2 commands are written to three files in <o>;
    the commands are NOT executed here.

    Prints a message and exits if the genome list is missing, no fasta file
    is found, a group holds more than one gene per genome, no genome is left
    to process, or the output folder exists and 'f' is not set.
    """
    import shutil

    oma_op_fasta        = args['i']
    fasta_file_ext      = args['x']
    interested_gnm_txt  = args['u']
    iqtree_model        = args['m']
    cov_cutoff          = args['c']
    force_overwrite     = args['f']
    num_of_js_threads   = args['jst']
    op_dir              = args['o']

    ################################################################################

    # read in interested genomes (if provided)
    interested_gnm_set = set()
    if interested_gnm_txt is not None:
        if os.path.isfile(interested_gnm_txt):
            with open(interested_gnm_txt) as interested_gnm_txt_handle:
                for each_gnm in interested_gnm_txt_handle:
                    interested_gnm_set.add(each_gnm.strip())
        else:
            print('%s not found, program exited' % interested_gnm_txt)
            exit()

    ################################################################################

    fa_file_re   = '%s/*.%s' % (oma_op_fasta, fasta_file_ext)
    fa_file_list = glob.glob(fa_file_re)
    if len(fa_file_list) == 0:
        print('No file found in %s, program exited!' % oma_op_fasta)
        exit()

    # orthologous group (file base name) -> set of gene ids
    og_to_gene_dict = dict()
    for each_fa in fa_file_list:
        _, f_base, _ = sep_path_basename_ext(each_fa)
        seq_id_set = set()
        for each_seq in SeqIO.parse(each_fa, 'fasta'):
            seq_id_set.add(each_seq.id)
        og_to_gene_dict[f_base] = seq_id_set

    ################################################################################

    # collect the genomes to process: all genomes if no list was provided,
    # otherwise only the listed ones
    gnm_to_process = set()
    for each_og in og_to_gene_dict:
        gene_set = og_to_gene_dict[each_og]
        gnm_set = set()
        for each_gene in gene_set:
            gnm_id = '_'.join(each_gene.split('_')[:-1])
            gnm_set.add(gnm_id)
            if (interested_gnm_txt is None) or (gnm_id in interested_gnm_set):
                gnm_to_process.add(gnm_id)

        # differing sizes mean that at least two genes of this group map to
        # the same genome id
        if len(gene_set) != len(gnm_set):
            print('Program exited!')
            exit()

    # without this check the coverage calculation below would divide by zero
    if len(gnm_to_process) == 0:
        print('No genome to process, program exited!')
        exit()

    ################################################################################

    # define file name
    qualified_og_dir    = '%s/qualified_OGs'        % op_dir
    cmd_1_mafft_txt     = '%s/cmd_1_mafft.txt'      % op_dir
    cmd_2_trimal_txt    = '%s/cmd_2_trimal.txt'     % op_dir
    cmd_3_iqtree_txt    = '%s/cmd_3_iqtree.txt'     % op_dir
    ignored_marker_txt  = '%s/ignored_markers.txt'  % op_dir

    # create output folder
    if os.path.isdir(op_dir) is True:
        if force_overwrite is True:
            shutil.rmtree(op_dir)
        else:
            print('%s exist, program exited!' % op_dir)
            exit()
    os.mkdir(op_dir)
    os.mkdir(qualified_og_dir)

    ################################################################################

    ignored_og_dict = dict()
    with open(cmd_1_mafft_txt, 'w') as cmd_1_mafft_txt_handle, \
            open(cmd_2_trimal_txt, 'w') as cmd_2_trimal_txt_handle, \
            open(cmd_3_iqtree_txt, 'w') as cmd_3_iqtree_txt_handle:

        for each_og in sorted(og_to_gene_dict):
            seq_file_in          = '%s/%s.%s'       % (oma_op_fasta, each_og, fasta_file_ext)
            file_out_seq         = '%s/%s.%s'       % (qualified_og_dir, each_og, fasta_file_ext)
            file_out_aln         = '%s.aln'         % each_og
            file_out_aln_trimmed = '%s_trimmed.aln' % each_og

            # keep only the sequences from the genomes to process
            current_gnm_set = set()
            with open(file_out_seq, 'w') as seq_file_out_handle:
                for each_seq in SeqIO.parse(seq_file_in, 'fasta'):
                    gnm_id = '_'.join(each_seq.id.split('_')[:-1])
                    if gnm_id in gnm_to_process:
                        current_gnm_set.add(gnm_id)
                        seq_file_out_handle.write('>%s\n' % each_seq.id)
                        seq_file_out_handle.write('%s\n' % each_seq.seq)

            # percentage of the genomes to process covered by this group
            cov_value = len(current_gnm_set)*100/len(gnm_to_process)
            cov_value = float("{0:.2f}".format(cov_value))

            if cov_value < cov_cutoff:
                report_str = 'Ignored %s, contains proteins from %s (%s%s) genomes, < %s%s.' % (each_og, len(current_gnm_set), cov_value, '%', cov_cutoff, '%')
                ignored_og_dict[each_og] = report_str
                os.remove(file_out_seq)
            else:
                # align, trim and iqtree
                # the commands use bare file names - presumably they are meant
                # to be run from inside qualified_OGs (confirm)
                mafft_cmd  = 'mafft-einsi --thread %s --quiet %s.%s > %s' % (num_of_js_threads, each_og, fasta_file_ext, file_out_aln)
                trimal_cmd = 'trimal -in %s -out %s -automated1' % (file_out_aln, file_out_aln_trimmed)
                iqtree_cmd = 'iqtree2 -s %s --seqtype AA -m %s -B 1000 --wbtl --bnni --prefix %s -T %s --quiet' % (file_out_aln_trimmed, iqtree_model, each_og, num_of_js_threads)
                # Undinarchaeota illuminate DPANN phylogeny and the impact of gene transfer on archaeal evolution, settings: -m LG+G -bb 1000 -wbtl -bnni
                cmd_1_mafft_txt_handle.write(mafft_cmd + '\n')
                cmd_2_trimal_txt_handle.write(trimal_cmd + '\n')
                cmd_3_iqtree_txt_handle.write(iqtree_cmd + '\n')

    # report ignored markers
    if len(ignored_og_dict) > 0:
        ignored_og_list = sorted(ignored_og_dict)
        print('The following %s markers were ignored due to low genome coverage, see details in %s:' % (len(ignored_og_dict), ignored_marker_txt))
        print('\n'.join(ignored_og_list))
        with open(ignored_marker_txt, 'w') as ignored_marker_txt_handle:
            for each_ignored_marker in ignored_og_list:
                ignored_marker_txt_handle.write(ignored_og_dict[each_ignored_marker] + '\n')

    # report
    print('You will need to execute the commands exported to the following three files before moving to SplitScore2')
    print(cmd_1_mafft_txt)
    print(cmd_2_trimal_txt)
    print(cmd_3_iqtree_txt)
    print('Done!')
164
+
165
+
166
if __name__ == '__main__':

    # stand-alone entry point: parsed options are passed to SplitScore1() as a
    # dict keyed by option name
    SplitScore1_parser = argparse.ArgumentParser()
    add_option = SplitScore1_parser.add_argument
    add_option('-i',   required=True,
               help='orthologous gene sequence')
    add_option('-x',   required=True,
               help='fasta file extension')
    add_option('-o',   required=True,
               help='output directory')
    add_option('-u',   required=False, default=None,
               help='interested genomes, no file extension')
    add_option('-m',   required=False, default='LG+G+I',
               help='iqtree_model, default: LG+G+I')
    add_option('-c',   required=False, type=int, default=85,
               help='coverage cutoff, default: 85')
    add_option('-f',   required=False, action="store_true",
               help='force overwrite')
    add_option('-jst', required=False, type=int, default=1,
               help='num of threads for iqtree2, default: 1')
    args = vars(SplitScore1_parser.parse_args())
    SplitScore1(args)