treesak 1.51.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of treesak might be problematic. Click here for more details.

Files changed (125) hide show
  1. TreeSAK/ALE.py +63 -0
  2. TreeSAK/ALE1.py +268 -0
  3. TreeSAK/ALE2.py +168 -0
  4. TreeSAK/ALE2RTC.py +30 -0
  5. TreeSAK/ALE3.py +205 -0
  6. TreeSAK/ALE4.py +636 -0
  7. TreeSAK/ALE5.py +210 -0
  8. TreeSAK/ALE6.py +401 -0
  9. TreeSAK/ALE7.py +126 -0
  10. TreeSAK/ALE_backup.py +1081 -0
  11. TreeSAK/AssessCVG.py +128 -0
  12. TreeSAK/AssessMarker.py +306 -0
  13. TreeSAK/AssessMarkerDeltaLL.py +257 -0
  14. TreeSAK/AssessMarkerPA.py +317 -0
  15. TreeSAK/AssessPB.py +130 -0
  16. TreeSAK/BMGE.jar +0 -0
  17. TreeSAK/BMGE.py +49 -0
  18. TreeSAK/CompareMCMC.py +138 -0
  19. TreeSAK/ConcateMSA.py +111 -0
  20. TreeSAK/ConvertMSA.py +135 -0
  21. TreeSAK/Dir.rb +82 -0
  22. TreeSAK/ExtractMarkerSeq.py +263 -0
  23. TreeSAK/FastRoot.py +1175 -0
  24. TreeSAK/FastRoot_backup.py +1122 -0
  25. TreeSAK/FigTree.py +34 -0
  26. TreeSAK/GTDB_tree.py +76 -0
  27. TreeSAK/GeneTree.py +142 -0
  28. TreeSAK/KEGG_Luo17.py +807 -0
  29. TreeSAK/LcaToLeaves.py +66 -0
  30. TreeSAK/MarkerRef2Tree.py +616 -0
  31. TreeSAK/MarkerRef2Tree_backup.py +628 -0
  32. TreeSAK/MarkerSeq2Tree.py +290 -0
  33. TreeSAK/MarkerSeq2Tree_backup.py +259 -0
  34. TreeSAK/ModifyTopo.py +116 -0
  35. TreeSAK/Newick_tree_plotter.py +79 -0
  36. TreeSAK/OMA.py +170 -0
  37. TreeSAK/OMA2.py +212 -0
  38. TreeSAK/OneLineAln.py +50 -0
  39. TreeSAK/PB.py +155 -0
  40. TreeSAK/PMSF.py +106 -0
  41. TreeSAK/PhyloBiAssoc.R +84 -0
  42. TreeSAK/PhyloBiAssoc.py +167 -0
  43. TreeSAK/PlotMCMC.py +41 -0
  44. TreeSAK/PlotMcmcNode.py +152 -0
  45. TreeSAK/PlotMcmcNode_old.py +252 -0
  46. TreeSAK/RootTree.py +101 -0
  47. TreeSAK/RootTreeGTDB214.py +288 -0
  48. TreeSAK/RootTreeGTDB220.py +300 -0
  49. TreeSAK/RootTreeGTDB226.py +300 -0
  50. TreeSAK/SequentialDating.py +16 -0
  51. TreeSAK/SingleAleHGT.py +157 -0
  52. TreeSAK/SingleLinePhy.py +50 -0
  53. TreeSAK/SliceMSA.py +142 -0
  54. TreeSAK/SplitScore.py +19 -0
  55. TreeSAK/SplitScore1.py +178 -0
  56. TreeSAK/SplitScore1OMA.py +148 -0
  57. TreeSAK/SplitScore2.py +597 -0
  58. TreeSAK/TaxaCountStats.R +256 -0
  59. TreeSAK/TaxonTree.py +47 -0
  60. TreeSAK/TreeSAK_config.py +32 -0
  61. TreeSAK/VERSION +158 -0
  62. TreeSAK/VisHPD95.R +45 -0
  63. TreeSAK/VisHPD95.py +200 -0
  64. TreeSAK/__init__.py +0 -0
  65. TreeSAK/ale_parser.py +74 -0
  66. TreeSAK/ale_splitter.py +63 -0
  67. TreeSAK/alignment_pruner.pl +1471 -0
  68. TreeSAK/assessOG.py +45 -0
  69. TreeSAK/catfasta2phy.py +140 -0
  70. TreeSAK/cogTree.py +185 -0
  71. TreeSAK/compare_trees.R +30 -0
  72. TreeSAK/compare_trees.py +255 -0
  73. TreeSAK/dating.py +264 -0
  74. TreeSAK/dating_ss.py +361 -0
  75. TreeSAK/deltall.py +82 -0
  76. TreeSAK/do_rrtc.rb +464 -0
  77. TreeSAK/fa2phy.py +42 -0
  78. TreeSAK/format_leaf_name.py +70 -0
  79. TreeSAK/gap_stats.py +38 -0
  80. TreeSAK/get_SCG_tree.py +742 -0
  81. TreeSAK/get_arCOG_seq.py +97 -0
  82. TreeSAK/global_functions.py +222 -0
  83. TreeSAK/gnm_leaves.py +43 -0
  84. TreeSAK/iTOL.py +791 -0
  85. TreeSAK/iTOL_gene_tree.py +80 -0
  86. TreeSAK/itol_msa_stats.py +56 -0
  87. TreeSAK/keep_highest_rrtc.py +37 -0
  88. TreeSAK/koTree.py +194 -0
  89. TreeSAK/label_tree.R +75 -0
  90. TreeSAK/label_tree.py +121 -0
  91. TreeSAK/mad.py +708 -0
  92. TreeSAK/mcmc2tree.py +58 -0
  93. TreeSAK/mcmcTC copy.py +92 -0
  94. TreeSAK/mcmcTC.py +104 -0
  95. TreeSAK/mcmctree_vs_reltime.R +44 -0
  96. TreeSAK/mcmctree_vs_reltime.py +252 -0
  97. TreeSAK/merge_pdf.py +32 -0
  98. TreeSAK/pRTC.py +56 -0
  99. TreeSAK/parse_mcmctree.py +198 -0
  100. TreeSAK/parse_reltime.py +141 -0
  101. TreeSAK/phy2fa.py +37 -0
  102. TreeSAK/plot_distruibution_th.py +165 -0
  103. TreeSAK/prep_mcmctree_ctl.py +92 -0
  104. TreeSAK/print_leaves.py +32 -0
  105. TreeSAK/pruneMSA.py +63 -0
  106. TreeSAK/recode.py +73 -0
  107. TreeSAK/remove_bias.R +112 -0
  108. TreeSAK/rename_leaves.py +77 -0
  109. TreeSAK/replace_clade.py +55 -0
  110. TreeSAK/root_with_out_group.py +84 -0
  111. TreeSAK/run_TaxaCountStats_R_s1.py +455 -0
  112. TreeSAK/subsample_drep_gnms.py +74 -0
  113. TreeSAK/subset.py +69 -0
  114. TreeSAK/subset_tree_stupid_old_way.py +193 -0
  115. TreeSAK/supertree.py +330 -0
  116. TreeSAK/tmp_1.py +19 -0
  117. TreeSAK/tmp_2.py +19 -0
  118. TreeSAK/tmp_3.py +120 -0
  119. TreeSAK/weighted_rand.rb +23 -0
  120. treesak-1.51.2.data/scripts/TreeSAK +950 -0
  121. treesak-1.51.2.dist-info/LICENSE +674 -0
  122. treesak-1.51.2.dist-info/METADATA +27 -0
  123. treesak-1.51.2.dist-info/RECORD +125 -0
  124. treesak-1.51.2.dist-info/WHEEL +5 -0
  125. treesak-1.51.2.dist-info/top_level.txt +1 -0
TreeSAK/ALE.py ADDED
@@ -0,0 +1,63 @@
1
+
2
# Usage banner for the ALE module family (module overview plus example commands).
# The iTOL API key is always written as the placeholder `your_own_itol_api`;
# never commit a personal key to the packaged help text.
ALE_usage = '''
================================================= ALE example commands =================================================

# ALE modules
TreeSAK ALE1 -> Step 1: get gene tree
TreeSAK ALE2 -> Step 2: run ALE
TreeSAK ALE3 -> Step 3: parse ALE outputs (ancestral genome reconstruction, transfer propensity/verticality and gain/loss)
TreeSAK ALE4 -> Filter ALE identified HGTs
TreeSAK ALE5 -> Get RTC file based on ALE detected HGTs
TreeSAK SingleAleHGT -> Perform HGT analysis using ALE for single protein family
TreeSAK ALE6 -> faa ancestral genomes
TreeSAK ALE7 -> get function P/A in ancestral genomes

# Example commands
TreeSAK ALE1 -i OrthologousGroups.txt -s combined_d__Archaea_o_rs.faa -p oma -m 50 -t 12 -jst 3 -f -o ALE1_op_dir
TreeSAK ALE2 -1 ALE1_op_dir -s genome_tree_rooted_noEU.treefile -t 10 -f -o ALE2_op_dir -runALE -docker gregmich/alesuite_new
TreeSAK ALE3 -i ALE2_op_dir -o ALE3_op_dir_c0.75 -f -c 0.75
TreeSAK ALE4 -i1 ALE1_op_dir -i2 ALE2_op_dir -c genome_taxon.txt -color phylum_color.txt -o ALE4_op_dir_0.3 -fc 0.3 -f -api your_own_itol_api
TreeSAK ALE4 -i1 ALE1_op_dir -i2 ALE2_op_dir -c genome_taxon.txt -color phylum_color.txt -o ALE4_op_dir_0.5 -fc 0.5 -f -api your_own_itol_api
TreeSAK ALE4 -i1 ALE1_op_dir -i2 ALE2_op_dir -c genome_taxon.txt -color phylum_color.txt -o ALE4_op_dir_0.8 -fc 0.8 -f -api your_own_itol_api
TreeSAK ALE7 -6 ALE6_op_dir -fun ko.txt -node internal_node.txt -o Fun_PA.txt
TreeSAK ALE7 -6 ALE6_op_dir -fun K01995,K01995 -node 359,466,470 -o Fun_PA.txt
TreeSAK ALE7 -6 ALE6_op_dir -fun arCOG07811,K01995 -node 359,466,470 -o Fun_PA.txt
TreeSAK SingleAleHGT -i OMA00001.aln -s genome.treefile -fc 0.3 -c genome_taxon.txt -color phylum_color.txt -api your_own_itol_api -t 9 -f -o OMA00001_ALE_HGT_wd

Note:
Genome names should NOT contain "_".

========================================================================================================================
'''

# Developer scratch notes (bare string literals, never used at runtime).
# API keys have been replaced by the `your_own_itol_api` placeholder.
'''
cd /Users/songweizhi/Desktop/run_ALE_wd
TreeSAK ALE2 -i ALE1_op_dir -s genome_tree_rooted_noEU.treefile -t 10 -f -o ALE2_op_dir -runALE -docker gregmich/alesuite_new
TreeSAK ALE3 -i ALE2_op_dir -c 0.8 -f -o ALE3_op_dir
TreeSAK ALE4 -i1 ALE1_op_dir -i2 ALE2_op_dir -c genome_taxon.txt -color phylum_color.txt -o ALE4_op_dir_0.3 -fc 0.3 -f -api your_own_itol_api
TreeSAK ALE4 -i1 ALE1_op_dir -i2 ALE2_op_dir -c genome_taxon.txt -color phylum_color.txt -o ALE4_op_dir_0.5 -fc 0.5 -f -api your_own_itol_api
TreeSAK ALE4 -i1 ALE1_op_dir -i2 ALE2_op_dir -c genome_taxon.txt -color phylum_color.txt -o ALE4_op_dir_0.8 -fc 0.8 -f -api your_own_itol_api

python3 /Users/songweizhi/PycharmProjects/TreeSAK/TreeSAK/ALE4.py -i1 ALE1_op_dir -i2 ALE2_op_dir -c genome_taxon.txt -color phylum_color.txt -o ALE4_op_dir_0.3 -fc 0.3 -f -api your_own_itol_api
python3 /Users/songweizhi/PycharmProjects/TreeSAK/TreeSAK/ALE4.py -i1 ALE1_op_dir -i2 ALE2_op_dir -c genome_taxon.txt -color phylum_color.txt -o ALE4_op_dir_0.5 -fc 0.5 -f -api your_own_itol_api
python3 /Users/songweizhi/PycharmProjects/TreeSAK/TreeSAK/ALE4.py -i1 ALE1_op_dir -i2 ALE2_op_dir -c genome_taxon.txt -color phylum_color.txt -o ALE4_op_dir_0.8 -fc 0.8 -f -api your_own_itol_api
python3 /Users/songweizhi/PycharmProjects/TreeSAK/TreeSAK/ALE3.py -i ALE2_op_dir -c 0.8 -f -o ALE3_op_dir
'''

'''
cd /Users/songweizhi/Documents/Research/Sponge_Hologenome/6_ALE_wd
python3 /Users/songweizhi/PycharmProjects/TreeSAK/TreeSAK/ALE1.py -i OMA_op_filtered/OrthologousGroups.txt -s OMA_op_filtered/OrthologousGroups.fasta -p oma -m 3 -t 10 -jt 2 -f -o ALE1_op_dir
TreeSAK ALE2 -i ALE1_op_dir -s genome_tree_rooted_noEU.treefile -t 10 -f -o ALE2_op_dir -runALE -docker gregmich/alesuite_new

cd /Users/songweizhi/Documents/Research/Sponge_Hologenome/8_ALE_wd_all_OGs
TreeSAK ALE2 -i ALE1_op_dir_ufboot -s concatenated_rooted.treefile -t 10 -f -o ALE2_op_dir -runALE -docker gregmich/alesuite_new

cd /home-user/wzsong/tmp
TreeSAK ALE2 -i ALE1_op_dir_ufboot -s concatenated_rooted.treefile -t 32 -f -o ALE2_op_dir -runALE -docker gregmich/alesuite_new

cd /Users/songweizhi/Documents/Research/Sponge_Hologenome/8_ALE_wd_all_OGs
TreeSAK ALE3 -i ALE2_op_dir -o ALE3_op_dir_c0.75 -f -c 0.75

cd /Users/songweizhi/Documents/Research/Sponge_Hologenome/8_ALE_wd_all_OGs
/usr/local/bin/python3.7 /Users/songweizhi/PycharmProjects/TreeSAK/TreeSAK/ALE3.py -i ALE2_op_dir -o ALE3_op_dir_c0.75 -f -c 0.75 -a ALE1_arcog_description.txt
'''
TreeSAK/ALE1.py ADDED
@@ -0,0 +1,268 @@
1
+ import os
2
+ import glob
3
+ import argparse
4
+ from Bio import SeqIO
5
+ from ete3 import Tree
6
+ from distutils.spawn import find_executable
7
+
8
+
9
# Usage banner for the ALE1 module. The two example command lines show the two
# input modes handled by ALE1(): orthogroup table + combined sequences (-i/-s)
# and a folder of per-marker sequence files (-ms/-msx).
ALE1_usage = '''
====================================== ALE1 example commands ======================================

TreeSAK ALE1 -i OrthologousGroups.txt -s combined.faa -p oma -m 50 -jst 3 -f -o ALE1_op_dir -bmge
TreeSAK ALE1 -ms s03_marker_seq -msx fa -p marker_set_1 -m 50 -jst 3 -f -o ALE1_op_dir -bmge

===================================================================================================
'''
17
+
18
+
19
def check_dependencies(program_list):
    """Abort the program when a required executable cannot be found on PATH.

    Args:
        program_list: iterable of executable names to look up.

    Raises:
        SystemExit: with status 1 when at least one program is missing, so
            that calling shells and pipelines can detect the failure.
    """
    # shutil.which is the standard-library replacement for
    # distutils.spawn.find_executable (distutils was removed in Python 3.12).
    import shutil

    not_detected_programs = [
        needed_program
        for needed_program in program_list
        if shutil.which(needed_program) is None
    ]

    if not_detected_programs:
        print('%s not found, program exited!' % ','.join(not_detected_programs))
        # raise SystemExit directly: the bare exit() helper is only injected by
        # the site module and is not guaranteed to exist in every interpreter.
        raise SystemExit(1)
29
+
30
+
31
def sep_path_basename_ext(file_in):
    """Split a file path into (directory, base name, extension).

    The directory is reported as '.' when the path has no directory part;
    the extension keeps its leading dot.
    """
    directory, filename = os.path.split(file_in)
    stem, extension = os.path.splitext(filename)
    return (directory if directory != '' else '.'), stem, extension
40
+
41
+
42
def subset_tree(tree_file_in, leaves_to_keep_list, tree_file_out):
    """Prune a tree so that only the requested leaves remain.

    The tree is loaded with Tree(tree_file_in), copied, and pruned with
    preserve_branch_length=True. When tree_file_out is None the result of
    the pruned tree's write() is returned; otherwise the pruned tree is
    written to tree_file_out and nothing is returned.
    """
    pruned_tree = Tree(tree_file_in).copy()
    pruned_tree.prune(leaves_to_keep_list, preserve_branch_length=True)

    if tree_file_out is not None:
        pruned_tree.write(outfile=tree_file_out)
        return None

    return pruned_tree.write()
51
+
52
+
53
def get_ortho_to_gene_dict(ortho_groups_txt, og_program):
    """Parse an orthologous-group table into {group id: [gene id, ...]}.

    Args:
        ortho_groups_txt: path to the orthogroup file.
        og_program: 'orthofinder' (space-separated line, first field is the
            group id followed by ':') or 'oma' (tab-separated line, members
            written as '<prefix>:<gene id> ...').

    Returns:
        dict mapping each group id to the list of its gene ids, in file order.
        Lines starting with '#' and blank lines are ignored. For any other
        og_program value every data line maps the empty id '' to an empty
        list, as in the original implementation.
    """
    ortho_to_gene_dict = dict()

    # the context manager guarantees the handle is closed, even on a parse error
    with open(ortho_groups_txt) as ortho_groups_handle:
        for each_og in ortho_groups_handle:
            og_record = each_og.strip()

            # blank lines would otherwise be stored as a bogus '' orthogroup
            if (not og_record) or each_og.startswith('#'):
                continue

            og_id = ''
            gene_list = []
            if og_program == 'orthofinder':
                each_og_split = og_record.split(' ')
                og_id = each_og_split[0][:-1]  # drop the trailing ':'
                gene_list = each_og_split[1:]
            elif og_program == 'oma':
                each_og_split = og_record.split('\t')
                og_id = each_og_split[0]
                # keep the text between ':' and the first space of each member
                gene_list = [
                    each_protein.split(' ')[0].split(':')[1]
                    for each_protein in each_og_split[1:]
                ]
            ortho_to_gene_dict[og_id] = gene_list

    return ortho_to_gene_dict
74
+
75
+
76
def _qualified_ogs(og_to_gene_dict, min_og_genome_num):
    """Return the ids of the groups whose genes span >= min_og_genome_num genomes."""
    qualified_og_set = set()
    for each_ortho, ortho_genes in og_to_gene_dict.items():
        # a gene id is '<genome>_<index>': everything before the last '_' is the genome
        ortho_gnm_set = {'_'.join(each_gene.split('_')[:-1]) for each_gene in ortho_genes}
        if len(ortho_gnm_set) >= min_og_genome_num:
            qualified_og_set.add(each_ortho)
    return qualified_og_set


def _gene_tree_cmd(seq_file_name, og_id, iqtree_exe, num_threads, bmge_settings):
    """Build the one-line 'align; [trim;] infer tree' command for one group.

    bmge_settings is None (no trimming) or a (jar path, model, entropy cutoff) tuple.
    """
    og_aln = '%s.aln' % og_id
    steps = ['mafft-einsi --thread %s --quiet %s > %s' % (num_threads, seq_file_name, og_aln)]

    tree_input = og_aln
    if bmge_settings is not None:
        pwd_bmge_jar, bmge_trim_model, bmge_entropy_score_cutoff = bmge_settings
        tree_input = '%s_trimmed.aln' % og_id
        steps.append('java -jar %s -i %s -m %s -t AA -h %s -of %s' % (pwd_bmge_jar, og_aln, bmge_trim_model, bmge_entropy_score_cutoff, tree_input))

    steps.append('%s -m LG+G+I -bb 1000 --wbtl -nt %s -s %s -pre %s' % (iqtree_exe, num_threads, tree_input, og_id))
    return '; '.join(steps) + '\n'


def ALE1(args):
    """Prepare per-orthogroup sequence files and gene-tree commands (ALE step 1).

    Two input modes are supported:
      * orthogroup mode: args['i'] (orthogroup table) and args['s'] (combined
        protein fasta) are given and args['ms'] is None; one '<og>.faa' file
        per qualified group is written to the output directory.
      * marker mode: only args['ms'] (folder of per-marker fasta files with
        extension args['msx']) is given; qualified files are copied to the
        output directory.

    A group is qualified when its genes span at least args['m'] genomes. One
    command line per qualified group (mafft-einsi, optional BMGE trimming,
    iqtree) is written to '<args["o"]>_cmds.txt'; the commands are not run.

    Args:
        args: dict with the keys 'i', 's', 'p', 'ms', 'msx', 'm', 'jst', 'f',
            'o', 'bmge', 'bmge_m' and 'bmge_esc'.

    Raises:
        SystemExit: when a required program is missing, the input options do
            not match one of the two modes, or the output directory exists
            and args['f'] is not True.
    """
    # local import: shutil replaces both distutils' find_executable and the
    # shell-string rm/mkdir/cp calls, which broke on paths containing spaces
    import shutil

    orthogroups_op_txt = args['i']
    combined_faa = args['s']
    og_program = args['p']
    marker_seq_dir = args['ms']
    marker_seq_ext = args['msx']
    min_og_genome_num = args['m']
    js_num_threads = args['jst']
    force_create_op_dir = args['f']
    op_dir = args['o']
    trim_with_bmge = args['bmge']
    bmge_trim_model = args['bmge_m']
    bmge_entropy_score_cutoff = args['bmge_esc']
    designate_ogs = []       # developer hook: process only these groups when non-empty
    to_ignore_ogs_list = []  # developer hook: groups to leave out

    # check dependencies
    check_dependencies(['java', 'blastp', 'mafft-einsi'])

    # determine the version of iqtree available on the system
    iqtree_exe = next((exe for exe in ('iqtree2', 'iqtree') if shutil.which(exe)), None)
    if iqtree_exe is None:
        print('iqtree not detected, program exited!')
        raise SystemExit(1)

    # validate the input mode BEFORE touching the output directory; previously
    # an invalid combination wiped/recreated op_dir and then reported success
    og_mode = (orthogroups_op_txt is not None) and (combined_faa is not None) and (marker_seq_dir is None)
    marker_mode = (orthogroups_op_txt is None) and (combined_faa is None) and (marker_seq_dir is not None)
    if not (og_mode or marker_mode):
        print('Please provide either -i together with -s, or -ms alone, program exited!')
        raise SystemExit(1)

    # BMGE.jar is looked up next to this source file
    pwd_bmge_jar = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'BMGE.jar')
    bmge_settings = (pwd_bmge_jar, bmge_trim_model, bmge_entropy_score_cutoff) if trim_with_bmge else None

    # define output file name
    get_gene_tree_cmds_txt = '%s_cmds.txt' % op_dir

    # create op_dir
    if os.path.isdir(op_dir):
        if force_create_op_dir is True:
            shutil.rmtree(op_dir)
        else:
            print('Output folder detected, program exited!')
            raise SystemExit(1)
    os.makedirs(op_dir)

    # collect the group -> genes mapping for the selected mode
    if og_mode:
        og_to_gene_dict = get_ortho_to_gene_dict(orthogroups_op_txt, og_program)
        seq_ext = 'faa'
    else:
        og_to_gene_dict = dict()
        for each_file in glob.glob('%s/*.%s' % (marker_seq_dir, marker_seq_ext)):
            _, f_base, _ = sep_path_basename_ext(each_file)
            og_to_gene_dict[f_base] = {each_seq.id for each_seq in SeqIO.parse(each_file, 'fasta')}
        seq_ext = marker_seq_ext

    # get qualified orthogroups
    qualified_og_set = _qualified_ogs(og_to_gene_dict, min_og_genome_num)
    print('The total number of identified orthogroups is %s.' % len(og_to_gene_dict))
    print('The number of orthogroups spanning >= %s genomes is %s.' % (min_og_genome_num, len(qualified_og_set)))

    # process qualified OG
    og_to_process = qualified_og_set
    if len(designate_ogs) > 0:
        print('The number of designated OGs to process: %s' % len(designate_ogs))
        og_to_process = designate_ogs
    og_to_process_no_ignored = sorted({each_og for each_og in og_to_process if each_og not in to_ignore_ogs_list})

    gene_seq_dict = dict()
    if og_mode:
        # read sequence into dict
        gene_seq_dict = {each_seq.id: str(each_seq.seq) for each_seq in SeqIO.parse(combined_faa, 'fasta')}
        print('Preparing commands and sequence files for building gene trees')
    else:
        print('Preparing commands for building gene trees')

    # write one command line per group and place its sequence file in op_dir
    with open(get_gene_tree_cmds_txt, 'w') as get_gene_tree_cmds_txt_handle:
        for qualified_og in og_to_process_no_ignored:
            seq_file_name = '%s.%s' % (qualified_og, seq_ext)
            get_gene_tree_cmds_txt_handle.write(_gene_tree_cmd(seq_file_name, qualified_og, iqtree_exe, js_num_threads, bmge_settings))

            if og_mode:
                # write out sequences
                with open(os.path.join(op_dir, seq_file_name), 'w') as qualified_og_gene_faa_handle:
                    for each_gene in og_to_gene_dict[qualified_og]:
                        qualified_og_gene_faa_handle.write('>%s\n' % each_gene)
                        qualified_og_gene_faa_handle.write('%s\n' % gene_seq_dict[each_gene])
            else:
                # copy sequence file into output directory
                shutil.copy(os.path.join(marker_seq_dir, seq_file_name), op_dir)

    print('Sequece files exported to %s.' % op_dir)
    print('Commands for inferring gene tree exported to %s.' % get_gene_tree_cmds_txt)
    print('Done!')
250
+
251
+
252
if __name__ == '__main__':

    # Command-line entry point: build the option parser, convert the parsed
    # namespace to a plain dict and hand it to ALE1().
    ALE1_parser = argparse.ArgumentParser()
    # input mode 1: orthogroup table (-i) together with the combined sequences (-s)
    ALE1_parser.add_argument('-i', required=False, default=None, help='orthologous groups, either from orthofinder or oma')
    ALE1_parser.add_argument('-s', required=False, default=None, help='sequence file, e.g., combined.faa')
    # input mode 2: folder of per-marker sequence files (-ms) and their extension (-msx)
    ALE1_parser.add_argument('-ms', required=False, default=None, help='input is a folder holds the sequence of each marker')
    ALE1_parser.add_argument('-msx', required=False, default='fa', help='file extension of marker sequence file, default: fa')
    # NOTE: -p is required in both modes, although ALE1() only passes it to the
    # orthogroup-table parser (input mode 1).
    ALE1_parser.add_argument('-p', required=True, help='orthologous identification program, orthofinder or oma')
    ALE1_parser.add_argument('-m', required=False, type=int, default=50, help='min_og_genome_num, default: 50')
    # optional BMGE trimming of the alignments
    ALE1_parser.add_argument('-bmge', required=False, action="store_true", help='trim MSA with BMGE, default no trimming')
    ALE1_parser.add_argument('-bmge_m', required=False, default='BLOSUM30', help='BMGE trim model, default: BLOSUM30')
    ALE1_parser.add_argument('-bmge_esc', required=False, default='0.55', help='BMGE entropy score cutoff, default: 0.55')
    ALE1_parser.add_argument('-o', required=True, help='output dir, i.e., OMA working directory')
    # thread count written into the generated mafft/iqtree command lines
    ALE1_parser.add_argument('-jst', required=False, type=int, default=3, help='number of threads specified in job script, default: 3')
    ALE1_parser.add_argument('-f', required=False, action="store_true", help='force overwrite')
    args = vars(ALE1_parser.parse_args())
    ALE1(args)
TreeSAK/ALE2.py ADDED
@@ -0,0 +1,168 @@
1
+ import os
2
+ import glob
3
+ import argparse
4
+ from ete3 import Tree
5
+ import multiprocessing as mp
6
+
7
+
8
# Usage banner for the ALE2 module: one example command plus notes on genome
# naming, building the docker image, and which ALE1 outputs are consumed.
ALE2_usage = '''
============================================ ALE2 example commands ===========================================

TreeSAK ALE2 -1 ALE1_op_dir -s genome.treefile -t 10 -f -runALE -docker gregmich/alesuite_new -o ALE2_op_dir

Note:
Genome names should NOT contain "_", the program will tackle this automatically.

# You can try to add this while building the docker images
--platform linux/arm64/v8

# Only the ufboot files in ALE1_op_dir will be needed in this step.

===============================================================================================================
'''
23
+
24
+
25
def sep_path_basename_ext(file_in):
    """Return (directory, base name, extension) for a file path.

    A path without a directory component yields '.' as the directory; the
    extension includes its leading dot.
    """
    # directory part vs. file name
    parent_dir, name_with_ext = os.path.split(file_in)
    parent_dir = parent_dir or '.'

    # base name vs. extension
    name_only, ext_only = os.path.splitext(name_with_ext)

    return parent_dir, name_only, ext_only
36
+
37
+
38
def subset_tree(tree_file_in, leaves_to_keep_list, tree_file_out):
    """Keep only the listed leaves of a tree.

    A copy of Tree(tree_file_in) is pruned with preserve_branch_length=True.
    With tree_file_out set to None the output of write() on the pruned copy
    is returned; with a path, the pruned copy is written there instead and
    the function returns None.
    """
    source_tree = Tree(tree_file_in)
    kept = source_tree.copy()
    kept.prune(leaves_to_keep_list, preserve_branch_length=True)

    write_kwargs = {} if tree_file_out is None else {'outfile': tree_file_out}
    serialized = kept.write(**write_kwargs)
    return serialized if tree_file_out is None else None
47
+
48
+
49
def prepare_ale_ip_worker(arg_list):
    """Rewrite one bootstrap-tree file with underscore-free genome names.

    Every non-blank line of the input file is parsed as one tree
    (Tree(..., format=1)). Each leaf name '<genome>_<index>' is rewritten so
    that the genome part contains no '_' while the final '_<index>' is kept,
    and the renamed tree is written as one line of the output file.

    Args:
        arg_list: [input ufboot path, output ufboot path]; a single list so
            the function can be used with multiprocessing.Pool.map.
    """
    ufboot_in = arg_list[0]
    ufboot_out = arg_list[1]

    # context managers close both handles even when a tree fails to parse
    with open(ufboot_in) as ufboot_in_handle, open(ufboot_out, 'w') as ufboot_out_handle:
        for each_gene_tree in ufboot_in_handle:
            gene_tree_str = each_gene_tree.strip()

            # a blank (e.g. trailing) line is not a tree; skip it instead of
            # letting the parser abort the whole worker
            if not gene_tree_str:
                continue

            gene_tree_in = Tree(gene_tree_str, format=1)
            for leaf in gene_tree_in:
                # split on the LAST '_': left part is the genome, right part the gene index
                gnm_id, _, gene_index = leaf.name.rpartition('_')
                leaf.name = '%s_%s' % (gnm_id.replace('_', ''), gene_index)

            ufboot_out_handle.write(gene_tree_in.write() + '\n')
68
+
69
+
70
def ALE2(args):
    """Prepare ALE inputs and commands and optionally run ALE (ALE step 2).

    For every '*.ufboot' file in the ALE1 output directory the function
      * writes a copy with underscore-free genome names into the output
        directory (via prepare_ale_ip_worker, in parallel), and
      * writes an 'ALEobserve ...; ALEml_undated ...' command line to
        '<args["o"]>_cmds.txt' (prefixed with 'docker run ...' when a docker
        image is given).
    The rooted genome tree is written to '<args["o"]>/genome_tree.newick'
    with '_' removed from its leaf names; the new/old name pairs go to
    'genome_tree_leaf_rename.txt'.
    NOTE(review): that rename table is written relative to the current
    working directory, not inside the output directory; kept as in the
    original, confirm whether downstream steps expect it there.

    With args['runALE'] the commands are executed in parallel from inside the
    output directory (the process changes its working directory).

    Args:
        args: dict with the keys '1', 's', 'f', 't', 'o', 'runALE', 'docker'.
    """
    # local import: shutil.rmtree replaces the shell-string 'rm -r <dir>' call,
    # which misbehaved on paths containing spaces or shell metacharacters
    import shutil

    ale1_op_dir = args['1']
    genome_tree_file_rooted = args['s']
    force_create_ale_wd = args['f']
    num_threads = args['t']
    ale2_op_dir = args['o']
    run_ale = args['runALE']
    docker_image = args['docker']
    run_ale_cmds_txt = '%s_cmds.txt' % ale2_op_dir

    # one OG per ufboot file; sorted so the command file is reproducible
    og_to_process_list = sorted(
        sep_path_basename_ext(each_ufboot)[1]
        for each_ufboot in glob.glob('%s/*.ufboot' % ale1_op_dir)
    )

    # define file name
    gnm_tree_no_underscore = 'genome_tree.newick'
    gnm_tree_leaf_rename_txt = 'genome_tree_leaf_rename.txt'
    gnm_tree_no_underscore_in_wd = '%s/%s' % (ale2_op_dir, gnm_tree_no_underscore)

    # create ale2_op_dir (an existing directory is only wiped with -f)
    if (force_create_ale_wd is True) and os.path.isdir(ale2_op_dir):
        shutil.rmtree(ale2_op_dir)
    os.makedirs(ale2_op_dir, exist_ok=True)

    # prepare genome tree for running ALE
    gnm_tree_in = Tree(genome_tree_file_rooted, format=1)
    with open(gnm_tree_leaf_rename_txt, 'w') as gnm_tree_leaf_rename_txt_handle:
        for leaf in gnm_tree_in:
            leaf_name_new = leaf.name.replace('_', '')
            gnm_tree_leaf_rename_txt_handle.write('%s\t%s\n' % (leaf_name_new, leaf.name))
            leaf.name = leaf_name_new
    gnm_tree_in.write(outfile=gnm_tree_no_underscore_in_wd)

    # prepare gene tree for running ALE
    prepare_ale_ip_worker_arg_lol = []
    ale_cmd_list = []
    with open(run_ale_cmds_txt, 'w') as run_ale_cmds_txt_handle:
        for qualified_og in og_to_process_list:
            pwd_gene_tree_ufboot_in = '%s/%s.ufboot' % (ale1_op_dir, qualified_og)
            pwd_gene_tree_ufboot_out = '%s/%s.ufboot' % (ale2_op_dir, qualified_og)

            if os.path.isfile(pwd_gene_tree_ufboot_in) is False:
                print('%s not found, please build gene tree first!' % pwd_gene_tree_ufboot_in)
                continue

            # get commands for ALEobserve and ALEml_undated; file names are
            # relative because the commands run from inside ale2_op_dir
            obtain_ale_file_cmd = 'ALEobserve %s.ufboot > %s.ALEobserve.log' % (qualified_og, qualified_og)
            reconciliation_cmd = 'ALEml_undated %s %s.ufboot.ale > %s.ALEml_undated.log' % (gnm_tree_no_underscore, qualified_og, qualified_og)
            if docker_image is not None:
                obtain_ale_file_cmd = 'docker run -v $PWD:$PWD -w $PWD %s %s' % (docker_image, obtain_ale_file_cmd)
                reconciliation_cmd = 'docker run -v $PWD:$PWD -w $PWD %s %s' % (docker_image, reconciliation_cmd)

            ale_cmd = '%s; %s\n' % (obtain_ale_file_cmd, reconciliation_cmd)
            run_ale_cmds_txt_handle.write(ale_cmd)
            ale_cmd_list.append(ale_cmd)
            prepare_ale_ip_worker_arg_lol.append([pwd_gene_tree_ufboot_in, pwd_gene_tree_ufboot_out])

    # prepare input files and job script for running ALE with multiprocessing
    print('Preparing files for running ALE with %s cores for %s OGs' % (num_threads, len(prepare_ale_ip_worker_arg_lol)))
    pool = mp.Pool(processes=num_threads)
    try:
        pool.map(prepare_ale_ip_worker, prepare_ale_ip_worker_arg_lol)
    finally:
        # always release the worker processes, even if a worker raised
        pool.close()
        pool.join()

    # run ALE
    if run_ale is True:
        print('running ALE with %s cores for %s OGs' % (num_threads, len(prepare_ale_ip_worker_arg_lol)))
        os.chdir(ale2_op_dir)
        pool = mp.Pool(processes=num_threads)
        try:
            # the commands rely on shell features ($PWD, ';', '>'), hence os.system
            exit_status_list = pool.map(os.system, ale_cmd_list)
        finally:
            pool.close()
            pool.join()

        # surface failures instead of silently discarding the exit statuses
        failed_cmd_num = sum(1 for exit_status in exit_status_list if exit_status != 0)
        if failed_cmd_num > 0:
            print('%s of %s ALE commands returned a non-zero exit status, please check the log files' % (failed_cmd_num, len(ale_cmd_list)))

    print('Done!')
155
+
156
+
157
if __name__ == '__main__':

    # Command-line entry point: parse the options into a dict and run ALE2().
    ALE2_parser = argparse.ArgumentParser()
    # the option is literally named '-1', so ALE2() reads it as args['1']
    ALE2_parser.add_argument('-1', required=True, help='ALE1 output directory')
    ALE2_parser.add_argument('-s', required=True, help='rooted species tree')
    ALE2_parser.add_argument('-o', required=True, help='output dir, i.e., OMA working directory')
    # without -runALE only the input files and the command file are produced
    ALE2_parser.add_argument('-runALE', required=False, action="store_true", help='run ALE')
    ALE2_parser.add_argument('-docker', required=False, default=None, help='Docker image, if ALE was installed with Docker, e.g., gregmich/alesuite_new')
    ALE2_parser.add_argument('-f', required=False, action="store_true", help='force overwrite')
    # used as the size of both multiprocessing pools in ALE2()
    ALE2_parser.add_argument('-t', required=False, type=int, default=6, help='number of threads, default: 6')
    args = vars(ALE2_parser.parse_args())
    ALE2(args)
TreeSAK/ALE2RTC.py ADDED
@@ -0,0 +1,30 @@
1
+ import os
2
+ import argparse
3
+
4
+
5
# Usage banner for the ALE2RTC module; it only points at the -h option.
ALE2RTC_usage = '''
================================ ALE2RTC example commands ================================

TreeSAK ALE2RTC -h

==========================================================================================
'''
12
+
13
+
14
def ALE2RTC():
    """Placeholder for the ALE2RTC module; takes no arguments and does nothing."""

    pass

    # file_in = args['i']
    # op_dir = args['o']


# Disabled command-line entry point. NOTE: it calls ALE2RTC(args), which does
# not match the current zero-argument signature above.
# if __name__ == '__main__':
#
#     ALE2RTC_parser = argparse.ArgumentParser()
#     ALE2RTC_parser.add_argument('-i', required=True, help='the file "out" generated by MCMCTree')
#     ALE2RTC_parser.add_argument('-o', required=True, help='output directory, which will be the input to the pRTC module (-rrtc)')
#     args = vars(ALE2RTC_parser.parse_args())
#     ALE2RTC(args)

# Runs whenever this module is executed or imported (there is no __main__
# guard); it is a no-op while the function body is only `pass`.
ALE2RTC()