treesak 1.53.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (131) hide show
  1. TreeSAK/ALE.py +63 -0
  2. TreeSAK/ALE1.py +268 -0
  3. TreeSAK/ALE2.py +168 -0
  4. TreeSAK/ALE2RTC.py +30 -0
  5. TreeSAK/ALE3.py +205 -0
  6. TreeSAK/ALE4.py +636 -0
  7. TreeSAK/ALE5.py +210 -0
  8. TreeSAK/ALE6.py +401 -0
  9. TreeSAK/ALE7.py +126 -0
  10. TreeSAK/ALE_backup.py +1081 -0
  11. TreeSAK/AssessCVG.py +128 -0
  12. TreeSAK/AssessMarker.py +306 -0
  13. TreeSAK/AssessMarkerDeltaLL.py +257 -0
  14. TreeSAK/AssessMarkerPA.py +317 -0
  15. TreeSAK/AssessPB.py +113 -0
  16. TreeSAK/BMGE.jar +0 -0
  17. TreeSAK/BMGE.py +49 -0
  18. TreeSAK/C60SR4.nex +127 -0
  19. TreeSAK/CompareMCMC.py +138 -0
  20. TreeSAK/ConcateMSA.py +111 -0
  21. TreeSAK/ConvertMSA.py +135 -0
  22. TreeSAK/Dir.rb +82 -0
  23. TreeSAK/ExtractMarkerSeq.py +263 -0
  24. TreeSAK/FastRoot.py +1175 -0
  25. TreeSAK/FastRoot_backup.py +1122 -0
  26. TreeSAK/FigTree.py +34 -0
  27. TreeSAK/GTDB_tree.py +76 -0
  28. TreeSAK/GeneTree.py +142 -0
  29. TreeSAK/KEGG_Luo17.py +807 -0
  30. TreeSAK/LcaToLeaves.py +66 -0
  31. TreeSAK/MarkerRef2Tree.py +616 -0
  32. TreeSAK/MarkerRef2Tree_backup.py +628 -0
  33. TreeSAK/MarkerSeq2Tree.py +299 -0
  34. TreeSAK/MarkerSeq2Tree_backup.py +259 -0
  35. TreeSAK/ModifyTopo.py +116 -0
  36. TreeSAK/Newick_tree_plotter.py +79 -0
  37. TreeSAK/OMA.py +170 -0
  38. TreeSAK/OMA2.py +212 -0
  39. TreeSAK/OneLineAln.py +50 -0
  40. TreeSAK/PB.py +155 -0
  41. TreeSAK/PMSF.py +115 -0
  42. TreeSAK/PhyloBiAssoc.R +84 -0
  43. TreeSAK/PhyloBiAssoc.py +167 -0
  44. TreeSAK/PlotMCMC.py +41 -0
  45. TreeSAK/PlotMcmcNode.py +152 -0
  46. TreeSAK/PlotMcmcNode_old.py +252 -0
  47. TreeSAK/RootTree.py +101 -0
  48. TreeSAK/RootTreeGTDB.py +371 -0
  49. TreeSAK/RootTreeGTDB214.py +288 -0
  50. TreeSAK/RootTreeGTDB220.py +300 -0
  51. TreeSAK/SequentialDating.py +16 -0
  52. TreeSAK/SingleAleHGT.py +157 -0
  53. TreeSAK/SingleLinePhy.py +50 -0
  54. TreeSAK/SliceMSA.py +142 -0
  55. TreeSAK/SplitScore.py +21 -0
  56. TreeSAK/SplitScore1.py +177 -0
  57. TreeSAK/SplitScore1OMA.py +148 -0
  58. TreeSAK/SplitScore2.py +608 -0
  59. TreeSAK/TaxaCountStats.R +256 -0
  60. TreeSAK/TaxonTree.py +47 -0
  61. TreeSAK/TreeSAK_config.py +32 -0
  62. TreeSAK/VERSION +164 -0
  63. TreeSAK/VisHPD95.R +45 -0
  64. TreeSAK/VisHPD95.py +200 -0
  65. TreeSAK/__init__.py +0 -0
  66. TreeSAK/ale_parser.py +74 -0
  67. TreeSAK/ale_splitter.py +63 -0
  68. TreeSAK/alignment_pruner.pl +1471 -0
  69. TreeSAK/assessOG.py +45 -0
  70. TreeSAK/batch_itol.py +171 -0
  71. TreeSAK/catfasta2phy.py +140 -0
  72. TreeSAK/cogTree.py +185 -0
  73. TreeSAK/compare_trees.R +30 -0
  74. TreeSAK/compare_trees.py +255 -0
  75. TreeSAK/dating.py +264 -0
  76. TreeSAK/dating_ss.py +361 -0
  77. TreeSAK/deltall.py +82 -0
  78. TreeSAK/do_rrtc.rb +464 -0
  79. TreeSAK/fa2phy.py +42 -0
  80. TreeSAK/filter_rename_ar53.py +118 -0
  81. TreeSAK/format_leaf_name.py +70 -0
  82. TreeSAK/gap_stats.py +38 -0
  83. TreeSAK/get_SCG_tree.py +742 -0
  84. TreeSAK/get_arCOG_seq.py +97 -0
  85. TreeSAK/global_functions.py +222 -0
  86. TreeSAK/gnm_leaves.py +43 -0
  87. TreeSAK/iTOL.py +791 -0
  88. TreeSAK/iTOL_gene_tree.py +80 -0
  89. TreeSAK/itol_msa_stats.py +56 -0
  90. TreeSAK/keep_highest_rrtc.py +37 -0
  91. TreeSAK/koTree.py +194 -0
  92. TreeSAK/label_gene_tree_by_gnm.py +34 -0
  93. TreeSAK/label_tree.R +75 -0
  94. TreeSAK/label_tree.py +121 -0
  95. TreeSAK/mad.py +708 -0
  96. TreeSAK/mcmc2tree.py +58 -0
  97. TreeSAK/mcmcTC copy.py +92 -0
  98. TreeSAK/mcmcTC.py +104 -0
  99. TreeSAK/mcmctree_vs_reltime.R +44 -0
  100. TreeSAK/mcmctree_vs_reltime.py +252 -0
  101. TreeSAK/merge_pdf.py +32 -0
  102. TreeSAK/pRTC.py +56 -0
  103. TreeSAK/parse_mcmctree.py +198 -0
  104. TreeSAK/parse_reltime.py +141 -0
  105. TreeSAK/phy2fa.py +37 -0
  106. TreeSAK/plot_distruibution_th.py +165 -0
  107. TreeSAK/prep_mcmctree_ctl.py +92 -0
  108. TreeSAK/print_leaves.py +32 -0
  109. TreeSAK/pruneMSA.py +63 -0
  110. TreeSAK/recode.py +73 -0
  111. TreeSAK/remove_bias.R +112 -0
  112. TreeSAK/rename_leaves.py +78 -0
  113. TreeSAK/replace_clade.py +55 -0
  114. TreeSAK/root_with_out_group.py +84 -0
  115. TreeSAK/run_TaxaCountStats_R_s1.py +455 -0
  116. TreeSAK/subsample_drep_gnms.py +74 -0
  117. TreeSAK/subset.py +69 -0
  118. TreeSAK/subset_tree_stupid_old_way.py +193 -0
  119. TreeSAK/supertree.py +330 -0
  120. TreeSAK/tmp_1.py +19 -0
  121. TreeSAK/tmp_2.py +19 -0
  122. TreeSAK/tmp_3.py +120 -0
  123. TreeSAK/tmp_4.py +43 -0
  124. TreeSAK/tmp_5.py +12 -0
  125. TreeSAK/weighted_rand.rb +23 -0
  126. treesak-1.53.3.data/scripts/TreeSAK +955 -0
  127. treesak-1.53.3.dist-info/LICENSE +674 -0
  128. treesak-1.53.3.dist-info/METADATA +27 -0
  129. treesak-1.53.3.dist-info/RECORD +131 -0
  130. treesak-1.53.3.dist-info/WHEEL +5 -0
  131. treesak-1.53.3.dist-info/top_level.txt +1 -0
@@ -0,0 +1,79 @@
1
+ import os
2
+ from ete3 import Tree
3
+ from ete3 import TreeStyle
4
+ from ete3 import NodeStyle
5
+ from ete3 import TextFace
6
+
7
+
8
def plot_tree(tree, tree_title, tree_output):
    """Render ``tree`` to an image file, labelling every node with its name.

    Args:
        tree: tree object providing ``traverse()`` and ``render()``; the
            script in this file passes an ``ete3.Tree``.
        tree_title: text added to the title area of the image.
        tree_output: path of the image file handed to ``tree.render``.
    """
    # ---- overall tree style ----
    tree_style = TreeStyle()
    tree_style.mode = "r"                   # 'r' for rectangular, 'c' for circular
    tree_style.show_leaf_name = 0           # names are drawn by the TextFaces added below
    tree_style.rotation = 0                 # from 0 to 360
    tree_style.show_scale = False
    tree_style.show_border = False          # no border around the tree image
    tree_style.branch_vertical_margin = 3   # pixels between adjacent branches
    for side in ('top', 'bottom', 'left', 'right'):
        setattr(tree_style, 'margin_%s' % side, 10)  # image margins

    # ---- title ----
    title_face = TextFace(tree_title,
                          fsize=8,
                          fgcolor='black',
                          ftype='Arial',
                          tight_text=False)
    tree_style.title.add_face(title_face, column=0)

    # ---- per-node style and name label ----
    for node in tree.traverse():
        node_style = NodeStyle()
        node_style["size"] = 0  # dot size

        if node.is_leaf():
            node_style["shape"] = "circle"  # dot shape: circle, square or sphere
            node_style["fgcolor"] = "blue"  # dot colour
            for width_key in ('hz_line_width', 'vt_line_width'):
                node_style[width_key] = 0.5  # branch line width
            for type_key in ('hz_line_type', 'vt_line_type'):
                node_style[type_key] = 0  # 0 for solid, 1 for dashed, 2 for dotted
            label_size, label_column, label_position = 5, 0, 'branch-right'
        else:
            label_size, label_column, label_position = 3, 5, 'branch-top'

        name_face = TextFace(node.name,
                             fsize=label_size,
                             fgcolor='black',
                             tight_text=False,
                             bold=False)
        node.add_face(name_face, column=label_column, position=label_position)
        node.set_style(node_style)

    # the figure is rendered 900 px wide
    tree.render(tree_output, w=900, units="px", tree_style=tree_style)
69
+
70
+
71
# NOTE(review): the statements below run whenever this module is imported or
# executed, and the output path is hard-coded to one user's Desktop folder.
#os.chdir('/Users/songweizhi/Desktop')

# Newick text of the example tree that gets plotted.
tree_2 = '(CF_Refined_71:0.21847,CF_Refined_170:0.41504,(((CF_Refined_7:0.63495,CF_Refined_96:0.68718)0.984:0.33915,CF_Refined_82:0.16074)0.980:0.12012,((CF_Refined_25:0.20437,CF_Refined_95:1.40476)0.367:0.60450,(((CF_Refined_86:0.37933,(CF_Refined_74:0.61406,CF_Refined_43:0.10850)1.000:0.34468)0.999:0.19175,((CF_Refined_57:0.19003,(CF_Refined_99:0.18534,CF_Refined_160:0.33153)0.861:0.04660)1.000:0.18553,(CF_Refined_78:0.64747,(CF_Refined_129:0.26317,(CF_Refined_64:0.25293,CF_Refined_100:0.10449)0.993:0.14949)0.577:0.13006)1.000:0.19231)0.998:0.16057)0.961:0.08413,((CF_Refined_155:0.18262,CF_Refined_180:0.02711)1.000:0.42318,(CF_Refined_162:0.64092,CF_Refined_131:0.36385)1.000:0.48656)0.992:0.23471)0.853:0.05581)0.955:0.08666)1.000:0.14706);'

# NOTE(review): format=1 presumably makes ete3 read the values written after
# each ')' as internal node names - confirm against the ete3 documentation.
tree = Tree(tree_2, format=1)


# Draw the tree with the title 'Species_Tree' and write it to the PNG path below.
plot_tree(tree, 'Species_Tree', '/Users/songweizhi/Desktop/Species_Tree.png')
79
+
TreeSAK/OMA.py ADDED
@@ -0,0 +1,170 @@
1
+ import os
2
+ import glob
3
+ import argparse
4
+
5
+
6
+ OMA_usage = '''
7
+ ======================= OMA example commands =======================
8
+
9
+ TreeSAK OMA -i faa_files -x faa -og og_gnm.txt -o OMA_wd -f -t 32
10
+
11
+ ====================================================================
12
+ '''
13
+
14
def sep_path_basename_ext(file_in):
    """Split a file path into ``(directory, base name, extension)``.

    The directory is reported as '.' when ``file_in`` has no directory part;
    the extension keeps its leading dot (as returned by ``os.path.splitext``).
    """
    directory, file_name = os.path.split(file_in)
    base_name, extension = os.path.splitext(file_name)
    return ('.' if directory == '' else directory), base_name, extension
21
+
22
+
23
def get_default_para_dict():
    """Return the default OMA parameters as a ``{name: value}`` dict.

    The parameters are kept as a block of ``Name := value;`` statements, one
    per line. Each line is parsed independently: all whitespace is removed,
    everything from the first ';' onwards (e.g. a trailing '# 20min' comment)
    is dropped, and the remainder is split at ':=' into name and value.
    Values are returned as strings exactly as written (quotes included), and
    the dict preserves the order of the statements.
    """

    # QuasiCliquesCutoff is terminated with ';' like every other statement;
    # with a ':' terminator the colon would end up inside the parsed value.
    default_para_str = '''
    OutputFolder := 'Output';
    ReuseCachedResults := true;
    AlignBatchSize := 1e6;
    MinScore := 181;
    LengthTol := 0.61;
    StablePairTol := 1.81;
    InparalogTol := 3.00;
    ParalogTol := -2.5*StablePairTol;
    VerifiedPairTol := 1.53;
    MinSeqLen := 50;
    UseOnlyOneSplicingVariant := true;
    UseExperimentalHomologousClusters := false;
    QuasiCliquesCutoff := 1.0;
    StableIdsForGroups := false;
    GuessIdType := false;
    DoHierarchicalGroups := 'bottom-up';
    SpeciesTree := 'estimate';
    MinEdgeCompletenessFraction := 0.65;
    ReachabilityCutoff := 0.65;
    MaxTimePerLevel := 1200; # 20min
    DoGroupFunctionPrediction := true;
    GroupFunctionCutoff := 0.5;
    CladeDefinition := 'default';
    UseEsprit := false;
    DistConfLevel := 2;
    MinProbContig := 0.4;
    MaxContigOverlap := 5;
    MinSeqLenContig := 20;
    MinBestScore := 250;
    '''

    default_para_dict = dict()
    # iterate over lines: splitting on single spaces would tear each
    # 'Name := value;' statement apart and break the ':=' split below
    for each_line in default_para_str.splitlines():
        para_line = ''.join(each_line.split()).split(';')[0]
        if para_line == '':
            continue
        para_name, separator, para_value = para_line.partition(':=')
        if separator == '':
            # not an assignment statement, nothing to record
            continue
        default_para_dict[para_name] = para_value

    return default_para_dict
65
+
66
+
67
def OMA(args):
    """Prepare a working directory for an OMA run.

    Steps:
      1. validate the genome folder and the outgroup id file;
      2. copy every ``<i>/*.<x>`` genome file to ``<o>/DB/<name>.fa``, where
         dots in the base name are replaced by underscores;
      3. list the renamed genomes in ``<o>/rename.txt`` (only written when at
         least one genome had to be renamed);
      4. write ``<o>/parameters.drw`` (InputDataType, OutgroupSpecies and the
         defaults from ``get_default_para_dict``);
      5. print the commands for launching OMA.

    All inputs are validated before an existing output folder is removed, so
    a failed check never destroys previous results.

    Args:
        args: dict with the keys
            'i'  genome folder,
            'x'  genome file extension (without the dot),
            'st' sequence type, 'AA' or 'DNA' (any letter case),
            'og' text file with one outgroup genome id (file base name) per line,
            'o'  output folder,
            'f'  True to overwrite an existing output folder,
            't'  number of threads shown in the printed ``oma`` command.
    """
    import shutil  # local import: replaces the shell 'rm' and 'cp' calls

    gnm_dir = args['i']
    file_ext = args['x']
    seq_type = args['st']
    og_gnm_txt = args['og']
    op_dir = args['o']
    force_overwrite = args['f']
    num_threads = args['t']

    # define file name
    pwd_gnm_rename_txt = '%s/rename.txt' % op_dir
    pwd_parameter_file = '%s/parameters.drw' % op_dir
    oma_input_dir = '%s/DB' % op_dir

    # check genome files
    gnm_file_re = '%s/*.%s' % (gnm_dir, file_ext)
    gnm_file_list = glob.glob(gnm_file_re)
    if len(gnm_file_list) == 0:
        print('No genome detected, program exited!')
        exit()

    # check og_gnm_txt
    if os.path.isfile(og_gnm_txt) is False:
        print('Out group genome id file not detected, program exited!')
        exit()

    # map every genome id to its OMA-compatible name (dots -> underscores);
    # unchanged ids are recorded too so that outgroup ids can always be looked up
    gnm_id_rename_dict = dict()
    gnm_file_to_db_file = []
    rename_list = []
    for each_gnm in gnm_file_list:
        gnm_path, gnm_base, gnm_ext = sep_path_basename_ext(each_gnm)
        gnm_base_renamed = gnm_base.replace('.', '_')
        gnm_id_rename_dict[gnm_base] = gnm_base_renamed
        gnm_file_to_db_file.append((each_gnm, '%s/%s.fa' % (oma_input_dir, gnm_base_renamed)))
        if gnm_base != gnm_base_renamed:
            rename_list.append('%s\t%s' % (gnm_base, gnm_base_renamed))

    # read in og_gnm_txt, ignoring blank lines
    renamed_og_gnm_list = []
    with open(og_gnm_txt) as og_gnm_txt_handle:
        for each_og_gnm in og_gnm_txt_handle:
            og_gnm = each_og_gnm.strip()
            if og_gnm == '':
                continue
            if og_gnm not in gnm_id_rename_dict:
                print('Outgroup genome %s not found in %s, program exited!' % (og_gnm, gnm_dir))
                exit()
            renamed_og_gnm_list.append(gnm_id_rename_dict[og_gnm])

    # create dir (paths are passed to Python directly, never through a shell)
    if os.path.isdir(op_dir) is True:
        if force_overwrite is True:
            shutil.rmtree(op_dir)
        else:
            print('output folder detected, program exited!')
            exit()
    os.makedirs(oma_input_dir)

    # copy genome files into DB folder
    for each_gnm, pwd_gnm_db in gnm_file_to_db_file:
        shutil.copy(each_gnm, pwd_gnm_db)

    # write out rename file
    if len(rename_list) > 0:
        with open(pwd_gnm_rename_txt, 'w') as pwd_gnm_rename_txt_handle:
            for each_e in sorted(rename_list):
                pwd_gnm_rename_txt_handle.write(each_e + '\n')
    else:
        print('Format of file names passed checking')

    # get default_para_dict
    default_para_dict = get_default_para_dict()

    # write out parameter file
    with open(pwd_parameter_file, 'w') as pwd_parameter_file_handle:

        # write InputDataType line (no line is written for an unrecognised type)
        seq_type_upper = str(seq_type).upper()
        if seq_type_upper in ('AA', 'DNA'):
            pwd_parameter_file_handle.write("InputDataType := '%s';\n" % seq_type_upper)

        # write OutgroupSpecies line
        OutgroupSpecies_value_str = "['%s']" % "', '".join(renamed_og_gnm_list)
        pwd_parameter_file_handle.write("OutgroupSpecies := %s;\n" % OutgroupSpecies_value_str)

        # write out the rest lines
        for each_para, para_value in default_para_dict.items():
            pwd_parameter_file_handle.write("%s := %s;\n" % (each_para, para_value))

    # final report
    print('You can run OMA with:')
    print('cd %s' % op_dir)
    print('oma -n %s' % num_threads)
    print('# You may want to customize parameters specified in %s ' % pwd_parameter_file)
157
+
158
+
159
# Command-line entry point: parse the options and pass them to OMA() as a
# dict keyed by option name (e.g. args['i'], args['og']).
if __name__ == '__main__':

    OMA_parser = argparse.ArgumentParser()
    OMA_parser.add_argument('-i', required=True, help='genome folder')
    OMA_parser.add_argument('-x', required=True, help='genome file extension')
    OMA_parser.add_argument('-st', required=False, default='AA', help='sequence type, AA or DNA, default: AA')
    # OMA() opens the value of -og as a text file and reads one genome id per line
    OMA_parser.add_argument('-og', required=True, help='outgroup genomes, without file extension')
    OMA_parser.add_argument('-o', required=True, default=None, help='output dir, i.e., OMA working directory')
    OMA_parser.add_argument('-f', required=False, action="store_true", help='force overwrite')
    OMA_parser.add_argument('-t', required=False, type=int, default=6, help='number of threads for running OMA, default: 6')
    args = vars(OMA_parser.parse_args())
    OMA(args)
TreeSAK/OMA2.py ADDED
@@ -0,0 +1,212 @@
1
+ import os
2
+ import glob
3
+ import argparse
4
+ from Bio import SeqIO
5
+
6
+
7
+ OMA2_usage = '''
8
+ ============================== OMA2 example commands ==============================
9
+
10
+ TreeSAK OMA2 -i OrthologousGroups.txt -s OrthologousGroupsFasta -o op_dir -f -n 3
11
+ TreeSAK OMA2 -i OrthologousGroups.txt -s OrthologousGroupsFasta -o op_dir -f -c 85
12
+
13
+ ===================================================================================
14
+ '''
15
+
16
+
17
def sep_path_basename_ext(file_in):
    """Split a file path into ``(file name, directory, base name, extension)``.

    The directory is '.' when ``file_in`` has no directory part, and the
    extension is returned without its leading dot.
    """
    directory, file_name = os.path.split(file_in)
    stem, dotted_ext = os.path.splitext(file_name)
    if directory == '':
        directory = '.'
    return file_name, directory, stem, dotted_ext[1:]
25
+
26
+
27
def get_gnm_og_cov(og_dir, og_ext, og_cov_txt):
    """Write, for every genome, the percentage of OG files it occurs in.

    Args:
        og_dir: folder holding one fasta file per orthologous group (OG).
        og_ext: extension of the OG files (without the dot).
        og_cov_txt: output file; one ``<genome>\\t<percentage>`` line per
            genome, sorted by genome id, percentage rounded to two decimals.

    The genome id of a sequence is its id with the last '_'-separated field
    removed; the OG id is the base name of the fasta file.
    """
    og_files = glob.glob('%s/*.%s' % (og_dir, og_ext))

    # genome id -> set of OG ids containing at least one of its sequences
    ogs_per_gnm = dict()
    for og_file in og_files:
        og_id = sep_path_basename_ext(og_file)[2]
        for record in SeqIO.parse(og_file, 'fasta'):
            gnm_id = '_'.join(record.id.split('_')[:-1])
            ogs_per_gnm.setdefault(gnm_id, set()).add(og_id)

    with open(og_cov_txt, 'w') as cov_handle:
        for gnm_id in sorted(ogs_per_gnm):
            percentage = len(ogs_per_gnm[gnm_id]) * 100 / len(og_files)
            percentage = float("{0:.2f}".format(percentage))
            cov_handle.write('%s\t%s\n' % (gnm_id, percentage))
49
+
50
+
51
def get_ortho_to_gene_dict(ortho_groups_txt, og_program):
    """Read an orthologous-group table into a ``{og_id: [gene_id, ...]}`` dict.

    Args:
        ortho_groups_txt: path of the group table. Blank lines and lines
            starting with '#' are ignored.
        og_program: layout of the table:
            'orthofinder' - space-separated fields; the first field is the OG
                id followed by one trailing character (dropped), the remaining
                fields are gene ids;
            'oma' - tab-separated fields; the first field is the OG id, every
                other field starts with ``<prefix>:<gene_id>`` optionally
                followed by a space and further text.

    Returns:
        dict mapping each OG id to the list of its gene ids, in file order.

    Raises:
        ValueError: if ``og_program`` is neither 'orthofinder' nor 'oma'.
    """
    if og_program not in ('orthofinder', 'oma'):
        # previously every line was silently stored under the key ''
        raise ValueError("og_program must be 'orthofinder' or 'oma', got %r" % (og_program,))

    ortho_to_gene_dict = dict()
    with open(ortho_groups_txt) as ortho_groups_handle:
        for each_og in ortho_groups_handle:
            each_og = each_og.strip()
            # skip blank lines as well, otherwise they create a '' group
            if each_og == '' or each_og.startswith('#'):
                continue

            if og_program == 'orthofinder':
                each_og_split = each_og.split(' ')
                og_id = each_og_split[0][:-1]
                gene_list = each_og_split[1:]
            else:
                each_og_split = each_og.split('\t')
                og_id = each_og_split[0]
                gene_list = [each_protein.split(' ')[0].split(':')[1]
                             for each_protein in each_og_split[1:]]

            ortho_to_gene_dict[og_id] = gene_list

    return ortho_to_gene_dict
72
+
73
+
74
def select_seq(seq_in, seq_id_list, seq_out):
    """Copy the sequences whose id is in ``seq_id_list`` from ``seq_in`` to ``seq_out``.

    Both files are fasta; every selected record is written as a '>id' line
    followed by the whole sequence on one line, in input order.
    """
    with open(seq_out, 'w') as selected_handle:
        for record in SeqIO.parse(seq_in, 'fasta'):
            if record.id not in seq_id_list:
                continue
            selected_handle.write('>%s\n%s\n' % (record.id, record.seq))
81
+
82
+
83
def OMA2(args):
    """Filter orthologous groups (OGs) by gene number or genome coverage.

    Reads the tab-separated OG table ``args['i']`` (first field: OG id, other
    fields: ``<genome>:<gene_id>`` optionally followed by a space and more
    text), keeps the OGs passing the cutoff and writes into ``args['o']``:
      * ``OrthologousGroups_<num|cov><cutoff>.txt`` - the kept OGs and genes;
      * ``OrthologousGroupsFasta_<num|cov><cutoff>/`` - one fasta per kept OG,
        extracted from ``<s>/OG<number>.fa`` with ``select_seq``;
      * ``OrthologousGroups_<num|cov><cutoff>_per_genome.txt`` - per-genome OG
        coverage produced by ``get_gnm_og_cov``.

    Args:
        args: dict with the keys
            'i' OG table,
            's' folder with the OG fasta files,
            'g' optional file listing the genomes of interest (first
                whitespace-separated field of each line), or None,
            'o' output folder,
            'f' True to overwrite an existing output folder,
            'n' minimal number of genes per OG, or None,
            'c' minimal genome coverage (percent), or None.
            Exactly one of 'n' and 'c' must be given.
    """
    import shutil  # local import: replaces the shell 'rm -r' call

    og_txt = args['i']
    og_seq_dir = args['s']
    gnm_txt = args['g']
    op_dir = args['o']
    force_overwrite = args['f']
    min_gene_num = args['n']
    min_gene_cov = args['c']

    if (min_gene_num is None) and (min_gene_cov is None):
        print('Please specify either -c or -n, program exited!')
        exit()
    elif (min_gene_num is not None) and (min_gene_cov is not None):
        print('-c and -n are not compatible, program exited!')
        exit()

    # exactly one cutoff is set from here on
    if min_gene_num is not None:
        cutoff_tag = 'num%s' % min_gene_num
        cutoff_value = float(min_gene_num)
    else:
        cutoff_tag = 'cov%s' % min_gene_cov
        cutoff_value = float(min_gene_cov)

    og_txt_out = '%s/OrthologousGroups_%s.txt' % (op_dir, cutoff_tag)
    gnm_og_num_txt = '%s/OrthologousGroups_%s_per_genome.txt' % (op_dir, cutoff_tag)
    filtered_seq_dir = '%s/OrthologousGroupsFasta_%s' % (op_dir, cutoff_tag)

    # check genome files
    interested_gnm_set = set()
    if gnm_txt is not None:
        if os.path.isfile(gnm_txt) is True:
            with open(gnm_txt) as gnm_txt_handle:
                for each_gnm in gnm_txt_handle:
                    each_gnm_split = each_gnm.strip().split()
                    if each_gnm_split:  # ignore blank lines
                        interested_gnm_set.add(each_gnm_split[0])
        else:
            print('%s not found, program exited!' % gnm_txt)
            exit()

    # read the OG table once: [(og_id, [(genome, gene_id), ...]), ...]
    og_member_list = []
    overall_gnm_set = set()
    with open(og_txt) as og_txt_handle:
        for each_line in og_txt_handle:
            if each_line.startswith('#') or each_line.strip() == '':
                continue
            each_line_split = each_line.strip().split('\t')
            member_list = []
            for each_gene in each_line_split[1:]:
                each_gene_split = each_gene.split(':')
                gene_gnm = each_gene_split[0]
                gene_id = each_gene_split[1].split(' ')[0]
                overall_gnm_set.add(gene_gnm)
                member_list.append((gene_gnm, gene_id))
            og_member_list.append((each_line_split[0], member_list))

    # create dir (only after all inputs were read successfully)
    if os.path.isdir(op_dir) is True:
        if force_overwrite is True:
            shutil.rmtree(op_dir)
        else:
            print('output folder detected, program exited!')
            exit()
    os.makedirs(filtered_seq_dir)

    # coverage is relative to the genomes of interest, or to all genomes in
    # the table when no genome list was provided
    if len(interested_gnm_set) == 0:
        total_gnm_num = len(overall_gnm_set)
    else:
        total_gnm_num = len(interested_gnm_set)

    qualified_og_set = set()
    id_to_name_dict = dict()
    gene_to_extract_dict = dict()
    with open(og_txt_out, 'w') as og_txt_out_handle:
        for og_id, member_list in og_member_list:
            # file name of the OG: 'OG' + numeric part of the id without leading zeros
            filename = 'OG%s' % int(og_id[3:])
            id_to_name_dict[og_id] = filename

            if len(interested_gnm_set) == 0:
                filtered_gene_set = {gene_id for gene_gnm, gene_id in member_list}
            else:
                filtered_gene_set = {gene_id for gene_gnm, gene_id in member_list
                                     if gene_gnm in interested_gnm_set}

            if min_gene_num is not None:
                qualified_og = len(filtered_gene_set) >= cutoff_value
            elif total_gnm_num == 0:
                qualified_og = False  # no genome at all, coverage is undefined
            else:
                gnm_cov = len(filtered_gene_set) * 100 / total_gnm_num
                qualified_og = gnm_cov >= cutoff_value

            if qualified_og is True:
                qualified_og_set.add(og_id)
                og_txt_out_handle.write('%s\t%s\n' % (filename, ','.join(sorted(filtered_gene_set))))
                gene_to_extract_dict[og_id] = filtered_gene_set

    # extract the kept genes of every qualified OG
    for each_og in gene_to_extract_dict:
        seq_file_name = id_to_name_dict[each_og]
        source_file = '%s/%s.fa' % (og_seq_dir, seq_file_name)
        filtered_file = '%s/%s.fa' % (filtered_seq_dir, seq_file_name)
        select_seq(source_file, gene_to_extract_dict[each_og], filtered_file)

    # get_gnm_og_cov
    get_gnm_og_cov(filtered_seq_dir, 'fa', gnm_og_num_txt)

    # report
    if min_gene_num is not None:
        print('The number of OG with genes >= %s is %s' % (min_gene_num, len(qualified_og_set)))
    if min_gene_cov is not None:
        print('The number of OG with coverage >= %s is %s' % (min_gene_cov, len(qualified_og_set)))

    print('Done!')
199
+
200
+
201
# Command-line entry point: parse the options and pass them to OMA2() as a
# dict keyed by option name.
if __name__ == '__main__':

    OMA2_parser = argparse.ArgumentParser()
    OMA2_parser.add_argument('-i', required=True, help='OrthologousGroups.txt')
    OMA2_parser.add_argument('-s', required=True, help='sequence dir, OrthologousGroupsFasta')
    OMA2_parser.add_argument('-g', required=False, default=None, help='interested genomes')
    OMA2_parser.add_argument('-o', required=True, default=None, help='output directory')
    # -n and -c carry no type=, so they arrive as strings; OMA2() converts them with float()
    OMA2_parser.add_argument('-n', required=False, default=None, help='minimal number of gene in a OG, not compatible with -c')
    OMA2_parser.add_argument('-c', required=False, default=None, help='minimal genome coverage cutoff, not compatible with -n')
    OMA2_parser.add_argument('-f', required=False, action="store_true", help='force overwrite')
    args = vars(OMA2_parser.parse_args())
    OMA2(args)
TreeSAK/OneLineAln.py ADDED
@@ -0,0 +1,50 @@
1
+ import argparse
2
+ from Bio import SeqIO
3
+
4
+
5
# Example command lines for the OneLineAln sub-command. The program name is
# TreeSAK, matching the other usage strings of this package.
OneLineAln_usage = '''
========================= OneLineAln example commands =========================

TreeSAK OneLineAln -in MarkerGenes.aln -out MarkerGenes_OneLine.aln
TreeSAK OneLineAln -in MarkerGenes.aln -out MarkerGenes_OneLine.aln -upper

===============================================================================
'''
13
+
14
def OneLineAln(args):
    """Rewrite a fasta alignment with one ``<id>\\t<sequence>`` line per record.

    Ids are right-padded with spaces to the length of the longest id.

    Args:
        args: dict with the keys
            'in'    input alignment in fasta format,
            'out'   output file,
            'upper' True to write the sequences in upper case.
    """
    aln_in_fasta = args['in']
    aln_out_one_line = args['out']
    to_uppercase = args['upper']

    # first pass over the input: width of the id column
    id_width = max((len(record.id) for record in SeqIO.parse(aln_in_fasta, 'fasta')), default=0)

    # second pass: write out in the new format
    with open(aln_out_one_line, 'w') as one_line_handle:
        for record in SeqIO.parse(aln_in_fasta, 'fasta'):
            sequence = str(record.seq)
            if to_uppercase is True:
                sequence = sequence.upper()
            one_line_handle.write('%s\t%s\n' % (record.id.ljust(id_width), sequence))
36
+
37
+
38
# Command-line entry point: parse the options and pass them to OneLineAln()
# as a dict keyed by option name.
if __name__ == '__main__':

    OneLineAln_parser = argparse.ArgumentParser()

    # arguments for OneLineAln_parser
    OneLineAln_parser.add_argument('-in', required=True, help='input MSA in fasta format')
    # NOTE(review): -out is optional and defaults to None, but OneLineAln()
    # passes it straight to open() - confirm whether it should be required.
    OneLineAln_parser.add_argument('-out', required=False, default=None, help='output file')
    OneLineAln_parser.add_argument('-upper', required=False, action='store_true', help='turn to uppercase')

    args = vars(OneLineAln_parser.parse_args())

    OneLineAln(args)
50
+
TreeSAK/PB.py ADDED
@@ -0,0 +1,155 @@
1
+ import os
2
+ import argparse
3
+ from Bio import AlignIO
4
+
5
+
6
+ PB_usage = '''
7
+ ========================================== PB example commands ==========================================
8
+
9
+ # Dependency: mpirun, pb_mpi and readpb_mpi (from PhyloBayes-MPI)
10
+
11
+ export OMPI_MCA_btl=^openib
12
+ TreeSAK PB -i in.phylip -p best20pb -o best20pb -t 52
13
+ TreeSAK PB -i in.phylip -p best20pb -o best20pb -t 52 -n 1
14
+ TreeSAK PB -i in.phylip -p worst20pb -o worst20pb -t 52
15
+
16
+ # Notes:
17
+ 1. This is a wrapper for: mpirun -np 12 pb_mpi -d in.phylip -cat -gtr -x 10 -1 -dgam 4 -s chain_name
18
+ 2. Input MSA need to be in phylip format.
19
+ 3. To stop a chain, just open the <chain_name>.run file and replace the 1 by a 0 (echo 0 > <chain_name>.run).
20
+ 4. Be careful not to restart an already running chain.
21
+ 5. You can stop a chain and restart it under a different degree of parallelization.
22
+ 6. Generally, PhyloBayes provides good results for a total number of points of 10000-30000.
23
+ 7. Results can be assessed with bpcomp and tracecomp
24
+
25
+ * Settings used by Nina Dombrowski: -cat -gtr -x 10 -1 -dgam 4
26
+ For each marker protein family, four parallel chains were run until convergence was reached, unless stated
27
+ otherwise (maxdiff < 0.3; settings: bpcomp -x 25_burnin chain1 chain2 chain3 chain4). Additionally, we
28
+ checked for the minimum effective size using tracecomp (minimum effective size > 50; settings: -x 25_burnin
29
+ chain1 chain2 chain3 chain4).
30
+
31
+ * Settings used by Fan Lu:
32
+ Four chains were run for each consensus tree, and for each chain over 15,000 cycles (5,000 burn-in)
33
+ were conducted, until a maxdiff value lower than 0.3 was reached. Otherwise, non-converged chains were
34
+ continually run to over 20,000 cycles. Posterior predictive tests were conducted using PhyloBayes MPI
35
+ with the ‘readpb_mpi -x 5000 50 -allppred’ command.
36
+
37
+ =========================================================================================================
38
+ '''
39
+
40
+
41
def sep_path_basename_ext(file_in):
    """Return ``(file name, directory, base name, extension)`` of a path.

    A path without directory part gets '.' as directory; the extension is
    returned without the leading dot.
    """
    parent_dir, name = os.path.split(file_in)
    parent_dir = '.' if parent_dir == '' else parent_dir
    base, ext_with_dot = os.path.splitext(name)
    return name, parent_dir, base, ext_with_dot[1:]
49
+
50
+
51
def fa2phy(fasta_in, phy_out):
    """Convert a fasta alignment into a phylip-style text file.

    The first output line holds the number of sequences and the alignment
    length; every following line is a sequence id padded with spaces to two
    characters more than the longest id, directly followed by the sequence.
    """
    alignment = AlignIO.read(fasta_in, 'fasta')

    # width of the id column: longest id plus two separating spaces
    id_column_width = max((len(record.id) for record in alignment), default=0) + 2

    with open(phy_out, 'w') as phy_handle:
        phy_handle.write('%s %s\n' % (len(alignment), alignment.get_alignment_length()))
        for record in alignment:
            phy_handle.write('%s%s\n' % (record.id.ljust(id_column_width), str(record.seq)))
67
+
68
+
69
def PB(args):
    """Set up PhyloBayes-MPI chains and export the commands to run them.

    Nothing is executed here: the function creates the output folder (plus
    one sub-folder per chain when more than one chain is requested),
    optionally converts the input alignment to phylip, and writes
    ``<o>/<msa base name>_cmds.txt`` containing
      * one ``mpirun ... pb_mpi`` command per chain,
      * one restart command per chain,
      * ``bpcomp`` and ``tracecomp`` commands over all chains (only when more
        than one chain is requested).

    Args:
        args: dict with the keys
            'i'      input alignment,
            'o'      output folder,
            'p'      prefix of the chain names,
            'fa2plp' True to convert the fasta input to phylip with ``fa2phy``,
            't'      value passed to ``mpirun -np``,
            'n'      number of chains,
            'f'      True to overwrite an existing output folder.
    """
    import shutil  # local import: replaces the shell 'rm -r' call

    msa_in = args['i']
    op_dir = args['o']
    op_prefix = args['p']
    fa_to_plp = args['fa2plp']
    num_of_threads = args['t']
    num_of_chains = args['n']
    force_overwrite = args['f']

    ####################################################################################################################

    # base name of the input file: file name without directory and extension
    msa_in_base = os.path.splitext(os.path.basename(msa_in))[0]

    settings_dombrowski = '-cat -gtr -x 10 -1 -dgam 4'
    setting_to_use = settings_dombrowski
    msa_in_plp = '%s/%s.phylip' % (op_dir, msa_in_base)
    cmd_txt = '%s/%s_cmds.txt' % (op_dir, msa_in_base)
    mpirun_prefix = 'export OMPI_MCA_btl=^openib; mpirun -np %s' % num_of_threads

    ####################################################################################################################

    # create output dir (the path goes to Python directly, never through a shell)
    if os.path.isdir(op_dir) is True:
        if force_overwrite is True:
            shutil.rmtree(op_dir)
        else:
            print('output folder already exist, program exited!')
            exit()
    os.makedirs(op_dir)

    # fa_to_phylip
    msa_to_use = msa_in
    if fa_to_plp is True:
        fa2phy(msa_in, msa_in_plp)
        msa_to_use = msa_in_plp

    # a single chain is written directly into op_dir, several chains get one sub-folder each
    chain_name_list = []
    if num_of_chains == 1:
        chain_name_list.append('%s/%s' % (op_dir, op_prefix))
    else:
        for chain_index in range(1, (num_of_chains + 1)):
            current_wd = '%s/%s_chain%s' % (op_dir, op_prefix, chain_index)
            os.mkdir(current_wd)
            chain_name_list.append('%s/%s_chain%s' % (current_wd, op_prefix, chain_index))

    # write out commands
    with open(cmd_txt, 'w') as cmd_txt_handle:
        cmd_txt_handle.write('# To run pb_mpi\n')
        for each_chain in chain_name_list:
            cmd_txt_handle.write('%s pb_mpi -d %s %s -s %s\n' % (mpirun_prefix, msa_to_use, setting_to_use, each_chain))

        cmd_txt_handle.write('\n# To restart a terminated run (e.g., due to walltime limitation)\n')
        for each_chain in chain_name_list:
            cmd_txt_handle.write('%s pb_mpi %s\n' % (mpirun_prefix, each_chain))

        # assess the results (comparing chains needs at least two of them)
        if num_of_chains > 1:
            all_chains = ' '.join(chain_name_list)
            bpcomp_cmd = 'export OMPI_MCA_btl=^openib; bpcomp -x 1000 10 %s' % all_chains
            tracecomp_cmd = 'export OMPI_MCA_btl=^openib; tracecomp -x 1000 %s' % all_chains
            cmd_txt_handle.write('\n# You may want to use the following commands to assess the results:\n')
            cmd_txt_handle.write(bpcomp_cmd + '\n')
            cmd_txt_handle.write(tracecomp_cmd + '\n')

    print('Commands exported to %s' % cmd_txt)
    print('Done!')
142
+
143
+
144
# Command-line entry point: parse the options and pass them to PB() as a dict
# keyed by option name.
if __name__ == '__main__':

    PB_parser = argparse.ArgumentParser()
    PB_parser.add_argument('-i', required=True, help='input MSA file')
    PB_parser.add_argument('-o', required=True, help='output directory')
    PB_parser.add_argument('-p', required=True, help='output prefix')
    PB_parser.add_argument('-fa2plp', required=False, action="store_true", help='convert MSA format from fasta to phylip')
    PB_parser.add_argument('-n', required=False, type=int, default=4, help='number of chains, default: 4')
    PB_parser.add_argument('-t', required=False, type=int, default=48, help='num of cores per mpirun, default: 48')
    PB_parser.add_argument('-f', required=False, action="store_true", help='force overwrite')
    args = vars(PB_parser.parse_args())
    PB(args)