treesak 1.53.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (131)
  1. TreeSAK/ALE.py +63 -0
  2. TreeSAK/ALE1.py +268 -0
  3. TreeSAK/ALE2.py +168 -0
  4. TreeSAK/ALE2RTC.py +30 -0
  5. TreeSAK/ALE3.py +205 -0
  6. TreeSAK/ALE4.py +636 -0
  7. TreeSAK/ALE5.py +210 -0
  8. TreeSAK/ALE6.py +401 -0
  9. TreeSAK/ALE7.py +126 -0
  10. TreeSAK/ALE_backup.py +1081 -0
  11. TreeSAK/AssessCVG.py +128 -0
  12. TreeSAK/AssessMarker.py +306 -0
  13. TreeSAK/AssessMarkerDeltaLL.py +257 -0
  14. TreeSAK/AssessMarkerPA.py +317 -0
  15. TreeSAK/AssessPB.py +113 -0
  16. TreeSAK/BMGE.jar +0 -0
  17. TreeSAK/BMGE.py +49 -0
  18. TreeSAK/C60SR4.nex +127 -0
  19. TreeSAK/CompareMCMC.py +138 -0
  20. TreeSAK/ConcateMSA.py +111 -0
  21. TreeSAK/ConvertMSA.py +135 -0
  22. TreeSAK/Dir.rb +82 -0
  23. TreeSAK/ExtractMarkerSeq.py +263 -0
  24. TreeSAK/FastRoot.py +1175 -0
  25. TreeSAK/FastRoot_backup.py +1122 -0
  26. TreeSAK/FigTree.py +34 -0
  27. TreeSAK/GTDB_tree.py +76 -0
  28. TreeSAK/GeneTree.py +142 -0
  29. TreeSAK/KEGG_Luo17.py +807 -0
  30. TreeSAK/LcaToLeaves.py +66 -0
  31. TreeSAK/MarkerRef2Tree.py +616 -0
  32. TreeSAK/MarkerRef2Tree_backup.py +628 -0
  33. TreeSAK/MarkerSeq2Tree.py +299 -0
  34. TreeSAK/MarkerSeq2Tree_backup.py +259 -0
  35. TreeSAK/ModifyTopo.py +116 -0
  36. TreeSAK/Newick_tree_plotter.py +79 -0
  37. TreeSAK/OMA.py +170 -0
  38. TreeSAK/OMA2.py +212 -0
  39. TreeSAK/OneLineAln.py +50 -0
  40. TreeSAK/PB.py +155 -0
  41. TreeSAK/PMSF.py +115 -0
  42. TreeSAK/PhyloBiAssoc.R +84 -0
  43. TreeSAK/PhyloBiAssoc.py +167 -0
  44. TreeSAK/PlotMCMC.py +41 -0
  45. TreeSAK/PlotMcmcNode.py +152 -0
  46. TreeSAK/PlotMcmcNode_old.py +252 -0
  47. TreeSAK/RootTree.py +101 -0
  48. TreeSAK/RootTreeGTDB.py +371 -0
  49. TreeSAK/RootTreeGTDB214.py +288 -0
  50. TreeSAK/RootTreeGTDB220.py +300 -0
  51. TreeSAK/SequentialDating.py +16 -0
  52. TreeSAK/SingleAleHGT.py +157 -0
  53. TreeSAK/SingleLinePhy.py +50 -0
  54. TreeSAK/SliceMSA.py +142 -0
  55. TreeSAK/SplitScore.py +21 -0
  56. TreeSAK/SplitScore1.py +177 -0
  57. TreeSAK/SplitScore1OMA.py +148 -0
  58. TreeSAK/SplitScore2.py +608 -0
  59. TreeSAK/TaxaCountStats.R +256 -0
  60. TreeSAK/TaxonTree.py +47 -0
  61. TreeSAK/TreeSAK_config.py +32 -0
  62. TreeSAK/VERSION +164 -0
  63. TreeSAK/VisHPD95.R +45 -0
  64. TreeSAK/VisHPD95.py +200 -0
  65. TreeSAK/__init__.py +0 -0
  66. TreeSAK/ale_parser.py +74 -0
  67. TreeSAK/ale_splitter.py +63 -0
  68. TreeSAK/alignment_pruner.pl +1471 -0
  69. TreeSAK/assessOG.py +45 -0
  70. TreeSAK/batch_itol.py +171 -0
  71. TreeSAK/catfasta2phy.py +140 -0
  72. TreeSAK/cogTree.py +185 -0
  73. TreeSAK/compare_trees.R +30 -0
  74. TreeSAK/compare_trees.py +255 -0
  75. TreeSAK/dating.py +264 -0
  76. TreeSAK/dating_ss.py +361 -0
  77. TreeSAK/deltall.py +82 -0
  78. TreeSAK/do_rrtc.rb +464 -0
  79. TreeSAK/fa2phy.py +42 -0
  80. TreeSAK/filter_rename_ar53.py +118 -0
  81. TreeSAK/format_leaf_name.py +70 -0
  82. TreeSAK/gap_stats.py +38 -0
  83. TreeSAK/get_SCG_tree.py +742 -0
  84. TreeSAK/get_arCOG_seq.py +97 -0
  85. TreeSAK/global_functions.py +222 -0
  86. TreeSAK/gnm_leaves.py +43 -0
  87. TreeSAK/iTOL.py +791 -0
  88. TreeSAK/iTOL_gene_tree.py +80 -0
  89. TreeSAK/itol_msa_stats.py +56 -0
  90. TreeSAK/keep_highest_rrtc.py +37 -0
  91. TreeSAK/koTree.py +194 -0
  92. TreeSAK/label_gene_tree_by_gnm.py +34 -0
  93. TreeSAK/label_tree.R +75 -0
  94. TreeSAK/label_tree.py +121 -0
  95. TreeSAK/mad.py +708 -0
  96. TreeSAK/mcmc2tree.py +58 -0
  97. TreeSAK/mcmcTC copy.py +92 -0
  98. TreeSAK/mcmcTC.py +104 -0
  99. TreeSAK/mcmctree_vs_reltime.R +44 -0
  100. TreeSAK/mcmctree_vs_reltime.py +252 -0
  101. TreeSAK/merge_pdf.py +32 -0
  102. TreeSAK/pRTC.py +56 -0
  103. TreeSAK/parse_mcmctree.py +198 -0
  104. TreeSAK/parse_reltime.py +141 -0
  105. TreeSAK/phy2fa.py +37 -0
  106. TreeSAK/plot_distruibution_th.py +165 -0
  107. TreeSAK/prep_mcmctree_ctl.py +92 -0
  108. TreeSAK/print_leaves.py +32 -0
  109. TreeSAK/pruneMSA.py +63 -0
  110. TreeSAK/recode.py +73 -0
  111. TreeSAK/remove_bias.R +112 -0
  112. TreeSAK/rename_leaves.py +78 -0
  113. TreeSAK/replace_clade.py +55 -0
  114. TreeSAK/root_with_out_group.py +84 -0
  115. TreeSAK/run_TaxaCountStats_R_s1.py +455 -0
  116. TreeSAK/subsample_drep_gnms.py +74 -0
  117. TreeSAK/subset.py +69 -0
  118. TreeSAK/subset_tree_stupid_old_way.py +193 -0
  119. TreeSAK/supertree.py +330 -0
  120. TreeSAK/tmp_1.py +19 -0
  121. TreeSAK/tmp_2.py +19 -0
  122. TreeSAK/tmp_3.py +120 -0
  123. TreeSAK/tmp_4.py +43 -0
  124. TreeSAK/tmp_5.py +12 -0
  125. TreeSAK/weighted_rand.rb +23 -0
  126. treesak-1.53.3.data/scripts/TreeSAK +955 -0
  127. treesak-1.53.3.dist-info/LICENSE +674 -0
  128. treesak-1.53.3.dist-info/METADATA +27 -0
  129. treesak-1.53.3.dist-info/RECORD +131 -0
  130. treesak-1.53.3.dist-info/WHEEL +5 -0
  131. treesak-1.53.3.dist-info/top_level.txt +1 -0
@@ -0,0 +1,97 @@
1
+ import os
2
+ import argparse
3
+ from Bio import SeqIO
4
+
5
+
6
# Usage text for the get_arCOG_seq subcommand. The id-file flag is "-i": that is
# the option the argument parser in this file defines and the key ('i') that
# get_arCOG_seq() reads from its args dict.
get_arCOG_seq_usage = '''
=========================== get_arCOG_seq example commands ===========================

TreeSAK get_arCOG_seq -i cog_id.txt -db_dir /Users/songweizhi/DB/arCOG18 -o op_dir

# required db files
ar18.ar14.02.csv, arCOG_names_220807.txt and ar18.fa

======================================================================================
'''
16
+
17
+
18
def get_arCOG_seq(args):
    """Write the sequences of the requested arCOGs to one FASTA file per arCOG.

    Keys read from ``args``:
        i       text file with one arCOG id per line; an 'ArCOG' prefix is
                rewritten to 'arCOG' and blank lines are ignored
        db_dir  folder containing ar18.ar14.02.csv, arCOG_names_220807.txt
                and ar18.fa
        o       output folder; receives <arCOG id>.fa files and metadata.txt
        f       if True, an existing output folder is deleted and recreated

    A sequence is written only when it is assigned to exactly one of the
    requested arCOGs; sequences assigned to several of them are skipped.
    metadata.txt lists every requested id followed by its description fields
    (left empty when the id has no entry in arCOG_names_220807.txt).

    Exits (after printing a message) if the output folder already exists and
    ``f`` is not True.
    """
    # local imports: this module does not import shutil/sys at file level
    import shutil
    import sys

    cog_id_txt       = args['i']
    db_dir           = args['db_dir']
    op_dir           = args['o']
    force_create_dir = args['f']

    ar18_ar14_02_csv = '%s/ar18.ar14.02.csv'       % db_dir
    cog_des_txt      = '%s/arCOG_names_220807.txt' % db_dir
    ar18_fa          = '%s/ar18.fa'                % db_dir
    cog_metadata_txt = '%s/metadata.txt'           % op_dir

    # prepare the output folder without going through a shell, so that paths
    # containing spaces or shell metacharacters are handled literally
    if os.path.isdir(op_dir):
        if force_create_dir is True:
            shutil.rmtree(op_dir)
        else:
            print('Output folder already exist, program exited!')
            sys.exit()
    os.mkdir(op_dir)

    # arCOG id -> list of description fields
    cog_des_dict = dict()
    with open(cog_des_txt, encoding="ISO-8859-1") as cog_des_handle:
        for each_cog in cog_des_handle:
            each_cog_split = each_cog.strip().split('\t')
            cog_des_dict[each_cog_split[0]] = each_cog_split[1:]

    # requested arCOG ids
    cog_id_set = set()
    with open(cog_id_txt) as cog_id_handle:
        for each_cog in cog_id_handle:
            cog_id = each_cog.strip().replace('ArCOG', 'arCOG')
            if cog_id:
                cog_id_set.add(cog_id)

    # sequence id (3rd csv column) -> requested arCOGs (7th csv column)
    seq_to_arcog_dict = dict()
    with open(ar18_ar14_02_csv) as csv_handle:
        for each_line in csv_handle:
            each_line_split = each_line.strip().split(',')
            if len(each_line_split) < 7:
                # blank or truncated line: carries no arCOG assignment
                continue
            arcog_id = each_line_split[6]
            seq_id = each_line_split[2]
            if arcog_id in cog_id_set:
                seq_to_arcog_dict.setdefault(seq_id, set()).add(arcog_id)

    # write out sequence by arCOG
    for each_seq in SeqIO.parse(ar18_fa, 'fasta'):
        seq_cog_set = seq_to_arcog_dict.get(each_seq.id)
        if seq_cog_set is None or len(seq_cog_set) != 1:
            continue
        (arcog_id,) = seq_cog_set
        pwd_fa = '%s/%s.fa' % (op_dir, arcog_id)
        # append mode: each arCOG file collects many sequences over the loop
        with open(pwd_fa, 'a') as pwd_fa_handle:
            pwd_fa_handle.write('>%s\n' % each_seq.id)
            pwd_fa_handle.write('%s\n' % str(each_seq.seq))

    # write out metadata
    with open(cog_metadata_txt, 'w') as cog_metadata_txt_handle:
        for each_c in sorted(cog_id_set):
            each_c_desc = '\t'.join(cog_des_dict.get(each_c, []))
            cog_metadata_txt_handle.write('%s\t%s\n' % (each_c, each_c_desc))
86
+
87
+
88
if __name__ == '__main__':

    # command-line entry point: each option name becomes the key of the same
    # name in the dict handed to get_arCOG_seq()
    parser = argparse.ArgumentParser()
    parser.add_argument('-i',       required=True,                       help='arCOG id file, one id per line')
    parser.add_argument('-db_dir',  required=True,                       help='database folder')
    parser.add_argument('-o',       required=True,                       help='output folder')
    parser.add_argument('-f',       required=False, action="store_true", help='force overwrite existing output folder')
    args = vars(parser.parse_args())
    get_arCOG_seq(args)
@@ -0,0 +1,222 @@
1
+ import os
2
+ import glob
3
+ import shutil
4
+ from Bio import SeqIO
5
+ from Bio import AlignIO
6
+ import matplotlib as mpl
7
+ mpl.use('Agg')
8
+ import matplotlib.pyplot as plt
9
+
10
+
11
# strftime-style pattern producing e.g. "[2024-01-31 13:05:09] " (note the trailing
# space); presumably used to prefix log messages - not referenced in the visible code
time_format = '[%Y-%m-%d %H:%M:%S] '
12
+
13
+
14
def is_number(s):
    """Return True if ``s`` can be converted with ``float()``, otherwise False.

    Anything float() accepts counts as a number (ints, floats, numeric
    strings, including 'nan' and 'inf'). Values float() rejects because of
    their type (None, lists, ...) give False instead of raising TypeError.
    """
    try:
        float(s)
    except (TypeError, ValueError):
        return False
    return True
20
+
21
+
22
def force_create_folder(folder_to_create):
    """Create ``folder_to_create`` as an empty folder, deleting any existing one.

    Removal is attempted up to four times with errors ignored; if the folder
    still exists afterwards, os.mkdir raises FileExistsError.
    """
    max_removal_attempts = 4
    for _ in range(max_removal_attempts):
        # re-check before every attempt: an earlier rmtree may have failed silently
        if os.path.isdir(folder_to_create):
            shutil.rmtree(folder_to_create, ignore_errors=True)

    os.mkdir(folder_to_create)
33
+
34
+
35
def sep_path_basename_ext(file_in):
    """Split ``file_in`` into a (folder, base name, extension) tuple.

    The extension keeps its leading dot (only the last one is split off);
    a path without a folder part reports '.' as its folder.
    """
    folder, name_with_ext = os.path.split(file_in)
    stem, suffix = os.path.splitext(name_with_ext)

    # a bare file name has no folder component: use the current directory
    return ('.' if folder == '' else folder), stem, suffix
46
+
47
+
48
def get_no_hidden_folder_list(wd):
    """Return the names of the entries in ``wd`` that do not start with '.'.

    Names come back in os.listdir order. Entries are not checked to be
    folders, so regular files are included as well.
    """
    return [entry for entry in os.listdir(wd) if not entry.startswith('.')]
54
+
55
+
56
def unique_list_elements(list_input):
    """Return the elements of ``list_input`` without duplicates.

    The first occurrence of every element is kept and the original order is
    preserved. Hashable elements are de-duplicated in linear time; if any
    element is unhashable (e.g. a list) the function falls back to pairwise
    comparison.
    """
    # materialise first so a one-shot iterator can still be re-read by the fallback
    elements = list(list_input)

    try:
        # dict keys are unique and keep insertion order
        return list(dict.fromkeys(elements))
    except TypeError:
        list_output = []
        for each_element in elements:
            if each_element not in list_output:
                list_output.append(each_element)
        return list_output
64
+
65
+
66
def ctg_depth_and_gbk_to_gene_depth(ctg_depth_file, gbk_file, skip_depth_file_header, gene_depth_file_folder):
    """Write a gene depth table in which every CDS gets the depth of its contig.

    ctg_depth_file          tab-separated file: contig id, depth (parsed as float)
    gbk_file                GenBank file; record ids are looked up in the depth
                            table and CDS features must carry a 'locus_tag'
    skip_depth_file_header  if True, the first line of ctg_depth_file is ignored
    gene_depth_file_folder  folder receiving '<gbk base name>.depth'

    The output has a 'Gene<TAB>Depth' header followed by one line per CDS.
    Raises KeyError if a GenBank record id is missing from the depth file or
    a CDS feature has no locus_tag.
    """
    _, gbk_file_basename, _ = sep_path_basename_ext(gbk_file)
    pwd_depth_file = '%s/%s.depth' % (gene_depth_file_folder, gbk_file_basename)

    # read in depth
    ctg_depth_dict = {}
    with open(ctg_depth_file) as ctg_depth_handle:
        for line_index, ctg in enumerate(ctg_depth_handle):
            if skip_depth_file_header is True and line_index == 0:
                continue
            ctg_stripped = ctg.strip()
            if not ctg_stripped:
                # tolerate blank lines (e.g. a trailing newline at end of file)
                continue
            ctg_split = ctg_stripped.split('\t')
            ctg_depth_dict[ctg_split[0]] = float(ctg_split[1])

    # get gene depth
    with open(pwd_depth_file, 'w') as gene_depth_file_handle:
        gene_depth_file_handle.write('Gene\tDepth\n')
        for seq_record in SeqIO.parse(gbk_file, 'genbank'):
            seq_depth = ctg_depth_dict[seq_record.id]
            for feature in seq_record.features:
                if feature.type == 'CDS':
                    gene_id = feature.qualifiers['locus_tag'][0]
                    gene_depth_file_handle.write('%s\t%s\n' % (gene_id, seq_depth))
101
+
102
+
103
def barh_plotter(num_list, label_list, query_seq_num, query_ko_NA, fig_width, fig_height, plot_file):
    """Save a horizontal bar chart of ``num_list`` to ``plot_file`` (300 dpi).

    num_list      bar lengths, first value drawn at the top
    label_list    one label per bar, shown on the right-hand axis
    query_seq_num, query_ko_NA
                  only inserted into the plot title
    fig_width, fig_height
                  figure size in inches
    plot_file     output path passed to savefig
    """
    fig, ax = plt.subplots()
    fig.set_size_inches(fig_width, fig_height)

    y_pos = range(len(num_list))
    ax.barh(y_pos, num_list, height=0.8, align='center', alpha=0.2, linewidth=0)
    ax.set_yticks([])   # not show yticks
    ax.invert_xaxis()   # line up bar on right
    ax.invert_yaxis()   # put first number on top
    ax.axis('tight')    # remove extra spaces at the top and bottom, equal to: ax.margins(0, 0)
    # ax.margins(0, 0.01) # customize space percentage

    ax.set_xlabel('Number of gene')
    ax.set_title('Query genes number: %s, genes without KO: %s' % (query_seq_num, query_ko_NA))

    # second y axis sharing the same limits, used only to carry the bar labels
    ax2 = ax.twinx()
    ax2.set_ylim(ax.get_ylim())
    ax2.set_yticks(y_pos)
    ax2.set_yticklabels(label_list)

    fig.tight_layout()
    fig.savefig(plot_file, dpi=300)

    # Close exactly this figure. The former plt.close() followed by plt.clf()
    # made pyplot create a fresh empty figure (or clear an unrelated current
    # one), because clf() operates on whatever gcf() returns after the close.
    plt.close(fig)
128
+
129
+
130
def AnnotateNorm(file_in, skip_header, value_column, Divisor_value, file_out, file_out_header):
    """Rewrite one column of a tab-separated file as a percentage of ``Divisor_value``.

    file_in          tab-separated input file
    skip_header      if true, the first line of file_in is dropped
    value_column     1-based index of the column to convert
    Divisor_value    value that corresponds to 100 percent
    file_out         output file
    file_out_header  text written verbatim at the top of file_out (must bring
                     its own trailing newline)

    Every data line is copied with the selected column replaced by
    value * 100 / Divisor_value, rounded to two decimals. Blank lines are
    skipped. Raises ValueError for non-numeric values, IndexError for data
    lines with too few columns and ZeroDivisionError if Divisor_value is 0.
    """
    value_index = value_column - 1

    with open(file_in) as file_in_handle, open(file_out, 'w') as file_out_handle:
        file_out_handle.write(file_out_header)

        for line_num, each_line in enumerate(file_in_handle):

            # decide about the header before touching its columns: a header
            # may have fewer fields than the data lines
            if skip_header and line_num == 0:
                continue

            each_line_split = each_line.strip().split('\t')
            if each_line_split == ['']:
                continue

            value_pct = float(each_line_split[value_index]) * 100 / Divisor_value
            # format to two decimals, then back through float to drop padding zeros
            each_line_split[value_index] = str(float("{0:.2f}".format(value_pct)))
            file_out_handle.write('%s\n' % '\t'.join(each_line_split))
148
+
149
+
150
def get_gene_list_TotalDepth(gene_list, gene_to_depth_dict):
    """Return the summed depth of all genes in ``gene_list``.

    Depths are looked up in ``gene_to_depth_dict`` and added in list order,
    starting from 0; a gene missing from the dict raises KeyError.
    """
    depth_values = (gene_to_depth_dict[gene_id] for gene_id in gene_list)

    # plain left-to-right accumulation, kept explicit so the floating point
    # result does not depend on how the built-in sum() adds floats
    accumulated_depth = 0
    for depth_value in depth_values:
        accumulated_depth = accumulated_depth + depth_value

    return accumulated_depth
158
+
159
+
160
def catfasta2phy(msa_dir, msa_ext, concatenated_msa_phy, partition_file):
    """Concatenate all FASTA alignments of a folder into one PHYLIP alignment.

    msa_dir               folder containing the alignments
    msa_ext               file extension (without dot) of the alignments
    concatenated_msa_phy  output alignment in relaxed PHYLIP format; the
                          intermediate FASTA is written to '<this path>.fasta'
    partition_file        output file with one '<file name> = <start>-<end>'
                          line per alignment (1-based, inclusive)

    Alignments are joined in sorted file-name order. A sequence id missing
    from an alignment is padded with '-' over that alignment's length.
    Exits (after printing a message) if the sequences of an alignment do not
    all have the same length.
    """
    # local import: this module does not import sys at file level
    import sys

    concatenated_msa_fasta = '%s.fasta' % concatenated_msa_phy
    msa_file_re            = '%s/*.%s' % (msa_dir, msa_ext)
    msa_file_list_sorted   = sorted(os.path.basename(file_name) for file_name in glob.glob(msa_file_re))

    # read every alignment once; everything is keyed by file name, which is
    # unique within a folder (a derived gene id is not guaranteed to be)
    complete_gnm_set = set()
    msa_seq_dict = dict()   # file name -> {sequence id: aligned sequence}
    msa_len_dict = dict()   # file name -> alignment length
    for each_msa_file in msa_file_list_sorted:

        pwd_current_msa = '%s/%s' % (msa_dir, each_msa_file)
        current_msa_seq_dict = dict()
        current_msa_len_set = set()
        for each_seq in SeqIO.parse(pwd_current_msa, 'fasta'):
            complete_gnm_set.add(each_seq.id)
            current_msa_seq_dict[each_seq.id] = str(each_seq.seq)
            current_msa_len_set.add(len(each_seq.seq))

        if len(current_msa_len_set) != 1:
            print('Sequences with different length were found in %s, program exited!' % each_msa_file)
            sys.exit()

        msa_seq_dict[each_msa_file] = current_msa_seq_dict
        msa_len_dict[each_msa_file] = current_msa_len_set.pop()

    complete_gnm_list_sorted = sorted(complete_gnm_set)

    # write out concatenated msa
    with open(concatenated_msa_fasta, 'w') as concatenated_msa_handle:
        for each_gnm in complete_gnm_list_sorted:
            concatenated_seq = ''.join(
                msa_seq_dict[each_m].get(each_gnm, msa_len_dict[each_m] * '-')
                for each_m in msa_file_list_sorted)
            concatenated_msa_handle.write('>%s\n' % each_gnm)
            concatenated_msa_handle.write('%s\n' % concatenated_seq)

    # write out partition file
    end_pos = 0
    with open(partition_file, 'w') as partition_file_handle:
        for each_m in msa_file_list_sorted:
            current_m_len = msa_len_dict[each_m]
            partition_file_handle.write('%s = %s-%s\n' % (each_m, (end_pos + 1), (end_pos + current_m_len)))
            end_pos += current_m_len

    # convert msa in fasta to phy
    AlignIO.convert(concatenated_msa_fasta, 'fasta', concatenated_msa_phy, 'phylip-relaxed')
222
+
TreeSAK/gnm_leaves.py ADDED
@@ -0,0 +1,43 @@
1
+ import os
2
+ import argparse
3
+ from ete3 import Tree
4
+
5
+
6
# example command line for the gnm_leaves subcommand; the optional -fmt flag
# accepted by the argument parser at the bottom of this file is not shown here
gnm_leaves_usage = '''
========== gnm_leaves example commands ==========

TreeSAK gnm_leaves -i input.tree -o output.tree

=================================================
'''
13
+
14
+
15
def gnm_leaves(args):
    """Rename every leaf of a tree by dropping the last '_'-separated field.

    Keys read from ``args``:
        i    input tree file
        o    output tree file
        fmt  ete3 tree format number, used for both reading and writing

    'genomeA_00012' becomes 'genomeA' (presumably gene id -> genome id, going
    by the function name). A leaf name without any '_' is kept as it is
    rather than being replaced by an empty string.

    Exits (after printing a message) if the input tree file does not exist.
    """
    # local import: this module does not import sys at file level
    import sys

    tree_file_in  = args['i']
    tree_file_out = args['o']
    tree_format   = args['fmt']

    if not os.path.isfile(tree_file_in):
        print('Tree file not found, program exited!')
        sys.exit()

    t = Tree(tree_file_in, format=tree_format)

    # iterating over the tree object visits its leaves
    for leaf in t:
        # rsplit returns the whole name when there is no '_' to split on
        leaf.name = leaf.name.rsplit('_', 1)[0]
    t.write(format=tree_format, outfile=tree_file_out)

    print('Done!')
34
+
35
+
36
if __name__ == '__main__':

    # stand-alone entry point: declare the options as a table, register them,
    # then pass the parsed values to gnm_leaves() as a plain dict
    gnm_leaves_parser = argparse.ArgumentParser()
    option_table = (
        ('-i',   dict(required=True, help='input tree')),
        ('-o',   dict(required=True, help='output tree')),
        ('-fmt', dict(required=False, default=1, type=int, help='tree format, default: 1')),
    )
    for option_flag, option_settings in option_table:
        gnm_leaves_parser.add_argument(option_flag, **option_settings)

    args = vars(gnm_leaves_parser.parse_args())
    gnm_leaves(args)