treesak 1.51.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of treesak might be problematic. Click here for more details.

Files changed (125) hide show
  1. TreeSAK/ALE.py +63 -0
  2. TreeSAK/ALE1.py +268 -0
  3. TreeSAK/ALE2.py +168 -0
  4. TreeSAK/ALE2RTC.py +30 -0
  5. TreeSAK/ALE3.py +205 -0
  6. TreeSAK/ALE4.py +636 -0
  7. TreeSAK/ALE5.py +210 -0
  8. TreeSAK/ALE6.py +401 -0
  9. TreeSAK/ALE7.py +126 -0
  10. TreeSAK/ALE_backup.py +1081 -0
  11. TreeSAK/AssessCVG.py +128 -0
  12. TreeSAK/AssessMarker.py +306 -0
  13. TreeSAK/AssessMarkerDeltaLL.py +257 -0
  14. TreeSAK/AssessMarkerPA.py +317 -0
  15. TreeSAK/AssessPB.py +130 -0
  16. TreeSAK/BMGE.jar +0 -0
  17. TreeSAK/BMGE.py +49 -0
  18. TreeSAK/CompareMCMC.py +138 -0
  19. TreeSAK/ConcateMSA.py +111 -0
  20. TreeSAK/ConvertMSA.py +135 -0
  21. TreeSAK/Dir.rb +82 -0
  22. TreeSAK/ExtractMarkerSeq.py +263 -0
  23. TreeSAK/FastRoot.py +1175 -0
  24. TreeSAK/FastRoot_backup.py +1122 -0
  25. TreeSAK/FigTree.py +34 -0
  26. TreeSAK/GTDB_tree.py +76 -0
  27. TreeSAK/GeneTree.py +142 -0
  28. TreeSAK/KEGG_Luo17.py +807 -0
  29. TreeSAK/LcaToLeaves.py +66 -0
  30. TreeSAK/MarkerRef2Tree.py +616 -0
  31. TreeSAK/MarkerRef2Tree_backup.py +628 -0
  32. TreeSAK/MarkerSeq2Tree.py +290 -0
  33. TreeSAK/MarkerSeq2Tree_backup.py +259 -0
  34. TreeSAK/ModifyTopo.py +116 -0
  35. TreeSAK/Newick_tree_plotter.py +79 -0
  36. TreeSAK/OMA.py +170 -0
  37. TreeSAK/OMA2.py +212 -0
  38. TreeSAK/OneLineAln.py +50 -0
  39. TreeSAK/PB.py +155 -0
  40. TreeSAK/PMSF.py +106 -0
  41. TreeSAK/PhyloBiAssoc.R +84 -0
  42. TreeSAK/PhyloBiAssoc.py +167 -0
  43. TreeSAK/PlotMCMC.py +41 -0
  44. TreeSAK/PlotMcmcNode.py +152 -0
  45. TreeSAK/PlotMcmcNode_old.py +252 -0
  46. TreeSAK/RootTree.py +101 -0
  47. TreeSAK/RootTreeGTDB214.py +288 -0
  48. TreeSAK/RootTreeGTDB220.py +300 -0
  49. TreeSAK/RootTreeGTDB226.py +300 -0
  50. TreeSAK/SequentialDating.py +16 -0
  51. TreeSAK/SingleAleHGT.py +157 -0
  52. TreeSAK/SingleLinePhy.py +50 -0
  53. TreeSAK/SliceMSA.py +142 -0
  54. TreeSAK/SplitScore.py +19 -0
  55. TreeSAK/SplitScore1.py +178 -0
  56. TreeSAK/SplitScore1OMA.py +148 -0
  57. TreeSAK/SplitScore2.py +597 -0
  58. TreeSAK/TaxaCountStats.R +256 -0
  59. TreeSAK/TaxonTree.py +47 -0
  60. TreeSAK/TreeSAK_config.py +32 -0
  61. TreeSAK/VERSION +158 -0
  62. TreeSAK/VisHPD95.R +45 -0
  63. TreeSAK/VisHPD95.py +200 -0
  64. TreeSAK/__init__.py +0 -0
  65. TreeSAK/ale_parser.py +74 -0
  66. TreeSAK/ale_splitter.py +63 -0
  67. TreeSAK/alignment_pruner.pl +1471 -0
  68. TreeSAK/assessOG.py +45 -0
  69. TreeSAK/catfasta2phy.py +140 -0
  70. TreeSAK/cogTree.py +185 -0
  71. TreeSAK/compare_trees.R +30 -0
  72. TreeSAK/compare_trees.py +255 -0
  73. TreeSAK/dating.py +264 -0
  74. TreeSAK/dating_ss.py +361 -0
  75. TreeSAK/deltall.py +82 -0
  76. TreeSAK/do_rrtc.rb +464 -0
  77. TreeSAK/fa2phy.py +42 -0
  78. TreeSAK/format_leaf_name.py +70 -0
  79. TreeSAK/gap_stats.py +38 -0
  80. TreeSAK/get_SCG_tree.py +742 -0
  81. TreeSAK/get_arCOG_seq.py +97 -0
  82. TreeSAK/global_functions.py +222 -0
  83. TreeSAK/gnm_leaves.py +43 -0
  84. TreeSAK/iTOL.py +791 -0
  85. TreeSAK/iTOL_gene_tree.py +80 -0
  86. TreeSAK/itol_msa_stats.py +56 -0
  87. TreeSAK/keep_highest_rrtc.py +37 -0
  88. TreeSAK/koTree.py +194 -0
  89. TreeSAK/label_tree.R +75 -0
  90. TreeSAK/label_tree.py +121 -0
  91. TreeSAK/mad.py +708 -0
  92. TreeSAK/mcmc2tree.py +58 -0
  93. TreeSAK/mcmcTC copy.py +92 -0
  94. TreeSAK/mcmcTC.py +104 -0
  95. TreeSAK/mcmctree_vs_reltime.R +44 -0
  96. TreeSAK/mcmctree_vs_reltime.py +252 -0
  97. TreeSAK/merge_pdf.py +32 -0
  98. TreeSAK/pRTC.py +56 -0
  99. TreeSAK/parse_mcmctree.py +198 -0
  100. TreeSAK/parse_reltime.py +141 -0
  101. TreeSAK/phy2fa.py +37 -0
  102. TreeSAK/plot_distruibution_th.py +165 -0
  103. TreeSAK/prep_mcmctree_ctl.py +92 -0
  104. TreeSAK/print_leaves.py +32 -0
  105. TreeSAK/pruneMSA.py +63 -0
  106. TreeSAK/recode.py +73 -0
  107. TreeSAK/remove_bias.R +112 -0
  108. TreeSAK/rename_leaves.py +77 -0
  109. TreeSAK/replace_clade.py +55 -0
  110. TreeSAK/root_with_out_group.py +84 -0
  111. TreeSAK/run_TaxaCountStats_R_s1.py +455 -0
  112. TreeSAK/subsample_drep_gnms.py +74 -0
  113. TreeSAK/subset.py +69 -0
  114. TreeSAK/subset_tree_stupid_old_way.py +193 -0
  115. TreeSAK/supertree.py +330 -0
  116. TreeSAK/tmp_1.py +19 -0
  117. TreeSAK/tmp_2.py +19 -0
  118. TreeSAK/tmp_3.py +120 -0
  119. TreeSAK/weighted_rand.rb +23 -0
  120. treesak-1.51.2.data/scripts/TreeSAK +950 -0
  121. treesak-1.51.2.dist-info/LICENSE +674 -0
  122. treesak-1.51.2.dist-info/METADATA +27 -0
  123. treesak-1.51.2.dist-info/RECORD +125 -0
  124. treesak-1.51.2.dist-info/WHEEL +5 -0
  125. treesak-1.51.2.dist-info/top_level.txt +1 -0
TreeSAK/assessOG.py ADDED
@@ -0,0 +1,45 @@
1
+ import os
2
+ import glob
3
+ from Bio import SeqIO
4
+
5
+
6
def sep_path_basename_ext(file_in):
    """Split a file path into file name, directory, basename and extension.

    Returns a 4-tuple ``(file name, directory, basename, extension)``. The
    directory is '.' when file_in has no directory part, and the extension is
    returned without its leading dot.
    """
    directory, name_with_ext = os.path.split(file_in)
    directory = '.' if directory == '' else directory
    stem, dotted_ext = os.path.splitext(name_with_ext)
    return name_with_ext, directory, stem, dotted_ext[1:]
14
+
15
+
16
def get_gnm_og_cov(og_dir, og_ext, og_cov_txt):
    """Write, for each genome, the percentage of orthologous groups it occurs in.

    Every file matching ``<og_dir>/*.<og_ext>`` is parsed as FASTA and counted
    as one orthologous group (OG) named after the file's basename. The genome
    id of a sequence is its id with the last '_'-separated field removed.

    One tab-separated line ``<genome id> <coverage>`` is written to og_cov_txt
    per genome, sorted by genome id, where coverage is
    100 * (number of OGs containing the genome) / (number of OG files),
    rounded to two decimals.
    """
    og_file_list = glob.glob('%s/*.%s' % (og_dir, og_ext))

    # genome id -> set of OG ids with at least one sequence from that genome
    gnm_to_og_dict = dict()
    for og_file in og_file_list:
        _, _, og_id, _ = sep_path_basename_ext(og_file)
        for each_seq in SeqIO.parse(og_file, 'fasta'):
            gnm_id = '_'.join(each_seq.id.split('_')[:-1])
            gnm_to_og_dict.setdefault(gnm_id, set()).add(og_id)

    # the context manager closes the report even if a write fails
    with open(og_cov_txt, 'w') as og_cov_txt_handle:
        for each_gnm in sorted(gnm_to_og_dict):
            og_cov = len(gnm_to_og_dict[each_gnm]) * 100 / len(og_file_list)
            og_cov = float("{0:.2f}".format(og_cov))
            og_cov_txt_handle.write('%s\t%s\n' % (each_gnm, og_cov))
38
+
39
+
40
# Example inputs used when this file is executed directly as a script.
og_dir = '/Users/songweizhi/Desktop/OrthologousGroupsFasta_cov95'
og_ext = 'fa'
og_cov_txt = '/Users/songweizhi/Desktop/gnm_og_cov.txt'

# Guarded so that importing this module does not try to write the report to
# the hard-coded path above.
if __name__ == '__main__':
    get_gnm_og_cov(og_dir, og_ext, og_cov_txt)
45
+
TreeSAK/catfasta2phy.py ADDED
@@ -0,0 +1,140 @@
1
+ import os
2
+ import glob
3
+ from Bio import SeqIO
4
+ from Bio import AlignIO
5
+
6
+
7
def catfasta2phy(msa_dir, msa_ext, concatenated_msa_phy, partition_file):
    """Concatenate per-gene FASTA alignments into one relaxed-PHYLIP alignment.

    Every file matching ``<msa_dir>/*.<msa_ext>`` is read as a FASTA alignment.
    Alignments are concatenated in sorted file-name order; a sequence id that
    is missing from an alignment is filled with '-' over that alignment's
    length.

    Outputs:
      * ``<concatenated_msa_phy>.fasta``: the concatenated alignment in FASTA
      * concatenated_msa_phy: the same alignment converted to 'phylip-relaxed'
      * partition_file: one line ``<file name> = <start>-<end>`` per input
        alignment, with 1-based inclusive column coordinates

    Raises SystemExit (status 1) when an alignment holds no sequence or holds
    sequences of different lengths.
    """
    concatenated_msa_fasta = '%s.fasta' % concatenated_msa_phy
    msa_file_re = '%s/*.%s' % (msa_dir, msa_ext)
    msa_file_list_sorted = sorted(os.path.basename(f) for f in glob.glob(msa_file_re))

    # parse every alignment exactly once
    msa_seq_dict = dict()   # file name -> {sequence id: aligned sequence}
    msa_len_dict = dict()   # file name -> alignment length
    complete_gnm_set = set()
    for each_msa_file in msa_file_list_sorted:
        pwd_current_msa = '%s/%s' % (msa_dir, each_msa_file)
        current_msa_seq_dict = dict()
        current_msa_len_set = set()
        for each_seq in SeqIO.parse(pwd_current_msa, 'fasta'):
            current_msa_seq_dict[each_seq.id] = str(each_seq.seq)
            current_msa_len_set.add(len(each_seq.seq))

        if not current_msa_len_set:
            print('No sequence found in %s, program exited!' % each_msa_file)
            raise SystemExit(1)
        if len(current_msa_len_set) != 1:
            print('Sequences with different length were found in %s, program exited!' % each_msa_file)
            raise SystemExit(1)

        msa_seq_dict[each_msa_file] = current_msa_seq_dict
        msa_len_dict[each_msa_file] = current_msa_len_set.pop()
        complete_gnm_set.update(current_msa_seq_dict)

    complete_gnm_list_sorted = sorted(complete_gnm_set)

    # write out concatenated msa
    with open(concatenated_msa_fasta, 'w') as concatenated_msa_handle:
        for each_gnm in complete_gnm_list_sorted:
            seq_parts = [msa_seq_dict[each_m].get(each_gnm, msa_len_dict[each_m] * '-')
                         for each_m in msa_file_list_sorted]
            concatenated_msa_handle.write('>%s\n' % each_gnm)
            concatenated_msa_handle.write('%s\n' % ''.join(seq_parts))

    # write out partition file
    end_pos = 0
    with open(partition_file, 'w') as partition_file_handle:
        for each_m in msa_file_list_sorted:
            current_m_len = msa_len_dict[each_m]
            partition_file_handle.write('%s = %s-%s\n' % (each_m, (end_pos + 1), (end_pos + current_m_len)))
            end_pos += current_m_len

    # convert msa in fasta to phy
    AlignIO.convert(concatenated_msa_fasta, 'fasta', concatenated_msa_phy, 'phylip-relaxed')
69
+
70
+
71
# Example inputs/outputs for catfasta2phy; the call itself is kept disabled.
msa_ext = 'aln'
msa_dir = '/Users/songweizhi/Desktop/s06_identified_marker_aln_trimmed'
partition_file = '/Users/songweizhi/Desktop/s06_identified_marker_aln_trimmed_concatenated_partition.txt'
concatenated_msa_phy = '/Users/songweizhi/Desktop/s06_identified_marker_aln_trimmed_concatenated.phy'
# catfasta2phy(msa_dir, msa_ext, concatenated_msa_phy, partition_file)

# Example alignment (and a subset output path) for the column-slicing helpers.
msa_file_subset = '/Users/songweizhi/Desktop/PA_75_DeltaLL_75_concatenated_subset.phy'
msa_file = '/Users/songweizhi/Desktop/PA_75_DeltaLL_75_concatenated.phy'
81
+
82
+ from Bio import AlignIO
83
+
84
def slice_msa_by_col(msa_in, range_str, msa_out):
    """Write a column range of a relaxed-PHYLIP alignment to a new file.

    range_str is 'start-end' with 1-based, inclusive column numbers; the
    selected columns of msa_in are written to msa_out in 'phylip-relaxed'
    format.
    """
    msa = AlignIO.read(msa_in, 'phylip-relaxed')

    # convert the 1-based inclusive range to a 0-based half-open slice
    bounds = range_str.split('-')
    col_start = int(bounds[0]) - 1
    col_stop = int(bounds[1])

    AlignIO.write(msa[:, col_start:col_stop], msa_out, 'phylip-relaxed')
92
+
93
+
94
def slice_msa_by_col_manual(msa_in, range_str, msa_out):
    """Slice a relaxed-PHYLIP alignment by column and write it by hand.

    range_str is 'start-end' with 1-based, inclusive column numbers. The
    output starts with a '<number of sequences> <alignment length>' line,
    followed by one line per sequence: the id left-justified to the longest id
    plus two spaces, then the sliced sequence. The id width and every padded
    id are also printed to stdout.
    """
    msa = AlignIO.read(msa_in, 'phylip-relaxed')

    # convert the 1-based inclusive range to a 0-based half-open slice
    bounds = range_str.split('-')
    col_start = int(bounds[0]) - 1
    col_stop = int(bounds[1])
    sliced = msa[:, col_start:col_stop]

    id_width = max((len(rec.id) for rec in sliced), default=0)
    print(id_width)

    with open(msa_out, 'w') as phy_handle:
        phy_handle.write('%s %s\n' % (len(sliced), sliced.get_alignment_length()))
        for rec in sliced:
            padded_id = rec.id.ljust(id_width + 2)
            print(padded_id)
            phy_handle.write('%s%s\n' % (padded_id, str(rec.seq)))
115
+
116
+
117
+ # AlignIO.write(aln_subset, msa_out, 'phylip-relaxed')
118
+
119
+
120
# 1-based inclusive column ranges to cut the example alignment into.
slice_range = ['1-500', '501-1000', '1001-1500', '1501-2000', '2001-2500', '2501-3000', '3001-3500', '3501-4000', '4001-4500', '4501-4879']

# Guarded so that importing this module does not try to read the hard-coded
# example alignment.
if __name__ == '__main__':
    for each_range in slice_range:
        pwd_msa_op = '/Users/songweizhi/Desktop/%s.phy' % each_range
        slice_msa_by_col_manual(msa_file, each_range, pwd_msa_op)
125
+
126
+
127
def fa2phy(fasta_in, phy_out):
    """Convert a FASTA alignment to a hand-written PHYLIP-style file.

    The first output line is '<number of sequences> <alignment length>'. Each
    following line is a sequence id left-justified to the longest id plus two
    spaces, immediately followed by the aligned sequence.
    """
    msa = AlignIO.read(fasta_in, 'fasta')
    id_width = max((len(rec.id) for rec in msa), default=0)

    with open(phy_out, 'w') as phy_handle:
        phy_handle.write('%s %s\n' % (len(msa), msa.get_alignment_length()))
        for rec in msa:
            phy_handle.write('%s%s\n' % (rec.id.ljust(id_width + 2), str(rec.seq)))
TreeSAK/cogTree.py ADDED
@@ -0,0 +1,185 @@
1
+ import os
2
+ import glob
3
+ import argparse
4
+ from Bio import SeqIO
5
+ import multiprocessing as mp
6
+
7
+
8
+ cogTree_usage = '''
9
+ ================================ cogTree example commands ================================
10
+
11
+ TreeSAK cogTree -i combined.faa -cog arCOG_wd -o op_dir -bmge -t 12 -f -fun arCOG_id.txt
12
+ TreeSAK cogTree -i combined.faa -cog arCOG_wd -o op_dir -bmge -t 12 -f -fun arCOG00724
13
+ TreeSAK cogTree -i combined.faa -cog arCOG_wd -o op_dir -bmge -t 12 -f -fun arCOG00724,arCOG02271
14
+
15
+ ==========================================================================================
16
+ '''
17
+
18
+
19
def select_seq(seq_file, seq_id_set, output_file):
    """Copy the selected records of a FASTA file into a new two-line FASTA file.

    seq_file is parsed as FASTA; every record whose id is in seq_id_set is
    written to output_file in 'fasta-2line' format, in input order.
    """
    # the context manager closes the output even if parsing or writing fails
    with open(output_file, 'w') as output_file_handle:
        for seq_record in SeqIO.parse(seq_file, 'fasta'):
            if seq_record.id in seq_id_set:
                SeqIO.write(seq_record, output_file_handle, 'fasta-2line')
26
+
27
+
28
def cogTree(args):
    """Infer one gene tree per COG of interest: extract, align, trim, build tree.

    args is a dict with the keys:
      i        combined protein FASTA file
      cog      folder with COG annotation results (searched with the pattern
               '*COG_wd/*_query_to_cog.txt'), or None
      fun      COG id, comma-separated COG ids, or a file with one id per line
               (first whitespace-separated field; blank lines are skipped)
      o        output directory
      bmge     trim with BMGE instead of trimal
      bmge_m   BMGE substitution matrix
      bmge_esc BMGE entropy score cutoff
      iqtree_m iqtree2 model
      f        overwrite an existing output directory
      t        number of processes for mafft/trimming and threads for iqtree2

    The mafft, trimming and iqtree2 commands are written to
    cmd_1_mafft.txt, cmd_2_trim.txt and cmd_3_tree.txt in the output directory
    and then executed.

    Raises SystemExit (status 1) when the output directory exists and 'f' is
    not set, or when no COG annotation file is found.
    """
    import shutil

    combined_faa = args['i']
    cog_annotation_wd = args['cog']
    interested_fun_txt = args['fun']
    op_dir = args['o']
    trim_with_bmge = args['bmge']
    trim_model = args['bmge_m']
    entropy_score_cutoff = args['bmge_esc']
    iqtree_model = args['iqtree_m']
    force_overwrite = args['f']
    num_of_threads = args['t']

    # BMGE.jar sits next to this file
    pwd_bmge_jar = '%s/BMGE.jar' % os.path.dirname(os.path.realpath(__file__))

    # ids of interest: read from a file when 'fun' is a file, else parse the string
    interested_fun_set = set()
    if os.path.isfile(interested_fun_txt):
        with open(interested_fun_txt) as fun_txt_handle:
            for each_fun in fun_txt_handle:
                fun_fields = each_fun.split()
                if fun_fields:  # a blank line has no first field
                    interested_fun_set.add(fun_fields[0])
    else:
        interested_fun_set.update(interested_fun_txt.split(','))

    ################################################################################

    faa_dir = '%s/dir_1_faa' % op_dir
    aln_dir = '%s/dir_2_msa' % op_dir
    trimmed_aln_dir = '%s/dir_3_trimmed_msa' % op_dir
    tree_dir = '%s/dir_4_tree' % op_dir
    cmd_1_mafft_txt = '%s/cmd_1_mafft.txt' % op_dir
    cmd_2_trim_txt = '%s/cmd_2_trim.txt' % op_dir
    cmd_3_tree_txt = '%s/cmd_3_tree.txt' % op_dir

    ################################################################################

    # create output folder (without going through the shell, so paths with
    # spaces or shell metacharacters are handled correctly)
    if os.path.isdir(op_dir):
        if force_overwrite is True:
            shutil.rmtree(op_dir)
        else:
            print('%s exist, program exited!' % op_dir)
            raise SystemExit(1)

    for each_dir in (op_dir, faa_dir, aln_dir, trimmed_aln_dir, tree_dir):
        os.mkdir(each_dir)

    ################################################################################

    # COG id -> set of gene ids annotated with it
    fun_to_gene_dict = dict()
    if cog_annotation_wd is not None:

        print('Reading in COG annotation results')
        file_list = glob.glob('%s/*COG_wd/*_query_to_cog.txt' % cog_annotation_wd)
        if len(file_list) == 0:
            print('COG annotation file not detected, program exited!')
            raise SystemExit(1)

        for each_file in file_list:
            with open(each_file) as annotation_handle:
                next(annotation_handle, None)  # the first line is skipped
                for each_line in annotation_handle:
                    each_line_split = each_line.strip().split('\t')
                    if len(each_line_split) != 4:
                        continue
                    gene_id, cog_id = each_line_split[0], each_line_split[1]
                    if cog_id in interested_fun_set:
                        fun_to_gene_dict.setdefault(cog_id, set()).add(gene_id)

    cmd_list_mafft = []
    cmd_list_trim = []
    cmd_list_tree = []
    for each_fun in sorted(fun_to_gene_dict):

        # define file name
        fun_faa = '%s/%s.faa' % (faa_dir, each_fun)
        current_gene_tree_dir = '%s/%s' % (tree_dir, each_fun)
        fun_aln = '%s/%s.aln' % (aln_dir, each_fun)
        fun_aln_trimmed = '%s/%s_trimal.aln' % (trimmed_aln_dir, each_fun)
        if trim_with_bmge is True:
            fun_aln_trimmed = '%s/%s_bmge.aln' % (trimmed_aln_dir, each_fun)

        # extract sequences
        select_seq(combined_faa, fun_to_gene_dict[each_fun], fun_faa)

        os.mkdir(current_gene_tree_dir)

        # prepare commands
        mafft_cmd = 'mafft-einsi --thread %s --quiet %s > %s' % (1, fun_faa, fun_aln)
        trim_cmd = 'trimal -in %s -out %s -automated1' % (fun_aln, fun_aln_trimmed)
        if trim_with_bmge is True:
            trim_cmd = 'java -jar %s -i %s -m %s -t AA -h %s -of %s' % (pwd_bmge_jar, fun_aln, trim_model, entropy_score_cutoff, fun_aln_trimmed)
        infer_tree_cmd = 'iqtree2 -s %s --seqtype AA -m %s -B 1000 --wbtl --bnni --prefix %s/%s -T %s --quiet' % (fun_aln_trimmed, iqtree_model, current_gene_tree_dir, each_fun, num_of_threads)

        # add commands to list
        cmd_list_mafft.append(mafft_cmd)
        cmd_list_trim.append(trim_cmd)
        cmd_list_tree.append(infer_tree_cmd)

    # write out commands, one per line
    for cmd_txt, cmd_list in ((cmd_1_mafft_txt, cmd_list_mafft),
                              (cmd_2_trim_txt, cmd_list_trim),
                              (cmd_3_tree_txt, cmd_list_tree)):
        with open(cmd_txt, 'w') as cmd_txt_handle:
            cmd_txt_handle.write(''.join(each_cmd + '\n' for each_cmd in cmd_list))

    # run mafft commands
    print('Running mafft with %s cores for %s commands' % (num_of_threads, len(cmd_list_mafft)))
    pool = mp.Pool(processes=num_of_threads)
    pool.map(os.system, cmd_list_mafft)
    pool.close()
    pool.join()

    # run trim commands
    print('Trimming with %s cores for %s commands' % (num_of_threads, len(cmd_list_trim)))
    pool = mp.Pool(processes=num_of_threads)
    pool.map(os.system, cmd_list_trim)
    pool.close()
    pool.join()

    # run iqtree commands one after another (each one already uses -T threads)
    print('Running iqtree with %s cores' % num_of_threads)
    for each_iqtree_cmd in sorted(cmd_list_tree):
        print(each_iqtree_cmd)
        os.system(each_iqtree_cmd)
169
+
170
+
171
if __name__ == '__main__':

    # Command-line entry point. The example commands in cogTree_usage are
    # passed as the usage text so they are shown in the help/error output.
    cogTree_parser = argparse.ArgumentParser(usage=cogTree_usage)
    cogTree_parser.add_argument('-i', required=True, help='orthologous gene sequence')
    cogTree_parser.add_argument('-fun', required=True, help='interested functions')
    cogTree_parser.add_argument('-cog', required=False, default=None, help='COG annotation results')
    cogTree_parser.add_argument('-o', required=True, help='output directory')
    cogTree_parser.add_argument('-bmge', required=False, action="store_true", help='trim with BMGE, default is trimal')
    cogTree_parser.add_argument('-bmge_m', required=False, default='BLOSUM30', help='trim model, default: BLOSUM30')
    cogTree_parser.add_argument('-bmge_esc', required=False, default='0.55', help='entropy score cutoff, default: 0.55')
    cogTree_parser.add_argument('-iqtree_m', required=False, default='LG+G+I', help='iqtree_model, default: LG+G+I')
    cogTree_parser.add_argument('-f', required=False, action="store_true", help='force overwrite')
    cogTree_parser.add_argument('-t', required=False, type=int, default=1, help='num of threads, default: 1')
    args = vars(cogTree_parser.parse_args())
    cogTree(args)
TreeSAK/compare_trees.R ADDED
@@ -0,0 +1,30 @@
1
+
2
# check.packages function: install and load multiple R packages.
# Packages in pkg that are not installed are installed first (with their
# dependencies), then every package in pkg is loaded into the R session.
# Returns a named logical vector: TRUE for each package that could be loaded.
check.packages <- function(pkg){
    new.pkg <- pkg[!(pkg %in% installed.packages()[, "Package"])]
    if (length(new.pkg)) {
        # A non-interactive session cannot prompt for a CRAN mirror, so fall
        # back to an explicit one when none has been configured.
        repos <- getOption("repos")
        if (is.null(repos) || identical(unname(repos["CRAN"]), "@CRAN@")) {
            repos <- c(CRAN = "https://cloud.r-project.org")
        }
        install.packages(new.pkg, dependencies = TRUE, repos = repos)
    }
    sapply(pkg, require, character.only = TRUE)}
9
+
10
# install packages if not installed
required_pkgs <- c("optparse", "ape", "vegan")
invisible(suppressMessages(check.packages(required_pkgs)))

# command line options: the two tree files to compare
cli_options <- list(
    make_option(c("-a", "--treeo"), type="character", default=NULL, help="the first tree"),
    make_option(c("-b", "--treet"), type="character", default=NULL, help="the second tree"))
opt <- parse_args(OptionParser(option_list=cli_options))

# read both trees
TREE1 <- read.tree(opt$treeo)
TREE2 <- read.tree(opt$treet)

# cophenetic distance matrix of a tree, rows and columns ordered by row name
# so that the two matrices line up
sorted_cophenetic <- function(tree) {
    dist_mat <- cophenetic(tree)
    name_order <- order(row.names(dist_mat))
    dist_mat[name_order, name_order]
}

D1 <- sorted_cophenetic(TREE1)
D2 <- sorted_cophenetic(TREE2)

# top-level call: the result is auto-printed to stdout
mantel(xdis = D1, ydis = D2, permutations = 999)
30
+
TreeSAK/compare_trees.py ADDED
@@ -0,0 +1,255 @@
1
+ import os
2
+ import glob
3
+ import argparse
4
+ from ete3 import Tree
5
+ import multiprocessing as mp
6
+
7
+
8
+ compare_trees_usage = '''
9
+ ======================== compare_trees example command ========================
10
+
11
+ TreeSAK compare_trees -t1 tree_1.newick -t2 tree_2.newick -o op_dir
12
+ TreeSAK compare_trees -t1 tree_dir -t2 tree_dir -tx newick -dm -t 12 -o op_dir
13
+
14
+ ===============================================================================
15
+ '''
16
+
17
+
18
def sep_path_basename_ext(file_in):
    """Split a file path into directory, basename and extension.

    Returns a 3-tuple ``(directory, basename, extension)``. The directory is
    '.' when file_in has no directory part, and the extension keeps its
    leading dot.
    """
    directory, name_with_ext = os.path.split(file_in)
    directory = '.' if directory == '' else directory
    stem, dotted_ext = os.path.splitext(name_with_ext)
    return directory, stem, dotted_ext
29
+
30
+
31
def check_numeric(str_in):
    """Return True when float() accepts str_in, False when it raises ValueError."""
    try:
        float(str_in)
    except ValueError:
        return False
    return True
39
+
40
+
41
def parse_mantel_stats(mantel_stats_txt):
    """Extract the Mantel statistic r from a saved Mantel test report.

    Returns the text following 'Mantel statistic r: ' on the last line that
    contains it (as a stripped string), or 'na' when no line contains it.
    """
    marker = 'Mantel statistic r: '
    mantel_similarity = 'na'
    # the context manager closes the report; the original left it open
    with open(mantel_stats_txt) as mantel_stats_handle:
        for each_line in mantel_stats_handle:
            if marker in each_line:
                mantel_similarity = each_line.strip().split(marker)[1]
    return mantel_similarity
48
+
49
+
50
def get_matrix(query_tree_list, subject_tree_list, mantel_stats_dir, write_out_dm, output_matrix, output_matrix_distance):
    """Assemble the pairwise Mantel statistics into tab-separated matrices.

    query_tree_list and subject_tree_list hold tree basenames; rows are query
    trees and columns are subject trees. For each pair the statistic is read
    from ``<mantel_stats_dir>/<query>_vs_<subject>_mantel_stats.txt`` or from
    the file named with the two trees swapped (the swapped file wins when both
    exist); 'na' is used when neither exists.

    The similarity matrix is always written to output_matrix. When
    write_out_dm is True, a second matrix holding 1 - similarity (rounded to
    four decimals, 'na' for non-numeric values) is written to
    output_matrix_distance.
    """
    header_line_str = '\t' + '\t'.join(subject_tree_list) + '\n'

    distance_lol = []
    with open(output_matrix, 'w') as output_matrix_handle:
        output_matrix_handle.write(header_line_str)
        for each_qt in query_tree_list:

            similarity_row = [each_qt]
            distance_row = [each_qt]
            for each_st in subject_tree_list:

                qt_vs_st_mantel_stats = '%s/%s_vs_%s_mantel_stats.txt' % (mantel_stats_dir, each_qt, each_st)
                st_vs_qt_mantel_stats = '%s/%s_vs_%s_mantel_stats.txt' % (mantel_stats_dir, each_st, each_qt)

                # a pair is only computed in one direction, so look for both names
                tree_similarity = 'na'
                for each_stats_txt in (qt_vs_st_mantel_stats, st_vs_qt_mantel_stats):
                    if os.path.isfile(each_stats_txt):
                        tree_similarity = parse_mantel_stats(each_stats_txt)
                similarity_row.append(tree_similarity)

                if check_numeric(tree_similarity) is True:
                    in_distance = float("{0:.4f}".format(1 - float(tree_similarity)))
                    # write a zero distance as '0' rather than '0.0'
                    distance_row.append('0' if in_distance == 0 else str(in_distance))
                else:
                    distance_row.append('na')

            distance_lol.append(distance_row)
            output_matrix_handle.write('\t'.join(similarity_row) + '\n')

    # write out distance matrix
    if write_out_dm is True:
        with open(output_matrix_distance, 'w') as output_matrix_distance_handle:
            output_matrix_distance_handle.write(header_line_str)
            for each_list in distance_lol:
                output_matrix_distance_handle.write('\t'.join(each_list) + '\n')
96
+
97
+
98
def compare_trees_worker(arg_list):
    """Run the Mantel test R script on one pair of trees.

    arg_list holds, in order: the path to compare_trees.R, the two tree files,
    the tmp directory and a flag telling whether to keep intermediate files.

    The R script's stdout is saved to
    ``<tmp_dir>/<tree1 basename>_vs_<tree2 basename>_mantel_stats.txt``.
    When the two trees do not have identical leaf sets, both are first reduced
    to their shared leaves with 'BioSAK subset_tree'. Pairs without any shared
    leaf are skipped with a message.
    """
    import subprocess

    compare_trees_R, tree_file_1, tree_file_2, tmp_dir, keep_tmp_file = arg_list[:5]

    def run_cmd(cmd_as_list, stdout_file=None):
        # Run a command without a shell (file names with spaces are safe);
        # a missing executable is reported instead of aborting the whole pool.
        try:
            if stdout_file is None:
                subprocess.run(cmd_as_list)
            else:
                with open(stdout_file, 'w') as stdout_handle:
                    subprocess.run(cmd_as_list, stdout=stdout_handle)
        except OSError as err:
            print('Failed to run %s: %s' % (cmd_as_list[0], err))

    tree1_path, tree1_basename, tree1_extension = sep_path_basename_ext(tree_file_1)
    tree2_path, tree2_basename, tree2_extension = sep_path_basename_ext(tree_file_2)

    op_stats = '%s/%s_vs_%s_mantel_stats.txt' % (tmp_dir, tree1_basename, tree2_basename)

    t1 = Tree(tree_file_1, format=1)
    t2 = Tree(tree_file_2, format=1)

    tree1_leaf_list = [leaf1.name for leaf1 in t1]
    tree2_leaf_list = [leaf2.name for leaf2 in t2]

    shared_leaves = set(tree1_leaf_list).intersection(tree2_leaf_list)
    if len(shared_leaves) == 0:
        print('No leaves shared between %s and %s, calculation skipped!' % (tree1_basename, tree2_basename))

    elif len(tree1_leaf_list) == len(tree2_leaf_list) == len(shared_leaves):
        # same leaves in both trees: compare them as they are
        run_cmd(['Rscript', compare_trees_R, '-a', tree_file_1, '-b', tree_file_2], op_stats)

    else:
        print('Performing Mantel test based on %s leaves shared by %s (%s) and %s (%s)' % (len(shared_leaves), tree1_basename, len(tree1_leaf_list), tree2_basename, len(tree2_leaf_list)))

        # write out shared leaves
        shared_leaves_txt = '%s/%s_vs_%s_shared_leaves.txt' % (tmp_dir, tree1_basename, tree2_basename)
        with open(shared_leaves_txt, 'w') as shared_leaves_txt_handle:
            for each_shared_leaf in shared_leaves:
                shared_leaves_txt_handle.write(each_shared_leaf + '\n')

        # subset_tree; the t1/t2 tags keep the two subset files apart even
        # when both trees have the same basename
        t1_subset = '%s/%s_vs_%s_t1_%s_subset%s' % (tmp_dir, tree1_basename, tree2_basename, tree1_basename, tree1_extension)
        t2_subset = '%s/%s_vs_%s_t2_%s_subset%s' % (tmp_dir, tree1_basename, tree2_basename, tree2_basename, tree2_extension)
        run_cmd(['BioSAK', 'subset_tree', '-tree', tree_file_1, '-taxon', shared_leaves_txt, '-out', t1_subset, '-q'])
        run_cmd(['BioSAK', 'subset_tree', '-tree', tree_file_2, '-taxon', shared_leaves_txt, '-out', t2_subset, '-q'])

        run_cmd(['Rscript', compare_trees_R, '-a', t1_subset, '-b', t2_subset], op_stats)

        if keep_tmp_file is False:
            for each_tmp_file in (shared_leaves_txt, t1_subset, t2_subset):
                # a subset file is missing when the subsetting step failed
                if os.path.isfile(each_tmp_file):
                    os.remove(each_tmp_file)
156
+
157
+
158
def _collect_tree_files(tree_path, tree_ext):
    """Return [tree_path] for a file, the '*.<tree_ext>' files for a folder, else []."""
    if os.path.isfile(tree_path):
        return [tree_path]
    if os.path.isdir(tree_path):
        return glob.glob('%s/*.%s' % (tree_path, tree_ext))
    return []


def compare_trees(args):
    """Compare every query tree with every subject tree using the Mantel test.

    args is a dict with the keys:
      o    output directory
      t1   query tree file, or folder of tree files
      t2   subject tree file, or folder of tree files
      tx   extension of the tree files when a folder is given
      dm   also export a distance-like matrix (1 - similarity)
      t    number of worker processes
      tmp  keep intermediate files
      f    overwrite an existing output directory

    Per-pair results are written to ``<o>/tmp``; the matrices are written to
    ``<o>/Matrix_similarity.txt`` and, with 'dm', ``<o>/Matrix_distance.txt``.

    Raises SystemExit (status 1) when the output directory exists and 'f' is
    not set.
    """
    import shutil

    op_dir = args['o']
    tree_file_1 = args['t1']
    tree_file_2 = args['t2']
    tree_file_ext = args['tx']
    export_dm = args['dm']
    num_threads = args['t']
    keep_tmp = args['tmp']
    force_create_op_dir = args['f']

    # compare_trees.R sits next to this file
    compare_trees_R = '%s/compare_trees.R' % os.path.dirname(os.path.realpath(__file__))
    tmp_dir = '%s/tmp' % op_dir

    query_tree_list = _collect_tree_files(tree_file_1, tree_file_ext)
    subject_tree_list = _collect_tree_files(tree_file_2, tree_file_ext)

    # prepare arg list for compare_trees_worker; a pair is only computed once,
    # whichever of its two trees is the query
    to_be_calculated_set = set()
    list_for_compare_trees_worker = []
    for each_query_tree in query_tree_list:
        for each_subject_tree in subject_tree_list:

            tree_1_vs_2 = '%s_vs_%s' % (each_query_tree, each_subject_tree)
            tree_2_vs_1 = '%s_vs_%s' % (each_subject_tree, each_query_tree)

            if tree_1_vs_2 not in to_be_calculated_set:
                list_for_compare_trees_worker.append([compare_trees_R, each_query_tree, each_subject_tree, tmp_dir, keep_tmp])
                to_be_calculated_set.add(tree_1_vs_2)
                to_be_calculated_set.add(tree_2_vs_1)

    print('Total pairs of trees to compare: %s' % len(list_for_compare_trees_worker))

    # create op_dir (without going through the shell, so paths with spaces or
    # shell metacharacters are handled correctly)
    if os.path.isdir(op_dir):
        if force_create_op_dir is True:
            shutil.rmtree(op_dir)
        else:
            print('Output folder detected, program exited!')
            raise SystemExit(1)
    os.mkdir(op_dir)
    os.mkdir(tmp_dir)

    # compare trees with multiprocessing
    pool = mp.Pool(processes=num_threads)
    pool.map(compare_trees_worker, list_for_compare_trees_worker)
    pool.close()
    pool.join()

    # get matrix (rows and columns are labelled with the tree basenames)
    output_matrix_similarity = '%s/Matrix_similarity.txt' % op_dir
    output_matrix_distance = '%s/Matrix_distance.txt' % op_dir
    query_tree_list_basename = [sep_path_basename_ext(each_q_tree)[1] for each_q_tree in query_tree_list]
    subject_tree_list_basename = [sep_path_basename_ext(each_s_tree)[1] for each_s_tree in subject_tree_list]

    get_matrix(sorted(query_tree_list_basename), sorted(subject_tree_list_basename), tmp_dir, export_dm, output_matrix_similarity, output_matrix_distance)

    # final report
    if export_dm is True:
        print('Data matrix exported to: %s and %s' % (output_matrix_similarity, output_matrix_distance))
    else:
        print('Data matrix exported to: %s' % output_matrix_similarity)

    print('Done!')
241
+
242
+
243
if __name__ == '__main__':

    # (flag, add_argument keyword arguments), registered in this order
    argument_specs = [
        ('-o', dict(required=True, help='output directory')),
        ('-t1', dict(required=True, help='tree (folder) 1')),
        ('-t2', dict(required=True, help='tree (folder) 2')),
        ('-tx', dict(required=False, default='newick', help='extention of tree files, default: newick')),
        ('-dm', dict(required=False, action="store_true", help='export distance-alike matrix, obtained by subtract the similarity value from 1')),
        ('-t', dict(required=False, type=int, default=1, help='number of threads')),
        ('-tmp', dict(required=False, action="store_true", help='keep tmp files')),
        ('-f', dict(required=False, action="store_true", help='force overwrite')),
    ]

    compare_trees_parser = argparse.ArgumentParser(usage=compare_trees_usage)
    for each_flag, each_kwargs in argument_specs:
        compare_trees_parser.add_argument(each_flag, **each_kwargs)

    # hand the parsed options over as a plain dict
    args = vars(compare_trees_parser.parse_args())
    compare_trees(args)