PyPI - treesak - Versions diffs - 1.51.2__py3-none-any.whl - Mend

treesak 1.51.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of treesak might be problematic. Click here for more details.

Files changed (125) hide show

TreeSAK/ALE.py +63 -0
TreeSAK/ALE1.py +268 -0
TreeSAK/ALE2.py +168 -0
TreeSAK/ALE2RTC.py +30 -0
TreeSAK/ALE3.py +205 -0
TreeSAK/ALE4.py +636 -0
TreeSAK/ALE5.py +210 -0
TreeSAK/ALE6.py +401 -0
TreeSAK/ALE7.py +126 -0
TreeSAK/ALE_backup.py +1081 -0
TreeSAK/AssessCVG.py +128 -0
TreeSAK/AssessMarker.py +306 -0
TreeSAK/AssessMarkerDeltaLL.py +257 -0
TreeSAK/AssessMarkerPA.py +317 -0
TreeSAK/AssessPB.py +130 -0
TreeSAK/BMGE.jar +0 -0
TreeSAK/BMGE.py +49 -0
TreeSAK/CompareMCMC.py +138 -0
TreeSAK/ConcateMSA.py +111 -0
TreeSAK/ConvertMSA.py +135 -0
TreeSAK/Dir.rb +82 -0
TreeSAK/ExtractMarkerSeq.py +263 -0
TreeSAK/FastRoot.py +1175 -0
TreeSAK/FastRoot_backup.py +1122 -0
TreeSAK/FigTree.py +34 -0
TreeSAK/GTDB_tree.py +76 -0
TreeSAK/GeneTree.py +142 -0
TreeSAK/KEGG_Luo17.py +807 -0
TreeSAK/LcaToLeaves.py +66 -0
TreeSAK/MarkerRef2Tree.py +616 -0
TreeSAK/MarkerRef2Tree_backup.py +628 -0
TreeSAK/MarkerSeq2Tree.py +290 -0
TreeSAK/MarkerSeq2Tree_backup.py +259 -0
TreeSAK/ModifyTopo.py +116 -0
TreeSAK/Newick_tree_plotter.py +79 -0
TreeSAK/OMA.py +170 -0
TreeSAK/OMA2.py +212 -0
TreeSAK/OneLineAln.py +50 -0
TreeSAK/PB.py +155 -0
TreeSAK/PMSF.py +106 -0
TreeSAK/PhyloBiAssoc.R +84 -0
TreeSAK/PhyloBiAssoc.py +167 -0
TreeSAK/PlotMCMC.py +41 -0
TreeSAK/PlotMcmcNode.py +152 -0
TreeSAK/PlotMcmcNode_old.py +252 -0
TreeSAK/RootTree.py +101 -0
TreeSAK/RootTreeGTDB214.py +288 -0
TreeSAK/RootTreeGTDB220.py +300 -0
TreeSAK/RootTreeGTDB226.py +300 -0
TreeSAK/SequentialDating.py +16 -0
TreeSAK/SingleAleHGT.py +157 -0
TreeSAK/SingleLinePhy.py +50 -0
TreeSAK/SliceMSA.py +142 -0
TreeSAK/SplitScore.py +19 -0
TreeSAK/SplitScore1.py +178 -0
TreeSAK/SplitScore1OMA.py +148 -0
TreeSAK/SplitScore2.py +597 -0
TreeSAK/TaxaCountStats.R +256 -0
TreeSAK/TaxonTree.py +47 -0
TreeSAK/TreeSAK_config.py +32 -0
TreeSAK/VERSION +158 -0
TreeSAK/VisHPD95.R +45 -0
TreeSAK/VisHPD95.py +200 -0
TreeSAK/__init__.py +0 -0
TreeSAK/ale_parser.py +74 -0
TreeSAK/ale_splitter.py +63 -0
TreeSAK/alignment_pruner.pl +1471 -0
TreeSAK/assessOG.py +45 -0
TreeSAK/catfasta2phy.py +140 -0
TreeSAK/cogTree.py +185 -0
TreeSAK/compare_trees.R +30 -0
TreeSAK/compare_trees.py +255 -0
TreeSAK/dating.py +264 -0
TreeSAK/dating_ss.py +361 -0
TreeSAK/deltall.py +82 -0
TreeSAK/do_rrtc.rb +464 -0
TreeSAK/fa2phy.py +42 -0
TreeSAK/format_leaf_name.py +70 -0
TreeSAK/gap_stats.py +38 -0
TreeSAK/get_SCG_tree.py +742 -0
TreeSAK/get_arCOG_seq.py +97 -0
TreeSAK/global_functions.py +222 -0
TreeSAK/gnm_leaves.py +43 -0
TreeSAK/iTOL.py +791 -0
TreeSAK/iTOL_gene_tree.py +80 -0
TreeSAK/itol_msa_stats.py +56 -0
TreeSAK/keep_highest_rrtc.py +37 -0
TreeSAK/koTree.py +194 -0
TreeSAK/label_tree.R +75 -0
TreeSAK/label_tree.py +121 -0
TreeSAK/mad.py +708 -0
TreeSAK/mcmc2tree.py +58 -0
TreeSAK/mcmcTC copy.py +92 -0
TreeSAK/mcmcTC.py +104 -0
TreeSAK/mcmctree_vs_reltime.R +44 -0
TreeSAK/mcmctree_vs_reltime.py +252 -0
TreeSAK/merge_pdf.py +32 -0
TreeSAK/pRTC.py +56 -0
TreeSAK/parse_mcmctree.py +198 -0
TreeSAK/parse_reltime.py +141 -0
TreeSAK/phy2fa.py +37 -0
TreeSAK/plot_distruibution_th.py +165 -0
TreeSAK/prep_mcmctree_ctl.py +92 -0
TreeSAK/print_leaves.py +32 -0
TreeSAK/pruneMSA.py +63 -0
TreeSAK/recode.py +73 -0
TreeSAK/remove_bias.R +112 -0
TreeSAK/rename_leaves.py +77 -0
TreeSAK/replace_clade.py +55 -0
TreeSAK/root_with_out_group.py +84 -0
TreeSAK/run_TaxaCountStats_R_s1.py +455 -0
TreeSAK/subsample_drep_gnms.py +74 -0
TreeSAK/subset.py +69 -0
TreeSAK/subset_tree_stupid_old_way.py +193 -0
TreeSAK/supertree.py +330 -0
TreeSAK/tmp_1.py +19 -0
TreeSAK/tmp_2.py +19 -0
TreeSAK/tmp_3.py +120 -0
TreeSAK/weighted_rand.rb +23 -0
treesak-1.51.2.data/scripts/TreeSAK +950 -0
treesak-1.51.2.dist-info/LICENSE +674 -0
treesak-1.51.2.dist-info/METADATA +27 -0
treesak-1.51.2.dist-info/RECORD +125 -0
treesak-1.51.2.dist-info/WHEEL +5 -0
treesak-1.51.2.dist-info/top_level.txt +1 -0

TreeSAK/parse_mcmctree.py ADDED Viewed

@@ -0,0 +1,198 @@
+import os
+import glob
+import argparse
+from ete3 import Tree
+# Help text with example command lines and a link to example data.
+# NOTE(review): this file is parse_mcmctree.py but the text documents the
+# VisHPD95 sub-command - confirm which sub-command this module backs.
+VisHPD95_usage = '''
+============================ VisHPD95 example command ============================
+TreeSAK VisHPD95 -i mcmc_out -o HPD95.pdf -n nodes.txt -label label.txt
+TreeSAK VisHPD95 -i mcmc_out -o HPD95.pdf -n nodes.txt -label label.txt -x 9 -y 6
+# Example data
+https://github.com/songweizhi/TreeSAK/tree/master/example_data/VisHPD95
+==================================================================================
+'''
+def mcmctree_out_to_tree_str(mamctree_out):
+    # get tree string from mamctree_out
+    tree_str = ''
+    tree_line = 0
+    current_line = 1
+    for each_line in open(mamctree_out):
+        if 'Species tree for FigTree.  Branch lengths = posterior mean times; 95% CIs = labels' in each_line:
+            tree_line = current_line + 1
+        if tree_line == current_line:
+            tree_str = each_line.strip()
+        current_line += 1
+    tree_str_no_space = tree_str.replace(' ', '')
+    # rename tree nodes
+    t = Tree(tree_str_no_space, format=1)
+    for each_node in t.traverse():
+        if each_node.is_leaf():
+            node_name_new = '_'.join(each_node.name.split('_')[1:])
+        else:
+            node_name_new = 't_n%s' % each_node.name
+        each_node.name = node_name_new
+    tree_str_renamed = t.write(format=8)
+    return tree_str_renamed
+def get_internal_node_to_plot(node_txt, mo_file):
+    tree_str = ''
+    if os.path.isfile(mo_file):
+        tree_str = mcmctree_out_to_tree_str(mo_file)
+    # get nodes to plot
+    node_set = set()
+    node_rename_dict = dict()
+    if os.path.isfile(node_txt) is True:
+        for each in open(node_txt):
+            each_split = each.strip().split('\t')
+            node_str = each_split[0]
+            # get internal_node_to_plot
+            internal_node_to_plot = ''
+            if ',' not in node_str:
+                internal_node_to_plot = each_split[0]
+            else:
+                leaf_list = node_str.split(',')
+                if tree_str == '':
+                    print('MCMCTree out file not found, program exited!')
+                    exit()
+                current_lca = Tree(tree_str, format=1).get_common_ancestor(leaf_list)
+                internal_node_to_plot = current_lca.name
+            # add internal_node_to_plot to  node_set
+            if internal_node_to_plot != '':
+                node_set.add(internal_node_to_plot)
+            # read in name to show in plot
+            if len(each_split) == 2:
+                if each_split[1] != '':
+                    node_rename_dict[internal_node_to_plot] = each_split[1]
+    else:
+        node_set = node_txt.split(',')
+    return node_set, node_rename_dict, tree_str
+def read_in_posterior_mean(mcmctree_out):
+    # read in Posterior mean
+    node_to_mean_hpd95_dict = dict()
+    current_line = 1
+    posterior_mean_header_line = 0
+    for each_line in open(mcmctree_out):
+        if 'Posterior mean (95% Equal-tail CI) (95% HPD CI) HPD-CI-width' in each_line:
+            posterior_mean_header_line = current_line
+        if (posterior_mean_header_line != 0) and (current_line > posterior_mean_header_line):
+            each_line_split = each_line.strip().split(' ')
+            each_line_split_no_empty = []
+            for each_element in each_line_split:
+                if each_element not in ['', '(']:
+                    each_element_value = each_element.replace('(', '').replace(')', '').replace(',', '')
+                    each_line_split_no_empty.append(each_element_value)
+            if len(each_line_split_no_empty) == 9:
+                node_id           = each_line_split_no_empty[0]
+                value_mean        = each_line_split_no_empty[1]
+                value_hpd95_small = each_line_split_no_empty[4]
+                value_hpd95_big   = each_line_split_no_empty[5]
+                node_to_mean_hpd95_dict[node_id] = [value_mean, value_hpd95_small, value_hpd95_big]
+        current_line += 1
+    return node_to_mean_hpd95_dict
+def VisHPD95(args):
+    mcmc_in     = args['i']
+    node_txt    = args['n']
+    label_txt   = args['label']
+    plot_out    = args['o']
+    plot_width  = args['x']
+    plot_height = args['y']
+    pwd_current_file  = os.path.realpath(__file__)
+    current_file_path = '/'.join(pwd_current_file.split('/')[:-1])
+    VisHPD95_R  = '%s/VisHPD95.R' % current_file_path
+    dm_out      = '%s.txt' % plot_out
+    # check MCMCTree output file/dir
+    if os.path.isfile(mcmc_in) is True:
+        mcmc_out_file_list = [mcmc_in]
+    else:
+        mcmc_out_file_re = '%s/*_out.txt' % (mcmc_in)
+        mcmc_out_file_list = glob.glob(mcmc_out_file_re)
+    if len(mcmc_out_file_list) == 0:
+        print('MCMCTree out file not found, program exited!')
+        exit()
+    # read in y-axis label file
+    label_dict = dict()
+    color_dict = dict()
+    shape_dict = dict()
+    if label_txt is not None:
+        for each_sample in open(label_txt):
+            each_sample_split = each_sample.strip().split('\t')
+            if len(each_sample_split) == 3:
+                label_dict[each_sample_split[0]] = each_sample_split[1]
+                color_dict[each_sample_split[0]] = each_sample_split[1]
+                shape_dict[each_sample_split[0]] = each_sample_split[2]
+            else:
+                print('Format error: %s' % label_txt)
+                exit()
+    dm_out_handle = open(dm_out, 'w')
+    dm_out_handle.write('Test\tShape\tVar\tMean\tLow\tHigh\n')
+    for mcmc_out_file in mcmc_out_file_list:
+        mcmc_out_file_no_path = mcmc_out_file
+        if '/' in mcmc_out_file_no_path:
+            mcmc_out_file_no_path = mcmc_out_file_no_path.split('/')[-1]
+        color_col_to_write = color_dict.get(mcmc_out_file_no_path, mcmc_out_file_no_path)
+        shape_col_to_write = shape_dict.get(mcmc_out_file_no_path, mcmc_out_file_no_path)
+        node_set, node_rename_dict, tree_str = get_internal_node_to_plot(node_txt, mcmc_out_file)
+        node_to_mean_95_hpd_dict = read_in_posterior_mean(mcmc_out_file)
+        for each_node in node_set:
+            node_name_to_write = node_rename_dict.get(each_node, each_node)
+            mean_95_hpd_list = node_to_mean_95_hpd_dict.get(each_node)
+            dm_out_handle.write('%s\t%s\t%s\t%s\n' % (color_col_to_write, shape_col_to_write, node_name_to_write, '\t'.join(mean_95_hpd_list)))
+    dm_out_handle.close()
+    plot_cmd   = 'Rscript %s -i %s -x %s -y %s -o %s' % (VisHPD95_R, dm_out, plot_width, plot_height, plot_out)
+    os.system(plot_cmd)
+    print('Plot exported to: %s' % plot_out)
+if __name__ == '__main__':
+    VisHPD95_parser = argparse.ArgumentParser()
+    VisHPD95_parser.add_argument('-i',      required=True,                      help='mcmc.txt file or folder')
+    VisHPD95_parser.add_argument('-n',      required=True,                      help='Nodes to plot')
+    VisHPD95_parser.add_argument('-label',  required=False, default=None,       help='labels on y axis')
+    VisHPD95_parser.add_argument('-x',      required=False, default=8,type=int, help='plot width, default: 8')
+    VisHPD95_parser.add_argument('-y',      required=False, default=5,type=int, help='plot height, default: 5')
+    VisHPD95_parser.add_argument('-o',      required=True,                      help='Output plot')
+    args = vars(VisHPD95_parser.parse_args())
+    VisHPD95(args)
+'''
+cd /Users/songweizhi/Desktop/777
+python3 ~/PycharmProjects/TreeSAK/TreeSAK/VisHPD95.py -i M1_mcmc_txt -o M1_HPD95.pdf -n nodes_five.txt -label y_label_out.txt
+'''

TreeSAK/parse_reltime.py ADDED Viewed

@@ -0,0 +1,141 @@
+import os
+import argparse
+# Help text with an example command line for the parse_reltime sub-command.
+parse_reltime_usage = '''
+==================== parse_reltime example commands ====================
+TreeSAK parse_reltime -i RelTime.txt -n dbscc_lca.txt -o dbscc_age.txt
+========================================================================
+'''
+def sep_path_basename_ext(file_in):
+    f_path, f_name = os.path.split(file_in)
+    if f_path == '':
+        f_path = '.'
+    f_base, f_ext = os.path.splitext(f_name)
+    return f_name, f_path, f_base, f_ext[1:]
+def get_lca(reltime_txt, leaf_1_name, leaf_2_name):
+    leaf_set = set()
+    child_to_parent_dict = dict()
+    id_to_name_dict = dict()
+    name_to_id_dict = dict()
+    for each_line in open(reltime_txt):
+        if not each_line.startswith('NodeLabel'):
+            each_line_split = each_line.strip().split('\t')
+            each_line_split = [i.strip() for i in each_line_split]
+            if len(each_line_split) > 1:
+                node_name = each_line_split[0].replace(' ', '_')
+                node_id = each_line_split[1]
+                des1 = each_line_split[2]
+                des2 = each_line_split[3]
+                id_to_name_dict[node_id] = node_name
+                name_to_id_dict[node_name] = node_id
+                child_to_parent_dict[des1] = node_id
+                child_to_parent_dict[des2] = node_id
+                if (des1 == '-') and (des2 == '-'):
+                    leaf_set.add(node_id)
+    leaf_to_lineage_dict = dict()
+    for leaf in sorted([i for i in leaf_set]):
+        original_leaf = leaf
+        lineage_list = [leaf]
+        while leaf in child_to_parent_dict:
+            leaf_p = child_to_parent_dict[leaf]
+            lineage_list.append(leaf_p)
+            leaf = leaf_p
+        leaf_to_lineage_dict[original_leaf] = lineage_list
+    leaf_1_id     = name_to_id_dict[leaf_1_name]
+    leaf_2_id     = name_to_id_dict[leaf_2_name]
+    leaf_1_linage = leaf_to_lineage_dict[leaf_1_id]
+    leaf_2_linage = leaf_to_lineage_dict[leaf_2_id]
+    lca = ''
+    for each_p in leaf_1_linage[::-1]:
+        if each_p in leaf_2_linage:
+            lca = each_p
+    return lca
+def parse_reltime(args):
+    reltime_txt          = args['i']
+    interested_nodes_txt = args['n']
+    op_txt               = args['o']
+    f_name, f_path, f_base, f_ext = sep_path_basename_ext(op_txt)
+    op_txt_all_info = '%s/%s_all_info.%s' % (f_path,f_base, f_ext)
+    lca_to_leaves_dict = dict()
+    interested_node_desc_dict = dict()
+    for interested_node in open(interested_nodes_txt):
+        interested_node_split = interested_node.strip().split('\t')
+        paired_leaves = interested_node_split[0]
+        interested_node_desc = paired_leaves
+        if len(interested_node_split) > 1:
+            interested_node_desc = interested_node_split[1]
+        interested_node_desc_dict[paired_leaves] = interested_node_desc
+        leaf_1 = paired_leaves.split(',')[0]
+        leaf_2 = paired_leaves.split(',')[1]
+        lca_id = get_lca(reltime_txt, leaf_1, leaf_2)
+        lca_to_leaves_dict[lca_id] = paired_leaves.strip()
+    op_txt_all_info_handle = open(op_txt_all_info, 'w')
+    line_num_index = 0
+    for each_line in open(reltime_txt):
+        each_line_split = each_line.strip().split('\t')
+        each_line_split = [i.strip() for i in each_line_split]
+        if line_num_index == 0:
+            op_txt_all_info_handle.write('Leaves\tDescription\t%s\n' % ('\t'.join(each_line_split)))
+        else:
+            if len(each_line_split) > 1:
+                node_id = each_line_split[1]
+                if node_id in lca_to_leaves_dict:
+                    node_id = each_line_split[1]
+                    corresponding_leaves = lca_to_leaves_dict[node_id]
+                    interested_node_desc = interested_node_desc_dict[corresponding_leaves]
+                    op_txt_all_info_handle.write('%s\t%s\t%s\n' % (corresponding_leaves, interested_node_desc, '\t'.join(each_line_split)))
+        line_num_index += 1
+    op_txt_all_info_handle.close()
+    op_txt_handle = open(op_txt, 'w')
+    op_txt_handle.write('Node\tDivTime\tCI_Lower\tCI_Upper\n')
+    line_num_index = 0
+    for each_line in open(op_txt_all_info):
+        if line_num_index > 0:
+            each_line_split = each_line.strip().split('\t')
+            desc = each_line_split[1]
+            div_time = each_line_split[9]
+            ci_lower = each_line_split[10]
+            ci_upper = each_line_split[11]
+            op_txt_handle.write('%s\t%s\t%s\t%s\n' % (desc, div_time, ci_lower, ci_upper))
+        line_num_index += 1
+    op_txt_handle.close()
+if __name__ == '__main__':
+    parse_reltime_parser = argparse.ArgumentParser()
+    parse_reltime_parser.add_argument('-i',  required=True, help='reltime output file')
+    parse_reltime_parser.add_argument('-n',  required=True, help='interested node txt')
+    parse_reltime_parser.add_argument('-o',  required=True, help='output txt file')
+    args = vars(parse_reltime_parser.parse_args())
+    parse_reltime(args)
+'''
+cd /Users/songweizhi/Desktop
+python3 /Users/songweizhi/PycharmProjects/TreeSAK/TreeSAK/parse_reltime.py -i /Users/songweizhi/Desktop/Sponge_r220/6_dating/RelTime/topo2_p30_RelTime_JTT_Gamma4/topo2_p30_RelTime_Gamma4.txt -n yang_7.txt -o dbscc_age.txt
+cd /Users/songweizhi/Desktop
+TreeSAK parse_reltime -i /Users/songweizhi/Desktop/Sponge_r220/6_dating/RelTime/topo2_p30_RelTime_JTT_Gamma4/topo2_p30_RelTime_Gamma4.txt -n yang_7.txt -o dbscc_age.txt
+'''

TreeSAK/phy2fa.py ADDED Viewed

@@ -0,0 +1,37 @@
+import argparse
+from Bio import SeqIO
+from Bio import AlignIO
+# Help text with an example command line for the phy2fa sub-command.
+phy2fa_usage = '''
+======= phy2fa example commands =======
+TreeSAK phy2fa -i msa.phy -o msa.fa
+=======================================
+'''
+def phy2fa(args):
+    phylip_in   = args['i']
+    fasta_out   = args['o']
+    for aln in AlignIO.parse(phylip_in, "phylip"):
+        print(aln)
+    # alignments = list(AlignIO.parse(phylip_in, "phylip"))
+    # print(alignments)
+    # records = SeqIO.parse(phylip_in, "phylip")
+    # count = SeqIO.write(records, fasta_out, "fasta")
+    # print("Converted %i records" % count)
+if __name__ == '__main__':
+    # initialize the options parser
+    phy2fa_parser = argparse.ArgumentParser()
+    phy2fa_parser.add_argument('-i',      required=True,   help='input MSA in phylip format')
+    phy2fa_parser.add_argument('-o',      required=True,   help='output MSA in fasta format')
+    args = vars(phy2fa_parser.parse_args())
+    phy2fa(args)

TreeSAK/plot_distruibution_th.py ADDED Viewed

@@ -0,0 +1,165 @@
+import io
+import pandas as pd
+from tqdm import tqdm
+from ete3 import Tree
+from glob import glob
+from os.path import *
+import plotly.express as px
+import plotly.graph_objects as go
+import plotly.figure_factory as ff
+def read_mcmc(mcmc, all_col=False):
+    if type(mcmc) != str:
+        return mcmc
+    if all_col:
+        mcmc_df = pd.read_csv(mcmc, sep='\t', index_col=0)
+    else:
+        f1 = open(mcmc)
+        header = [_ for _ in next(f1).strip().split('\t')]
+        r_header = [_ for _ in header if not _.startswith('r_g')]
+        # normally it need to iterate rows and ignore the columns representing rates
+        text = '\t'.join(r_header) + '\n'
+        r_header = set(r_header)
+        for row in f1:
+            text += '\t'.join([r for r, h in zip(row.strip().split('\t'), header) if h in r_header]) + '\n'
+        mcmc_df = pd.read_csv(io.StringIO(text), sep='\t', index_col=0)
+    return mcmc_df
+def get_node_name_from_log(f):
+    """Build a tree from a log file and name its internal nodes.
+
+    Looks for a line that is exactly 'Species tree'. Rows from 3 lines below it
+    up to the first empty line are read as 'father n [name]' records, and the
+    line two below that empty line is parsed as the tree (ete3 format 8).
+    Unnamed parents are then named after the 'father' value of a child.
+
+    Returns the ete3 Tree, or None (after printing a message) when no
+    'Species tree' line exists.
+    """
+    # f should be the *.log file
+    # NOTE(review): the file handle is never closed explicitly
+    rows = open(f).read().split('\n')
+    idx = [_ for _, r in enumerate(rows) if r == 'Species tree']
+    if not idx:
+        print("prior not complete")
+        return
+    idx = idx[0]
+    start_idx = idx + 3
+    end_idx = 0
+    # first empty line at/after the header ends the node table; raises
+    # IndexError if the rows run out before an empty line is found
+    for _ in range(idx, 100000):
+        if rows[_] == '':
+            end_idx = _
+            break
+    tree_idx1 = end_idx + 1  # not used below
+    tree_idx2 = end_idx + 2
+    # find the index
+    n2father = {}
+    for i in range(start_idx, end_idx):
+        row = [_ for _ in rows[i].split(' ') if _]
+        father, n, name = row[0], row[1], row[2]
+        # rows with 4 fields are keyed by their name field, all others by n
+        n2father[name if len(row) == 4 else n] = father
+    t = Tree(rows[tree_idx2], format=8)
+    for l in t.traverse('postorder'):
+        if l.up is None:
+            break
+        if not l.up.name:
+            l.up.name = n2father[l.name]
+    return t
+# NOTE(review): everything below runs at import time and uses hard-coded
+# absolute paths on the author's machine.
+indir                   = '/Users/songweizhi/Desktop/DateArTree/plot_distruibution/stepwise'
+tree_dir                = '/Users/songweizhi/Desktop/DateArTree/plot_distruibution/treefile_dir'
+plot_dir                = '/Users/songweizhi/Desktop/DateArTree/plot_distruibution'
+gene_names              = ['M24', 'COG25']
+M24_gene_list           = ['MitoCOG0043', 'MitoCOG0040', 'MitoCOG0055', 'MitoCOG0052', 'MitoCOG0053', 'MitoCOG0133', 'MitoCOG0008', 'MitoCOG0009', 'MitoCOG0027', 'MitoCOG0031', 'MitoCOG0030', 'MitoCOG0001', 'MitoCOG0003', 'MitoCOG0012', 'MitoCOG0010', 'MitoCOG0004', 'MitoCOG0005', 'MitoCOG0011', 'MitoCOG0039', 'MitoCOG0060', 'MitoCOG0071', 'MitoCOG0059', 'MitoCOG0067', 'MitoCOG0066']
+COG25_gene_list         = ['223163', '223176', '223175', '223607', '223159', '223165', '223170', '223164', '223158', '223172', '223128', '223665', '223275', '223328', '223280', '223127', '223279', '273102', '223130', '223181', '223180', '223168', '223178', '223596', '223556']
+# gene set name -> list of gene ids
+setname2genes = dict()
+setname2genes['M24']   = M24_gene_list
+setname2genes['COG25'] = COG25_gene_list
+# per gene: number of leaves in its tree file, and a deltaL value from its .iqtree report
+gene2num = {}
+gene2dl = {}
+for gene_id in (M24_gene_list + COG25_gene_list):
+    pwd_tree_file  = '%s/%s.treefile' % (tree_dir, gene_id)
+    pwd_iqtree_log = '%s/%s.iqtree'   % (tree_dir, gene_id)
+    rows = open(pwd_iqtree_log).read().strip().split("\n")
+    # locate the table header containing "deltaL  bp-RELL"; the two rows used are 2 and 3 lines below it
+    idx = [idx for idx, v in enumerate(rows) if "deltaL  bp-RELL" in v][0]
+    r1, r2 = rows[idx + 2], rows[idx + 3]
+    r1 = [_ for _ in r1.strip().split(" ") if _]
+    r2 = [_ for _ in r2.strip().split(" ") if _]
+    # take field 2 of the second row unless it is "0", in which case use the first row's
+    if r2[2] == "0":
+        gene2dl[gene_id] = float(r1[2])
+    else:
+        gene2dl[gene_id] = float(r2[2])
+    gene2num[gene_id] = len(Tree(pwd_tree_file).get_leaf_names())
+# plot 1
+# one bar chart per gene set: deltaL values sorted from largest to smallest
+for setname, genes in setname2genes.items():
+    dl_list = [gene2dl[_] for _ in genes]
+    dl_list = sorted(dl_list, reverse=True)
+    fig = go.Figure()
+    fig.add_bar(y=dl_list)
+    fig.update_layout(title_text=setname,title_x=0.5,title_y=1,width=700,height=100,template='simple_white',
+                     margin_b=10,margin_l=10,margin_r=10,margin_t=10)
+    fig.write_image('%s/Plot_1_%s.pdf' % (plot_dir, setname))
+# For every gene set and model: gather sampled node times from each run,
+# then draw a violin plot (plot 2) and a median-difference plot (plot 3).
+for gene_set in gene_names:
+    for _model in ['LG']:  # C60
+        t = []
+        # keep only runs that have a FigTree.tre next to mcmc.txt; label them '<run dir> MCMC'
+        for f in glob(f'{indir}/{gene_set}/r*/1pf_{_model}/mcmctree/mcmc.txt'):
+            if exists(f.replace('mcmc.txt', 'FigTree.tre')):
+                t.append((f.split('/')[-4] + ' MCMC', f))
+        # order runs by the integer that follows the first character of the run directory name
+        t = sorted(t, key=lambda x: int(x[0].split(' ')[0][1:]))
+        dfs = []
+        targets = []
+        for cal, mcmc in tqdm(t):
+            tre = get_node_name_from_log(mcmc.replace('mcmc.txt','03_mcmctree.log'))
+            df  = read_mcmc(mcmc)
+            # down-sample to 5000 rows; on any failure print the path and keep all rows
+            try:
+                df = df.sample(5000)
+            except:
+                print(mcmc)
+            for lca, name in [('GCA_001828545.1,GCA_005524015.1', 'Anammox'), ('GCA_013697045.1,GCA_002356115.1', 'Gamma-AOB'),
+                              ('GCA_001772005.1,GCA_013521015.1', 'Beta-AOB'), ('GCA_017879665.1,GCA_013140535.1', 'Comammox'),
+                              ('Acanthamoeba_castellanii,Andalucia_godoyi', 'Euk'), ('Andalucia_godoyi,Ostreococcus_tauri', 'Euk'),
+                              ('Cyanophora_paradoxa,NC_002186.1', 'Euk')]:
+                # column 't_n<name>' of the common ancestor of the leaf pair; any failure skips the pair
+                # NOTE(review): the bare except also hides tre being None - confirm this is intended
+                try:
+                    n = tre.get_common_ancestor(lca.split(',')).name
+                    targets.append(str(n))
+                    n = 't_n' + str(n)
+                    times = df[[n]]
+                except:
+                    continue
+                times.columns = ['time']
+                times.loc[:, 'group name'] = name
+                times.loc[:, 'cal'] = cal
+                dfs.append(times)
+        # plot 2
+        _df = pd.concat(dfs, axis=0)
+        g2color = {"Gamma-AOB": "#78fce0", "Beta-AOB": "#956bb4", "Comammox": "#edc21a", "Anammox": "#ff8000"}
+        # only the four groups that have a colour are plotted
+        _df = _df.loc[_df["group name"].isin(list(g2color)), :]
+        _fig = px.violin( _df, y="cal", x="time", color="group name", color_discrete_map=g2color, points=False, orientation="h")
+        _fig.update_traces(side="positive", fillcolor='rgba(0,0,0,0)', width=1.8)
+        _fig.update_traces(showlegend=False)
+        num_y = len(_df["cal"].unique())  # not used below
+        _fig.layout.template = "simple_white"
+        _fig.layout.width = 700
+        _fig.layout.height = 750
+        # x axis is reversed: it runs from 40 down to 0
+        _fig.update_xaxes(range=[40, 0])
+        _fig.update_layout(margin_t=10, title_text=f'{gene_set} {_model}', title_x=0.5)
+        _fig.write_image(f'{plot_dir}/Plot_2_{gene_set}_gradient_{_model}.pdf')
+        # plot 3
+        # per run: median Anammox time minus median Gamma-AOB time, against the run number
+        xs = []
+        ys = []
+        for ng, subdf in sorted(_df.groupby('cal'),key=lambda x: int(x[0].split(' ')[0].replace('r', ''))):
+            t1 = subdf.loc[subdf['group name'] == 'Gamma-AOB', 'time'].median()
+            t2 = subdf.loc[subdf['group name'] == 'Anammox', 'time'].median()
+            deltaT = t2-t1
+            ys.append(deltaT)
+            xs.append(int(ng.split(' ')[0].replace('r', '')))
+        fig = go.Figure()
+        fig.add_scatter(x=xs, y=ys, mode='markers+lines', showlegend=False)
+        fig.update_layout(width=300, height=300, margin_t=30, margin_l=10, margin_b=10, margin_r=10,
+                          template='simple_white', title_text=f'{gene_set} {_model}', title_x=0.5)
+        fig.write_image('%s/Plot_3_%s_%s.pdf' % (plot_dir, gene_set, _model))

TreeSAK/prep_mcmctree_ctl.py ADDED Viewed

@@ -0,0 +1,92 @@
+import itertools
+def prep_mcmctree_ctl(ctl_para_dict, mcmctree_ctl_file):
+    with open(mcmctree_ctl_file, 'w') as ctl_file_handle:
+        ctl_file_handle.write('      finetune = %s\n' % ctl_para_dict.get('seed',           '-1'))
+        ctl_file_handle.write('       seqfile = %s\n' % ctl_para_dict['seqfile'])
+        ctl_file_handle.write('      treefile = %s\n' % ctl_para_dict['treefile'])
+        ctl_file_handle.write('      mcmcfile = %s\n' % ctl_para_dict['mcmcfile'])
+        ctl_file_handle.write('       outfile = %s\n' % ctl_para_dict['outfile'])
+        ctl_file_handle.write('         ndata = %s\n' % ctl_para_dict.get('ndata',          1))
+        ctl_file_handle.write('       seqtype = %s\n' % ctl_para_dict['seqtype'])
+        ctl_file_handle.write('       usedata = %s\n' % ctl_para_dict['usedata'])
+        ctl_file_handle.write('         clock = %s\n' % ctl_para_dict['clock'])
+        ctl_file_handle.write('       RootAge = %s\n' % ctl_para_dict.get('RootAge',        '<1.0'))
+        ctl_file_handle.write('         model = %s\n' % ctl_para_dict.get('model',          0))
+        ctl_file_handle.write('         alpha = %s\n' % ctl_para_dict.get('alpha',          0.5))
+        ctl_file_handle.write('         ncatG = %s\n' % ctl_para_dict.get('ncatG',          4))
+        ctl_file_handle.write('     cleandata = %s\n' % ctl_para_dict.get('cleandata',      0))
+        ctl_file_handle.write('       BDparas = %s\n' % ctl_para_dict.get('BDparas',        '1 1 0.1'))
+        ctl_file_handle.write('   kappa_gamma = %s\n' % ctl_para_dict.get('kappa_gamma',    '6 2'))
+        ctl_file_handle.write('   alpha_gamma = %s\n' % ctl_para_dict.get('alpha_gamma',    '1 1'))
+        ctl_file_handle.write('   rgene_gamma = %s\n' % ctl_para_dict.get('rgene_gamma',    '1 50 1'))
+        ctl_file_handle.write('  sigma2_gamma = %s\n' % ctl_para_dict.get('sigma2_gamma',   '1 10 1'))
+        ctl_file_handle.write('      finetune = %s\n' % ctl_para_dict.get('finetune',       '1: .1 .1 .1 .1 .1 .1'))
+        ctl_file_handle.write('         print = %s\n' % ctl_para_dict.get('print',          1))
+        ctl_file_handle.write('        burnin = %s\n' % ctl_para_dict.get('burnin',         50000))
+        ctl_file_handle.write('      sampfreq = %s\n' % ctl_para_dict.get('sampfreq',       5))
+        ctl_file_handle.write('       nsample = %s\n' % ctl_para_dict.get('nsample',        50000))
+# Example settings holding only the keys read without a default by prep_mcmctree_ctl.
+# NOTE(review): the call below runs at import time and writes to a hard-coded
+# path on the author's desktop - confirm whether it belongs under a
+# __main__ guard.
+mcmctree_ctl_dict = {'seqfile' : 'concatenated.phy',
+                     'treefile': 'deltall75_pa75_rooted_with_calibrations.nwk',
+                     'mcmcfile': 'mcmc.txt',
+                     'outfile' : 'DateArTree_out.txt',
+                     'seqtype' : 2,
+                     'usedata' : 3,
+                     'clock'   : 3}
+prep_mcmctree_ctl(mcmctree_ctl_dict, '/Users/songweizhi/Desktop/aaa.txt')
+def get_parameter_combinations(para_to_test_dict):
+    para_lol_name = []
+    para_lol_value = []
+    para_lol_name_with_value = []
+    for each_para in sorted(list(para_to_test_dict.keys())):
+        para_setting_list_name = []
+        para_setting_list_value = []
+        para_setting_list_name_with_value = []
+        for each_setting in sorted(para_to_test_dict[each_para]):
+            name_str = ('%s%s' % (each_para, each_setting)).replace(' ', '_')
+            para_setting_list_name.append(each_para)
+            para_setting_list_value.append(each_setting)
+            para_setting_list_name_with_value.append(name_str)
+        para_lol_name.append(para_setting_list_name)
+        para_lol_value.append(para_setting_list_value)
+        para_lol_name_with_value.append(para_setting_list_name_with_value)
+    all_combination_list_name = [p for p in itertools.product(*para_lol_name)]
+    all_combination_list_value = [p for p in itertools.product(*para_lol_value)]
+    all_combination_list_name_with_value = [p for p in itertools.product(*para_lol_name_with_value)]
+    all_combination_list_name_with_value_str = ['_'.join(i) for i in all_combination_list_name_with_value]
+    para_dod = dict()
+    element_index = 0
+    for each_combination in all_combination_list_name_with_value_str:
+        current_name_list   = all_combination_list_name[element_index]
+        current_value_list  = all_combination_list_value[element_index]
+        current_para_dict = dict()
+        for key, value in zip(current_name_list, current_value_list):
+            current_para_dict[key] = value
+        para_dod[each_combination] = current_para_dict
+        element_index += 1
+    return para_dod
+# Demo: four parameters with two settings each, i.e. 16 combinations.
+# NOTE(review): this runs and prints at import time - confirm it is intended.
+para_to_test_dict = {'clock': [2, 3], 'nsample': [20000, 50000], 'model': [0, 4], 'kappa_gamma': ['6 2', '5 1']}
+para_dod = get_parameter_combinations(para_to_test_dict)
+print(para_dod)
+# all_combination_list_in_str = ['_'.join(i) for i in all_combination_list]
+# print(all_combination_list_in_str)
+# print(len(all_combination_list_in_str))

TreeSAK/print_leaves.py ADDED Viewed

@@ -0,0 +1,32 @@
+import argparse
+from ete3 import Tree
+# Help text with an example command line for the print_leaves sub-command.
+print_leaves_usage = '''
+======= print_leaves example commands =======
+TreeSAK print_leaves -i in.tree
+=============================================
+'''
+def print_leaves(args):
+    tree_file_in = args['i']
+    leaf_list = []
+    for leaf in Tree(tree_file_in, format=1):
+        leaf_name = leaf.name
+        leaf_list.append(leaf_name)
+    print('\n'.join(sorted(leaf_list)))
+if __name__ == '__main__':
+    # initialize the options parser
+    parser = argparse.ArgumentParser()
+    parser.add_argument('-i',      required=True,   help='input tree file')
+    args = vars(parser.parse_args())
+    print_leaves(args)