PyPI - treesak - Versions diffs - 1.51.2__py3-none-any.whl - Mend

treesak 1.51.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of treesak might be problematic. Click here for more details.

Files changed (125) hide show

TreeSAK/ALE.py +63 -0
TreeSAK/ALE1.py +268 -0
TreeSAK/ALE2.py +168 -0
TreeSAK/ALE2RTC.py +30 -0
TreeSAK/ALE3.py +205 -0
TreeSAK/ALE4.py +636 -0
TreeSAK/ALE5.py +210 -0
TreeSAK/ALE6.py +401 -0
TreeSAK/ALE7.py +126 -0
TreeSAK/ALE_backup.py +1081 -0
TreeSAK/AssessCVG.py +128 -0
TreeSAK/AssessMarker.py +306 -0
TreeSAK/AssessMarkerDeltaLL.py +257 -0
TreeSAK/AssessMarkerPA.py +317 -0
TreeSAK/AssessPB.py +130 -0
TreeSAK/BMGE.jar +0 -0
TreeSAK/BMGE.py +49 -0
TreeSAK/CompareMCMC.py +138 -0
TreeSAK/ConcateMSA.py +111 -0
TreeSAK/ConvertMSA.py +135 -0
TreeSAK/Dir.rb +82 -0
TreeSAK/ExtractMarkerSeq.py +263 -0
TreeSAK/FastRoot.py +1175 -0
TreeSAK/FastRoot_backup.py +1122 -0
TreeSAK/FigTree.py +34 -0
TreeSAK/GTDB_tree.py +76 -0
TreeSAK/GeneTree.py +142 -0
TreeSAK/KEGG_Luo17.py +807 -0
TreeSAK/LcaToLeaves.py +66 -0
TreeSAK/MarkerRef2Tree.py +616 -0
TreeSAK/MarkerRef2Tree_backup.py +628 -0
TreeSAK/MarkerSeq2Tree.py +290 -0
TreeSAK/MarkerSeq2Tree_backup.py +259 -0
TreeSAK/ModifyTopo.py +116 -0
TreeSAK/Newick_tree_plotter.py +79 -0
TreeSAK/OMA.py +170 -0
TreeSAK/OMA2.py +212 -0
TreeSAK/OneLineAln.py +50 -0
TreeSAK/PB.py +155 -0
TreeSAK/PMSF.py +106 -0
TreeSAK/PhyloBiAssoc.R +84 -0
TreeSAK/PhyloBiAssoc.py +167 -0
TreeSAK/PlotMCMC.py +41 -0
TreeSAK/PlotMcmcNode.py +152 -0
TreeSAK/PlotMcmcNode_old.py +252 -0
TreeSAK/RootTree.py +101 -0
TreeSAK/RootTreeGTDB214.py +288 -0
TreeSAK/RootTreeGTDB220.py +300 -0
TreeSAK/RootTreeGTDB226.py +300 -0
TreeSAK/SequentialDating.py +16 -0
TreeSAK/SingleAleHGT.py +157 -0
TreeSAK/SingleLinePhy.py +50 -0
TreeSAK/SliceMSA.py +142 -0
TreeSAK/SplitScore.py +19 -0
TreeSAK/SplitScore1.py +178 -0
TreeSAK/SplitScore1OMA.py +148 -0
TreeSAK/SplitScore2.py +597 -0
TreeSAK/TaxaCountStats.R +256 -0
TreeSAK/TaxonTree.py +47 -0
TreeSAK/TreeSAK_config.py +32 -0
TreeSAK/VERSION +158 -0
TreeSAK/VisHPD95.R +45 -0
TreeSAK/VisHPD95.py +200 -0
TreeSAK/__init__.py +0 -0
TreeSAK/ale_parser.py +74 -0
TreeSAK/ale_splitter.py +63 -0
TreeSAK/alignment_pruner.pl +1471 -0
TreeSAK/assessOG.py +45 -0
TreeSAK/catfasta2phy.py +140 -0
TreeSAK/cogTree.py +185 -0
TreeSAK/compare_trees.R +30 -0
TreeSAK/compare_trees.py +255 -0
TreeSAK/dating.py +264 -0
TreeSAK/dating_ss.py +361 -0
TreeSAK/deltall.py +82 -0
TreeSAK/do_rrtc.rb +464 -0
TreeSAK/fa2phy.py +42 -0
TreeSAK/format_leaf_name.py +70 -0
TreeSAK/gap_stats.py +38 -0
TreeSAK/get_SCG_tree.py +742 -0
TreeSAK/get_arCOG_seq.py +97 -0
TreeSAK/global_functions.py +222 -0
TreeSAK/gnm_leaves.py +43 -0
TreeSAK/iTOL.py +791 -0
TreeSAK/iTOL_gene_tree.py +80 -0
TreeSAK/itol_msa_stats.py +56 -0
TreeSAK/keep_highest_rrtc.py +37 -0
TreeSAK/koTree.py +194 -0
TreeSAK/label_tree.R +75 -0
TreeSAK/label_tree.py +121 -0
TreeSAK/mad.py +708 -0
TreeSAK/mcmc2tree.py +58 -0
TreeSAK/mcmcTC copy.py +92 -0
TreeSAK/mcmcTC.py +104 -0
TreeSAK/mcmctree_vs_reltime.R +44 -0
TreeSAK/mcmctree_vs_reltime.py +252 -0
TreeSAK/merge_pdf.py +32 -0
TreeSAK/pRTC.py +56 -0
TreeSAK/parse_mcmctree.py +198 -0
TreeSAK/parse_reltime.py +141 -0
TreeSAK/phy2fa.py +37 -0
TreeSAK/plot_distruibution_th.py +165 -0
TreeSAK/prep_mcmctree_ctl.py +92 -0
TreeSAK/print_leaves.py +32 -0
TreeSAK/pruneMSA.py +63 -0
TreeSAK/recode.py +73 -0
TreeSAK/remove_bias.R +112 -0
TreeSAK/rename_leaves.py +77 -0
TreeSAK/replace_clade.py +55 -0
TreeSAK/root_with_out_group.py +84 -0
TreeSAK/run_TaxaCountStats_R_s1.py +455 -0
TreeSAK/subsample_drep_gnms.py +74 -0
TreeSAK/subset.py +69 -0
TreeSAK/subset_tree_stupid_old_way.py +193 -0
TreeSAK/supertree.py +330 -0
TreeSAK/tmp_1.py +19 -0
TreeSAK/tmp_2.py +19 -0
TreeSAK/tmp_3.py +120 -0
TreeSAK/weighted_rand.rb +23 -0
treesak-1.51.2.data/scripts/TreeSAK +950 -0
treesak-1.51.2.dist-info/LICENSE +674 -0
treesak-1.51.2.dist-info/METADATA +27 -0
treesak-1.51.2.dist-info/RECORD +125 -0
treesak-1.51.2.dist-info/WHEEL +5 -0
treesak-1.51.2.dist-info/top_level.txt +1 -0

TreeSAK/AssessPB.py ADDED Viewed

@@ -0,0 +1,130 @@
+import os
+import argparse
+AssessPB_usage = '''
+====================== AssessPB example commands ======================
+# Dependency: bpcomp and tracecomp (from PhyloBayes-MPI)
+export OMPI_MCA_btl=^openib
+TreeSAK AssessPB -c1 c1dir/c1 -c2 c2dir/c2
+TreeSAK AssessPB -c1 c1dir/c1 -c2 c2dir/c2 -c3 c3dir/c3
+TreeSAK AssessPB -c1 c1dir/c1 -c2 c2dir/c2 -c3 c3dir/c3 -c4 c4dir/c4
+TreeSAK AssessPB -cdir chain_dir
+# This is a wrapper for:
+bpcomp -x 1000 10 c1 c2
+bpcomp -x 1000 10 c1 c2 c3 c4
+tracecomp -x 1000 c1 c2
+tracecomp -x 1000 c1 c2 c3 c4
+=======================================================================
+'''
def compare2chains(chain_1, chain_2, chain_3, chain_4, burn_in, sample_interval, with_bpcomp, with_tracecomp, op_dir, cmd_txt):
    """Build, log and optionally run bpcomp/tracecomp for two to four chains.

    Args:
        chain_1, chain_2: chain prefixes, always compared.
        chain_3, chain_4: optional extra chain prefixes; any that is None is skipped.
        burn_in: value passed after ``-x`` to both programs.
        sample_interval: sampling interval passed to bpcomp only.
        with_bpcomp: run bpcomp when True.
        with_tracecomp: run tracecomp when True.
        op_dir: output directory; results use the prefixes
            ``<op_dir>/bpcomp`` and ``<op_dir>/tracecomp``.
        cmd_txt: file the two command lines are appended to.
    """
    import shlex  # local import: only needed to quote shell arguments

    # Keep every chain that was actually supplied. The previous if-cascade
    # silently dropped chain_4 whenever chain_3 was missing.
    provided_chains = [c for c in (chain_1, chain_2, chain_3, chain_4) if c is not None]
    # Quote so that paths containing spaces/metacharacters survive os.system().
    chains_str = ' '.join(shlex.quote(str(c)) for c in provided_chains)

    # bpcomp:    -x <burnin> [<every> <until>]. default burnin = 10 percent of the chain
    # tracecomp: -x <burnin> [<every> <until>]. default burnin = 20 percent of the chain
    bpcomp_cmd = 'bpcomp -o %s -x %s %s %s' % (shlex.quote('%s/bpcomp' % op_dir), burn_in, sample_interval, chains_str)
    tracecomp_cmd = 'tracecomp -o %s -x %s %s' % (shlex.quote('%s/tracecomp' % op_dir), burn_in, chains_str)

    # write out commands (appended, so repeated calls accumulate)
    with open(cmd_txt, 'a') as cmd_txt_handle:
        cmd_txt_handle.write(bpcomp_cmd + '\n')
        cmd_txt_handle.write(tracecomp_cmd + '\n')

    # execute commands
    if with_bpcomp is True:
        print()
        print('====================== bpcomp ======================')
        print()
        print(bpcomp_cmd)
        os.system(bpcomp_cmd)
        print('Guideline')
        print('1. maxdiff < 0.1: good run.')
        print('2. maxdiff < 0.3: acceptable: gives a good qualitative picture of the posterior consensus.')
        print('3. 0.3 < maxdiff < 1: the sample is not yet sufficiently large and have not converged, but on right track.')
        print('4. if maxdiff = 1 even after 10,000 points: at least one run stuck in a local maximum.')
        print()

    if with_tracecomp is True:
        print('==================== tracecomp ====================')
        print()
        print(tracecomp_cmd)
        print()
        os.system(tracecomp_cmd)
        print()
        print('Guideline')
        print('1. rel diff < 0.1 and minimum effective size > 300: good run.')
        print('2. rel diff < 0.3 and minimum effective size > 50: acceptable run.')
        print()

    print('====================================================')
def AssessPB(args):
    """Entry point of the AssessPB command.

    Prepares the output directory and, when chains are given through
    -c1/-c2 (optionally -c3/-c4), hands them to compare2chains().
    Comparing the chains of a folder (-cdir) is not implemented yet.

    Args:
        args: dict with keys 'c1', 'c2', 'c3', 'c4', 'cdir', 'bi', 'si',
            'o' (output directory) and 'f' (force overwrite).
    """
    import shutil  # local import: used only to remove an existing output dir

    chain_1         = args['c1']
    chain_2         = args['c2']
    chain_3         = args['c3']
    chain_4         = args['c4']
    chain_dir       = args['cdir']
    burn_in         = args['bi']
    sample_interval = args['si']
    op_dir          = args['o']
    force_overwrite = args['f']
    with_bpcomp     = True
    with_tracecomp  = True
    cmd_txt         = '%s/cmds.txt' % op_dir

    # create output dir; done through the Python API rather than `rm -r`/`mkdir`
    # shell strings so that paths with spaces or shell metacharacters are safe
    # and failures raise instead of being ignored
    if os.path.isdir(op_dir) is True:
        if force_overwrite is True:
            shutil.rmtree(op_dir)
        else:
            print('output folder already exist, program exited!')
            exit()
    os.makedirs(op_dir)

    if (chain_1 is not None) and (chain_2 is not None) and (chain_dir is None):
        compare2chains(chain_1, chain_2, chain_3, chain_4, burn_in, sample_interval, with_bpcomp, with_tracecomp, op_dir, cmd_txt)
    elif (chain_1 is None) and (chain_2 is None) and (chain_dir is not None):
        print('Compare multiple chains')
        print('Function to be added!')
        print('Program exited!')
    else:
        print('Please compare either no more than four chains (specified by -c1, -c2, -c3 and -c4) or multiple chains provided within -cdir')
        print('Program exited!')
        exit()
+if __name__ == '__main__':
+    AssessPB_parser = argparse.ArgumentParser()
+    AssessPB_parser.add_argument('-c1',     required=False, default=None,           help='chain 1')
+    AssessPB_parser.add_argument('-c2',     required=False, default=None,           help='chain 2')
+    AssessPB_parser.add_argument('-c3',     required=False, default=None,           help='chain 3')
+    AssessPB_parser.add_argument('-c4',     required=False, default=None,           help='chain 4')
+    AssessPB_parser.add_argument('-cdir',   required=False, default=None,           help='chain folder')
+    AssessPB_parser.add_argument('-bi',     required=False, default=1000,           help='burn-in, default: 1000')
+    AssessPB_parser.add_argument('-si',     required=False, default=10,             help='sample interval, default: 10')
+    AssessPB_parser.add_argument('-o',      required=True, default=None,            help='output directory')
+    AssessPB_parser.add_argument('-f',      required=False, action="store_true",    help='force overwrite')
+    args = vars(AssessPB_parser.parse_args())
+    AssessPB(args)

TreeSAK/BMGE.jar ADDED Viewed

Binary file

TreeSAK/BMGE.py ADDED Viewed

@@ -0,0 +1,49 @@
+import os
+import argparse
+BMGE_usage = '''
+======================= BMGE example commands =======================
+# require: java
+TreeSAK BMGE -p demo -i input.aln -m BLOSUM30 -esc 0.55
+# Settings for calculating split score (Nina Dombrowski):
+# -t AA -m BLOSUM30 -h 0.55
+=====================================================================
+'''
def BMGE(args):
    """Trim a protein MSA with the BMGE.jar shipped next to this module.

    Args:
        args: dict with keys 'p' (output prefix), 'i' (input MSA),
            'm' (trim model, e.g. BLOSUM30) and 'esc' (entropy score cutoff).

    The trimmed alignment is written to ``<prefix>.BMGE.fasta``.
    """
    import shlex  # local import: only needed to quote shell arguments

    op_prefix               = args['p']
    msa_in                  = args['i']
    trim_model              = args['m']
    entropy_score_cutoff    = args['esc']

    # define file name
    msa_out_fasta = '%s.BMGE.fasta' % op_prefix

    # specify path to BMGE.jar (located in the same folder as this file)
    pwd_bmge_jar = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'BMGE.jar')

    # run BMGE; every argument is quoted so paths with spaces keep working
    cmd_args = (pwd_bmge_jar, msa_in, trim_model, entropy_score_cutoff, msa_out_fasta)
    bmge_cmd = 'java -jar %s -i %s -m %s -t AA -h %s -of %s' % tuple(shlex.quote(str(i)) for i in cmd_args)
    print(bmge_cmd)
    exit_status = os.system(bmge_cmd)

    # do not claim success when java/BMGE reported a failure
    if exit_status != 0:
        print('BMGE failed (exit status: %s), please check the messages above!' % exit_status)
        return

    print('Done!')
+if __name__ == '__main__':
+    BMGE_parser = argparse.ArgumentParser()
+    BMGE_parser.add_argument('-p',   required=True,                         help='output prefix')
+    BMGE_parser.add_argument('-i',   required=True,                         help='input MSA')
+    BMGE_parser.add_argument('-m',   required=False, default='BLOSUM30',    help='trim model, default: BLOSUM30')
+    BMGE_parser.add_argument('-esc', required=False, default='0.55',        help='entropy score cutoff, default: 0.55')
+    args = vars(BMGE_parser.parse_args())
+    BMGE(args)

TreeSAK/CompareMCMC.py ADDED Viewed

@@ -0,0 +1,138 @@
+import os
+import argparse
+import arviz as az
+import pandas as pd
+import matplotlib as mpl
+mpl.use('Agg')
+import matplotlib.pyplot as plt
+from matplotlib.pyplot import figure
+CompareMCMC_usage = '''
+====================================== CompareMCMC example commands ======================================
+TreeSAK CompareMCMC -mx IR_mcmc.txt -my AR_mcmc.txt -lx IR -ly AR -o convergence_plot.png -max 40 -fs 12
+cd /Users/songweizhi/Desktop
+TreeSAK CompareMCMC -mx /Users/songweizhi/Desktop/Sponge_r220/6_dating/MCMCTree/dating_outputs/topo2p10_clock3_nsample250000_run1_mcmc.txt -my /Users/songweizhi/Desktop/Sponge_r220/6_dating/MCMCTree/dating_outputs/topo2p10_clock3_nsample250000_run2_mcmc.txt -lx IR -ly AR -o convergence_plot.png -max 40 -fs 12
+==========================================================================================================
+'''
def sep_path_basename_ext(file_in):
    """Split a path into (folder, basename without extension, extension).

    The folder is reported as '.' when file_in has no directory part.
    """
    folder, leaf = os.path.split(file_in)
    stem, suffix = os.path.splitext(leaf)
    return (folder if folder != '' else '.'), stem, suffix
def CompareMCMC(args):
    """Plot per-column means and 95% HDIs of two mcmc.txt files against each other.

    Args:
        args: dict with keys 'mx'/'my' (mcmc.txt for the x/y axis),
            'lx'/'ly' (axis labels; default to the input file basenames),
            'o' (output plot), 'max' (upper axis limit or None) and
            'fs' (font size of tick and axis labels).

    Columns named 'mu', 'sigma2' and 'lnL' are not plotted. Every other
    column of the x file must also be present in the y file (KeyError otherwise).
    """
    mcmc_txt_x      = args['mx']
    mcmc_txt_y      = args['my']
    label_x         = args['lx']
    label_y         = args['ly']
    pwd_figure      = args['o']
    max_axis_value  = args['max']
    label_fs        = args['fs']

    # fall back to the file basenames when no labels were given
    if label_x is None:
        label_x = sep_path_basename_ext(mcmc_txt_x)[1]
    if label_y is None:
        label_y = sep_path_basename_ext(mcmc_txt_y)[1]

    # read in dataframe
    df_x = pd.read_table(mcmc_txt_x, index_col=0)
    df_y = pd.read_table(mcmc_txt_y, index_col=0)

    # get Mean value for each column
    # (.items() replaces .iteritems(), which was removed in pandas 2.0)
    df_x_col_to_mean_dict = dict(df_x.mean().items())
    df_y_col_to_mean_dict = dict(df_y.mean().items())

    skipped_cols = ('mu', 'sigma2', 'lnL')

    num_list_x = []
    num_list_y = []
    x_err_l = []
    x_err_r = []
    y_err_l = []
    y_err_u = []
    max_value = 0
    min_value = 100000000000000
    for col_name, col in df_x.items():
        if col_name in skipped_cols:
            continue

        x_value = df_x_col_to_mean_dict[col_name]
        y_value = df_y_col_to_mean_dict[col_name]

        # 95% HDI, computed only for the columns that are actually plotted
        x_range = az.hdi(col.values, hdi_prob=0.95)
        y_range = az.hdi(df_y[col_name].values, hdi_prob=0.95)

        num_list_x.append(x_value)
        num_list_y.append(y_value)

        # errorbar() expects distances from the point, not absolute bounds
        x_err_l.append(abs(x_value - x_range[0]))
        x_err_r.append(abs(x_range[1] - x_value))
        y_err_l.append(abs(y_value - y_range[0]))
        y_err_u.append(abs(y_range[1] - y_value))

        # track the overall value range, used for the diagonal guide line
        max_value = max(max_value, x_value, y_value, x_range[0], x_range[1], y_range[0], y_range[1])
        min_value = min(min_value, x_value, y_value, x_range[0], x_range[1], y_range[0], y_range[1])

    figure(figsize=(6, 6), dpi=300)
    plt.plot([min_value, max_value], [min_value, max_value], color='black', linestyle='dashed', linewidth=1, alpha=0.5)
    plt.scatter(num_list_x, num_list_y, s=0)
    plt.errorbar(num_list_x, num_list_y, xerr=[x_err_l, x_err_r], yerr=[y_err_l, y_err_u],
                 ls='none', ecolor='skyblue', elinewidth=1, alpha=0.5)

    if max_axis_value is not None:
        plt.xlim([0, max_axis_value])
        plt.ylim([0, max_axis_value])

    # Set the font size of xticks and yticks
    plt.xticks(fontsize=label_fs)
    plt.yticks(fontsize=label_fs)
    plt.xlabel(label_x, fontsize=label_fs)
    plt.ylabel(label_y, fontsize=label_fs)

    # write out
    plt.tight_layout()
    plt.savefig(pwd_figure)
    plt.close()
    print('Plot exported to %s, done!' % pwd_figure)
+if __name__ == '__main__':
+    # initialize the options parser
+    parser = argparse.ArgumentParser()
+    parser.add_argument('-mx',      required=True,                          help='mcmc.txt for x axis')
+    parser.add_argument('-my',      required=True,                          help='mcmc.txt for y axis')
+    parser.add_argument('-lx',      required=False, default=None,           help='label for x axis')
+    parser.add_argument('-ly',      required=False, default=None,           help='label for y axis')
+    parser.add_argument('-max',     required=False, default=None, type=int, help='maximum axis value')
+    parser.add_argument('-fs',      required=False, default=16, type=int,   help='label font size, default: 16')
+    parser.add_argument('-o',       required=True,                          help='output plot')
+    args = vars(parser.parse_args())
+    CompareMCMC(args)

TreeSAK/ConcateMSA.py ADDED Viewed

@@ -0,0 +1,111 @@
+import os
+import glob
+import argparse
+from Bio import SeqIO
+from Bio import AlignIO
+ConcateMSA_usage = '''
+================= ConcateMSA example commands =================
+TreeSAK ConcateMSA -i aln -x aln -p concatenated -gene2gnm
+# output file include:
+concatenated.fasta
+concatenated.phylip
+concatenated.partition.txt
+===============================================================
+'''
def ConcateMSA(args):
    """Concatenate all MSAs (fasta) found in a folder into one alignment.

    Args:
        args: dict with keys 'i' (MSA folder), 'x' (MSA file extension),
            'p' (output prefix) and 'gene2gnm' (when True, sequence ids are
            turned into genome ids by dropping everything after the last '_').

    Outputs ``<prefix>.fasta``, ``<prefix>.phylip`` and
    ``<prefix>.partition.txt``. A genome missing from an MSA is padded with
    gaps over the length of that MSA.
    """
    msa_dir                 = args['i']
    msa_ext                 = args['x']
    op_prefix               = args['p']
    gene2gnm                = args['gene2gnm']

    concatenated_msa_phy    = '%s.phylip'           % op_prefix
    concatenated_msa_fasta  = '%s.fasta'            % op_prefix
    partition_file          = '%s.partition.txt'    % op_prefix

    msa_file_re             = '%s/*.%s'             % (msa_dir, msa_ext)
    msa_file_list_sorted    = sorted(os.path.basename(file_name) for file_name in glob.glob(msa_file_re))

    if len(msa_file_list_sorted) == 0:
        print('No MSA file found with %s, program exited!' % msa_file_re)
        exit()

    # read every MSA once; dicts are keyed by file name, which is unique
    # within the folder (an id derived by splitting on the extension is not)
    msa_to_seq_dict = dict()
    msa_len_dict = dict()
    complete_gnm_set = set()
    for each_msa_file in msa_file_list_sorted:
        pwd_current_msa = '%s/%s' % (msa_dir, each_msa_file)
        current_msa_seq_dict = dict()
        current_msa_len_set = set()
        for each_seq in SeqIO.parse(pwd_current_msa, 'fasta'):
            seq_id = each_seq.id
            if gene2gnm is True:
                seq_id = '_'.join(seq_id.split('_')[:-1])
            current_msa_seq_dict[seq_id] = str(each_seq.seq)
            current_msa_len_set.add(len(each_seq.seq))

        if len(current_msa_len_set) == 0:
            print('No sequence found in %s, program exited!' % each_msa_file)
            exit()
        if len(current_msa_len_set) != 1:
            print('Sequences with different length were found in %s, program exited!' % each_msa_file)
            exit()

        msa_to_seq_dict[each_msa_file] = current_msa_seq_dict
        msa_len_dict[each_msa_file] = current_msa_len_set.pop()
        complete_gnm_set.update(current_msa_seq_dict)

    complete_gnm_list_sorted = sorted(complete_gnm_set)

    # write out concatenated msa, absent genomes are filled with gaps
    with open(concatenated_msa_fasta, 'w') as concatenated_msa_handle:
        for each_gnm in complete_gnm_list_sorted:
            seq_parts = [msa_to_seq_dict[m].get(each_gnm, msa_len_dict[m] * '-') for m in msa_file_list_sorted]
            concatenated_msa_handle.write('>%s\n' % each_gnm)
            concatenated_msa_handle.write('%s\n' % ''.join(seq_parts))

    # write out partition file (1-based, inclusive coordinates)
    end_pos = 0
    with open(partition_file, 'w') as partition_file_handle:
        for each_m in msa_file_list_sorted:
            current_m_len = msa_len_dict[each_m]
            partition_file_handle.write('%s = %s-%s\n' % (each_m, (end_pos + 1), (end_pos + current_m_len)))
            end_pos += current_m_len

    # convert msa in fasta to phy
    AlignIO.convert(concatenated_msa_fasta, 'fasta', concatenated_msa_phy, 'phylip-relaxed')
+if __name__ == '__main__':
+    # initialize the options parser
+    parser = argparse.ArgumentParser()
+    parser.add_argument('-i',           required=True,                       help='input MSA folder')
+    parser.add_argument('-x',           required=False, default='aln',       help='input file extension')
+    parser.add_argument('-p',           required=True,                       help='output prefix')
+    parser.add_argument('-gene2gnm',    required=False, action="store_true", help='gene id to gnm id, split sequence id before the last _')
+    args = vars(parser.parse_args())
+    ConcateMSA(args)

TreeSAK/ConvertMSA.py ADDED Viewed

@@ -0,0 +1,135 @@
+import os
+import glob
+import argparse
+from Bio import SeqIO
+from Bio import AlignIO
+ConvertMSA_usage = '''
+================================= ConvertMSA example commands =================================
+# phylip to fasta
+TreeSAK ConvertMSA -i concatenated.phy -fi phylip-relaxed -o concatenated.fasta -fo fasta
+TreeSAK ConvertMSA -i phy_files -fi phylip-relaxed -xi phy -o MSA_in_fasta -fo fasta -xo fa
+# examples of alignment format (https://biopython.org/wiki/AlignIO):
+fasta, phylip, phylip-relaxed, phylip-sequential, clustal
+===============================================================================================
+'''
def sep_path_basename_ext(file_in):
    """Return (folder, file basename, file extension) for file_in.

    '.' is used as folder when file_in carries no directory component.
    """
    # separate path and file name, then basename and extension
    folder, leaf = os.path.split(file_in)
    stem, suffix = os.path.splitext(leaf)
    if folder == '':
        return '.', stem, suffix
    return folder, stem, suffix
def _convert_single_msa(aln_in, aln_in_format, aln_out, aln_out_format, one_line, no_gap):
    """Convert one alignment file, optionally rewriting it as one-line/ungapped fasta."""
    if not (one_line or no_gap):
        AlignIO.convert(aln_in, aln_in_format, aln_out, aln_out_format)
        return

    # convert to a temporary fasta first, then rewrite it with one line per sequence
    aln_out_tmp = aln_out + '.tmp'
    try:
        AlignIO.convert(aln_in, aln_in_format, aln_out_tmp, aln_out_format)
        with open(aln_out, 'w') as aln_out_handle:
            for each_seq in SeqIO.parse(aln_out_tmp, 'fasta'):
                seq_sequence = str(each_seq.seq)
                if no_gap:
                    seq_sequence = seq_sequence.replace('-', '')
                    # a gap-only sequence would become an empty fasta record, skip it
                    if len(seq_sequence) == 0:
                        continue
                aln_out_handle.write('>%s\n' % each_seq.id)
                aln_out_handle.write('%s\n' % seq_sequence)
    finally:
        # never leave the temporary file behind, even when the conversion failed
        if os.path.isfile(aln_out_tmp):
            os.remove(aln_out_tmp)


def ConvertMSA(args):
    """Convert an alignment file, or every alignment in a folder, to another format.

    Args:
        args: dict with keys 'i' (input file or folder), 'xi' (input extension,
            folder mode only), 'fi' (input format), 'o' (output file or folder),
            'xo' (output extension, folder mode only), 'fo' (output format),
            'oneline' and 'nogap' (fasta output only) and 'f' (overwrite an
            existing output folder).
    """
    import shutil  # local import: used only to remove an existing output folder

    aln_in            = args['i']
    aln_in_ext        = args['xi']
    aln_in_format     = args['fi']
    aln_out           = args['o']
    aln_out_ext       = args['xo']
    aln_out_format    = args['fo']
    one_line          = args['oneline']
    no_gap            = args['nogap']
    force_overwriting = args['f']

    if ((one_line is True) or (no_gap is True)) and (aln_out_format != 'fasta'):
        print('Please provide "-oneline" and/or "-nogap" only if "-fo" is fasta')
        exit()

    if os.path.isfile(aln_in) is True:
        _convert_single_msa(aln_in, aln_in_format, aln_out, aln_out_format, one_line, no_gap)
        print('Done!')

    elif os.path.isdir(aln_in) is True:
        aln_in_re = '%s/*.%s' % (aln_in, aln_in_ext)
        aln_in_list = [os.path.basename(file_name) for file_name in glob.glob(aln_in_re)]

        # check input
        if len(aln_in_list) == 0:
            print('Input file not detected, program exited!')
            exit()

        # check output folder
        if os.path.isdir(aln_out) is True:
            if force_overwriting is True:
                shutil.rmtree(aln_out)
            else:
                print('Output folder already exist, program exited!')
                exit()
        os.makedirs(aln_out)

        # convert
        for each_aln_in in aln_in_list:
            aln_in_basename = os.path.splitext(each_aln_in)[0]
            pwd_aln_in  = '%s/%s'    % (aln_in, each_aln_in)
            pwd_aln_out = '%s/%s.%s' % (aln_out, aln_in_basename, aln_out_ext)
            _convert_single_msa(pwd_aln_in, aln_in_format, pwd_aln_out, aln_out_format, one_line, no_gap)
        print('Done!')

    else:
        print('Input file not found, program exited!')
+if __name__ == '__main__':
+    # initialize the options parser
+    parser = argparse.ArgumentParser()
+    parser.add_argument('-i',       required=True,                       help='input alignment')
+    parser.add_argument('-xi',      required=False, default='aln',       help='input alignment extension')
+    parser.add_argument('-fi',      required=True,                       help='input alignment format, e.g., fasta, phylip')
+    parser.add_argument('-o',       required=True,                       help='output alignment')
+    parser.add_argument('-xo',      required=False, default='aln',       help='output alignment extension')
+    parser.add_argument('-fo',      required=True,                       help='output alignment format, e.g., fasta, phylip')
+    parser.add_argument('-oneline', required=False, action="store_true", help='put sequence in single line, available if -fo is fasta')
+    parser.add_argument('-nogap',   required=False, action="store_true", help='remove gaps from alignment, available if -fo is fasta')
+    parser.add_argument('-f',       required=False, action="store_true", help='force overwrite existing output folder')
+    args = vars(parser.parse_args())
+    ConvertMSA(args)

TreeSAK/Dir.rb ADDED Viewed

@@ -0,0 +1,82 @@
+require 'find'
+################################################################################
+class Dir
+    def self.mkdirs(path)
+        if(!File.directory?(path))
+            if(!mkdirs(File.dirname(path)))
+                return false;
+            end
+            mkdir(path)
+        end
+        return true
+    end
+end
+################################################################################
+def mkdir_with_force(outdir, is_force=false, is_tolerate=false)
+  if outdir.class != String
+    raise "outdir wrong? Exiting ......"
+  end
+  if ! Dir.exists?(outdir)
+    `mkdir -p #{outdir}`
+  else
+    if is_tolerate
+      ;
+    elsif is_force
+      `rm -rf #{outdir}`
+      `mkdir -p #{outdir}`
+    else
+      raise "The outdir #{outdir} has already existed!"
+    end
+  end
+end
+def read_infiles(indir, suffix='', is_all_subfolder=false)
+  infiles = Array.new
+  if ! is_all_subfolder
+    Dir.foreach(indir) do |b|
+      next if b =~ /^\./
+      if suffix.is_a?(String)
+        if suffix != ''
+          next if b !~ /#{suffix}$/
+        end
+      elsif suffix.is_a?(Array)
+        next unless suffix.any?{|i| b =~ /#{i}$/ }
+      end
+      infiles << File.join(indir, b)
+    end
+  else
+    Find.find(indir) do |path|
+      next if File.directory?(path)
+      next if File.basename(path) =~ /^\./
+      infiles << path if suffix.is_a?(String) ? path =~ /\.#{suffix}$/ : suffix.any?{|i| path =~ /#{i}$/ }
+    end
+  end
+  return(infiles)
+end
# Return the entries of +indir+ (as listed by read_infiles) whose extension,
# as reported by File.extname (leading dot included), is contained in +suffices+.
def getFilesBySuffices(indir, suffices)
  read_infiles(indir).select do |infile|
    suffices.include?(File.extname(infile))
  end
end
+def get_file_path(file)
+  path = File.symlink?(file) ? File.readlink(file) : file
+  return(path)
+end
+################################################################################