PyPI - treesak - Versions diffs - 1.53.3__py3-none-any.whl - Mend

treesak 1.53.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (131) hide show

TreeSAK/ALE.py +63 -0
TreeSAK/ALE1.py +268 -0
TreeSAK/ALE2.py +168 -0
TreeSAK/ALE2RTC.py +30 -0
TreeSAK/ALE3.py +205 -0
TreeSAK/ALE4.py +636 -0
TreeSAK/ALE5.py +210 -0
TreeSAK/ALE6.py +401 -0
TreeSAK/ALE7.py +126 -0
TreeSAK/ALE_backup.py +1081 -0
TreeSAK/AssessCVG.py +128 -0
TreeSAK/AssessMarker.py +306 -0
TreeSAK/AssessMarkerDeltaLL.py +257 -0
TreeSAK/AssessMarkerPA.py +317 -0
TreeSAK/AssessPB.py +113 -0
TreeSAK/BMGE.jar +0 -0
TreeSAK/BMGE.py +49 -0
TreeSAK/C60SR4.nex +127 -0
TreeSAK/CompareMCMC.py +138 -0
TreeSAK/ConcateMSA.py +111 -0
TreeSAK/ConvertMSA.py +135 -0
TreeSAK/Dir.rb +82 -0
TreeSAK/ExtractMarkerSeq.py +263 -0
TreeSAK/FastRoot.py +1175 -0
TreeSAK/FastRoot_backup.py +1122 -0
TreeSAK/FigTree.py +34 -0
TreeSAK/GTDB_tree.py +76 -0
TreeSAK/GeneTree.py +142 -0
TreeSAK/KEGG_Luo17.py +807 -0
TreeSAK/LcaToLeaves.py +66 -0
TreeSAK/MarkerRef2Tree.py +616 -0
TreeSAK/MarkerRef2Tree_backup.py +628 -0
TreeSAK/MarkerSeq2Tree.py +299 -0
TreeSAK/MarkerSeq2Tree_backup.py +259 -0
TreeSAK/ModifyTopo.py +116 -0
TreeSAK/Newick_tree_plotter.py +79 -0
TreeSAK/OMA.py +170 -0
TreeSAK/OMA2.py +212 -0
TreeSAK/OneLineAln.py +50 -0
TreeSAK/PB.py +155 -0
TreeSAK/PMSF.py +115 -0
TreeSAK/PhyloBiAssoc.R +84 -0
TreeSAK/PhyloBiAssoc.py +167 -0
TreeSAK/PlotMCMC.py +41 -0
TreeSAK/PlotMcmcNode.py +152 -0
TreeSAK/PlotMcmcNode_old.py +252 -0
TreeSAK/RootTree.py +101 -0
TreeSAK/RootTreeGTDB.py +371 -0
TreeSAK/RootTreeGTDB214.py +288 -0
TreeSAK/RootTreeGTDB220.py +300 -0
TreeSAK/SequentialDating.py +16 -0
TreeSAK/SingleAleHGT.py +157 -0
TreeSAK/SingleLinePhy.py +50 -0
TreeSAK/SliceMSA.py +142 -0
TreeSAK/SplitScore.py +21 -0
TreeSAK/SplitScore1.py +177 -0
TreeSAK/SplitScore1OMA.py +148 -0
TreeSAK/SplitScore2.py +608 -0
TreeSAK/TaxaCountStats.R +256 -0
TreeSAK/TaxonTree.py +47 -0
TreeSAK/TreeSAK_config.py +32 -0
TreeSAK/VERSION +164 -0
TreeSAK/VisHPD95.R +45 -0
TreeSAK/VisHPD95.py +200 -0
TreeSAK/__init__.py +0 -0
TreeSAK/ale_parser.py +74 -0
TreeSAK/ale_splitter.py +63 -0
TreeSAK/alignment_pruner.pl +1471 -0
TreeSAK/assessOG.py +45 -0
TreeSAK/batch_itol.py +171 -0
TreeSAK/catfasta2phy.py +140 -0
TreeSAK/cogTree.py +185 -0
TreeSAK/compare_trees.R +30 -0
TreeSAK/compare_trees.py +255 -0
TreeSAK/dating.py +264 -0
TreeSAK/dating_ss.py +361 -0
TreeSAK/deltall.py +82 -0
TreeSAK/do_rrtc.rb +464 -0
TreeSAK/fa2phy.py +42 -0
TreeSAK/filter_rename_ar53.py +118 -0
TreeSAK/format_leaf_name.py +70 -0
TreeSAK/gap_stats.py +38 -0
TreeSAK/get_SCG_tree.py +742 -0
TreeSAK/get_arCOG_seq.py +97 -0
TreeSAK/global_functions.py +222 -0
TreeSAK/gnm_leaves.py +43 -0
TreeSAK/iTOL.py +791 -0
TreeSAK/iTOL_gene_tree.py +80 -0
TreeSAK/itol_msa_stats.py +56 -0
TreeSAK/keep_highest_rrtc.py +37 -0
TreeSAK/koTree.py +194 -0
TreeSAK/label_gene_tree_by_gnm.py +34 -0
TreeSAK/label_tree.R +75 -0
TreeSAK/label_tree.py +121 -0
TreeSAK/mad.py +708 -0
TreeSAK/mcmc2tree.py +58 -0
TreeSAK/mcmcTC copy.py +92 -0
TreeSAK/mcmcTC.py +104 -0
TreeSAK/mcmctree_vs_reltime.R +44 -0
TreeSAK/mcmctree_vs_reltime.py +252 -0
TreeSAK/merge_pdf.py +32 -0
TreeSAK/pRTC.py +56 -0
TreeSAK/parse_mcmctree.py +198 -0
TreeSAK/parse_reltime.py +141 -0
TreeSAK/phy2fa.py +37 -0
TreeSAK/plot_distruibution_th.py +165 -0
TreeSAK/prep_mcmctree_ctl.py +92 -0
TreeSAK/print_leaves.py +32 -0
TreeSAK/pruneMSA.py +63 -0
TreeSAK/recode.py +73 -0
TreeSAK/remove_bias.R +112 -0
TreeSAK/rename_leaves.py +78 -0
TreeSAK/replace_clade.py +55 -0
TreeSAK/root_with_out_group.py +84 -0
TreeSAK/run_TaxaCountStats_R_s1.py +455 -0
TreeSAK/subsample_drep_gnms.py +74 -0
TreeSAK/subset.py +69 -0
TreeSAK/subset_tree_stupid_old_way.py +193 -0
TreeSAK/supertree.py +330 -0
TreeSAK/tmp_1.py +19 -0
TreeSAK/tmp_2.py +19 -0
TreeSAK/tmp_3.py +120 -0
TreeSAK/tmp_4.py +43 -0
TreeSAK/tmp_5.py +12 -0
TreeSAK/weighted_rand.rb +23 -0
treesak-1.53.3.data/scripts/TreeSAK +955 -0
treesak-1.53.3.dist-info/LICENSE +674 -0
treesak-1.53.3.dist-info/METADATA +27 -0
treesak-1.53.3.dist-info/RECORD +131 -0
treesak-1.53.3.dist-info/WHEEL +5 -0
treesak-1.53.3.dist-info/top_level.txt +1 -0

TreeSAK/get_arCOG_seq.py ADDED Viewed

@@ -0,0 +1,97 @@
+import os
+import argparse
+from Bio import SeqIO
# Usage text shown for the get_arCOG_seq sub-command. The id file is passed
# with `-i`, which is the option the argument parser in this module defines.
get_arCOG_seq_usage = '''
=========================== get_arCOG_seq example commands ===========================
TreeSAK get_arCOG_seq -i cog_id.txt -db_dir /Users/songweizhi/DB/arCOG18 -o op_dir
# required db files
ar18.ar14.02.csv, arCOG_names_220807.txt and ar18.fa
======================================================================================
'''
def get_arCOG_seq(args):
    """Write the sequences of the requested arCOGs into per-arCOG FASTA files.

    Args:
        args: dict with the keys
            'i'      - text file with one arCOG id per line ('ArCOG' prefixes
                       are normalised to 'arCOG'; blank lines are ignored),
            'db_dir' - folder holding ar18.ar14.02.csv, arCOG_names_220807.txt
                       and ar18.fa,
            'o'      - output folder,
            'f'      - when true, an existing output folder is replaced.

    Output:
        <o>/<arCOG id>.fa for every requested arCOG with at least one sequence
        that maps to exactly one requested arCOG, and <o>/metadata.txt with one
        "id<TAB>description fields" line per requested id ('NA' when the id is
        missing from the description file).

    Raises:
        SystemExit: the output folder exists and 'f' is not set.
        OSError: a database or id file cannot be opened.
    """
    # Local imports keep this block self-contained; only `os` and `SeqIO` are
    # imported at module level.
    import shutil
    import sys

    cog_id_txt = args['i']
    db_dir = args['db_dir']
    op_dir = args['o']
    force_create_dir = args['f']

    ar18_ar14_02_csv = '%s/ar18.ar14.02.csv' % db_dir
    cog_des_txt = '%s/arCOG_names_220807.txt' % db_dir
    ar18_fa = '%s/ar18.fa' % db_dir
    cog_metadata_txt = '%s/metadata.txt' % op_dir

    # Prepare the output folder without going through a shell, so paths with
    # spaces or shell metacharacters are handled correctly.
    if os.path.isdir(op_dir):
        if force_create_dir:
            shutil.rmtree(op_dir)
        else:
            print('Output folder already exist, program exited!')
            sys.exit()
    os.makedirs(op_dir)

    # arCOG id -> list of the remaining tab-separated description fields
    cog_des_dict = dict()
    with open(cog_des_txt, encoding="ISO-8859-1") as cog_des_handle:
        for each_cog in cog_des_handle:
            each_cog_split = each_cog.strip().split('\t')
            cog_des_dict[each_cog_split[0]] = each_cog_split[1:]

    # requested arCOG ids
    cog_id_set = set()
    with open(cog_id_txt) as cog_id_handle:
        for each_cog in cog_id_handle:
            cog_id = each_cog.strip().replace('ArCOG', 'arCOG')
            if cog_id:
                cog_id_set.add(cog_id)

    # sequence id (3rd csv column) -> set of requested arCOG ids (7th column)
    seq_to_arcog_dict = dict()
    with open(ar18_ar14_02_csv) as csv_handle:
        for each_line in csv_handle:
            each_line_split = each_line.strip().split(',')
            if len(each_line_split) < 7:
                continue  # blank or truncated row
            arcog_id = each_line_split[6]
            if arcog_id in cog_id_set:
                seq_to_arcog_dict.setdefault(each_line_split[2], set()).add(arcog_id)

    # write out sequence by arCOG; sequences assigned to more than one
    # requested arCOG are skipped, as in the original implementation
    for each_seq in SeqIO.parse(ar18_fa, 'fasta'):
        seq_cog_set = seq_to_arcog_dict.get(each_seq.id)
        if seq_cog_set is None or len(seq_cog_set) != 1:
            continue
        (arcog_id,) = seq_cog_set
        pwd_fa = '%s/%s.fa' % (op_dir, arcog_id)
        # append mode: one file collects every sequence of the same arCOG
        with open(pwd_fa, 'a') as pwd_fa_handle:
            pwd_fa_handle.write('>%s\n' % each_seq.id)
            pwd_fa_handle.write('%s\n' % str(each_seq.seq))

    # write out metadata
    with open(cog_metadata_txt, 'w') as cog_metadata_txt_handle:
        for each_c in sorted(cog_id_set):
            each_c_desc = '\t'.join(cog_des_dict.get(each_c, ['NA']))
            cog_metadata_txt_handle.write('%s\t%s\n' % (each_c, each_c_desc))
if __name__ == '__main__':
    # Stand-alone entry point: parse the command line and hand the options to
    # get_arCOG_seq() as a plain dict keyed by option name.
    parser = argparse.ArgumentParser()
    parser.add_argument('-i',      required=True,                       help='arCOG id file, one id per line')
    parser.add_argument('-db_dir', required=True,                       help='database folder')
    parser.add_argument('-o',      required=True,                       help='output folder')
    parser.add_argument('-f',      required=False, action="store_true", help='force overwrite existing output folder')
    args = vars(parser.parse_args())
    get_arCOG_seq(args)

TreeSAK/global_functions.py ADDED Viewed

@@ -0,0 +1,222 @@
+import os
+import glob
+import shutil
+from Bio import SeqIO
+from Bio import AlignIO
+import matplotlib as mpl
+mpl.use('Agg')
+import matplotlib.pyplot as plt
+time_format = '[%Y-%m-%d %H:%M:%S] '
def is_number(s):
    """Return True when *s* can be converted with float(), otherwise False.

    Values float() cannot accept at all (None, lists, ...) yield False instead
    of raising TypeError. Note that strings such as 'nan' and 'inf' are
    accepted by float() and therefore count as numbers.
    """
    try:
        float(s)
    except (TypeError, ValueError):
        return False
    return True
def force_create_folder(folder_to_create):
    """Create *folder_to_create* as an empty folder, replacing any existing one.

    An existing folder is removed first; removal is attempted up to four times
    with errors ignored (presumably to ride out transient file-system errors -
    the reason is not recorded in this module). Missing parent folders are
    created as well.

    Raises:
        FileExistsError: the old folder could not be removed.
    """
    max_removal_attempts = 4
    for _ in range(max_removal_attempts):
        if not os.path.isdir(folder_to_create):
            break
        shutil.rmtree(folder_to_create, ignore_errors=True)

    # No exist_ok here: if the removal failed we want to hear about it instead
    # of silently reusing a folder that still holds old files.
    os.makedirs(folder_to_create)
def sep_path_basename_ext(file_in):
    """Split a file path into (folder, basename without extension, extension).

    The folder is reported as '.' when *file_in* has no folder component; the
    extension keeps its leading dot and is '' when there is none.
    """
    folder, name_with_ext = os.path.split(file_in)
    stem, extension = os.path.splitext(name_with_ext)
    return ('.' if folder == '' else folder), stem, extension
def get_no_hidden_folder_list(wd):
    """Return the names of the entries in *wd* that do not start with a dot.

    Despite the name, every entry reported by os.listdir() is considered, so
    regular files are included as well as folders. The order is whatever
    os.listdir() yields.
    """
    return [entry for entry in os.listdir(wd) if not entry.startswith('.')]
def unique_list_elements(list_input):
    """Return the elements of *list_input* without duplicates, in first-seen order.

    Hashable elements are de-duplicated through a set, so the common case runs
    in linear time. If any element is unhashable (e.g. a list) the function
    falls back to the equality-based scan, which is quadratic but accepts any
    element type.
    """
    # Materialise once so the fallback can restart even for one-shot iterables.
    elements = list(list_input)

    try:
        seen = set()
        list_output = []
        for each_element in elements:
            if each_element not in seen:
                seen.add(each_element)
                list_output.append(each_element)
        return list_output
    except TypeError:
        # At least one element is unhashable: compare by equality instead.
        list_output = []
        for each_element in elements:
            if each_element not in list_output:
                list_output.append(each_element)
        return list_output
def ctg_depth_and_gbk_to_gene_depth(ctg_depth_file, gbk_file, skip_depth_file_header, gene_depth_file_folder):
    """Assign every CDS in a GenBank file the depth of the contig it sits on.

    Args:
        ctg_depth_file: tab-separated file; column 1 is the contig id and
            column 2 its depth (parsed with float()).
        gbk_file: GenBank file; each record id must appear in the depth file.
        skip_depth_file_header: when True, the first line of the depth file is
            treated as a header and skipped.
        gene_depth_file_folder: folder that receives '<gbk basename>.depth'.

    Output:
        A tab-separated file with the header 'Gene<TAB>Depth' and one line per
        CDS feature, identified by its first 'locus_tag' qualifier.

    Raises:
        KeyError: a GenBank record id is missing from the depth file, or a CDS
            feature has no 'locus_tag' qualifier.
    """
    _, gbk_file_basename, _ = sep_path_basename_ext(gbk_file)
    pwd_depth_file = '%s/%s.depth' % (gene_depth_file_folder, gbk_file_basename)

    # read in depth
    ctg_depth_dict = {}
    with open(ctg_depth_file) as ctg_depth_handle:
        for line_index, ctg in enumerate(ctg_depth_handle):
            if skip_depth_file_header is True and line_index == 0:
                continue
            if not ctg.strip():
                continue  # tolerate blank lines, e.g. at the end of the file
            ctg_split = ctg.strip().split('\t')
            ctg_depth_dict[ctg_split[0]] = float(ctg_split[1])

    # get gene depth
    with open(pwd_depth_file, 'w') as gene_depth_file_handle:
        gene_depth_file_handle.write('Gene\tDepth\n')
        for seq_record in SeqIO.parse(gbk_file, 'genbank'):
            # looked up per record, so an unknown contig fails even without CDS
            seq_depth = ctg_depth_dict[seq_record.id]
            for feature in seq_record.features:
                if feature.type == 'CDS':
                    gene_id = feature.qualifiers['locus_tag'][0]
                    gene_depth_file_handle.write('%s\t%s\n' % (gene_id, seq_depth))
def barh_plotter(num_list, label_list, query_seq_num, query_ko_NA, fig_width, fig_height, plot_file):
    """Save a horizontal bar chart of *num_list* labelled with *label_list*.

    Bars grow from the right edge towards the left and the first value is
    drawn at the top; the labels are placed on a twin y axis on the right.

    Args:
        num_list: bar lengths.
        label_list: one tick label per bar, in the same order as num_list.
        query_seq_num, query_ko_NA: only used to fill in the plot title.
        fig_width, fig_height: figure size in inches.
        plot_file: output path handed to savefig() (written at 300 dpi).
    """
    fig, ax = plt.subplots()
    fig.set_size_inches(fig_width, fig_height)

    y_pos = range(len(num_list))
    ax.barh(y_pos, num_list, height=0.8, align='center', alpha=0.2, linewidth=0)
    ax.set_yticks([])  # not show yticks
    ax.invert_xaxis()  # line up bar on right
    ax.invert_yaxis()  # put first number on top
    ax.axis('tight')   # remove extra spaces at the top and bottom, equal to: ax.margins(0, 0)
    # ax.margins(0, 0.01) # customize space percentage
    ax.set_xlabel('Number of gene')
    ax.set_title('Query genes number: %s, genes without KO: %s' % (query_seq_num, query_ko_NA))

    # twin axis carries the labels so they sit next to the bar bases
    ax2 = ax.twinx()
    ax2.set_ylim(ax.get_ylim())
    ax2.set_yticks(y_pos)
    ax2.set_yticklabels(label_list)

    plt.tight_layout()
    plt.savefig(plot_file, dpi=300)
    # Close exactly this figure. A plt.clf() after closing would implicitly
    # create a fresh empty figure that is never released.
    plt.close(fig)
def AnnotateNorm(file_in, skip_header, value_column, Divisor_value, file_out, file_out_header):
    """Rewrite one column of a tab-separated file as a percentage of a total.

    Args:
        file_in: tab-separated input file.
        skip_header: when true, the first line of file_in is dropped and not
            parsed at all.
        value_column: 1-based index of the column to convert.
        Divisor_value: the total; each value becomes value * 100 / Divisor_value.
        file_out: output path.
        file_out_header: text written verbatim at the top of file_out (the
            caller supplies the trailing newline).

    The percentage is rounded to two decimals and printed through float(), so
    trailing zeros are trimmed (e.g. '50.0', '33.33').

    Raises:
        ValueError / IndexError: a data line lacks a numeric value in the column.
        ZeroDivisionError: Divisor_value is 0.
    """
    value_index = value_column - 1

    with open(file_in) as file_in_handle, open(file_out, 'w') as file_out_handle:
        file_out_handle.write(file_out_header)
        for line_num, each_line in enumerate(file_in_handle):
            # The header is skipped before any column access, so a header with
            # fewer columns than the data lines is fine.
            if skip_header and line_num == 0:
                continue
            each_line_split = each_line.strip().split('\t')
            value_pct = float(each_line_split[value_index]) * 100 / Divisor_value
            each_line_split[value_index] = str(float("{0:.2f}".format(value_pct)))
            file_out_handle.write('%s\n' % '\t'.join(each_line_split))
def get_gene_list_TotalDepth(gene_list, gene_to_depth_dict):
    """Return the summed depth of the genes in *gene_list*.

    Each gene is looked up in *gene_to_depth_dict* (KeyError if absent); a
    gene listed twice is counted twice. An empty list gives 0.
    """
    # Plain left-to-right accumulation on purpose: the builtin sum() may use a
    # different float summation strategy and change the result slightly.
    running_total = 0
    for depth in map(gene_to_depth_dict.__getitem__, gene_list):
        running_total += depth
    return running_total
def catfasta2phy(msa_dir, msa_ext, concatenated_msa_phy, partition_file):
    """Concatenate per-gene FASTA alignments into one PHYLIP super-alignment.

    Args:
        msa_dir: folder holding the alignments.
        msa_ext: file extension (without the dot) selecting the alignments.
        concatenated_msa_phy: output path of the relaxed-PHYLIP alignment; an
            intermediate FASTA is written to '<concatenated_msa_phy>.fasta'.
        partition_file: output path; one '<file name> = <start>-<end>' line per
            alignment, with 1-based inclusive coordinates.

    Alignments are joined in sorted file-name order. A sequence id missing
    from an alignment is padded with '-' over that alignment's length.

    Raises:
        SystemExit: an alignment does not contain exactly one distinct
            sequence length (this includes an alignment with no sequences).
    """
    import sys  # local import: sys is not imported at module level

    concatenated_msa_fasta = '%s.fasta' % concatenated_msa_phy
    msa_file_re = '%s/*.%s' % (msa_dir, msa_ext)
    msa_file_list_sorted = sorted(os.path.basename(file_name) for file_name in glob.glob(msa_file_re))

    # Read every alignment once. Everything is keyed by file name, which is
    # unique within msa_dir, so two files can never overwrite each other's
    # entry.
    complete_gnm_set = set()
    msa_seq_dict = dict()   # file name -> {sequence id: aligned sequence}
    msa_len_dict = dict()   # file name -> alignment length
    for each_msa_file in msa_file_list_sorted:
        pwd_current_msa = '%s/%s' % (msa_dir, each_msa_file)
        current_msa_len = 0
        current_msa_len_set = set()
        current_msa_seq_dict = dict()
        for each_seq in SeqIO.parse(pwd_current_msa, 'fasta'):
            complete_gnm_set.add(each_seq.id)
            current_msa_seq_dict[each_seq.id] = str(each_seq.seq)
            current_msa_len = len(each_seq.seq)
            current_msa_len_set.add(current_msa_len)

        if len(current_msa_len_set) != 1:
            print('Sequences with different length were found in %s, program exited!' % each_msa_file)
            sys.exit()

        msa_seq_dict[each_msa_file] = current_msa_seq_dict
        msa_len_dict[each_msa_file] = current_msa_len

    complete_gnm_list_sorted = sorted(complete_gnm_set)

    # write out concatenated msa
    with open(concatenated_msa_fasta, 'w') as concatenated_msa_handle:
        for each_gnm in complete_gnm_list_sorted:
            seq_pieces = [msa_seq_dict[each_m].get(each_gnm, msa_len_dict[each_m] * '-')
                          for each_m in msa_file_list_sorted]
            concatenated_msa_handle.write('>%s\n' % each_gnm)
            concatenated_msa_handle.write('%s\n' % ''.join(seq_pieces))

    # write out partition file
    end_pos = 0
    with open(partition_file, 'w') as partition_file_handle:
        for each_m in msa_file_list_sorted:
            current_m_len = msa_len_dict[each_m]
            partition_file_handle.write('%s = %s-%s\n' % (each_m, (end_pos + 1), (end_pos + current_m_len)))
            end_pos += current_m_len

    # convert msa in fasta to phy
    AlignIO.convert(concatenated_msa_fasta, 'fasta', concatenated_msa_phy, 'phylip-relaxed')

TreeSAK/gnm_leaves.py ADDED Viewed

@@ -0,0 +1,43 @@
+import os
+import argparse
+from ete3 import Tree
# Usage text shown for the gnm_leaves sub-command, assembled line by line; the
# empty first and last items give the leading and trailing newline.
gnm_leaves_usage = '\n'.join((
    '',
    '========== gnm_leaves example commands ==========',
    'TreeSAK gnm_leaves -i input.tree -o output.tree',
    '=================================================',
    '',
))
def gnm_leaves(args):
    """Rename every leaf of a tree by dropping its last '_'-separated field.

    Args:
        args: dict with the keys
            'i'   - input tree file,
            'o'   - output tree file,
            'fmt' - ete3 tree format number, used for reading and writing.

    A leaf called 'genomeA_00012' becomes 'genomeA'. A leaf name without any
    underscore is left unchanged rather than being replaced by an empty
    string.

    Raises:
        SystemExit: the input tree file does not exist.
    """
    import sys  # local import: sys is not imported at module level

    tree_file_in = args['i']
    tree_file_out = args['o']
    tree_format = args['fmt']

    if not os.path.isfile(tree_file_in):
        print('Tree file not found, program exited!')
        sys.exit()

    t = Tree(tree_file_in, format=tree_format)
    for leaf in t:
        # rsplit keeps everything before the final '_' and returns the whole
        # name when there is no '_' to split on
        leaf.name = leaf.name.rsplit('_', 1)[0]
    t.write(format=tree_format, outfile=tree_file_out)

    print('Done!')
if __name__ == '__main__':
    # Stand-alone entry point: collect the three options and pass them to
    # gnm_leaves() as a dict keyed by option name ('i', 'o', 'fmt').
    cli = argparse.ArgumentParser()
    cli.add_argument('-i', required=True, help='input tree')
    cli.add_argument('-o', required=True, help='output tree')
    cli.add_argument('-fmt', required=False, default=1, type=int, help='tree format, default: 1')
    gnm_leaves(vars(cli.parse_args()))