PyPI - treesak - Versions diffs - 1.53.3__py3-none-any.whl - Mend

treesak 1.53.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (131) hide show

TreeSAK/ALE.py +63 -0
TreeSAK/ALE1.py +268 -0
TreeSAK/ALE2.py +168 -0
TreeSAK/ALE2RTC.py +30 -0
TreeSAK/ALE3.py +205 -0
TreeSAK/ALE4.py +636 -0
TreeSAK/ALE5.py +210 -0
TreeSAK/ALE6.py +401 -0
TreeSAK/ALE7.py +126 -0
TreeSAK/ALE_backup.py +1081 -0
TreeSAK/AssessCVG.py +128 -0
TreeSAK/AssessMarker.py +306 -0
TreeSAK/AssessMarkerDeltaLL.py +257 -0
TreeSAK/AssessMarkerPA.py +317 -0
TreeSAK/AssessPB.py +113 -0
TreeSAK/BMGE.jar +0 -0
TreeSAK/BMGE.py +49 -0
TreeSAK/C60SR4.nex +127 -0
TreeSAK/CompareMCMC.py +138 -0
TreeSAK/ConcateMSA.py +111 -0
TreeSAK/ConvertMSA.py +135 -0
TreeSAK/Dir.rb +82 -0
TreeSAK/ExtractMarkerSeq.py +263 -0
TreeSAK/FastRoot.py +1175 -0
TreeSAK/FastRoot_backup.py +1122 -0
TreeSAK/FigTree.py +34 -0
TreeSAK/GTDB_tree.py +76 -0
TreeSAK/GeneTree.py +142 -0
TreeSAK/KEGG_Luo17.py +807 -0
TreeSAK/LcaToLeaves.py +66 -0
TreeSAK/MarkerRef2Tree.py +616 -0
TreeSAK/MarkerRef2Tree_backup.py +628 -0
TreeSAK/MarkerSeq2Tree.py +299 -0
TreeSAK/MarkerSeq2Tree_backup.py +259 -0
TreeSAK/ModifyTopo.py +116 -0
TreeSAK/Newick_tree_plotter.py +79 -0
TreeSAK/OMA.py +170 -0
TreeSAK/OMA2.py +212 -0
TreeSAK/OneLineAln.py +50 -0
TreeSAK/PB.py +155 -0
TreeSAK/PMSF.py +115 -0
TreeSAK/PhyloBiAssoc.R +84 -0
TreeSAK/PhyloBiAssoc.py +167 -0
TreeSAK/PlotMCMC.py +41 -0
TreeSAK/PlotMcmcNode.py +152 -0
TreeSAK/PlotMcmcNode_old.py +252 -0
TreeSAK/RootTree.py +101 -0
TreeSAK/RootTreeGTDB.py +371 -0
TreeSAK/RootTreeGTDB214.py +288 -0
TreeSAK/RootTreeGTDB220.py +300 -0
TreeSAK/SequentialDating.py +16 -0
TreeSAK/SingleAleHGT.py +157 -0
TreeSAK/SingleLinePhy.py +50 -0
TreeSAK/SliceMSA.py +142 -0
TreeSAK/SplitScore.py +21 -0
TreeSAK/SplitScore1.py +177 -0
TreeSAK/SplitScore1OMA.py +148 -0
TreeSAK/SplitScore2.py +608 -0
TreeSAK/TaxaCountStats.R +256 -0
TreeSAK/TaxonTree.py +47 -0
TreeSAK/TreeSAK_config.py +32 -0
TreeSAK/VERSION +164 -0
TreeSAK/VisHPD95.R +45 -0
TreeSAK/VisHPD95.py +200 -0
TreeSAK/__init__.py +0 -0
TreeSAK/ale_parser.py +74 -0
TreeSAK/ale_splitter.py +63 -0
TreeSAK/alignment_pruner.pl +1471 -0
TreeSAK/assessOG.py +45 -0
TreeSAK/batch_itol.py +171 -0
TreeSAK/catfasta2phy.py +140 -0
TreeSAK/cogTree.py +185 -0
TreeSAK/compare_trees.R +30 -0
TreeSAK/compare_trees.py +255 -0
TreeSAK/dating.py +264 -0
TreeSAK/dating_ss.py +361 -0
TreeSAK/deltall.py +82 -0
TreeSAK/do_rrtc.rb +464 -0
TreeSAK/fa2phy.py +42 -0
TreeSAK/filter_rename_ar53.py +118 -0
TreeSAK/format_leaf_name.py +70 -0
TreeSAK/gap_stats.py +38 -0
TreeSAK/get_SCG_tree.py +742 -0
TreeSAK/get_arCOG_seq.py +97 -0
TreeSAK/global_functions.py +222 -0
TreeSAK/gnm_leaves.py +43 -0
TreeSAK/iTOL.py +791 -0
TreeSAK/iTOL_gene_tree.py +80 -0
TreeSAK/itol_msa_stats.py +56 -0
TreeSAK/keep_highest_rrtc.py +37 -0
TreeSAK/koTree.py +194 -0
TreeSAK/label_gene_tree_by_gnm.py +34 -0
TreeSAK/label_tree.R +75 -0
TreeSAK/label_tree.py +121 -0
TreeSAK/mad.py +708 -0
TreeSAK/mcmc2tree.py +58 -0
TreeSAK/mcmcTC copy.py +92 -0
TreeSAK/mcmcTC.py +104 -0
TreeSAK/mcmctree_vs_reltime.R +44 -0
TreeSAK/mcmctree_vs_reltime.py +252 -0
TreeSAK/merge_pdf.py +32 -0
TreeSAK/pRTC.py +56 -0
TreeSAK/parse_mcmctree.py +198 -0
TreeSAK/parse_reltime.py +141 -0
TreeSAK/phy2fa.py +37 -0
TreeSAK/plot_distruibution_th.py +165 -0
TreeSAK/prep_mcmctree_ctl.py +92 -0
TreeSAK/print_leaves.py +32 -0
TreeSAK/pruneMSA.py +63 -0
TreeSAK/recode.py +73 -0
TreeSAK/remove_bias.R +112 -0
TreeSAK/rename_leaves.py +78 -0
TreeSAK/replace_clade.py +55 -0
TreeSAK/root_with_out_group.py +84 -0
TreeSAK/run_TaxaCountStats_R_s1.py +455 -0
TreeSAK/subsample_drep_gnms.py +74 -0
TreeSAK/subset.py +69 -0
TreeSAK/subset_tree_stupid_old_way.py +193 -0
TreeSAK/supertree.py +330 -0
TreeSAK/tmp_1.py +19 -0
TreeSAK/tmp_2.py +19 -0
TreeSAK/tmp_3.py +120 -0
TreeSAK/tmp_4.py +43 -0
TreeSAK/tmp_5.py +12 -0
TreeSAK/weighted_rand.rb +23 -0
treesak-1.53.3.data/scripts/TreeSAK +955 -0
treesak-1.53.3.dist-info/LICENSE +674 -0
treesak-1.53.3.dist-info/METADATA +27 -0
treesak-1.53.3.dist-info/RECORD +131 -0
treesak-1.53.3.dist-info/WHEEL +5 -0
treesak-1.53.3.dist-info/top_level.txt +1 -0

TreeSAK/AssessPB.py ADDED Viewed

@@ -0,0 +1,113 @@
+import os
+import argparse
+# Usage text for the AssessPB subcommand: an example invocation, the bpcomp and
+# tracecomp calls it wraps, and the one-chain-per-line layout of the -c file.
+# NOTE(review): the example command omits -o, which the argument parser in this
+# file declares as required.
+AssessPB_usage = '''
+====================== AssessPB example commands ======================
+# Dependency: bpcomp and tracecomp (from PhyloBayes-MPI)
+export OMPI_MCA_btl=^openib
+TreeSAK AssessPB -c all_chains.txt
+# This is a wrapper for (take 4 chains as an example):
+bpcomp -x 1000 10 c1 c2 c3 c4
+tracecomp -x 1000 c1 c2 c3 c4
+# format of the file provided to -c: directory_path/output_prefix
+GTDB_SCG_best50p0_pb_chain1/GTDB_SCG_best50p0_pb_chain1
+GTDB_SCG_best50p0_pb_chain2/GTDB_SCG_best50p0_pb_chain2
+GTDB_SCG_best50p0_pb_chain3/GTDB_SCG_best50p0_pb_chain3
+GTDB_SCG_best50p0_pb_chain4/GTDB_SCG_best50p0_pb_chain4
+=======================================================================
+'''
+def compare2chains(chain_1, chain_2, chain_3, chain_4, burn_in, sample_interval, op_dir, cmd_txt):
+    # bpcomp:    -x <burnin> [<every> <until>]. default burnin = 10 percent of the chain
+    # tracecomp: -x <burnin> [<every> <until>]. default burnin = 20 percent of the chain
+    bpcomp_cmd    = 'bpcomp -o %s/bpcomp -x %s %s %s %s'                % (op_dir, burn_in, sample_interval, chain_1, chain_2)
+    tracecomp_cmd = 'tracecomp -o %s/tracecomp -x %s %s %s'             % (op_dir, burn_in, chain_1, chain_2)
+    if (chain_3 is not None) and (chain_4 is None):
+        bpcomp_cmd    = 'bpcomp -o %s/bpcomp -x %s %s %s %s %s'         % (op_dir, burn_in, sample_interval, chain_1, chain_2, chain_3)
+        tracecomp_cmd = 'tracecomp -o %s/tracecomp -x %s %s %s %s'      % (op_dir, burn_in, chain_1, chain_2, chain_3)
+    if (chain_3 is not None) and (chain_4 is not None):
+        bpcomp_cmd    = 'bpcomp -o %s/bpcomp -x %s %s %s %s %s %s'      % (op_dir, burn_in, sample_interval, chain_1, chain_2, chain_3, chain_4)
+        tracecomp_cmd = 'tracecomp -o %s/tracecomp -x %s %s %s %s %s'   % (op_dir, burn_in, chain_1, chain_2, chain_3, chain_4)
+    # write out commands
+    cmd_txt_handle = open(cmd_txt, 'a')
+    cmd_txt_handle.write(bpcomp_cmd + '\n')
+    cmd_txt_handle.write(tracecomp_cmd + '\n')
+    cmd_txt_handle.close()
+    # execute commands
+    print('\n================================ bpcomp ================================')
+    os.system(bpcomp_cmd)
+    print('\nGuideline')
+    print('maxdiﬀ < 0.1:     good')
+    print('maxdiﬀ < 0.3:     acceptable, gives a good qualitative picture of the posterior consensus.')
+    print('0.3 < maxdiﬀ < 1: the sample is not yet suﬃciently large and have not converged, but on right track.')
+    print('If maxdiﬀ = 1 even after 10,000 points: at least one run stuck in a local maximum.')
+    print('\n============================== tracecomp ==============================\n')
+    os.system(tracecomp_cmd)
+    print('\nGuideline')
+    print('good:       rel diﬀ < 0.1 and minimum eﬀective size > 300')
+    print('acceptable: rel diﬀ < 0.3 and minimum eﬀective size > 50')
+    print('\n========================================================================\n')
+def AssessPB(args):
+    chain_file      = args['c']
+    burn_in         = args['bi']
+    sample_interval = args['si']
+    op_dir          = args['o']
+    force_overwrite = args['f']
+    cmd_txt         = '%s/cmds.txt' % op_dir
+    # check is chain_file exist
+    if os.path.isfile(chain_file) is False:
+        print('%s not found, program exited!' % chain_file)
+        exit()
+    # check if chains were provided in the file
+    chain_list = []
+    for each_chain in open(chain_file):
+        chain_list.append(each_chain.strip())
+    if len(chain_list) < 2:
+        print('Provided %s chains, need at least two chains, program exited!' % len(chain_list))
+        exit()
+    # create output dir
+    if os.path.isdir(op_dir) is True:
+        if force_overwrite is True:
+            os.system('rm -r %s' % op_dir)
+        else:
+            print('output folder already exist, program exited!')
+            exit()
+    os.system('mkdir %s' % op_dir)
+    if len(chain_list) == 2:
+        compare2chains(chain_list[0], chain_list[1], burn_in, sample_interval, op_dir, cmd_txt)
+    elif len(chain_list) == 3:
+        compare2chains(chain_list[0], chain_list[1], chain_list[2], burn_in, sample_interval, op_dir, cmd_txt)
+    elif len(chain_list) == 4:
+        compare2chains(chain_list[0], chain_list[1], chain_list[2], chain_list[3], burn_in, sample_interval, op_dir, cmd_txt)
+if __name__ == '__main__':
+    AssessPB_parser = argparse.ArgumentParser()
+    AssessPB_parser.add_argument('-c',      required=False, default=None,           help='a txt file contain all the chains')
+    AssessPB_parser.add_argument('-bi',     required=False, default=1000,           help='burn-in, default: 1000')
+    AssessPB_parser.add_argument('-si',     required=False, default=10,             help='sample interval, default: 10')
+    AssessPB_parser.add_argument('-o',      required=True,  default=None,           help='output directory')
+    AssessPB_parser.add_argument('-f',      required=False, action="store_true",    help='force overwrite')
+    args = vars(AssessPB_parser.parse_args())
+    AssessPB(args)

TreeSAK/BMGE.jar ADDED Viewed

Binary file

TreeSAK/BMGE.py ADDED Viewed

@@ -0,0 +1,49 @@
+import os
+import argparse
+# Usage text for the BMGE subcommand: requirement (java), an example invocation
+# and the settings quoted for split score calculation.
+BMGE_usage = '''
+======================= BMGE example commands =======================
+# require: java
+TreeSAK BMGE -p demo -i input.aln -m BLOSUM30 -esc 0.55
+# Settings for calculating split score (Nina Dombrowski):
+# -t AA -m BLOSUM30 -h 0.55
+=====================================================================
+'''
+def BMGE(args):
+    op_prefix               = args['p']
+    msa_in                  = args['i']
+    trim_model              = args['m']
+    entropy_score_cutoff    = args['esc']
+    # define file name
+    msa_out_fasta  = '%s.BMGE.fasta'  % op_prefix
+    # specify path to BMGE.jar
+    current_file_path   = '/'.join(os.path.realpath(__file__).split('/')[:-1])
+    pwd_bmge_jar        = '%s/BMGE.jar' % current_file_path
+    # run BMGE
+    bmge_cmd = 'java -jar %s -i %s -m %s -t AA -h %s -of %s' % (pwd_bmge_jar, msa_in, trim_model, entropy_score_cutoff, msa_out_fasta)
+    print(bmge_cmd)
+    os.system(bmge_cmd)
+    print('Done!')
+if __name__ == '__main__':
+    BMGE_parser = argparse.ArgumentParser()
+    BMGE_parser.add_argument('-p',   required=True,                         help='output prefix')
+    BMGE_parser.add_argument('-i',   required=True,                         help='input MSA')
+    BMGE_parser.add_argument('-m',   required=False, default='BLOSUM30',    help='trim model, default: BLOSUM30')
+    BMGE_parser.add_argument('-esc', required=False, default='0.55',        help='entropy score cutoff, default: 0.55')
+    args = vars(BMGE_parser.parse_args())
+    BMGE(args)

TreeSAK/C60SR4.nex ADDED Viewed

@@ -0,0 +1,127 @@
+#nexus
+begin models;
+frequency C60NT1= 0.6671684132 0.0296031604 0.263873245 0.0393551815;
+frequency C60NT2= 0.1276338907 0.0209812196 0.7927062299 0.0586786597;
+frequency C60NT3= 0.0163962994 0.0051413956 0.0030074531 0.975454852;
+frequency C60NT4= 0.43894983 0.1822394649 0.0174009227 0.3614097825;
+frequency C60NT5= 0.0979385646 0.2096596198 0.0222763602 0.6701254554;
+frequency C60NT6= 0.245811863 0.1332129821 0.5371646791 0.083810476;
+frequency C60NT7= 0.0226930145 0.3959457214 0.0091927563 0.5721685078;
+frequency C60NT8= 0.5015838424 0.4308125086 0.0079650295 0.0596386196;
+frequency C60NT9= 0.2221963854 0.3881879363 0.1020538807 0.2875617976;
+frequency C60NT10= 0.2637737779 0.1509966633 0.2145338888 0.3706956701;
+frequency C60NT11= 0.192147964 0.0110024977 0.7823009601 0.0145485781;
+frequency C60NT12= 0.1484234234 0.0338349152 0.4455828719 0.3721587895;
+frequency C60NT13= 0.7701156248 0.0361294714 0.0511613978 0.142593506;
+frequency C60NT14= 0.4210820539 0.0363523971 0.0088100421 0.5337555069;
+frequency C60NT15= 0.4443921233 0.0693293801 0.4521540079 0.0341244887;
+frequency C60NT16= 0.3775042704 0.0269821801 0.580257087 0.0152564625;
+frequency C60NT17= 0.4767453479 0.034379142 0.1231147917 0.3657607185;
+frequency C60NT18= 0.0268679326 0.0169276748 0.0098917288 0.9463126639;
+frequency C60NT19= 0.3018006878 0.0294470475 0.6420354145 0.0267168502;
+frequency C60NT20= 0.3419571734 0.0262071154 0.6184824034 0.0133533078;
+frequency C60NT21= 0.0096247416 0.9088039403 0.0103815544 0.0711897638;
+frequency C60NT22= 0.1999965426 0.0297517307 0.2954127651 0.4748389616;
+frequency C60NT23= 0.1127907248 0.5495098576 0.0691492727 0.268550145;
+frequency C60NT24= 0.0776920852 0.0329607437 0.0190469923 0.8703001788;
+frequency C60NT25= 0.8938602529 0.0175688123 0.0516233175 0.0369476173;
+frequency C60NT26= 0.27898143 0.2724136798 0.2567784573 0.1918264331;
+frequency C60NT27= 0.498929145 0.0363065742 0.3503605267 0.1144037542;
+frequency C60NT28= 0.2681948281 0.06920056 0.5207582638 0.1418463481;
+frequency C60NT29= 0.2045159736 0.0339698809 0.0436529982 0.7178611476;
+frequency C60NT30= 0.2691226215 0.0356845278 0.162767839 0.5324250117;
+frequency C60NT31= 0.7694198604 0.0996406504 0.090926784 0.040012705;
+frequency C60NT32= 0.015577726 0.0724312301 0.0067063212 0.9052847226;
+frequency C60NT33= 0.0832030401 0.1132575475 0.2862435644 0.5172958479;
+frequency C60NT34= 0.0509990348 0.0082496135 0.0061309345 0.9346204172;
+frequency C60NT35= 0.7537265064 0.0213859704 0.1711138164 0.0537737068;
+frequency C60NT36= 0.909000965 0.0289286367 0.0266394188 0.0354309794;
+frequency C60NT37= 0.9546241163 0.008441069 0.0316827792 0.0052520355;
+frequency C60NT38= 0.5192884454 0.0572862542 0.3797587173 0.0436665831;
+frequency C60NT39= 0.1341646585 0.5925517098 0.1874442682 0.0858393636;
+frequency C60NT40= 0.0448326475 0.0307205308 0.0250541698 0.8993926521;
+frequency C60NT41= 0.5664706531 0.0393032078 0.3812930448 0.0129330943;
+frequency C60NT42= 0.8370819783 0.0190256158 0.1290970633 0.0147953427;
+frequency C60NT43= 0.1120464953 0.0574737723 0.799428803 0.0310509292;
+frequency C60NT44= 0.5567531248 0.030047458 0.113077382 0.3001220351;
+frequency C60NT45= 0.7530681463 0.0296715581 0.1919452504 0.0253150453;
+frequency C60NT46= 0.0813111668 0.4663342365 0.3315560742 0.1207985224;
+frequency C60NT47= 0.3493942031 0.0181602391 0.5911205418 0.0413250161;
+frequency C60NT48= 0.7273938304 0.0224960247 0.2440348688 0.0060752763;
+frequency C60NT49= 0.144800238 0.0781260939 0.6995327663 0.0775409016;
+frequency C60NT50= 0.5558261942 0.0217135959 0.355400139 0.0670600707;
+frequency C60NT51= 0.0142277933 0.577157862 0.0046886186 0.4039257261;
+frequency C60NT52= 0.566952106 0.1662596481 0.0829557817 0.1838324641;
+frequency C60NT53= 0.3710307018 0.2568582793 0.3304267825 0.0416842365;
+frequency C60NT54= 0.4673591892 0.0443703034 0.0644933219 0.4237771855;
+frequency C60NT55= 0.2935718655 0.0371033744 0.5025606284 0.1667641317;
+frequency C60NT56= 0.1445054403 0.0175105032 0.8019947085 0.035989348;
+frequency C60NT57= 0.5953413269 0.0543418469 0.3379976485 0.0123191777;
+frequency C60NT58= 0.5011346064 0.0186312309 0.4456054968 0.034628666;
+frequency C60NT59= 0.8862685333 0.0262544484 0.0131639188 0.0743130995;
+frequency C60NT60= 0.0386456469 0.0058035261 0.0121187396 0.9434320874;
+model C60SR4=GTR+G+FMIX{C60NT1,C60NT2,C60NT3,C60NT4,C60NT5,C60NT6,C60NT7,C60NT8,C60NT9,C60NT10,C60NT11,C60NT12,C60NT13,C60NT14,C60NT15,C60NT16,C60NT17,C60NT18,C60NT19,C60NT20,C60NT21,C60NT22,C60NT23,C60NT24,C60NT25,C60NT26,C60NT27,C60NT28,C60NT29,C60NT30,C60NT31,C60NT32,C60NT33,C60NT34,C60NT35,C60NT36,C60NT37,C60NT38,C60NT39,C60NT40,C60NT41,C60NT42,C60NT43,C60NT44,C60NT45,C60NT46,C60NT47,C60NT48,C60NT49,C60NT50,C60NT51,C60NT52,C60NT53,C60NT54,C60NT55,C60NT56,C60NT57,C60NT58,C60NT59,C60NT60}+F;
+end;

TreeSAK/CompareMCMC.py ADDED Viewed

@@ -0,0 +1,138 @@
+import os
+import argparse
+import arviz as az
+import pandas as pd
+import matplotlib as mpl
+mpl.use('Agg')
+import matplotlib.pyplot as plt
+from matplotlib.pyplot import figure
+# Usage text for the CompareMCMC subcommand: two example invocations.
+# NOTE(review): the second example uses absolute paths from the author's own
+# machine; it is kept as is because this string is part of the program output.
+CompareMCMC_usage = '''
+====================================== CompareMCMC example commands ======================================
+TreeSAK CompareMCMC -mx IR_mcmc.txt -my AR_mcmc.txt -lx IR -ly AR -o convergence_plot.png -max 40 -fs 12
+cd /Users/songweizhi/Desktop
+TreeSAK CompareMCMC -mx /Users/songweizhi/Desktop/Sponge_r220/6_dating/MCMCTree/dating_outputs/topo2p10_clock3_nsample250000_run1_mcmc.txt -my /Users/songweizhi/Desktop/Sponge_r220/6_dating/MCMCTree/dating_outputs/topo2p10_clock3_nsample250000_run2_mcmc.txt -lx IR -ly AR -o convergence_plot.png -max 40 -fs 12
+==========================================================================================================
+'''
+def sep_path_basename_ext(file_in):
+    file_path, file_name = os.path.split(file_in)
+    if file_path == '':
+        file_path = '.'
+    file_basename, file_extension = os.path.splitext(file_name)
+    return file_path, file_basename, file_extension
+def CompareMCMC(args):
+    mcmc_txt_x      = args['mx']
+    mcmc_txt_y      = args['my']
+    label_x         = args['lx']
+    label_y         = args['ly']
+    pwd_figure      = args['o']
+    max_axis_value  = args['max']
+    label_fs        = args['fs']
+    x_path, x_basename, x_ext = sep_path_basename_ext(mcmc_txt_x)
+    y_path, y_basename, y_ext = sep_path_basename_ext(mcmc_txt_y)
+    if label_x is None:
+        label_x = x_basename
+    if label_y is None:
+        label_y = y_basename
+    # read in dataframe
+    df_x = pd.read_table(mcmc_txt_x, index_col=0)
+    df_y = pd.read_table(mcmc_txt_y, index_col=0)
+    # get Mean value for each column
+    df_x_col_to_mean_dict = {col_name: mean for col_name, mean in df_x.mean().iteritems()}
+    df_y_col_to_mean_dict = {col_name: mean for col_name, mean in df_y.mean().iteritems()}
+    # get CI95 for each column
+    df_x_col_to_ci_dict = {col_name: az.hdi(col.values, hdi_prob=0.95) for col_name, col in df_x.iteritems()}
+    df_y_col_to_ci_dict = {col_name: az.hdi(col.values, hdi_prob=0.95) for col_name, col in df_y.iteritems()}
+    num_list_x = []
+    num_list_y = []
+    err_range_x = []
+    err_range_y = []
+    for col_name, col in df_x.iteritems():
+        if col_name not in ['mu', 'sigma2', 'lnL']:
+            num_list_x.append(df_x_col_to_mean_dict[col_name])
+            num_list_y.append(df_y_col_to_mean_dict[col_name])
+            err_range_x.append(df_x_col_to_ci_dict[col_name])
+            err_range_y.append(df_y_col_to_ci_dict[col_name])
+    x_err_l = []
+    x_err_r = []
+    y_err_l = []
+    y_err_u = []
+    max_value = 0
+    min_value = 100000000000000
+    n = 0
+    while n < len(num_list_x):
+        x_value = num_list_x[n]
+        y_value = num_list_y[n]
+        x_range = err_range_x[n]
+        y_range = err_range_y[n]
+        x_l_dist = abs(x_value - x_range[0])
+        x_r_dist = abs(x_range[1] - x_value)
+        y_l_dist = abs(y_value - y_range[0])
+        y_u_dist = abs(y_range[1] - y_value)
+        x_err_l.append(x_l_dist)
+        x_err_r.append(x_r_dist)
+        y_err_l.append(y_l_dist)
+        y_err_u.append(y_u_dist)
+        current_max = max(x_value, y_value, x_range[0], x_range[1], y_range[0], y_range[1])
+        current_min = min(x_value, y_value, x_range[0], x_range[1], y_range[0], y_range[1])
+        if current_max > max_value:
+            max_value = current_max
+        if current_min < min_value:
+            min_value = current_min
+        n += 1
+    figure(figsize=(6, 6), dpi=300)
+    plt.plot([min_value, max_value], [min_value, max_value], color='black', linestyle='dashed', linewidth=1, alpha=0.5)
+    plt.scatter(num_list_x, num_list_y, s=0)
+    plt.errorbar(num_list_x, num_list_y, xerr=[x_err_l, x_err_r], yerr=[y_err_l, y_err_u],
+                 ls='none', ecolor='skyblue', elinewidth=1, alpha=0.5)
+    if max_axis_value is not None:
+        plt.xlim([0, max_axis_value])
+        plt.ylim([0, max_axis_value])
+    # Set the font size of xticks and yticks
+    plt.xticks(fontsize=label_fs)
+    plt.yticks(fontsize=label_fs)
+    plt.xlabel(label_x, fontsize=label_fs)
+    plt.ylabel(label_y, fontsize=label_fs)
+    # write out
+    plt.tight_layout()
+    plt.savefig(pwd_figure)
+    plt.close()
+    print('Plot exported to %s, done!' % pwd_figure)
+if __name__ == '__main__':
+    # initialize the options parser
+    parser = argparse.ArgumentParser()
+    parser.add_argument('-mx',      required=True,                          help='mcmc.txt for x axis')
+    parser.add_argument('-my',      required=True,                          help='mcmc.txt for y axis')
+    parser.add_argument('-lx',      required=False, default=None,           help='label for x axis')
+    parser.add_argument('-ly',      required=False, default=None,           help='label for y axis')
+    parser.add_argument('-max',     required=False, default=None, type=int, help='maximum axis value')
+    parser.add_argument('-fs',      required=False, default=16, type=int,   help='label font size, default: 16')
+    parser.add_argument('-o',       required=True,                          help='output plot')
+    args = vars(parser.parse_args())
+    CompareMCMC(args)

TreeSAK/ConcateMSA.py ADDED Viewed

@@ -0,0 +1,111 @@
+import os
+import glob
+import argparse
+from Bio import SeqIO
+from Bio import AlignIO
+# Usage text for the ConcateMSA subcommand: an example invocation and the three
+# output files derived from the -p prefix.
+ConcateMSA_usage = '''
+================= ConcateMSA example commands =================
+TreeSAK ConcateMSA -i aln -x aln -p concatenated -gene2gnm
+# output file include:
+concatenated.fasta
+concatenated.phylip
+concatenated.partition.txt
+===============================================================
+'''
+def ConcateMSA(args):
+    msa_dir                 = args['i']
+    msa_ext                 = args['x']
+    op_prefix               = args['p']
+    gene2gnm                = args['gene2gnm']
+    concatenated_msa_phy    = '%s.phylip'           % op_prefix
+    concatenated_msa_fasta  = '%s.fasta'            % op_prefix
+    partition_file          = '%s.partition.txt'    % op_prefix
+    msa_file_re             = '%s/*.%s'             % (msa_dir, msa_ext)
+    msa_file_list           = [os.path.basename(file_name) for file_name in glob.glob(msa_file_re)]
+    msa_file_list_sorted    = sorted(msa_file_list)
+    complete_gnm_set = set()
+    for each_msa_file in msa_file_list:
+        pwd_msa = '%s/%s' % (msa_dir, each_msa_file)
+        for each_seq in SeqIO.parse(pwd_msa, 'fasta'):
+            seq_id = each_seq.id
+            if gene2gnm is True:
+                seq_id = '_'.join(seq_id.split('_')[:-1])
+            complete_gnm_set.add(seq_id)
+    complete_gnm_list_sorted = sorted([i for i in complete_gnm_set])
+    # initialize concatenated msa dict
+    gnm_to_seq_dict = {i: '' for i in complete_gnm_list_sorted}
+    msa_len_dict = dict()
+    for each_msa_file in msa_file_list_sorted:
+        msa_id = each_msa_file.split('.' + msa_ext)[0]
+        # read in msa
+        current_msa_len = 0
+        current_msa_len_set = set()
+        pwd_current_msa = '%s/%s' % (msa_dir, each_msa_file)
+        current_msa_seq_dict = dict()
+        for each_seq in SeqIO.parse(pwd_current_msa, 'fasta'):
+            seq_id = each_seq.id
+            if gene2gnm is True:
+                seq_id = '_'.join(seq_id.split('_')[:-1])
+            complete_gnm_set.add(seq_id)
+            current_msa_seq_dict[seq_id] = str(each_seq.seq)
+            current_msa_len_set.add(len(each_seq.seq))
+            current_msa_len = len(each_seq.seq)
+        if len(current_msa_len_set) != 1:
+            print('Sequences with different length were found in %s, program exited!' % each_msa_file)
+            exit()
+        msa_len_dict[msa_id] = current_msa_len
+        # add sequence to concatenated msa dict
+        for each_gnm in complete_gnm_list_sorted:
+            msa_seq = current_msa_seq_dict.get(each_gnm, current_msa_len*'-')
+            gnm_to_seq_dict[each_gnm] += msa_seq
+    # write out concatenated msa
+    concatenated_msa_handle = open(concatenated_msa_fasta, 'w')
+    for each_gnm in complete_gnm_list_sorted:
+        concatenated_msa_handle.write('>%s\n' % each_gnm)
+        concatenated_msa_handle.write('%s\n' % gnm_to_seq_dict[each_gnm])
+    concatenated_msa_handle.close()
+    # write out partition file
+    end_pos = 0
+    partition_file_handle = open(partition_file, 'w')
+    for each_m in msa_file_list_sorted:
+        gene_id = each_m.split('.' + msa_ext)[0]
+        current_m_len = msa_len_dict[gene_id]
+        partition_file_handle.write('%s = %s-%s\n' % (each_m, (end_pos + 1), (end_pos + current_m_len)))
+        end_pos += current_m_len
+    partition_file_handle.close()
+    # convert msa in fasta to phy
+    AlignIO.convert(concatenated_msa_fasta, 'fasta', concatenated_msa_phy, 'phylip-relaxed')
+if __name__ == '__main__':
+    # initialize the options parser
+    parser = argparse.ArgumentParser()
+    parser.add_argument('-i',           required=True,                       help='input MSA folder')
+    parser.add_argument('-x',           required=False, default='aln',       help='input file extension')
+    parser.add_argument('-p',           required=True,                       help='output prefix')
+    parser.add_argument('-gene2gnm',    required=False, action="store_true", help='gene id to gnm id, split sequence id before the last _')
+    args = vars(parser.parse_args())
+    ConcateMSA(args)

TreeSAK/ConvertMSA.py ADDED Viewed

@@ -0,0 +1,135 @@
+import os
+import glob
+import argparse
+from Bio import SeqIO
+from Bio import AlignIO
+# Usage text for the ConvertMSA subcommand: one single-file and one folder
+# example, followed by some of the format names accepted by Biopython's AlignIO.
+ConvertMSA_usage = '''
+================================= ConvertMSA example commands =================================
+# phylip to fasta
+TreeSAK ConvertMSA -i concatenated.phy -fi phylip-relaxed -o concatenated.fasta -fo fasta
+TreeSAK ConvertMSA -i phy_files -fi phylip-relaxed -xi phy -o MSA_in_fasta -fo fasta -xo fa
+# examples of alignment format (https://biopython.org/wiki/AlignIO):
+fasta, phylip, phylip-relaxed, phylip-sequential, clustal
+===============================================================================================
+'''
+def sep_path_basename_ext(file_in):
+    # separate path and file name
+    file_path, file_name = os.path.split(file_in)
+    if file_path == '':
+        file_path = '.'
+    # separate file basename and extension
+    file_basename, file_extension = os.path.splitext(file_name)
+    return file_path, file_basename, file_extension
+def ConvertMSA(args):
+    aln_in            = args['i']
+    aln_in_ext        = args['xi']
+    aln_in_format     = args['fi']
+    aln_out           = args['o']
+    aln_out_ext       = args['xo']
+    aln_out_format    = args['fo']
+    one_line          = args['oneline']
+    no_gap            = args['nogap']
+    force_overwriting = args['f']
+    if ((one_line is True) and (aln_out_format != 'fasta')) or ((no_gap is True) and (aln_out_format != 'fasta')):
+        print('Please provide "-oneline" and/or "-nogap" only if "-fo" is fasta')
+        exit()
+    if os.path.isfile(aln_in) is True:
+        if (one_line is False) and (no_gap is False):
+            AlignIO.convert(aln_in, aln_in_format, aln_out, aln_out_format)
+        else:
+            aln_out_tmp = aln_out + '.tmp'
+            AlignIO.convert(aln_in, aln_in_format, aln_out_tmp, aln_out_format)
+            pwd_aln_out_handle = open(aln_out, 'w')
+            for each_seq in SeqIO.parse(aln_out_tmp, 'fasta'):
+                seq_id = each_seq.id
+                seq_sequence = str(each_seq.seq)
+                if no_gap is False:
+                    pwd_aln_out_handle.write('>%s\n' % seq_id)
+                    pwd_aln_out_handle.write('%s\n' % seq_sequence)
+                else:
+                    pwd_aln_out_handle.write('>%s\n' % seq_id)
+                    pwd_aln_out_handle.write('%s\n' % seq_sequence.replace('-', ''))
+            pwd_aln_out_handle.close()
+            os.system('rm %s' % aln_out_tmp)
+        print('Done!')
+    elif os.path.isdir(aln_in) is True:
+        aln_in_re = '%s/*.%s' % (aln_in, aln_in_ext)
+        aln_in_list = [os.path.basename(file_name) for file_name in glob.glob(aln_in_re)]
+        # check input
+        if len(aln_in_list) == 0:
+            print('Input file not detected, program exited!')
+            exit()
+        # check output folder
+        if os.path.isdir(aln_out) is True:
+            if force_overwriting is True:
+                os.system('rm -r %s' % aln_out)
+            else:
+                print('Output folder already exist, program exited!')
+                exit()
+        os.system('mkdir %s' % aln_out)
+        # convert
+        for each_aln_in in aln_in_list:
+            aln_in_path, aln_in_basename, aln_in_ext = sep_path_basename_ext(each_aln_in)
+            pwd_aln_in      = '%s/%s'        % (aln_in, each_aln_in)
+            pwd_aln_out     = '%s/%s.%s'     % (aln_out, aln_in_basename, aln_out_ext)
+            pwd_aln_out_tmp = '%s/%s_tmp.%s' % (aln_out, aln_in_basename, aln_out_ext)
+            if (one_line is False) and (no_gap is False):
+                AlignIO.convert(pwd_aln_in, aln_in_format, pwd_aln_out, aln_out_format)
+            else:
+                AlignIO.convert(pwd_aln_in, aln_in_format, pwd_aln_out_tmp, aln_out_format)
+                pwd_aln_out_handle = open(pwd_aln_out, 'w')
+                for each_seq in SeqIO.parse(pwd_aln_out_tmp, 'fasta'):
+                    seq_id = each_seq.id
+                    seq_sequence = str(each_seq.seq)
+                    if no_gap is False:
+                        pwd_aln_out_handle.write('>%s\n' % seq_id)
+                        pwd_aln_out_handle.write('%s\n' % seq_sequence)
+                    else:
+                        sequence_no_gap = seq_sequence.replace('-', '')
+                        if len(sequence_no_gap) > 0:
+                            pwd_aln_out_handle.write('>%s\n' % seq_id)
+                            pwd_aln_out_handle.write('%s\n' % sequence_no_gap)
+                pwd_aln_out_handle.close()
+                os.system('rm %s' % pwd_aln_out_tmp)
+        print('Done!')
+    else:
+        print('Input file not found, program exited!')
+if __name__ == '__main__':
+    # initialize the options parser
+    parser = argparse.ArgumentParser()
+    parser.add_argument('-i',       required=True,                       help='input alignment')
+    parser.add_argument('-xi',      required=False, default='aln',       help='input alignment extension')
+    parser.add_argument('-fi',      required=True,                       help='input alignment format, e.g., fasta, phylip')
+    parser.add_argument('-o',       required=True,                       help='output alignment')
+    parser.add_argument('-xo',      required=False, default='aln',       help='output alignment extension')
+    parser.add_argument('-fo',      required=True,                       help='output alignment format, e.g., fasta, phylip')
+    parser.add_argument('-oneline', required=False, action="store_true", help='put sequence in single line, available if -fo is fasta')
+    parser.add_argument('-nogap',   required=False, action="store_true", help='remove gaps from alignment, available if -fo is fasta')
+    parser.add_argument('-f',       required=False, action="store_true", help='force overwrite existing output folder')
+    args = vars(parser.parse_args())
+    ConvertMSA(args)