PyPI - partis-bcr - Versions diffs - 1.0.0__py3-none-any.whl → 1.0.1__py3-none-any.whl - Mend

partis-bcr 1.0.0py3-none-any.whl → 1.0.1py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (103) hide show

bin/FastTree +0 -0
bin/add-chimeras.py +59 -0
bin/add-seqs-to-outputs.py +81 -0
bin/bcr-phylo-run.py +799 -0
bin/build.sh +24 -0
bin/cf-alleles.py +97 -0
bin/cf-germlines.py +57 -0
bin/cf-linearham.py +199 -0
bin/chimera-plot.py +76 -0
bin/choose-partially-paired.py +143 -0
bin/circle-plots.py +30 -0
bin/compare-plotdirs.py +298 -0
bin/diff-parameters.py +133 -0
bin/docker-hub-push.sh +6 -0
bin/extract-pairing-info.py +55 -0
bin/gcdyn-simu-run.py +223 -0
bin/gctree-run.py +244 -0
bin/get-naive-probabilities.py +126 -0
bin/iqtree-1.6.12 +0 -0
bin/lonr.r +1020 -0
bin/makeHtml +52 -0
bin/mds-run.py +46 -0
bin/parse-output.py +277 -0
bin/partis +1869 -0
bin/partis-pip +116 -0
bin/partis.py +1869 -0
bin/plot-gl-set-trees.py +519 -0
bin/plot-hmms.py +151 -0
bin/plot-lb-tree.py +427 -0
bin/raxml-ng +0 -0
bin/read-bcr-phylo-trees.py +38 -0
bin/read-gctree-output.py +166 -0
bin/run-chimeras.sh +64 -0
bin/run-dtr-scan.sh +25 -0
bin/run-paired-loci.sh +100 -0
bin/run-tree-metrics.sh +88 -0
bin/smetric-run.py +62 -0
bin/split-loci.py +317 -0
bin/swarm-2.1.13-linux-x86_64 +0 -0
bin/test-germline-inference.py +425 -0
bin/tree-perf-run.py +194 -0
bin/vsearch-2.4.3-linux-x86_64 +0 -0
bin/vsearch-2.4.3-macos-x86_64 +0 -0
bin/xvfb-run +194 -0
partis_bcr-1.0.1.data/scripts/cf-alleles.py +97 -0
partis_bcr-1.0.1.data/scripts/cf-germlines.py +57 -0
partis_bcr-1.0.1.data/scripts/extract-pairing-info.py +55 -0
partis_bcr-1.0.1.data/scripts/gctree-run.py +244 -0
partis_bcr-1.0.1.data/scripts/parse-output.py +277 -0
partis_bcr-1.0.1.data/scripts/split-loci.py +317 -0
partis_bcr-1.0.1.data/scripts/test.py +1005 -0
{partis_bcr-1.0.0.dist-info → partis_bcr-1.0.1.dist-info}/METADATA +1 -1
{partis_bcr-1.0.0.dist-info → partis_bcr-1.0.1.dist-info}/RECORD +101 -50
partis_bcr-1.0.1.dist-info/top_level.txt +1 -0
{partis → python}/glutils.py +1 -1
python/main.py +30 -0
{partis → python}/plotting.py +10 -1
{partis → python}/treeutils.py +18 -16
{partis → python}/utils.py +14 -7
partis/main.py +0 -59
partis_bcr-1.0.0.dist-info/top_level.txt +0 -1
{partis_bcr-1.0.0.dist-info → partis_bcr-1.0.1.dist-info}/WHEEL +0 -0
{partis_bcr-1.0.0.dist-info → partis_bcr-1.0.1.dist-info}/entry_points.txt +0 -0
{partis_bcr-1.0.0.dist-info → partis_bcr-1.0.1.dist-info}/licenses/COPYING +0 -0
{partis → python}/__init__.py +0 -0
{partis → python}/alleleclusterer.py +0 -0
{partis → python}/allelefinder.py +0 -0
{partis → python}/alleleremover.py +0 -0
{partis → python}/annotationclustering.py +0 -0
{partis → python}/baseutils.py +0 -0
{partis → python}/cache/__init__.py +0 -0
{partis → python}/cache/cached_uncertainties.py +0 -0
{partis → python}/clusterpath.py +0 -0
{partis → python}/coar.py +0 -0
{partis → python}/corrcounter.py +0 -0
{partis → python}/datautils.py +0 -0
{partis → python}/event.py +0 -0
{partis → python}/fraction_uncertainty.py +0 -0
{partis → python}/gex.py +0 -0
{partis → python}/glomerator.py +0 -0
{partis → python}/hist.py +0 -0
{partis → python}/hmmwriter.py +0 -0
{partis → python}/hutils.py +0 -0
{partis → python}/indelutils.py +0 -0
{partis → python}/lbplotting.py +0 -0
{partis → python}/mds.py +0 -0
{partis → python}/mutefreqer.py +0 -0
{partis → python}/paircluster.py +0 -0
{partis → python}/parametercounter.py +0 -0
{partis → python}/paramutils.py +0 -0
{partis → python}/partitiondriver.py +0 -0
{partis → python}/partitionplotter.py +0 -0
{partis → python}/performanceplotter.py +0 -0
{partis → python}/plotconfig.py +0 -0
{partis → python}/processargs.py +0 -0
{partis → python}/prutils.py +0 -0
{partis → python}/recombinator.py +0 -0
{partis → python}/scanplot.py +0 -0
{partis → python}/seqfileopener.py +0 -0
{partis → python}/treegenerator.py +0 -0
{partis → python}/viterbicluster.py +0 -0
{partis → python}/vrc01.py +0 -0
{partis → python}/waterer.py +0 -0

bin/test-germline-inference.py ADDED Viewed

@@ -0,0 +1,425 @@
+#!/usr/bin/env python3
+from __future__ import absolute_import, division, unicode_literals
+from __future__ import print_function
+import numpy
+import copy
+import random
+import argparse
+import time
+import sys
+import os
+import glob
+import colored_traceback.always
+from subprocess import check_call
+sys.path.insert(1, '.') #'./python')
+from pathlib import Path
+partis_dir = str(Path(__file__).parent.parent)
+import python.utils as utils
+import python.glutils as glutils
+import python.processargs as processargs
+# ----------------------------------------------------------------------------------------
+def cov_cmd():
+    return 'coverage3 run --append'
+# ----------------------------------------------------------------------------------------
+def get_outfname(args, method, annotation_performance_plots=False, return_parent_gl_dir=False):
+    outdir = args.outdir + '/' + method
+    if not annotation_performance_plots:  # default: output is igh/ighv.fasta
+        if method == 'partis' or method == 'full':  # parameter directory, not regular file (although, could change it to the gls .fa in sw/)
+            outdir += '/sw/germline-sets'
+        if not return_parent_gl_dir:
+            return glutils.get_fname(outdir, args.locus, 'v')
+        else:
+            return outdir
+    else:  # product of running partis annotation with --plot-annotation-performance
+        return outdir + '/annotation-performance-plots'
+# ----------------------------------------------------------------------------------------
+def simulate(args):
+    # Build and run the 'partis simulate' command that writes the simulated sample to <args.simfname>.
+    # Side effects: sets args.allele_prevalence_fname, and writes the germline set used for
+    # simulation to <args.outdir>/germlines/simulation.
+    # Does nothing if utils.output_exists() says <args.simfname> is already there.
+    if utils.output_exists(args, args.simfname):
+        return
+    cmd_str = args.partis_path + ' simulate --n-sim-events ' + str(args.n_sim_events) + ' --outfname ' + args.simfname + ' --n-leaves ' + str(args.n_leaves) + ' --rearrange-from-scratch --force-dont-generate-germline-set --shm-parameter-dir ' + partis_dir + '/data/recombinator/scratch-parameters'
+    cmd_str += ' --allow-nonfunctional-scratch-seqs'
+    if args.n_leaf_distribution is None:
+        cmd_str += ' --constant-number-of-leaves'
+    else:
+        cmd_str += ' --n-leaf-distribution ' + args.n_leaf_distribution
+    if args.mut_mult is not None:
+        cmd_str += ' --mutation-multiplier ' + str(args.mut_mult)
+    if args.root_mrca_weibull_parameter is not None:
+        cmd_str += ' --root-mrca-weibull-parameter ' + str(args.root_mrca_weibull_parameter)
+    if args.n_procs is not None:
+        cmd_str += ' --n-procs ' + str(args.n_procs)
+    if args.slurm:
+        cmd_str += ' --batch-system slurm'
+    args.allele_prevalence_fname = args.workdir + '/allele-prevalence-freqs.csv'
+    # figure what genes we're using
+    if args.gls_gen:
+        # start from the full default germline set and let glutils generate the simulation set
+        sglfo = glutils.read_glfo(args.default_germline_dir, locus=args.locus)
+        glutils.remove_v_genes_with_bad_cysteines(sglfo)
+        glutils.generate_germline_set(sglfo, args, new_allele_info=args.new_allele_info, dont_remove_template_genes=args.dont_remove_template_genes, debug=True)
+        cmd_str += ' --allele-prevalence-fname ' + args.allele_prevalence_fname
+    else:
+        # start from only the requested v/d/j genes, then add any new alleles described by args.new_allele_info
+        sglfo = glutils.read_glfo(args.default_germline_dir, locus=args.locus, only_genes=(args.sim_v_genes + args.dj_genes))
+        added_snp_names = glutils.generate_new_alleles(sglfo, args.new_allele_info, debug=True, remove_template_genes=(not args.dont_remove_template_genes))  # NOTE template gene removal is the default for glutils.generate_germline_set
+        if args.allele_prevalence_freqs is not None:
+            if not utils.is_normed(args.allele_prevalence_freqs):
+                raise Exception('--allele-prevalence-freqs %s not normalized' % args.allele_prevalence_freqs)
+            if len(args.allele_prevalence_freqs) != len(sglfo['seqs']['v']):  # already checked when parsing args, but, you know...
+                raise Exception('--allele-prevalence-freqs %d not the same length as sglfo %d' % (len(args.allele_prevalence_freqs), len(sglfo['seqs']['v'])))
+            # each gene in <gene_list> is paired (by index) with the frequency at the same position in args.allele_prevalence_freqs
+            # NOTE(review): list(set(...)) has no guaranteed order, so when new alleles were added the gene/frequency pairing may vary between runs -- confirm this is intended
+            gene_list = sorted(sglfo['seqs']['v']) if len(added_snp_names) == 0 else list(set(args.sim_v_genes)) + added_snp_names
+            prevalence_freqs = {'v' : {g : f for g, f in zip(gene_list, args.allele_prevalence_freqs)}, 'd' : {}, 'j' : {}}
+            glutils.write_allele_prevalence_freqs(prevalence_freqs, args.allele_prevalence_fname)
+            cmd_str += ' --allele-prevalence-fname ' + args.allele_prevalence_fname
+    glutils.write_glfo(args.outdir + '/germlines/simulation', sglfo)
+    cmd_str += ' --initial-germline-dir ' + args.outdir + '/germlines/simulation'
+    # glutils.print_glfo(sglfo)
+    # run simulation
+    if args.seed is not None:
+        cmd_str += ' --random-seed ' + str(args.seed)
+    utils.simplerun(cmd_str, dryrun=args.dryrun)
+# ----------------------------------------------------------------------------------------
+def run_other_method(args, method):
+    if method not in ['tigger-default', 'tigger-tuned', 'igdiscover']:  # really just to make it easier to search for this fcn
+        assert False
+    assert args.n_max_queries is None
+    if utils.output_exists(args, get_outfname(args, method)):
+        return
+    simfasta = utils.getprefix(args.simfname) + '.fa'
+    utils.csv_to_fasta(args.simfname, outfname=simfasta, overwrite=False, remove_duplicates=True)
+    cmd = './test/%s-run.py' % method.split('-')[0]
+    if method == 'tigger-tuned':
+        cmd += ' --tuned-tigger-params'
+    cmd += ' --infname ' + simfasta
+    cmd += ' --outfname ' + get_outfname(args, method)
+    if args.species != 'human':
+        cmd += ' --species %s' % args.species
+    if args.overwrite:
+        cmd += ' --overwrite'
+    if args.gls_gen:
+        cmd += ' --gls-gen'
+        cmd += ' --glfo-dir ' + partis_dir + '/' + args.default_germline_dir  # the partis mehods have this as the default internally, but we want/have to set it explicitly here
+    else:
+        cmd += ' --glfo-dir ' + args.inf_glfo_dir
+    cmd += ' --simulation-germline-dir ' + args.outdir + '/germlines/simulation'  # alleleclusterer is the only one that really uses this, but for now I want its dbg output to have the sim info
+    if method != 'igdiscover':  # for now we're saving all the igdiscover output/intermediate files, so we write them to an output dir
+        cmd += ' --workdir ' + args.workdir + '/' + method
+    if args.n_procs is not None:
+        cmd += ' --n-procs ' + str(args.n_procs)
+    if args.slurm:
+        cmd += ' --slurm'
+    utils.simplerun(cmd, dryrun=args.dryrun)
+# ----------------------------------------------------------------------------------------
+def run_performance_plot(args, method):
+    perf_outdir = get_outfname(args, method, annotation_performance_plots=True)
+    if utils.output_exists(args, perf_outdir):
+        return
+    cmd_str = args.partis_path + ' cache-parameters --infname ' + args.simfname + ' --plot-annotation-performance'
+    cmd_str += ' --is-simu --simulation-germline-dir ' + args.outdir + '/germlines/simulation'
+    cmd_str += ' --initial-germline-dir ' + get_outfname(args, method, return_parent_gl_dir=True)  # i.e. use the inferred glfo from <method>
+    cmd_str += ' --parameter-dir ' + perf_outdir + '/dummy-parameter-dir'
+    cmd_str += ' --plotdir ' + perf_outdir
+    cmd_str += ' --only-smith-waterman --leave-default-germline --dont-write-parameters'  # i.e. we really want to annotate, not cache parameters, but then it'd look for a parameter dir
+    if args.n_procs is not None:
+        cmd_str += ' --n-procs ' + str(args.n_procs)
+    if args.n_max_queries is not None:
+        cmd_str += ' --n-max-queries ' + str(args.n_max_queries)  # NOTE do *not* use --n-random-queries, since it'll change the cluster size distribution
+    if args.slurm:
+        cmd_str += ' --batch-system slurm'
+    if args.seed is not None:
+        cmd_str += ' --random-seed ' + str(args.seed)
+    utils.simplerun(cmd_str, dryrun=args.dryrun)
+# ----------------------------------------------------------------------------------------
+def run_partis_parameter_cache(args, method):
+    if utils.output_exists(args, get_outfname(args, method)):
+        return
+    paramdir = args.outdir + '/' + method
+    plotdir = args.outdir + '/' + method + '/plots'
+    # remove any old sw cache files
+    sw_cachefiles = glob.glob(paramdir + '/sw-cache-*.csv')
+    if len(sw_cachefiles) > 0:
+        for cachefname in sw_cachefiles:
+            check_call(['rm', '-v', cachefname])
+            sw_cache_gldir = cachefname.replace('.csv', '-glfo')
+            if os.path.exists(sw_cache_gldir):  # if stuff fails halfway through, you can get one but not the other
+                glutils.remove_glfo_files(sw_cache_gldir, args.locus)
+                # os.rmdir(sw_cache_gldir)
+    # generate germline set and cache parameters
+    cmd_str = args.partis_path + ' cache-parameters --infname ' + args.simfname + ' --only-smith-waterman'
+    cmd_str += ' --initial-germline-dir %s' % (args.default_germline_dir if args.gls_gen else args.inf_glfo_dir)
+    if method == 'partis':
+        cmd_str += ' --debug-allele-finding' # --always-find-new-alleles'
+        cmd_str += ' --is-simu --simulation-germline-dir ' + args.outdir + '/germlines/simulation'  # alleleclusterer is the only one that really uses this, but for now I want its dbg output to have the sim info
+        if args.allele_cluster:
+            cmd_str += ' --allele-cluster'
+            if args.kmeans_allele_cluster:
+                cmd_str += ' --kmeans-allele-cluster'
+    elif method == 'full':
+        cmd_str += ' --leave-default-germline'
+    else:
+        assert False
+    if args.species != 'human':
+        cmd_str += ' --species %s' % args.species
+    if args.n_procs is not None:
+        cmd_str += ' --n-procs ' + str(args.n_procs)
+    if args.n_max_queries is not None:
+        cmd_str += ' --n-max-queries ' + str(args.n_max_queries)  # NOTE do *not* use --n-random-queries, since it'll change the cluster size distribution
+    if args.slurm:
+        cmd_str += ' --batch-system slurm'
+    cmd_str += ' --parameter-dir ' + paramdir
+    cmd_str += ' --plotdir ' + plotdir
+    if args.seed is not None:
+        cmd_str += ' --random-seed ' + str(args.seed)
+    if args.plot_and_fit_absolutely_everything is not None:
+        cmd_str += ' --plot-and-fit-absolutely-everything ' + str(args.plot_and_fit_absolutely_everything)
+    utils.simplerun(cmd_str, dryrun=args.dryrun)
+# ----------------------------------------------------------------------------------------
+def write_inf_glfo(args):  # read default glfo, restrict it to the specified alleles, and write to somewhere where all the methods can read it
+    # NOTE this dir should *not* be modified by any of the methods
+    inf_glfo = glutils.read_glfo(args.default_germline_dir, locus=args.locus, only_genes=args.inf_v_genes + args.dj_genes)
+    print('  writing initial inference glfo with %d v: %s' % (len(inf_glfo['seqs']['v']), ' '.join([utils.color_gene(g) for g in inf_glfo['seqs']['v']])))
+    glutils.write_glfo(args.inf_glfo_dir, inf_glfo)
+# ----------------------------------------------------------------------------------------
+def run_tests(args):
+    print('seed %d' % args.seed)
+    # all fcns return immediately if output already exists
+    if 'simu' in args.methods:
+        simulate(args)
+        args.methods.remove('simu')
+    if not args.gls_gen:
+        write_inf_glfo(args)
+    for method in args.methods:
+        if args.plot_annotation_performance:
+            run_performance_plot(args, method)
+        elif method == 'partis' or method == 'full':
+            run_partis_parameter_cache(args, method)
+        else:
+            run_other_method(args, method)
+# ----------------------------------------------------------------------------------------
+def multiple_tests(args):
+    def getlogdir(iproc):
+        logdir = args.outdir + '/' + str(iproc) + '/logs'
+        if args.plot_annotation_performance:
+            logdir += '/annotation-performance-plots'
+        return logdir + '/' + '-'.join(args.methods)
+    def cmd_str(iproc):
+        clist = copy.deepcopy(sys.argv)
+        utils.remove_from_arglist(clist, '--n-tests', has_arg=True)
+        utils.remove_from_arglist(clist, '--iteststart', has_arg=True)
+        utils.replace_in_arglist(clist, '--outdir', args.outdir + '/' + str(iproc))
+        utils.replace_in_arglist(clist, '--seed', str(args.seed + iproc))
+        # clist.append('--slurm')
+        return ' '.join(clist)
+    for iproc in range(args.iteststart, args.n_tests):  # don't overwrite old log files... need to eventually fix this so it isn't necessary
+        def lfn(iproc, ilog):
+            logfname =  args.outdir + '/' + str(iproc) + '/log'
+            if ilog > 0:
+                logfname += '.' + str(ilog)
+            return logfname
+    cmdfos = [{'cmd_str' : cmd_str(iproc),
+               'workdir' : args.workdir + '/' + str(iproc),
+               'logdir' : getlogdir(iproc),
+               'outfname' : args.outdir + '/' + str(iproc)}
+              for iproc in range(args.iteststart, args.n_tests)]
+    if args.dryrun:
+        for iproc in range(args.iteststart, args.n_tests):
+            utils.simplerun(cmdfos[iproc - args.iteststart]['cmd_str'], dryrun=True)
+        return
+    for iproc in range(args.iteststart, args.n_tests):
+        logd = getlogdir(iproc)
+        if os.path.exists(logd + '/log'):
+            ilog = 0
+            while os.path.exists(logd + '/log.' + str(ilog)):
+                ilog += 1
+            check_call(['mv', '-v', logd + '/log', logd + '/log.' + str(ilog)])
+    print('  look for logs in %s' % args.outdir)
+    utils.run_cmds(cmdfos, debug='write')
+# ----------------------------------------------------------------------------------------
+# # ----------------------------------------------------------------------------------------
+# from hist import Hist
+# import plotting
+# fig, ax = plotting.mpl_init()
+# ntrees = 1000
+# distrs = [
+#     # (1.5, 'geo'),
+#     # (3, 'geo'),
+#     (10, 'geo'),
+#     # (25, 'geo'),
+#     # (2.3, 'zipf'),
+#     # (1.8, 'zipf'),
+#     # (1.3, 'zipf'),
+# ]
+# # ----------------------------------------------------------------------------------------
+# def getsubsample(vals):
+#     print vals
+#     iclust = 0
+#     seqs = []
+#     for v in vals:
+#         seqs += [iclust for _ in range(v)]
+#         iclust += 1
+#     print seqs
+#     subseqs = numpy.random.choice(seqs, size=ntrees, replace=False)
+#     # subseqs = seqs[:ntrees]
+#     print subseqs
+#     import itertools
+#     subvals = []
+#     for _, group in itertools.groupby(sorted(subseqs)):
+#         for what in set(group):
+#             subg = [s for s in subseqs if s == what]
+#             print what, len(subg)
+#             subvals.append(len(subg))
+#     print subvals
+#     return subvals
+# ih = 0
+# for n_leaves, fcn in distrs:
+#     if fcn == 'zipf':
+#         vals = numpy.random.zipf(n_leaves, size=ntrees)  # NOTE <n_leaves> is not the mean here
+#     elif fcn == 'geo':
+#         vals = numpy.random.geometric(1. / n_leaves, size=ntrees)
+#     else:
+#         assert False
+#     nbins = 100
+#     htmp = Hist(nbins, -0.5, nbins - 0.5)
+#     for v in vals:
+#         htmp.fill(v)
+#     htmp.mpl_plot(ax, color=plotting.default_colors[ih], errors=False, label='%s %.1f' % (fcn, numpy.mean(vals)))
+# # ----------------------------------------------------------------------------------------
+#     hsub = Hist(nbins, -0.5, nbins - 0.5)
+#     subvals = getsubsample(vals)
+#     for v in subvals:
+#         hsub.fill(v)
+#     hsub.mpl_plot(ax, color=plotting.default_colors[ih], errors=False, label='%s %.1f' % (fcn, numpy.mean(subvals)), linestyle='--')
+# # ----------------------------------------------------------------------------------------
+#     ih += 1
+# plotting.mpl_finish(ax, utils.fsdir() + '/partis/tmp/tmp', 'baz', xbounds=(0.9, nbins), log='y')
+# sys.exit()
+# # ----------------------------------------------------------------------------------------
+# usage examples, passed to the argument parser below as its epilog
+example_str = '\n    '.join(['example usage:',
+                             'one new allele separated by 3 snps from existing allele:',
+                             '    ./bin/test-germline-inference.py --n-sim-events 2000 --n-procs 10 --sim-v-genes=IGHV1-18*01 --inf-v-genes=IGHV1-18*01 --snp-positions 27,55,88',
+                             'one new allele [i.e. that the inference doesn\'t know about, but that in this case is in IMGT] separated by 1 snp from existing allele:',
+                             '    ./bin/test-germline-inference.py --n-sim-events 2000 --n-procs 10 --sim-v-genes=IGHV4-39*01:IGHV4-39*02 --inf-v-genes=IGHV4-39*01',
+                             'generate a full germline set for simulation, and then try to infer it:',
+                             '    ./bin/test-germline-inference.py --n-sim-events 2000 --n-procs 10 --gls-gen'])
+# help formatter that inherits from both argparse.RawTextHelpFormatter and argparse.ArgumentDefaultsHelpFormatter
+class MultiplyInheritedFormatter(argparse.RawTextHelpFormatter, argparse.ArgumentDefaultsHelpFormatter):
+    pass
+formatter_class = MultiplyInheritedFormatter  # NOTE(review): the parser below is given the class directly, so this alias looks unused here -- confirm before removing
+parser = argparse.ArgumentParser(formatter_class=MultiplyInheritedFormatter, epilog=example_str)
+# command line arguments (several list-valued ones are colon-separated strings that get split after parsing)
+parser.add_argument('--n-sim-events', type=int, default=20, help='number of simulated rearrangement events')
+parser.add_argument('--n-max-queries', type=int, help='number of queries to use for inference from the simulation sample')
+parser.add_argument('--n-leaves', type=float, default=1., help='see bin/partis --help')
+parser.add_argument('--n-leaf-distribution', help='see bin/partis --help')
+parser.add_argument('--root-mrca-weibull-parameter', type=float, help='see bin/partis --help')
+parser.add_argument('--n-procs', type=int)
+parser.add_argument('--seed', type=int, default=int(time.time()), help='random seed')  # defaults to the current time, so it's never None unless set that way elsewhere
+parser.add_argument('--gls-gen', action='store_true', help='generate a random germline set from scratch (parameters specified above), and infer a germline set from scratch, instead of using --sim-v-genes, --dj-genes, --inf-v-genes.')
+parser.add_argument('--sim-v-genes', default='IGHV4-39*01:IGHV4-39*08', help='V genes to use for simulation')
+parser.add_argument('--inf-v-genes', default='IGHV4-39*01', help='V genes to use for inference')
+parser.add_argument('--dj-genes', default='IGHD6-19*01:IGHJ4*02', help='D and J genes to use for both simulation and inference')
+parser.add_argument('--snp-positions', help='colon-separated list (length must equal length of <--sim-v-genes>) of comma-separated snp positions for each gene, e.g. for two genes you might have \'3,71:45\'')
+parser.add_argument('--nsnp-list', help='colon-separated list (length must equal length of <--sim-v-genes> unless --gls-gen) of the number of snps to generate for each gene (each snp at a random position). If --gls-gen, then this still gives the number of snpd genes, but it isn\'t assumed to be the same length as anything [i.e. we don\'t yet know how many v genes there\'ll be]')
+parser.add_argument('--indel-positions', help='see --snp-positions (a.t.m. the indel length distributions are hardcoded)')
+parser.add_argument('--nindel-list', help='see --nsnp-list')
+parser.add_argument('--n-genes-per-region', default='::', help='see bin/partis --help')
+parser.add_argument('--n-sim-alleles-per-gene', default='::', help='see bin/partis --help')
+parser.add_argument('--min-sim-allele-prevalence-freq', default=glutils.default_min_allele_prevalence_freq, type=float, help='see bin/partis --help')
+parser.add_argument('--allele-prevalence-freqs', help='colon-separated list of allele prevalence frequencies, including newly-generated snpd genes (ordered alphabetically)')
+parser.add_argument('--dont-remove-template-genes', action='store_true', help='when generating snps, *don\'t* remove the original gene before simulation')  # NOTE template gene removal is the default for glutils.generate_germline_set
+parser.add_argument('--mut-mult', type=float, help='DO NOT USE use --mutation-multiplier (see below)')  # args.mut_mult gets overwritten with args.mutation_multiplier after parsing
+parser.add_argument('--mutation-multiplier', type=float, help='see bin/partis --help')  # see note below
+parser.add_argument('--slurm', action='store_true')
+parser.add_argument('--overwrite', action='store_true')
+parser.add_argument('--dryrun', action='store_true')
+parser.add_argument('--allele-cluster', action='store_true', help='see bin/partis --help')
+parser.add_argument('--kmeans-allele-cluster', action='store_true', help='see bin/partis --help')
+parser.add_argument('--plot-annotation-performance', action='store_true', help='see bin/partis --help')
+parser.add_argument('--methods', default='simu:partis', help='colon-separated list of methods to run. By default runs simulation, and then partis inference (igdiscover and tigger, if installed, are the other options)')
+parser.add_argument('--outdir', default=utils.fsdir() + '/partis/allele-finder')
+parser.add_argument('--inf-glfo-dir', help='default set below')
+parser.add_argument('--simfname', help='default set below')
+parser.add_argument('--workdir', default=utils.fsdir() + '/_tmp/hmms/' + str(random.randint(0, 999999)))  # the random suffix is drawn before random.seed() is called below
+parser.add_argument('--n-tests', type=int, help='instead of just running once, run <N> independent tests simultaneously')
+parser.add_argument('--iteststart', type=int, default=0, help='for use with --n-tests, if you want to add more tests on')
+parser.add_argument('--plot-and-fit-absolutely-everything', type=int, help='fit every single position for this <istart> and write every single corresponding plot (slow as hell, and only for debugging/making plots for paper)')
+parser.add_argument('--partis-path', default='./bin/partis')
+parser.add_argument('--prepend-coverage-command', action='store_true', help='see bin/partis --help')
+parser.add_argument('--species', default='human', choices=('human', 'macaque'))
+parser.add_argument('--locus', default='igh')
+parser.add_argument('--allele-prevalence-fname', help='for internal use only (set above)')  # simulate() assigns this unconditionally
+args = parser.parse_args()
+# post-process the parsed arguments (split lists, fill in defaults that depend on other args), then run
+assert args.locus == 'igh'  # would just need to update some things, e.g. propagate through to the various methods
+args.methods = utils.get_arg_list(args.methods)
+available_methods = set(['simu', 'partis', 'full', 'tigger-default', 'tigger-tuned', 'igdiscover'])
+if len(set(args.methods) - available_methods) > 0:
+    raise Exception('unexpected --methods: %s' % ' '.join(set(args.methods) - available_methods))
+# args.default_germline_dir = 'old-glfo/%s' % args.species  # 'data/germlines/%s' % args.species  # NOTE gad damnit, I just deleted old-glfo, had no idea what it was for
+print('  %s hopefully old-glfo/ isn\'t needed to recreate old results (see comment)' % utils.color('yellow', 'note:'))
+args.default_germline_dir = 'data/germlines/%s' % args.species  # 'data/germlines/%s' % args.species
+args.generate_germline_set = args.gls_gen  # for compatibility with bin/partis (i.e. so they can both use the fcn in processargs, but I don't have to rewrite either)
+args.mut_mult = args.mutation_multiplier  # for compatibility with bin/partis (i.e. so they can both use the fcn in processargs, but I don't have to rewrite either)
+if args.generate_germline_set:  # if we're generating/inferring a whole germline set these are either set automatically or not used
+    delattr(args, 'sim_v_genes')
+    delattr(args, 'inf_v_genes')
+    delattr(args, 'dj_genes')
+    args.allele_prevalence_freqs = None
+    args.inf_glfo_dir = None
+else:
+    # split the colon-separated gene (and frequency) lists
+    args.dj_genes = utils.get_arg_list(args.dj_genes)
+    args.sim_v_genes = utils.get_arg_list(args.sim_v_genes)
+    args.inf_v_genes = utils.get_arg_list(args.inf_v_genes)
+    args.allele_prevalence_freqs = utils.get_arg_list(args.allele_prevalence_freqs, floatify=True)
+processargs.process_gls_gen_args(args)  # well, also does stuff with non-gls-gen new allele args
+# defaults that depend on --outdir
+if args.inf_glfo_dir is None:
+    args.inf_glfo_dir = args.outdir + '/germlines/inference'
+if args.simfname is None:
+    args.simfname = args.outdir + '/simu.yaml'
+if args.prepend_coverage_command:
+    args.partis_path = '%s %s' % (cov_cmd(), args.partis_path)
+if args.seed is not None:
+    # seed both the stdlib and numpy random number generators
+    random.seed(args.seed)
+    numpy.random.seed(args.seed)
+# either launch several independent tests as subprocesses, or run a single test in this process
+if args.n_tests is not None:
+    multiple_tests(args)
+else:
+    run_tests(args)

bin/tree-perf-run.py ADDED Viewed

@@ -0,0 +1,194 @@
+#!/usr/bin/env python3
+from __future__ import absolute_import, division, unicode_literals
+from __future__ import print_function
+import sys
+import csv
+from io import open
+csv.field_size_limit(sys.maxsize)  # make sure we can write very large csv fields
+import os
+import argparse
+import colored_traceback.always
+import json
+import dendropy
+# if you move this script, you'll need to change this method of getting the imports
+from pathlib import Path
+partis_dir = str(Path(__file__).parent.parent)
+sys.path.insert(1, partis_dir) # + '/python')
+import python.utils as utils
+import python.glutils as glutils
+import python.treeutils as treeutils
+import python.lbplotting as lbplotting
+import python.coar as coar
+# ----------------------------------------------------------------------------------------
+helpstr = """
+"""  # NOTE the description passed to argparse is just a newline (no actual help text yet)
+class MultiplyInheritedFormatter(argparse.RawTextHelpFormatter, argparse.ArgumentDefaultsHelpFormatter):  # combine argparse's raw-text formatting with automatic display of argument defaults
+    pass
+formatter_class = MultiplyInheritedFormatter  # NOTE(review): this module-level name isn't referenced again in the visible part of this script (the parser below uses the class directly)
+parser = argparse.ArgumentParser(formatter_class=MultiplyInheritedFormatter, description=helpstr)
+parser.add_argument('--true-tree-file', required=True, help='partis yaml file with true annotations from which to extract true trees')
+parser.add_argument('--inferred-tree-file', required=True, help='partis yaml file with inferred annotations and inferred trees')
+parser.add_argument('--outdir')  # if not set, the metrics are still calculated but nothing is written (see end of script)
+parser.add_argument('--metrics', default='coar:rf:mrca')  # presumably a colon-separated list (see default); restricted below to coar/rf/mrca
+parser.add_argument('--n-procs', type=int, help='NOTE not used, just putting here for consistency with other scripts')
+parser.add_argument('--overwrite', action='store_true', help='NOTE just for compatibility, not used atm')
+parser.add_argument('--itree', type=int, help='only run on tree/annotation with this index')
+parser.add_argument('--debug', type=int, default=0)
+args = parser.parse_args()
+args.metrics = utils.get_arg_list(args.metrics, choices=['coar', 'rf', 'mrca'])  # presumably str --> list, rejecting anything not in <choices> -- TODO confirm against utils.get_arg_list()
+_, tru_atn_list, _ = utils.read_output(args.true_tree_file)  # only the middle return value (the list of annotations) is used
+_, inf_atn_list, _ = utils.read_output(args.inferred_tree_file)  # same, for the inferred annotations
+naive_name = 'naive'  # key under which the naive seq is stored in the per-tree seq dicts (and which gets used for each tree's seed node)
+# ----------------------------------------------------------------------------------------
+def add_seqs_to_nodes(ttr, seqdict, tfn):  # set a .seq attribute on every node in tree <ttr>, looked up in <seqdict> (NOTE <tfn> isn't used in the body)
+    for node in ttr.preorder_node_iter():
+        node.seq = seqdict[naive_name if node is ttr.seed_node else node.taxon.label]  # seed node gets the naive seq, every other node is looked up by its taxon label (KeyError if the label isn't in <seqdict>)
+# ----------------------------------------------------------------------------------------
+def fix_seqs(atn_t, atn_i, tr_t, tr_i, seq_key='input_seqs', debug=False):  # inferred annotation has padded seqs, which means when the h and l seqs get smashed together (in paircluster.sumv() called from paircluster.make_fake_hl_pair_antns()) sometimes there's extra Ns that aren't in the true annotation [returns two dicts (true, inferred) mapping each uid, plus naive_name, to its N-stripped/fixed seq; NOTE <tr_i> isn't used in the body]
+    # ----------------------------------------------------------------------------------------
+    def combine_chain_seqs(uid, seq_i):  # basic idea is that we need to remove any N padding from waterer.py, then repad but just for translation
+        new_seq_i = []  # one entry per chain (h then l), joined together at the end
+        for tch in 'hl':  # tch is the chain character, presumably h[eavy] then l[ight]
+            cseq_i = utils.per_seq_val(atn_i, '%s_seqs'%tch, uid)
+            if cseq_i is None:  # inferred ancestral seqs won't have h/l seqs set (maybe also naive?)
+                if tch == 'h':  # no per-chain seq available, so split the combined seq at the h length recorded in cs_lens
+                    cseq_i = seq_i[ : cs_lens[tch]]  # NOTE needs cs_lens['h'] to already have been set by an earlier call (otherwise KeyError)
+                else:
+                    cseq_i = seq_i[cs_lens['h'] : ]
+                cseq_i = cseq_i.strip('N')
+            else:
+                cseq_i = cseq_i.strip('N')
+                # n_lstrip, n_rstrip = len(cseq_i) - len(cseq_i.lstrip('N')), len(cseq_i) - len(cseq_i.rstrip('N'))  # if this starts causing problems again, it might be worth doing something like this to keep track of n bases removed from each side, and making sure it's the same for all seqs
+                if tch not in cs_lens:
+                    cs_lens[tch] = len(cseq_i)  # keep track of the h/l seq lengths, so for inferred nodes where we don't know it, we can remove the same bases
+                assert cs_lens[tch] == len(cseq_i)  # they should all be the same
+            cseq_i = utils.pad_seq_for_translation(atn_i, cseq_i)
+            new_seq_i.append(cseq_i)
+        return ''.join(new_seq_i).strip('N')  # concatenate h + l, then strip any Ns from the outer ends of the combined seq
+    # ----------------------------------------------------------------------------------------
+    def check_seqs(uid, seq_i, seq_t, fix_counts, force=False, dont_fix=False):  # check/fix that any nodes that are in both trees have the same sequence
+        fix_counts['total'] += 1  # counts every seq that gets checked, whether or not it ends up being fixed
+        if seq_t == seq_i:
+            return False  # return whether we fixed it or not
+        # utils.color_mutants(seq_t, seq_i, align_if_necessary=True, print_result=True)
+        seq_i = combine_chain_seqs(uid, seq_i)  # rebuild the inferred seq from its h and l pieces
+        if seq_t is None:
+            assert force  # for seqs that are in inferred but not true, we already know we need to fix them (and how)
+        else:
+            if seq_t != seq_i and not dont_fix:  # <dont_fix> skips this crash-if-still-different check
+                print('%s tried to fix %s but seqs still different:' % (utils.wrnstr(), uid))
+                utils.color_mutants(seq_t, seq_i, print_result=True, align_if_necessary=True, ref_label='true ', seq_label='inf ')
+                assert False  # NOTE if you stop crashing here, you probably need to increment something in fix_counts
+            seqs_t[uid] = seq_t  # NOTE(review): at both call sites below <seq_t> is already the value from seqs_t for this uid
+        seqs_i[uid] = seq_i  # store the rebuilt inferred seq (seqs_t/seqs_i are the dicts from the enclosing function)
+        fix_counts['fixed'].append(uid)
+        return True
+    # ----------------------------------------------------------------------------------------
+    def check_all_lengths(seqs_t, seqs_i):  # check/fix that all seqs in both trees have the same length
+        lens_t, lens_i = [list(set(len(s) for s in slist.values())) for slist in [seqs_t, seqs_i]]  # list of the distinct seq lengths in each dict
+        true_len = utils.get_single_entry(lens_t)  # presumably raises if the true seqs don't all have one common length -- TODO confirm
+        if len(lens_i) == 1 and lens_i[0] == true_len:
+            return
+        tseq = list(seqs_t.values())[0]  # arbitrary true seq to compare/align against
+        for uid in [u for u, s in seqs_i.items() if len(s) != true_len]:
+            utils.color_mutants(tseq, seqs_i[uid], align_if_necessary=True, print_result=True, extra_str='        ', ref_label='arb. true ', seq_label=uid+' ')
+            _, new_seq = utils.align_seqs(tseq, seqs_i[uid])  # i added this to fix a case that i ended up fixing a different (much better) way, but it might be useful in future, so leaving here
+            seqs_i[uid] = new_seq.replace('-', utils.ambig_base)  # UGH
+            utils.color_mutants(tseq, seqs_i[uid], align_if_necessary=True, print_result=True, extra_str='        ', ref_label='arb. true ', seq_label=uid+' ')
+        raise Exception('different sequence lengths (probably from inferred internal nodes), see previous lines')  # NOTE always raises if we get this far, i.e. the realigned seqs above only matter for the printed before/after comparison
+    # ----------------------------------------------------------------------------------------
+    leaf_ids_t = [l.taxon.label for l in tr_t.leaf_node_iter() if l.taxon.label in atn_t['unique_ids']]  # labels of true tree leaves that are also in the true annotation
+    leaf_ids_i = [u for u in leaf_ids_t if u in atn_i['unique_ids']]  # inferred tree may swap internal/leaf nodes
+    if set(leaf_ids_i) != set(leaf_ids_t):
+        only_true, only_inf = set(leaf_ids_t) - set(leaf_ids_i), set(leaf_ids_i) - set(leaf_ids_t)  # NOTE(review): leaf_ids_i is built as a subset of leaf_ids_t, so only_inf is always empty
+        print('    %s inferred leaf ids not the same as true leaf ids when trying to fix seqs (this is probably ok, since the coar calculation will probably skip them).\n      %d extra true: %s\n      %d extra inf: %s' % (utils.wrnstr(), len(only_true), ' '.join(only_true), len(only_inf), ' '.join(only_inf)))
+    common_leaf_ids = set(leaf_ids_t) & set(leaf_ids_i)  # maybe missing ones would be ok? but don't want to mess with it, and for now we assume below that they're the same
+    seqs_t, seqs_i = [{u : utils.per_seq_val(atn, seq_key, u).strip('N') for u in atn['unique_ids']} for atn in (atn_t, atn_i)]  # uid : seq (with outer Ns stripped) for every uid in each annotation
+    seqs_t[naive_name], seqs_i[naive_name] = [a['naive_seq'].strip('N') for a in (atn_t, atn_i)]  # also add each annotation's naive seq, under the key naive_name
+    fixed, cs_lens, fix_counts = None, {}, {'fixed' : [], 'total' : 0}  # fixed: whether the common leaves needed fixing (None until the first one's checked); cs_lens: stripped h/l lengths; fix_counts: bookkeeping for the summary line below
+    for uid in common_leaf_ids:
+        tfx = check_seqs(uid, seqs_i[uid], seqs_t[uid], fix_counts)
+        if fixed is None:
+            fixed = tfx
+        assert tfx == fixed  # if we fix one, we should fix all of them
+    if fixed:
+        for uid in [u for u in atn_i['unique_ids'] if u not in leaf_ids_i] + [naive_name]:  # need to also fix any internal/inferred nodes
+            check_seqs(uid, seqs_i[uid], seqs_t.get(uid), fix_counts, force=True, dont_fix=uid==naive_name)  # force=True since the true seq may be missing (None); the naive seq is exempt from the crash-if-still-different check
+    print('    no nodes needed fixing (all seqs already the same for common true/inferred nodes)' if len(fix_counts['fixed'])==0 else '    fixed %d / %d nodes' % (len(fix_counts['fixed']), fix_counts['total']))
+    check_all_lengths(seqs_t, seqs_i)  # raises if the seq lengths still aren't all the same
+    if debug and len(fix_counts['fixed']) > 0:
+        print('      fixed seqs: %s' % ' '.join(sorted(fix_counts['fixed'])))
+    return seqs_t, seqs_i
+# ----------------------------------------------------------------------------------------
+def get_n_parsimony_trees(n_clusters):  # for each cluster index, grep that cluster's log file for the "number of trees with integer branch lengths:" line and return the list of resulting ints (NOTE the only call to this, near the end of the script, is commented out)
+# other way to get this number:
+#              with open('gctree_base.inference.parsimony_forest.p', 'rb') as fh:
+#                  forest = pickle.load(fh)
+#              n_parsimony_trees = forest._forest.count_histories()
+    n_ptree_list = []
+    for iclust in range(n_clusters):
+        logfn = '%s/%s/iclust-%d/log' % (os.path.dirname(args.inferred_tree_file), os.path.basename(args.inferred_tree_file).replace('-annotations.yaml', ''), iclust)  # <dir of inferred file>/<inferred file basename without '-annotations.yaml'>/iclust-<i>/log
+        out, err = utils.simplerun('grep "number of trees with integer branch lengths:" %s ' % logfn, shell=True, return_out_err=True, debug=False)  # <err> isn't used
+        n_ptree_list.append(int(out.split()[-1]))  # last whitespace-separated token of the grep output; NOTE this fails if the output is empty or doesn't end with an integer
+    return n_ptree_list
+# ----------------------------------------------------------------------------------------
+# don't need this now that i'm using --simultaneous-true-clonal-seqs (yes, ick)
+def trnfn(u): return u + '_contig_igh+igk'  # uid translation: append the suffix '_contig_igh+igk'
+utils.translate_uids(tru_atn_list, trfcn=trnfn, expect_missing=True)  # NOTE(review): the comment just above this block says the translation isn't needed any more, but this call is still active (and the 'couldn't find inferred annotation' error below suggests uncommenting it) -- confirm which is intended
+# ----------------------------------------------------------------------------------------
+jvals = {'coar' : [], 'rf' : [], 'mrca' : []}  # one list per metric, with one value appended for each tree that gets processed (stays empty for metrics not in --metrics)
+for itree, atn_t in enumerate(tru_atn_list):
+    if args.itree is not None and itree != args.itree:  # --itree restricts the run to the true annotation with this index
+        continue
+    print('  %d: starting true annotation with size %d' % (itree, len(atn_t['unique_ids'])))
+    atn_i = None
+    for tatn in inf_atn_list:  # use the first inferred annotation that shares at least one uid with this true annotation
+        common_ids = set(atn_t['unique_ids']) & set(tatn['unique_ids'])
+        if len(common_ids) > 0:
+            estr = '' if not args.debug else ' (missing %d: %s)' % (len(atn_t['unique_ids']) - len(common_ids), ' '.join(sorted(set(atn_t['unique_ids']) - common_ids)))
+            print('    found inferred annotation with %d / %d uids in common%s' % (len(common_ids), len(atn_t['unique_ids']), estr))
+            atn_i = tatn
+            break
+    if atn_i is None:
+        raise Exception('couldn\'t find inferred annotation for true annotation (looked in %d inferred annotations, maybe try uncommenting translation above): %s' % (len(inf_atn_list), ' '.join(atn_t['unique_ids'])))
+    dtree_t, dtree_i = [treeutils.get_dendro_tree(treestr=lbplotting.get_tree_in_line(l, is_true)) for is_true, l in [[True, atn_t], [False, atn_i]]]  # (true, inferred) trees, built from the tree string that lbplotting.get_tree_in_line() gets from each annotation (presumably dendropy trees)
+    if args.debug:
+        for tstr, ttr in zip(['true', 'inf'], [dtree_t, dtree_i]):
+            print('    %4s:' % tstr)
+            print(utils.pad_lines(treeutils.get_ascii_tree(dendro_tree=ttr, width=250)))  # , label_fcn=lambda l: l.replace('_contig_igh+igk', '')
+    seqs_t, seqs_i = fix_seqs(atn_t, atn_i, dtree_t, dtree_i, debug=args.debug)  # uid : seq dicts for true and inferred (raises if the seq lengths can't be made consistent)
+    for ttr, seqdict, tfn in zip([dtree_t, dtree_i], [seqs_t, seqs_i], [args.true_tree_file, args.inferred_tree_file]):
+        add_seqs_to_nodes(ttr, seqdict, tfn)  # attach the seqs to the tree nodes before calculating any metrics
+    if 'coar' in args.metrics:
+        jvals['coar'].append(coar.COAR(dtree_t, dtree_i, known_root=False, debug=args.debug))
+    if 'mrca' in args.metrics:
+        jvals['mrca'].append(treeutils.mrca_dist(dtree_t, dtree_i, debug=args.debug))
+    if 'rf' in args.metrics:
+        dts_t, dts_i = treeutils.sync_taxon_namespaces(dtree_t, dtree_i, only_leaves=True) #, debug=True)
+        # this is weighted (i.e. depends on edge length), could also use unweighted (fcn symmetric_difference()) [from /loc/dralph/.local/lib/python3.6/site-packages/dendropy/calculate/treecompare.py]
+        jvals['rf'].append(dendropy.calculate.treecompare.weighted_robinson_foulds_distance(dts_t, dts_i))
+        # print(treeutils.get_ete_rf(dtree_t, dtree_i)
+# if os.path.basename(args.inferred_tree_file).split('-')[0] == 'gctree':
+#     jvals['n-pars-trees'] = get_n_parsimony_trees(len(tru_atn_list))
+if args.outdir is None:  # no output location, so stop here with a successful exit code
+    print('  %s no --outdir specified, so not writing anything' % utils.wrnstr())
+    sys.exit(0)
+ofn = '%s/tree-perf-vals.yaml' % args.outdir
+print('  writing tree perf values to %s' % ofn)
+if not os.path.exists(args.outdir):  # create the output dir if it isn't there yet
+    os.makedirs(args.outdir)
+utils.jsdump(ofn, jvals)  # NOTE written with utils.jsdump() despite the .yaml suffix (presumably json-formatted -- TODO confirm)

bin/vsearch-2.4.3-linux-x86_64 ADDED Viewed

Binary file

bin/vsearch-2.4.3-macos-x86_64 ADDED Viewed

Binary file

partis-bcr 1.0.0__py3-none-any.whl → 1.0.1__py3-none-any.whl

partis-bcr 1.0.0py3-none-any.whl → 1.0.1py3-none-any.whl