PyPI - partis-bcr - Versions diffs - 1.0.0__py3-none-any.whl → 1.0.1__py3-none-any.whl - Mend

partis-bcr 1.0.0py3-none-any.whl → 1.0.1py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (103) hide show

bin/FastTree +0 -0
bin/add-chimeras.py +59 -0
bin/add-seqs-to-outputs.py +81 -0
bin/bcr-phylo-run.py +799 -0
bin/build.sh +24 -0
bin/cf-alleles.py +97 -0
bin/cf-germlines.py +57 -0
bin/cf-linearham.py +199 -0
bin/chimera-plot.py +76 -0
bin/choose-partially-paired.py +143 -0
bin/circle-plots.py +30 -0
bin/compare-plotdirs.py +298 -0
bin/diff-parameters.py +133 -0
bin/docker-hub-push.sh +6 -0
bin/extract-pairing-info.py +55 -0
bin/gcdyn-simu-run.py +223 -0
bin/gctree-run.py +244 -0
bin/get-naive-probabilities.py +126 -0
bin/iqtree-1.6.12 +0 -0
bin/lonr.r +1020 -0
bin/makeHtml +52 -0
bin/mds-run.py +46 -0
bin/parse-output.py +277 -0
bin/partis +1869 -0
bin/partis-pip +116 -0
bin/partis.py +1869 -0
bin/plot-gl-set-trees.py +519 -0
bin/plot-hmms.py +151 -0
bin/plot-lb-tree.py +427 -0
bin/raxml-ng +0 -0
bin/read-bcr-phylo-trees.py +38 -0
bin/read-gctree-output.py +166 -0
bin/run-chimeras.sh +64 -0
bin/run-dtr-scan.sh +25 -0
bin/run-paired-loci.sh +100 -0
bin/run-tree-metrics.sh +88 -0
bin/smetric-run.py +62 -0
bin/split-loci.py +317 -0
bin/swarm-2.1.13-linux-x86_64 +0 -0
bin/test-germline-inference.py +425 -0
bin/tree-perf-run.py +194 -0
bin/vsearch-2.4.3-linux-x86_64 +0 -0
bin/vsearch-2.4.3-macos-x86_64 +0 -0
bin/xvfb-run +194 -0
partis_bcr-1.0.1.data/scripts/cf-alleles.py +97 -0
partis_bcr-1.0.1.data/scripts/cf-germlines.py +57 -0
partis_bcr-1.0.1.data/scripts/extract-pairing-info.py +55 -0
partis_bcr-1.0.1.data/scripts/gctree-run.py +244 -0
partis_bcr-1.0.1.data/scripts/parse-output.py +277 -0
partis_bcr-1.0.1.data/scripts/split-loci.py +317 -0
partis_bcr-1.0.1.data/scripts/test.py +1005 -0
{partis_bcr-1.0.0.dist-info → partis_bcr-1.0.1.dist-info}/METADATA +1 -1
{partis_bcr-1.0.0.dist-info → partis_bcr-1.0.1.dist-info}/RECORD +101 -50
partis_bcr-1.0.1.dist-info/top_level.txt +1 -0
{partis → python}/glutils.py +1 -1
python/main.py +30 -0
{partis → python}/plotting.py +10 -1
{partis → python}/treeutils.py +18 -16
{partis → python}/utils.py +14 -7
partis/main.py +0 -59
partis_bcr-1.0.0.dist-info/top_level.txt +0 -1
{partis_bcr-1.0.0.dist-info → partis_bcr-1.0.1.dist-info}/WHEEL +0 -0
{partis_bcr-1.0.0.dist-info → partis_bcr-1.0.1.dist-info}/entry_points.txt +0 -0
{partis_bcr-1.0.0.dist-info → partis_bcr-1.0.1.dist-info}/licenses/COPYING +0 -0
{partis → python}/__init__.py +0 -0
{partis → python}/alleleclusterer.py +0 -0
{partis → python}/allelefinder.py +0 -0
{partis → python}/alleleremover.py +0 -0
{partis → python}/annotationclustering.py +0 -0
{partis → python}/baseutils.py +0 -0
{partis → python}/cache/__init__.py +0 -0
{partis → python}/cache/cached_uncertainties.py +0 -0
{partis → python}/clusterpath.py +0 -0
{partis → python}/coar.py +0 -0
{partis → python}/corrcounter.py +0 -0
{partis → python}/datautils.py +0 -0
{partis → python}/event.py +0 -0
{partis → python}/fraction_uncertainty.py +0 -0
{partis → python}/gex.py +0 -0
{partis → python}/glomerator.py +0 -0
{partis → python}/hist.py +0 -0
{partis → python}/hmmwriter.py +0 -0
{partis → python}/hutils.py +0 -0
{partis → python}/indelutils.py +0 -0
{partis → python}/lbplotting.py +0 -0
{partis → python}/mds.py +0 -0
{partis → python}/mutefreqer.py +0 -0
{partis → python}/paircluster.py +0 -0
{partis → python}/parametercounter.py +0 -0
{partis → python}/paramutils.py +0 -0
{partis → python}/partitiondriver.py +0 -0
{partis → python}/partitionplotter.py +0 -0
{partis → python}/performanceplotter.py +0 -0
{partis → python}/plotconfig.py +0 -0
{partis → python}/processargs.py +0 -0
{partis → python}/prutils.py +0 -0
{partis → python}/recombinator.py +0 -0
{partis → python}/scanplot.py +0 -0
{partis → python}/seqfileopener.py +0 -0
{partis → python}/treegenerator.py +0 -0
{partis → python}/viterbicluster.py +0 -0
{partis → python}/vrc01.py +0 -0
{partis → python}/waterer.py +0 -0

bin/xvfb-run ADDED Viewed

@@ -0,0 +1,194 @@
+#!/bin/sh
+# ----------------------------------------------------------------------------------------
+# NOTE copied from ubuntu system location in order to remove the '2>&1' at the bottom, see https://bugs.launchpad.net/ubuntu/+source/xorg-server/+bug/1059947
+# ----------------------------------------------------------------------------------------
+# This script starts an instance of Xvfb, the "fake" X server, runs a command
+# with that server available, and kills the X server when done.  The return
+# value of the command becomes the return value of this script, except in cases
+# where this script encounters an error.
+#
+# If anyone is using this to build a Debian package, make sure the package
+# Build-Depends on xvfb and xauth.
+set -e
+PROGNAME=xvfb-run
+SERVERNUM=99
+AUTHFILE=
+ERRORFILE=/dev/null
+XVFBARGS="-screen 0 640x480x8"
+LISTENTCP="-nolisten tcp"
+XAUTHPROTO=.
+# Query the terminal to establish a default number of columns to use for
+# displaying messages to the user.  This is used only as a fallback in the event
+# the COLUMNS variable is not set.  ($COLUMNS can react to SIGWINCH while the
+# script is running, and this cannot, only being calculated once.)
+DEFCOLUMNS=$(stty size 2>/dev/null | awk '{print $2}') || true
+if ! expr "$DEFCOLUMNS" : "[[:digit:]]\+$" >/dev/null 2>&1; then
+    DEFCOLUMNS=80
+fi
+# Display a message, wrapping lines at the terminal width.
+# All arguments are joined into a single string, prefixed with "$PROGNAME: ",
+# and piped through fmt.  The wrap width is $COLUMNS when that is set and
+# non-empty, otherwise the $DEFCOLUMNS fallback computed once at startup.
+message () {
+    echo "$PROGNAME: $*" | fmt -t -w ${COLUMNS:-$DEFCOLUMNS}
+}
+# Display an error message.
+# Formats the arguments with message(), adding an "error: " prefix, and sends
+# the result to stderr instead of stdout.
+error () {
+    message "error: $*" >&2
+}
+# Display a usage message.
+# If any arguments are given they are first reported via message() as a
+# "usage error".  The here-document delimiter is unquoted, so $PROGNAME,
+# $ERRORFILE, $SERVERNUM and $XVFBARGS are expanded to their current values.
+# Output goes to stdout; callers redirect it when reporting an error.
+# NOTE(review): the help text says the default auth file is ./.Xauthority, but
+# when -f is not given this script creates one inside a temporary directory.
+# NOTE(review): -w/--wait is accepted (and ignored) by the option parser below,
+# but is not listed here.
+usage () {
+    if [ -n "$*" ]; then
+        message "usage error: $*"
+    fi
+    cat <<EOF
+Usage: $PROGNAME [OPTION ...] COMMAND
+Run COMMAND (usually an X client) in a virtual X server environment.
+Options:
+-a        --auto-servernum          try to get a free server number, starting at
+                                    --server-num
+-e FILE   --error-file=FILE         file used to store xauth errors and Xvfb
+                                    output (default: $ERRORFILE)
+-f FILE   --auth-file=FILE          file used to store auth cookie
+                                    (default: ./.Xauthority)
+-h        --help                    display this usage message and exit
+-n NUM    --server-num=NUM          server number to use (default: $SERVERNUM)
+-l        --listen-tcp              enable TCP port listening in the X server
+-p PROTO  --xauth-protocol=PROTO    X authority protocol name to use
+                                    (default: xauth command's default)
+-s ARGS   --server-args=ARGS        arguments (other than server number and
+                                    "-nolisten tcp") to pass to the Xvfb server
+                                    (default: "$XVFBARGS")
+EOF
+}
+# Find a free server number by looking at .X*-lock files in /tmp.
+# Starts at the current value of $SERVERNUM and counts upwards until no regular
+# file /tmp/.X<n>-lock exists, then prints that number on stdout (callers
+# capture it with command substitution).
+find_free_servernum() {
+    # Sadly, the "local" keyword is not POSIX.  Leave the next line commented in
+    # the hope Debian Policy eventually changes to allow it in /bin/sh scripts
+    # anyway.
+    #local i
+    # (without "local", i is a plain shell variable shared with the rest of the script)
+    i=$SERVERNUM
+    while [ -f /tmp/.X$i-lock ]; do
+        i=$(($i + 1))
+    done
+    echo $i
+}
+# Clean up: stop the X server, then remove the auth entry and temporary files.
+# Installed as the EXIT trap, so it runs on every exit path after the trap is
+# set.  Exits with status 5 if the temporary directory cannot be removed.
+clean_up() {
+    # Stop Xvfb first, so that a failure in the file cleanup below (which calls
+    # exit, or would abort under "set -e") can never leave an orphaned server.
+    # "|| :" because the server may already have gone away.
+    if [ -n "$XVFBPID" ]; then
+        kill "$XVFBPID" >>"$ERRORFILE" 2>&1 || :
+    fi
+    if [ -e "$AUTHFILE" ]; then
+        # Best effort: a failure to remove the cookie must not stop us from
+        # removing the temporary directory.
+        XAUTHORITY=$AUTHFILE xauth remove ":$SERVERNUM" >>"$ERRORFILE" 2>&1 || :
+    fi
+    if [ -n "$XVFB_RUN_TMPDIR" ]; then
+        if ! rm -r "$XVFB_RUN_TMPDIR"; then
+            error "problem while cleaning up temporary directory"
+            exit 5
+        fi
+    fi
+}
+# Parse the command line.
+ARGS=$(getopt --options +ae:f:hn:lp:s:w: \
+       --long auto-servernum,error-file:,auth-file:,help,server-num:,listen-tcp,xauth-protocol:,server-args:,wait: \
+       --name "$PROGNAME" -- "$@")
+GETOPT_STATUS=$?
+if [ $GETOPT_STATUS -ne 0 ]; then
+    error "internal error; getopt exited with status $GETOPT_STATUS"
+    exit 6
+fi
+eval set -- "$ARGS"
+while :; do
+    case "$1" in
+        -a|--auto-servernum) SERVERNUM=$(find_free_servernum); AUTONUM="yes" ;;
+        -e|--error-file) ERRORFILE="$2"; shift ;;
+        -f|--auth-file) AUTHFILE="$2"; shift ;;
+        -h|--help) SHOWHELP="yes" ;;
+        -n|--server-num) SERVERNUM="$2"; shift ;;
+        -l|--listen-tcp) LISTENTCP="" ;;
+        -p|--xauth-protocol) XAUTHPROTO="$2"; shift ;;
+        -s|--server-args) XVFBARGS="$2"; shift ;;
+        -w|--wait) shift ;;
+        --) shift; break ;;
+        *) error "internal error; getopt permitted \"$1\" unexpectedly"
+           exit 6
+           ;;
+    esac
+    shift
+done
+if [ "$SHOWHELP" ]; then
+    usage
+    exit 0
+fi
+if [ -z "$*" ]; then
+    usage "need a command to run" >&2
+    exit 2
+fi
+# Use the POSIX "command -v" builtin rather than the external "which" utility:
+# "which" is not guaranteed to be installed, and if it is missing this test
+# would wrongly report that xauth is the missing program.
+if ! command -v xauth >/dev/null 2>&1; then
+    error "xauth command not found"
+    exit 3
+fi
+# tidy up after ourselves
+trap clean_up EXIT
+# If the user did not specify an X authorization file to use, set up a temporary
+# directory to house one.
+if [ -z "$AUTHFILE" ]; then
+    XVFB_RUN_TMPDIR="$(mktemp -d -t $PROGNAME.XXXXXX)"
+    # Create empty file to avoid xauth warning.  This used to call the
+    # debianutils "tempfile -n" program, which is deprecated and missing on many
+    # systems (the script then died here under "set -e").  Since the directory
+    # was just created by mktemp -d, a fixed file name inside it is fine; the
+    # umask keeps the cookie file private to the current user.
+    AUTHFILE="$XVFB_RUN_TMPDIR/Xauthority"
+    (umask 077 && : >"$AUTHFILE")
+fi
+# Start Xvfb.
+MCOOKIE=$(mcookie)
+tries=10
+while [ $tries -gt 0 ]; do
+    tries=$(( $tries - 1 ))
+    XAUTHORITY=$AUTHFILE xauth source - << EOF >>"$ERRORFILE" 2>&1
+add :$SERVERNUM $XAUTHPROTO $MCOOKIE
+EOF
+    # handle SIGUSR1 so Xvfb knows to send a signal when it's ready to accept
+    # connections
+    trap : USR1
+    (trap '' USR1; exec Xvfb ":$SERVERNUM" $XVFBARGS $LISTENTCP -auth $AUTHFILE >>"$ERRORFILE" 2>&1) &
+    XVFBPID=$!
+    wait || :
+    if kill -0 $XVFBPID 2>/dev/null; then
+        break
+    elif [ -n "$AUTONUM" ]; then
+        # The display is in use so try another one (if '-a' was specified).
+        SERVERNUM=$((SERVERNUM + 1))
+        SERVERNUM=$(find_free_servernum)
+        continue
+    fi
+    error "Xvfb failed to start" >&2
+    XVFBPID=
+    exit 1
+done
+# Start the command and save its exit status.
+set +e
+DISPLAY=:$SERVERNUM XAUTHORITY=$AUTHFILE "$@"
+RETVAL=$?
+set -e
+# Return the executed command's exit status.
+exit $RETVAL
+# vim:set ai et sts=4 sw=4 tw=80:

partis_bcr-1.0.1.data/scripts/cf-alleles.py ADDED Viewed

@@ -0,0 +1,97 @@
+#!python
+from __future__ import absolute_import, division, unicode_literals
+from __future__ import print_function
+import argparse
+import os
+import sys
+from pathlib import Path
+partis_dir = str(Path(__file__).parent.parent)
+if not os.path.exists(partis_dir):
+    print('WARNING current script dir %s doesn\'t exist, so python path may not be correctly set' % partis_dir)
+sys.path.insert(1, partis_dir) # + '/python')
+import python.utils as utils
+import python.glutils as glutils
+parser = argparse.ArgumentParser()
+parser.add_argument('--bases', required=True, help='colon-separated list of the bits before the stars, e.g. 1-18:2-2 (set to \'all\' to print entire germline set)')
+parser.add_argument('--allele-numbers')
+parser.add_argument('--ref-allele', help='print this one first')
+parser.add_argument('--other-genes')
+parser.add_argument('--region', default='v')
+parser.add_argument('--locus', default='igh', choices=utils.loci)
+parser.add_argument('--species', default='human')
+parser.add_argument('--glfo-dir', help='default set below')
+args = parser.parse_args()
+if args.glfo_dir is None:
+    args.glfo_dir = 'data/germlines/' + args.species
+glfo = glutils.read_glfo(args.glfo_dir, args.locus)
+# ----------------------------------------------------------------------------------------
+def get_base(gene):
+    """Return the 'base' of <gene>: its primary version, followed by '-<sub version>' if it has a sub version."""
+    subv = utils.sub_version(gene)
+    if subv is None:  # no sub version, so the base is just the primary version
+        return utils.primary_version(gene)
+    return utils.primary_version(gene) + '-' + subv
+# ----------------------------------------------------------------------------------------
+def get_genes(base, alleles=None):
+    """Return full gene names (<LOCUS><REGION><base>*<allele>) for <base>, one per entry in <alleles>.
+
+    If <alleles> is None, use the allele of every gene in glfo (for args.region) whose base equals <base>.
+    """
+    if alleles is None:  # take all of 'em
+        alleles = []
+        for gname in glfo['seqs'][args.region]:
+            if get_base(gname) == base:
+                alleles.append(utils.allele(gname))
+    prefix = args.locus.upper() + args.region.upper() + base + '*'
+    return [prefix + al for al in alleles]
+if args.bases == 'all':
+    input_groupfcn = None  # lambda g: str(utils.primary_version(g) in ['4', '5'])  # this example puts all the 4 and 5 primary versions in one group, and everybody else in another
+    glutils.print_glfo(glfo, only_region=(args.region if args.region != 'v' else None), input_groupfcn=input_groupfcn)  # not much point in doing only v, since it's the one that takes most of the time
+    sys.exit(0)
+args.bases = utils.get_arg_list(args.bases)
+args.allele_numbers = utils.get_arg_list(args.allele_numbers)
+genes = [g for base in args.bases for g in get_genes(base, args.allele_numbers)]
+if len(genes) == 0:
+    raise Exception('couldn\'t find any genes for the specified --bases %s\n  choices:\n    %s' % (' '.join(args.bases), ' '.join(sorted(set([get_base(g) for g in glfo['seqs'][args.region]])))))
+args.other_genes = utils.get_arg_list(args.other_genes)
+if args.other_genes is not None:
+    genes += args.other_genes
+seqstrs = ['' for _ in range(len(genes))]
+snpstrs = ['' for _ in range(len(genes))]
+gene_str_width = max([utils.len_excluding_colors(utils.color_gene(g)) for g in genes])
+codon_positions = glfo[utils.conserved_codons[args.locus][args.region] + '-positions'] if args.region != 'd' else None
+max_seq_len = max([len(glfo['seqs'][args.region][g]) for g in genes])
+ref_gene = genes[0] if args.ref_allele is None else utils.rejoin_gene(args.locus, args.region, utils.primary_version(genes[0]), utils.sub_version(genes[0]), args.ref_allele)
+if ref_gene != genes[0]:
+    genes.remove(ref_gene)
+    genes.insert(0, ref_gene)
+# Fill <seqstrs> and <snpstrs> for each gene by comparing it to the reference gene's sequence.
+ref_seq = glfo['seqs'][args.region][ref_gene]
+# NOTE <codon_positions> is None for d (set above), so in that case skip codon alignment and emphasis rather than crashing on the None lookup
+ref_pos = None if codon_positions is None else codon_positions[ref_gene]
+for igene, gene in enumerate(genes):
+    seq = glfo['seqs'][args.region][gene]
+    emph_positions = None
+    if codon_positions is not None:
+        pos = codon_positions[gene]
+        if pos < ref_pos:  # align the codon position in the case that this seq is shorter up to the codon
+            seq = (ref_pos - pos) * '-' + seq
+            pos = ref_pos
+        emph_positions = [pos + i for i in range(3)]  # the three bases of the conserved codon
+    # no right-padding needed here, since color_mutants() is called with align=True
+    colored_seq, isnps = utils.color_mutants(ref_seq, seq, return_isnps=True, emphasis_positions=emph_positions, align=True)
+    seqstrs[igene] += colored_seq
+    if len(isnps) > 0:  # number of snps, then their positions
+        snpstrs[igene] = '%2d (%s)' % (len(isnps), ' '.join([str(i) for i in isnps]))
+# ----------------------------------------------------------------------------------------
+def print_str(gene, seqstr, snpstr):
+    """Return one output line: colored gene name, sequence string, gene name again, then snp string."""
+    gene_label = utils.color_gene(gene, width=gene_str_width)
+    return '%s  %s  %s  %s' % (gene_label, seqstr, gene_label, snpstr)
+# print one line per gene (the three lists are parallel and have the same length)
+for gene, seqstr, snpstr in zip(genes, seqstrs, snpstrs):
+    print(print_str(gene, seqstr, snpstr))

partis_bcr-1.0.1.data/scripts/cf-germlines.py ADDED Viewed

@@ -0,0 +1,57 @@
+#!python
+from __future__ import absolute_import, division, unicode_literals
+from __future__ import print_function
+import argparse
+import sys
+import os
+import copy
+import collections
+import colored_traceback.always
+from pathlib import Path
+partis_dir = str(Path(__file__).parent.parent)
+if not os.path.exists(partis_dir):
+    print('WARNING current script dir %s doesn\'t exist, so python path may not be correctly set' % partis_dir)
+sys.path.insert(1, partis_dir) # + '/python')
+import python.utils as utils
+import python.glutils as glutils
+parser = argparse.ArgumentParser()
+parser.add_argument('gldir1')
+parser.add_argument('gldir2')
+parser.add_argument('--names', default='+gl-1:+gl-2', help='colon-separated list of length 2 with labels for gldir1 and gldir2, which will be appended to each gene name in the ascii output')
+parser.add_argument('--locus', default='igh')
+args = parser.parse_args()
+args.names = utils.get_arg_list(args.names)
+# ----------------------------------------------------------------------------------------
+def clrname(name):
+    """Return <name> wrapped by utils.color() with color 'blue' (used for the germline set labels)."""
+    blue_name = utils.color('blue', name)
+    return blue_name
+# ----------------------------------------------------------------------------------------
+# Read both germline sets, then for each region print which sequences they share and which are unique to each.
+glfos = []
+for name, gldir in zip(args.names, [args.gldir1, args.gldir2]):
+    print('%s:' % clrname(name))
+    glfos.append(glutils.read_glfo(gldir, args.locus, debug=True))
+# only loops over regions that are present in the first germline set
+for region in [r for r in utils.regions if r in glfos[0]['seqs']]:
+    # NOTE if two genes within one set have the same sequence, only the last one's name is kept here
+    aseqs, bseqs = [{s : n for n, s in g['seqs'][region].items()} for g in glfos]  # dict of names keyed by seqs
+    a_only_seqs, b_only_seqs = set(aseqs) - set(bseqs), set(bseqs) - set(aseqs)
+    print('%s' % utils.color('green', region))
+    common_seqs = set(aseqs) & set(bseqs)
+    # sequences in both sets that also have the same name in both
+    common_name_seqs = [aseqs[s] for s in common_seqs if aseqs[s]==bseqs[s]]
+    print('    %3d seqs in common with same name: %s' % (len(common_name_seqs), utils.color_genes(sorted(common_name_seqs))))
+    # sequences in both sets, but with a different name in each: list of (name in first set, name in second set)
+    dnamed_seqs = [(aseqs[s], bseqs[s]) for s in common_seqs if aseqs[s] != bseqs[s]]
+    if len(dnamed_seqs) > 0:
+        print('      %s %d common seq%s with different names: %s' % (utils.wrnstr(), len(dnamed_seqs), utils.plural(len(dnamed_seqs)), ',  '.join(utils.color_genes([an,bn]) for an, bn in dnamed_seqs)))
+    print('    only in:\n      %12s: %3d  %s\n      %12s: %3d  %s' % (clrname(args.names[0]), len(a_only_seqs), utils.color_genes(sorted(aseqs[s] for s in a_only_seqs)),
+                                                                      clrname(args.names[1]), len(b_only_seqs), utils.color_genes(sorted(bseqs[s] for s in b_only_seqs))))
+    # NOTE(review): <tmpfo> is not used anywhere in the visible remainder of this script
+    tmpfo = glutils.get_empty_glfo(args.locus)  # make a new glfo that will only have non-shared genes
+    # for each set, look up every sequence that's only in that set against the *other* set's glfo (hence the reversed() iterators)
+    for gname, oname, only_seqs, allseqs, ogfo in zip(args.names, reversed(args.names), [a_only_seqs, b_only_seqs], [aseqs, bseqs], reversed(glfos)):  # <only_seqs> is the seqs that're only in <gname>
+        print('  finding nearest seq in %s for %d seqs only in %s' % (clrname(oname), len(only_seqs), clrname(gname)))
+        for oseq in only_seqs:
+            glutils.find_nearest_gene_in_glfo(ogfo, oseq, new_name=allseqs[oseq], region=region, debug=True)

partis_bcr-1.0.1.data/scripts/extract-pairing-info.py ADDED Viewed

@@ -0,0 +1,55 @@
+#!python
+from __future__ import absolute_import, division, unicode_literals
+from __future__ import print_function
+import csv
+import os
+import sys
+from io import open
+csv.field_size_limit(sys.maxsize)  # make sure we can write very large csv fields
+import argparse
+import colored_traceback.always
+import yaml
+import json
+import operator
+import random
+import numpy
+from pathlib import Path
+# if you move this script, you'll need to change this method of getting the imports
+partis_dir = str(Path(__file__).parent.parent)
+sys.path.insert(1, partis_dir) # + '/python')
+import python.utils as utils
+dstr = """
+Extract heavy/light chain pairing info from fasta file <infname> and write it to yaml/json file <outfname>.
+Should have the same effect as setting --guess-pairing-info when running bin/split-loci.py.
+"""
+parser = argparse.ArgumentParser(description=dstr,
+                                 formatter_class=argparse.ArgumentDefaultsHelpFormatter)  # why tf isn't this printing the defaults?
+parser.add_argument('infname')
+parser.add_argument('outfname')
+parser.add_argument('--droplet-id-separators', help=utils.did_help['seps'])
+parser.add_argument('--droplet-id-indices', help=utils.did_help['indices'])
+parser.add_argument('--overwrite', action='store_true')
+parser.add_argument('--for-testing-n-max-queries', type=int, default=-1, help='only for testing, applied when reading initial fasta file, just in case it\'s huge and you want to run quickly without having to read the whole file')
+parser.add_argument('--n-max-queries', type=int, default=-1, help='see partis help (although here it applies to droplets, not individual seqs)')
+parser.add_argument('--n-random-queries', type=int, help='see partis help (although here it applies to droplets, not individual seqs)')
+parser.add_argument('--input-metafname', help='json/yaml file with additional (beyond pairing info) input meta info (see partis help)')
+parser.add_argument('--random-seed', type=int, default=1)
+# Main flow: parse args, read the input fasta, optionally subset it, extract pairing info, and write it to <outfname>.
+args = parser.parse_args()
+# seed both random number generators with --random-seed
+random.seed(args.random_seed)
+numpy.random.seed(args.random_seed)
+args.droplet_id_indices = utils.get_arg_list(args.droplet_id_indices, intify=True)
+# don't redo anything if the output is already there (exits with status 0)
+if utils.output_exists(args, args.outfname, offset=4, debug=False):
+    print('  extract-pairing-info.py output exists and --overwrite was not set, so not doing anything: %s' % args.outfname)
+    sys.exit(0)
+# NOTE the cap applied while reading is --for-testing-n-max-queries, whereas --n-max-queries and --n-random-queries are applied afterwards by subset_paired_queries()
+seqfos = utils.read_fastx(args.infname, n_max_queries=args.for_testing_n_max_queries)
+if args.n_max_queries != -1 or args.n_random_queries is not None:
+    seqfos = utils.subset_paired_queries(seqfos, args.droplet_id_separators, args.droplet_id_indices, n_max_queries=args.n_max_queries, n_random_queries=args.n_random_queries)
+metafos = utils.extract_pairing_info(seqfos, droplet_id_separators=args.droplet_id_separators, droplet_id_indices=args.droplet_id_indices, input_metafname=args.input_metafname)
+# make sure the output file's directory exists, then write the meta info
+utils.mkdir(args.outfname, isfile=True)
+utils.jsdump(args.outfname, metafos)

partis_bcr-1.0.1.data/scripts/gctree-run.py ADDED Viewed

@@ -0,0 +1,244 @@
+#!python
+from __future__ import absolute_import, division, unicode_literals
+from __future__ import print_function
+import numpy
+import csv
+import yaml
+import time
+import colored_traceback.always
+import argparse
+import subprocess
+import sys
+import os
+import dendropy
+import json
+from io import open
+import random
+from pathlib import Path
+partis_dir = str(Path(__file__).parent.parent)
+sys.path.insert(1, partis_dir) #'./python')
+import python.utils as utils
+import python.glutils as glutils
+import python.treeutils as treeutils
+# ----------------------------------------------------------------------------------------
+def get_inf_int_name(gname):  # <gname> is just an integer, which won't be unique and will break things
+    """Return a new name for a gctree inferred intermediate: '<args.inf_int_label>-<gname>'."""
+    label_and_name = (args.inf_int_label, gname)
+    return '%s-%s' % label_and_name
+# ----------------------------------------------------------------------------------------
+def gctofn(ft):
+    """Return the path in args.outdir of the gctree/dnapars output file of type <ft> ('tree', 'seqs', or 'dnapars').
+
+    Raises KeyError for any other <ft>.
+    """
+    gctree_fnames = dict(
+        tree='gctree.out.inference.1.nk',
+        seqs='gctree.out.inference.1.fasta',
+        dnapars='outfile',
+    )
+    basename = gctree_fnames[ft]
+    return '%s/%s' % (args.outdir, basename)
+# ----------------------------------------------------------------------------------------
+def fofn(ft):
+    """Return the path of this script's final output file of type <ft>: args.outdir/tree.nwk for 'tree', args.outdir/inferred-seqs.fa for 'seqs'."""
+    assert ft in ['tree', 'seqs']
+    if ft == 'tree':
+        stem, suffix = ft, '.nwk'
+    else:
+        stem, suffix = 'inferred-%s' % ft, '.fa'
+    return '%s/%s%s' % (args.outdir, stem, suffix)
+# ----------------------------------------------------------------------------------------
+def idfn():
+    """Return the (directory-less) name of the id map file that maps gctree's names to the original sequence names."""
+    idmap_basename = 'idmap.txt'
+    return idmap_basename
+# ----------------------------------------------------------------------------------------
+def install():
+    """Write and run a bash script that creates the micromamba env args.env_label and installs phylip, click and gctree into it.
+
+    With --no-dag, gctree is instead installed with pip, pinned to v3.3.0.
+    """
+    gctree_pkg = '' if args.no_dag else ' gctree'  # with --no-dag, gctree comes from pip (below) rather than conda-forge
+    script_lines = ['#!/bin/bash']
+    script_lines.extend(utils.mamba_cmds(args.env_label, only_prep=True))
+    script_lines.extend([
+        'micromamba create -y -n %s -c conda-forge python=3.9' % args.env_label,  # 3.10 currently has problems with ete
+        'micromamba activate %s' % args.env_label,
+        'micromamba install -y -c bioconda -c conda-forge phylip',
+        'micromamba install -y -c conda-forge%s click' % gctree_pkg,
+    ])
+    if args.no_dag:
+        script_lines.append('pip install gctree==3.3.0')  # I think having --user makes it install in ~/.local (outside mamba env)
+    # micromamba remove -n gctree --all  # to nuke it and start over
+    utils.simplerun('\n'.join(script_lines) + '\n', cmdfname='/tmp/tmprun.sh', debug=True)
+# ----------------------------------------------------------------------------------------
+def update():
+    """Write and run a bash script that updates phylip, gctree and click in the micromamba env args.env_label."""
+    # NOTE(review): unlike the commands in install(), this one doesn't pass -y -- confirm it doesn't need interactive confirmation
+    script_lines = ['#!/bin/bash']
+    script_lines.extend(utils.mamba_cmds(args.env_label))
+    script_lines.append('micromamba update phylip gctree click')
+    script_text = '\n'.join(script_lines) + '\n'
+    utils.simplerun(script_text, cmdfname='/tmp/tmprun.sh', debug=True)
+# ----------------------------------------------------------------------------------------
+def add_mfo(tcmd, mfn):
+    """Return gctree command <tcmd> with the frame/chain split options from json meta file <mfn> appended.
+
+    Each key from <kdict> that is present in the file is added as ' --<gctree arg> <value>' (value formatted as an integer).
+    Keys in the file that aren't in <kdict> are ignored.
+    """
+    kdict = {'frame' : 'frame', 'h_frame' : 'frame', 'l_frame' : 'frame2', 'l_offset' : 'chain_split'}  # translates from metafo dict to gctree command line args
+    with open(mfn) as mfile:  # NOTE this used to ignore <mfn> and always read args.metafname
+        metafo = json.load(mfile)
+    for tk, tc in kdict.items():
+        if tk in metafo:
+            tcmd += ' --%s %d' % (tc, metafo[tk])
+    return tcmd
+# ----------------------------------------------------------------------------------------
+def run_gctree():
+    """Write a bash script with the dnapars + gctree commands to args.outdir/run.sh and run it with utils.simplerun().
+
+    Returns without doing anything if the expected output already exists (see utils.output_exists()).
+    If --run-help is set, only runs 'gctree infer -h' and then exits.
+    Raises Exception if --infname is unset or doesn't exist.
+    """
+    # ----------------------------------------------------------------------------------------
+    def get_gctree_cmd():  # returns the 'gctree infer' command line (run under xvfb-run), with optional args added according to <args>
+        tcmd = '%s/bin/xvfb-run -a gctree infer outfile abundances.csv --root %s --verbose --idlabel' % (utils.get_partis_dir(), args.root_label)  # --idlabel writes the output fasta file
+        if not args.base_model and not args.no_dag:
+            tcmd += ' --mutability %s/HS5F_Mutability.csv --substitution %s/HS5F_Substitution.csv' % (args.data_dir, args.data_dir)
+        if args.ranking_coeffs is not None:
+            tcmd += ' --ranking_coeffs %s' % ' '.join(args.ranking_coeffs)
+        if args.branching_process_ranking_coeff is not None:
+            tcmd += ' --branching_process_ranking_coeff %d' % args.branching_process_ranking_coeff
+        if args.metafname is not None and os.path.exists(args.metafname):  # --metafname is optional, and os.path.exists() raises TypeError if passed None
+            tcmd = add_mfo(tcmd, args.metafname)
+        return tcmd
+    # ----------------------------------------------------------------------------------------
+    def get_cmds():  # returns list of lines for the bash script (and may also remove stale zero-length dnapars output from args.outdir)
+        cmds = ['#!/bin/bash']
+        cmds += utils.mamba_cmds(args.env_label)
+        if args.run_help:
+            cmds += ['gctree infer -h']
+            return cmds
+        if args.infname is None or not os.path.exists(args.infname):  # check for None first, since os.path.exists() raises TypeError if passed None
+            raise Exception('--infname %s doesn\'t exist' % args.infname)
+        cmds += ['cd %s' % args.outdir]
+        if args.input_forest_dir is None:
+            ofn = '%s/outfile' % args.outdir  # dnapars output file (this is what takes the longest to make)
+            if os.path.exists(ofn) and os.stat(ofn).st_size > 0:
+                print('    dnapars output already exists, not rerunning: %s' % ofn)
+            else:
+                if os.path.exists(ofn) and os.stat(ofn).st_size == 0:
+                    print('    removing zero length dnapars output %s' % ofn)
+                    utils.prep_dir(args.outdir, wildlings=['outfile', 'outtree'], allow_other_files=True)  # phylip barfs like a mfer if its outputs exist (probably you'll get a KeyError 'naive')
+                cmds += ['deduplicate %s --root %s --abundance_file abundances.csv --idmapfile %s > deduplicated.phylip' % (args.infname, args.root_label, idfn())]
+                cmds += ['mkconfig deduplicated.phylip dnapars > dnapars.cfg']
+                cmds += ['dnapars < dnapars.cfg > dnapars.log']  # NOTE if things fail, look in dnapars.log (but it's super verbose so we can't print it to std out by default)
+        else:
+            print('    --input-forest-dir: copying abundance, idmap, and forest files from %s' % args.input_forest_dir)
+            cmds += ['cp %s/{abundances.csv,%s,outfile} %s/' % (args.input_forest_dir, idfn(), args.outdir)]
+        if not args.only_write_forest:
+            cmds.append(get_gctree_cmd())
+        return cmds
+    # ----------------------------------------------------------------------------------------
+    # with --only-write-forest the final output is the dnapars file, otherwise it's the gctree tree
+    if not args.run_help and utils.output_exists(args, gctofn('dnapars' if args.only_write_forest else 'tree')):
+        return
+    cmds = get_cmds()  # also preps dir + other stuff
+    utils.simplerun('\n'.join(cmds) + '\n', cmdfname=args.outdir + '/run.sh', print_time='gctree', debug=True, dryrun=args.dry_run)
+    if args.run_help:
+        sys.exit()
+# ----------------------------------------------------------------------------------------
+def parse_output():
+    """Convert gctree's output files in args.outdir into this script's final tree and fasta files (see fofn()).
+
+    Reads the id map, gctree's fasta and gctree's tree, renames inferred intermediates with get_inf_int_name(),
+    restores the original names of input sequences (adding duplicates as zero-length children), rescales branch
+    lengths by 1 / mean sequence length, and optionally fixes multifurcations.
+    Returns without doing anything if the output fasta already exists (see utils.output_exists()).
+    Raises Exception if a name from the id map isn't found in the tree.
+    """
+    if utils.output_exists(args, fofn('seqs')):
+        return
+    # read translations (this only includes input sequences, not inferred intermediates)
+    idm_trns = {}
+    with open('%s/%s' % (args.outdir, idfn())) as idfile:  # use idfn() so this stays in sync with the file name passed to gctree's deduplicate step
+        reader = csv.DictReader(idfile, fieldnames=('name', 'orig_names'))
+        for line in reader:
+            if line['orig_names'] == '':
+                continue
+            idm_trns[line['name']] = line['orig_names'].split(':')
+    # read fasta (mostly for inferred intermediate seqs)
+    seqfos = utils.read_fastx(gctofn('seqs'), look_for_tuples=True)
+    print('    read %d seqs from gctree output fasta' % len(seqfos))
+    n_before = len(seqfos)
+    seqfos = [s for s in seqfos if s['name']!='']
+    n_removed = n_before - len(seqfos)
+    if n_removed > 0:
+        print('  %s removed %d seqs with zero-length names \'\' (I\'m *not* sure this is the right thing to do, but it just kicked this error when I was doing the python 3 conversion)' % (utils.wrnstr(), n_removed))
+    nfos = [s for s in seqfos if s['name']==args.root_label]
+    if len(nfos) != 1:
+        print('  %s expected 1 naive seq with label \'%s\' but found %d: %s  (in %s)' % (utils.wrnstr(), args.root_label, len(nfos), ' '.join(n['name'] for n in nfos), gctofn('seqs')))
+    seqfos = [s for s in seqfos if s['name'] != args.root_label]  # don't want naive seq in final fasta
+    seq_len = numpy.mean([len(s['seq']) for s in seqfos])  # NOTE computed before input seqs are removed below, so it includes them
+    if not args.expand_all_nodes:  # also remove input seqs (well, gctree's new names for input seqs), unless we're expanding all nodes, in which case we need the gctree-named-nodes as fake new internal nodes
+        seqfos = [s for s in seqfos if s['name'] not in idm_trns]
+    if len(seqfos) == 0:
+        print('  %s no inferred sequences (all seqs read from gctree output were input seqs)' % utils.wrnstr())
+    inf_int_trns = []  # list of (gctree name, new name) pairs for the inferred intermediates
+    for sfo in seqfos:
+        inf_int_trns.append((sfo['name'], get_inf_int_name(sfo['name'])))
+        sfo['name'] = get_inf_int_name(sfo['name'])
+    # read tree
+    dtree = treeutils.get_dendro_tree(treefname=gctofn('tree'), debug=args.debug)
+    dtree.scale_edges(1. / seq_len)
+    dtree.seed_node.taxon.label = args.root_label
+    ndict = {n.taxon.label : n for n in dtree.preorder_node_iter() if n.taxon is not None}  # nodes without a taxon have no label to look up
+    for gname, onames in idm_trns.items():
+        node = ndict.get(gname)  # NOTE needs to be .get(), since plain indexing raises KeyError before we get to the more informative exception below
+        if node is None:
+            raise Exception('couldn\'t find node with name \'%s\' in tree from gctree in %s' % (gname, gctofn('tree')))
+        if args.debug and len(onames) > 1:
+            print('    abundance > 1 for %s: %d (%s)' % (gname, len(onames), ' '.join(onames)))
+        for onm in onames:
+            if node.taxon.label == gname and not args.expand_all_nodes:  # the first original name replaces gctree's name on the node itself
+                node.taxon.label = onm
+                if args.debug and len(onames) > 1:
+                    print('        setting node to %s' % onm)
+                continue
+            treeutils.add_zero_length_child(node, dtree, child_name=onm)  # add duplicates as children with zero-length edges
+            if args.debug and len(onames) > 1:
+                print('        adding child node %s' % onm)
+    treeutils.translate_labels(dtree, inf_int_trns, expect_missing=True, debug=args.debug)
+    if args.fix_multifurcations:
+        input_seqfos = utils.read_fastx(args.infname)
+        dtree, new_seqfos = treeutils.get_binary_tree(dtree, nfos + input_seqfos + seqfos, debug=args.debug)
+        seqfos += new_seqfos
+    if args.debug:
+        print('    final tree:')
+        print(treeutils.get_ascii_tree(dendro_tree=dtree, extra_str='      ', width=350))
+    with open(fofn('tree'), 'w') as ofile:
+        ofile.write('%s\n' % treeutils.as_str(dtree))
+    utils.write_fasta(fofn('seqs'), nfos + seqfos)
+# ----------------------------------------------------------------------------------------
+ustr = """
+Run gctree tree inference on sequences from fasta input file <--infname>.
+Output trees and sequences are written to <--outdir> as inferred-seqs.fa and tree.nwk (gctree output files are also there, but they don't have any postprocessing e.g. fixing names and/or multifurcations.
+  gctree-run.py --infname <fasta> --outdir <outdir>
+"""
+parser = argparse.ArgumentParser(usage=ustr)
+parser.add_argument('--actions', default='run:parse')
+parser.add_argument('--infname')
+parser.add_argument('--metafname', help='if you need --frame (v region doesn\'t start at first position) or --chain_split and --frame2 (heavy/light chain smooshed together), pass the info in json format with this arg (see code above for format).')
+parser.add_argument('--outdir')
+parser.add_argument('--only-write-forest', action='store_true', help='only run preparatory steps for gctree, i.e. up through dnapars, to write parsimony forest')
+parser.add_argument('--input-forest-dir', help='If set, skips preparatory steps (see --only-write-forest), and looks for \'abundance.csv\' and parsimony forest file (\'outfile\') in the specified dir')
+parser.add_argument('--overwrite', action='store_true')
+parser.add_argument('--base-model', action='store_true', help='By default, we pass gctree info for the s5f mutation model; if this is set, we don\'t, and it instead use the base model.')
+parser.add_argument('--no-dag', action='store_true', help='If set, use old v1 non-DAG gctree version (v3.3.0). Note that this uses a different env (see --env-label)')
+parser.add_argument('--ranking-coeffs', nargs='+', help='see gctree help')
+parser.add_argument('--branching-process-ranking-coeff', type=int, help='see gctree help')
+parser.add_argument('--env-label', default='gctree')
+parser.add_argument('--root-label', default='naive')
+parser.add_argument('--data-dir', default='%s/data/s5f'%utils.get_partis_dir())
+parser.add_argument('--inf-int-label', default='inf', help='base name for inferred intermediate seqs (numerical name is appended with -')
+parser.add_argument('--expand-all-nodes', action='store_true', help='Gctree collapses duplicate observed seqs into nodes with new names and abundance N > 1. By default, we expand these such that the node is named for one of the observed seqs, and add N-1 (zero-length) children. If this arg is set, however, we leave the node and add N (zero-length) children.')
+parser.add_argument('--run-help', action='store_true', help='run gctree help')
+parser.add_argument('--debug', action='store_true')
+parser.add_argument('--dry-run', action='store_true')
+parser.add_argument('--random-seed', type=int, default=0)
+parser.add_argument('--fix-multifurcations', action='store_true', help='resolves multifurcations (by adding zero length intermediates) and move input seqs that have been extend unifurcations onto zero length branches')
+args = parser.parse_args()
+random.seed(args.random_seed)
+numpy.random.seed(args.random_seed)
+if args.only_write_forest and args.input_forest_dir:
+    raise Exception('doesn\'t make sense to specify both')
+args.actions = utils.get_arg_list(args.actions, choices=['install', 'update', 'run', 'parse'])
+args.infname = utils.fpath(args.infname)
+args.outdir = utils.fpath(args.outdir)
+if args.no_dag:
+    assert not args.base_model and args.branching_process_ranking_coeff is None and args.ranking_coeffs is None
+    args.env_label = 'gctree-no-dag'
+if 'install' in args.actions:
+    install()
+if 'update' in args.actions:
+    update()
+if 'run' in args.actions:
+    run_gctree()
+if 'parse' in args.actions:
+    parse_output()

partis-bcr 1.0.0__py3-none-any.whl → 1.0.1__py3-none-any.whl

partis-bcr 1.0.0__py3-none-any.whl → 1.0.1__py3-none-any.whl