partis-bcr 1.0.0__py3-none-any.whl → 1.0.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (104)
  1. bin/FastTree +0 -0
  2. bin/add-chimeras.py +59 -0
  3. bin/add-seqs-to-outputs.py +81 -0
  4. bin/bcr-phylo-run.py +799 -0
  5. bin/build.sh +24 -0
  6. bin/cf-alleles.py +97 -0
  7. bin/cf-germlines.py +57 -0
  8. bin/cf-linearham.py +199 -0
  9. bin/chimera-plot.py +76 -0
  10. bin/choose-partially-paired.py +143 -0
  11. bin/circle-plots.py +30 -0
  12. bin/compare-plotdirs.py +298 -0
  13. bin/diff-parameters.py +133 -0
  14. bin/docker-hub-push.sh +6 -0
  15. bin/extract-pairing-info.py +55 -0
  16. bin/gcdyn-simu-run.py +223 -0
  17. bin/gctree-run.py +244 -0
  18. bin/get-naive-probabilities.py +126 -0
  19. bin/iqtree-1.6.12 +0 -0
  20. bin/lonr.r +1020 -0
  21. bin/makeHtml +52 -0
  22. bin/mds-run.py +46 -0
  23. bin/parse-output.py +277 -0
  24. bin/partis +1869 -0
  25. bin/partis-pip +116 -0
  26. bin/partis.py +1869 -0
  27. bin/plot-gl-set-trees.py +519 -0
  28. bin/plot-hmms.py +151 -0
  29. bin/plot-lb-tree.py +427 -0
  30. bin/raxml-ng +0 -0
  31. bin/read-bcr-phylo-trees.py +38 -0
  32. bin/read-gctree-output.py +166 -0
  33. bin/run-chimeras.sh +64 -0
  34. bin/run-dtr-scan.sh +25 -0
  35. bin/run-paired-loci.sh +100 -0
  36. bin/run-tree-metrics.sh +88 -0
  37. bin/smetric-run.py +62 -0
  38. bin/split-loci.py +317 -0
  39. bin/swarm-2.1.13-linux-x86_64 +0 -0
  40. bin/test-germline-inference.py +425 -0
  41. bin/tree-perf-run.py +194 -0
  42. bin/vsearch-2.4.3-linux-x86_64 +0 -0
  43. bin/vsearch-2.4.3-macos-x86_64 +0 -0
  44. bin/xvfb-run +194 -0
  45. partis_bcr-1.0.2.data/scripts/cf-alleles.py +97 -0
  46. partis_bcr-1.0.2.data/scripts/cf-germlines.py +57 -0
  47. partis_bcr-1.0.2.data/scripts/extract-pairing-info.py +55 -0
  48. partis_bcr-1.0.2.data/scripts/gctree-run.py +244 -0
  49. partis_bcr-1.0.2.data/scripts/parse-output.py +277 -0
  50. partis_bcr-1.0.2.data/scripts/split-loci.py +317 -0
  51. partis_bcr-1.0.2.data/scripts/test.py +1005 -0
  52. {partis_bcr-1.0.0.dist-info → partis_bcr-1.0.2.dist-info}/METADATA +1 -1
  53. {partis_bcr-1.0.0.dist-info → partis_bcr-1.0.2.dist-info}/RECORD +101 -51
  54. partis_bcr-1.0.2.dist-info/top_level.txt +1 -0
  55. {partis → python}/glutils.py +1 -1
  56. python/main.py +30 -0
  57. {partis → python}/plotting.py +10 -1
  58. {partis → python}/treeutils.py +18 -16
  59. {partis → python}/utils.py +14 -7
  60. packages/ham/bcrham +0 -0
  61. partis/main.py +0 -59
  62. partis_bcr-1.0.0.dist-info/top_level.txt +0 -1
  63. {partis_bcr-1.0.0.dist-info → partis_bcr-1.0.2.dist-info}/WHEEL +0 -0
  64. {partis_bcr-1.0.0.dist-info → partis_bcr-1.0.2.dist-info}/entry_points.txt +0 -0
  65. {partis_bcr-1.0.0.dist-info → partis_bcr-1.0.2.dist-info}/licenses/COPYING +0 -0
  66. {partis → python}/__init__.py +0 -0
  67. {partis → python}/alleleclusterer.py +0 -0
  68. {partis → python}/allelefinder.py +0 -0
  69. {partis → python}/alleleremover.py +0 -0
  70. {partis → python}/annotationclustering.py +0 -0
  71. {partis → python}/baseutils.py +0 -0
  72. {partis → python}/cache/__init__.py +0 -0
  73. {partis → python}/cache/cached_uncertainties.py +0 -0
  74. {partis → python}/clusterpath.py +0 -0
  75. {partis → python}/coar.py +0 -0
  76. {partis → python}/corrcounter.py +0 -0
  77. {partis → python}/datautils.py +0 -0
  78. {partis → python}/event.py +0 -0
  79. {partis → python}/fraction_uncertainty.py +0 -0
  80. {partis → python}/gex.py +0 -0
  81. {partis → python}/glomerator.py +0 -0
  82. {partis → python}/hist.py +0 -0
  83. {partis → python}/hmmwriter.py +0 -0
  84. {partis → python}/hutils.py +0 -0
  85. {partis → python}/indelutils.py +0 -0
  86. {partis → python}/lbplotting.py +0 -0
  87. {partis → python}/mds.py +0 -0
  88. {partis → python}/mutefreqer.py +0 -0
  89. {partis → python}/paircluster.py +0 -0
  90. {partis → python}/parametercounter.py +0 -0
  91. {partis → python}/paramutils.py +0 -0
  92. {partis → python}/partitiondriver.py +0 -0
  93. {partis → python}/partitionplotter.py +0 -0
  94. {partis → python}/performanceplotter.py +0 -0
  95. {partis → python}/plotconfig.py +0 -0
  96. {partis → python}/processargs.py +0 -0
  97. {partis → python}/prutils.py +0 -0
  98. {partis → python}/recombinator.py +0 -0
  99. {partis → python}/scanplot.py +0 -0
  100. {partis → python}/seqfileopener.py +0 -0
  101. {partis → python}/treegenerator.py +0 -0
  102. {partis → python}/viterbicluster.py +0 -0
  103. {partis → python}/vrc01.py +0 -0
  104. {partis → python}/waterer.py +0 -0
bin/build.sh ADDED
@@ -0,0 +1,24 @@
1
+ #!/bin/bash
2
+
3
+ echo -e "\n--> running $0"
4
+ set -eu
5
+
6
+ SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
7
+ basedir=`dirname $SCRIPT_DIR` # go up one level
8
+
9
+ echo -e "\n--> building ig-sw"
10
+ cd $basedir/packages/ig-sw/src/ig_align/ && scons
11
+ cd $basedir/
12
+
13
+ echo -e "\n--> building ham"
14
+ cd $basedir/packages/ham/ && scons bcrham
15
+ cd $basedir/
16
+
17
+ if [ "$*" == "with-simulation" ]; then
18
+ echo -e "\n--> building bpp-newlik (only used for simulation)"
19
+ cd $basedir/packages/bpp-newlik/ && ./install.sh # the bpp-phyl step is really really incredibly slow
20
+ cd $basedir/
21
+ fi
22
+
23
+ echo -e "\n--> test"
24
+ ./test/test.py --quick
bin/cf-alleles.py ADDED
@@ -0,0 +1,97 @@
1
+ #!/usr/bin/env python3
2
+ from __future__ import absolute_import, division, unicode_literals
3
+ from __future__ import print_function
4
+ import argparse
5
+ import os
6
+ import sys
7
+
8
+ from pathlib import Path
9
+ partis_dir = str(Path(__file__).parent.parent)
10
+ if not os.path.exists(partis_dir):
11
+ print('WARNING current script dir %s doesn\'t exist, so python path may not be correctly set' % partis_dir)
12
+ sys.path.insert(1, partis_dir) # + '/python')
13
+
14
+ import python.utils as utils
15
+ import python.glutils as glutils
16
+
17
+ parser = argparse.ArgumentParser()
18
+ parser.add_argument('--bases', required=True, help='colon-separated list of the bits before the stars, e.g. 1-18:2-2 (set to \'all\' to print entire germline set)')
19
+ parser.add_argument('--allele-numbers')
20
+ parser.add_argument('--ref-allele', help='print this one first')
21
+ parser.add_argument('--other-genes')
22
+ parser.add_argument('--region', default='v')
23
+ parser.add_argument('--locus', default='igh', choices=utils.loci)
24
+ parser.add_argument('--species', default='human')
25
+ parser.add_argument('--glfo-dir', help='default set below')
26
+ args = parser.parse_args()
27
+
28
+ if args.glfo_dir is None:
29
+ args.glfo_dir = 'data/germlines/' + args.species
30
+
31
+ glfo = glutils.read_glfo(args.glfo_dir, args.locus)
32
+
33
+ # ----------------------------------------------------------------------------------------
34
+ def get_base(gene):
35
+ basestr = utils.primary_version(gene)
36
+ if utils.sub_version(gene) is not None:
37
+ basestr += '-' + utils.sub_version(gene)
38
+ return basestr
39
+
40
+ # ----------------------------------------------------------------------------------------
41
+ def get_genes(base, alleles=None):
42
+ if alleles is None: # take all of 'em
43
+ alleles = [utils.allele(g) for g in glfo['seqs'][args.region] if base == get_base(g)]
44
+ return [args.locus.upper() + args.region.upper() + base + '*' + al for al in alleles]
45
+
46
+ if args.bases == 'all':
47
+ input_groupfcn = None # lambda g: str(utils.primary_version(g) in ['4', '5']) # this example puts all the 4 and 5 primary versions in one group, and everybody else in another
48
+ glutils.print_glfo(glfo, only_region=(args.region if args.region != 'v' else None), input_groupfcn=input_groupfcn) # not much point in doing only v, since it's the one that takes most of the time
49
+ sys.exit(0)
50
+
51
+ args.bases = utils.get_arg_list(args.bases)
52
+ args.allele_numbers = utils.get_arg_list(args.allele_numbers)
53
+ genes = [g for base in args.bases for g in get_genes(base, args.allele_numbers)]
54
+ if len(genes) == 0:
55
+ raise Exception('couldn\'t find any genes for the specified --bases %s\n choices:\n %s' % (' '.join(args.bases), ' '.join(sorted(set([get_base(g) for g in glfo['seqs'][args.region]])))))
56
+ args.other_genes = utils.get_arg_list(args.other_genes)
57
+ if args.other_genes is not None:
58
+ genes += args.other_genes
59
+
60
+ seqstrs = ['' for _ in range(len(genes))]
61
+ snpstrs = ['' for _ in range(len(genes))]
62
+
63
+ gene_str_width = max([utils.len_excluding_colors(utils.color_gene(g)) for g in genes])
64
+ codon_positions = glfo[utils.conserved_codons[args.locus][args.region] + '-positions'] if args.region != 'd' else None
65
+ max_seq_len = max([len(glfo['seqs'][args.region][g]) for g in genes])
66
+
67
+ ref_gene = genes[0] if args.ref_allele is None else utils.rejoin_gene(args.locus, args.region, utils.primary_version(genes[0]), utils.sub_version(genes[0]), args.ref_allele)
68
+ if ref_gene != genes[0]:
69
+ genes.remove(ref_gene)
70
+ genes.insert(0, ref_gene)
71
+ ref_seq = glfo['seqs'][args.region][ref_gene]
72
+ ref_pos = codon_positions[ref_gene]
73
+
74
+ for igene in range(0, len(genes)):
75
+ gene = genes[igene]
76
+ seq = glfo['seqs'][args.region][gene]
77
+ pos = codon_positions[gene]
78
+ if pos < ref_pos: # align the codon position in the case that this seq is shorter up to the codon
79
+ seq = (ref_pos - pos) * '-' + seq
80
+ pos += (ref_pos - pos)
81
+
82
+ right_pad_str = '' # i think i don't need this any more since i have the align option in color_mutants
83
+ # if len(seq) < max_seq_len:
84
+ # right_pad_str = (max_seq_len - len(seq)) * ' '
85
+
86
+ emph_positions = None if args.region == 'd' else [pos + i for i in range(3)]
87
+ colored_seq, isnps = utils.color_mutants(ref_seq, seq, return_isnps=True, emphasis_positions=emph_positions, align=True)
88
+ seqstrs[igene] += '%s%s' % (colored_seq, right_pad_str)
89
+ if len(isnps) > 0:
90
+ snpstrs[igene] = '%2d (%s)' % (len(isnps), ' '.join([str(i) for i in isnps]))
91
+
92
+ # ----------------------------------------------------------------------------------------
93
+ def print_str(gene, seqstr, snpstr):
94
+ return '%s %s %s %s' % (utils.color_gene(gene, width=gene_str_width), seqstr, utils.color_gene(gene, width=gene_str_width), snpstr)
95
+
96
+ for igene in range(len(genes)):
97
+ print(print_str(genes[igene], seqstrs[igene], snpstrs[igene]))
bin/cf-germlines.py ADDED
@@ -0,0 +1,57 @@
1
+ #!/usr/bin/env python3
2
+ from __future__ import absolute_import, division, unicode_literals
3
+ from __future__ import print_function
4
+ import argparse
5
+ import sys
6
+ import os
7
+ import copy
8
+ import collections
9
+ import colored_traceback.always
10
+
11
+ from pathlib import Path
12
+ partis_dir = str(Path(__file__).parent.parent)
13
+ if not os.path.exists(partis_dir):
14
+ print('WARNING current script dir %s doesn\'t exist, so python path may not be correctly set' % partis_dir)
15
+ sys.path.insert(1, partis_dir) # + '/python')
16
+
17
+ import python.utils as utils
18
+ import python.glutils as glutils
19
+
20
+ parser = argparse.ArgumentParser()
21
+ parser.add_argument('gldir1')
22
+ parser.add_argument('gldir2')
23
+ parser.add_argument('--names', default='+gl-1:+gl-2', help='colon-separated list of length 2 with labels for gldir1 and gldir2, which will be appended to each gene name in the ascii output')
24
+ parser.add_argument('--locus', default='igh')
25
+ args = parser.parse_args()
26
+ args.names = utils.get_arg_list(args.names)
27
+
28
+ # ----------------------------------------------------------------------------------------
29
+ def clrname(name):
30
+ return utils.color('blue', name)
31
+
32
+ # ----------------------------------------------------------------------------------------
33
+ glfos = []
34
+ for name, gldir in zip(args.names, [args.gldir1, args.gldir2]):
35
+ print('%s:' % clrname(name))
36
+ glfos.append(glutils.read_glfo(gldir, args.locus, debug=True))
37
+
38
+ for region in [r for r in utils.regions if r in glfos[0]['seqs']]:
39
+ aseqs, bseqs = [{s : n for n, s in g['seqs'][region].items()} for g in glfos] # dict of names keyed by seqs
40
+ a_only_seqs, b_only_seqs = set(aseqs) - set(bseqs), set(bseqs) - set(aseqs)
41
+
42
+ print('%s' % utils.color('green', region))
43
+
44
+ common_seqs = set(aseqs) & set(bseqs)
45
+ common_name_seqs = [aseqs[s] for s in common_seqs if aseqs[s]==bseqs[s]]
46
+ print(' %3d seqs in common with same name: %s' % (len(common_name_seqs), utils.color_genes(sorted(common_name_seqs))))
47
+ dnamed_seqs = [(aseqs[s], bseqs[s]) for s in common_seqs if aseqs[s] != bseqs[s]]
48
+ if len(dnamed_seqs) > 0:
49
+ print(' %s %d common seq%s with different names: %s' % (utils.wrnstr(), len(dnamed_seqs), utils.plural(len(dnamed_seqs)), ', '.join(utils.color_genes([an,bn]) for an, bn in dnamed_seqs)))
50
+ print(' only in:\n %12s: %3d %s\n %12s: %3d %s' % (clrname(args.names[0]), len(a_only_seqs), utils.color_genes(sorted(aseqs[s] for s in a_only_seqs)),
51
+ clrname(args.names[1]), len(b_only_seqs), utils.color_genes(sorted(bseqs[s] for s in b_only_seqs))))
52
+
53
+ tmpfo = glutils.get_empty_glfo(args.locus) # make a new glfo that will only have non-shared genes
54
+ for gname, oname, only_seqs, allseqs, ogfo in zip(args.names, reversed(args.names), [a_only_seqs, b_only_seqs], [aseqs, bseqs], reversed(glfos)): # <gset> is the genes that're only in <gname>
55
+ print(' finding nearest seq in %s for %d seqs only in %s' % (clrname(oname), len(only_seqs), clrname(gname)))
56
+ for oseq in only_seqs:
57
+ glutils.find_nearest_gene_in_glfo(ogfo, oseq, new_name=allseqs[oseq], region=region, debug=True)
bin/cf-linearham.py ADDED
@@ -0,0 +1,199 @@
1
+ #!/usr/bin/env python3
2
+ from __future__ import absolute_import, division, unicode_literals
3
+ from __future__ import print_function
4
+ import csv
5
+ import os
6
+ import sys
7
+ import argparse
8
+ import colored_traceback.always
9
+ import glob
10
+ import subprocess
11
+ import operator
12
+ from io import open
13
+
14
+ # if you move this script, you'll need to change this method of getting the imports
15
+ from pathlib import Path
16
+ partis_dir = str(Path(__file__).parent.parent)
17
+ # partis_dir = os.getcwd()
18
+ sys.path.insert(1, partis_dir) # + '/python')
19
+
20
+ import python.utils as utils
21
+ import python.glutils as glutils
22
+ from python.clusterpath import ClusterPath
23
+
24
+ parser = argparse.ArgumentParser()
25
+ # NOTE to compare multiple output runs see datascripts/meta/qa013-synth/read-lh-cf.py
26
+ parser.add_argument('--partis-file', required=True, help='partis yaml partition output file that includes alternative annotation information (i.e. --calculate-alternative-annotations was set while partitioning)')
27
+ parser.add_argument('--linearham-dir', required=True, help='linearham output dir (the main/parent dir))')
28
+ parser.add_argument('--prob-to-ignore', default=0.15, help='don\'t print sequences with probabilities smaller than this')
29
+ parser.add_argument('--outdir', help='if set, write csv info for the printed naive sequences to here')
30
+ args = parser.parse_args()
31
+
32
+ # ----------------------------------------------------------------------------------------
33
+ def read_linearham_output():
34
+ lh_info = {}
35
+ glbfns = [f for gstr in ['cluster', 'iclust-'] for f in glob.glob('%s/%s*' % (args.linearham_dir, gstr))]
36
+ clusterdirs = [d for d in (glbfns) if os.path.isdir(d)] # works for old style 'clusterN' and new-style 'cluster-N'
37
+ if len(clusterdirs) == 0:
38
+ raise Exception('no linearham cluster subdirs (of form clusterN/ or cluster-N/ found in %s' % args.linearham_dir)
39
+ print(' reading linearham info for %d cluster%s: %s' % (len(clusterdirs), utils.plural(len(clusterdirs)),' '.join(os.path.basename(cd) for cd in clusterdirs)))
40
+ for cdir in clusterdirs:
41
+ sfnames = [f for gstr in ['', 'lineage*/'] for f in glob.glob('%s/%scluster_seqs.fasta' % (cdir, gstr))]
42
+ if len(sfnames) == 0:
43
+ raise Exception('no sequence files \'*_seqs.fasta\' found in %s' % cdir)
44
+ sfn = utils.get_single_entry(sfnames)
45
+ input_seqfos = utils.read_fastx(sfn)
46
+ input_uids = [sfo['name'] for sfo in input_seqfos if sfo['name'] != 'naive']
47
+ # aa_naive_seqs.fasta: prob of each aa naive seq
48
+ # aa_naive_seqs.dnamap: prob of each nuc naive seq contributing to each of those aa naive seqs
49
+ aasfn = '%s/aa_naive_seqs.fasta'%os.path.dirname(sfn)
50
+ aa_seq_infos = utils.read_fastx(aasfn)
51
+ for iseq, sfo in enumerate(aa_seq_infos):
52
+ tlist = sfo['name'].split('_')
53
+ assert len(tlist) == 3
54
+ assert int(tlist[1]) == iseq
55
+ sfo['prob'] = float(tlist[2])
56
+ with open(aasfn.replace('.fasta', '.dnamap')) as outfile: # this is some weird bastardization of a fasta file
57
+ iseq = -1
58
+ for line in outfile:
59
+ if line[0] == '>':
60
+ iseq += 1
61
+ assert line.strip().lstrip('>') == aa_seq_infos[iseq]['name']
62
+ aa_seq_infos[iseq]['nuc_seqs_probs'] = []
63
+ continue
64
+ prob, naive_nuc_seq = line.strip().split(',')
65
+ aa_seq_infos[iseq]['nuc_seqs_probs'].append((naive_nuc_seq, float(prob)))
66
+ lh_info[':'.join(input_uids)] = aa_seq_infos
67
+ return lh_info
68
+
69
+ # ----------------------------------------------------------------------------------------
70
+ def print_naive_seq_lines(nseq_info, namestr, namecolor, ref_seq=None, amino_acid=False, writefo=None):
71
+ def i_aa_color(i_aa):
72
+ tmpcolors = ['purple', 'yellow', 'red', 'blue', 'green']
73
+ return tmpcolors[i_aa % len(tmpcolors)]
74
+ total_prob = 0.
75
+ breaking = False
76
+ for naive_seq, prob, i_aa_seq in sorted(nseq_info, key=operator.itemgetter(1), reverse=True):
77
+ if ref_seq is None:
78
+ ref_seq = naive_seq
79
+ breakstr = ''
80
+ if 1. - total_prob < args.prob_to_ignore:
81
+ breaking = True
82
+ breakstr = 'total: %5.2f (breaking after %.2f)' % (prob+total_prob, 1. - args.prob_to_ignore)
83
+ print(' %s %s %5.2f %s %s' % (utils.color_mutants(ref_seq, naive_seq, amino_acid=amino_acid, align_if_necessary=True),
84
+ utils.color(i_aa_color(i_aa_seq), str(i_aa_seq), width=2), prob, utils.color(namecolor, namestr, width=9, padside='right'), breakstr))
85
+ if writefo is not None:
86
+ writefo.append({'method' : namestr, 'prob' : prob, 'seq' : naive_seq})
87
+ if breaking:
88
+ break
89
+ total_prob += prob
90
+ print('')
91
+ return ref_seq
92
+
93
+ # ----------------------------------------------------------------------------------------
94
+ def print_all_lines(lh_aa_seq_infos, pline, amino_acid=False):
95
+ seq_len = len(lh_aa_seq_infos[0]['seq'] if amino_acid else pline['naive_seq'])
96
+ anstr = '%s %s naive seqs' % (headstr('1.' if amino_acid else '3.'), 'amino acid' if amino_acid else 'nucleotide')
97
+ print(' %s:%s aa seq' % (anstr, (seq_len - utils.len_excluding_colors(anstr)) * ' '))
98
+ if amino_acid:
99
+ codon_str = utils.color('reverse_video', 'X')
100
+ vpos, jpos = [pline['codon_positions'][r] // 3 for r in ['v', 'j']]
101
+ cdstr = '%s%s%s%s%s' % (' '*vpos, codon_str, '-'*(jpos - vpos - 1), codon_str, ' '*(seq_len - jpos - 1))
102
+ else:
103
+ cdstr = seq_len*' '
104
+ print(' %s index prob' % cdstr)
105
+ writefo=[]
106
+ ref_seq = print_naive_seq_lines(get_lh_nsinfo(lh_aa_seq_infos, amino_acid=amino_acid), 'linearham', 'green', amino_acid=amino_acid, writefo=writefo)
107
+ _ = print_naive_seq_lines(get_partis_nsinfo(pline, amino_acid=amino_acid), 'partis', 'blue', ref_seq=ref_seq, amino_acid=amino_acid, writefo=writefo) # use the linearham naive seq as ref_seq also for partis
108
+ if not os.path.exists(args.outdir):
109
+ os.makedirs(args.outdir)
110
+ if args.outdir is not None:
111
+ print(' writing %s seqs to %s' % ('aa' if amino_acid else 'nuc', args.outdir))
112
+ with open('%s/%s-seqs.csv'%(args.outdir, 'aa' if amino_acid else 'nuc'), 'w') as jfile:
113
+ writer = csv.DictWriter(jfile, writefo[0].keys())
114
+ writer.writeheader()
115
+ for wfo in writefo:
116
+ writer.writerow(wfo)
117
+
118
+ # ----------------------------------------------------------------------------------------
119
+ def print_gene_calls(pline):
120
+ # TODO would be nice to read the linearham annotation so that we can highlight the v/d/j genes that linearham assumed were correct
121
+ print(' %s partis gene calls (linearham only considers one gene combo):' % headstr('4.'))
122
+ print(' prob gene')
123
+ for region in utils.regions:
124
+ print(' %s' % utils.color('blue', region))
125
+ for gene, prob in pline['alternative-annotations']['gene-calls'][region]:
126
+ print(' %4.2f %s' % (prob, utils.color_gene(gene, width=15)))
127
+
128
+ # ----------------------------------------------------------------------------------------
129
+ def get_lh_nsinfo(lh_aa_seq_infos, amino_acid=False):
130
+ if amino_acid:
131
+ return [(s['seq'], s['prob'], i) for i, s in enumerate(lh_aa_seq_infos)]
132
+ else:
133
+ return [(ns, np, i) for i, s in enumerate(lh_aa_seq_infos) for ns, np in s['nuc_seqs_probs']]
134
+
135
+ # ----------------------------------------------------------------------------------------
136
+ def get_partis_nsinfo(pline, amino_acid=False):
137
+ nuc_naive_seqs = pline['alternative-annotations']['naive-seqs'] if 'alternative-annotations' in pline else [(pline['naive_seq'], 1.), ]
138
+ pdict, sdict = {}, {}
139
+ for aseq, nseq, prob in [(utils.ltranslate(nseq, trim=True), nseq, prob) for nseq, prob in nuc_naive_seqs]: # add up the probs for any nuc seqs that code for the same aa seq
140
+ if aseq not in pdict:
141
+ pdict[aseq] = 0.
142
+ sdict[aseq] = []
143
+ pdict[aseq] += prob
144
+ sdict[aseq].append(nseq)
145
+ aa_naive_seqs = sorted(list(pdict.items()), key=operator.itemgetter(1), reverse=True)
146
+ if amino_acid:
147
+ return [(s, p, i) for i, (s, p) in enumerate(aa_naive_seqs)]
148
+ else:
149
+ nuc_seq_aa_indices = {}
150
+ for iseq, (aseq, _) in enumerate(aa_naive_seqs):
151
+ for nseq in sdict[aseq]:
152
+ nuc_seq_aa_indices[nseq] = iseq
153
+ return [(s, p, nuc_seq_aa_indices[s]) for s, p in nuc_naive_seqs]
154
+
155
+ # ----------------------------------------------------------------------------------------
156
+ def headstr(hstr):
157
+ return utils.color('green', hstr)
158
+
159
+ # ----------------------------------------------------------------------------------------
160
+ glfo, annotation_list, cpath = utils.read_output(args.partis_file)
161
+ lh_info = read_linearham_output()
162
+
163
+ annotations = {':'.join(adict['unique_ids']) : adict for adict in annotation_list} # collect the annotations in a dictionary so they're easier to access
164
+ most_likely_partition = cpath.partitions[cpath.i_best] # a partition is represented as a list of lists of strings, with each string a sequence id
165
+ print(' %d (of %d) clusters from partis file share uids with %d linearham cluster%s' % (len([c for c in most_likely_partition if any(len(set(c) & set(lc.split(':'))) > 0 for lc in lh_info)]), len(most_likely_partition), len(lh_info), utils.plural(len(lh_info))))
166
+ sorted_clusters = sorted(most_likely_partition, key=len, reverse=True)
167
+ for cluster in sorted_clusters:
168
+ pline = annotations[':'.join(cluster)]
169
+ utils.trim_fwk_insertions(glfo, pline, modify_alternative_annotations=True) # linearham probably won't have them, so we need to remove them so things line up
170
+ if 'alternative-annotations' not in pline:
171
+ print(' note: no alternative annotations in %s, so can\'t print partis alternative naive sequences' % args.partis_file)
172
+
173
+ p_uids = set(pline['unique_ids'])
174
+ lh_clusters = [(uidstr, cfo) for uidstr, cfo in lh_info.items() if set(uidstr.split(':')) & p_uids] # lh clusters with any uids in common iwth this partis <cluster> (there should only be 1)
175
+ lh_aa_seq_infos = []
176
+ if len(lh_clusters) == 0:
177
+ # print ' no linearham clusters with any of these uids' % utils.color('red', 'error')
178
+ continue
179
+ elif len(lh_clusters) != 1:
180
+ raise Exception('should have only one linearham cluster with uids in common with this cluster, but found %d' % len(lh_clusters))
181
+ else:
182
+ lh_uidstr, lh_aa_seq_infos = lh_clusters[0]
183
+ lh_uids = set(lh_uidstr.split(':'))
184
+ print('%s with sizes: partis %d linearham %d (%d in common)' % (utils.color('blue', 'starting clusters'), len(pline['unique_ids']), len(lh_uids), len(lh_uids & p_uids)))
185
+ if len(lh_uids - p_uids) > 0:
186
+ print(' %s %d extra uids in linearham cluster' % (utils.color('yellow', 'warning'), len(lh_uids - p_uids)))
187
+ if len(p_uids - lh_uids) > 0:
188
+ print(' %s %d extra uids in partis cluster' % (utils.color('yellow', 'warning'), len(p_uids - lh_uids)))
189
+ if len(lh_aa_seq_infos) == 0:
190
+ print(' %s no prob/naive_seq pairs in linearham for this cluster' % utils.color('red', 'error'))
191
+
192
+ print_all_lines(lh_aa_seq_infos, pline, amino_acid=True)
193
+ utils.print_reco_event(utils.synthesize_single_seq_line(pline, iseq=0), extra_str=' ', label='%s annotation for a single (arbitrary) sequence from the cluster:'%headstr('2.'))
194
+ print_all_lines(lh_aa_seq_infos, pline, amino_acid=False)
195
+
196
+ if 'alternative-annotations' in pline:
197
+ print_gene_calls(pline)
198
+
199
+ print('')
bin/chimera-plot.py ADDED
@@ -0,0 +1,76 @@
1
+ #!/usr/bin/env python3
2
+ from __future__ import absolute_import, division, unicode_literals
3
+ from __future__ import print_function
4
+ import collections
5
+ import argparse
6
+ import sys
7
+ import os
8
+ import csv
9
+
10
+ from pathlib import Path
11
+ partis_dir = str(Path(__file__).parent.parent)
12
+ if not os.path.exists(partis_dir):
13
+ print('WARNING current script dir %s doesn\'t exist, so python path may not be correctly set' % partis_dir)
14
+ sys.path.insert(1, partis_dir) # + '/python')
15
+
16
+ import python.utils as utils
17
+ from python.hist import Hist
18
+ import python.plotting as plotting
19
+ import python.glutils as glutils
20
+
21
+ parser = argparse.ArgumentParser()
22
+ parser.add_argument('infile')
23
+ parser.add_argument('plotdir')
24
+ parser.add_argument('--glfo-dir', default='data/germlines/human', help='I\'m hacking this in afterwards because this was written before switching to yaml output files, so I think it was using this default germline dir anyway (except it used old glfo with different genes, so you probably actually have to pass in the real corresponding glfo anyway)')
25
+ parser.add_argument('--chunk-len', default=75, type=int)
26
+ parser.add_argument('--cutoff', default=0.3, help='point in max-abs-diff above which we assume most sequences are chimeric')
27
+ parser.add_argument('--title')
28
+ parser.add_argument('--locus', default='igh')
29
+ args = parser.parse_args()
30
+ if args.title == 'good':
31
+ args.title = 'none'
32
+ elif args.title == 'chimeras':
33
+ args.title = 'all chimeras'
34
+
35
+ def gk(uids):
36
+ return ':'.join(uids)
37
+
38
+ glfo = None
39
+ if utils.getsuffix(args.infile) == '.csv':
40
+ glfo = glutils.read_glfo(args.glfo_dir, args.locus)
41
+ glfo, annotation_list, _ = utils.read_output(args.infile, glfo=glfo)
42
+ annotations = collections.OrderedDict((line['unique_ids'][0], line) for line in annotation_list)
43
+
44
+ chfo = {uid : {k : v for k, v in zip(('imax', 'max_abs_diff'), utils.get_chimera_max_abs_diff(annotations[uid], iseq=0, chunk_len=args.chunk_len))} for uid in annotations}
45
+ biggest_adiffs = sorted(chfo, key=lambda q: chfo[q]['max_abs_diff'], reverse=True)
46
+ for uid in biggest_adiffs[:5]:
47
+ print('%-3d %6.3f' % (chfo[uid]['imax'], chfo[uid]['max_abs_diff']))
48
+ utils.print_reco_event(annotations[uid])
49
+
50
+ n_above_cutoff = len([_ for cfo in chfo.values() if cfo['max_abs_diff'] > args.cutoff])
51
+ chimeric_fraction = n_above_cutoff / float(len(chfo))
52
+ print(' %d / %d = %.3f above chimeric cutoff' % (n_above_cutoff, len(chfo), chimeric_fraction))
53
+
54
+ hmaxval = Hist(45, 0., 0.65)
55
+ for uid in annotations:
56
+ hmaxval.fill(chfo[uid]['max_abs_diff'])
57
+ himax = Hist(75, 0., 400)
58
+ for uid in annotations:
59
+ himax.fill(chfo[uid]['imax'])
60
+
61
+ utils.prep_dir(args.plotdir, wildlings=['*.svg', '*.csv'])
62
+
63
+ import matplotlib
64
+ from matplotlib import pyplot as plt
65
+ fig, ax = plotting.mpl_init()
66
+ xvals, yvals = list(zip(*[(v['imax'], v['max_abs_diff']) for v in chfo.values()]))
67
+ plt.scatter(xvals, yvals, alpha=0.4)
68
+
69
+ print('writing to %s' % args.plotdir)
70
+ plotting.mpl_finish(ax, args.plotdir, 'hexbin', title=args.title, xlabel='break point', ylabel='abs mfreq diff')
71
+
72
+ plotting.draw_no_root(hmaxval, plotdir=args.plotdir, plotname='mfreq-diff', shift_overflows=True, xtitle='abs mfreq diff', ytitle='seqs')
73
+ hmaxval.write('%s/%s.csv' % (args.plotdir, 'mfreq-diff'))
74
+
75
+ plotting.draw_no_root(himax, plotdir=args.plotdir, plotname='imax', shift_overflows=True, xtitle='break point', ytitle='seqs')
76
+ himax.write('%s/%s.csv' % (args.plotdir, 'imax'))
@@ -0,0 +1,143 @@
1
+ #!/usr/bin/env python3
2
+ from __future__ import absolute_import, division, unicode_literals
3
+ from __future__ import print_function
4
+ import sys
5
+ import csv
6
+ from io import open
7
+ csv.field_size_limit(sys.maxsize) # make sure we can write very large csv fields
8
+ import os
9
+ import argparse
10
+ import colored_traceback.always
11
+ import operator
12
+
13
+ # if you move this script, you'll need to change this method of getting the imports
14
+ from pathlib import Path
15
+ partis_dir = str(Path(__file__).parent.parent)
16
+ sys.path.insert(1, partis_dir) # + '/python')
17
+
18
+ import python.utils as utils
19
+ import python.glutils as glutils
20
+ from python.clusterpath import ClusterPath
21
+ import python.seqfileopener as seqfileopener
22
+ import python.indelutils as indelutils
23
+ import python.treeutils as treeutils
24
+
25
+ # ----------------------------------------------------------------------------------------
26
def addseq(ltmp, tline, uid, iclust):
    """Record seq <uid> from annotation <tline> (locus <ltmp>, heavy-cluster index <iclust>) in the global chosen_seqs, skipping duplicates and warning if it has shm indels."""
    already_chosen = any(uid == sfo['name'] for sfo in chosen_seqs[ltmp])
    if already_chosen:  # don't add it twice
        return
    iseq = tline['unique_ids'].index(uid)
    if indelutils.has_indels_line(tline, iseq):
        indel_warning_strs.append('  %s shm indels in chosen seq %s, which means you need to decide by hand whether you want to choose the input or indel-reversed seq (indel-reversed is written to output file' % (utils.color('yellow', 'warning'), uid))
    new_sfo = {'name' : uid,
               'seq' : utils.per_seq_val(tline, 'seqs', uid),
               'locus' : ltmp,
               'igh_iclust' : iclust,
               'aa-cdist' : treeutils.smvals(tline, 'cons-dist-aa', uid=uid)}
    chosen_seqs[ltmp].append(new_sfo)
39
+ # ----------------------------------------------------------------------------------------
40
def translate_paired_ids(ltmp, pids):
    """Return <pids> with the sample prefix and locus <ltmp> prepended to each uid (matches the uid munging done when merging samples)."""
    return ['-'.join([args.sample_prefix, ltmp, u]) for u in pids]
42
+
43
+ # ----------------------------------------------------------------------------------------
44
helpstr = """
I think this was an old script to kind of do a hackey version of approximate bulk pairing (but not really sure, would need to read through it more carefully, atm i'm adding this late and i forget).
see usage: datascripts/meta/qa013-synth/run.sh
"""
class MultiplyInheritedFormatter(argparse.RawTextHelpFormatter, argparse.ArgumentDefaultsHelpFormatter):
    # raw-text help (preserves helpstr's line breaks) plus automatic display of argument defaults
    pass
parser = argparse.ArgumentParser(formatter_class=MultiplyInheritedFormatter, description=helpstr)
parser.add_argument('igh_fname')
parser.add_argument('igk_fname')
parser.add_argument('igl_fname')
parser.add_argument('--input-metafnames')
parser.add_argument('--sample-prefix', default='QA013-10x-pre', help='str that needs to be prepended to paired uid to match the uid in \'paired-uids\' (necessary because when we merge samples in datascripts/preprocess.py we don\'t know which sample each paired id is from)')
parser.add_argument('--outfname')
parser.add_argument('--n-largest-clusters', type=int, default=3)
parser.add_argument('--n-to-choose', type=int, default=2)
parser.add_argument('--choose-paired', action='store_true')
# NOTE can't add an --n-max-queries arg for testing, since we have to read all the other chains to find the right cluster
args = parser.parse_args()
args.input_metafnames = utils.get_arg_list(args.input_metafnames)  # comma-separated str --> list
65
+
66
# read per-locus partitions and annotations from the three input files
cpaths, antn_lists = {}, {}
for tloc, tfname in zip(['igh', 'igk', 'igl'], [args.igh_fname, args.igk_fname, args.igl_fname]):
    _, antn_lists[tloc], cpaths[tloc] = utils.read_output(tfname)
    # start with empty pairing info for every seq
    for tline in antn_lists[tloc]:
        tline['paired-uids'] = [[] for _ in tline['unique_ids']]
    # then fill it in from the input meta files, if we have any
    if args.input_metafnames is not None:
        seqfileopener.read_input_metafo(args.input_metafnames, antn_lists[tloc])
73
+
74
chosen_seqs = {l : [] for l in utils.sub_loci('ig')}  # filled by addseq()
indel_warning_strs = []
lp_antn_pairs = []  # heavy/light annotation pairs; somewhat similar to paircluster.find_cluster_pairs()
antn_dicts = {l : utils.get_annotation_dict(alist) for l, alist in antn_lists.items()}
sorted_hclusters = sorted(cpaths['igh'].best(), key=len, reverse=True)[:args.n_largest_clusters]
csize_strs = [str(len(tc)) for tc in sorted_hclusters]
print(' choosing seqs from %d largest igh clusters with sizes %s' % (args.n_largest_clusters, ' '.join(csize_strs)))
print(' igh N igh light l clust N chosen')
print(' iclust size paired locus size aa-cdist paired')
82
# for each of the largest heavy clusters: find its (hopefully unique) partner light
# cluster via pairing info, then choose the best seqs from both by aa consensus distance
for iclust, hclust in enumerate(sorted_hclusters):
    print(' %3d %5d' % (iclust, len(hclust)), end=' ')
    hline = antn_dicts['igh'][':'.join(hclust)]
    # (heavy uid, light uid) pairs for every heavy seq with exactly one paired uid
    tid_lists = [(u, pids[0]) for u, pids in zip(hline['unique_ids'], hline['paired-uids']) if len(pids)==1]  # NOTE this doesn't check that the pairing info is reciprocal (i.e. that the paired-uids in the light chain correpsond to the h seqs)
    if len(tid_lists) == 0:
        non_zero_pids = [pids for pids in hline['paired-uids'] if len(pids) > 0]
        print(' no uniquely-paired seqs (paired-uids lengths: %s, +%d unpaired)' % (' '.join(str(n) for n in sorted((len(pids) for pids in non_zero_pids), reverse=True)), len(hline['unique_ids']) - len(non_zero_pids)))
        continue
    h_paired_ids, l_paired_ids = list(zip(*tid_lists))
    print(' %3d' % len(h_paired_ids), end=' ')

    # look for light clusters (either locus) that overlap the translated paired light ids
    lclusts = []
    for ltmp in ['igk', 'igl']:
        l_tmp_ids = translate_paired_ids(ltmp, l_paired_ids)
        for lc in cpaths[ltmp].best():
            if len(set(lc) & set(l_tmp_ids)) > 0:
                lclusts.append((ltmp, lc))
    if len(lclusts) != 1:
        print(' couldn\'t find unique light cluster (found %d) for %d paired ids (from %d heavy ids)' % (len(lclusts), len(l_paired_ids), len(h_paired_ids)))
        continue
    l_locus, lclust = lclusts[0]
    lline = antn_dicts[l_locus][':'.join(lclust)]
    lp_antn_pairs.append((hline, lline))
    print(' %3s %4d' % (l_locus, len(lclust)), end=' ')

    # add aa-cdist (it's probably usually already there, but it's easy to add, and should always end up the same)
    # (NOTE removed a redundant 'import python.treeutils as treeutils' that used to re-run here on every iteration -- it's already imported at the top of the file)
    tmpids = {}
    for ltmp, tline in zip(('igh', l_locus), (hline, lline)):
        tline['tree-info'] = {'lb' : {}}
        treeutils.add_cdists_to_lbfo(tline, tline['tree-info']['lb'], 'cons-dist-aa')
        tmpids[ltmp], _ = list(zip(*sorted(list(tline['tree-info']['lb']['cons-dist-aa'].items()), key=operator.itemgetter(1), reverse=True)))
        tmpids[ltmp] = tmpids[ltmp][:args.n_to_choose]  # keep the <n_to_choose> uids with largest aa-cdist
        for uid in tmpids[ltmp]:
            addseq(ltmp, tline, uid, iclust)
    print(' %2d %2d' % (len(tmpids['igh']), len(tmpids[l_locus])), end=' ')

    if args.choose_paired:  # also choose every uniquely-paired seq from both chains
        for ltmp, tline, cids in zip(('igh', l_locus), (hline, lline), (h_paired_ids, translate_paired_ids(l_locus, l_paired_ids))):
            for uid in cids:
                addseq(ltmp, tline, uid, iclust)
        print(' %2d %2d' % (len(h_paired_ids), len(l_paired_ids)), end=' ')
    print('')
125
+
126
if len(indel_warning_strs) > 0:
    print('\n'.join(indel_warning_strs))

all_chosen_ids = [s['name'] for sfos in chosen_seqs.values() for s in sfos]
for hline, lline in lp_antn_pairs:
    # BUG FIX: the last count used to be computed against hline['unique_ids'] as well (copy/paste
    # error), so the light-chain chosen count always just repeated the heavy-chain one
    print('%s sizes %d %d chose %d %d' % (utils.color('green', '-->'), len(hline['unique_ids']), len(lline['unique_ids']), len([u for u in all_chosen_ids if u in hline['unique_ids']]), len([u for u in all_chosen_ids if u in lline['unique_ids']])))
    for tline in [hline, lline]:
        utils.print_reco_event(tline, extra_print_keys=['cons-dist-aa', 'paired-uids'], queries_to_emphasize=all_chosen_ids, extra_str=' ')
135
if args.outfname is not None:
    # BUG FIX: used to print len(chosen_seqs), which is the number of loci (always 3), not the number of chosen seqs
    n_chosen = sum(len(sfos) for sfos in chosen_seqs.values())
    print(' writing %d chosen seqs to %s' % (n_chosen, args.outfname))
    utils.mkdir(args.outfname, isfile=True)
    with open(args.outfname, utils.csv_wmode()) as ofile:
        # take headers from any non-empty locus (used to index chosen_seqs['igh'][0], which crashes with IndexError if no igh seqs were chosen)
        non_empty = [sfos for sfos in chosen_seqs.values() if len(sfos) > 0]
        headers = sorted(non_empty[0][0].keys()) if len(non_empty) > 0 else []
        writer = csv.DictWriter(ofile, headers)  # NOTE dammit this is way too similar to treeutils.combine_selection_metrics(), i need to maybe split the csv writing code out of there?
        writer.writeheader()
        for ltmp, seqfos in chosen_seqs.items():
            for sfo in seqfos:
                writer.writerow(sfo)
bin/circle-plots.py ADDED
@@ -0,0 +1,30 @@
1
+ #!/usr/bin/env python3
2
+ from __future__ import absolute_import, division, unicode_literals
3
+ import sys
4
+ import colored_traceback.always
5
+ import os
6
+ import circlify
7
+ import json
8
+ import argparse
9
+ import csv
10
+ from io import open
11
+
12
parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter)
parser.add_argument('infname')
parser.add_argument('outfname')
args = parser.parse_args()

# read id + radius for each circle from the input csv
radii = []
with open(args.infname) as ifile:
    reader = csv.DictReader(ifile)
    for line in reader:
        radii.append({'id' : line['id'], 'radius' : float(line['radius'])})

circlefos = circlify.circlify(radii, datum_field='radius', id_field='id')  # NOTE this doesn't return them in the same order
# BUG FIX: in python 3 text mode the csv module requires newline='' so it controls line
# endings itself (otherwise you get spurious blank rows on windows); python 2 wants binary mode
wmodes = {'mode' : 'wb'} if sys.version_info.major < 3 else {'mode' : 'w', 'newline' : ''}
with open(args.outfname, **wmodes) as ofile:
    def gfn(k, c):  # x/y/r are real attributes on the circle object, while 'id' is stashed in its .ex dict
        return getattr(c, k) if hasattr(c, k) else getattr(c, 'ex')[k]
    headers = ('id', 'x', 'y', 'r')
    writer = csv.DictWriter(ofile, headers)
    writer.writeheader()
    for cfo in circlefos:
        writer.writerow({k : gfn(k, cfo) for k in headers})