partis-bcr 1.0.0__py3-none-any.whl → 1.0.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (104)
  1. bin/FastTree +0 -0
  2. bin/add-chimeras.py +59 -0
  3. bin/add-seqs-to-outputs.py +81 -0
  4. bin/bcr-phylo-run.py +799 -0
  5. bin/build.sh +24 -0
  6. bin/cf-alleles.py +97 -0
  7. bin/cf-germlines.py +57 -0
  8. bin/cf-linearham.py +199 -0
  9. bin/chimera-plot.py +76 -0
  10. bin/choose-partially-paired.py +143 -0
  11. bin/circle-plots.py +30 -0
  12. bin/compare-plotdirs.py +298 -0
  13. bin/diff-parameters.py +133 -0
  14. bin/docker-hub-push.sh +6 -0
  15. bin/extract-pairing-info.py +55 -0
  16. bin/gcdyn-simu-run.py +223 -0
  17. bin/gctree-run.py +244 -0
  18. bin/get-naive-probabilities.py +126 -0
  19. bin/iqtree-1.6.12 +0 -0
  20. bin/lonr.r +1020 -0
  21. bin/makeHtml +52 -0
  22. bin/mds-run.py +46 -0
  23. bin/parse-output.py +277 -0
  24. bin/partis +1869 -0
  25. bin/partis-pip +116 -0
  26. bin/partis.py +1869 -0
  27. bin/plot-gl-set-trees.py +519 -0
  28. bin/plot-hmms.py +151 -0
  29. bin/plot-lb-tree.py +427 -0
  30. bin/raxml-ng +0 -0
  31. bin/read-bcr-phylo-trees.py +38 -0
  32. bin/read-gctree-output.py +166 -0
  33. bin/run-chimeras.sh +64 -0
  34. bin/run-dtr-scan.sh +25 -0
  35. bin/run-paired-loci.sh +100 -0
  36. bin/run-tree-metrics.sh +88 -0
  37. bin/smetric-run.py +62 -0
  38. bin/split-loci.py +317 -0
  39. bin/swarm-2.1.13-linux-x86_64 +0 -0
  40. bin/test-germline-inference.py +425 -0
  41. bin/tree-perf-run.py +194 -0
  42. bin/vsearch-2.4.3-linux-x86_64 +0 -0
  43. bin/vsearch-2.4.3-macos-x86_64 +0 -0
  44. bin/xvfb-run +194 -0
  45. partis_bcr-1.0.2.data/scripts/cf-alleles.py +97 -0
  46. partis_bcr-1.0.2.data/scripts/cf-germlines.py +57 -0
  47. partis_bcr-1.0.2.data/scripts/extract-pairing-info.py +55 -0
  48. partis_bcr-1.0.2.data/scripts/gctree-run.py +244 -0
  49. partis_bcr-1.0.2.data/scripts/parse-output.py +277 -0
  50. partis_bcr-1.0.2.data/scripts/split-loci.py +317 -0
  51. partis_bcr-1.0.2.data/scripts/test.py +1005 -0
  52. {partis_bcr-1.0.0.dist-info → partis_bcr-1.0.2.dist-info}/METADATA +1 -1
  53. {partis_bcr-1.0.0.dist-info → partis_bcr-1.0.2.dist-info}/RECORD +101 -51
  54. partis_bcr-1.0.2.dist-info/top_level.txt +1 -0
  55. {partis → python}/glutils.py +1 -1
  56. python/main.py +30 -0
  57. {partis → python}/plotting.py +10 -1
  58. {partis → python}/treeutils.py +18 -16
  59. {partis → python}/utils.py +14 -7
  60. packages/ham/bcrham +0 -0
  61. partis/main.py +0 -59
  62. partis_bcr-1.0.0.dist-info/top_level.txt +0 -1
  63. {partis_bcr-1.0.0.dist-info → partis_bcr-1.0.2.dist-info}/WHEEL +0 -0
  64. {partis_bcr-1.0.0.dist-info → partis_bcr-1.0.2.dist-info}/entry_points.txt +0 -0
  65. {partis_bcr-1.0.0.dist-info → partis_bcr-1.0.2.dist-info}/licenses/COPYING +0 -0
  66. {partis → python}/__init__.py +0 -0
  67. {partis → python}/alleleclusterer.py +0 -0
  68. {partis → python}/allelefinder.py +0 -0
  69. {partis → python}/alleleremover.py +0 -0
  70. {partis → python}/annotationclustering.py +0 -0
  71. {partis → python}/baseutils.py +0 -0
  72. {partis → python}/cache/__init__.py +0 -0
  73. {partis → python}/cache/cached_uncertainties.py +0 -0
  74. {partis → python}/clusterpath.py +0 -0
  75. {partis → python}/coar.py +0 -0
  76. {partis → python}/corrcounter.py +0 -0
  77. {partis → python}/datautils.py +0 -0
  78. {partis → python}/event.py +0 -0
  79. {partis → python}/fraction_uncertainty.py +0 -0
  80. {partis → python}/gex.py +0 -0
  81. {partis → python}/glomerator.py +0 -0
  82. {partis → python}/hist.py +0 -0
  83. {partis → python}/hmmwriter.py +0 -0
  84. {partis → python}/hutils.py +0 -0
  85. {partis → python}/indelutils.py +0 -0
  86. {partis → python}/lbplotting.py +0 -0
  87. {partis → python}/mds.py +0 -0
  88. {partis → python}/mutefreqer.py +0 -0
  89. {partis → python}/paircluster.py +0 -0
  90. {partis → python}/parametercounter.py +0 -0
  91. {partis → python}/paramutils.py +0 -0
  92. {partis → python}/partitiondriver.py +0 -0
  93. {partis → python}/partitionplotter.py +0 -0
  94. {partis → python}/performanceplotter.py +0 -0
  95. {partis → python}/plotconfig.py +0 -0
  96. {partis → python}/processargs.py +0 -0
  97. {partis → python}/prutils.py +0 -0
  98. {partis → python}/recombinator.py +0 -0
  99. {partis → python}/scanplot.py +0 -0
  100. {partis → python}/seqfileopener.py +0 -0
  101. {partis → python}/treegenerator.py +0 -0
  102. {partis → python}/viterbicluster.py +0 -0
  103. {partis → python}/vrc01.py +0 -0
  104. {partis → python}/waterer.py +0 -0
bin/build.sh ADDED
@@ -0,0 +1,24 @@
1
+ #!/bin/bash
2
+
3
+ echo -e "\n--> running $0"
4
+ set -eu
5
+
6
+ SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
7
+ basedir=`dirname $SCRIPT_DIR` # go up one level
8
+
9
+ echo -e "\n--> building ig-sw"
10
+ cd $basedir/packages/ig-sw/src/ig_align/ && scons
11
+ cd $basedir/
12
+
13
+ echo -e "\n--> building ham"
14
+ cd $basedir/packages/ham/ && scons bcrham
15
+ cd $basedir/
16
+
17
+ if [ "$*" == "with-simulation" ]; then
18
+ echo -e "\n--> building bpp-newlik (only used for simulation)"
19
+ cd $basedir/packages/bpp-newlik/ && ./install.sh # the bpp-phyl step is really really incredibly slow
20
+ cd $basedir/
21
+ fi
22
+
23
+ echo -e "\n--> test"
24
+ ./test/test.py --quick
bin/cf-alleles.py ADDED
@@ -0,0 +1,97 @@
1
+ #!/usr/bin/env python3
2
+ from __future__ import absolute_import, division, unicode_literals
3
+ from __future__ import print_function
4
+ import argparse
5
+ import os
6
+ import sys
7
+
8
+ from pathlib import Path
9
+ partis_dir = str(Path(__file__).parent.parent)
10
+ if not os.path.exists(partis_dir):
11
+ print('WARNING current script dir %s doesn\'t exist, so python path may not be correctly set' % partis_dir)
12
+ sys.path.insert(1, partis_dir) # + '/python')
13
+
14
+ import python.utils as utils
15
+ import python.glutils as glutils
16
+
17
+ parser = argparse.ArgumentParser()
18
+ parser.add_argument('--bases', required=True, help='colon-separated list of the bits before the stars, e.g. 1-18:2-2 (set to \'all\' to print entire germline set)')
19
+ parser.add_argument('--allele-numbers')
20
+ parser.add_argument('--ref-allele', help='print this one first')
21
+ parser.add_argument('--other-genes')
22
+ parser.add_argument('--region', default='v')
23
+ parser.add_argument('--locus', default='igh', choices=utils.loci)
24
+ parser.add_argument('--species', default='human')
25
+ parser.add_argument('--glfo-dir', help='default set below')
26
+ args = parser.parse_args()
27
+
28
+ if args.glfo_dir is None:
29
+ args.glfo_dir = 'data/germlines/' + args.species
30
+
31
+ glfo = glutils.read_glfo(args.glfo_dir, args.locus)
32
+
33
+ # ----------------------------------------------------------------------------------------
34
+ def get_base(gene):
35
+ basestr = utils.primary_version(gene)
36
+ if utils.sub_version(gene) is not None:
37
+ basestr += '-' + utils.sub_version(gene)
38
+ return basestr
39
+
40
+ # ----------------------------------------------------------------------------------------
41
+ def get_genes(base, alleles=None):
42
+ if alleles is None: # take all of 'em
43
+ alleles = [utils.allele(g) for g in glfo['seqs'][args.region] if base == get_base(g)]
44
+ return [args.locus.upper() + args.region.upper() + base + '*' + al for al in alleles]
45
+
46
+ if args.bases == 'all':
47
+ input_groupfcn = None # lambda g: str(utils.primary_version(g) in ['4', '5']) # this example puts all the 4 and 5 primary versions in one group, and everybody else in another
48
+ glutils.print_glfo(glfo, only_region=(args.region if args.region != 'v' else None), input_groupfcn=input_groupfcn) # not much point in doing only v, since it's the one that takes most of the time
49
+ sys.exit(0)
50
+
51
+ args.bases = utils.get_arg_list(args.bases)
52
+ args.allele_numbers = utils.get_arg_list(args.allele_numbers)
53
+ genes = [g for base in args.bases for g in get_genes(base, args.allele_numbers)]
54
+ if len(genes) == 0:
55
+ raise Exception('couldn\'t find any genes for the specified --bases %s\n choices:\n %s' % (' '.join(args.bases), ' '.join(sorted(set([get_base(g) for g in glfo['seqs'][args.region]])))))
56
+ args.other_genes = utils.get_arg_list(args.other_genes)
57
+ if args.other_genes is not None:
58
+ genes += args.other_genes
59
+
60
+ seqstrs = ['' for _ in range(len(genes))]
61
+ snpstrs = ['' for _ in range(len(genes))]
62
+
63
+ gene_str_width = max([utils.len_excluding_colors(utils.color_gene(g)) for g in genes])
64
+ codon_positions = glfo[utils.conserved_codons[args.locus][args.region] + '-positions'] if args.region != 'd' else None
65
+ max_seq_len = max([len(glfo['seqs'][args.region][g]) for g in genes])
66
+
67
+ ref_gene = genes[0] if args.ref_allele is None else utils.rejoin_gene(args.locus, args.region, utils.primary_version(genes[0]), utils.sub_version(genes[0]), args.ref_allele)
68
+ if ref_gene != genes[0]:
69
+ genes.remove(ref_gene)
70
+ genes.insert(0, ref_gene)
71
+ ref_seq = glfo['seqs'][args.region][ref_gene]
72
+ ref_pos = codon_positions[ref_gene]
73
+
74
+ for igene in range(0, len(genes)):
75
+ gene = genes[igene]
76
+ seq = glfo['seqs'][args.region][gene]
77
+ pos = codon_positions[gene]
78
+ if pos < ref_pos: # align the codon position in the case that this seq is shorter up to the codon
79
+ seq = (ref_pos - pos) * '-' + seq
80
+ pos += (ref_pos - pos)
81
+
82
+ right_pad_str = '' # i think i don't need this any more since i have the align option in color_mutants
83
+ # if len(seq) < max_seq_len:
84
+ # right_pad_str = (max_seq_len - len(seq)) * ' '
85
+
86
+ emph_positions = None if args.region == 'd' else [pos + i for i in range(3)]
87
+ colored_seq, isnps = utils.color_mutants(ref_seq, seq, return_isnps=True, emphasis_positions=emph_positions, align=True)
88
+ seqstrs[igene] += '%s%s' % (colored_seq, right_pad_str)
89
+ if len(isnps) > 0:
90
+ snpstrs[igene] = '%2d (%s)' % (len(isnps), ' '.join([str(i) for i in isnps]))
91
+
92
+ # ----------------------------------------------------------------------------------------
93
+ def print_str(gene, seqstr, snpstr):
94
+ return '%s %s %s %s' % (utils.color_gene(gene, width=gene_str_width), seqstr, utils.color_gene(gene, width=gene_str_width), snpstr)
95
+
96
+ for igene in range(len(genes)):
97
+ print(print_str(genes[igene], seqstrs[igene], snpstrs[igene]))
bin/cf-germlines.py ADDED
@@ -0,0 +1,57 @@
1
+ #!/usr/bin/env python3
2
+ from __future__ import absolute_import, division, unicode_literals
3
+ from __future__ import print_function
4
+ import argparse
5
+ import sys
6
+ import os
7
+ import copy
8
+ import collections
9
+ import colored_traceback.always
10
+
11
+ from pathlib import Path
12
+ partis_dir = str(Path(__file__).parent.parent)
13
+ if not os.path.exists(partis_dir):
14
+ print('WARNING current script dir %s doesn\'t exist, so python path may not be correctly set' % partis_dir)
15
+ sys.path.insert(1, partis_dir) # + '/python')
16
+
17
+ import python.utils as utils
18
+ import python.glutils as glutils
19
+
20
+ parser = argparse.ArgumentParser()
21
+ parser.add_argument('gldir1')
22
+ parser.add_argument('gldir2')
23
+ parser.add_argument('--names', default='+gl-1:+gl-2', help='colon-separated list of length 2 with labels for gldir1 and gldir2, which will be appended to each gene name in the ascii output')
24
+ parser.add_argument('--locus', default='igh')
25
+ args = parser.parse_args()
26
+ args.names = utils.get_arg_list(args.names)
27
+
28
+ # ----------------------------------------------------------------------------------------
29
+ def clrname(name):
30
+ return utils.color('blue', name)
31
+
32
+ # ----------------------------------------------------------------------------------------
33
+ glfos = []
34
+ for name, gldir in zip(args.names, [args.gldir1, args.gldir2]):
35
+ print('%s:' % clrname(name))
36
+ glfos.append(glutils.read_glfo(gldir, args.locus, debug=True))
37
+
38
+ for region in [r for r in utils.regions if r in glfos[0]['seqs']]:
39
+ aseqs, bseqs = [{s : n for n, s in g['seqs'][region].items()} for g in glfos] # dict of names keyed by seqs
40
+ a_only_seqs, b_only_seqs = set(aseqs) - set(bseqs), set(bseqs) - set(aseqs)
41
+
42
+ print('%s' % utils.color('green', region))
43
+
44
+ common_seqs = set(aseqs) & set(bseqs)
45
+ common_name_seqs = [aseqs[s] for s in common_seqs if aseqs[s]==bseqs[s]]
46
+ print(' %3d seqs in common with same name: %s' % (len(common_name_seqs), utils.color_genes(sorted(common_name_seqs))))
47
+ dnamed_seqs = [(aseqs[s], bseqs[s]) for s in common_seqs if aseqs[s] != bseqs[s]]
48
+ if len(dnamed_seqs) > 0:
49
+ print(' %s %d common seq%s with different names: %s' % (utils.wrnstr(), len(dnamed_seqs), utils.plural(len(dnamed_seqs)), ', '.join(utils.color_genes([an,bn]) for an, bn in dnamed_seqs)))
50
+ print(' only in:\n %12s: %3d %s\n %12s: %3d %s' % (clrname(args.names[0]), len(a_only_seqs), utils.color_genes(sorted(aseqs[s] for s in a_only_seqs)),
51
+ clrname(args.names[1]), len(b_only_seqs), utils.color_genes(sorted(bseqs[s] for s in b_only_seqs))))
52
+
53
+ tmpfo = glutils.get_empty_glfo(args.locus) # make a new glfo that will only have non-shared genes
54
+ for gname, oname, only_seqs, allseqs, ogfo in zip(args.names, reversed(args.names), [a_only_seqs, b_only_seqs], [aseqs, bseqs], reversed(glfos)): # <gset> is the genes that're only in <gname>
55
+ print(' finding nearest seq in %s for %d seqs only in %s' % (clrname(oname), len(only_seqs), clrname(gname)))
56
+ for oseq in only_seqs:
57
+ glutils.find_nearest_gene_in_glfo(ogfo, oseq, new_name=allseqs[oseq], region=region, debug=True)
bin/cf-linearham.py ADDED
@@ -0,0 +1,199 @@
1
+ #!/usr/bin/env python3
2
+ from __future__ import absolute_import, division, unicode_literals
3
+ from __future__ import print_function
4
+ import csv
5
+ import os
6
+ import sys
7
+ import argparse
8
+ import colored_traceback.always
9
+ import glob
10
+ import subprocess
11
+ import operator
12
+ from io import open
13
+
14
+ # if you move this script, you'll need to change this method of getting the imports
15
+ from pathlib import Path
16
+ partis_dir = str(Path(__file__).parent.parent)
17
+ # partis_dir = os.getcwd()
18
+ sys.path.insert(1, partis_dir) # + '/python')
19
+
20
+ import python.utils as utils
21
+ import python.glutils as glutils
22
+ from python.clusterpath import ClusterPath
23
+
24
+ parser = argparse.ArgumentParser()
25
+ # NOTE to compare multiple output runs see datascripts/meta/qa013-synth/read-lh-cf.py
26
+ parser.add_argument('--partis-file', required=True, help='partis yaml partition output file that includes alternative annotation information (i.e. --calculate-alternative-annotations was set while partitioning)')
27
+ parser.add_argument('--linearham-dir', required=True, help='linearham output dir (the main/parent dir))')
28
+ parser.add_argument('--prob-to-ignore', default=0.15, help='don\'t print sequences with probabilities smaller than this')
29
+ parser.add_argument('--outdir', help='if set, write csv info for the printed naive sequences to here')
30
+ args = parser.parse_args()
31
+
32
+ # ----------------------------------------------------------------------------------------
33
+ def read_linearham_output():
34
+ lh_info = {}
35
+ glbfns = [f for gstr in ['cluster', 'iclust-'] for f in glob.glob('%s/%s*' % (args.linearham_dir, gstr))]
36
+ clusterdirs = [d for d in (glbfns) if os.path.isdir(d)] # works for old style 'clusterN' and new-style 'cluster-N'
37
+ if len(clusterdirs) == 0:
38
+ raise Exception('no linearham cluster subdirs (of form clusterN/ or cluster-N/ found in %s' % args.linearham_dir)
39
+ print(' reading linearham info for %d cluster%s: %s' % (len(clusterdirs), utils.plural(len(clusterdirs)),' '.join(os.path.basename(cd) for cd in clusterdirs)))
40
+ for cdir in clusterdirs:
41
+ sfnames = [f for gstr in ['', 'lineage*/'] for f in glob.glob('%s/%scluster_seqs.fasta' % (cdir, gstr))]
42
+ if len(sfnames) == 0:
43
+ raise Exception('no sequence files \'*_seqs.fasta\' found in %s' % cdir)
44
+ sfn = utils.get_single_entry(sfnames)
45
+ input_seqfos = utils.read_fastx(sfn)
46
+ input_uids = [sfo['name'] for sfo in input_seqfos if sfo['name'] != 'naive']
47
+ # aa_naive_seqs.fasta: prob of each aa naive seq
48
+ # aa_naive_seqs.dnamap: prob of each nuc naive seq contributing to each of those aa naive seqs
49
+ aasfn = '%s/aa_naive_seqs.fasta'%os.path.dirname(sfn)
50
+ aa_seq_infos = utils.read_fastx(aasfn)
51
+ for iseq, sfo in enumerate(aa_seq_infos):
52
+ tlist = sfo['name'].split('_')
53
+ assert len(tlist) == 3
54
+ assert int(tlist[1]) == iseq
55
+ sfo['prob'] = float(tlist[2])
56
+ with open(aasfn.replace('.fasta', '.dnamap')) as outfile: # this is some weird bastardization of a fasta file
57
+ iseq = -1
58
+ for line in outfile:
59
+ if line[0] == '>':
60
+ iseq += 1
61
+ assert line.strip().lstrip('>') == aa_seq_infos[iseq]['name']
62
+ aa_seq_infos[iseq]['nuc_seqs_probs'] = []
63
+ continue
64
+ prob, naive_nuc_seq = line.strip().split(',')
65
+ aa_seq_infos[iseq]['nuc_seqs_probs'].append((naive_nuc_seq, float(prob)))
66
+ lh_info[':'.join(input_uids)] = aa_seq_infos
67
+ return lh_info
68
+
69
+ # ----------------------------------------------------------------------------------------
70
+ def print_naive_seq_lines(nseq_info, namestr, namecolor, ref_seq=None, amino_acid=False, writefo=None):
71
+ def i_aa_color(i_aa):
72
+ tmpcolors = ['purple', 'yellow', 'red', 'blue', 'green']
73
+ return tmpcolors[i_aa % len(tmpcolors)]
74
+ total_prob = 0.
75
+ breaking = False
76
+ for naive_seq, prob, i_aa_seq in sorted(nseq_info, key=operator.itemgetter(1), reverse=True):
77
+ if ref_seq is None:
78
+ ref_seq = naive_seq
79
+ breakstr = ''
80
+ if 1. - total_prob < args.prob_to_ignore:
81
+ breaking = True
82
+ breakstr = 'total: %5.2f (breaking after %.2f)' % (prob+total_prob, 1. - args.prob_to_ignore)
83
+ print(' %s %s %5.2f %s %s' % (utils.color_mutants(ref_seq, naive_seq, amino_acid=amino_acid, align_if_necessary=True),
84
+ utils.color(i_aa_color(i_aa_seq), str(i_aa_seq), width=2), prob, utils.color(namecolor, namestr, width=9, padside='right'), breakstr))
85
+ if writefo is not None:
86
+ writefo.append({'method' : namestr, 'prob' : prob, 'seq' : naive_seq})
87
+ if breaking:
88
+ break
89
+ total_prob += prob
90
+ print('')
91
+ return ref_seq
92
+
93
+ # ----------------------------------------------------------------------------------------
94
+ def print_all_lines(lh_aa_seq_infos, pline, amino_acid=False):
95
+ seq_len = len(lh_aa_seq_infos[0]['seq'] if amino_acid else pline['naive_seq'])
96
+ anstr = '%s %s naive seqs' % (headstr('1.' if amino_acid else '3.'), 'amino acid' if amino_acid else 'nucleotide')
97
+ print(' %s:%s aa seq' % (anstr, (seq_len - utils.len_excluding_colors(anstr)) * ' '))
98
+ if amino_acid:
99
+ codon_str = utils.color('reverse_video', 'X')
100
+ vpos, jpos = [pline['codon_positions'][r] // 3 for r in ['v', 'j']]
101
+ cdstr = '%s%s%s%s%s' % (' '*vpos, codon_str, '-'*(jpos - vpos - 1), codon_str, ' '*(seq_len - jpos - 1))
102
+ else:
103
+ cdstr = seq_len*' '
104
+ print(' %s index prob' % cdstr)
105
+ writefo=[]
106
+ ref_seq = print_naive_seq_lines(get_lh_nsinfo(lh_aa_seq_infos, amino_acid=amino_acid), 'linearham', 'green', amino_acid=amino_acid, writefo=writefo)
107
+ _ = print_naive_seq_lines(get_partis_nsinfo(pline, amino_acid=amino_acid), 'partis', 'blue', ref_seq=ref_seq, amino_acid=amino_acid, writefo=writefo) # use the linearham naive seq as ref_seq also for partis
108
+ if not os.path.exists(args.outdir):
109
+ os.makedirs(args.outdir)
110
+ if args.outdir is not None:
111
+ print(' writing %s seqs to %s' % ('aa' if amino_acid else 'nuc', args.outdir))
112
+ with open('%s/%s-seqs.csv'%(args.outdir, 'aa' if amino_acid else 'nuc'), 'w') as jfile:
113
+ writer = csv.DictWriter(jfile, writefo[0].keys())
114
+ writer.writeheader()
115
+ for wfo in writefo:
116
+ writer.writerow(wfo)
117
+
118
+ # ----------------------------------------------------------------------------------------
119
+ def print_gene_calls(pline):
120
+ # TODO would be nice to read the linearham annotation so that we can highlight the v/d/j genes that linearham assumed were correct
121
+ print(' %s partis gene calls (linearham only considers one gene combo):' % headstr('4.'))
122
+ print(' prob gene')
123
+ for region in utils.regions:
124
+ print(' %s' % utils.color('blue', region))
125
+ for gene, prob in pline['alternative-annotations']['gene-calls'][region]:
126
+ print(' %4.2f %s' % (prob, utils.color_gene(gene, width=15)))
127
+
128
+ # ----------------------------------------------------------------------------------------
129
+ def get_lh_nsinfo(lh_aa_seq_infos, amino_acid=False):
130
+ if amino_acid:
131
+ return [(s['seq'], s['prob'], i) for i, s in enumerate(lh_aa_seq_infos)]
132
+ else:
133
+ return [(ns, np, i) for i, s in enumerate(lh_aa_seq_infos) for ns, np in s['nuc_seqs_probs']]
134
+
135
+ # ----------------------------------------------------------------------------------------
136
+ def get_partis_nsinfo(pline, amino_acid=False):
137
+ nuc_naive_seqs = pline['alternative-annotations']['naive-seqs'] if 'alternative-annotations' in pline else [(pline['naive_seq'], 1.), ]
138
+ pdict, sdict = {}, {}
139
+ for aseq, nseq, prob in [(utils.ltranslate(nseq, trim=True), nseq, prob) for nseq, prob in nuc_naive_seqs]: # add up the probs for any nuc seqs that code for the same aa seq
140
+ if aseq not in pdict:
141
+ pdict[aseq] = 0.
142
+ sdict[aseq] = []
143
+ pdict[aseq] += prob
144
+ sdict[aseq].append(nseq)
145
+ aa_naive_seqs = sorted(list(pdict.items()), key=operator.itemgetter(1), reverse=True)
146
+ if amino_acid:
147
+ return [(s, p, i) for i, (s, p) in enumerate(aa_naive_seqs)]
148
+ else:
149
+ nuc_seq_aa_indices = {}
150
+ for iseq, (aseq, _) in enumerate(aa_naive_seqs):
151
+ for nseq in sdict[aseq]:
152
+ nuc_seq_aa_indices[nseq] = iseq
153
+ return [(s, p, nuc_seq_aa_indices[s]) for s, p in nuc_naive_seqs]
154
+
155
+ # ----------------------------------------------------------------------------------------
156
+ def headstr(hstr):
157
+ return utils.color('green', hstr)
158
+
159
+ # ----------------------------------------------------------------------------------------
160
+ glfo, annotation_list, cpath = utils.read_output(args.partis_file)
161
+ lh_info = read_linearham_output()
162
+
163
+ annotations = {':'.join(adict['unique_ids']) : adict for adict in annotation_list} # collect the annotations in a dictionary so they're easier to access
164
+ most_likely_partition = cpath.partitions[cpath.i_best] # a partition is represented as a list of lists of strings, with each string a sequence id
165
+ print(' %d (of %d) clusters from partis file share uids with %d linearham cluster%s' % (len([c for c in most_likely_partition if any(len(set(c) & set(lc.split(':'))) > 0 for lc in lh_info)]), len(most_likely_partition), len(lh_info), utils.plural(len(lh_info))))
166
+ sorted_clusters = sorted(most_likely_partition, key=len, reverse=True)
167
+ for cluster in sorted_clusters:
168
+ pline = annotations[':'.join(cluster)]
169
+ utils.trim_fwk_insertions(glfo, pline, modify_alternative_annotations=True) # linearham probably won't have them, so we need to remove them so things line up
170
+ if 'alternative-annotations' not in pline:
171
+ print(' note: no alternative annotations in %s, so can\'t print partis alternative naive sequences' % args.partis_file)
172
+
173
+ p_uids = set(pline['unique_ids'])
174
+ lh_clusters = [(uidstr, cfo) for uidstr, cfo in lh_info.items() if set(uidstr.split(':')) & p_uids] # lh clusters with any uids in common iwth this partis <cluster> (there should only be 1)
175
+ lh_aa_seq_infos = []
176
+ if len(lh_clusters) == 0:
177
+ # print ' no linearham clusters with any of these uids' % utils.color('red', 'error')
178
+ continue
179
+ elif len(lh_clusters) != 1:
180
+ raise Exception('should have only one linearham cluster with uids in common with this cluster, but found %d' % len(lh_clusters))
181
+ else:
182
+ lh_uidstr, lh_aa_seq_infos = lh_clusters[0]
183
+ lh_uids = set(lh_uidstr.split(':'))
184
+ print('%s with sizes: partis %d linearham %d (%d in common)' % (utils.color('blue', 'starting clusters'), len(pline['unique_ids']), len(lh_uids), len(lh_uids & p_uids)))
185
+ if len(lh_uids - p_uids) > 0:
186
+ print(' %s %d extra uids in linearham cluster' % (utils.color('yellow', 'warning'), len(lh_uids - p_uids)))
187
+ if len(p_uids - lh_uids) > 0:
188
+ print(' %s %d extra uids in partis cluster' % (utils.color('yellow', 'warning'), len(p_uids - lh_uids)))
189
+ if len(lh_aa_seq_infos) == 0:
190
+ print(' %s no prob/naive_seq pairs in linearham for this cluster' % utils.color('red', 'error'))
191
+
192
+ print_all_lines(lh_aa_seq_infos, pline, amino_acid=True)
193
+ utils.print_reco_event(utils.synthesize_single_seq_line(pline, iseq=0), extra_str=' ', label='%s annotation for a single (arbitrary) sequence from the cluster:'%headstr('2.'))
194
+ print_all_lines(lh_aa_seq_infos, pline, amino_acid=False)
195
+
196
+ if 'alternative-annotations' in pline:
197
+ print_gene_calls(pline)
198
+
199
+ print('')
bin/chimera-plot.py ADDED
@@ -0,0 +1,76 @@
1
+ #!/usr/bin/env python3
2
+ from __future__ import absolute_import, division, unicode_literals
3
+ from __future__ import print_function
4
+ import collections
5
+ import argparse
6
+ import sys
7
+ import os
8
+ import csv
9
+
10
+ from pathlib import Path
11
+ partis_dir = str(Path(__file__).parent.parent)
12
+ if not os.path.exists(partis_dir):
13
+ print('WARNING current script dir %s doesn\'t exist, so python path may not be correctly set' % partis_dir)
14
+ sys.path.insert(1, partis_dir) # + '/python')
15
+
16
+ import python.utils as utils
17
+ from python.hist import Hist
18
+ import python.plotting as plotting
19
+ import python.glutils as glutils
20
+
21
+ parser = argparse.ArgumentParser()
22
+ parser.add_argument('infile')
23
+ parser.add_argument('plotdir')
24
+ parser.add_argument('--glfo-dir', default='data/germlines/human', help='I\'m hacking this in afterwards because this was written before switching to yaml output files, so I think it was using this default germline dir anyway (except it used old glfo with different genes, so you probably actually have to pass in the real corresponding glfo anyway)')
25
+ parser.add_argument('--chunk-len', default=75, type=int)
26
+ parser.add_argument('--cutoff', default=0.3, help='point in max-abs-diff above which we assume most sequences are chimeric')
27
+ parser.add_argument('--title')
28
+ parser.add_argument('--locus', default='igh')
29
+ args = parser.parse_args()
30
+ if args.title == 'good':
31
+ args.title = 'none'
32
+ elif args.title == 'chimeras':
33
+ args.title = 'all chimeras'
34
+
35
+ def gk(uids):
36
+ return ':'.join(uids)
37
+
38
+ glfo = None
39
+ if utils.getsuffix(args.infile) == '.csv':
40
+ glfo = glutils.read_glfo(args.glfo_dir, args.locus)
41
+ glfo, annotation_list, _ = utils.read_output(args.infile, glfo=glfo)
42
+ annotations = collections.OrderedDict((line['unique_ids'][0], line) for line in annotation_list)
43
+
44
+ chfo = {uid : {k : v for k, v in zip(('imax', 'max_abs_diff'), utils.get_chimera_max_abs_diff(annotations[uid], iseq=0, chunk_len=args.chunk_len))} for uid in annotations}
45
+ biggest_adiffs = sorted(chfo, key=lambda q: chfo[q]['max_abs_diff'], reverse=True)
46
+ for uid in biggest_adiffs[:5]:
47
+ print('%-3d %6.3f' % (chfo[uid]['imax'], chfo[uid]['max_abs_diff']))
48
+ utils.print_reco_event(annotations[uid])
49
+
50
+ n_above_cutoff = len([_ for cfo in chfo.values() if cfo['max_abs_diff'] > args.cutoff])
51
+ chimeric_fraction = n_above_cutoff / float(len(chfo))
52
+ print(' %d / %d = %.3f above chimeric cutoff' % (n_above_cutoff, len(chfo), chimeric_fraction))
53
+
54
+ hmaxval = Hist(45, 0., 0.65)
55
+ for uid in annotations:
56
+ hmaxval.fill(chfo[uid]['max_abs_diff'])
57
+ himax = Hist(75, 0., 400)
58
+ for uid in annotations:
59
+ himax.fill(chfo[uid]['imax'])
60
+
61
+ utils.prep_dir(args.plotdir, wildlings=['*.svg', '*.csv'])
62
+
63
+ import matplotlib
64
+ from matplotlib import pyplot as plt
65
+ fig, ax = plotting.mpl_init()
66
+ xvals, yvals = list(zip(*[(v['imax'], v['max_abs_diff']) for v in chfo.values()]))
67
+ plt.scatter(xvals, yvals, alpha=0.4)
68
+
69
+ print('writing to %s' % args.plotdir)
70
+ plotting.mpl_finish(ax, args.plotdir, 'hexbin', title=args.title, xlabel='break point', ylabel='abs mfreq diff')
71
+
72
+ plotting.draw_no_root(hmaxval, plotdir=args.plotdir, plotname='mfreq-diff', shift_overflows=True, xtitle='abs mfreq diff', ytitle='seqs')
73
+ hmaxval.write('%s/%s.csv' % (args.plotdir, 'mfreq-diff'))
74
+
75
+ plotting.draw_no_root(himax, plotdir=args.plotdir, plotname='imax', shift_overflows=True, xtitle='break point', ytitle='seqs')
76
+ himax.write('%s/%s.csv' % (args.plotdir, 'imax'))
@@ -0,0 +1,143 @@
1
+ #!/usr/bin/env python3
2
+ from __future__ import absolute_import, division, unicode_literals
3
+ from __future__ import print_function
4
+ import sys
5
+ import csv
6
+ from io import open
7
+ csv.field_size_limit(sys.maxsize) # make sure we can write very large csv fields
8
+ import os
9
+ import argparse
10
+ import colored_traceback.always
11
+ import operator
12
+
13
+ # if you move this script, you'll need to change this method of getting the imports
14
+ from pathlib import Path
15
+ partis_dir = str(Path(__file__).parent.parent)
16
+ sys.path.insert(1, partis_dir) # + '/python')
17
+
18
+ import python.utils as utils
19
+ import python.glutils as glutils
20
+ from python.clusterpath import ClusterPath
21
+ import python.seqfileopener as seqfileopener
22
+ import python.indelutils as indelutils
23
+ import python.treeutils as treeutils
24
+
25
+ # ----------------------------------------------------------------------------------------
26
def addseq(ltmp, tline, uid, iclust):
    """Record seq <uid> from annotation <tline> (locus <ltmp>, heavy-cluster index <iclust>) in the global chosen_seqs, skipping duplicates and warning if it has shm indels."""
    already_chosen = any(uid == sfo['name'] for sfo in chosen_seqs[ltmp])
    if already_chosen:  # don't add it twice
        return
    iseq = tline['unique_ids'].index(uid)
    if indelutils.has_indels_line(tline, iseq):
        indel_warning_strs.append('  %s shm indels in chosen seq %s, which means you need to decide by hand whether you want to choose the input or indel-reversed seq (indel-reversed is written to output file' % (utils.color('yellow', 'warning'), uid))
    new_sfo = {'name' : uid,
               'seq' : utils.per_seq_val(tline, 'seqs', uid),
               'locus' : ltmp,
               'igh_iclust' : iclust,
               'aa-cdist' : treeutils.smvals(tline, 'cons-dist-aa', uid=uid)}
    chosen_seqs[ltmp].append(new_sfo)
39
+ # ----------------------------------------------------------------------------------------
40
def translate_paired_ids(ltmp, pids):
    """Return <pids> with the sample prefix and locus <ltmp> prepended to each uid (matches the uid munging done when merging samples)."""
    return ['-'.join([args.sample_prefix, ltmp, u]) for u in pids]
42
+
43
+ # ----------------------------------------------------------------------------------------
44
helpstr = """
I think this was an old script to kind of do a hackey version of approximate bulk pairing (but not really sure, would need to read through it more carefully, atm i'm adding this late and i forget).
see usage: datascripts/meta/qa013-synth/run.sh
"""
class MultiplyInheritedFormatter(argparse.RawTextHelpFormatter, argparse.ArgumentDefaultsHelpFormatter):
    # raw-text help (preserves helpstr's line breaks) plus automatic display of argument defaults
    pass
parser = argparse.ArgumentParser(formatter_class=MultiplyInheritedFormatter, description=helpstr)
parser.add_argument('igh_fname')
parser.add_argument('igk_fname')
parser.add_argument('igl_fname')
parser.add_argument('--input-metafnames')
parser.add_argument('--sample-prefix', default='QA013-10x-pre', help='str that needs to be prepended to paired uid to match the uid in \'paired-uids\' (necessary because when we merge samples in datascripts/preprocess.py we don\'t know which sample each paired id is from)')
parser.add_argument('--outfname')
parser.add_argument('--n-largest-clusters', type=int, default=3)
parser.add_argument('--n-to-choose', type=int, default=2)
parser.add_argument('--choose-paired', action='store_true')
# NOTE can't add an --n-max-queries arg for testing, since we have to read all the other chains to find the right cluster
args = parser.parse_args()
args.input_metafnames = utils.get_arg_list(args.input_metafnames)  # comma-separated str --> list
65
+
66
# read per-locus partitions and annotations from the three input files
cpaths, antn_lists = {}, {}
for tloc, tfname in zip(['igh', 'igk', 'igl'], [args.igh_fname, args.igk_fname, args.igl_fname]):
    _, antn_lists[tloc], cpaths[tloc] = utils.read_output(tfname)
    # start with empty pairing info for every seq
    for tline in antn_lists[tloc]:
        tline['paired-uids'] = [[] for _ in tline['unique_ids']]
    # then fill it in from the input meta files, if we have any
    if args.input_metafnames is not None:
        seqfileopener.read_input_metafo(args.input_metafnames, antn_lists[tloc])
73
+
74
chosen_seqs = {l : [] for l in utils.sub_loci('ig')}  # filled by addseq()
indel_warning_strs = []
lp_antn_pairs = []  # heavy/light annotation pairs; somewhat similar to paircluster.find_cluster_pairs()
antn_dicts = {l : utils.get_annotation_dict(alist) for l, alist in antn_lists.items()}
sorted_hclusters = sorted(cpaths['igh'].best(), key=len, reverse=True)[:args.n_largest_clusters]
csize_strs = [str(len(tc)) for tc in sorted_hclusters]
print(' choosing seqs from %d largest igh clusters with sizes %s' % (args.n_largest_clusters, ' '.join(csize_strs)))
print(' igh N igh light l clust N chosen')
print(' iclust size paired locus size aa-cdist paired')
82
# for each of the largest heavy clusters: find its (hopefully unique) partner light
# cluster via pairing info, then choose the best seqs from both by aa consensus distance
for iclust, hclust in enumerate(sorted_hclusters):
    print(' %3d %5d' % (iclust, len(hclust)), end=' ')
    hline = antn_dicts['igh'][':'.join(hclust)]
    # (heavy uid, light uid) pairs for every heavy seq with exactly one paired uid
    tid_lists = [(u, pids[0]) for u, pids in zip(hline['unique_ids'], hline['paired-uids']) if len(pids)==1]  # NOTE this doesn't check that the pairing info is reciprocal (i.e. that the paired-uids in the light chain correpsond to the h seqs)
    if len(tid_lists) == 0:
        non_zero_pids = [pids for pids in hline['paired-uids'] if len(pids) > 0]
        print(' no uniquely-paired seqs (paired-uids lengths: %s, +%d unpaired)' % (' '.join(str(n) for n in sorted((len(pids) for pids in non_zero_pids), reverse=True)), len(hline['unique_ids']) - len(non_zero_pids)))
        continue
    h_paired_ids, l_paired_ids = list(zip(*tid_lists))
    print(' %3d' % len(h_paired_ids), end=' ')

    # look for light clusters (either locus) that overlap the translated paired light ids
    lclusts = []
    for ltmp in ['igk', 'igl']:
        l_tmp_ids = translate_paired_ids(ltmp, l_paired_ids)
        for lc in cpaths[ltmp].best():
            if len(set(lc) & set(l_tmp_ids)) > 0:
                lclusts.append((ltmp, lc))
    if len(lclusts) != 1:
        print(' couldn\'t find unique light cluster (found %d) for %d paired ids (from %d heavy ids)' % (len(lclusts), len(l_paired_ids), len(h_paired_ids)))
        continue
    l_locus, lclust = lclusts[0]
    lline = antn_dicts[l_locus][':'.join(lclust)]
    lp_antn_pairs.append((hline, lline))
    print(' %3s %4d' % (l_locus, len(lclust)), end=' ')

    # add aa-cdist (it's probably usually already there, but it's easy to add, and should always end up the same)
    # (NOTE removed a redundant 'import python.treeutils as treeutils' that used to re-run here on every iteration -- it's already imported at the top of the file)
    tmpids = {}
    for ltmp, tline in zip(('igh', l_locus), (hline, lline)):
        tline['tree-info'] = {'lb' : {}}
        treeutils.add_cdists_to_lbfo(tline, tline['tree-info']['lb'], 'cons-dist-aa')
        tmpids[ltmp], _ = list(zip(*sorted(list(tline['tree-info']['lb']['cons-dist-aa'].items()), key=operator.itemgetter(1), reverse=True)))
        tmpids[ltmp] = tmpids[ltmp][:args.n_to_choose]  # keep the <n_to_choose> uids with largest aa-cdist
        for uid in tmpids[ltmp]:
            addseq(ltmp, tline, uid, iclust)
    print(' %2d %2d' % (len(tmpids['igh']), len(tmpids[l_locus])), end=' ')

    if args.choose_paired:  # also choose every uniquely-paired seq from both chains
        for ltmp, tline, cids in zip(('igh', l_locus), (hline, lline), (h_paired_ids, translate_paired_ids(l_locus, l_paired_ids))):
            for uid in cids:
                addseq(ltmp, tline, uid, iclust)
        print(' %2d %2d' % (len(h_paired_ids), len(l_paired_ids)), end=' ')
    print('')
125
+
126
if len(indel_warning_strs) > 0:
    print('\n'.join(indel_warning_strs))

all_chosen_ids = [s['name'] for sfos in chosen_seqs.values() for s in sfos]
for hline, lline in lp_antn_pairs:
    # BUG FIX: the last count used to be computed against hline['unique_ids'] as well (copy/paste
    # error), so the light-chain chosen count always just repeated the heavy-chain one
    print('%s sizes %d %d chose %d %d' % (utils.color('green', '-->'), len(hline['unique_ids']), len(lline['unique_ids']), len([u for u in all_chosen_ids if u in hline['unique_ids']]), len([u for u in all_chosen_ids if u in lline['unique_ids']])))
    for tline in [hline, lline]:
        utils.print_reco_event(tline, extra_print_keys=['cons-dist-aa', 'paired-uids'], queries_to_emphasize=all_chosen_ids, extra_str=' ')
135
if args.outfname is not None:
    # BUG FIX: used to print len(chosen_seqs), which is the number of loci (always 3), not the number of chosen seqs
    n_chosen = sum(len(sfos) for sfos in chosen_seqs.values())
    print(' writing %d chosen seqs to %s' % (n_chosen, args.outfname))
    utils.mkdir(args.outfname, isfile=True)
    with open(args.outfname, utils.csv_wmode()) as ofile:
        # take headers from any non-empty locus (used to index chosen_seqs['igh'][0], which crashes with IndexError if no igh seqs were chosen)
        non_empty = [sfos for sfos in chosen_seqs.values() if len(sfos) > 0]
        headers = sorted(non_empty[0][0].keys()) if len(non_empty) > 0 else []
        writer = csv.DictWriter(ofile, headers)  # NOTE dammit this is way too similar to treeutils.combine_selection_metrics(), i need to maybe split the csv writing code out of there?
        writer.writeheader()
        for ltmp, seqfos in chosen_seqs.items():
            for sfo in seqfos:
                writer.writerow(sfo)
bin/circle-plots.py ADDED
@@ -0,0 +1,30 @@
1
+ #!/usr/bin/env python3
2
+ from __future__ import absolute_import, division, unicode_literals
3
+ import sys
4
+ import colored_traceback.always
5
+ import os
6
+ import circlify
7
+ import json
8
+ import argparse
9
+ import csv
10
+ from io import open
11
+
12
parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter)
parser.add_argument('infname')
parser.add_argument('outfname')
args = parser.parse_args()

# read id + radius for each circle from the input csv
radii = []
with open(args.infname) as ifile:
    reader = csv.DictReader(ifile)
    for line in reader:
        radii.append({'id' : line['id'], 'radius' : float(line['radius'])})

circlefos = circlify.circlify(radii, datum_field='radius', id_field='id')  # NOTE this doesn't return them in the same order
# BUG FIX: in python 3 text mode the csv module requires newline='' so it controls line
# endings itself (otherwise you get spurious blank rows on windows); python 2 wants binary mode
wmodes = {'mode' : 'wb'} if sys.version_info.major < 3 else {'mode' : 'w', 'newline' : ''}
with open(args.outfname, **wmodes) as ofile:
    def gfn(k, c):  # x/y/r are real attributes on the circle object, while 'id' is stashed in its .ex dict
        return getattr(c, k) if hasattr(c, k) else getattr(c, 'ex')[k]
    headers = ('id', 'x', 'y', 'r')
    writer = csv.DictWriter(ofile, headers)
    writer.writeheader()
    for cfo in circlefos:
        writer.writerow({k : gfn(k, cfo) for k in headers})