partis-bcr 1.0.0__py3-none-any.whl → 1.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (103) hide show
  1. bin/FastTree +0 -0
  2. bin/add-chimeras.py +59 -0
  3. bin/add-seqs-to-outputs.py +81 -0
  4. bin/bcr-phylo-run.py +799 -0
  5. bin/build.sh +24 -0
  6. bin/cf-alleles.py +97 -0
  7. bin/cf-germlines.py +57 -0
  8. bin/cf-linearham.py +199 -0
  9. bin/chimera-plot.py +76 -0
  10. bin/choose-partially-paired.py +143 -0
  11. bin/circle-plots.py +30 -0
  12. bin/compare-plotdirs.py +298 -0
  13. bin/diff-parameters.py +133 -0
  14. bin/docker-hub-push.sh +6 -0
  15. bin/extract-pairing-info.py +55 -0
  16. bin/gcdyn-simu-run.py +223 -0
  17. bin/gctree-run.py +244 -0
  18. bin/get-naive-probabilities.py +126 -0
  19. bin/iqtree-1.6.12 +0 -0
  20. bin/lonr.r +1020 -0
  21. bin/makeHtml +52 -0
  22. bin/mds-run.py +46 -0
  23. bin/parse-output.py +277 -0
  24. bin/partis +1869 -0
  25. bin/partis-pip +116 -0
  26. bin/partis.py +1869 -0
  27. bin/plot-gl-set-trees.py +519 -0
  28. bin/plot-hmms.py +151 -0
  29. bin/plot-lb-tree.py +427 -0
  30. bin/raxml-ng +0 -0
  31. bin/read-bcr-phylo-trees.py +38 -0
  32. bin/read-gctree-output.py +166 -0
  33. bin/run-chimeras.sh +64 -0
  34. bin/run-dtr-scan.sh +25 -0
  35. bin/run-paired-loci.sh +100 -0
  36. bin/run-tree-metrics.sh +88 -0
  37. bin/smetric-run.py +62 -0
  38. bin/split-loci.py +317 -0
  39. bin/swarm-2.1.13-linux-x86_64 +0 -0
  40. bin/test-germline-inference.py +425 -0
  41. bin/tree-perf-run.py +194 -0
  42. bin/vsearch-2.4.3-linux-x86_64 +0 -0
  43. bin/vsearch-2.4.3-macos-x86_64 +0 -0
  44. bin/xvfb-run +194 -0
  45. partis_bcr-1.0.1.data/scripts/cf-alleles.py +97 -0
  46. partis_bcr-1.0.1.data/scripts/cf-germlines.py +57 -0
  47. partis_bcr-1.0.1.data/scripts/extract-pairing-info.py +55 -0
  48. partis_bcr-1.0.1.data/scripts/gctree-run.py +244 -0
  49. partis_bcr-1.0.1.data/scripts/parse-output.py +277 -0
  50. partis_bcr-1.0.1.data/scripts/split-loci.py +317 -0
  51. partis_bcr-1.0.1.data/scripts/test.py +1005 -0
  52. {partis_bcr-1.0.0.dist-info → partis_bcr-1.0.1.dist-info}/METADATA +1 -1
  53. {partis_bcr-1.0.0.dist-info → partis_bcr-1.0.1.dist-info}/RECORD +101 -50
  54. partis_bcr-1.0.1.dist-info/top_level.txt +1 -0
  55. {partis → python}/glutils.py +1 -1
  56. python/main.py +30 -0
  57. {partis → python}/plotting.py +10 -1
  58. {partis → python}/treeutils.py +18 -16
  59. {partis → python}/utils.py +14 -7
  60. partis/main.py +0 -59
  61. partis_bcr-1.0.0.dist-info/top_level.txt +0 -1
  62. {partis_bcr-1.0.0.dist-info → partis_bcr-1.0.1.dist-info}/WHEEL +0 -0
  63. {partis_bcr-1.0.0.dist-info → partis_bcr-1.0.1.dist-info}/entry_points.txt +0 -0
  64. {partis_bcr-1.0.0.dist-info → partis_bcr-1.0.1.dist-info}/licenses/COPYING +0 -0
  65. {partis → python}/__init__.py +0 -0
  66. {partis → python}/alleleclusterer.py +0 -0
  67. {partis → python}/allelefinder.py +0 -0
  68. {partis → python}/alleleremover.py +0 -0
  69. {partis → python}/annotationclustering.py +0 -0
  70. {partis → python}/baseutils.py +0 -0
  71. {partis → python}/cache/__init__.py +0 -0
  72. {partis → python}/cache/cached_uncertainties.py +0 -0
  73. {partis → python}/clusterpath.py +0 -0
  74. {partis → python}/coar.py +0 -0
  75. {partis → python}/corrcounter.py +0 -0
  76. {partis → python}/datautils.py +0 -0
  77. {partis → python}/event.py +0 -0
  78. {partis → python}/fraction_uncertainty.py +0 -0
  79. {partis → python}/gex.py +0 -0
  80. {partis → python}/glomerator.py +0 -0
  81. {partis → python}/hist.py +0 -0
  82. {partis → python}/hmmwriter.py +0 -0
  83. {partis → python}/hutils.py +0 -0
  84. {partis → python}/indelutils.py +0 -0
  85. {partis → python}/lbplotting.py +0 -0
  86. {partis → python}/mds.py +0 -0
  87. {partis → python}/mutefreqer.py +0 -0
  88. {partis → python}/paircluster.py +0 -0
  89. {partis → python}/parametercounter.py +0 -0
  90. {partis → python}/paramutils.py +0 -0
  91. {partis → python}/partitiondriver.py +0 -0
  92. {partis → python}/partitionplotter.py +0 -0
  93. {partis → python}/performanceplotter.py +0 -0
  94. {partis → python}/plotconfig.py +0 -0
  95. {partis → python}/processargs.py +0 -0
  96. {partis → python}/prutils.py +0 -0
  97. {partis → python}/recombinator.py +0 -0
  98. {partis → python}/scanplot.py +0 -0
  99. {partis → python}/seqfileopener.py +0 -0
  100. {partis → python}/treegenerator.py +0 -0
  101. {partis → python}/viterbicluster.py +0 -0
  102. {partis → python}/vrc01.py +0 -0
  103. {partis → python}/waterer.py +0 -0
bin/xvfb-run ADDED
@@ -0,0 +1,194 @@
1
+ #!/bin/sh
2
+ # ----------------------------------------------------------------------------------------
3
+ # NOTE copied from ubuntu system location in order to remove the '2>&1' at the bottom, see https://bugs.launchpad.net/ubuntu/+source/xorg-server/+bug/1059947
4
+ # ----------------------------------------------------------------------------------------
5
+
6
+ # This script starts an instance of Xvfb, the "fake" X server, runs a command
7
+ # with that server available, and kills the X server when done. The return
8
+ # value of the command becomes the return value of this script, except in cases
9
+ # where this script encounters an error.
10
+ #
11
+ # If anyone is using this to build a Debian package, make sure the package
12
+ # Build-Depends on xvfb and xauth.
13
+
14
+ set -e
15
+
16
+ PROGNAME=xvfb-run
17
+ SERVERNUM=99
18
+ AUTHFILE=
19
+ ERRORFILE=/dev/null
20
+ XVFBARGS="-screen 0 640x480x8"
21
+ LISTENTCP="-nolisten tcp"
22
+ XAUTHPROTO=.
23
+
24
+ # Query the terminal to establish a default number of columns to use for
25
+ # displaying messages to the user. This is used only as a fallback in the event
26
+ # the COLUMNS variable is not set. ($COLUMNS can react to SIGWINCH while the
27
+ # script is running, and this cannot, only being calculated once.)
28
+ DEFCOLUMNS=$(stty size 2>/dev/null | awk '{print $2}') || true
29
+ if ! expr "$DEFCOLUMNS" : "[[:digit:]]\+$" >/dev/null 2>&1; then
30
+ DEFCOLUMNS=80
31
+ fi
32
+
33
+ # Display a message, wrapping lines at the terminal width.
34
# Print the arguments on stdout, prefixed with "$PROGNAME: ", after passing
# them through fmt with a width of $COLUMNS (or $DEFCOLUMNS when COLUMNS is
# unset or empty).
message () { echo "$PROGNAME: $*" | fmt -t -w ${COLUMNS:-$DEFCOLUMNS}; }
37
+
38
+ # Display an error message.
39
# Report an error: same formatting as message(), with an "error: " prefix,
# written to stderr instead of stdout.
error () {
    message "error: $*" 1>&2
}
42
+
43
+ # Display a usage message.
44
# Print the usage text on stdout. When called with arguments, they are first
# reported through message() as a "usage error".
usage () {
    [ -z "$*" ] || message "usage error: $*"
    cat <<EOF
Usage: $PROGNAME [OPTION ...] COMMAND
Run COMMAND (usually an X client) in a virtual X server environment.
Options:
-a --auto-servernum try to get a free server number, starting at
--server-num
-e FILE --error-file=FILE file used to store xauth errors and Xvfb
output (default: $ERRORFILE)
-f FILE --auth-file=FILE file used to store auth cookie
(default: ./.Xauthority)
-h --help display this usage message and exit
-n NUM --server-num=NUM server number to use (default: $SERVERNUM)
-l --listen-tcp enable TCP port listening in the X server
-p PROTO --xauth-protocol=PROTO X authority protocol name to use
(default: xauth command's default)
-s ARGS --server-args=ARGS arguments (other than server number and
"-nolisten tcp") to pass to the Xvfb server
(default: "$XVFBARGS")
EOF
}
68
+
69
+ # Find a free server number by looking at .X*-lock files in /tmp.
70
# Echo the first server number, counting up from $SERVERNUM, for which no
# /tmp/.X<number>-lock file exists.
find_free_servernum() {
    # "local" is not POSIX, so the counter "i" is deliberately left as a plain
    # (global) shell variable.
    #local i

    i=$SERVERNUM
    until [ ! -f /tmp/.X$i-lock ]; do
        i=$((i + 1))
    done
    echo $i
}
82
+
83
+ # Clean up files
84
# Clean up after ourselves; installed as the EXIT trap.
#
# The script runs under "set -e", so any failing command in here would end the
# handler early. The X server is therefore stopped first, and the best-effort
# steps (kill, xauth remove) are made non-fatal, so that a failure in one step
# can no longer leave an orphaned Xvfb process behind.
# Exits with status 5 if the temporary directory cannot be removed.
clean_up() {
    # Stop the X server we started (XVFBPID is empty if it never came up).
    if [ -n "$XVFBPID" ]; then
        kill "$XVFBPID" >>"$ERRORFILE" 2>&1 || :
    fi
    # Drop the cookie for our display from the authority file.
    if [ -e "$AUTHFILE" ]; then
        XAUTHORITY=$AUTHFILE xauth remove ":$SERVERNUM" >>"$ERRORFILE" 2>&1 || :
    fi
    # Remove the temporary directory, if we created one for the auth file.
    if [ -n "$XVFB_RUN_TMPDIR" ]; then
        if ! rm -r "$XVFB_RUN_TMPDIR"; then
            error "problem while cleaning up temporary directory"
            exit 5
        fi
    fi
}
98
+
99
+ # Parse the command line.
100
+ ARGS=$(getopt --options +ae:f:hn:lp:s:w: \
101
+ --long auto-servernum,error-file:,auth-file:,help,server-num:,listen-tcp,xauth-protocol:,server-args:,wait: \
102
+ --name "$PROGNAME" -- "$@")
103
+ GETOPT_STATUS=$?
104
+
105
+ if [ $GETOPT_STATUS -ne 0 ]; then
106
+ error "internal error; getopt exited with status $GETOPT_STATUS"
107
+ exit 6
108
+ fi
109
+
110
+ eval set -- "$ARGS"
111
+
112
+ while :; do
113
+ case "$1" in
114
+ -a|--auto-servernum) SERVERNUM=$(find_free_servernum); AUTONUM="yes" ;;
115
+ -e|--error-file) ERRORFILE="$2"; shift ;;
116
+ -f|--auth-file) AUTHFILE="$2"; shift ;;
117
+ -h|--help) SHOWHELP="yes" ;;
118
+ -n|--server-num) SERVERNUM="$2"; shift ;;
119
+ -l|--listen-tcp) LISTENTCP="" ;;
120
+ -p|--xauth-protocol) XAUTHPROTO="$2"; shift ;;
121
+ -s|--server-args) XVFBARGS="$2"; shift ;;
122
+ -w|--wait) shift ;;
123
+ --) shift; break ;;
124
+ *) error "internal error; getopt permitted \"$1\" unexpectedly"
125
+ exit 6
126
+ ;;
127
+ esac
128
+ shift
129
+ done
130
+
131
+ if [ "$SHOWHELP" ]; then
132
+ usage
133
+ exit 0
134
+ fi
135
+
136
+ if [ -z "$*" ]; then
137
+ usage "need a command to run" >&2
138
+ exit 2
139
+ fi
140
+
141
+ if ! which xauth >/dev/null; then
142
+ error "xauth command not found"
143
+ exit 3
144
+ fi
145
+
146
+ # tidy up after ourselves
147
+ trap clean_up EXIT
148
+
149
# If the user did not specify an X authorization file to use, set up a temporary
# directory to house one.
if [ -z "$AUTHFILE" ]; then
    XVFB_RUN_TMPDIR="$(mktemp -d -t $PROGNAME.XXXXXX)"
    # Create an empty file to avoid an xauth warning. This used to call the
    # Debian-specific (and deprecated) "tempfile" utility, which is missing on
    # many systems; the directory above is already private to this run, so a
    # fixed name created with a restrictive umask does the same job portably.
    AUTHFILE="$XVFB_RUN_TMPDIR/Xauthority"
    (umask 077 && : >"$AUTHFILE")
fi
156
+
157
+ # Start Xvfb.
158
+ MCOOKIE=$(mcookie)
159
+ tries=10
160
+ while [ $tries -gt 0 ]; do
161
+ tries=$(( $tries - 1 ))
162
+ XAUTHORITY=$AUTHFILE xauth source - << EOF >>"$ERRORFILE" 2>&1
163
+ add :$SERVERNUM $XAUTHPROTO $MCOOKIE
164
+ EOF
165
+ # handle SIGUSR1 so Xvfb knows to send a signal when it's ready to accept
166
+ # connections
167
+ trap : USR1
168
+ (trap '' USR1; exec Xvfb ":$SERVERNUM" $XVFBARGS $LISTENTCP -auth $AUTHFILE >>"$ERRORFILE" 2>&1) &
169
+ XVFBPID=$!
170
+
171
+ wait || :
172
+ if kill -0 $XVFBPID 2>/dev/null; then
173
+ break
174
+ elif [ -n "$AUTONUM" ]; then
175
+ # The display is in use so try another one (if '-a' was specified).
176
+ SERVERNUM=$((SERVERNUM + 1))
177
+ SERVERNUM=$(find_free_servernum)
178
+ continue
179
+ fi
180
+ error "Xvfb failed to start" >&2
181
+ XVFBPID=
182
+ exit 1
183
+ done
184
+
185
+ # Start the command and save its exit status.
186
+ set +e
187
+ DISPLAY=:$SERVERNUM XAUTHORITY=$AUTHFILE "$@"
188
+ RETVAL=$?
189
+ set -e
190
+
191
+ # Return the executed command's exit status.
192
+ exit $RETVAL
193
+
194
+ # vim:set ai et sts=4 sw=4 tw=80:
@@ -0,0 +1,97 @@
1
+ #!python
2
+ from __future__ import absolute_import, division, unicode_literals
3
+ from __future__ import print_function
4
+ import argparse
5
+ import os
6
+ import sys
7
+
8
+ from pathlib import Path
9
+ partis_dir = str(Path(__file__).parent.parent)
10
+ if not os.path.exists(partis_dir):
11
+ print('WARNING current script dir %s doesn\'t exist, so python path may not be correctly set' % partis_dir)
12
+ sys.path.insert(1, partis_dir) # + '/python')
13
+
14
+ import python.utils as utils
15
+ import python.glutils as glutils
16
+
17
+ parser = argparse.ArgumentParser()
18
+ parser.add_argument('--bases', required=True, help='colon-separated list of the bits before the stars, e.g. 1-18:2-2 (set to \'all\' to print entire germline set)')
19
+ parser.add_argument('--allele-numbers')
20
+ parser.add_argument('--ref-allele', help='print this one first')
21
+ parser.add_argument('--other-genes')
22
+ parser.add_argument('--region', default='v')
23
+ parser.add_argument('--locus', default='igh', choices=utils.loci)
24
+ parser.add_argument('--species', default='human')
25
+ parser.add_argument('--glfo-dir', help='default set below')
26
+ args = parser.parse_args()
27
+
28
+ if args.glfo_dir is None:
29
+ args.glfo_dir = 'data/germlines/' + args.species
30
+
31
+ glfo = glutils.read_glfo(args.glfo_dir, args.locus)
32
+
33
+ # ----------------------------------------------------------------------------------------
34
def get_base(gene):
    """Return the primary version of <gene>, followed by '-' and its sub version when the sub version is not None."""
    primary = utils.primary_version(gene)
    if utils.sub_version(gene) is None:  # no sub version: the base is just the primary version
        return primary
    return primary + '-' + utils.sub_version(gene)
39
+
40
+ # ----------------------------------------------------------------------------------------
41
def get_genes(base, alleles=None):
    """Return full gene names (<LOCUS><REGION><base>*<allele>) for <base>.

    If <alleles> is None, use the allele of every gene in glfo (for --region) whose base (see get_base()) equals <base>.
    """
    if alleles is None:  # take all of 'em
        alleles = []
        for gname in glfo['seqs'][args.region]:
            if base == get_base(gname):
                alleles.append(utils.allele(gname))
    gene_names = []
    for allele in alleles:
        gene_names.append(args.locus.upper() + args.region.upper() + base + '*' + allele)
    return gene_names
45
+
46
+ if args.bases == 'all':
47
+ input_groupfcn = None # lambda g: str(utils.primary_version(g) in ['4', '5']) # this example puts all the 4 and 5 primary versions in one group, and everybody else in another
48
+ glutils.print_glfo(glfo, only_region=(args.region if args.region != 'v' else None), input_groupfcn=input_groupfcn) # not much point in doing only v, since it's the one that takes most of the time
49
+ sys.exit(0)
50
+
51
+ args.bases = utils.get_arg_list(args.bases)
52
+ args.allele_numbers = utils.get_arg_list(args.allele_numbers)
53
+ genes = [g for base in args.bases for g in get_genes(base, args.allele_numbers)]
54
+ if len(genes) == 0:
55
+ raise Exception('couldn\'t find any genes for the specified --bases %s\n choices:\n %s' % (' '.join(args.bases), ' '.join(sorted(set([get_base(g) for g in glfo['seqs'][args.region]])))))
56
+ args.other_genes = utils.get_arg_list(args.other_genes)
57
+ if args.other_genes is not None:
58
+ genes += args.other_genes
59
+
60
+ seqstrs = ['' for _ in range(len(genes))]
61
+ snpstrs = ['' for _ in range(len(genes))]
62
+
63
gene_str_width = max([utils.len_excluding_colors(utils.color_gene(g)) for g in genes])
# there are no conserved codon positions for d, so in that case there's nothing to align on or to emphasize
codon_positions = glfo[utils.conserved_codons[args.locus][args.region] + '-positions'] if args.region != 'd' else None
max_seq_len = max([len(glfo['seqs'][args.region][g]) for g in genes])

# put the reference gene first (it's either the first gene, or the first gene's primary/sub version with allele --ref-allele)
ref_gene = genes[0] if args.ref_allele is None else utils.rejoin_gene(args.locus, args.region, utils.primary_version(genes[0]), utils.sub_version(genes[0]), args.ref_allele)
if ref_gene != genes[0]:
    genes.remove(ref_gene)
    genes.insert(0, ref_gene)
ref_seq = glfo['seqs'][args.region][ref_gene]
ref_pos = codon_positions[ref_gene] if codon_positions is not None else None  # used to index None (and crash) for --region d

for igene, gene in enumerate(genes):
    seq = glfo['seqs'][args.region][gene]
    emph_positions = None
    if codon_positions is not None:
        pos = codon_positions[gene]
        if pos < ref_pos:  # align the codon position in the case that this seq is shorter up to the codon
            seq = (ref_pos - pos) * '-' + seq
            pos = ref_pos
        emph_positions = [pos + i for i in range(3)]  # the three bases of the conserved codon

    right_pad_str = ''  # i think i don't need this any more since i have the align option in color_mutants

    colored_seq, isnps = utils.color_mutants(ref_seq, seq, return_isnps=True, emphasis_positions=emph_positions, align=True)
    seqstrs[igene] += '%s%s' % (colored_seq, right_pad_str)
    if len(isnps) > 0:
        snpstrs[igene] = '%2d (%s)' % (len(isnps), ' '.join([str(i) for i in isnps]))
91
+
92
+ # ----------------------------------------------------------------------------------------
93
def print_str(gene, seqstr, snpstr):
    """Return one output line: the colored gene name, <seqstr>, the colored gene name again, then <snpstr>."""
    left_label = utils.color_gene(gene, width=gene_str_width)
    right_label = utils.color_gene(gene, width=gene_str_width)
    return '%s %s %s %s' % (left_label, seqstr, right_label, snpstr)
95
+
96
+ for igene in range(len(genes)):
97
+ print(print_str(genes[igene], seqstrs[igene], snpstrs[igene]))
@@ -0,0 +1,57 @@
1
+ #!python
2
+ from __future__ import absolute_import, division, unicode_literals
3
+ from __future__ import print_function
4
+ import argparse
5
+ import sys
6
+ import os
7
+ import copy
8
+ import collections
9
+ import colored_traceback.always
10
+
11
+ from pathlib import Path
12
+ partis_dir = str(Path(__file__).parent.parent)
13
+ if not os.path.exists(partis_dir):
14
+ print('WARNING current script dir %s doesn\'t exist, so python path may not be correctly set' % partis_dir)
15
+ sys.path.insert(1, partis_dir) # + '/python')
16
+
17
+ import python.utils as utils
18
+ import python.glutils as glutils
19
+
20
+ parser = argparse.ArgumentParser()
21
+ parser.add_argument('gldir1')
22
+ parser.add_argument('gldir2')
23
+ parser.add_argument('--names', default='+gl-1:+gl-2', help='colon-separated list of length 2 with labels for gldir1 and gldir2, which will be appended to each gene name in the ascii output')
24
+ parser.add_argument('--locus', default='igh')
25
+ args = parser.parse_args()
26
+ args.names = utils.get_arg_list(args.names)
27
+
28
+ # ----------------------------------------------------------------------------------------
29
def clrname(name):
    """Return <name> as colored by utils.color() with 'blue'."""
    blue_name = utils.color('blue', name)
    return blue_name
31
+
32
+ # ----------------------------------------------------------------------------------------
33
+ glfos = []
34
+ for name, gldir in zip(args.names, [args.gldir1, args.gldir2]):
35
+ print('%s:' % clrname(name))
36
+ glfos.append(glutils.read_glfo(gldir, args.locus, debug=True))
37
+
38
+ for region in [r for r in utils.regions if r in glfos[0]['seqs']]:
39
+ aseqs, bseqs = [{s : n for n, s in g['seqs'][region].items()} for g in glfos] # dict of names keyed by seqs
40
+ a_only_seqs, b_only_seqs = set(aseqs) - set(bseqs), set(bseqs) - set(aseqs)
41
+
42
+ print('%s' % utils.color('green', region))
43
+
44
+ common_seqs = set(aseqs) & set(bseqs)
45
+ common_name_seqs = [aseqs[s] for s in common_seqs if aseqs[s]==bseqs[s]]
46
+ print(' %3d seqs in common with same name: %s' % (len(common_name_seqs), utils.color_genes(sorted(common_name_seqs))))
47
+ dnamed_seqs = [(aseqs[s], bseqs[s]) for s in common_seqs if aseqs[s] != bseqs[s]]
48
+ if len(dnamed_seqs) > 0:
49
+ print(' %s %d common seq%s with different names: %s' % (utils.wrnstr(), len(dnamed_seqs), utils.plural(len(dnamed_seqs)), ', '.join(utils.color_genes([an,bn]) for an, bn in dnamed_seqs)))
50
+ print(' only in:\n %12s: %3d %s\n %12s: %3d %s' % (clrname(args.names[0]), len(a_only_seqs), utils.color_genes(sorted(aseqs[s] for s in a_only_seqs)),
51
+ clrname(args.names[1]), len(b_only_seqs), utils.color_genes(sorted(bseqs[s] for s in b_only_seqs))))
52
+
53
+ tmpfo = glutils.get_empty_glfo(args.locus) # make a new glfo that will only have non-shared genes
54
+ for gname, oname, only_seqs, allseqs, ogfo in zip(args.names, reversed(args.names), [a_only_seqs, b_only_seqs], [aseqs, bseqs], reversed(glfos)): # <gset> is the genes that're only in <gname>
55
+ print(' finding nearest seq in %s for %d seqs only in %s' % (clrname(oname), len(only_seqs), clrname(gname)))
56
+ for oseq in only_seqs:
57
+ glutils.find_nearest_gene_in_glfo(ogfo, oseq, new_name=allseqs[oseq], region=region, debug=True)
@@ -0,0 +1,55 @@
1
+ #!python
2
+ from __future__ import absolute_import, division, unicode_literals
3
+ from __future__ import print_function
4
+ import csv
5
+ import os
6
+ import sys
7
+ from io import open
8
+ csv.field_size_limit(sys.maxsize) # make sure we can write very large csv fields
9
+ import argparse
10
+ import colored_traceback.always
11
+ import yaml
12
+ import json
13
+ import operator
14
+ import random
15
+ import numpy
16
+ from pathlib import Path
17
+
18
+ # if you move this script, you'll need to change this method of getting the imports
19
+ partis_dir = str(Path(__file__).parent.parent)
20
+ sys.path.insert(1, partis_dir) # + '/python')
21
+
22
+ import python.utils as utils
23
+
24
+ dstr = """
25
+ Extract heavy/light chain pairing info from fasta file <infname> and write it to yaml/json file <outfname>.
26
+ Should have the same effect as setting --guess-pairing-info when running bin/split-loci.py.
27
+ """
28
+ parser = argparse.ArgumentParser(description=dstr,
29
+ formatter_class=argparse.ArgumentDefaultsHelpFormatter) # why tf isn't this printing the defaults?
30
+ parser.add_argument('infname')
31
+ parser.add_argument('outfname')
32
+ parser.add_argument('--droplet-id-separators', help=utils.did_help['seps'])
33
+ parser.add_argument('--droplet-id-indices', help=utils.did_help['indices'])
34
+ parser.add_argument('--overwrite', action='store_true')
35
+ parser.add_argument('--for-testing-n-max-queries', type=int, default=-1, help='only for testing, applied when reading initial fasta file, just in case it\'s huge and you want to run quickly without having to read the whole file')
36
+ parser.add_argument('--n-max-queries', type=int, default=-1, help='see partis help (although here it applies to droplets, not individual seqs)')
37
+ parser.add_argument('--n-random-queries', type=int, help='see partis help (although here it applies to droplets, not individual seqs)')
38
+ parser.add_argument('--input-metafname', help='json/yaml file with additional (beyond pairing info) input meta info (see partis help)')
39
+ parser.add_argument('--random-seed', type=int, default=1)
40
+ args = parser.parse_args()
41
+ random.seed(args.random_seed)
42
+ numpy.random.seed(args.random_seed)
43
+ args.droplet_id_indices = utils.get_arg_list(args.droplet_id_indices, intify=True)
44
+
45
+ if utils.output_exists(args, args.outfname, offset=4, debug=False):
46
+ print(' extract-pairing-info.py output exists and --overwrite was not set, so not doing anything: %s' % args.outfname)
47
+ sys.exit(0)
48
+
49
+ seqfos = utils.read_fastx(args.infname, n_max_queries=args.for_testing_n_max_queries)
50
+ if args.n_max_queries != -1 or args.n_random_queries is not None:
51
+ seqfos = utils.subset_paired_queries(seqfos, args.droplet_id_separators, args.droplet_id_indices, n_max_queries=args.n_max_queries, n_random_queries=args.n_random_queries)
52
+ metafos = utils.extract_pairing_info(seqfos, droplet_id_separators=args.droplet_id_separators, droplet_id_indices=args.droplet_id_indices, input_metafname=args.input_metafname)
53
+
54
+ utils.mkdir(args.outfname, isfile=True)
55
+ utils.jsdump(args.outfname, metafos)
@@ -0,0 +1,244 @@
1
+ #!python
2
+ from __future__ import absolute_import, division, unicode_literals
3
+ from __future__ import print_function
4
+ import numpy
5
+ import csv
6
+ import yaml
7
+ import time
8
+ import colored_traceback.always
9
+ import argparse
10
+ import subprocess
11
+ import sys
12
+ import os
13
+ import dendropy
14
+ import json
15
+ from io import open
16
+ import random
17
+ from pathlib import Path
18
+
19
+ partis_dir = str(Path(__file__).parent.parent)
20
+ sys.path.insert(1, partis_dir) #'./python')
21
+ import python.utils as utils
22
+ import python.glutils as glutils
23
+ import python.treeutils as treeutils
24
+
25
+ # ----------------------------------------------------------------------------------------
26
def get_inf_int_name(gname):  # <gname> is just an integer, which won't be unique and will break things
    """Return <gname> prefixed with '<args.inf_int_label>-'."""
    return '{}-{}'.format(args.inf_int_label, gname)
28
+
29
+ # ----------------------------------------------------------------------------------------
30
def gctofn(ft):
    """Return the path in args.outdir of the gctree/dnapars output file of type <ft> ('tree', 'seqs', or 'dnapars').

    Raises KeyError for any other <ft>.
    """
    if ft == 'tree':
        fname = 'gctree.out.inference.1.nk'
    elif ft == 'seqs':
        fname = 'gctree.out.inference.1.fasta'
    elif ft == 'dnapars':
        fname = 'outfile'
    else:
        raise KeyError(ft)
    return '%s/%s' % (args.outdir, fname)
37
+
38
+ # ----------------------------------------------------------------------------------------
39
def fofn(ft):
    """Return the path in args.outdir of this script's final output of type <ft>: 'tree' -> tree.nwk, 'seqs' -> inferred-seqs.fa."""
    assert ft in ['tree', 'seqs']
    if ft == 'tree':
        stem, suffix = ft, '.nwk'
    else:
        stem, suffix = 'inferred-%s' % ft, '.fa'
    return '%s/%s%s' % (args.outdir, stem, suffix)
42
+
43
+ # ----------------------------------------------------------------------------------------
44
def idfn():
    """Return the name (without directory) of the id map file, i.e. the file passed to deduplicate's --idmapfile."""
    idmap_name = 'idmap.txt'
    return idmap_name
46
+
47
+ # ----------------------------------------------------------------------------------------
48
def install():
    """Write and run (via utils.simplerun) a bash script that creates the micromamba env args.env_label and installs phylip, click, and gctree into it.

    gctree is installed with micromamba unless args.no_dag is set, in which case version 3.3.0 is installed with pip.
    """
    script = ['#!/bin/bash']
    script.extend(utils.mamba_cmds(args.env_label, only_prep=True))
    script.extend([
        'micromamba create -y -n %s -c conda-forge python=3.9' % args.env_label,  # 3.10 currently has problems with ete
        'micromamba activate %s' % args.env_label,
        'micromamba install -y -c bioconda -c conda-forge phylip',
        'micromamba install -y -c conda-forge%s click' % ('' if args.no_dag else ' gctree'),
    ])
    if args.no_dag:
        script.append('pip install gctree==3.3.0')  # I think having --user makes it install in ~/.local (outside mamba env)
    # micromamba remove -n gctree --all  # to nuke it and start over
    utils.simplerun('\n'.join(script) + '\n', cmdfname='/tmp/tmprun.sh', debug=True)
59
+
60
+ # ----------------------------------------------------------------------------------------
61
def update():
    """Write and run (via utils.simplerun) a bash script that runs 'micromamba update' on phylip, gctree, and click, after the commands from utils.mamba_cmds() for args.env_label."""
    script = ['#!/bin/bash']
    script.extend(utils.mamba_cmds(args.env_label))
    script.append('micromamba update phylip gctree click')
    utils.simplerun('\n'.join(script) + '\n', cmdfname='/tmp/tmprun.sh', debug=True)
66
+
67
+ # ----------------------------------------------------------------------------------------
68
def add_mfo(tcmd, mfn):
    """Return <tcmd> with gctree command line args appended for the meta info in json file <mfn>.

    <mfn> must hold a json dict; for each of the keys 'frame', 'h_frame', 'l_frame', and 'l_offset' that it
    contains, the corresponding gctree arg is appended with the key's (integer) value. Other keys are ignored.
    """
    kdict = {'frame' : 'frame', 'h_frame' : 'frame', 'l_frame' : 'frame2', 'l_offset' : 'chain_split'}  # translates from metafo dict to gctree command line args
    with open(mfn) as mfile:  # NOTE this used to ignore <mfn> and always read args.metafname
        metafo = json.load(mfile)
    for tk, tc in kdict.items():
        if tk in metafo:
            tcmd += ' --%s %d' % (tc, metafo[tk])
    return tcmd
76
+
77
+ # ----------------------------------------------------------------------------------------
78
+ def run_gctree():
79
+ # ----------------------------------------------------------------------------------------
80
def get_gctree_cmd():
    """Return the 'gctree infer' command line (run through partis's bin/xvfb-run), built from the command line args."""
    tcmd = '%s/bin/xvfb-run -a gctree infer outfile abundances.csv --root %s --verbose --idlabel' % (utils.get_partis_dir(), args.root_label)  # --idlabel writes the output fasta file
    if not args.base_model and not args.no_dag:
        tcmd += ' --mutability %s/HS5F_Mutability.csv --substitution %s/HS5F_Substitution.csv' % (args.data_dir, args.data_dir)
    if args.ranking_coeffs is not None:
        tcmd += ' --ranking_coeffs %s' % ' '.join(str(c) for c in args.ranking_coeffs)  # str() so this also works if the coeffs were parsed as numbers
    if args.branching_process_ranking_coeff is not None:
        tcmd += ' --branching_process_ranking_coeff %d' % args.branching_process_ranking_coeff
    if args.metafname is not None and os.path.exists(args.metafname):  # os.path.exists(None) raises TypeError in python 3
        tcmd = add_mfo(tcmd, args.metafname)
    return tcmd
91
+ # ----------------------------------------------------------------------------------------
92
+ def get_cmds():
93
+ cmds = ['#!/bin/bash']
94
+ cmds += utils.mamba_cmds(args.env_label)
95
+ if args.run_help:
96
+ cmds += ['gctree infer -h']
97
+ return cmds
98
+ if not os.path.exists(args.infname):
99
+ raise Exception('--infname %s doesn\'t exist' % args.infname)
100
+ cmds += ['cd %s' % args.outdir]
101
+ if args.input_forest_dir is None:
102
+ ofn = '%s/outfile' % args.outdir # dnapars output file (this is what takes the longest to make
103
+ if os.path.exists(ofn) and os.stat(ofn).st_size > 0:
104
+ print(' dnapars output already exists, not rerunning: %s' % ofn)
105
+ else:
106
+ if os.path.exists(ofn) and os.stat(ofn).st_size == 0:
107
+ print(' removing zero length dnapars output %s' % ofn)
108
+ utils.prep_dir(args.outdir, wildlings=['outfile', 'outtree'], allow_other_files=True) # phylip barfs like a mfer if its outputs exist (probably you'll get a KeyError 'naive')
109
+ cmds += ['deduplicate %s --root %s --abundance_file abundances.csv --idmapfile %s > deduplicated.phylip' % (args.infname, args.root_label, idfn())]
110
+ cmds += ['mkconfig deduplicated.phylip dnapars > dnapars.cfg']
111
+ cmds += ['dnapars < dnapars.cfg > dnapars.log'] # NOTE if things fail, look in dnaparse.log (but it's super verbose so we can't print it to std out by default)
112
+ else:
113
+ print(' --input-forest-dir: copying abundance, idmap, and forest files from %s' % args.input_forest_dir)
114
+ cmds += ['cp %s/{abundances.csv,%s,outfile} %s/' % (args.input_forest_dir, idfn(), args.outdir)]
115
+ if not args.only_write_forest:
116
+ cmds.append(get_gctree_cmd())
117
+ return cmds
118
+ # ----------------------------------------------------------------------------------------
119
+ if not args.run_help and utils.output_exists(args, gctofn('dnapars' if args.only_write_forest else 'tree')):
120
+ return
121
+
122
+ cmds = get_cmds() # also preps dir + other stuff
123
+
124
+ utils.simplerun('\n'.join(cmds) + '\n', cmdfname=args.outdir + '/run.sh', print_time='gctree', debug=True, dryrun=args.dry_run)
125
+ if args.run_help:
126
+ sys.exit()
127
+
128
+ # ----------------------------------------------------------------------------------------
129
def parse_output():
    """Convert gctree's output tree and fasta into the final tree.nwk and inferred-seqs.fa files (see fofn()).

    Does nothing if utils.output_exists() says the final fasta is already there. Otherwise: reads the id map
    (gctree name --> original input names), reads gctree's output fasta, renames inferred intermediates with
    get_inf_int_name(), rescales the tree's edges by 1 / <mean sequence length>, restores the original input
    names in the tree (adding zero-length children for duplicates), and optionally (--fix-multifurcations)
    makes the tree binary.
    Raises an Exception if a name from the id map isn't found in the tree.
    """
    if utils.output_exists(args, fofn('seqs')):
        return

    # read translations (this only includes input sequences, not inferred intermediates)
    idm_trns = {}
    with open('%s/%s' % (args.outdir, idfn())) as idfile:
        reader = csv.DictReader(idfile, fieldnames=('name', 'orig_names'))
        for line in reader:
            if line['orig_names'] == '':
                continue
            idm_trns[line['name']] = line['orig_names'].split(':')

    # read fasta (mostly for inferred intermediate seqs)
    seqfos = utils.read_fastx(gctofn('seqs'), look_for_tuples=True)
    print(' read %d seqs from gctree output fasta' % len(seqfos))
    if any(s['name']=='' for s in seqfos):
        n_removed = len([s for s in seqfos if s['name']==''])
        seqfos = [s for s in seqfos if s['name']!='']
        print(' %s removed %d seqs with zero-length names \'\' (I\'m *not* sure this is the right thing to do, but it just kicked this error when I was doing the python 3 conversion)' % (utils.wrnstr(), n_removed))
    nfos = [s for s in seqfos if s['name']==args.root_label]
    if len(nfos) != 1:
        print(' %s expected 1 naive seq with label \'%s\' but found %d: %s (in %s)' % (utils.wrnstr(), args.root_label, len(nfos), ' '.join(n['name'] for n in nfos), gctofn('seqs')))
    seqfos = [s for s in seqfos if s['name'] != args.root_label]  # don't want naive seq in final fasta
    seq_len = numpy.mean([len(s['seq']) for s in seqfos])
    if not args.expand_all_nodes:  # also remove input seqs (well, gctree's new names for input seqs), unless we're expanding all nodes, in which case we need the gctree-named-nodes as fake new internal nodes
        seqfos = [s for s in seqfos if s['name'] not in idm_trns]
    if len(seqfos) == 0:
        print(' %s no inferred sequences (all seqs read from gctree output were input seqs' % utils.wrnstr())
    inf_int_trns = []
    for sfo in seqfos:
        inf_int_trns.append((sfo['name'], get_inf_int_name(sfo['name'])))
        sfo['name'] = get_inf_int_name(sfo['name'])

    # read tree
    dtree = treeutils.get_dendro_tree(treefname=gctofn('tree'), debug=args.debug)
    dtree.scale_edges(1. / seq_len)
    dtree.seed_node.taxon.label = args.root_label
    ndict = {n.taxon.label : n for n in dtree.preorder_node_iter()}
    for gname, onames in idm_trns.items():
        node = ndict.get(gname)  # .get() so that a missing name gives the informative exception below, rather than a bare KeyError
        if node is None:
            raise Exception('couldn\'t find node with name \'%s\' in tree from gctree in %s' % (gname, gctofn('tree')))
        if args.debug and len(onames) > 1:
            print(' abundance > 1 for %s: %d (%s)' % (gname, len(onames), ' '.join(onames)))
        for onm in onames:
            if node.taxon.label == gname and not args.expand_all_nodes:  # the first original name replaces gctree's name on the node itself
                node.taxon.label = onm
                if args.debug and len(onames) > 1:
                    print(' setting node to %s' % onm)
                continue
            treeutils.add_zero_length_child(node, dtree, child_name=onm)  # add duplicates as children with zero-length edges
            if args.debug and len(onames) > 1:
                print(' adding child node %s' % onm)
    treeutils.translate_labels(dtree, inf_int_trns, expect_missing=True, debug=args.debug)

    if args.fix_multifurcations:
        input_seqfos = utils.read_fastx(args.infname)
        dtree, new_seqfos = treeutils.get_binary_tree(dtree, nfos + input_seqfos + seqfos, debug=args.debug)
        seqfos += new_seqfos
    if args.debug:
        print(' final tree:')
        print(treeutils.get_ascii_tree(dendro_tree=dtree, extra_str=' ', width=350))
    with open(fofn('tree'), 'w') as ofile:
        ofile.write('%s\n' % treeutils.as_str(dtree))
    utils.write_fasta(fofn('seqs'), nfos + seqfos)
195
+
196
+ # ----------------------------------------------------------------------------------------
197
# Usage text shown by argparse in place of its auto-generated usage line.
ustr = """
Run gctree tree inference on sequences from fasta input file <--infname>.
Output trees and sequences are written to <--outdir> as inferred-seqs.fa and tree.nwk (gctree output files are also there, but they don't have any postprocessing e.g. fixing names and/or multifurcations).
gctree-run.py --infname <fasta> --outdir <outdir>
"""
parser = argparse.ArgumentParser(usage=ustr)
# colon-separated list of actions; validated against the allowed choices below
parser.add_argument('--actions', default='run:parse')
parser.add_argument('--infname')
parser.add_argument('--metafname', help='if you need --frame (v region doesn\'t start at first position) or --chain_split and --frame2 (heavy/light chain smooshed together), pass the info in json format with this arg (see code above for format).')
parser.add_argument('--outdir')
parser.add_argument('--only-write-forest', action='store_true', help='only run preparatory steps for gctree, i.e. up through dnapars, to write parsimony forest')
parser.add_argument('--input-forest-dir', help='If set, skips preparatory steps (see --only-write-forest), and looks for \'abundance.csv\' and parsimony forest file (\'outfile\') in the specified dir')
parser.add_argument('--overwrite', action='store_true')
parser.add_argument('--base-model', action='store_true', help='By default, we pass gctree info for the s5f mutation model; if this is set, we don\'t, and it instead use the base model.')
parser.add_argument('--no-dag', action='store_true', help='If set, use old v1 non-DAG gctree version (v3.3.0). Note that this uses a different env (see --env-label)')
parser.add_argument('--ranking-coeffs', nargs='+', help='see gctree help')
parser.add_argument('--branching-process-ranking-coeff', type=int, help='see gctree help')
parser.add_argument('--env-label', default='gctree')
parser.add_argument('--root-label', default='naive')
parser.add_argument('--data-dir', default='%s/data/s5f'%utils.get_partis_dir())
parser.add_argument('--inf-int-label', default='inf', help='base name for inferred intermediate seqs (numerical name is appended with -)')
parser.add_argument('--expand-all-nodes', action='store_true', help='Gctree collapses duplicate observed seqs into nodes with new names and abundance N > 1. By default, we expand these such that the node is named for one of the observed seqs, and add N-1 (zero-length) children. If this arg is set, however, we leave the node and add N (zero-length) children.')
parser.add_argument('--run-help', action='store_true', help='run gctree help')
parser.add_argument('--debug', action='store_true')
parser.add_argument('--dry-run', action='store_true')
parser.add_argument('--random-seed', type=int, default=0)
parser.add_argument('--fix-multifurcations', action='store_true', help='resolves multifurcations (by adding zero length intermediates) and move input seqs that have been extend unifurcations onto zero length branches')

args = parser.parse_args()

# seed both the stdlib and numpy generators from the same command line value
random.seed(args.random_seed)
numpy.random.seed(args.random_seed)

# one flag stops after writing the forest, the other starts from an existing forest, so they're mutually exclusive
if args.only_write_forest and args.input_forest_dir:
    raise Exception('doesn\'t make sense to specify both')

# NOTE(review): utils.get_arg_list() presumably splits the colon-separated string into a list and checks it against <choices> -- confirm in utils
args.actions = utils.get_arg_list(args.actions, choices=['install', 'update', 'run', 'parse'])
args.infname = utils.fpath(args.infname)
args.outdir = utils.fpath(args.outdir)

if args.no_dag:
    # explicit raise rather than assert, so the check still happens when python is run with -O (which strips asserts)
    if args.base_model or args.branching_process_ranking_coeff is not None or args.ranking_coeffs is not None:
        raise Exception('--no-dag can\'t be combined with --base-model, --branching-process-ranking-coeff, or --ranking-coeffs')
    args.env_label = 'gctree-no-dag'  # overrides whatever was passed with --env-label

# actions always execute in this fixed order, regardless of the order they were given in --actions
if 'install' in args.actions:
    install()
if 'update' in args.actions:
    update()
if 'run' in args.actions:
    run_gctree()
if 'parse' in args.actions:
    parse_output()