partis-bcr 1.0.0-py3-none-any.whl → 1.0.2-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- bin/FastTree +0 -0
- bin/add-chimeras.py +59 -0
- bin/add-seqs-to-outputs.py +81 -0
- bin/bcr-phylo-run.py +799 -0
- bin/build.sh +24 -0
- bin/cf-alleles.py +97 -0
- bin/cf-germlines.py +57 -0
- bin/cf-linearham.py +199 -0
- bin/chimera-plot.py +76 -0
- bin/choose-partially-paired.py +143 -0
- bin/circle-plots.py +30 -0
- bin/compare-plotdirs.py +298 -0
- bin/diff-parameters.py +133 -0
- bin/docker-hub-push.sh +6 -0
- bin/extract-pairing-info.py +55 -0
- bin/gcdyn-simu-run.py +223 -0
- bin/gctree-run.py +244 -0
- bin/get-naive-probabilities.py +126 -0
- bin/iqtree-1.6.12 +0 -0
- bin/lonr.r +1020 -0
- bin/makeHtml +52 -0
- bin/mds-run.py +46 -0
- bin/parse-output.py +277 -0
- bin/partis +1869 -0
- bin/partis-pip +116 -0
- bin/partis.py +1869 -0
- bin/plot-gl-set-trees.py +519 -0
- bin/plot-hmms.py +151 -0
- bin/plot-lb-tree.py +427 -0
- bin/raxml-ng +0 -0
- bin/read-bcr-phylo-trees.py +38 -0
- bin/read-gctree-output.py +166 -0
- bin/run-chimeras.sh +64 -0
- bin/run-dtr-scan.sh +25 -0
- bin/run-paired-loci.sh +100 -0
- bin/run-tree-metrics.sh +88 -0
- bin/smetric-run.py +62 -0
- bin/split-loci.py +317 -0
- bin/swarm-2.1.13-linux-x86_64 +0 -0
- bin/test-germline-inference.py +425 -0
- bin/tree-perf-run.py +194 -0
- bin/vsearch-2.4.3-linux-x86_64 +0 -0
- bin/vsearch-2.4.3-macos-x86_64 +0 -0
- bin/xvfb-run +194 -0
- partis_bcr-1.0.2.data/scripts/cf-alleles.py +97 -0
- partis_bcr-1.0.2.data/scripts/cf-germlines.py +57 -0
- partis_bcr-1.0.2.data/scripts/extract-pairing-info.py +55 -0
- partis_bcr-1.0.2.data/scripts/gctree-run.py +244 -0
- partis_bcr-1.0.2.data/scripts/parse-output.py +277 -0
- partis_bcr-1.0.2.data/scripts/split-loci.py +317 -0
- partis_bcr-1.0.2.data/scripts/test.py +1005 -0
- {partis_bcr-1.0.0.dist-info → partis_bcr-1.0.2.dist-info}/METADATA +1 -1
- {partis_bcr-1.0.0.dist-info → partis_bcr-1.0.2.dist-info}/RECORD +101 -51
- partis_bcr-1.0.2.dist-info/top_level.txt +1 -0
- {partis → python}/glutils.py +1 -1
- python/main.py +30 -0
- {partis → python}/plotting.py +10 -1
- {partis → python}/treeutils.py +18 -16
- {partis → python}/utils.py +14 -7
- packages/ham/bcrham +0 -0
- partis/main.py +0 -59
- partis_bcr-1.0.0.dist-info/top_level.txt +0 -1
- {partis_bcr-1.0.0.dist-info → partis_bcr-1.0.2.dist-info}/WHEEL +0 -0
- {partis_bcr-1.0.0.dist-info → partis_bcr-1.0.2.dist-info}/entry_points.txt +0 -0
- {partis_bcr-1.0.0.dist-info → partis_bcr-1.0.2.dist-info}/licenses/COPYING +0 -0
- {partis → python}/__init__.py +0 -0
- {partis → python}/alleleclusterer.py +0 -0
- {partis → python}/allelefinder.py +0 -0
- {partis → python}/alleleremover.py +0 -0
- {partis → python}/annotationclustering.py +0 -0
- {partis → python}/baseutils.py +0 -0
- {partis → python}/cache/__init__.py +0 -0
- {partis → python}/cache/cached_uncertainties.py +0 -0
- {partis → python}/clusterpath.py +0 -0
- {partis → python}/coar.py +0 -0
- {partis → python}/corrcounter.py +0 -0
- {partis → python}/datautils.py +0 -0
- {partis → python}/event.py +0 -0
- {partis → python}/fraction_uncertainty.py +0 -0
- {partis → python}/gex.py +0 -0
- {partis → python}/glomerator.py +0 -0
- {partis → python}/hist.py +0 -0
- {partis → python}/hmmwriter.py +0 -0
- {partis → python}/hutils.py +0 -0
- {partis → python}/indelutils.py +0 -0
- {partis → python}/lbplotting.py +0 -0
- {partis → python}/mds.py +0 -0
- {partis → python}/mutefreqer.py +0 -0
- {partis → python}/paircluster.py +0 -0
- {partis → python}/parametercounter.py +0 -0
- {partis → python}/paramutils.py +0 -0
- {partis → python}/partitiondriver.py +0 -0
- {partis → python}/partitionplotter.py +0 -0
- {partis → python}/performanceplotter.py +0 -0
- {partis → python}/plotconfig.py +0 -0
- {partis → python}/processargs.py +0 -0
- {partis → python}/prutils.py +0 -0
- {partis → python}/recombinator.py +0 -0
- {partis → python}/scanplot.py +0 -0
- {partis → python}/seqfileopener.py +0 -0
- {partis → python}/treegenerator.py +0 -0
- {partis → python}/viterbicluster.py +0 -0
- {partis → python}/vrc01.py +0 -0
- {partis → python}/waterer.py +0 -0
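Note on the renames above: the importable modules move from a partis/ top-level package in 1.0.0 (see the removed partis_bcr-1.0.0.dist-info/top_level.txt) to a python/ top-level package in 1.0.2, which is the layout bin/bcr-phylo-run.py below relies on. A minimal import sketch of the change; the old spelling is an assumption inferred from the 1.0.0 top_level.txt, while the new spelling is taken verbatim from the script below:

    # 1.0.0 wheel (top-level package 'partis') -- assumed old import spelling
    # import partis.utils as utils
    # import partis.treeutils as treeutils

    # 1.0.2 wheel (top-level package 'python') -- spelling used by bin/bcr-phylo-run.py below
    import python.utils as utils
    import python.treeutils as treeutils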
bin/bcr-phylo-run.py
ADDED
@@ -0,0 +1,799 @@
+#!/usr/bin/env python3
+from __future__ import absolute_import, division, unicode_literals
+from __future__ import print_function
+import argparse
+import csv
+import colored_traceback.always
+import collections
+import copy
+import os
+import sys
+import numpy
+import math
+import time
+import traceback
+import itertools
+import yaml
+from io import open
+
+from pathlib import Path
+partis_dir = str(Path(__file__).parent.parent)
+sys.path.insert(1, partis_dir) # + '/python')
+import python.utils as utils
+import python.indelutils as indelutils
+import python.treeutils as treeutils
+from python.event import RecombinationEvent
+import python.paircluster as paircluster
+
+bcr_phylo_path = os.getenv('PWD') + '/packages/bcr-phylo-benchmark'
+ig_or_tr = 'ig'
+
+# ----------------------------------------------------------------------------------------
+def simdir():
+    return '%s/selection/simu' % args.base_outdir
+def infdir():
+    return '%s/selection/partis' % args.base_outdir
+def evtdir(i, igcr=None):
+    return '%s/event-%d%s' % (simdir(), i, '' if igcr is None else '/round-%d'%igcr)
+def spath(tstr, unsampled=False): # use spath() for building command line args, whereas use get_simfn() to get [an] inidivudal file e.g. for utils.output_exists(), as well as for passing to fcns in paircluster.py
+    if args.mutated_outpath and tstr == 'mutated':
+        opth = args.base_outdir
+    else:
+        opth = '%s/%s-simu' % (simdir(), tstr)
+    return '%s%s%s' % (opth, '-unsampled' if (args.tpsample and unsampled) else '', '' if args.paired_loci else '.yaml')
+def sfname(tstr, ltmp, lpair=None, unsampled=False):
+    if ltmp is None: assert not args.paired_loci
+    return paircluster.paired_fn(spath(tstr, unsampled=unsampled), ltmp, lpair=lpair, suffix='.yaml') if args.paired_loci else spath(tstr, unsampled=unsampled)
+def naive_fname(ltmp, lpair=None): # args are only used for paired loci (but we pass the whole fcn to another fcn, so we need the signature like this)
+    return sfname('naive', ltmp, lpair=lpair) #paircluster.paired_fn(spath('naive'), ltmp, lpair=lpair, suffix='.yaml') if args.paired_loci else spath('naive')
+def bcr_phylo_fasta_fname(outdir):
+    return '%s/%s.fasta' % (outdir, args.extrastr)
+def get_simfn(ltmp, lpair=None, joint=False): # NOTE joint has no effect, but is needed for passing to paircluster.write_concatd_output_files()
+    return sfname('mutated', ltmp, lpair=lpair)
+def get_unsampled_simfn(ltmp, lpair=None, joint=False): # ugly to have this, but signature has to be this so we can pass it to fcns in paircluster
+    return sfname('mutated', ltmp, lpair=lpair, unsampled=True)
+# ----------------------------------------------------------------------------------------
+def ipath(stype): # path/file name for building command line args
+    rpath = infdir()
+    if args.paired_loci:
+        return rpath
+    assert stype in ['params', 'partition', 'plots']
+    rpath = '%s/%s' % (rpath, stype)
+    if stype == 'partition':
+        rpath += '.yaml'
+    return rpath
+# ----------------------------------------------------------------------------------------
+def ifname(stype, ltmp='igh'): # path/files for utils.output_exists()
+    rpath = ipath(stype)
+    if args.paired_loci:
+        if stype == 'partition':
+            rpath = paircluster.paired_fn(rpath, ltmp, suffix='.yaml', actstr=stype)
+        else:
+            rpath += '/parameters/%s' % ltmp
+    if stype == 'params':
+        rpath += '/hmm/hmms'
+    return rpath
+# ----------------------------------------------------------------------------------------
+def lpairs():
+    return utils.locus_pairs[ig_or_tr]
+# ----------------------------------------------------------------------------------------
+def rearrange():
+    if utils.output_exists(args, naive_fname('igh'), outlabel='naive simu', offset=4): # just look for the merged igh file, since it's about the last to be written (and both paired subdirs may not be there)
+        return
+    cmd = './bin/partis simulate --simulate-from-scratch --mutation-multiplier 0.0001 --n-leaves 1 --constant-number-of-leaves' # tends to get in infinite loop if you actually pass 0. (yes, I should fix this)
+    cmd += ' --debug %d --random-seed %d --n-sim-events %d' % (int(args.debug), args.seed, args.n_sim_events if not args.restrict_to_single_naive_seq else 1)
+    if args.paired_loci:
+        cmd += ' --paired-loci --paired-outdir %s' % spath('naive')
+    else:
+        cmd += ' --outfname %s' % spath('naive')
+    if args.restrict_available_genes:
+        assert not args.paired_loci
+        cmd += ' --only-genes IGHV1-18*01:IGHJ1*01'
+    if args.rearr_extra_args is not None:
+        cmd += ' %s' % args.rearr_extra_args
+    if args.single_light_locus is not None:
+        cmd += ' --single-light-locus %s' % args.single_light_locus
+    if args.n_procs > 1:
+        cmd += ' --n-procs %d' % args.n_procs
+    if args.slurm:
+        cmd += ' --batch-system slurm'
+    utils.simplerun(cmd, dryrun=args.dry_run, debug=True)
+
+# ----------------------------------------------------------------------------------------
+def get_vpar_val(parg, pval, debug=False): # get value of parameter/command line arg that is allowed to (but may not at the moment) be drawn from a variable distribution (note we have to pass in <pval> for args that are lists)
+    if args.parameter_variances is None or parg not in args.parameter_variances: # default: just use the single, fixed value from the command line
+        return pval
+    if args.n_gc_rounds is not None and parg in ['obs-times', 'n-sim-seqs-per-generation']:
+        raise Exception('shouldn\'t get here (see exception elsewhere)')
+    def sfcn(x): # just for dbg/exceptions
+        return str(int(x)) if parg != 'selection-strength' else ('%.2f' % x)
+    pvar = args.parameter_variances[parg]
+    if '..' in pvar: # list of allowed values NOTE pval is *not* used if we're choosing from several choices (ick, but not sure what else to do)
+        dbgstr = '[%s]' % pvar.replace('..', ', ')
+        return_val = numpy.random.choice([float(pv) for pv in pvar.split('..')])
+    else: # actual parameter variance (i know, this is ugly)
+        parg_bounds = {'min' : {'n-sim-seqs-per-generation' : 1}, 'max' : {}}
+        pmean = pval
+        pvar = float(pvar)
+        pmin, pmax = pmean - 0.5 * pvar, pmean + 0.5 * pvar
+        if pmin < 0:
+            raise Exception('min parameter value for %s less than 0 (from mean %s and half width %s)' % (parg, sfcn(pmean), sfcn(pvar)))
+        if parg == 'selection-strength' and pmax > 1:
+            raise Exception('max parameter value for %s greater than 1 (from mean %s and half width %s)' % (parg, sfcn(pmean), sfcn(pvar)))
+        if parg in parg_bounds['min'] and pmin < parg_bounds['min'][parg]:
+            raise Exception('min value too small for %s: %f < %f' % (parg, pmin, parg_bounds['min'][parg]))
+        if parg in parg_bounds['max'] and pmax > parg_bounds['max'][parg]:
+            raise Exception('max value too large for %s: %f > %f' % (parg, pmax, parg_bounds['max'][parg]))
+        dbgstr = '[%6s, %6s]' % (sfcn(pmin), sfcn(pmax))
+        return_val = numpy.random.uniform(pmin, pmax)
+    if parg != 'selection-strength':
+        return_val = int(return_val)
+    if debug:
+        print('      %30s --> %-7s %s' % (dbgstr, sfcn(return_val), parg))
+    return return_val
+
+# ----------------------------------------------------------------------------------------
+def run_bcr_phylo(naive_seq, outdir, ievent, uid_str_len=None, igcr=None):
+    if utils.output_exists(args, bcr_phylo_fasta_fname(outdir), outlabel='bcr-phylo', offset=4):
+        return None
+
+    cmd = '%s/bin/simulator.py' % bcr_phylo_path
+    if args.run_help:
+        cmd += ' --help'
+    else:
+        cmd += ' --lambda0 %f' % args.base_mutation_rate
+        if args.no_selection:
+            cmd += ' --no_selection'
+        else:
+            cmd += ' --selection_strength %f' % get_vpar_val('selection-strength', args.selection_strength)
+        for astr in ['obs-times', 'n-sim-seqs-per-generation']: # for search: obs_times n_sim_seqs_per_generation
+            aval = getattr(args, astr.replace('-', '_'))
+            tstr = ' '.join('%d' % get_vpar_val(astr, t) for t in (aval if args.n_gc_rounds is None else aval[igcr]))
+            cmd += ' --%s %s' % (astr.replace('n-sim-seqs-per-generation', 'n-to-sample').replace('-', '_'), tstr) # ick
+        cmd += ' --metric_for_target_dist %s' % args.metric_for_target_distance
+        if args.multifurcating_tree:
+            cmd += ' --multifurcating_tree'
+        if args.aa_paratope_positions is not None:
+            cmd += ' --aa_paratope_positions %s' % args.aa_paratope_positions
+        if args.aa_struct_positions is not None:
+            cmd += ' --aa_struct_positions %s' % args.aa_struct_positions
+        if args.dont_mutate_struct_positions:
+            cmd += ' --dont_mutate_struct_positions'
+        if args.skip_stops:
+            cmd += ' --skip_stops_when_mutating'
+        if args.allow_stops:
+            cmd += ' --allow_stops_in_functional_seqs'
+        cmd += ' --target_dist %d' % args.target_distance
+        cmd += ' --target_count %d' % args.target_count
+        cmd += ' --carry_cap %d' % get_vpar_val('carry-cap', args.carry_cap)
+        if not args.dont_observe_common_ancestors:
+            cmd += ' --observe_common_ancestors'
+        if args.leaf_sampling_scheme is not None:
+            cmd += ' --leaf_sampling_scheme %s' % args.leaf_sampling_scheme
+        if args.n_target_clusters is not None:
+            cmd += ' --n_target_clusters %d' % args.n_target_clusters
+            # cmd += ' --target_cluster_distance 1'
+        if args.tdist_scale is not None:
+            cmd += ' --tdist_scale %d' % args.tdist_scale
+        if args.tdist_weights is not None:
+            cmd += ' --tdist_weights %s' % args.tdist_weights
+        if args.min_target_distance is not None:
+            cmd += ' --min_target_distance %d' % args.min_target_distance
+        if args.min_effective_kd is not None:
+            cmd += ' --min_effective_kd %d' % args.min_effective_kd
+        if args.n_naive_seq_copies is not None:
+            cmd += ' --n_naive_seq_copies %d' % args.n_naive_seq_copies
+        if args.n_gc_rounds is not None and igcr > 0:
+            init_fn = '%s/init-seqs.fa' % outdir
+            if not args.dry_run:
+                isfos = utils.read_fastx(bcr_phylo_fasta_fname(evtdir(ievent, igcr=igcr - 1)))
+                if args.n_reentry_seqs is not None:
+                    if args.n_reentry_seqs > len(isfos):
+                        print('    %s --n-reentry-seqs %d greater than number of observed seqs %d in %s' % (utils.wrnstr(), args.n_reentry_seqs, len(isfos), bcr_phylo_fasta_fname(evtdir(ievent, igcr=igcr - 1))))
+                    isfos = numpy.random.choice(isfos, size=args.n_reentry_seqs, replace=False)
+                utils.write_fasta(init_fn, isfos)
+            cmd += ' --initial_seq_file %s' % init_fn
+
+        cmd += ' --debug %d' % args.debug
+        cmd += ' --n_tries 1000'
+        if args.context_depend == 0:
+            cmd += ' --no_context'
+        cmd += ' --no_plot'
+        if args.only_csv_plots:
+            cmd += ' --dont_write_hists'
+        cmd += ' --outbase %s/%s' % (outdir, args.extrastr)
+        cmd += ' --random_seed %d' % (args.seed + ievent) # NOTE if args.n_gc_rounds is set, it's *really* important that this is the same for each round since it ensures we have the same target sequence
+        if uid_str_len is not None:
+            cmd += ' --uid_str_len %d' % uid_str_len
+        cmd += ' --naive_seq %s' % naive_seq
+
+    if not os.path.exists(outdir):
+        os.makedirs(outdir)
+
+    cfo = None
+    if args.n_procs == 1:
+        utils.run_ete_script(cmd, dryrun=args.dry_run)
+    else:
+        cmd = utils.run_ete_script(cmd, return_for_cmdfos=True, dryrun=args.dry_run)
+        cfo = {'cmd_str' : cmd, 'workdir' : outdir, 'outfname' : bcr_phylo_fasta_fname(outdir)}
+    sys.stdout.flush()
+    return cfo
+
+# ----------------------------------------------------------------------------------------
+def parse_bcr_phylo_output(glfos, naive_events, outdir, ievent, uid_info):
+    # ----------------------------------------------------------------------------------------
+    def split_seqfos(seqfos):
+        hline, lline = naive_events[ievent]
+        hseqfos, lseqfos = [], []
+        for sfo in seqfos:
+            padseq = utils.pad_nuc_seq(hline['naive_seq'])
+            assert len(sfo['seq']) == len(padseq) + len(lline['naive_seq'])
+            hseqfos.append({'name' : sfo['name'], 'seq' : sfo['seq'][ : len(hline['naive_seq'])]})
+            lseqfos.append({'name' : sfo['name'], 'seq' : sfo['seq'][len(padseq) : ]})
+        return hseqfos, lseqfos
+    # ----------------------------------------------------------------------------------------
+    def read_kdvals(kdfname):
+        nodefo = {}
+        with open(kdfname) as kdfile:
+            reader = csv.DictReader(kdfile)
+            for line in reader:
+                nodefo[line['uid']] = {
+                    'kd' : float(line['kd']),
+                    'relative_kd' : float(line['relative_kd']),
+                    'lambda' : float(line['lambda']) if line['lambda'] != '' else None, # bcr-phylo used to not run the lambda update fcn after last iteratio, which resulted in empty lambda values, but it shouldn't happen any more (but leaving here for backwards compatibility)
+                    'target_index' : int(line['target_index']),
+                    'target_distance' : float(line['target_distance']),
+                    'time' : int(line['time']),
+                }
+        return nodefo
+    # ----------------------------------------------------------------------------------------
+    def get_mature_line(sfos, naive_line, glfo, nodefo, dtree, target_sfos, dup_translations=None, locus=None):
+        assert len(naive_line['unique_ids']) == 1 # enforces that we ran naive-only, 1-leaf partis simulation above
+        assert not indelutils.has_indels(naive_line['indelfos'][0]) # would have to handle this below
+        if args.debug:
+            utils.print_reco_event(naive_line)
+        reco_info = collections.OrderedDict()
+        for sfo in sfos:
+            mline = utils.get_non_implicit_copy(naive_line)
+            del mline['tree']
+            mline['unique_ids'] = [sfo['name']]
+            mline['seqs'] = [sfo['seq']]
+            mline['input_seqs'] = [sfo['seq']] # it's really important to set both the seqs (since they're both already in there from the naive line)
+            mline['duplicates'] = [[]]
+            reco_info[sfo['name']] = mline
+            try:
+                utils.add_implicit_info(glfo, mline)
+            except: # TODO not sure if I really want to leave this in long term, but it shouldn't hurt anything (it's crashing on unequal naive/mature sequence lengths, and I need this to track down which event it is) UPDATE: yeah it was just because something crashed in the middle of writing a .fa file
+                print('implicit info adding failed for ievent %d in %s' % (ievent, outdir))
+                lines = traceback.format_exception(*sys.exc_info())
+                print(utils.pad_lines(''.join(lines))) # NOTE this will still crash on the next line if implicit info adding failed
+        final_line = utils.synthesize_multi_seq_line_from_reco_info([sfo['name'] for sfo in sfos], reco_info)
+
+        ftree = copy.deepcopy(dtree)
+        if locus is not None:
+            def ltr(u): return u + '-' + locus
+            new_nodefo = {}
+            for u_old in nodefo:
+                new_nodefo[ltr(u_old)] = nodefo[u_old]
+            nodefo = new_nodefo
+            treeutils.translate_labels(ftree, [(u, ltr(u)) for u in final_line['unique_ids']])
+            final_line['unique_ids'] = [ltr(u) for u in final_line['unique_ids']]
+            assert len(sfos) == len(final_line['unique_ids'])
+            for iseq, sfo in enumerate(sfos):
+                naive_id = naive_line['unique_ids'][0]
+                assert naive_id.count('-') == 1
+                bstr = naive_id.replace('-'+locus, '')
+                pids = final_line['paired-uids'][iseq]
+                assert len(pids) == 1 and pids[0].find(bstr) == 0 and pids[0].count('-') == 1 and pids[0].split('-')[1] in utils.loci # if uid is xxx-igh, paired id shoud be e.g. xxx-igk
+                final_line['paired-uids'][iseq] = [p.replace(bstr, sfo['name']) for p in pids]
+
+        tmp_trns = {}
+        for iu, old_id in enumerate(final_line['unique_ids']): # NOTE this only translates the uids, for paired h/l we still need to go back through and translate paired uids
+            if old_id in uid_info['all_uids']:
+                new_id, uid_info['n_duplicate_uids'] = utils.choose_non_dup_id(old_id, uid_info['n_duplicate_uids'], uid_info['all_uids'])
+                tmp_trns[old_id] = new_id
+                final_line['unique_ids'][iu] = new_id
+            uid_info['all_uids'].add(final_line['unique_ids'][iu])
+        if len(tmp_trns) > 0:
+            for old_id, new_id in tmp_trns.items():
+                nodefo[new_id] = nodefo[old_id]
+                del nodefo[old_id]
+            treeutils.translate_labels(ftree, [(o, n) for o, n in tmp_trns.items()], expect_missing=True)
+            if dup_translations is not None:
+                dup_translations.update(tmp_trns)
+
+        if len(set(nodefo) - set(final_line['unique_ids'])) > 0: # uids in the kd file but not the <line> (i.e. not in the newick/fasta files) are probably just bcr-phylo discarding internal nodes
+            print('        in kd file, but missing from final_line (probably just internal nodes that bcr-phylo wrote to the tree without names): %s' % (set(nodefo) - set(final_line['unique_ids'])))
+        if len(set(final_line['unique_ids']) - set(nodefo)) > 0:
+            print('        in final_line, but missing from kdvals: %s' % ' '.join(set(final_line['unique_ids']) - set(nodefo)))
+        final_line['affinities'] = [1. / nodefo[u]['kd'] for u in final_line['unique_ids']]
+        if args.affinity_measurement_error is not None:
+            # before = final_line['affinities']
+            final_line['affinities'] = [numpy.random.normal(a, args.affinity_measurement_error * a) for a in final_line['affinities']]
+            # print ' '.join('%.4f'%v for v in before)
+            # print ' '.join('%.4f'%v for v in final_line['affinities'])
+        final_line['relative_affinities'] = [1. / nodefo[u]['relative_kd'] for u in final_line['unique_ids']]
+        final_line['lambdas'] = [nodefo[u]['lambda'] for u in final_line['unique_ids']]
+        final_line['nearest_target_indices'] = [nodefo[u]['target_index'] for u in final_line['unique_ids']]
+        final_line['min_target_distances'] = [nodefo[u]['target_distance'] for u in final_line['unique_ids']]
+        final_line['generation-times'] = [nodefo[u]['time'] for u in final_line['unique_ids']]
+        ftree.scale_edges(1. / numpy.mean([len(s) for s in final_line['seqs']])) # note that if --paired-loci is set then most edges will still be the wrong length (compared to the mutations in the single-locus sequences), i.e. best not to use this much until treeutils.combine_selection_metrics(), where we rescale to the full h+l length
+        # treeutils.compare_tree_distance_to_shm(ftree, final_line, debug=True)
+        if args.debug:
+            print(utils.pad_lines(treeutils.get_ascii_tree(dendro_tree=ftree), padwidth=12))
+        final_line['tree'] = ftree.as_string(schema='newick')
+        if args.debug:
+            utils.print_reco_event(final_line) #, extra_print_keys=['lambdas'])
+
+        tmp_event = RecombinationEvent(glfo) # I don't want to move the function out of event.py right now
+        tmp_event.set_reco_id(final_line, irandom=ievent) # not sure that setting <irandom> here actually does anything
+        final_line['target_seqs'] = [[tfo['seq'] for tfo in target_sfos] for _ in final_line['unique_ids']] # NOTE it would be nice to put this in a per-family key, but it ends up we want it to behave like an input meta info key, and input meta keys need to be per-seq since they're inherently a property of each sequence. So instead we duplicate this info across all seqs to which it applies
+        return final_line
+    # ----------------------------------------------------------------------------------------
+    def translate_duplicate_pids(mpair, dup_translations):
+        if len(dup_translations) == 0:
+            return
+        assert len(set(len(l['unique_ids']) for l in mpair)) == 1 # make sure h and l annotations have the same length
+        for atn1, atn2 in itertools.permutations(mpair, 2):
+            # print ':'.join([utils.color('red' if atn1['paired-uids'][i]!=[u] else None, u) for i, u in enumerate(atn2['unique_ids'])])
+            for pids, uid in zip(atn1['paired-uids'], atn2['unique_ids']): # this is just to double check things, so could be removed
+                assert len(pids) == 1
+                if pids[0] != uid:
+                    assert pids[0] in dup_translations and dup_translations[pids[0]] == uid
+                    del dup_translations[pids[0]]
+            atn1['paired-uids'] = [[u] for u in atn2['unique_ids']] # seems a bit hackey to reset all of them, not just the translated one, but whatever
+
+    # ----------------------------------------------------------------------------------------
+    # extract kd values from pickle file (used to need a separate script since ete3 needed python 3, but now probably doesn't)
+    kdfname, nwkfname = '%s/kd-vals.csv' % outdir, '%s/simu.nwk' % outdir
+    if not utils.output_exists(args, kdfname, outlabel='kd/nwk conversion', offset=4): # eh, don't really need to check for both kd and nwk file, chances of only one being missing are really small, and it'll just crash when it looks for it a couple lines later
+        cmd = './bin/read-bcr-phylo-trees.py --pickle-tree-file %s/%s_lineage_tree.p --kdfile %s --newick-tree-file %s' % (outdir, args.extrastr, kdfname, nwkfname)
+        utils.run_ete_script(cmd, debug=args.n_procs==1)
+    nodefo = read_kdvals(kdfname)
+    dtree = treeutils.get_dendro_tree(treefname=nwkfname)
+    seqfos = utils.read_fastx(bcr_phylo_fasta_fname(outdir)) # output mutated sequences from bcr-phylo
+    target_seqfos = utils.read_fastx('%s/%s_targets.fa' % (outdir, args.extrastr))
+    if args.paired_loci:
+        mpair = []
+        dup_translations = {}
+        for tline, sfos, tsfos in zip(naive_events[ievent], split_seqfos(seqfos), split_seqfos(target_seqfos)):
+            mpair.append(get_mature_line(sfos, tline, glfos[tline['loci'][0]], nodefo, dtree, target_seqfos, dup_translations=dup_translations, locus=tline['loci'][0]))
+        translate_duplicate_pids(mpair, dup_translations)
+        return mpair
+    else:
+        return get_mature_line(seqfos, naive_events[ievent], glfos[0], nodefo, dtree, target_seqfos)
+
+# ----------------------------------------------------------------------------------------
+def read_rearrangements():
+    if args.paired_loci:
+        lp_infos = paircluster.read_lpair_output_files(lpairs(), naive_fname, dbgstr='naive simulation')
+        naive_events = paircluster.get_all_antn_pairs(lp_infos)
+        glfos, _, _ = paircluster.concat_heavy_chain(lpairs(), lp_infos) # per-locus glfos with concat'd heavy chain
+    else:
+        glfo, naive_events, _ = utils.read_output(naive_fname(None))
+        glfos = [glfo]
+    return glfos, naive_events
+
+# ----------------------------------------------------------------------------------------
+def write_simulation(glfos, mutated_events, unsampled=False):
+    opath = spath('mutated', unsampled=unsampled)
+    print('    writing%s annotations to %s' % ('' if not args.tpsample else (' unsampled' if unsampled else ' timepoint sampled'), opath))
+    mheads = []
+    if args.n_gc_rounds is not None:
+        mheads += ['gc-rounds', 'generation-times']
+    if args.tpsample:
+        mheads += ['timepoints']
+    headers = utils.simulation_headers + mheads
+    if args.paired_loci:
+        lp_infos = {}
+        for lpair in lpairs():
+            lpfos = {k : {} for k in ['glfos', 'antn_lists', 'cpaths']} # cpaths i think don't get used
+            mevents = [(hl, ll) for hl, ll in mutated_events if [hl['loci'][0], ll['loci'][0]] == lpair] # grab the events for this h/l pair
+            for ltmp, levents in zip(lpair, zip(*mevents)):
+                lpfos['antn_lists'][ltmp] = levents
+                lpfos['glfos'][ltmp] = glfos[ltmp]
+            lp_infos[tuple(lpair)] = lpfos
+        sfcn = get_unsampled_simfn if (args.tpsample and unsampled) else get_simfn
+        paircluster.write_lpair_output_files(lpairs(), lp_infos, sfcn, headers)
+        glfos, antn_lists, _ = paircluster.concat_heavy_chain(lpairs(), lp_infos) # per-locus glfos with concat'd heavy chain
+        paircluster.write_concatd_output_files(glfos, antn_lists, sfcn, headers)
+        outfos, metafos = paircluster.get_combined_outmetafos(antn_lists, extra_meta_headers=mheads)
+        paircluster.write_combined_fasta_and_meta(opath+'/all-seqs.fa', opath+'/meta.yaml', outfos, metafos)
+    else:
+        utils.write_annotations(opath, glfos[0], mutated_events, headers)
+
+# ----------------------------------------------------------------------------------------
+def combine_gc_rounds(glfos, mevt_lists):
+    # ----------------------------------------------------------------------------------------
+    def fix_evt(igcr, sum_time, evt, all_gtimes, ltmp=None):
+        evt['generation-times'] = [t + sum_time for t in evt['generation-times']]
+        evt['gc-rounds'] = [igcr for _ in evt['unique_ids']]
+        if ltmp is None:
+            def tfcn(u, t): return '%s-%s'%(u, t)
+        else:
+            def tfcn(u, t): lstr = u.split('-')[-1]; assert lstr in utils.loci; return u.replace('-'+lstr, '-%s-%s'%(t, lstr))
+        trns = {u : tfcn(u, t) for u, t in zip(evt['unique_ids'], evt['generation-times'])}
+        if ltmp is not None:
+            trns.update({p : tfcn(p, t) for pids, t in zip(evt['paired-uids'], evt['generation-times']) for p in pids})
+        utils.translate_uids([evt], trns=trns, translate_pids=args.paired_loci) # kind of annoying to add the timepoint to the uid, but otherwise we get duplicate uids in different rounds (and we can't change the random seed atm, or else the target seqs will be out of whack)
+        all_gtimes |= set(evt['generation-times'])
+    # ----------------------------------------------------------------------------------------
+    if utils.output_exists(args, get_simfn('igh'), outlabel='mutated simu', offset=4):
+        return None
+    assert len(mevt_lists) == args.n_gc_rounds
+    assert len(set(len(l) for l in mevt_lists)) == 1 # all rounds should have the same number of events
+    sum_time = 0
+    assert len(args.obs_times) == args.n_gc_rounds # also checked elsewhere
+    all_gtimes, stlist = set(), []
+    for igcr in range(args.n_gc_rounds):
+        if args.paired_loci:
+            for epair in mevt_lists[igcr]:
+                for evt in epair:
+                    fix_evt(igcr, sum_time, evt, all_gtimes, ltmp=evt['loci'][0])
+        else:
+            for evt in mevt_lists[igcr]:
+                fix_evt(igcr, sum_time, evt, all_gtimes)
+        sum_time += args.obs_times[igcr][-1]
+        stlist.append(sum_time)
+    print('    merging %d events over %d gc rounds with final generation times%s: %s' % (len(mevt_lists[0]), args.n_gc_rounds, '' if args.dont_observe_common_ancestors else ' (including observed common ancestors)', ' '.join(utils.color('blue' if t in stlist else None, str(t)) for t in sorted(all_gtimes))))
+    merged_events = []
+    for ievt in range(len(mevt_lists[0])):
+        if args.paired_loci:
+            mpair = []
+            lpair = [l['loci'][0] for l in mevt_lists[0][0]]
+            for ilocus, ltmp in enumerate(lpair):
+                mgevt = utils.combine_events(glfos[ltmp], [evts[ievt][ilocus] for evts in mevt_lists], meta_keys=['gc-rounds', 'generation-times'])
+                mpair.append(mgevt)
+            merged_events.append(mpair)
+        else:
+            mgevt = utils.combine_events(glfos[0], [evts[ievt] for evts in mevt_lists], meta_keys=['gc-rounds', 'generation-times'])
+            merged_events.append(mgevt)
+
+    write_simulation(glfos, merged_events, unsampled=args.tpsample)
+
+    return merged_events
+
+# ----------------------------------------------------------------------------------------
+def sample_tp_seqs(glfos, evt_list, l_evts=None, ltmp=None):
+    id_list = [u for l in evt_list for u in l['unique_ids']]
+    if len(set(id_list)) != len(id_list):
+        raise Exception('duplicate ids in final events') # shouldn't be able to get to here, but if it does it'll break stuff below
+    for fevt in evt_list:
+        if 'timepoints' in fevt: # shouldn't be in there
+            print('    %s \'timepoints\' already in event (overwriting)' % utils.wrnstr())
+        fevt['timepoints'] = [None for _ in fevt['unique_ids']]
+    all_gtimes = set(t for l in evt_list for t in l['generation-times'])
+    gt_ids = {t : [] for t in all_gtimes} # map from each generation time to list of all remaining uids with that time
+    for tline in evt_list:
+        for tid, gtime in zip(tline['unique_ids'], tline['generation-times']):
+            gt_ids[gtime].append(tid)
+    print('                     N    generation    N        N')
+    print('      timepoint    total      time    chosen  remaining')
+    for tpfo in args.sequence_sample_times:
+        if any(t not in all_gtimes for t in tpfo['times']):
+            raise Exception('generation time %s not among actual final times: %s' % (' '.join(str(t) for t in tpfo['times'] if t not in all_gtimes), ' '.join(str(t) for t in sorted(all_gtimes))))
+        allowed_uids = [u for gt in tpfo['times'] for u in gt_ids[gt]]
+        if tpfo['n'] > len(allowed_uids):
+            print('    %s not enough allowed seqs remain (%d > %d, probably didn\'t sample enough sequences at allowed times %s)' % (utils.wrnstr(), tpfo['n'], len(allowed_uids), ' '.join(str(t) for t in tpfo['times'])))
+        chosen_ids = numpy.random.choice(allowed_uids, size=tpfo['n'], replace=False)
+        if len(chosen_ids) != tpfo['n']:
+            print('    %s couldn\'t choose enough seqs (only got %d)' % (utils.wrnstr(), len(chosen_ids)))
+        n_chosen = {}
+        for gtime in tpfo['times']:
+            n_before = len(gt_ids[gtime])
+            gt_ids[gtime] = [u for u in gt_ids[gtime] if u not in chosen_ids]
+            n_chosen[gtime] = n_before - len(gt_ids[gtime])
+        for igt, gtime in enumerate(tpfo['times']):
+            print('      %12s %3s        %3d     %4d   %4d' % (tpfo['name'] if igt==0 else '', '%d'%tpfo['n'] if igt==0 else '', gtime, n_chosen[gtime], len(gt_ids[gtime])))
+        for fevt in evt_list:
+            fevt['timepoints'] = [tpfo['name'] if u in chosen_ids else t for u, t in zip(fevt['unique_ids'], fevt['timepoints'])]
+    for ievt, fevt in enumerate(evt_list):
+        iseqs_to_keep = [i for i, t in enumerate(fevt['timepoints']) if t is not None]
+        if l_evts is not None:
+            hevt, levt = fevt, l_evts[ievt]
+            hloc, lloc = [e['loci'][0] for e in [hevt, levt]]
+            def htrans(u, hloc, lloc): return u.replace('-'+hloc, '-'+lloc)
+            assert [htrans(u, hloc, lloc) for u in hevt['unique_ids']] == levt['unique_ids']
+            levt['timepoints'] = [t for t in hevt['timepoints']] # NOTE *has* to happen before restrict_to_iseqs() (duh)
+            utils.restrict_to_iseqs(levt, iseqs_to_keep, glfos[levt['loci'][0]], remove_tree=True)
+        utils.restrict_to_iseqs(fevt, iseqs_to_keep, glfos[0 if ltmp is None else ltmp], remove_tree=True)
+        # utils.print_reco_event(levt, extra_print_keys=['timepoints', 'gc-rounds', 'generation-times'])
+
+# ----------------------------------------------------------------------------------------
+def write_timepoint_sampled_sequences(glfos, final_events):
+    if utils.output_exists(args, get_simfn('igh'), outlabel='mutated simu', offset=4):
+        return None
+    if args.paired_loci:
+        h_evts, l_evts = list(zip(*final_events))
+        sample_tp_seqs(glfos, h_evts, l_evts=l_evts, ltmp=h_evts[0]['loci'][0])
+    else:
+        sample_tp_seqs(glfos, final_events)
+
+    write_simulation(glfos, final_events)
+
+# ----------------------------------------------------------------------------------------
+def simulate(igcr=None):
+
+    if igcr in [None, 0]:
+        rearrange()
+
+    glfos, naive_events = read_rearrangements()
+    if args.dry_run:
+        for ievent in range(args.n_sim_events):
+            _ = run_bcr_phylo('<NAIVE_SEQ>', evtdir(ievent, igcr=igcr), ievent, igcr=igcr)
+        return None, None
+    if args.restrict_to_single_naive_seq:
+        print('    --restrict-to-single-naive-seq: using same naive event for all mutation simulations')
+        assert len(naive_events) == 1
+        naive_events = [naive_events[0] for _ in range(args.n_sim_events)]
+    else:
+        assert len(naive_events) == args.n_sim_events
+
+    outdirs = [evtdir(i, igcr=igcr) for i in range(len(naive_events))]
+
+    start = time.time()
+    cmdfos = []
+    if args.n_procs > 1:
+        print('    starting %d events%s' % (len(naive_events), '' if args.n_procs==1 else ' with N max simultaneous procs %d'%args.n_procs))
+    uid_str_len = args.min_ustr_len # UPDATE don't need to increase this any more since I'm handling duplicates when above + int(math.log(len(naive_events), 7)) # if the final sample's going to contain many trees, it's worth making the uids longer so there's fewer collisions/duplicates (note that this starts getting pretty slow if it's bigger than 7 or so)
+    for ievent, outdir in enumerate(outdirs):
+        if args.n_sim_events > 1 and args.n_procs == 1:
+            print('  %s %d' % (utils.color('blue', 'ievent'), ievent))
+        if args.paired_loci:
+            hnseq, lnseq = [l['naive_seq'] for l in naive_events[ievent]]
+            naive_seq = utils.pad_nuc_seq(hnseq) + lnseq
+        else:
+            naive_seq = naive_events[ievent]['naive_seq']
+        cfo = run_bcr_phylo(naive_seq, outdir, ievent, uid_str_len=uid_str_len, igcr=igcr) # if n_procs > 1, doesn't run, just returns cfo
+        if cfo is not None:
+            print('      %s %s' % (utils.color('red', 'run'), cfo['cmd_str']))
+            cmdfos.append(cfo)
+    if args.n_procs > 1 and len(cmdfos) > 0:
+        utils.run_cmds(cmdfos, shell=True, n_max_procs=args.n_procs, batch_system='slurm' if args.slurm else None, allow_failure=True, debug='print')
+    print('    bcr-phylo run time: %.1fs' % (time.time() - start))
+
+    if utils.output_exists(args, get_simfn('igh'), outlabel='mutated simu', offset=4): # i guess if it crashes during the plotting just below, this'll get confused
+        return None, None
+
+    start = time.time()
+    uid_info = {'all_uids' : set(), 'n_duplicate_uids' : 0} # stuff just for dealing with duplicate uids
+    mutated_events = []
+    for ievent, outdir in enumerate(outdirs):
+        mutated_events.append(parse_bcr_phylo_output(glfos, naive_events, outdir, ievent, uid_info))
+    if uid_info['n_duplicate_uids'] > 0:
+        print('  %s renamed %d duplicate uids from %d bcr-phylo events' % (utils.color('yellow', 'warning'), uid_info['n_duplicate_uids'], len(mutated_events)))
+    print('    parsing time: %.1fs' % (time.time() - start))
+
+    if igcr is None:
+        write_simulation(glfos, mutated_events, unsampled=args.tpsample)
+
+    if not args.only_csv_plots:
+        import python.lbplotting as lbplotting
+        for ievent, outdir in enumerate(outdirs):
+            if args.paired_loci:
+                lpair = [l['loci'][0] for l in mutated_events[ievent]]
+                evtlist = mutated_events[ievent]
+            else:
+                lpair = None
+                evtlist = [mutated_events[ievent]]
+            lbplotting.plot_bcr_phylo_simulation(outdir + '/plots', outdir, evtlist, args.extrastr, lbplotting.metric_for_target_distance_labels[args.metric_for_target_distance], lpair=lpair)
+            # utils.simplerun('cp -v %s/simu_collapsed_runstat_color_tree.svg %s/plots/' % (outdir, outdir))
+
+    return glfos, mutated_events
+
+# ----------------------------------------------------------------------------------------
+def simulseq_args():
+    cstr = ''
+    if args.restrict_to_single_naive_seq:
+        print('    note: using --all-seqs-simultaneous because --restrict-to-single-naive-seq was set')
+        cstr += ' --all-seqs-simultaneous'
+    if args.n_gc_rounds is None and not args.tpsample:
+        cstr += ' --is-simu'
+        if '--all-seqs-simultaneous' not in cstr:
+            cstr += ' --simultaneous-true-clonal-seqs'
+    elif args.n_sim_events == 1:
+        print('    %s not using --is-simu since --n-gc-rounds or --sequence-sample-time-fname are set, so e.g. plots won\'t use true info, and true tree won\'t be set' % utils.wrnstr())
+        if '--all-seqs-simultaneous' not in cstr:
+            cstr += ' --all-seqs-simultaneous'
+    else:
+        print('    %s not using any of --is-simu or --simultaneous-true-clonal-seqs since either --n-gc-rounds or --sequence-sample-time-fname are set with more than one event, so e.g. plots won\'t use true info, and true tree won\'t be set' % utils.wrnstr())
+    return cstr
+
+# ----------------------------------------------------------------------------------------
+def cache_parameters():
+    if utils.output_exists(args, ifname('params'), outlabel='parameters', offset=4):
+        return
+    cmd = './bin/partis cache-parameters --random-seed %d --no-indels' % args.seed # forbid indels because in the very rare cases when we call them, they're always wrong, and then they screw up the simultaneous true clonal seqs option
+    cmd += simulseq_args()
+    fstr = ' --paired-loci --paired-indir %s --paired-outdir %s' if args.paired_loci else ' --infname %s --parameter-dir %s'
+    cmd += fstr % (spath('mutated'), ipath('params'))
+    if args.all_inference_plots:
+        cmd += ' --plotdir %s' % ('paired-outdir' if args.paired_loci else ipath('plots'))
+        if args.meta_info_key_to_color is not None:
+            cmd += ' --meta-info-key-to-color %s' % args.meta_info_key_to_color
+    if args.inf_extra_args is not None:
+        cmd += ' %s' % args.inf_extra_args
+    if args.n_procs > 1:
+        cmd += ' --n-procs %d' % args.n_procs
+    if args.slurm:
+        cmd += ' --batch-system slurm'
+    if args.n_max_queries is not None:
+        cmd += ' --n-max-queries %d' % args.n_max_queries
+    utils.simplerun(cmd, debug=True, dryrun=args.dry_run)
+    sys.stdout.flush()
+
+# ----------------------------------------------------------------------------------------
+def partition():
+    if utils.output_exists(args, ifname('partition'), outlabel='partition', offset=4):
+        return
+    cmd = './bin/partis partition --random-seed %d' % args.seed
+    cmd += simulseq_args()
+    fstr = ' --paired-loci --paired-indir %s --paired-outdir %s' if args.paired_loci else (' --infname %%s --parameter-dir %s --outfname %%s' % ipath('params'))
+    cmd += fstr % (spath('mutated'), ipath('partition'))
+    # --write-additional-cluster-annotations 0:5 # I don't think there was really a good reason for having this
+    if not args.dont_get_tree_metrics:
+        cmd += ' --get-selection-metrics'
+        if args.tree_inference_method is not None:
+            cmd += ' --tree-inference-method %s' % args.tree_inference_method
+    if not args.dont_get_tree_metrics or args.all_inference_plots:
+        cmd += ' --plotdir %s' % ('paired-outdir' if args.paired_loci else ipath('plots'))
+        if not args.all_inference_plots:
+            cmd += ' --no-partition-plots'
+    if args.meta_info_key_to_color is not None:
+        cmd += ' --meta-info-key-to-color %s' % args.meta_info_key_to_color
+    if args.inf_extra_args is not None:
+        cmd += ' %s' % args.inf_extra_args
+    if args.lb_tau is not None:
+        cmd += ' --lb-tau %f' % args.lb_tau
+    if args.n_procs > 1:
+        cmd += ' --n-procs %d' % args.n_procs
+    if args.slurm:
+        cmd += ' --batch-system slurm'
+    if args.n_max_queries is not None:
+        cmd += ' --n-max-queries %d' % args.n_max_queries
+    if args.extra_smetric_plots is not None:
+        cmd += ' --selection-metric-plot-cfg %s' % ':'.join(treeutils.default_plot_cfg + args.extra_smetric_plots + ['distr'])
+    utils.simplerun(cmd, debug=True, dryrun=args.dry_run)
+    # cmd = './bin/partis get-selection-metrics --outfname %s/partition.yaml' % infdir()
+    # utils.simplerun(cmd, debug=True) #, dryrun=True)
+    sys.stdout.flush()
+
+# ----------------------------------------------------------------------------------------
+all_actions = ('simu', 'cache-parameters', 'partition')
+class MultiplyInheritedFormatter(argparse.RawTextHelpFormatter, argparse.ArgumentDefaultsHelpFormatter):
+    pass
+formatter_class = MultiplyInheritedFormatter
+parser = argparse.ArgumentParser(formatter_class=MultiplyInheritedFormatter)
+parser.add_argument('--actions', default=':'.join(all_actions), help='which actions to run')
+parser.add_argument('--base-outdir', default='%s/partis/bcr-phylo/test' % os.getenv('fs', default=os.getenv('HOME')), help='base output dir')
+parser.add_argument('--debug', type=int, default=0, choices=[0, 1, 2])
+parser.add_argument('--run-help', action='store_true')
+parser.add_argument('--overwrite', action='store_true')
+parser.add_argument('--only-csv-plots', action='store_true')
+parser.add_argument('--dont-get-tree-metrics', action='store_true', help='Partition without getting tree metrics, presumably because you want to run them yourself later')
+parser.add_argument('--tree-inference-method')
+parser.add_argument('--seed', type=int, default=1, help='random seed (note that bcr-phylo doesn\'t seem to support setting its random seed)')
+parser.add_argument('--n-procs', type=int, default=1)
+parser.add_argument('--extrastr', default='simu', help='doesn\'t really do anything, but it\'s required by bcr-phylo EDIT ok it actually doesn\'t need it, it\'s just that the output files look weird without it because they start with \'_\' if it\'s empty')
+parser.add_argument('--n-sim-seqs-per-generation', default='100', help='Number of sequences to sample at each time in --obs-times.')
+parser.add_argument('--n-sim-events', type=int, default=1, help='number of simulated rearrangement events')
+parser.add_argument('--n-max-queries', type=int, help='during parameter caching and partitioning, stop after reading this many queries from simulation file (useful for dtr training samples where we need massive samples to actually train the dtr, but for testing various metrics need far smaller samples).')
+parser.add_argument('--obs-times', default='100:120', help='Times (generations) at which to select sequences for observation. Note that this is the time that the sequences existed in/exited the gc, not necessarily the time at which we "sequenced" them. If --n-gc-rounds is set, this must be a colon-separated list of comma-separated lists (see help under that arg).')
+parser.add_argument('--sequence-sample-time-fname', help='Times at which we "sample" for sequencing, i.e. at which we draw blood (as opposed to --obs-times, which is the generation time at which a cell leaves the gc). Specified in a yaml file as a list of key : val pairs, with key the timepoint label and values a dict with keys \'n\' (total number of sequences) and \'times\' (dict keyed by gc round time (if --n-gc-rounds is set, otherwise just a list) of generation times from which to sample those \'n\' sequences uniformly at random). If set, a new output file/dir is created by inserting \'timepoint-sampled\' at end of regular output file. See example in data/sample-seqs.yaml.')
+parser.add_argument('--n-naive-seq-copies', type=int, help='see bcr-phylo docs')
+parser.add_argument('--n-gc-rounds', type=int, help='number of rounds of gc entry, i.e. if set, upon gc completion we choose --n-reentry-seqs sampled seqs with which to seed a new (otherwise identical) gc reaction. Results for each gc round N are written to subdirs round-N/ for each event, then all sampled sequences from all reactions are collected into the normal output file locations, with input meta info key \'gc-rounds\' specifying their gc round. If this arg is set, then --obs-times must be a colon-separated list (of length --n-gc-rounds) of comma-separated lists, where each sample time is relative to the *start* of that round.')
+parser.add_argument('--n-reentry-seqs', type=int, help='number of sampled seqs from previous round (chosen randomly) to inject into the next gc round (if not set, we take all of them).')
+parser.add_argument('--carry-cap', type=int, default=1000, help='carrying capacity of germinal center')
+parser.add_argument('--target-distance', type=int, default=15, help='Desired distance (number of non-synonymous mutations) between the naive sequence and the target sequences.')
+parser.add_argument('--tdist-scale', type=int, help='see bcr-phylo docs')
+parser.add_argument('--tdist-weights', help='see bcr-phylo docs')
+parser.add_argument('--metric-for-target-distance', default='aa', choices=['aa', 'nuc', 'aa-sim-ascii', 'aa-sim-blosum'], help='see bcr-phylo docs')
+parser.add_argument('--target-count', type=int, default=1, help='Number of target sequences to generate.')
+parser.add_argument('--n-target-clusters', type=int, help='number of cluster into which to divide the --target-count target seqs (see bcr-phylo docs)')
+parser.add_argument('--min-target-distance', type=int, help='see bcr-phylo docs')
+parser.add_argument('--min-effective-kd', type=float, help='see bcr-phylo docs')
+parser.add_argument('--affinity-measurement-error', type=float, help='Fractional measurement error on affinity: if set, replace \'affinities\' in final partis line with new values smeared with a normal distribution with this fractional width, i.e. <a> is replaced with a value drawn from a normal distribution with mean <a> and width <f>*<a> for this fraction <f>.')
+parser.add_argument('--base-mutation-rate', type=float, default=0.365, help='see bcr-phylo docs')
+parser.add_argument('--selection-strength', type=float, default=1., help='see bcr-phylo docs')
+parser.add_argument('--context-depend', type=int, default=0, choices=[0, 1]) # i wish this could be a boolean, but having it int makes it much much easier to interface with the scan infrastructure in cf-tree-metrics.py
+parser.add_argument('--aa-paratope-positions', help='see bcr-phylo docs')
+parser.add_argument('--aa-struct-positions', help='see bcr-phylo docs')
+parser.add_argument('--dont-mutate-struct-positions', action='store_true', help='see bcr-phylo docs')
+parser.add_argument('--skip-stops', action='store_true', help='see bcr-phylo docs')
+parser.add_argument('--allow-stops', action='store_true', help='see bcr-phylo docs')
+parser.add_argument('--no-selection', action='store_true', help='see bcr-phylo docs')
+parser.add_argument('--multifurcating-tree', action='store_true', help='see bcr-phylo docs')
+parser.add_argument('--restrict-available-genes', action='store_true', help='restrict v and j gene choice to one each (so context dependence is easier to plot)')
+parser.add_argument('--restrict-to-single-naive-seq', action='store_true', help='restrict all events to use the same naive sequence')
+parser.add_argument('--lb-tau', type=float, help='')
+parser.add_argument('--dont-observe-common-ancestors', action='store_true')
+parser.add_argument('--leaf-sampling-scheme', help='see bcr-phylo help')
+parser.add_argument('--parameter-variances', help='if set, parameters vary from family to family in one of two ways 1) the specified parameters are drawn from a uniform distribution of the specified width (with mean from the regular argument) for each family. Format example: n-sim-seqs-per-generation,10:carry-cap,150 would give --n-sim-seqs-per-generation +/-5 and --carry-cap +/-75, or 2) parameters for each family are chosen from a \'..\'-separated list, e.g. obs-times,75..100..150')
+parser.add_argument('--slurm', action='store_true')
+parser.add_argument('--paired-loci', action='store_true')
+parser.add_argument('--parameter-plots', action='store_true', help='DEPRECATED')
+parser.add_argument('--all-inference-plots', action='store_true')
+parser.add_argument('--meta-info-key-to-color')
+parser.add_argument('--single-light-locus', help='set to igk or igl if you want only that one; otherwise each event is chosen at random (see partis help)')
+parser.add_argument('--rearr-extra-args', help='')
+parser.add_argument('--inf-extra-args', help='')
+parser.add_argument('--dry-run', action='store_true')
+parser.add_argument('--mutated-outpath', action='store_true', help='write final (mutated) output file[s] to --base-outdir, rather than the default of burying them in subdirs with intermediate files')
+parser.add_argument('--extra-smetric-plots', default=':'.join(treeutils.default_plot_cfg))
+parser.add_argument('--min-ustr-len', type=int, default=5, help='min length of hashed uid strs (longer makes collisions less likely, but it\'s hard to avoid them entirely since independent bcr-phylo procs choose the uids for each family)')
+
+args = parser.parse_args()
+
+if args.parameter_plots:
+    print('    %s transferring deprecated arg --parameter-plots to --all-inference-plots' % utils.wrnstr())
+    args.all_inference_plots = True
+delattr(args, 'parameter_plots')
+if args.seed is not None:
+    numpy.random.seed(args.seed)
+args.obs_times = utils.get_arg_list(args.obs_times, intify=True, list_of_lists=args.n_gc_rounds is not None)
+args.n_sim_seqs_per_generation = utils.get_arg_list(args.n_sim_seqs_per_generation, intify=True, list_of_lists=args.n_gc_rounds is not None)
+args.actions = utils.get_arg_list(args.actions, choices=all_actions)
+args.parameter_variances = utils.get_arg_list(args.parameter_variances, key_val_pairs=True, choices=['selection-strength', 'obs-times', 'n-sim-seqs-per-generation', 'carry-cap', 'metric-for-target-distance']) # if you add more, make sure the bounds enforcement and conversion stuff in get_vpar_val() are still ok
+args.extra_smetric_plots = utils.get_arg_list(args.extra_smetric_plots, choices=treeutils.all_plot_cfg)
+if args.rearr_extra_args is not None:
+    args.rearr_extra_args = args.rearr_extra_args.replace('@', ' ') # ick this sucks
+if args.inf_extra_args is not None:
+    args.inf_extra_args = args.inf_extra_args.replace('@', ' ') # ick this sucks
+if args.affinity_measurement_error is not None:
+    assert args.affinity_measurement_error >= 0
+    if args.affinity_measurement_error > 1:
+        print('  note: --affinity-measurement-error %.2f is greater than 1 -- this is fine as long as it\'s on purpose, but will result in smearing by a normal with width larger than each affinity value (and probably result in some negative values).' % args.affinity_measurement_error)
+if args.n_gc_rounds is not None:
+    assert len(args.obs_times) == args.n_gc_rounds
+    for otlist in args.obs_times:
+        if otlist != sorted(otlist): # various things assume it's sorted
+            raise Exception('obs times within each gc round must be sorted')
+    otstrs = ['%s' % ' '.join(str(t) for t in otlist) for otlist in args.obs_times]
+    def fgt(i, t): return t + sum(args.obs_times[j][-1] for j in range(i))
+    fgstrs = ['%s' % ' '.join(str(fgt(i, t)) for t in otlist) for i, otlist in enumerate(args.obs_times)]
+    print('    --obs-times at each of %d gc rounds: %s  (final generation times: %s)' % (args.n_gc_rounds, ', '.join(otstrs), ', '.join(fgstrs)))
+    if len(args.n_sim_seqs_per_generation) != args.n_gc_rounds and len(args.n_sim_seqs_per_generation) == 1:
+        args.n_sim_seqs_per_generation = [args.n_sim_seqs_per_generation[0] for _ in range(args.n_gc_rounds)]
+    assert len(args.n_sim_seqs_per_generation) == args.n_gc_rounds
+    if args.parameter_variances is not None: # don't feel like implementing this atm
+        if any(a in args.parameter_variances for a in ['obs-times', 'n-sim-seqs-per-generation']):
+            raise Exception('haven\'t implemented parameter variances for --obs-times/--n-sim-seqs-per-generation with multiple gc rounds')
+setattr(args, 'sequence_sample_times', None)
+setattr(args, 'tpsample', False)
+if args.sequence_sample_time_fname is not None:
+    print('  reading timepoint sample times from %s' % args.sequence_sample_time_fname)
+    with open(args.sequence_sample_time_fname) as sfile:
+        yamlfo = yaml.load(sfile, Loader=yaml.CLoader)
+    if args.n_gc_rounds is not None: # have to translate the "local" gc round times to final times, then also concatenate them into ones list for all gc rounds
+        sum_time = 0
+        for igcr in range(args.n_gc_rounds):
+            for tpfo in yamlfo:
+                if igcr not in tpfo['times']:
+                    continue
+                tpfo['times'][igcr] = [t + sum_time for t in tpfo['times'][igcr]]
+            sum_time += args.obs_times[igcr][-1]
+        for tpfo in yamlfo:
+            tpfo['times'] = sorted(t for tlist in tpfo['times'].values() for t in tlist) # should already be sorted, but whatever
+    args.sequence_sample_times = yamlfo
+    args.tpsample = True # just a shorthand
+delattr(args, 'sequence_sample_time_fname')
+
+assert args.extrastr == 'simu' # I think at this point this actually can't be changed without changing some other things
+
+# ----------------------------------------------------------------------------------------
+if 'simu' in args.actions:
+    if args.n_gc_rounds is None:
+        glfos, final_events = simulate()
+    else:
+        mevt_lists = [] # list (for each gc round) of [sub]lists, where each sublist is the sampled seqs from that round for each event
+        for igcr in range(args.n_gc_rounds):
+            glfos, mevts = simulate(igcr=igcr)
+            mevt_lists.append(mevts)
+        if not args.dry_run:
+            final_events = combine_gc_rounds(glfos, mevt_lists)
+    if not args.dry_run and args.tpsample:
+        write_timepoint_sampled_sequences(glfos, final_events)
+if 'cache-parameters' in args.actions:
+    cache_parameters()
+if 'partition' in args.actions:
+    partition()
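For orientation, a minimal usage sketch (not part of the package) of driving the new bin/bcr-phylo-run.py end to end. Every flag below appears in the script's argparse block above; the output directory and numeric values are placeholder assumptions, and the script must be run from the repository root because it invokes ./bin/partis itself:

    # hypothetical driver: simulate, cache parameters, then partition with bin/bcr-phylo-run.py
    import subprocess

    cmd = [
        './bin/bcr-phylo-run.py',
        '--actions', 'simu:cache-parameters:partition',  # the default action list, spelled out
        '--base-outdir', '/tmp/bcr-phylo-test',          # placeholder output directory
        '--n-sim-events', '2',                           # number of simulated rearrangement events
        '--obs-times', '100:120',                        # default observation times (generations)
        '--n-sim-seqs-per-generation', '50',             # sequences sampled at each obs time
        '--carry-cap', '1000',                           # germinal center carrying capacity
        '--n-procs', '2',
    ]
    subprocess.run(cmd, check=True)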